author	James Morris <james.morris@microsoft.com>	2018-09-04 14:35:54 -0400
committer	James Morris <james.morris@microsoft.com>	2018-09-04 14:35:54 -0400
commit	e42f6f9be4f83c537aa81b4c6239ea94ff5b29ce (patch)
tree	f956a5ea0e83fc6d0df3e64681e7bbc1f201f3ee /net
parent	4408e300a67ab2ce2505087986a9fe922c800ffd (diff)
parent	57361846b52bc686112da6ca5368d11210796804 (diff)
Merge tag 'v4.19-rc2' into next-general
Sync to Linux 4.19-rc2 for downstream developers.
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/iphc.c1
-rw-r--r--net/8021q/Makefile1
-rw-r--r--net/8021q/vlan.c15
-rw-r--r--net/9p/client.c122
-rw-r--r--net/9p/mod.c4
-rw-r--r--net/9p/protocol.c2
-rw-r--r--net/9p/trans_fd.c22
-rw-r--r--net/9p/trans_rdma.c12
-rw-r--r--net/9p/trans_virtio.c66
-rw-r--r--net/9p/trans_xen.c3
-rw-r--r--net/9p/util.c1
-rw-r--r--net/Kconfig2
-rw-r--r--net/Makefile4
-rw-r--r--net/appletalk/ddp.c2
-rw-r--r--net/atm/common.c11
-rw-r--r--net/atm/common.h2
-rw-r--r--net/atm/mpoa_proc.c6
-rw-r--r--net/atm/pppoatm.c2
-rw-r--r--net/atm/pvc.c2
-rw-r--r--net/atm/svc.c2
-rw-r--r--net/ax25/af_ax25.c2
-rw-r--r--net/ax25/ax25_addr.c1
-rw-r--r--net/ax25/ax25_ds_in.c1
-rw-r--r--net/ax25/ax25_ds_subr.c1
-rw-r--r--net/ax25/ax25_ip.c1
-rw-r--r--net/ax25/ax25_out.c1
-rw-r--r--net/batman-adv/Kconfig8
-rw-r--r--net/batman-adv/bat_iv_ogm.c4
-rw-r--r--net/batman-adv/bat_iv_ogm.h6
-rw-r--r--net/batman-adv/bat_v.c4
-rw-r--r--net/batman-adv/bat_v_ogm.h6
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c2
-rw-r--r--net/batman-adv/debugfs.c42
-rw-r--r--net/batman-adv/debugfs.h11
-rw-r--r--net/batman-adv/hard-interface.c37
-rw-r--r--net/batman-adv/originator.c17
-rw-r--r--net/batman-adv/translation-table.c7
-rw-r--r--net/batman-adv/types.h7
-rw-r--r--net/bluetooth/af_bluetooth.c9
-rw-r--r--net/bluetooth/hci_conn.c189
-rw-r--r--net/bluetooth/hci_core.c105
-rw-r--r--net/bluetooth/hci_debugfs.c19
-rw-r--r--net/bluetooth/hci_event.c579
-rw-r--r--net/bluetooth/hci_request.c616
-rw-r--r--net/bluetooth/hci_request.h8
-rw-r--r--net/bluetooth/hci_sock.c2
-rw-r--r--net/bluetooth/hidp/core.c6
-rw-r--r--net/bluetooth/l2cap_sock.c2
-rw-r--r--net/bluetooth/leds.c6
-rw-r--r--net/bluetooth/mgmt.c402
-rw-r--r--net/bluetooth/rfcomm/sock.c2
-rw-r--r--net/bluetooth/sco.c5
-rw-r--r--net/bpf/test_run.c30
-rw-r--r--net/bpfilter/Kconfig3
-rw-r--r--net/bpfilter/Makefile21
-rw-r--r--net/bpfilter/bpfilter_kern.c11
-rw-r--r--net/bpfilter/bpfilter_umh_blob.S7
-rw-r--r--net/bridge/br_forward.c16
-rw-r--r--net/bridge/br_if.c62
-rw-r--r--net/bridge/br_multicast.c12
-rw-r--r--net/bridge/br_netfilter_hooks.c1
-rw-r--r--net/bridge/br_netlink.c30
-rw-r--r--net/bridge/br_private.h5
-rw-r--r--net/bridge/br_sysfs_if.c94
-rw-r--r--net/bridge/netfilter/ebtable_filter.c1
-rw-r--r--net/bridge/netfilter/ebtable_nat.c1
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c3
-rw-r--r--net/caif/caif_dev.c4
-rw-r--r--net/caif/caif_socket.c12
-rw-r--r--net/can/bcm.c2
-rw-r--r--net/can/raw.c2
-rw-r--r--net/ceph/Kconfig1
-rw-r--r--net/ceph/Makefile1
-rw-r--r--net/ceph/auth.c16
-rw-r--r--net/ceph/auth_none.c1
-rw-r--r--net/ceph/auth_none.h1
-rw-r--r--net/ceph/auth_x.c239
-rw-r--r--net/ceph/auth_x.h3
-rw-r--r--net/ceph/auth_x_protocol.h7
-rw-r--r--net/ceph/ceph_common.c13
-rw-r--r--net/ceph/cls_lock_client.c4
-rw-r--r--net/ceph/crush/mapper.c4
-rw-r--r--net/ceph/messenger.c113
-rw-r--r--net/ceph/mon_client.c2
-rw-r--r--net/ceph/osd_client.c27
-rw-r--r--net/ceph/pagevec.c1
-rw-r--r--net/compat.c6
-rw-r--r--net/core/datagram.c13
-rw-r--r--net/core/dev.c892
-rw-r--r--net/core/dev_ioctl.c14
-rw-r--r--net/core/devlink.c1322
-rw-r--r--net/core/dst.c1
-rw-r--r--net/core/ethtool.c1
-rw-r--r--net/core/fib_rules.c83
-rw-r--r--net/core/filter.c869
-rw-r--r--net/core/flow_dissector.c65
-rw-r--r--net/core/gen_estimator.c21
-rw-r--r--net/core/gen_stats.c16
-rw-r--r--net/core/lwt_bpf.c4
-rw-r--r--net/core/neighbour.c4
-rw-r--r--net/core/net-sysfs.c159
-rw-r--r--net/core/net_namespace.c44
-rw-r--r--net/core/page_pool.c2
-rw-r--r--net/core/pktgen.c12
-rw-r--r--net/core/rtnetlink.c91
-rw-r--r--net/core/secure_seq.c1
-rw-r--r--net/core/skbuff.c32
-rw-r--r--net/core/sock.c119
-rw-r--r--net/core/sock_diag.c2
-rw-r--r--net/core/sock_reuseport.c92
-rw-r--r--net/core/utils.c2
-rw-r--r--net/core/xdp.c58
-rw-r--r--net/dcb/dcbnl.c97
-rw-r--r--net/dccp/ccids/ccid2.c6
-rw-r--r--net/dccp/ccids/ccid3.c16
-rw-r--r--net/dccp/dccp.h3
-rw-r--r--net/dccp/ipv4.c2
-rw-r--r--net/dccp/ipv6.c2
-rw-r--r--net/dccp/proto.c13
-rw-r--r--net/decnet/Kconfig1
-rw-r--r--net/decnet/Makefile1
-rw-r--r--net/decnet/TODO5
-rw-r--r--net/decnet/af_decnet.c6
-rw-r--r--net/decnet/dn_fib.c2
-rw-r--r--net/decnet/dn_nsp_in.c2
-rw-r--r--net/decnet/dn_nsp_out.c1
-rw-r--r--net/decnet/dn_route.c5
-rw-r--r--net/decnet/dn_rules.c2
-rw-r--r--net/decnet/netfilter/Makefile1
-rw-r--r--net/decnet/netfilter/dn_rtmsg.c1
-rw-r--r--net/dns_resolver/dns_key.c29
-rw-r--r--net/dsa/dsa2.c14
-rw-r--r--net/dsa/slave.c16
-rw-r--r--net/dsa/switch.c22
-rw-r--r--net/ethernet/eth.c12
-rw-r--r--net/ieee802154/6lowpan/core.c6
-rw-r--r--net/ieee802154/6lowpan/reassembly.c7
-rw-r--r--net/ieee802154/6lowpan/tx.c21
-rw-r--r--net/ieee802154/core.c1
-rw-r--r--net/ieee802154/nl_policy.c1
-rw-r--r--net/ieee802154/socket.c21
-rw-r--r--net/ipv4/Kconfig4
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/af_inet.c26
-rw-r--r--net/ipv4/bpfilter/Makefile1
-rw-r--r--net/ipv4/devinet.c11
-rw-r--r--net/ipv4/esp4_offload.c10
-rw-r--r--net/ipv4/fib_frontend.c5
-rw-r--r--net/ipv4/fou.c24
-rw-r--r--net/ipv4/gre_offload.c10
-rw-r--r--net/ipv4/icmp.c9
-rw-r--r--net/ipv4/igmp.c54
-rw-r--r--net/ipv4/inet_connection_sock.c9
-rw-r--r--net/ipv4/inet_fragment.c25
-rw-r--r--net/ipv4/inet_hashtables.c19
-rw-r--r--net/ipv4/ip_forward.c3
-rw-r--r--net/ipv4/ip_fragment.c355
-rw-r--r--net/ipv4/ip_gre.c7
-rw-r--r--net/ipv4/ip_input.c147
-rw-r--r--net/ipv4/ip_output.c24
-rw-r--r--net/ipv4/ip_sockglue.c11
-rw-r--r--net/ipv4/ip_vti.c3
-rw-r--r--net/ipv4/ipmr.c22
-rw-r--r--net/ipv4/ipmr_base.c1
-rw-r--r--net/ipv4/netfilter.c53
-rw-r--r--net/ipv4/netfilter/Kconfig22
-rw-r--r--net/ipv4/netfilter/Makefile6
-rw-r--r--net/ipv4/netfilter/ip_tables.c1
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c472
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c8
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c18
-rw-r--r--net/ipv4/ping.c16
-rw-r--r--net/ipv4/proc.c3
-rw-r--r--net/ipv4/raw.c11
-rw-r--r--net/ipv4/route.c6
-rw-r--r--net/ipv4/sysctl_net_ipv4.c49
-rw-r--r--net/ipv4/tcp.c112
-rw-r--r--net/ipv4/tcp_bbr.c52
-rw-r--r--net/ipv4/tcp_dctcp.c75
-rw-r--r--net/ipv4/tcp_input.c153
-rw-r--r--net/ipv4/tcp_ipv4.c32
-rw-r--r--net/ipv4/tcp_minisocks.c229
-rw-r--r--net/ipv4/tcp_offload.c17
-rw-r--r--net/ipv4/tcp_output.c55
-rw-r--r--net/ipv4/tcp_rate.c4
-rw-r--r--net/ipv4/tcp_recovery.c2
-rw-r--r--net/ipv4/tcp_timer.c51
-rw-r--r--net/ipv4/tcp_ulp.c4
-rw-r--r--net/ipv4/udp.c30
-rw-r--r--net/ipv4/udp_offload.c15
-rw-r--r--net/ipv6/Kconfig3
-rw-r--r--net/ipv6/addrconf.c63
-rw-r--r--net/ipv6/af_inet6.c12
-rw-r--r--net/ipv6/calipso.c9
-rw-r--r--net/ipv6/datagram.c13
-rw-r--r--net/ipv6/esp6.c4
-rw-r--r--net/ipv6/esp6_offload.c10
-rw-r--r--net/ipv6/exthdrs.c111
-rw-r--r--net/ipv6/icmp.c37
-rw-r--r--net/ipv6/ila/Makefile2
-rw-r--r--net/ipv6/ila/ila.h27
-rw-r--r--net/ipv6/ila/ila_common.c31
-rw-r--r--net/ipv6/ila/ila_main.c121
-rw-r--r--net/ipv6/ila/ila_xlat.c292
-rw-r--r--net/ipv6/inet6_hashtables.c14
-rw-r--r--net/ipv6/ip6_fib.c158
-rw-r--r--net/ipv6/ip6_flowlabel.c3
-rw-r--r--net/ipv6/ip6_gre.c11
-rw-r--r--net/ipv6/ip6_input.c131
-rw-r--r--net/ipv6/ip6_offload.c16
-rw-r--r--net/ipv6/ip6_output.c40
-rw-r--r--net/ipv6/ip6_tunnel.c12
-rw-r--r--net/ipv6/ip6_vti.c30
-rw-r--r--net/ipv6/ip6mr.c1
-rw-r--r--net/ipv6/ipv6_sockglue.c35
-rw-r--r--net/ipv6/mcast.c70
-rw-r--r--net/ipv6/ndisc.c2
-rw-r--r--net/ipv6/netfilter.c62
-rw-r--r--net/ipv6/netfilter/Kconfig27
-rw-r--r--net/ipv6/netfilter/Makefile6
-rw-r--r--net/ipv6/netfilter/ip6_tables.c1
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c12
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c460
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c30
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c4
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c8
-rw-r--r--net/ipv6/netfilter/nf_tproxy_ipv6.c18
-rw-r--r--net/ipv6/ping.c7
-rw-r--r--net/ipv6/raw.c22
-rw-r--r--net/ipv6/reassembly.c97
-rw-r--r--net/ipv6/route.c57
-rw-r--r--net/ipv6/seg6.c1
-rw-r--r--net/ipv6/seg6_hmac.c3
-rw-r--r--net/ipv6/seg6_iptunnel.c2
-rw-r--r--net/ipv6/seg6_local.c54
-rw-r--r--net/ipv6/tcp_ipv6.c6
-rw-r--r--net/ipv6/tcpv6_offload.c4
-rw-r--r--net/ipv6/udp.c17
-rw-r--r--net/ipv6/udp_offload.c4
-rw-r--r--net/ipv6/xfrm6_mode_ro.c2
-rw-r--r--net/iucv/af_iucv.c11
-rw-r--r--net/kcm/Kconfig1
-rw-r--r--net/kcm/kcmsock.c11
-rw-r--r--net/key/af_key.c8
-rw-r--r--net/l2tp/l2tp_core.c86
-rw-r--r--net/l2tp/l2tp_core.h73
-rw-r--r--net/l2tp/l2tp_debugfs.c8
-rw-r--r--net/l2tp/l2tp_eth.c32
-rw-r--r--net/l2tp/l2tp_ip.c6
-rw-r--r--net/l2tp/l2tp_ip6.c17
-rw-r--r--net/l2tp/l2tp_netlink.c37
-rw-r--r--net/l2tp/l2tp_ppp.c559
-rw-r--r--net/llc/Kconfig2
-rw-r--r--net/llc/Makefile2
-rw-r--r--net/llc/af_llc.c2
-rw-r--r--net/llc/llc_core.c4
-rw-r--r--net/llc/llc_if.c1
-rw-r--r--net/mac80211/Makefile1
-rw-r--r--net/mac80211/agg-rx.c10
-rw-r--r--net/mac80211/agg-tx.c19
-rw-r--r--net/mac80211/cfg.c9
-rw-r--r--net/mac80211/ethtool.c6
-rw-r--r--net/mac80211/he.c55
-rw-r--r--net/mac80211/ht.c2
-rw-r--r--net/mac80211/ieee80211_i.h47
-rw-r--r--net/mac80211/iface.c4
-rw-r--r--net/mac80211/key.c24
-rw-r--r--net/mac80211/led.c20
-rw-r--r--net/mac80211/main.c36
-rw-r--r--net/mac80211/mlme.c312
-rw-r--r--net/mac80211/offchannel.c2
-rw-r--r--net/mac80211/rc80211_minstrel.c1
-rw-r--r--net/mac80211/rx.c134
-rw-r--r--net/mac80211/scan.c56
-rw-r--r--net/mac80211/sta_info.c101
-rw-r--r--net/mac80211/sta_info.h20
-rw-r--r--net/mac80211/trace.h2
-rw-r--r--net/mac80211/tx.c25
-rw-r--r--net/mac80211/util.c162
-rw-r--r--net/mac802154/tx.c15
-rw-r--r--net/mpls/mpls_iptunnel.c2
-rw-r--r--net/ncsi/ncsi-netlink.c4
-rw-r--r--net/netfilter/Kconfig82
-rw-r--r--net/netfilter/Makefile19
-rw-r--r--net/netfilter/core.c15
-rw-r--r--net/netfilter/ipvs/Kconfig8
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c89
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c15
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c19
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c18
-rw-r--r--net/netfilter/nf_conncount.c424
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c2
-rw-r--r--net/netfilter/nf_conntrack_core.c319
-rw-r--r--net/netfilter/nf_conntrack_expect.c3
-rw-r--r--net/netfilter/nf_conntrack_helper.c15
-rw-r--r--net/netfilter/nf_conntrack_l3proto_generic.c66
-rw-r--r--net/netfilter/nf_conntrack_netlink.c124
-rw-r--r--net/netfilter/nf_conntrack_proto.c851
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c52
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c32
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c24
-rw-r--r--net/netfilter/nf_conntrack_proto_icmp.c (renamed from net/ipv4/netfilter/nf_conntrack_proto_icmp.c)19
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c (renamed from net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c)17
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c46
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c52
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c55
-rw-r--r--net/netfilter/nf_conntrack_standalone.c28
-rw-r--r--net/netfilter/nf_conntrack_timeout.c21
-rw-r--r--net/netfilter/nf_flow_table_core.c13
-rw-r--r--net/netfilter/nf_log.c13
-rw-r--r--net/netfilter/nf_log_common.c5
-rw-r--r--net/netfilter/nf_nat_core.c18
-rw-r--r--net/netfilter/nf_osf.c218
-rw-r--r--net/netfilter/nf_tables_api.c548
-rw-r--r--net/netfilter/nf_tables_core.c16
-rw-r--r--net/netfilter/nf_tables_set_core.c28
-rw-r--r--net/netfilter/nfnetlink.c23
-rw-r--r--net/netfilter/nfnetlink_acct.c29
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c74
-rw-r--r--net/netfilter/nfnetlink_osf.c436
-rw-r--r--net/netfilter/nfnetlink_queue.c3
-rw-r--r--net/netfilter/nft_chain_filter.c18
-rw-r--r--net/netfilter/nft_compat.c13
-rw-r--r--net/netfilter/nft_connlimit.c36
-rw-r--r--net/netfilter/nft_ct.c221
-rw-r--r--net/netfilter/nft_dynset.c4
-rw-r--r--net/netfilter/nft_immediate.c3
-rw-r--r--net/netfilter/nft_lookup.c19
-rw-r--r--net/netfilter/nft_meta.c15
-rw-r--r--net/netfilter/nft_numgen.c4
-rw-r--r--net/netfilter/nft_osf.c104
-rw-r--r--net/netfilter/nft_set_bitmap.c25
-rw-r--r--net/netfilter/nft_set_hash.c38
-rw-r--r--net/netfilter/nft_set_rbtree.c30
-rw-r--r--net/netfilter/nft_socket.c22
-rw-r--r--net/netfilter/nft_tproxy.c318
-rw-r--r--net/netfilter/nft_tunnel.c566
-rw-r--r--net/netfilter/utils.c131
-rw-r--r--net/netfilter/x_tables.c7
-rw-r--r--net/netfilter/xt_AUDIT.c2
-rw-r--r--net/netfilter/xt_CT.c6
-rw-r--r--net/netfilter/xt_TEE.c4
-rw-r--r--net/netfilter/xt_TPROXY.c17
-rw-r--r--net/netfilter/xt_cgroup.c6
-rw-r--r--net/netfilter/xt_connlimit.c4
-rw-r--r--net/netfilter/xt_osf.c149
-rw-r--r--net/netfilter/xt_owner.c2
-rw-r--r--net/netfilter/xt_recent.c3
-rw-r--r--net/netfilter/xt_socket.c8
-rw-r--r--net/netlabel/netlabel_user.c2
-rw-r--r--net/netlink/af_netlink.c14
-rw-r--r--net/netrom/af_netrom.c2
-rw-r--r--net/nfc/llcp_commands.c9
-rw-r--r--net/nfc/llcp_sock.c9
-rw-r--r--net/nfc/rawsock.c4
-rw-r--r--net/nsh/nsh.c2
-rw-r--r--net/openvswitch/actions.c33
-rw-r--r--net/openvswitch/conntrack.c20
-rw-r--r--net/openvswitch/flow_netlink.c80
-rw-r--r--net/openvswitch/meter.c10
-rw-r--r--net/packet/af_packet.c101
-rw-r--r--net/packet/internal.h1
-rw-r--r--net/phonet/socket.c9
-rw-r--r--net/qrtr/qrtr.c15
-rw-r--r--net/rds/Kconfig2
-rw-r--r--net/rds/Makefile1
-rw-r--r--net/rds/af_rds.c205
-rw-r--r--net/rds/bind.c138
-rw-r--r--net/rds/cong.c23
-rw-r--r--net/rds/connection.c294
-rw-r--r--net/rds/ib.c138
-rw-r--r--net/rds/ib.h53
-rw-r--r--net/rds/ib_cm.c320
-rw-r--r--net/rds/ib_frmr.c17
-rw-r--r--net/rds/ib_mr.h5
-rw-r--r--net/rds/ib_rdma.c47
-rw-r--r--net/rds/ib_recv.c39
-rw-r--r--net/rds/ib_send.c19
-rw-r--r--net/rds/loop.c63
-rw-r--r--net/rds/loop.h2
-rw-r--r--net/rds/message.c1
-rw-r--r--net/rds/rdma.c19
-rw-r--r--net/rds/rdma_transport.c95
-rw-r--r--net/rds/rdma_transport.h5
-rw-r--r--net/rds/rds.h93
-rw-r--r--net/rds/recv.c78
-rw-r--r--net/rds/send.c128
-rw-r--r--net/rds/tcp.c153
-rw-r--r--net/rds/tcp.h2
-rw-r--r--net/rds/tcp_connect.c68
-rw-r--r--net/rds/tcp_listen.c87
-rw-r--r--net/rds/tcp_recv.c9
-rw-r--r--net/rds/tcp_send.c4
-rw-r--r--net/rds/threads.c69
-rw-r--r--net/rds/transport.c16
-rw-r--r--net/rfkill/core.c4
-rw-r--r--net/rose/af_rose.c2
-rw-r--r--net/rxrpc/af_rxrpc.c10
-rw-r--r--net/rxrpc/ar-internal.h12
-rw-r--r--net/rxrpc/call_accept.c4
-rw-r--r--net/rxrpc/call_event.c2
-rw-r--r--net/rxrpc/call_object.c2
-rw-r--r--net/rxrpc/conn_client.c3
-rw-r--r--net/rxrpc/conn_event.c21
-rw-r--r--net/rxrpc/conn_object.c4
-rw-r--r--net/rxrpc/input.c15
-rw-r--r--net/rxrpc/local_event.c5
-rw-r--r--net/rxrpc/local_object.c2
-rw-r--r--net/rxrpc/net_ns.c6
-rw-r--r--net/rxrpc/output.c44
-rw-r--r--net/rxrpc/peer_event.c156
-rw-r--r--net/rxrpc/peer_object.c10
-rw-r--r--net/rxrpc/proc.c22
-rw-r--r--net/rxrpc/recvmsg.c56
-rw-r--r--net/rxrpc/rxkad.c35
-rw-r--r--net/rxrpc/sysctl.c1
-rw-r--r--net/sched/Kconfig39
-rw-r--r--net/sched/Makefile5
-rw-r--r--net/sched/act_api.c432
-rw-r--r--net/sched/act_bpf.c46
-rw-r--r--net/sched/act_connmark.c28
-rw-r--r--net/sched/act_csum.c72
-rw-r--r--net/sched/act_gact.c39
-rw-r--r--net/sched/act_ife.c130
-rw-r--r--net/sched/act_ipt.c39
-rw-r--r--net/sched/act_mirred.c177
-rw-r--r--net/sched/act_nat.c28
-rw-r--r--net/sched/act_pedit.c134
-rw-r--r--net/sched/act_police.c48
-rw-r--r--net/sched/act_sample.c55
-rw-r--r--net/sched/act_simple.c35
-rw-r--r--net/sched/act_skbedit.c167
-rw-r--r--net/sched/act_skbmod.c65
-rw-r--r--net/sched/act_tunnel_key.c321
-rw-r--r--net/sched/act_vlan.c84
-rw-r--r--net/sched/cls_api.c721
-rw-r--r--net/sched/cls_basic.c1
-rw-r--r--net/sched/cls_bpf.c43
-rw-r--r--net/sched/cls_flower.c668
-rw-r--r--net/sched/cls_matchall.c34
-rw-r--r--net/sched/cls_tcindex.c8
-rw-r--r--net/sched/cls_u32.c121
-rw-r--r--net/sched/sch_api.c11
-rw-r--r--net/sched/sch_cake.c3034
-rw-r--r--net/sched/sch_cbs.c134
-rw-r--r--net/sched/sch_etf.c484
-rw-r--r--net/sched/sch_fq_codel.c25
-rw-r--r--net/sched/sch_hfsc.c4
-rw-r--r--net/sched/sch_htb.c13
-rw-r--r--net/sched/sch_netem.c73
-rw-r--r--net/sched/sch_skbprio.c320
-rw-r--r--net/sctp/Kconfig4
-rw-r--r--net/sctp/associola.c15
-rw-r--r--net/sctp/chunk.c10
-rw-r--r--net/sctp/input.c1
-rw-r--r--net/sctp/ipv6.c22
-rw-r--r--net/sctp/outqueue.c11
-rw-r--r--net/sctp/protocol.c18
-rw-r--r--net/sctp/sm_sideeffect.c1
-rw-r--r--net/sctp/socket.c253
-rw-r--r--net/sctp/stream.c153
-rw-r--r--net/sctp/stream_interleave.c20
-rw-r--r--net/sctp/stream_sched.c13
-rw-r--r--net/sctp/stream_sched_prio.c22
-rw-r--r--net/sctp/stream_sched_rr.c8
-rw-r--r--net/sctp/transport.c2
-rw-r--r--net/smc/Makefile2
-rw-r--r--net/smc/af_smc.c459
-rw-r--r--net/smc/smc.h17
-rw-r--r--net/smc/smc_cdc.c116
-rw-r--r--net/smc/smc_cdc.h86
-rw-r--r--net/smc/smc_clc.c200
-rw-r--r--net/smc/smc_clc.h99
-rw-r--r--net/smc/smc_close.c2
-rw-r--r--net/smc/smc_core.c350
-rw-r--r--net/smc/smc_core.h85
-rw-r--r--net/smc/smc_diag.c33
-rw-r--r--net/smc/smc_ib.c173
-rw-r--r--net/smc/smc_ib.h7
-rw-r--r--net/smc/smc_ism.c348
-rw-r--r--net/smc/smc_ism.h48
-rw-r--r--net/smc/smc_llc.c80
-rw-r--r--net/smc/smc_llc.h7
-rw-r--r--net/smc/smc_pnet.c171
-rw-r--r--net/smc/smc_pnet.h19
-rw-r--r--net/smc/smc_rx.c21
-rw-r--r--net/smc/smc_tx.c248
-rw-r--r--net/smc/smc_tx.h6
-rw-r--r--net/smc/smc_wr.c41
-rw-r--r--net/smc/smc_wr.h3
-rw-r--r--net/socket.c96
-rw-r--r--net/strparser/strparser.c52
-rw-r--r--net/sunrpc/auth.c4
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c50
-rw-r--r--net/sunrpc/auth_gss/gss_generic_token.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c15
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_keys.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c3
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.c70
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c2
-rw-r--r--net/sunrpc/auth_null.c2
-rw-r--r--net/sunrpc/auth_unix.c2
-rw-r--r--net/sunrpc/backchannel_rqst.c1
-rw-r--r--net/sunrpc/clnt.c30
-rw-r--r--net/sunrpc/rpcb_clnt.c2
-rw-r--r--net/sunrpc/stats.c55
-rw-r--r--net/sunrpc/sunrpc.h1
-rw-r--r--net/sunrpc/svc.c78
-rw-r--r--net/sunrpc/xprt.c2
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c4
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c7
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c2
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c12
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c35
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c7
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c5
-rw-r--r--net/sunrpc/xprtrdma/verbs.c10
-rw-r--r--net/sunrpc/xprtsock.c1
-rw-r--r--net/tipc/bcast.c2
-rw-r--r--net/tipc/bearer.c2
-rw-r--r--net/tipc/discover.c18
-rw-r--r--net/tipc/group.c41
-rw-r--r--net/tipc/group.h1
-rw-r--r--net/tipc/link.c134
-rw-r--r--net/tipc/link.h2
-rw-r--r--net/tipc/monitor.c3
-rw-r--r--net/tipc/msg.c35
-rw-r--r--net/tipc/name_table.c2
-rw-r--r--net/tipc/net.c15
-rw-r--r--net/tipc/node.c97
-rw-r--r--net/tipc/node.h14
-rw-r--r--net/tipc/socket.c24
-rw-r--r--net/tls/tls_device.c304
-rw-r--r--net/tls/tls_device_fallback.c11
-rw-r--r--net/tls/tls_main.c44
-rw-r--r--net/tls/tls_sw.c366
-rw-r--r--net/unix/af_unix.c37
-rw-r--r--net/vmw_vsock/af_vsock.c34
-rw-r--r--net/vmw_vsock/virtio_transport.c2
-rw-r--r--net/vmw_vsock/vmci_transport.c3
-rw-r--r--net/wimax/Makefile2
-rw-r--r--net/wimax/debugfs.c2
-rw-r--r--net/wimax/op-msg.c1
-rw-r--r--net/wimax/stack.c4
-rw-r--r--net/wireless/core.c21
-rw-r--r--net/wireless/core.h2
-rw-r--r--net/wireless/lib80211_crypt_tkip.c55
-rw-r--r--net/wireless/nl80211.c254
-rw-r--r--net/wireless/reg.c28
-rw-r--r--net/wireless/sysfs.c4
-rw-r--r--net/wireless/trace.h18
-rw-r--r--net/wireless/util.c87
-rw-r--r--net/wireless/wext-compat.c10
-rw-r--r--net/x25/Kconfig2
-rw-r--r--net/x25/af_x25.c2
-rw-r--r--net/x25/x25_subr.c1
-rw-r--r--net/xdp/xdp_umem.c70
-rw-r--r--net/xdp/xsk.c41
-rw-r--r--net/xdp/xsk_queue.h11
-rw-r--r--net/xfrm/Kconfig9
-rw-r--r--net/xfrm/Makefile1
-rw-r--r--net/xfrm/xfrm_device.c19
-rw-r--r--net/xfrm/xfrm_input.c5
-rw-r--r--net/xfrm/xfrm_interface.c975
-rw-r--r--net/xfrm/xfrm_output.c3
-rw-r--r--net/xfrm/xfrm_policy.c317
-rw-r--r--net/xfrm/xfrm_state.c48
-rw-r--r--net/xfrm/xfrm_user.c113
575 files changed, 27398 insertions, 9558 deletions
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 6b1042e21656..52fad5dad9f7 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -770,6 +770,7 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
770 hdr.hop_limit, &hdr.daddr); 770 hdr.hop_limit, &hdr.daddr);
771 771
772 skb_push(skb, sizeof(hdr)); 772 skb_push(skb, sizeof(hdr));
773 skb_reset_mac_header(skb);
773 skb_reset_network_header(skb); 774 skb_reset_network_header(skb);
774 skb_copy_to_linear_data(skb, &hdr, sizeof(hdr)); 775 skb_copy_to_linear_data(skb, &hdr, sizeof(hdr));
775 776
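For readability, the hunk above amounts to the following sequence after the patch; the only functional change is the added skb_reset_mac_header() call. A minimal sketch (the demo_ helper name is illustrative, the real code lives in lowpan_header_decompress()):

#include <linux/ipv6.h>
#include <linux/skbuff.h>

/* Illustrative helper only: push the rebuilt IPv6 header, reset BOTH the
 * mac and network header offsets (the mac reset is what this hunk adds),
 * then copy the header bytes into the linear data area. */
static void demo_push_ipv6_hdr(struct sk_buff *skb, const struct ipv6hdr *hdr)
{
        skb_push(skb, sizeof(*hdr));
        skb_reset_mac_header(skb);              /* added by this fix */
        skb_reset_network_header(skb);
        skb_copy_to_linear_data(skb, hdr, sizeof(*hdr));
}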
diff --git a/net/8021q/Makefile b/net/8021q/Makefile
index 9b703454b93e..e05d4d7aab35 100644
--- a/net/8021q/Makefile
+++ b/net/8021q/Makefile
@@ -9,4 +9,3 @@ obj-$(CONFIG_VLAN_8021Q) += 8021q.o
98021q-$(CONFIG_VLAN_8021Q_GVRP) += vlan_gvrp.o 98021q-$(CONFIG_VLAN_8021Q_GVRP) += vlan_gvrp.o
108021q-$(CONFIG_VLAN_8021Q_MVRP) += vlan_mvrp.o 108021q-$(CONFIG_VLAN_8021Q_MVRP) += vlan_mvrp.o
118021q-$(CONFIG_PROC_FS) += vlanproc.o 118021q-$(CONFIG_PROC_FS) += vlanproc.o
12
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 73a65789271b..5e9950453955 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -647,13 +647,14 @@ out:
647 return err; 647 return err;
648} 648}
649 649
650static struct sk_buff **vlan_gro_receive(struct sk_buff **head, 650static struct sk_buff *vlan_gro_receive(struct list_head *head,
651 struct sk_buff *skb) 651 struct sk_buff *skb)
652{ 652{
653 struct sk_buff *p, **pp = NULL;
654 struct vlan_hdr *vhdr;
655 unsigned int hlen, off_vlan;
656 const struct packet_offload *ptype; 653 const struct packet_offload *ptype;
654 unsigned int hlen, off_vlan;
655 struct sk_buff *pp = NULL;
656 struct vlan_hdr *vhdr;
657 struct sk_buff *p;
657 __be16 type; 658 __be16 type;
658 int flush = 1; 659 int flush = 1;
659 660
@@ -675,7 +676,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
675 676
676 flush = 0; 677 flush = 0;
677 678
678 for (p = *head; p; p = p->next) { 679 list_for_each_entry(p, head, list) {
679 struct vlan_hdr *vhdr2; 680 struct vlan_hdr *vhdr2;
680 681
681 if (!NAPI_GRO_CB(p)->same_flow) 682 if (!NAPI_GRO_CB(p)->same_flow)
@@ -693,7 +694,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
693out_unlock: 694out_unlock:
694 rcu_read_unlock(); 695 rcu_read_unlock();
695out: 696out:
696 NAPI_GRO_CB(skb)->flush |= flush; 697 skb_gro_flush_final(skb, pp, flush);
697 698
698 return pp; 699 return pp;
699} 700}
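The vlan_gro_receive() hunks above are part of the tree-wide GRO conversion in this merge window: offload callbacks now take a struct list_head of held packets instead of a struct sk_buff ** chain, walk it with list_for_each_entry(), and report the flush decision through skb_gro_flush_final(). A rough sketch of the new callback shape, with the protocol-specific header matching elided (the demo_ name is illustrative, not from this diff):

#include <linux/netdevice.h>

/* Sketch of a GRO receive callback after the list conversion; real callbacks
 * keep flush at 1 on malformed input and do protocol-specific matching. */
static struct sk_buff *demo_gro_receive(struct list_head *head,
                                        struct sk_buff *skb)
{
        struct sk_buff *pp = NULL;      /* flow to hand up the stack, if any */
        struct sk_buff *p;
        int flush = 1;

        list_for_each_entry(p, head, list) {
                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;
                /* compare the protocol headers of p and skb here and clear
                 * NAPI_GRO_CB(p)->same_flow when they do not match */
        }

        flush = 0;
        /* replaces the old "NAPI_GRO_CB(skb)->flush |= flush" */
        skb_gro_flush_final(skb, pp, flush);
        return pp;
}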
diff --git a/net/9p/client.c b/net/9p/client.c
index 18c5271910dc..deae53a7dffc 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -225,7 +225,8 @@ static int parse_opts(char *opts, struct p9_client *clnt)
225 } 225 }
226 226
227free_and_return: 227free_and_return:
228 v9fs_put_trans(clnt->trans_mod); 228 if (ret)
229 v9fs_put_trans(clnt->trans_mod);
229 kfree(tmp_options); 230 kfree(tmp_options);
230 return ret; 231 return ret;
231} 232}
@@ -282,8 +283,9 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size)
282 return ERR_PTR(-ENOMEM); 283 return ERR_PTR(-ENOMEM);
283 } 284 }
284 for (col = 0; col < P9_ROW_MAXTAG; col++) { 285 for (col = 0; col < P9_ROW_MAXTAG; col++) {
285 c->reqs[row][col].status = REQ_STATUS_IDLE; 286 req = &c->reqs[row][col];
286 c->reqs[row][col].tc = NULL; 287 req->status = REQ_STATUS_IDLE;
288 init_waitqueue_head(&req->wq);
287 } 289 }
288 c->max_tag += P9_ROW_MAXTAG; 290 c->max_tag += P9_ROW_MAXTAG;
289 } 291 }
@@ -293,13 +295,6 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size)
293 col = tag % P9_ROW_MAXTAG; 295 col = tag % P9_ROW_MAXTAG;
294 296
295 req = &c->reqs[row][col]; 297 req = &c->reqs[row][col];
296 if (!req->wq) {
297 req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS);
298 if (!req->wq)
299 goto grow_failed;
300 init_waitqueue_head(req->wq);
301 }
302
303 if (!req->tc) 298 if (!req->tc)
304 req->tc = p9_fcall_alloc(alloc_msize); 299 req->tc = p9_fcall_alloc(alloc_msize);
305 if (!req->rc) 300 if (!req->rc)
@@ -319,9 +314,7 @@ grow_failed:
319 pr_err("Couldn't grow tag array\n"); 314 pr_err("Couldn't grow tag array\n");
320 kfree(req->tc); 315 kfree(req->tc);
321 kfree(req->rc); 316 kfree(req->rc);
322 kfree(req->wq);
323 req->tc = req->rc = NULL; 317 req->tc = req->rc = NULL;
324 req->wq = NULL;
325 return ERR_PTR(-ENOMEM); 318 return ERR_PTR(-ENOMEM);
326} 319}
327 320
@@ -340,7 +333,7 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
340 * buffer to read the data into */ 333 * buffer to read the data into */
341 tag++; 334 tag++;
342 335
343 if(tag >= c->max_tag) 336 if (tag >= c->max_tag)
344 return NULL; 337 return NULL;
345 338
346 row = tag / P9_ROW_MAXTAG; 339 row = tag / P9_ROW_MAXTAG;
@@ -409,7 +402,6 @@ static void p9_tag_cleanup(struct p9_client *c)
409 /* free requests associated with tags */ 402 /* free requests associated with tags */
410 for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { 403 for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
411 for (col = 0; col < P9_ROW_MAXTAG; col++) { 404 for (col = 0; col < P9_ROW_MAXTAG; col++) {
412 kfree(c->reqs[row][col].wq);
413 kfree(c->reqs[row][col].tc); 405 kfree(c->reqs[row][col].tc);
414 kfree(c->reqs[row][col].rc); 406 kfree(c->reqs[row][col].rc);
415 } 407 }
@@ -447,12 +439,12 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
447 439
448 /* 440 /*
449 * This barrier is needed to make sure any change made to req before 441 * This barrier is needed to make sure any change made to req before
450 * the other thread wakes up will indeed be seen by the waiting side. 442 * the status change is visible to another thread
451 */ 443 */
452 smp_wmb(); 444 smp_wmb();
453 req->status = status; 445 req->status = status;
454 446
455 wake_up(req->wq); 447 wake_up(&req->wq);
456 p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); 448 p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag);
457} 449}
458EXPORT_SYMBOL(p9_client_cb); 450EXPORT_SYMBOL(p9_client_cb);
@@ -477,20 +469,11 @@ p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag,
477 int err; 469 int err;
478 470
479 pdu->offset = 0; 471 pdu->offset = 0;
480 if (pdu->size == 0)
481 pdu->size = 7;
482 472
483 err = p9pdu_readf(pdu, 0, "dbw", &r_size, &r_type, &r_tag); 473 err = p9pdu_readf(pdu, 0, "dbw", &r_size, &r_type, &r_tag);
484 if (err) 474 if (err)
485 goto rewind_and_exit; 475 goto rewind_and_exit;
486 476
487 pdu->size = r_size;
488 pdu->id = r_type;
489 pdu->tag = r_tag;
490
491 p9_debug(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n",
492 pdu->size, pdu->id, pdu->tag);
493
494 if (type) 477 if (type)
495 *type = r_type; 478 *type = r_type;
496 if (tag) 479 if (tag)
@@ -498,6 +481,16 @@ p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag,
498 if (size) 481 if (size)
499 *size = r_size; 482 *size = r_size;
500 483
484 if (pdu->size != r_size || r_size < 7) {
485 err = -EINVAL;
486 goto rewind_and_exit;
487 }
488
489 pdu->id = r_type;
490 pdu->tag = r_tag;
491
492 p9_debug(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n",
493 pdu->size, pdu->id, pdu->tag);
501 494
502rewind_and_exit: 495rewind_and_exit:
503 if (rewind) 496 if (rewind)
@@ -524,6 +517,12 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
524 int ecode; 517 int ecode;
525 518
526 err = p9_parse_header(req->rc, NULL, &type, NULL, 0); 519 err = p9_parse_header(req->rc, NULL, &type, NULL, 0);
520 if (req->rc->size >= c->msize) {
521 p9_debug(P9_DEBUG_ERROR,
522 "requested packet size too big: %d\n",
523 req->rc->size);
524 return -EIO;
525 }
527 /* 526 /*
528 * dump the response from server 527 * dump the response from server
529 * This should be after check errors which poplulate pdu_fcall. 528 * This should be after check errors which poplulate pdu_fcall.
@@ -773,7 +772,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
773 } 772 }
774again: 773again:
775 /* Wait for the response */ 774 /* Wait for the response */
776 err = wait_event_killable(*req->wq, req->status >= REQ_STATUS_RCVD); 775 err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD);
777 776
778 /* 777 /*
779 * Make sure our req is coherent with regard to updates in other 778 * Make sure our req is coherent with regard to updates in other
@@ -908,34 +907,31 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt)
908{ 907{
909 int ret; 908 int ret;
910 struct p9_fid *fid; 909 struct p9_fid *fid;
911 unsigned long flags;
912 910
913 p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt); 911 p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
914 fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL); 912 fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL);
915 if (!fid) 913 if (!fid)
916 return ERR_PTR(-ENOMEM); 914 return NULL;
917
918 ret = p9_idpool_get(clnt->fidpool);
919 if (ret < 0) {
920 ret = -ENOSPC;
921 goto error;
922 }
923 fid->fid = ret;
924 915
925 memset(&fid->qid, 0, sizeof(struct p9_qid)); 916 memset(&fid->qid, 0, sizeof(struct p9_qid));
926 fid->mode = -1; 917 fid->mode = -1;
927 fid->uid = current_fsuid(); 918 fid->uid = current_fsuid();
928 fid->clnt = clnt; 919 fid->clnt = clnt;
929 fid->rdir = NULL; 920 fid->rdir = NULL;
930 spin_lock_irqsave(&clnt->lock, flags); 921 fid->fid = 0;
931 list_add(&fid->flist, &clnt->fidlist);
932 spin_unlock_irqrestore(&clnt->lock, flags);
933 922
934 return fid; 923 idr_preload(GFP_KERNEL);
924 spin_lock_irq(&clnt->lock);
925 ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1,
926 GFP_NOWAIT);
927 spin_unlock_irq(&clnt->lock);
928 idr_preload_end();
929
930 if (!ret)
931 return fid;
935 932
936error:
937 kfree(fid); 933 kfree(fid);
938 return ERR_PTR(ret); 934 return NULL;
939} 935}
940 936
941static void p9_fid_destroy(struct p9_fid *fid) 937static void p9_fid_destroy(struct p9_fid *fid)
@@ -945,9 +941,8 @@ static void p9_fid_destroy(struct p9_fid *fid)
945 941
946 p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid); 942 p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid);
947 clnt = fid->clnt; 943 clnt = fid->clnt;
948 p9_idpool_put(fid->fid, clnt->fidpool);
949 spin_lock_irqsave(&clnt->lock, flags); 944 spin_lock_irqsave(&clnt->lock, flags);
950 list_del(&fid->flist); 945 idr_remove(&clnt->fids, fid->fid);
951 spin_unlock_irqrestore(&clnt->lock, flags); 946 spin_unlock_irqrestore(&clnt->lock, flags);
952 kfree(fid->rdir); 947 kfree(fid->rdir);
953 kfree(fid); 948 kfree(fid);
@@ -957,7 +952,7 @@ static int p9_client_version(struct p9_client *c)
957{ 952{
958 int err = 0; 953 int err = 0;
959 struct p9_req_t *req; 954 struct p9_req_t *req;
960 char *version; 955 char *version = NULL;
961 int msize; 956 int msize;
962 957
963 p9_debug(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n", 958 p9_debug(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n",
@@ -1030,7 +1025,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
1030 memcpy(clnt->name, client_id, strlen(client_id) + 1); 1025 memcpy(clnt->name, client_id, strlen(client_id) + 1);
1031 1026
1032 spin_lock_init(&clnt->lock); 1027 spin_lock_init(&clnt->lock);
1033 INIT_LIST_HEAD(&clnt->fidlist); 1028 idr_init(&clnt->fids);
1034 1029
1035 err = p9_tag_init(clnt); 1030 err = p9_tag_init(clnt);
1036 if (err < 0) 1031 if (err < 0)
@@ -1050,18 +1045,12 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
1050 goto destroy_tagpool; 1045 goto destroy_tagpool;
1051 } 1046 }
1052 1047
1053 clnt->fidpool = p9_idpool_create();
1054 if (IS_ERR(clnt->fidpool)) {
1055 err = PTR_ERR(clnt->fidpool);
1056 goto put_trans;
1057 }
1058
1059 p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", 1048 p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n",
1060 clnt, clnt->trans_mod, clnt->msize, clnt->proto_version); 1049 clnt, clnt->trans_mod, clnt->msize, clnt->proto_version);
1061 1050
1062 err = clnt->trans_mod->create(clnt, dev_name, options); 1051 err = clnt->trans_mod->create(clnt, dev_name, options);
1063 if (err) 1052 if (err)
1064 goto destroy_fidpool; 1053 goto put_trans;
1065 1054
1066 if (clnt->msize > clnt->trans_mod->maxsize) 1055 if (clnt->msize > clnt->trans_mod->maxsize)
1067 clnt->msize = clnt->trans_mod->maxsize; 1056 clnt->msize = clnt->trans_mod->maxsize;
@@ -1074,8 +1063,6 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
1074 1063
1075close_trans: 1064close_trans:
1076 clnt->trans_mod->close(clnt); 1065 clnt->trans_mod->close(clnt);
1077destroy_fidpool:
1078 p9_idpool_destroy(clnt->fidpool);
1079put_trans: 1066put_trans:
1080 v9fs_put_trans(clnt->trans_mod); 1067 v9fs_put_trans(clnt->trans_mod);
1081destroy_tagpool: 1068destroy_tagpool:
@@ -1088,7 +1075,8 @@ EXPORT_SYMBOL(p9_client_create);
1088 1075
1089void p9_client_destroy(struct p9_client *clnt) 1076void p9_client_destroy(struct p9_client *clnt)
1090{ 1077{
1091 struct p9_fid *fid, *fidptr; 1078 struct p9_fid *fid;
1079 int id;
1092 1080
1093 p9_debug(P9_DEBUG_MUX, "clnt %p\n", clnt); 1081 p9_debug(P9_DEBUG_MUX, "clnt %p\n", clnt);
1094 1082
@@ -1097,14 +1085,11 @@ void p9_client_destroy(struct p9_client *clnt)
1097 1085
1098 v9fs_put_trans(clnt->trans_mod); 1086 v9fs_put_trans(clnt->trans_mod);
1099 1087
1100 list_for_each_entry_safe(fid, fidptr, &clnt->fidlist, flist) { 1088 idr_for_each_entry(&clnt->fids, fid, id) {
1101 pr_info("Found fid %d not clunked\n", fid->fid); 1089 pr_info("Found fid %d not clunked\n", fid->fid);
1102 p9_fid_destroy(fid); 1090 p9_fid_destroy(fid);
1103 } 1091 }
1104 1092
1105 if (clnt->fidpool)
1106 p9_idpool_destroy(clnt->fidpool);
1107
1108 p9_tag_cleanup(clnt); 1093 p9_tag_cleanup(clnt);
1109 1094
1110 kfree(clnt); 1095 kfree(clnt);
@@ -1137,9 +1122,8 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
1137 p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", 1122 p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n",
1138 afid ? afid->fid : -1, uname, aname); 1123 afid ? afid->fid : -1, uname, aname);
1139 fid = p9_fid_create(clnt); 1124 fid = p9_fid_create(clnt);
1140 if (IS_ERR(fid)) { 1125 if (!fid) {
1141 err = PTR_ERR(fid); 1126 err = -ENOMEM;
1142 fid = NULL;
1143 goto error; 1127 goto error;
1144 } 1128 }
1145 fid->uid = n_uname; 1129 fid->uid = n_uname;
@@ -1188,9 +1172,8 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
1188 clnt = oldfid->clnt; 1172 clnt = oldfid->clnt;
1189 if (clone) { 1173 if (clone) {
1190 fid = p9_fid_create(clnt); 1174 fid = p9_fid_create(clnt);
1191 if (IS_ERR(fid)) { 1175 if (!fid) {
1192 err = PTR_ERR(fid); 1176 err = -ENOMEM;
1193 fid = NULL;
1194 goto error; 1177 goto error;
1195 } 1178 }
1196 1179
@@ -1575,7 +1558,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
1575 int count = iov_iter_count(to); 1558 int count = iov_iter_count(to);
1576 int rsize, non_zc = 0; 1559 int rsize, non_zc = 0;
1577 char *dataptr; 1560 char *dataptr;
1578 1561
1579 rsize = fid->iounit; 1562 rsize = fid->iounit;
1580 if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) 1563 if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
1581 rsize = clnt->msize - P9_IOHDRSZ; 1564 rsize = clnt->msize - P9_IOHDRSZ;
@@ -1789,7 +1772,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid,
1789 "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n" 1772 "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n"
1790 "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n" 1773 "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n"
1791 "<<< st_btime_sec=%lld st_btime_nsec=%lld\n" 1774 "<<< st_btime_sec=%lld st_btime_nsec=%lld\n"
1792 "<<< st_gen=%lld st_data_version=%lld", 1775 "<<< st_gen=%lld st_data_version=%lld\n",
1793 ret->st_result_mask, ret->qid.type, ret->qid.path, 1776 ret->st_result_mask, ret->qid.type, ret->qid.path,
1794 ret->qid.version, ret->st_mode, ret->st_nlink, 1777 ret->qid.version, ret->st_mode, ret->st_nlink,
1795 from_kuid(&init_user_ns, ret->st_uid), 1778 from_kuid(&init_user_ns, ret->st_uid),
@@ -2018,9 +2001,8 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid,
2018 err = 0; 2001 err = 0;
2019 clnt = file_fid->clnt; 2002 clnt = file_fid->clnt;
2020 attr_fid = p9_fid_create(clnt); 2003 attr_fid = p9_fid_create(clnt);
2021 if (IS_ERR(attr_fid)) { 2004 if (!attr_fid) {
2022 err = PTR_ERR(attr_fid); 2005 err = -ENOMEM;
2023 attr_fid = NULL;
2024 goto error; 2006 goto error;
2025 } 2007 }
2026 p9_debug(P9_DEBUG_9P, 2008 p9_debug(P9_DEBUG_9P,
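The client.c changes above replace the hand-rolled p9_idpool/fidlist bookkeeping with an IDR keyed by fid number: idr_init() at client creation, idr_preload() plus idr_alloc_u32() under clnt->lock in p9_fid_create(), idr_remove() in p9_fid_destroy(), and idr_for_each_entry() to flag unclunked fids at teardown. A minimal sketch of the same allocate-under-lock pattern, assuming a hypothetical object table (the obj_* names are not part of the 9p code):

#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/kernel.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(obj_lock);
static DEFINE_IDR(obj_idr);

/* Allocate a numeric id for obj, mirroring p9_fid_create() above:
 * preallocate outside the lock, then do a non-sleeping allocation inside. */
static int obj_track(void *obj, u32 *id)
{
        int ret;

        *id = 0;
        idr_preload(GFP_KERNEL);
        spin_lock_irq(&obj_lock);
        ret = idr_alloc_u32(&obj_idr, obj, id, UINT_MAX - 1, GFP_NOWAIT);
        spin_unlock_irq(&obj_lock);
        idr_preload_end();

        return ret;     /* 0 on success, -ENOSPC or -ENOMEM on failure */
}

static void obj_untrack(u32 id)
{
        spin_lock_irq(&obj_lock);
        idr_remove(&obj_idr, id);
        spin_unlock_irq(&obj_lock);
}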
diff --git a/net/9p/mod.c b/net/9p/mod.c
index eb9777f05755..253ba824a325 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -171,13 +171,11 @@ void v9fs_put_trans(struct p9_trans_module *m)
171 */ 171 */
172static int __init init_p9(void) 172static int __init init_p9(void)
173{ 173{
174 int ret = 0;
175
176 p9_error_init(); 174 p9_error_init();
177 pr_info("Installing 9P2000 support\n"); 175 pr_info("Installing 9P2000 support\n");
178 p9_trans_fd_init(); 176 p9_trans_fd_init();
179 177
180 return ret; 178 return 0;
181} 179}
182 180
183/** 181/**
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index 931ea00c4fed..4a1e1dd30b52 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -156,7 +156,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
156 156
157 *sptr = kmalloc(len + 1, GFP_NOFS); 157 *sptr = kmalloc(len + 1, GFP_NOFS);
158 if (*sptr == NULL) { 158 if (*sptr == NULL) {
159 errcode = -EFAULT; 159 errcode = -ENOMEM;
160 break; 160 break;
161 } 161 }
162 if (pdu_read(pdu, *sptr, len)) { 162 if (pdu_read(pdu, *sptr, len)) {
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 588bf88c3305..e2ef3c782c53 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -185,6 +185,8 @@ static void p9_mux_poll_stop(struct p9_conn *m)
185 spin_lock_irqsave(&p9_poll_lock, flags); 185 spin_lock_irqsave(&p9_poll_lock, flags);
186 list_del_init(&m->poll_pending_link); 186 list_del_init(&m->poll_pending_link);
187 spin_unlock_irqrestore(&p9_poll_lock, flags); 187 spin_unlock_irqrestore(&p9_poll_lock, flags);
188
189 flush_work(&p9_poll_work);
188} 190}
189 191
190/** 192/**
@@ -197,15 +199,14 @@ static void p9_mux_poll_stop(struct p9_conn *m)
197static void p9_conn_cancel(struct p9_conn *m, int err) 199static void p9_conn_cancel(struct p9_conn *m, int err)
198{ 200{
199 struct p9_req_t *req, *rtmp; 201 struct p9_req_t *req, *rtmp;
200 unsigned long flags;
201 LIST_HEAD(cancel_list); 202 LIST_HEAD(cancel_list);
202 203
203 p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err); 204 p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
204 205
205 spin_lock_irqsave(&m->client->lock, flags); 206 spin_lock(&m->client->lock);
206 207
207 if (m->err) { 208 if (m->err) {
208 spin_unlock_irqrestore(&m->client->lock, flags); 209 spin_unlock(&m->client->lock);
209 return; 210 return;
210 } 211 }
211 212
@@ -217,7 +218,6 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
217 list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { 218 list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
218 list_move(&req->req_list, &cancel_list); 219 list_move(&req->req_list, &cancel_list);
219 } 220 }
220 spin_unlock_irqrestore(&m->client->lock, flags);
221 221
222 list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { 222 list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
223 p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); 223 p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
@@ -226,6 +226,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
226 req->t_err = err; 226 req->t_err = err;
227 p9_client_cb(m->client, req, REQ_STATUS_ERROR); 227 p9_client_cb(m->client, req, REQ_STATUS_ERROR);
228 } 228 }
229 spin_unlock(&m->client->lock);
229} 230}
230 231
231static __poll_t 232static __poll_t
@@ -324,7 +325,9 @@ static void p9_read_work(struct work_struct *work)
324 if ((!m->req) && (m->rc.offset == m->rc.capacity)) { 325 if ((!m->req) && (m->rc.offset == m->rc.capacity)) {
325 p9_debug(P9_DEBUG_TRANS, "got new header\n"); 326 p9_debug(P9_DEBUG_TRANS, "got new header\n");
326 327
327 err = p9_parse_header(&m->rc, NULL, NULL, NULL, 0); 328 /* Header size */
329 m->rc.size = 7;
330 err = p9_parse_header(&m->rc, &m->rc.size, NULL, NULL, 0);
328 if (err) { 331 if (err) {
329 p9_debug(P9_DEBUG_ERROR, 332 p9_debug(P9_DEBUG_ERROR,
330 "error parsing header: %d\n", err); 333 "error parsing header: %d\n", err);
@@ -369,12 +372,14 @@ static void p9_read_work(struct work_struct *work)
369 */ 372 */
370 if ((m->req) && (m->rc.offset == m->rc.capacity)) { 373 if ((m->req) && (m->rc.offset == m->rc.capacity)) {
371 p9_debug(P9_DEBUG_TRANS, "got new packet\n"); 374 p9_debug(P9_DEBUG_TRANS, "got new packet\n");
375 m->req->rc->size = m->rc.offset;
372 spin_lock(&m->client->lock); 376 spin_lock(&m->client->lock);
373 if (m->req->status != REQ_STATUS_ERROR) 377 if (m->req->status != REQ_STATUS_ERROR)
374 status = REQ_STATUS_RCVD; 378 status = REQ_STATUS_RCVD;
375 list_del(&m->req->req_list); 379 list_del(&m->req->req_list);
376 spin_unlock(&m->client->lock); 380 /* update req->status while holding client->lock */
377 p9_client_cb(m->client, m->req, status); 381 p9_client_cb(m->client, m->req, status);
382 spin_unlock(&m->client->lock);
378 m->rc.sdata = NULL; 383 m->rc.sdata = NULL;
379 m->rc.offset = 0; 384 m->rc.offset = 0;
380 m->rc.capacity = 0; 385 m->rc.capacity = 0;
@@ -940,7 +945,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
940 if (err < 0) 945 if (err < 0)
941 return err; 946 return err;
942 947
943 if (valid_ipaddr4(addr) < 0) 948 if (addr == NULL || valid_ipaddr4(addr) < 0)
944 return -EINVAL; 949 return -EINVAL;
945 950
946 csocket = NULL; 951 csocket = NULL;
@@ -990,6 +995,9 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
990 995
991 csocket = NULL; 996 csocket = NULL;
992 997
998 if (addr == NULL)
999 return -EINVAL;
1000
993 if (strlen(addr) >= UNIX_PATH_MAX) { 1001 if (strlen(addr) >= UNIX_PATH_MAX) {
994 pr_err("%s (%d): address too long: %s\n", 1002 pr_err("%s (%d): address too long: %s\n",
995 __func__, task_pid_nr(current), addr); 1003 __func__, task_pid_nr(current), addr);
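Several 9p hunks in this merge (client.c above, and trans_fd.c/trans_virtio.c) also switch struct p9_req_t from a separately kmalloc'ed wait_queue_head_t pointer to an embedded wait queue, so the wait/wake pair becomes wait_event_killable(req->wq, ...) and wake_up(&req->wq) with no extra allocation or kfree. A stripped-down sketch of that pattern (the demo_ type and names are illustrative):

#include <linux/wait.h>

/* Illustrative request structure with an embedded wait queue, as in
 * struct p9_req_t after this series. */
struct demo_req {
        int status;                     /* written by the completion path */
        wait_queue_head_t wq;
};

static void demo_req_init(struct demo_req *req)
{
        req->status = 0;
        init_waitqueue_head(&req->wq);  /* no separate kmalloc/kfree needed */
}

static void demo_complete(struct demo_req *req, int status)
{
        /* make earlier writes to the request visible before status flips */
        smp_wmb();
        req->status = status;
        wake_up(&req->wq);
}

static int demo_wait(struct demo_req *req)
{
        /* returns 0 once status advances, or -ERESTARTSYS on a fatal signal */
        return wait_event_killable(req->wq, req->status >= 1);
}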
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 3d414acb7015..b513cffeeb3c 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -320,6 +320,7 @@ recv_done(struct ib_cq *cq, struct ib_wc *wc)
320 if (wc->status != IB_WC_SUCCESS) 320 if (wc->status != IB_WC_SUCCESS)
321 goto err_out; 321 goto err_out;
322 322
323 c->rc->size = wc->byte_len;
323 err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); 324 err = p9_parse_header(c->rc, NULL, NULL, &tag, 1);
324 if (err) 325 if (err)
325 goto err_out; 326 goto err_out;
@@ -396,7 +397,7 @@ static int
396post_recv(struct p9_client *client, struct p9_rdma_context *c) 397post_recv(struct p9_client *client, struct p9_rdma_context *c)
397{ 398{
398 struct p9_trans_rdma *rdma = client->trans; 399 struct p9_trans_rdma *rdma = client->trans;
399 struct ib_recv_wr wr, *bad_wr; 400 struct ib_recv_wr wr;
400 struct ib_sge sge; 401 struct ib_sge sge;
401 402
402 c->busa = ib_dma_map_single(rdma->cm_id->device, 403 c->busa = ib_dma_map_single(rdma->cm_id->device,
@@ -415,7 +416,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
415 wr.wr_cqe = &c->cqe; 416 wr.wr_cqe = &c->cqe;
416 wr.sg_list = &sge; 417 wr.sg_list = &sge;
417 wr.num_sge = 1; 418 wr.num_sge = 1;
418 return ib_post_recv(rdma->qp, &wr, &bad_wr); 419 return ib_post_recv(rdma->qp, &wr, NULL);
419 420
420 error: 421 error:
421 p9_debug(P9_DEBUG_ERROR, "EIO\n"); 422 p9_debug(P9_DEBUG_ERROR, "EIO\n");
@@ -425,7 +426,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
425static int rdma_request(struct p9_client *client, struct p9_req_t *req) 426static int rdma_request(struct p9_client *client, struct p9_req_t *req)
426{ 427{
427 struct p9_trans_rdma *rdma = client->trans; 428 struct p9_trans_rdma *rdma = client->trans;
428 struct ib_send_wr wr, *bad_wr; 429 struct ib_send_wr wr;
429 struct ib_sge sge; 430 struct ib_sge sge;
430 int err = 0; 431 int err = 0;
431 unsigned long flags; 432 unsigned long flags;
@@ -520,7 +521,7 @@ dont_need_post_recv:
520 * status in case of a very fast reply. 521 * status in case of a very fast reply.
521 */ 522 */
522 req->status = REQ_STATUS_SENT; 523 req->status = REQ_STATUS_SENT;
523 err = ib_post_send(rdma->qp, &wr, &bad_wr); 524 err = ib_post_send(rdma->qp, &wr, NULL);
524 if (err) 525 if (err)
525 goto send_error; 526 goto send_error;
526 527
@@ -644,6 +645,9 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
644 struct rdma_conn_param conn_param; 645 struct rdma_conn_param conn_param;
645 struct ib_qp_init_attr qp_attr; 646 struct ib_qp_init_attr qp_attr;
646 647
648 if (addr == NULL)
649 return -EINVAL;
650
647 /* Parse the transport specific mount options */ 651 /* Parse the transport specific mount options */
648 err = parse_opts(args, &opts); 652 err = parse_opts(args, &opts);
649 if (err < 0) 653 if (err < 0)
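The bad_wr changes above track the 4.19 RDMA verbs API, where ib_post_send() and ib_post_recv() take a const work request and accept NULL for the bad-WR out-pointer when the caller does not need to know which request failed. A small hypothetical helper showing the same call shape (the demo_ name is not from this diff):

#include <rdma/ib_verbs.h>

/* Hypothetical helper: post one receive WR. With the 4.19 verbs API the
 * bad-WR out-pointer may simply be NULL when it is not of interest. */
static int demo_post_recv(struct ib_qp *qp, struct ib_sge *sge,
                          struct ib_cqe *cqe)
{
        struct ib_recv_wr wr = {
                .wr_cqe  = cqe,
                .sg_list = sge,
                .num_sge = 1,
        };

        return ib_post_recv(qp, &wr, NULL);
}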
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 05006cbb3361..7728b0acde09 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -89,10 +89,8 @@ struct virtio_chan {
89 unsigned long p9_max_pages; 89 unsigned long p9_max_pages;
90 /* Scatterlist: can be too big for stack. */ 90 /* Scatterlist: can be too big for stack. */
91 struct scatterlist sg[VIRTQUEUE_NUM]; 91 struct scatterlist sg[VIRTQUEUE_NUM];
92
93 int tag_len;
94 /* 92 /*
95 * tag name to identify a mount Non-null terminated 93 * tag name to identify a mount null terminated
96 */ 94 */
97 char *tag; 95 char *tag;
98 96
@@ -144,24 +142,27 @@ static void req_done(struct virtqueue *vq)
144 struct virtio_chan *chan = vq->vdev->priv; 142 struct virtio_chan *chan = vq->vdev->priv;
145 unsigned int len; 143 unsigned int len;
146 struct p9_req_t *req; 144 struct p9_req_t *req;
145 bool need_wakeup = false;
147 unsigned long flags; 146 unsigned long flags;
148 147
149 p9_debug(P9_DEBUG_TRANS, ": request done\n"); 148 p9_debug(P9_DEBUG_TRANS, ": request done\n");
150 149
151 while (1) { 150 spin_lock_irqsave(&chan->lock, flags);
152 spin_lock_irqsave(&chan->lock, flags); 151 while ((req = virtqueue_get_buf(chan->vq, &len)) != NULL) {
153 req = virtqueue_get_buf(chan->vq, &len); 152 if (!chan->ring_bufs_avail) {
154 if (req == NULL) { 153 chan->ring_bufs_avail = 1;
155 spin_unlock_irqrestore(&chan->lock, flags); 154 need_wakeup = true;
156 break;
157 } 155 }
158 chan->ring_bufs_avail = 1; 156
159 spin_unlock_irqrestore(&chan->lock, flags); 157 if (len) {
160 /* Wakeup if anyone waiting for VirtIO ring space. */ 158 req->rc->size = len;
161 wake_up(chan->vc_wq);
162 if (len)
163 p9_client_cb(chan->client, req, REQ_STATUS_RCVD); 159 p9_client_cb(chan->client, req, REQ_STATUS_RCVD);
160 }
164 } 161 }
162 spin_unlock_irqrestore(&chan->lock, flags);
163 /* Wakeup if anyone waiting for VirtIO ring space. */
164 if (need_wakeup)
165 wake_up(chan->vc_wq);
165} 166}
166 167
167/** 168/**
@@ -188,7 +189,7 @@ static int pack_sg_list(struct scatterlist *sg, int start,
188 s = rest_of_page(data); 189 s = rest_of_page(data);
189 if (s > count) 190 if (s > count)
190 s = count; 191 s = count;
191 BUG_ON(index > limit); 192 BUG_ON(index >= limit);
192 /* Make sure we don't terminate early. */ 193 /* Make sure we don't terminate early. */
193 sg_unmark_end(&sg[index]); 194 sg_unmark_end(&sg[index]);
194 sg_set_buf(&sg[index++], data, s); 195 sg_set_buf(&sg[index++], data, s);
@@ -233,6 +234,7 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit,
233 s = PAGE_SIZE - data_off; 234 s = PAGE_SIZE - data_off;
234 if (s > count) 235 if (s > count)
235 s = count; 236 s = count;
237 BUG_ON(index >= limit);
236 /* Make sure we don't terminate early. */ 238 /* Make sure we don't terminate early. */
237 sg_unmark_end(&sg[index]); 239 sg_unmark_end(&sg[index]);
238 sg_set_page(&sg[index++], pdata[i++], s, data_off); 240 sg_set_page(&sg[index++], pdata[i++], s, data_off);
@@ -382,8 +384,8 @@ static int p9_get_mapped_pages(struct virtio_chan *chan,
382 * p9_virtio_zc_request - issue a zero copy request 384 * p9_virtio_zc_request - issue a zero copy request
383 * @client: client instance issuing the request 385 * @client: client instance issuing the request
384 * @req: request to be issued 386 * @req: request to be issued
385 * @uidata: user bffer that should be ued for zero copy read 387 * @uidata: user buffer that should be used for zero copy read
386 * @uodata: user buffer that shoud be user for zero copy write 388 * @uodata: user buffer that should be used for zero copy write
387 * @inlen: read buffer size 389 * @inlen: read buffer size
388 * @outlen: write buffer size 390 * @outlen: write buffer size
389 * @in_hdr_len: reader header size, This is the size of response protocol data 391 * @in_hdr_len: reader header size, This is the size of response protocol data
@@ -406,6 +408,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
406 p9_debug(P9_DEBUG_TRANS, "virtio request\n"); 408 p9_debug(P9_DEBUG_TRANS, "virtio request\n");
407 409
408 if (uodata) { 410 if (uodata) {
411 __le32 sz;
409 int n = p9_get_mapped_pages(chan, &out_pages, uodata, 412 int n = p9_get_mapped_pages(chan, &out_pages, uodata,
410 outlen, &offs, &need_drop); 413 outlen, &offs, &need_drop);
411 if (n < 0) 414 if (n < 0)
@@ -416,6 +419,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
416 memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4); 419 memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4);
417 outlen = n; 420 outlen = n;
418 } 421 }
422 /* The size field of the message must include the length of the
423 * header and the length of the data. We didn't actually know
424 * the length of the data until this point so add it in now.
425 */
426 sz = cpu_to_le32(req->tc->size + outlen);
427 memcpy(&req->tc->sdata[0], &sz, sizeof(sz));
419 } else if (uidata) { 428 } else if (uidata) {
420 int n = p9_get_mapped_pages(chan, &in_pages, uidata, 429 int n = p9_get_mapped_pages(chan, &in_pages, uidata,
421 inlen, &offs, &need_drop); 430 inlen, &offs, &need_drop);
@@ -446,7 +455,7 @@ req_retry_pinned:
446 out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, 455 out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
447 out_pages, out_nr_pages, offs, outlen); 456 out_pages, out_nr_pages, offs, outlen);
448 } 457 }
449 458
450 /* 459 /*
451 * Take care of in data 460 * Take care of in data
452 * For example TREAD have 11. 461 * For example TREAD have 11.
@@ -490,7 +499,7 @@ req_retry_pinned:
490 virtqueue_kick(chan->vq); 499 virtqueue_kick(chan->vq);
491 spin_unlock_irqrestore(&chan->lock, flags); 500 spin_unlock_irqrestore(&chan->lock, flags);
492 p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); 501 p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
493 err = wait_event_killable(*req->wq, req->status >= REQ_STATUS_RCVD); 502 err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD);
494 /* 503 /*
495 * Non kernel buffers are pinned, unpin them 504 * Non kernel buffers are pinned, unpin them
496 */ 505 */
@@ -517,14 +526,15 @@ static ssize_t p9_mount_tag_show(struct device *dev,
517{ 526{
518 struct virtio_chan *chan; 527 struct virtio_chan *chan;
519 struct virtio_device *vdev; 528 struct virtio_device *vdev;
529 int tag_len;
520 530
521 vdev = dev_to_virtio(dev); 531 vdev = dev_to_virtio(dev);
522 chan = vdev->priv; 532 chan = vdev->priv;
533 tag_len = strlen(chan->tag);
523 534
524 memcpy(buf, chan->tag, chan->tag_len); 535 memcpy(buf, chan->tag, tag_len + 1);
525 buf[chan->tag_len] = 0;
526 536
527 return chan->tag_len + 1; 537 return tag_len + 1;
528} 538}
529 539
530static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL); 540static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL);
@@ -563,7 +573,7 @@ static int p9_virtio_probe(struct virtio_device *vdev)
563 chan->vq = virtio_find_single_vq(vdev, req_done, "requests"); 573 chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
564 if (IS_ERR(chan->vq)) { 574 if (IS_ERR(chan->vq)) {
565 err = PTR_ERR(chan->vq); 575 err = PTR_ERR(chan->vq);
566 goto out_free_vq; 576 goto out_free_chan;
567 } 577 }
568 chan->vq->vdev->priv = chan; 578 chan->vq->vdev->priv = chan;
569 spin_lock_init(&chan->lock); 579 spin_lock_init(&chan->lock);
@@ -577,7 +587,7 @@ static int p9_virtio_probe(struct virtio_device *vdev)
577 err = -EINVAL; 587 err = -EINVAL;
578 goto out_free_vq; 588 goto out_free_vq;
579 } 589 }
580 tag = kmalloc(tag_len, GFP_KERNEL); 590 tag = kzalloc(tag_len + 1, GFP_KERNEL);
581 if (!tag) { 591 if (!tag) {
582 err = -ENOMEM; 592 err = -ENOMEM;
583 goto out_free_vq; 593 goto out_free_vq;
@@ -586,7 +596,6 @@ static int p9_virtio_probe(struct virtio_device *vdev)
586 virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag), 596 virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag),
587 tag, tag_len); 597 tag, tag_len);
588 chan->tag = tag; 598 chan->tag = tag;
589 chan->tag_len = tag_len;
590 err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); 599 err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
591 if (err) { 600 if (err) {
592 goto out_free_tag; 601 goto out_free_tag;
@@ -616,6 +625,7 @@ out_free_tag:
616 kfree(tag); 625 kfree(tag);
617out_free_vq: 626out_free_vq:
618 vdev->config->del_vqs(vdev); 627 vdev->config->del_vqs(vdev);
628out_free_chan:
619 kfree(chan); 629 kfree(chan);
620fail: 630fail:
621 return err; 631 return err;
@@ -643,10 +653,12 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
643 int ret = -ENOENT; 653 int ret = -ENOENT;
644 int found = 0; 654 int found = 0;
645 655
656 if (devname == NULL)
657 return -EINVAL;
658
646 mutex_lock(&virtio_9p_lock); 659 mutex_lock(&virtio_9p_lock);
647 list_for_each_entry(chan, &virtio_chan_list, chan_list) { 660 list_for_each_entry(chan, &virtio_chan_list, chan_list) {
648 if (!strncmp(devname, chan->tag, chan->tag_len) && 661 if (!strcmp(devname, chan->tag)) {
649 strlen(devname) == chan->tag_len) {
650 if (!chan->inuse) { 662 if (!chan->inuse) {
651 chan->inuse = true; 663 chan->inuse = true;
652 found = 1; 664 found = 1;
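The trans_virtio.c changes above store the mount tag NUL-terminated (kzalloc(tag_len + 1) before virtio_cread_bytes()), which lets both the sysfs show routine and the mount-time lookup use strlen()/strcmp() instead of carrying tag_len around. A tiny sketch of that copy, assuming a hypothetical demo_dup_tag() helper:

#include <linux/slab.h>
#include <linux/string.h>

/* Hypothetical helper: copy a fixed-length, possibly unterminated device tag
 * into a NUL-terminated buffer so later lookups can use plain strcmp(). */
static char *demo_dup_tag(const char *raw, size_t raw_len)
{
        char *tag = kzalloc(raw_len + 1, GFP_KERNEL);   /* +1 keeps the NUL */

        if (!tag)
                return NULL;
        memcpy(tag, raw, raw_len);
        return tag;
}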
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 2e2b8bca54f3..c2d54ac76bfd 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -94,6 +94,9 @@ static int p9_xen_create(struct p9_client *client, const char *addr, char *args)
94{ 94{
95 struct xen_9pfs_front_priv *priv; 95 struct xen_9pfs_front_priv *priv;
96 96
97 if (addr == NULL)
98 return -EINVAL;
99
97 read_lock(&xen_9pfs_lock); 100 read_lock(&xen_9pfs_lock);
98 list_for_each_entry(priv, &xen_9pfs_devs, list) { 101 list_for_each_entry(priv, &xen_9pfs_devs, list) {
99 if (!strcmp(priv->tag, addr)) { 102 if (!strcmp(priv->tag, addr)) {
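This merge also adds an addr/devname NULL check to every 9p transport create callback (trans_fd, trans_rdma, trans_virtio, trans_xen) so a mount issued without a device or tag string fails with -EINVAL instead of passing NULL into strlen()/strcmp(). A hypothetical callback skeleton showing the guard (the demo_ name is illustrative):

#include <linux/errno.h>
#include <net/9p/client.h>

/* Hypothetical transport create callback showing the guard this merge adds
 * across trans_fd, trans_rdma, trans_virtio and trans_xen. */
static int demo_trans_create(struct p9_client *client, const char *addr,
                             char *args)
{
        if (addr == NULL)       /* e.g. mount issued without a device/tag */
                return -EINVAL;

        /* ... find the device matching addr and attach it to client ... */
        return 0;
}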
diff --git a/net/9p/util.c b/net/9p/util.c
index 59f278e64f58..55ad98277e85 100644
--- a/net/9p/util.c
+++ b/net/9p/util.c
@@ -138,4 +138,3 @@ int p9_idpool_check(int id, struct p9_idpool *p)
138 return idr_find(&p->pool, id) != NULL; 138 return idr_find(&p->pool, id) != NULL;
139} 139}
140EXPORT_SYMBOL(p9_idpool_check); 140EXPORT_SYMBOL(p9_idpool_check);
141
diff --git a/net/Kconfig b/net/Kconfig
index f738a6f27665..228dfa382eec 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -12,7 +12,7 @@ menuconfig NET
12 The reason is that some programs need kernel networking support even 12 The reason is that some programs need kernel networking support even
13 when running on a stand-alone machine that isn't connected to any 13 when running on a stand-alone machine that isn't connected to any
14 other computer. 14 other computer.
15 15
16 If you are upgrading from an older kernel, you 16 If you are upgrading from an older kernel, you
17 should consider updating your networking tools too because changes 17 should consider updating your networking tools too because changes
18 in the kernel and the tools often go hand in hand. The tools are 18 in the kernel and the tools often go hand in hand. The tools are
diff --git a/net/Makefile b/net/Makefile
index 13ec0d5415c7..bdaf53925acd 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,11 +20,7 @@ obj-$(CONFIG_TLS) += tls/
20obj-$(CONFIG_XFRM) += xfrm/ 20obj-$(CONFIG_XFRM) += xfrm/
21obj-$(CONFIG_UNIX) += unix/ 21obj-$(CONFIG_UNIX) += unix/
22obj-$(CONFIG_NET) += ipv6/ 22obj-$(CONFIG_NET) += ipv6/
23ifneq ($(CC_CAN_LINK),y)
24$(warning CC cannot link executables. Skipping bpfilter.)
25else
26obj-$(CONFIG_BPFILTER) += bpfilter/ 23obj-$(CONFIG_BPFILTER) += bpfilter/
27endif
28obj-$(CONFIG_PACKET) += packet/ 24obj-$(CONFIG_PACKET) += packet/
29obj-$(CONFIG_NET_KEY) += key/ 25obj-$(CONFIG_NET_KEY) += key/
30obj-$(CONFIG_BRIDGE) += bridge/ 26obj-$(CONFIG_BRIDGE) += bridge/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 55fdba05d7d9..9b6bc5abe946 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1869,7 +1869,7 @@ static const struct proto_ops atalk_dgram_ops = {
1869 .socketpair = sock_no_socketpair, 1869 .socketpair = sock_no_socketpair,
1870 .accept = sock_no_accept, 1870 .accept = sock_no_accept,
1871 .getname = atalk_getname, 1871 .getname = atalk_getname,
1872 .poll_mask = datagram_poll_mask, 1872 .poll = datagram_poll,
1873 .ioctl = atalk_ioctl, 1873 .ioctl = atalk_ioctl,
1874#ifdef CONFIG_COMPAT 1874#ifdef CONFIG_COMPAT
1875 .compat_ioctl = atalk_compat_ioctl, 1875 .compat_ioctl = atalk_compat_ioctl,
diff --git a/net/atm/common.c b/net/atm/common.c
index ff5748b2190f..9f8cb0d2e71e 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -647,11 +647,16 @@ out:
647 return error; 647 return error;
648} 648}
649 649
650__poll_t vcc_poll_mask(struct socket *sock, __poll_t events) 650__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
651{ 651{
652 struct sock *sk = sock->sk; 652 struct sock *sk = sock->sk;
653 struct atm_vcc *vcc = ATM_SD(sock); 653 struct atm_vcc *vcc;
654 __poll_t mask = 0; 654 __poll_t mask;
655
656 sock_poll_wait(file, wait);
657 mask = 0;
658
659 vcc = ATM_SD(sock);
655 660
656 /* exceptional events */ 661 /* exceptional events */
657 if (sk->sk_err) 662 if (sk->sk_err)
diff --git a/net/atm/common.h b/net/atm/common.h
index 526796ad230f..5850649068bb 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci);
17int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 17int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
18 int flags); 18 int flags);
19int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len); 19int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len);
20__poll_t vcc_poll_mask(struct socket *sock, __poll_t events); 20__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait);
21int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 21int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
22int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 22int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
23int vcc_setsockopt(struct socket *sock, int level, int optname, 23int vcc_setsockopt(struct socket *sock, int level, int optname,
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index b93cc0f18292..46d6cd9a36ae 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -307,9 +307,3 @@ void mpc_proc_clean(void)
307} 307}
308 308
309#endif /* CONFIG_PROC_FS */ 309#endif /* CONFIG_PROC_FS */
310
311
312
313
314
315
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index af8c4b38b746..d84227d75717 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -244,7 +244,7 @@ static int pppoatm_may_send(struct pppoatm_vcc *pvcc, int size)
244 * the packet count limit, so... 244 * the packet count limit, so...
245 */ 245 */
246 if (atm_may_send(pvcc->atmvcc, size) && 246 if (atm_may_send(pvcc->atmvcc, size) &&
247 atomic_inc_not_zero_hint(&pvcc->inflight, NONE_INFLIGHT)) 247 atomic_inc_not_zero(&pvcc->inflight))
248 return 1; 248 return 1;
249 249
250 /* 250 /*
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 9f75092fe778..2cb10af16afc 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -113,7 +113,7 @@ static const struct proto_ops pvc_proto_ops = {
113 .socketpair = sock_no_socketpair, 113 .socketpair = sock_no_socketpair,
114 .accept = sock_no_accept, 114 .accept = sock_no_accept,
115 .getname = pvc_getname, 115 .getname = pvc_getname,
116 .poll_mask = vcc_poll_mask, 116 .poll = vcc_poll,
117 .ioctl = vcc_ioctl, 117 .ioctl = vcc_ioctl,
118#ifdef CONFIG_COMPAT 118#ifdef CONFIG_COMPAT
119 .compat_ioctl = vcc_compat_ioctl, 119 .compat_ioctl = vcc_compat_ioctl,
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 53f4ad7087b1..2f91b766ac42 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -636,7 +636,7 @@ static const struct proto_ops svc_proto_ops = {
636 .socketpair = sock_no_socketpair, 636 .socketpair = sock_no_socketpair,
637 .accept = svc_accept, 637 .accept = svc_accept,
638 .getname = svc_getname, 638 .getname = svc_getname,
639 .poll_mask = vcc_poll_mask, 639 .poll = vcc_poll,
640 .ioctl = svc_ioctl, 640 .ioctl = svc_ioctl,
641#ifdef CONFIG_COMPAT 641#ifdef CONFIG_COMPAT
642 .compat_ioctl = svc_compat_ioctl, 642 .compat_ioctl = svc_compat_ioctl,
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index d1d2442ce573..c603d33d5410 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1941,7 +1941,7 @@ static const struct proto_ops ax25_proto_ops = {
1941 .socketpair = sock_no_socketpair, 1941 .socketpair = sock_no_socketpair,
1942 .accept = ax25_accept, 1942 .accept = ax25_accept,
1943 .getname = ax25_getname, 1943 .getname = ax25_getname,
1944 .poll_mask = datagram_poll_mask, 1944 .poll = datagram_poll,
1945 .ioctl = ax25_ioctl, 1945 .ioctl = ax25_ioctl,
1946 .listen = ax25_listen, 1946 .listen = ax25_listen,
1947 .shutdown = ax25_shutdown, 1947 .shutdown = ax25_shutdown,
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
index ac2542b7be88..a14cfa736b63 100644
--- a/net/ax25/ax25_addr.c
+++ b/net/ax25/ax25_addr.c
@@ -304,4 +304,3 @@ void ax25_digi_invert(const ax25_digi *in, ax25_digi *out)
304 } 304 }
305 } 305 }
306} 306}
307
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index 891596e74278..488fc2d7085a 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -299,4 +299,3 @@ int ax25_ds_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type)
299 299
300 return queued; 300 return queued;
301} 301}
302
diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c
index 28827e81ba2b..bc0329f43013 100644
--- a/net/ax25/ax25_ds_subr.c
+++ b/net/ax25/ax25_ds_subr.c
@@ -205,4 +205,3 @@ void ax25_dama_off(ax25_cb *ax25)
205 ax25->condition &= ~AX25_COND_DAMA_MODE; 205 ax25->condition &= ~AX25_COND_DAMA_MODE;
206 ax25_dev_dama_off(ax25->ax25_dev); 206 ax25_dev_dama_off(ax25->ax25_dev);
207} 207}
208
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index 183b1c583d56..70417e9b932d 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -249,4 +249,3 @@ const struct header_ops ax25_header_ops = {
249 249
250EXPORT_SYMBOL(ax25_header_ops); 250EXPORT_SYMBOL(ax25_header_ops);
251EXPORT_SYMBOL(ax25_ip_xmit); 251EXPORT_SYMBOL(ax25_ip_xmit);
252
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
index b11a5f466fcc..3e5afc8dc93e 100644
--- a/net/ax25/ax25_out.c
+++ b/net/ax25/ax25_out.c
@@ -394,4 +394,3 @@ int ax25_check_iframes_acked(ax25_cb *ax25, unsigned short nr)
394 } 394 }
395 return 0; 395 return 0;
396} 396}
397
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index de8034d80623..361116f77cb9 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -24,7 +24,6 @@ config BATMAN_ADV
24 depends on NET 24 depends on NET
25 select CRC16 25 select CRC16
26 select LIBCRC32C 26 select LIBCRC32C
27 default n
28 help 27 help
29 B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is 28 B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
30 a routing protocol for multi-hop ad-hoc mesh networks. The 29 a routing protocol for multi-hop ad-hoc mesh networks. The
@@ -33,7 +32,7 @@ config BATMAN_ADV
33 tools. 32 tools.
34 33
35config BATMAN_ADV_BATMAN_V 34config BATMAN_ADV_BATMAN_V
36 bool "B.A.T.M.A.N. V protocol (experimental)" 35 bool "B.A.T.M.A.N. V protocol"
37 depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y) 36 depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
38 default y 37 default y
39 help 38 help
@@ -60,7 +59,7 @@ config BATMAN_ADV_BLA
60config BATMAN_ADV_DAT 59config BATMAN_ADV_DAT
61 bool "Distributed ARP Table" 60 bool "Distributed ARP Table"
62 depends on BATMAN_ADV && INET 61 depends on BATMAN_ADV && INET
63 default n 62 default y
64 help 63 help
65 This option enables DAT (Distributed ARP Table), a DHT based 64 This option enables DAT (Distributed ARP Table), a DHT based
66 mechanism that increases ARP reliability on sparse wireless 65 mechanism that increases ARP reliability on sparse wireless
@@ -70,7 +69,6 @@ config BATMAN_ADV_DAT
70config BATMAN_ADV_NC 69config BATMAN_ADV_NC
71 bool "Network Coding" 70 bool "Network Coding"
72 depends on BATMAN_ADV 71 depends on BATMAN_ADV
73 default n
74 help 72 help
75 This option enables network coding, a mechanism that aims to 73 This option enables network coding, a mechanism that aims to
76 increase the overall network throughput by fusing multiple 74 increase the overall network throughput by fusing multiple
@@ -84,7 +82,6 @@ config BATMAN_ADV_NC
84config BATMAN_ADV_MCAST 82config BATMAN_ADV_MCAST
85 bool "Multicast optimisation" 83 bool "Multicast optimisation"
86 depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) 84 depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y)
87 default n
88 help 85 help
89 This option enables the multicast optimisation which aims to 86 This option enables the multicast optimisation which aims to
90 reduce the air overhead while improving the reliability of 87 reduce the air overhead while improving the reliability of
@@ -94,7 +91,6 @@ config BATMAN_ADV_DEBUGFS
94 bool "batman-adv debugfs entries" 91 bool "batman-adv debugfs entries"
95 depends on BATMAN_ADV 92 depends on BATMAN_ADV
96 depends on DEBUG_FS 93 depends on DEBUG_FS
97 default n
98 help 94 help
99 Enable this to export routing related debug tables via debugfs. 95 Enable this to export routing related debug tables via debugfs.
100 The information for each soft-interface and used hard-interface can be 96 The information for each soft-interface and used hard-interface can be
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index be09a9883825..73bf6a93a3cf 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -2732,7 +2732,7 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
2732{ 2732{
2733 struct batadv_neigh_ifinfo *router_ifinfo = NULL; 2733 struct batadv_neigh_ifinfo *router_ifinfo = NULL;
2734 struct batadv_neigh_node *router; 2734 struct batadv_neigh_node *router;
2735 struct batadv_gw_node *curr_gw; 2735 struct batadv_gw_node *curr_gw = NULL;
2736 int ret = 0; 2736 int ret = 0;
2737 void *hdr; 2737 void *hdr;
2738 2738
@@ -2780,6 +2780,8 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
2780 ret = 0; 2780 ret = 0;
2781 2781
2782out: 2782out:
2783 if (curr_gw)
2784 batadv_gw_node_put(curr_gw);
2783 if (router_ifinfo) 2785 if (router_ifinfo)
2784 batadv_neigh_ifinfo_put(router_ifinfo); 2786 batadv_neigh_ifinfo_put(router_ifinfo);
2785 if (router) 2787 if (router)
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index 317cafd302cf..3dc6a7a43eb7 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -16,11 +16,11 @@
16 * along with this program; if not, see <http://www.gnu.org/licenses/>. 16 * along with this program; if not, see <http://www.gnu.org/licenses/>.
17 */ 17 */
18 18
19#ifndef _BATMAN_ADV_BATADV_IV_OGM_H_ 19#ifndef _NET_BATMAN_ADV_BAT_IV_OGM_H_
20#define _BATMAN_ADV_BATADV_IV_OGM_H_ 20#define _NET_BATMAN_ADV_BAT_IV_OGM_H_
21 21
22#include "main.h" 22#include "main.h"
23 23
24int batadv_iv_init(void); 24int batadv_iv_init(void);
25 25
26#endif /* _BATMAN_ADV_BATADV_IV_OGM_H_ */ 26#endif /* _NET_BATMAN_ADV_BAT_IV_OGM_H_ */
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index ec93337ee259..6baec4e68898 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -927,7 +927,7 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
927{ 927{
928 struct batadv_neigh_ifinfo *router_ifinfo = NULL; 928 struct batadv_neigh_ifinfo *router_ifinfo = NULL;
929 struct batadv_neigh_node *router; 929 struct batadv_neigh_node *router;
930 struct batadv_gw_node *curr_gw; 930 struct batadv_gw_node *curr_gw = NULL;
931 int ret = 0; 931 int ret = 0;
932 void *hdr; 932 void *hdr;
933 933
@@ -995,6 +995,8 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
995 ret = 0; 995 ret = 0;
996 996
997out: 997out:
998 if (curr_gw)
999 batadv_gw_node_put(curr_gw);
998 if (router_ifinfo) 1000 if (router_ifinfo)
999 batadv_neigh_ifinfo_put(router_ifinfo); 1001 batadv_neigh_ifinfo_put(router_ifinfo);
1000 if (router) 1002 if (router)
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index ed36c5e79fde..e5be14c908c6 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -16,8 +16,8 @@
16 * along with this program; if not, see <http://www.gnu.org/licenses/>. 16 * along with this program; if not, see <http://www.gnu.org/licenses/>.
17 */ 17 */
18 18
19#ifndef _BATMAN_ADV_BATADV_V_OGM_H_ 19#ifndef _NET_BATMAN_ADV_BAT_V_OGM_H_
20#define _BATMAN_ADV_BATADV_V_OGM_H_ 20#define _NET_BATMAN_ADV_BAT_V_OGM_H_
21 21
22#include "main.h" 22#include "main.h"
23 23
@@ -34,4 +34,4 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface);
34int batadv_v_ogm_packet_recv(struct sk_buff *skb, 34int batadv_v_ogm_packet_recv(struct sk_buff *skb,
35 struct batadv_hard_iface *if_incoming); 35 struct batadv_hard_iface *if_incoming);
36 36
37#endif /* _BATMAN_ADV_BATADV_V_OGM_H_ */ 37#endif /* _NET_BATMAN_ADV_BAT_V_OGM_H_ */
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index a2de5a44bd41..ff9659af6b91 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1449,7 +1449,7 @@ static void batadv_bla_periodic_work(struct work_struct *work)
1449 * detection frames. Set the locally administered bit to avoid 1449 * detection frames. Set the locally administered bit to avoid
1450 * collisions with users mac addresses. 1450 * collisions with users mac addresses.
1451 */ 1451 */
1452 random_ether_addr(bat_priv->bla.loopdetect_addr); 1452 eth_random_addr(bat_priv->bla.loopdetect_addr);
1453 bat_priv->bla.loopdetect_addr[0] = 0xba; 1453 bat_priv->bla.loopdetect_addr[0] = 0xba;
1454 bat_priv->bla.loopdetect_addr[1] = 0xbe; 1454 bat_priv->bla.loopdetect_addr[1] = 0xbe;
1455 bat_priv->bla.loopdetect_lasttime = jiffies; 1455 bat_priv->bla.loopdetect_lasttime = jiffies;
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 4229b01ac7b5..3cb82378300b 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -19,6 +19,7 @@
19#include "debugfs.h" 19#include "debugfs.h"
20#include "main.h" 20#include "main.h"
21 21
22#include <linux/dcache.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
23#include <linux/err.h> 24#include <linux/err.h>
24#include <linux/errno.h> 25#include <linux/errno.h>
@@ -117,7 +118,7 @@ static int batadv_bla_backbone_table_open(struct inode *inode,
117 118
118#ifdef CONFIG_BATMAN_ADV_DAT 119#ifdef CONFIG_BATMAN_ADV_DAT
119/** 120/**
120 * batadv_dat_cache_open() - Prepare file handler for reads from dat_chache 121 * batadv_dat_cache_open() - Prepare file handler for reads from dat_cache
121 * @inode: inode which was opened 122 * @inode: inode which was opened
122 * @file: file handle to be initialized 123 * @file: file handle to be initialized
123 * 124 *
@@ -344,6 +345,25 @@ out:
344} 345}
345 346
346/** 347/**
348 * batadv_debugfs_rename_hardif() - Fix debugfs path for renamed hardif
349 * @hard_iface: hard interface which was renamed
350 */
351void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface)
352{
353 const char *name = hard_iface->net_dev->name;
354 struct dentry *dir;
355 struct dentry *d;
356
357 dir = hard_iface->debug_dir;
358 if (!dir)
359 return;
360
361 d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
362 if (!d)
363 pr_err("Can't rename debugfs dir to %s\n", name);
364}
365
366/**
347 * batadv_debugfs_del_hardif() - delete the base directory for a hard interface 367 * batadv_debugfs_del_hardif() - delete the base directory for a hard interface
348 * in debugfs. 368 * in debugfs.
349 * @hard_iface: hard interface which is deleted. 369 * @hard_iface: hard interface which is deleted.
@@ -414,6 +434,26 @@ out:
414} 434}
415 435
416/** 436/**
437 * batadv_debugfs_rename_meshif() - Fix debugfs path for renamed softif
438 * @dev: net_device which was renamed
439 */
440void batadv_debugfs_rename_meshif(struct net_device *dev)
441{
442 struct batadv_priv *bat_priv = netdev_priv(dev);
443 const char *name = dev->name;
444 struct dentry *dir;
445 struct dentry *d;
446
447 dir = bat_priv->debug_dir;
448 if (!dir)
449 return;
450
451 d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
452 if (!d)
453 pr_err("Can't rename debugfs dir to %s\n", name);
454}
455
456/**
417 * batadv_debugfs_del_meshif() - Remove interface dependent debugfs entries 457 * batadv_debugfs_del_meshif() - Remove interface dependent debugfs entries
418 * @dev: netdev struct of the soft interface 458 * @dev: netdev struct of the soft interface
419 */ 459 */
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 37b069698b04..08a592ffbee5 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -30,8 +30,10 @@ struct net_device;
30void batadv_debugfs_init(void); 30void batadv_debugfs_init(void);
31void batadv_debugfs_destroy(void); 31void batadv_debugfs_destroy(void);
32int batadv_debugfs_add_meshif(struct net_device *dev); 32int batadv_debugfs_add_meshif(struct net_device *dev);
33void batadv_debugfs_rename_meshif(struct net_device *dev);
33void batadv_debugfs_del_meshif(struct net_device *dev); 34void batadv_debugfs_del_meshif(struct net_device *dev);
34int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); 35int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface);
36void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface);
35void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); 37void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface);
36 38
37#else 39#else
@@ -49,6 +51,10 @@ static inline int batadv_debugfs_add_meshif(struct net_device *dev)
49 return 0; 51 return 0;
50} 52}
51 53
54static inline void batadv_debugfs_rename_meshif(struct net_device *dev)
55{
56}
57
52static inline void batadv_debugfs_del_meshif(struct net_device *dev) 58static inline void batadv_debugfs_del_meshif(struct net_device *dev)
53{ 59{
54} 60}
@@ -60,6 +66,11 @@ int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
60} 66}
61 67
62static inline 68static inline
69void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface)
70{
71}
72
73static inline
63void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) 74void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
64{ 75{
65} 76}
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index c405d15befd6..2f0d42f2f913 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -989,6 +989,32 @@ void batadv_hardif_remove_interfaces(void)
989 rtnl_unlock(); 989 rtnl_unlock();
990} 990}
991 991
992/**
993 * batadv_hard_if_event_softif() - Handle events for soft interfaces
994 * @event: NETDEV_* event to handle
995 * @net_dev: net_device which generated an event
996 *
997 * Return: NOTIFY_* result
998 */
999static int batadv_hard_if_event_softif(unsigned long event,
1000 struct net_device *net_dev)
1001{
1002 struct batadv_priv *bat_priv;
1003
1004 switch (event) {
1005 case NETDEV_REGISTER:
1006 batadv_sysfs_add_meshif(net_dev);
1007 bat_priv = netdev_priv(net_dev);
1008 batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS);
1009 break;
1010 case NETDEV_CHANGENAME:
1011 batadv_debugfs_rename_meshif(net_dev);
1012 break;
1013 }
1014
1015 return NOTIFY_DONE;
1016}
1017
992static int batadv_hard_if_event(struct notifier_block *this, 1018static int batadv_hard_if_event(struct notifier_block *this,
993 unsigned long event, void *ptr) 1019 unsigned long event, void *ptr)
994{ 1020{
@@ -997,12 +1023,8 @@ static int batadv_hard_if_event(struct notifier_block *this,
997 struct batadv_hard_iface *primary_if = NULL; 1023 struct batadv_hard_iface *primary_if = NULL;
998 struct batadv_priv *bat_priv; 1024 struct batadv_priv *bat_priv;
999 1025
1000 if (batadv_softif_is_valid(net_dev) && event == NETDEV_REGISTER) { 1026 if (batadv_softif_is_valid(net_dev))
1001 batadv_sysfs_add_meshif(net_dev); 1027 return batadv_hard_if_event_softif(event, net_dev);
1002 bat_priv = netdev_priv(net_dev);
1003 batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS);
1004 return NOTIFY_DONE;
1005 }
1006 1028
1007 hard_iface = batadv_hardif_get_by_netdev(net_dev); 1029 hard_iface = batadv_hardif_get_by_netdev(net_dev);
1008 if (!hard_iface && (event == NETDEV_REGISTER || 1030 if (!hard_iface && (event == NETDEV_REGISTER ||
@@ -1051,6 +1073,9 @@ static int batadv_hard_if_event(struct notifier_block *this,
1051 if (batadv_is_wifi_hardif(hard_iface)) 1073 if (batadv_is_wifi_hardif(hard_iface))
1052 hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; 1074 hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
1053 break; 1075 break;
1076 case NETDEV_CHANGENAME:
1077 batadv_debugfs_rename_hardif(hard_iface);
1078 break;
1054 default: 1079 default:
1055 break; 1080 break;
1056 } 1081 }
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 716e5b43acfa..1d295da3e342 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1339,7 +1339,11 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
1339 return false; 1339 return false;
1340} 1340}
1341 1341
1342static void _batadv_purge_orig(struct batadv_priv *bat_priv) 1342/**
1343 * batadv_purge_orig_ref() - Purge all outdated originators
1344 * @bat_priv: the bat priv with all the soft interface information
1345 */
1346void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
1343{ 1347{
1344 struct batadv_hashtable *hash = bat_priv->orig_hash; 1348 struct batadv_hashtable *hash = bat_priv->orig_hash;
1345 struct hlist_node *node_tmp; 1349 struct hlist_node *node_tmp;
@@ -1385,21 +1389,12 @@ static void batadv_purge_orig(struct work_struct *work)
1385 1389
1386 delayed_work = to_delayed_work(work); 1390 delayed_work = to_delayed_work(work);
1387 bat_priv = container_of(delayed_work, struct batadv_priv, orig_work); 1391 bat_priv = container_of(delayed_work, struct batadv_priv, orig_work);
1388 _batadv_purge_orig(bat_priv); 1392 batadv_purge_orig_ref(bat_priv);
1389 queue_delayed_work(batadv_event_workqueue, 1393 queue_delayed_work(batadv_event_workqueue,
1390 &bat_priv->orig_work, 1394 &bat_priv->orig_work,
1391 msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); 1395 msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD));
1392} 1396}
1393 1397
1394/**
1395 * batadv_purge_orig_ref() - Purge all outdated originators
1396 * @bat_priv: the bat priv with all the soft interface information
1397 */
1398void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
1399{
1400 _batadv_purge_orig(bat_priv);
1401}
1402
1403#ifdef CONFIG_BATMAN_ADV_DEBUGFS 1398#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1404 1399
1405/** 1400/**
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 3986551397ca..12a2b7d21376 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1705,7 +1705,9 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
1705 ether_addr_copy(common->addr, tt_addr); 1705 ether_addr_copy(common->addr, tt_addr);
1706 common->vid = vid; 1706 common->vid = vid;
1707 1707
1708 common->flags = flags; 1708 if (!is_multicast_ether_addr(common->addr))
1709 common->flags = flags & (~BATADV_TT_SYNC_MASK);
1710
1709 tt_global_entry->roam_at = 0; 1711 tt_global_entry->roam_at = 0;
1710 /* node must store current time in case of roaming. This is 1712 /* node must store current time in case of roaming. This is
1711 * needed to purge this entry out on timeout (if nobody claims 1713 * needed to purge this entry out on timeout (if nobody claims
@@ -1768,7 +1770,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
1768 * TT_CLIENT_TEMP, therefore they have to be copied in the 1770 * TT_CLIENT_TEMP, therefore they have to be copied in the
1769 * client entry 1771 * client entry
1770 */ 1772 */
1771 common->flags |= flags & (~BATADV_TT_SYNC_MASK); 1773 if (!is_multicast_ether_addr(common->addr))
1774 common->flags |= flags & (~BATADV_TT_SYNC_MASK);
1772 1775
1773 /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only 1776 /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only
1774 * one originator left in the list and we previously received a 1777 * one originator left in the list and we previously received a
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 360357f83f20..343d304851a5 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -43,12 +43,13 @@ struct seq_file;
43#ifdef CONFIG_BATMAN_ADV_DAT 43#ifdef CONFIG_BATMAN_ADV_DAT
44 44
45/** 45/**
46 * batadv_dat_addr_t - it is the type used for all DHT addresses. If it is 46 * typedef batadv_dat_addr_t - type used for all DHT addresses
47 * changed, BATADV_DAT_ADDR_MAX is changed as well. 47 *
48 * If it is changed, BATADV_DAT_ADDR_MAX is changed as well.
48 * 49 *
49 * *Please be careful: batadv_dat_addr_t must be UNSIGNED* 50 * *Please be careful: batadv_dat_addr_t must be UNSIGNED*
50 */ 51 */
51#define batadv_dat_addr_t u16 52typedef u16 batadv_dat_addr_t;
52 53
53#endif /* CONFIG_BATMAN_ADV_DAT */ 54#endif /* CONFIG_BATMAN_ADV_DAT */
54 55
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 510ab4f55df5..deacc52d7ff1 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -159,7 +159,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk)
159 BT_DBG("parent %p, sk %p", parent, sk); 159 BT_DBG("parent %p, sk %p", parent, sk);
160 160
161 sock_hold(sk); 161 sock_hold(sk);
162 lock_sock(sk); 162 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
163 list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); 163 list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q);
164 bt_sk(sk)->parent = parent; 164 bt_sk(sk)->parent = parent;
165 release_sock(sk); 165 release_sock(sk);
@@ -437,13 +437,16 @@ static inline __poll_t bt_accept_poll(struct sock *parent)
437 return 0; 437 return 0;
438} 438}
439 439
440__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events) 440__poll_t bt_sock_poll(struct file *file, struct socket *sock,
441 poll_table *wait)
441{ 442{
442 struct sock *sk = sock->sk; 443 struct sock *sk = sock->sk;
443 __poll_t mask = 0; 444 __poll_t mask = 0;
444 445
445 BT_DBG("sock %p, sk %p", sock, sk); 446 BT_DBG("sock %p, sk %p", sock, sk);
446 447
448 poll_wait(file, sk_sleep(sk), wait);
449
447 if (sk->sk_state == BT_LISTEN) 450 if (sk->sk_state == BT_LISTEN)
448 return bt_accept_poll(sk); 451 return bt_accept_poll(sk);
449 452
@@ -475,7 +478,7 @@ __poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events)
475 478
476 return mask; 479 return mask;
477} 480}
478EXPORT_SYMBOL(bt_sock_poll_mask); 481EXPORT_SYMBOL(bt_sock_poll);
479 482
480int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 483int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
481{ 484{
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 45ff5dc124cc..bd4978ce8c45 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -748,11 +748,30 @@ static bool conn_use_rpa(struct hci_conn *conn)
748 return hci_dev_test_flag(hdev, HCI_PRIVACY); 748 return hci_dev_test_flag(hdev, HCI_PRIVACY);
749} 749}
750 750
751static void set_ext_conn_params(struct hci_conn *conn,
752 struct hci_cp_le_ext_conn_param *p)
753{
754 struct hci_dev *hdev = conn->hdev;
755
756 memset(p, 0, sizeof(*p));
757
758 /* Set window to be the same value as the interval to
759 * enable continuous scanning.
760 */
761 p->scan_interval = cpu_to_le16(hdev->le_scan_interval);
762 p->scan_window = p->scan_interval;
763 p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
764 p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
765 p->conn_latency = cpu_to_le16(conn->le_conn_latency);
766 p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
767 p->min_ce_len = cpu_to_le16(0x0000);
768 p->max_ce_len = cpu_to_le16(0x0000);
769}
770
751static void hci_req_add_le_create_conn(struct hci_request *req, 771static void hci_req_add_le_create_conn(struct hci_request *req,
752 struct hci_conn *conn, 772 struct hci_conn *conn,
753 bdaddr_t *direct_rpa) 773 bdaddr_t *direct_rpa)
754{ 774{
755 struct hci_cp_le_create_conn cp;
756 struct hci_dev *hdev = conn->hdev; 775 struct hci_dev *hdev = conn->hdev;
757 u8 own_addr_type; 776 u8 own_addr_type;
758 777
@@ -775,25 +794,71 @@ static void hci_req_add_le_create_conn(struct hci_request *req,
775 return; 794 return;
776 } 795 }
777 796
778 memset(&cp, 0, sizeof(cp)); 797 if (use_ext_conn(hdev)) {
798 struct hci_cp_le_ext_create_conn *cp;
799 struct hci_cp_le_ext_conn_param *p;
800 u8 data[sizeof(*cp) + sizeof(*p) * 3];
801 u32 plen;
779 802
780 /* Set window to be the same value as the interval to enable 803 cp = (void *) data;
781 * continuous scanning. 804 p = (void *) cp->data;
782 */ 805
783 cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); 806 memset(cp, 0, sizeof(*cp));
784 cp.scan_window = cp.scan_interval;
785 807
786 bacpy(&cp.peer_addr, &conn->dst); 808 bacpy(&cp->peer_addr, &conn->dst);
787 cp.peer_addr_type = conn->dst_type; 809 cp->peer_addr_type = conn->dst_type;
788 cp.own_address_type = own_addr_type; 810 cp->own_addr_type = own_addr_type;
789 cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
790 cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
791 cp.conn_latency = cpu_to_le16(conn->le_conn_latency);
792 cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
793 cp.min_ce_len = cpu_to_le16(0x0000);
794 cp.max_ce_len = cpu_to_le16(0x0000);
795 811
796 hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp); 812 plen = sizeof(*cp);
813
814 if (scan_1m(hdev)) {
815 cp->phys |= LE_SCAN_PHY_1M;
816 set_ext_conn_params(conn, p);
817
818 p++;
819 plen += sizeof(*p);
820 }
821
822 if (scan_2m(hdev)) {
823 cp->phys |= LE_SCAN_PHY_2M;
824 set_ext_conn_params(conn, p);
825
826 p++;
827 plen += sizeof(*p);
828 }
829
830 if (scan_coded(hdev)) {
831 cp->phys |= LE_SCAN_PHY_CODED;
832 set_ext_conn_params(conn, p);
833
834 plen += sizeof(*p);
835 }
836
837 hci_req_add(req, HCI_OP_LE_EXT_CREATE_CONN, plen, data);
838
839 } else {
840 struct hci_cp_le_create_conn cp;
841
842 memset(&cp, 0, sizeof(cp));
843
844 /* Set window to be the same value as the interval to enable
845 * continuous scanning.
846 */
847 cp.scan_interval = cpu_to_le16(hdev->le_scan_interval);
848 cp.scan_window = cp.scan_interval;
849
850 bacpy(&cp.peer_addr, &conn->dst);
851 cp.peer_addr_type = conn->dst_type;
852 cp.own_address_type = own_addr_type;
853 cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
854 cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
855 cp.conn_latency = cpu_to_le16(conn->le_conn_latency);
856 cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
857 cp.min_ce_len = cpu_to_le16(0x0000);
858 cp.max_ce_len = cpu_to_le16(0x0000);
859
860 hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp);
861 }
797 862
798 conn->state = BT_CONNECT; 863 conn->state = BT_CONNECT;
799 clear_bit(HCI_CONN_SCANNING, &conn->flags); 864 clear_bit(HCI_CONN_SCANNING, &conn->flags);
@@ -803,35 +868,81 @@ static void hci_req_directed_advertising(struct hci_request *req,
803 struct hci_conn *conn) 868 struct hci_conn *conn)
804{ 869{
805 struct hci_dev *hdev = req->hdev; 870 struct hci_dev *hdev = req->hdev;
806 struct hci_cp_le_set_adv_param cp;
807 u8 own_addr_type; 871 u8 own_addr_type;
808 u8 enable; 872 u8 enable;
809 873
810 /* Clear the HCI_LE_ADV bit temporarily so that the 874 if (ext_adv_capable(hdev)) {
811 * hci_update_random_address knows that it's safe to go ahead 875 struct hci_cp_le_set_ext_adv_params cp;
812 * and write a new random address. The flag will be set back on 876 bdaddr_t random_addr;
813 * as soon as the SET_ADV_ENABLE HCI command completes.
814 */
815 hci_dev_clear_flag(hdev, HCI_LE_ADV);
816 877
817 /* Set require_privacy to false so that the remote device has a 878 /* Set require_privacy to false so that the remote device has a
818 * chance of identifying us. 879 * chance of identifying us.
819 */ 880 */
820 if (hci_update_random_address(req, false, conn_use_rpa(conn), 881 if (hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL,
821 &own_addr_type) < 0) 882 &own_addr_type, &random_addr) < 0)
822 return; 883 return;
823 884
824 memset(&cp, 0, sizeof(cp)); 885 memset(&cp, 0, sizeof(cp));
825 cp.type = LE_ADV_DIRECT_IND;
826 cp.own_address_type = own_addr_type;
827 cp.direct_addr_type = conn->dst_type;
828 bacpy(&cp.direct_addr, &conn->dst);
829 cp.channel_map = hdev->le_adv_channel_map;
830 886
831 hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp); 887 cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_DIRECT_IND);
888 cp.own_addr_type = own_addr_type;
889 cp.channel_map = hdev->le_adv_channel_map;
890 cp.tx_power = HCI_TX_POWER_INVALID;
891 cp.primary_phy = HCI_ADV_PHY_1M;
892 cp.secondary_phy = HCI_ADV_PHY_1M;
893 cp.handle = 0; /* Use instance 0 for directed adv */
894 cp.own_addr_type = own_addr_type;
895 cp.peer_addr_type = conn->dst_type;
896 bacpy(&cp.peer_addr, &conn->dst);
897
898 hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp);
899
900 if (own_addr_type == ADDR_LE_DEV_RANDOM &&
901 bacmp(&random_addr, BDADDR_ANY) &&
902 bacmp(&random_addr, &hdev->random_addr)) {
903 struct hci_cp_le_set_adv_set_rand_addr cp;
904
905 memset(&cp, 0, sizeof(cp));
906
907 cp.handle = 0;
908 bacpy(&cp.bdaddr, &random_addr);
909
910 hci_req_add(req,
911 HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
912 sizeof(cp), &cp);
913 }
832 914
833 enable = 0x01; 915 __hci_req_enable_ext_advertising(req);
834 hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); 916 } else {
917 struct hci_cp_le_set_adv_param cp;
918
919 /* Clear the HCI_LE_ADV bit temporarily so that the
920 * hci_update_random_address knows that it's safe to go ahead
921 * and write a new random address. The flag will be set back on
922 * as soon as the SET_ADV_ENABLE HCI command completes.
923 */
924 hci_dev_clear_flag(hdev, HCI_LE_ADV);
925
926 /* Set require_privacy to false so that the remote device has a
927 * chance of identifying us.
928 */
929 if (hci_update_random_address(req, false, conn_use_rpa(conn),
930 &own_addr_type) < 0)
931 return;
932
933 memset(&cp, 0, sizeof(cp));
934 cp.type = LE_ADV_DIRECT_IND;
935 cp.own_address_type = own_addr_type;
936 cp.direct_addr_type = conn->dst_type;
937 bacpy(&cp.direct_addr, &conn->dst);
938 cp.channel_map = hdev->le_adv_channel_map;
939
940 hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
941
942 enable = 0x01;
943 hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
944 &enable);
945 }
835 946
836 conn->state = BT_CONNECT; 947 conn->state = BT_CONNECT;
837} 948}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ee8ef1228263..74b29c7d841c 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -695,11 +695,42 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
695 if (hdev->commands[35] & (0x20 | 0x40)) 695 if (hdev->commands[35] & (0x20 | 0x40))
696 events[1] |= 0x08; /* LE PHY Update Complete */ 696 events[1] |= 0x08; /* LE PHY Update Complete */
697 697
698 /* If the controller supports LE Set Extended Scan Parameters
699 * and LE Set Extended Scan Enable commands, enable the
700 * corresponding event.
701 */
702 if (use_ext_scan(hdev))
703 events[1] |= 0x10; /* LE Extended Advertising
704 * Report
705 */
706
707 /* If the controller supports the LE Extended Create Connection
708 * command, enable the corresponding event.
709 */
710 if (use_ext_conn(hdev))
711 events[1] |= 0x02; /* LE Enhanced Connection
712 * Complete
713 */
714
715 /* If the controller supports the LE Extended Advertising
716 * command, enable the corresponding event.
717 */
718 if (ext_adv_capable(hdev))
719 events[2] |= 0x02; /* LE Advertising Set
720 * Terminated
721 */
722
698 hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), 723 hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events),
699 events); 724 events);
700 725
701 if (hdev->commands[25] & 0x40) { 726 /* Read LE Advertising Channel TX Power */
702 /* Read LE Advertising Channel TX Power */ 727 if ((hdev->commands[25] & 0x40) && !ext_adv_capable(hdev)) {
728 /* HCI TS spec forbids mixing of legacy and extended
729 * advertising commands wherein READ_ADV_TX_POWER is
730 * also included. So do not call it if extended adv
731 * is supported otherwise controller will return
732 * COMMAND_DISALLOWED for extended commands.
733 */
703 hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL); 734 hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL);
704 } 735 }
705 736
@@ -714,6 +745,17 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
714 hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL); 745 hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL);
715 } 746 }
716 747
748 if (hdev->commands[34] & 0x40) {
749 /* Read LE Resolving List Size */
750 hci_req_add(req, HCI_OP_LE_READ_RESOLV_LIST_SIZE,
751 0, NULL);
752 }
753
754 if (hdev->commands[34] & 0x20) {
755 /* Clear LE Resolving List */
756 hci_req_add(req, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL);
757 }
758
717 if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { 759 if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) {
718 /* Read LE Maximum Data Length */ 760 /* Read LE Maximum Data Length */
719 hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL); 761 hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL);
@@ -722,6 +764,12 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
722 hci_req_add(req, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL); 764 hci_req_add(req, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL);
723 } 765 }
724 766
767 if (ext_adv_capable(hdev)) {
768 /* Read LE Number of Supported Advertising Sets */
769 hci_req_add(req, HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS,
770 0, NULL);
771 }
772
725 hci_set_le_support(req); 773 hci_set_le_support(req);
726 } 774 }
727 775
@@ -802,10 +850,9 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt)
802 if (hdev->commands[35] & 0x20) { 850 if (hdev->commands[35] & 0x20) {
803 struct hci_cp_le_set_default_phy cp; 851 struct hci_cp_le_set_default_phy cp;
804 852
805 /* No transmitter PHY or receiver PHY preferences */ 853 cp.all_phys = 0x00;
806 cp.all_phys = 0x03; 854 cp.tx_phys = hdev->le_tx_def_phys;
807 cp.tx_phys = 0; 855 cp.rx_phys = hdev->le_rx_def_phys;
808 cp.rx_phys = 0;
809 856
810 hci_req_add(req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp), &cp); 857 hci_req_add(req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp), &cp);
811 } 858 }
@@ -1368,7 +1415,8 @@ static int hci_dev_do_open(struct hci_dev *hdev)
1368 atomic_set(&hdev->cmd_cnt, 1); 1415 atomic_set(&hdev->cmd_cnt, 1);
1369 set_bit(HCI_INIT, &hdev->flags); 1416 set_bit(HCI_INIT, &hdev->flags);
1370 1417
1371 if (hci_dev_test_flag(hdev, HCI_SETUP)) { 1418 if (hci_dev_test_flag(hdev, HCI_SETUP) ||
1419 test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) {
1372 hci_sock_dev_event(hdev, HCI_DEV_SETUP); 1420 hci_sock_dev_event(hdev, HCI_DEV_SETUP);
1373 1421
1374 if (hdev->setup) 1422 if (hdev->setup)
@@ -1432,6 +1480,7 @@ static int hci_dev_do_open(struct hci_dev *hdev)
1432 if (!ret) { 1480 if (!ret) {
1433 hci_dev_hold(hdev); 1481 hci_dev_hold(hdev);
1434 hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); 1482 hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
1483 hci_adv_instances_set_rpa_expired(hdev, true);
1435 set_bit(HCI_UP, &hdev->flags); 1484 set_bit(HCI_UP, &hdev->flags);
1436 hci_sock_dev_event(hdev, HCI_DEV_UP); 1485 hci_sock_dev_event(hdev, HCI_DEV_UP);
1437 hci_leds_update_powered(hdev, true); 1486 hci_leds_update_powered(hdev, true);
@@ -1587,9 +1636,15 @@ int hci_dev_do_close(struct hci_dev *hdev)
1587 if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) 1636 if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE))
1588 cancel_delayed_work(&hdev->service_cache); 1637 cancel_delayed_work(&hdev->service_cache);
1589 1638
1590 if (hci_dev_test_flag(hdev, HCI_MGMT)) 1639 if (hci_dev_test_flag(hdev, HCI_MGMT)) {
1640 struct adv_info *adv_instance;
1641
1591 cancel_delayed_work_sync(&hdev->rpa_expired); 1642 cancel_delayed_work_sync(&hdev->rpa_expired);
1592 1643
1644 list_for_each_entry(adv_instance, &hdev->adv_instances, list)
1645 cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
1646 }
1647
1593 /* Avoid potential lockdep warnings from the *_flush() calls by 1648 /* Avoid potential lockdep warnings from the *_flush() calls by
1594 * ensuring the workqueue is empty up front. 1649 * ensuring the workqueue is empty up front.
1595 */ 1650 */
@@ -1897,7 +1952,11 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
1897 break; 1952 break;
1898 1953
1899 case HCISETPTYPE: 1954 case HCISETPTYPE:
1955 if (hdev->pkt_type == (__u16) dr.dev_opt)
1956 break;
1957
1900 hdev->pkt_type = (__u16) dr.dev_opt; 1958 hdev->pkt_type = (__u16) dr.dev_opt;
1959 mgmt_phy_configuration_changed(hdev, NULL);
1901 break; 1960 break;
1902 1961
1903 case HCISETACLMTU: 1962 case HCISETACLMTU:
@@ -2661,6 +2720,8 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance)
2661 hdev->cur_adv_instance = 0x00; 2720 hdev->cur_adv_instance = 0x00;
2662 } 2721 }
2663 2722
2723 cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
2724
2664 list_del(&adv_instance->list); 2725 list_del(&adv_instance->list);
2665 kfree(adv_instance); 2726 kfree(adv_instance);
2666 2727
@@ -2669,6 +2730,14 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance)
2669 return 0; 2730 return 0;
2670} 2731}
2671 2732
2733void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired)
2734{
2735 struct adv_info *adv_instance, *n;
2736
2737 list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list)
2738 adv_instance->rpa_expired = rpa_expired;
2739}
2740
2672/* This function requires the caller holds hdev->lock */ 2741/* This function requires the caller holds hdev->lock */
2673void hci_adv_instances_clear(struct hci_dev *hdev) 2742void hci_adv_instances_clear(struct hci_dev *hdev)
2674{ 2743{
@@ -2680,6 +2749,7 @@ void hci_adv_instances_clear(struct hci_dev *hdev)
2680 } 2749 }
2681 2750
2682 list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { 2751 list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) {
2752 cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
2683 list_del(&adv_instance->list); 2753 list_del(&adv_instance->list);
2684 kfree(adv_instance); 2754 kfree(adv_instance);
2685 } 2755 }
@@ -2688,6 +2758,16 @@ void hci_adv_instances_clear(struct hci_dev *hdev)
2688 hdev->cur_adv_instance = 0x00; 2758 hdev->cur_adv_instance = 0x00;
2689} 2759}
2690 2760
2761static void adv_instance_rpa_expired(struct work_struct *work)
2762{
2763 struct adv_info *adv_instance = container_of(work, struct adv_info,
2764 rpa_expired_cb.work);
2765
2766 BT_DBG("");
2767
2768 adv_instance->rpa_expired = true;
2769}
2770
2691/* This function requires the caller holds hdev->lock */ 2771/* This function requires the caller holds hdev->lock */
2692int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, 2772int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
2693 u16 adv_data_len, u8 *adv_data, 2773 u16 adv_data_len, u8 *adv_data,
@@ -2736,6 +2816,11 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
2736 else 2816 else
2737 adv_instance->duration = duration; 2817 adv_instance->duration = duration;
2738 2818
2819 adv_instance->tx_power = HCI_TX_POWER_INVALID;
2820
2821 INIT_DELAYED_WORK(&adv_instance->rpa_expired_cb,
2822 adv_instance_rpa_expired);
2823
2739 BT_DBG("%s for %dMR", hdev->name, instance); 2824 BT_DBG("%s for %dMR", hdev->name, instance);
2740 2825
2741 return 0; 2826 return 0;
@@ -2999,6 +3084,8 @@ struct hci_dev *hci_alloc_dev(void)
2999 hdev->le_max_tx_time = 0x0148; 3084 hdev->le_max_tx_time = 0x0148;
3000 hdev->le_max_rx_len = 0x001b; 3085 hdev->le_max_rx_len = 0x001b;
3001 hdev->le_max_rx_time = 0x0148; 3086 hdev->le_max_rx_time = 0x0148;
3087 hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
3088 hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
3002 3089
3003 hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; 3090 hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT;
3004 hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; 3091 hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT;
@@ -3017,6 +3104,7 @@ struct hci_dev *hci_alloc_dev(void)
3017 INIT_LIST_HEAD(&hdev->identity_resolving_keys); 3104 INIT_LIST_HEAD(&hdev->identity_resolving_keys);
3018 INIT_LIST_HEAD(&hdev->remote_oob_data); 3105 INIT_LIST_HEAD(&hdev->remote_oob_data);
3019 INIT_LIST_HEAD(&hdev->le_white_list); 3106 INIT_LIST_HEAD(&hdev->le_white_list);
3107 INIT_LIST_HEAD(&hdev->le_resolv_list);
3020 INIT_LIST_HEAD(&hdev->le_conn_params); 3108 INIT_LIST_HEAD(&hdev->le_conn_params);
3021 INIT_LIST_HEAD(&hdev->pend_le_conns); 3109 INIT_LIST_HEAD(&hdev->pend_le_conns);
3022 INIT_LIST_HEAD(&hdev->pend_le_reports); 3110 INIT_LIST_HEAD(&hdev->pend_le_reports);
@@ -3218,6 +3306,7 @@ void hci_unregister_dev(struct hci_dev *hdev)
3218 hci_remote_oob_data_clear(hdev); 3306 hci_remote_oob_data_clear(hdev);
3219 hci_adv_instances_clear(hdev); 3307 hci_adv_instances_clear(hdev);
3220 hci_bdaddr_list_clear(&hdev->le_white_list); 3308 hci_bdaddr_list_clear(&hdev->le_white_list);
3309 hci_bdaddr_list_clear(&hdev->le_resolv_list);
3221 hci_conn_params_clear_all(hdev); 3310 hci_conn_params_clear_all(hdev);
3222 hci_discovery_filter_clear(hdev); 3311 hci_discovery_filter_clear(hdev);
3223 hci_dev_unlock(hdev); 3312 hci_dev_unlock(hdev);
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 0d8ab5b3c177..51f5b1efc3a5 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -694,6 +694,21 @@ static int white_list_show(struct seq_file *f, void *ptr)
694 694
695DEFINE_SHOW_ATTRIBUTE(white_list); 695DEFINE_SHOW_ATTRIBUTE(white_list);
696 696
697static int resolv_list_show(struct seq_file *f, void *ptr)
698{
699 struct hci_dev *hdev = f->private;
700 struct bdaddr_list *b;
701
702 hci_dev_lock(hdev);
703 list_for_each_entry(b, &hdev->le_resolv_list, list)
704 seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
705 hci_dev_unlock(hdev);
706
707 return 0;
708}
709
710DEFINE_SHOW_ATTRIBUTE(resolv_list);
711
697static int identity_resolving_keys_show(struct seq_file *f, void *ptr) 712static int identity_resolving_keys_show(struct seq_file *f, void *ptr)
698{ 713{
699 struct hci_dev *hdev = f->private; 714 struct hci_dev *hdev = f->private;
@@ -955,6 +970,10 @@ void hci_debugfs_create_le(struct hci_dev *hdev)
955 &hdev->le_white_list_size); 970 &hdev->le_white_list_size);
956 debugfs_create_file("white_list", 0444, hdev->debugfs, hdev, 971 debugfs_create_file("white_list", 0444, hdev->debugfs, hdev,
957 &white_list_fops); 972 &white_list_fops);
973 debugfs_create_u8("resolv_list_size", 0444, hdev->debugfs,
974 &hdev->le_resolv_list_size);
975 debugfs_create_file("resolv_list", 0444, hdev->debugfs, hdev,
976 &resolv_list_fops);
958 debugfs_create_file("identity_resolving_keys", 0400, hdev->debugfs, 977 debugfs_create_file("identity_resolving_keys", 0400, hdev->debugfs,
959 hdev, &identity_resolving_keys_fops); 978 hdev, &identity_resolving_keys_fops);
960 debugfs_create_file("long_term_keys", 0400, hdev->debugfs, hdev, 979 debugfs_create_file("long_term_keys", 0400, hdev->debugfs, hdev,
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 235b5aaab23d..f12555f23a49 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -221,6 +221,7 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb)
221 hdev->ssp_debug_mode = 0; 221 hdev->ssp_debug_mode = 0;
222 222
223 hci_bdaddr_list_clear(&hdev->le_white_list); 223 hci_bdaddr_list_clear(&hdev->le_white_list);
224 hci_bdaddr_list_clear(&hdev->le_resolv_list);
224} 225}
225 226
226static void hci_cc_read_stored_link_key(struct hci_dev *hdev, 227static void hci_cc_read_stored_link_key(struct hci_dev *hdev,
@@ -1041,6 +1042,57 @@ static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb)
1041 hci_dev_unlock(hdev); 1042 hci_dev_unlock(hdev);
1042} 1043}
1043 1044
1045static void hci_cc_le_set_default_phy(struct hci_dev *hdev, struct sk_buff *skb)
1046{
1047 __u8 status = *((__u8 *) skb->data);
1048 struct hci_cp_le_set_default_phy *cp;
1049
1050 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1051
1052 if (status)
1053 return;
1054
1055 cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_DEFAULT_PHY);
1056 if (!cp)
1057 return;
1058
1059 hci_dev_lock(hdev);
1060
1061 hdev->le_tx_def_phys = cp->tx_phys;
1062 hdev->le_rx_def_phys = cp->rx_phys;
1063
1064 hci_dev_unlock(hdev);
1065}
1066
1067static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev,
1068 struct sk_buff *skb)
1069{
1070 __u8 status = *((__u8 *) skb->data);
1071 struct hci_cp_le_set_adv_set_rand_addr *cp;
1072 struct adv_info *adv_instance;
1073
1074 if (status)
1075 return;
1076
1077 cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR);
1078 if (!cp)
1079 return;
1080
1081 hci_dev_lock(hdev);
1082
1083 if (!hdev->cur_adv_instance) {
1084 /* Store in hdev for instance 0 (Set adv and Directed advs) */
1085 bacpy(&hdev->random_addr, &cp->bdaddr);
1086 } else {
1087 adv_instance = hci_find_adv_instance(hdev,
1088 hdev->cur_adv_instance);
1089 if (adv_instance)
1090 bacpy(&adv_instance->random_addr, &cp->bdaddr);
1091 }
1092
1093 hci_dev_unlock(hdev);
1094}
1095
1044static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) 1096static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
1045{ 1097{
1046 __u8 *sent, status = *((__u8 *) skb->data); 1098 __u8 *sent, status = *((__u8 *) skb->data);
@@ -1076,6 +1128,40 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
1076 hci_dev_unlock(hdev); 1128 hci_dev_unlock(hdev);
1077} 1129}
1078 1130
1131static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev,
1132 struct sk_buff *skb)
1133{
1134 struct hci_cp_le_set_ext_adv_enable *cp;
1135 __u8 status = *((__u8 *) skb->data);
1136
1137 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1138
1139 if (status)
1140 return;
1141
1142 cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE);
1143 if (!cp)
1144 return;
1145
1146 hci_dev_lock(hdev);
1147
1148 if (cp->enable) {
1149 struct hci_conn *conn;
1150
1151 hci_dev_set_flag(hdev, HCI_LE_ADV);
1152
1153 conn = hci_lookup_le_connect(hdev);
1154 if (conn)
1155 queue_delayed_work(hdev->workqueue,
1156 &conn->le_conn_timeout,
1157 conn->conn_timeout);
1158 } else {
1159 hci_dev_clear_flag(hdev, HCI_LE_ADV);
1160 }
1161
1162 hci_dev_unlock(hdev);
1163}
1164
1079static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb) 1165static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb)
1080{ 1166{
1081 struct hci_cp_le_set_scan_param *cp; 1167 struct hci_cp_le_set_scan_param *cp;
@@ -1097,6 +1183,31 @@ static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb)
1097 hci_dev_unlock(hdev); 1183 hci_dev_unlock(hdev);
1098} 1184}
1099 1185
1186static void hci_cc_le_set_ext_scan_param(struct hci_dev *hdev,
1187 struct sk_buff *skb)
1188{
1189 struct hci_cp_le_set_ext_scan_params *cp;
1190 __u8 status = *((__u8 *) skb->data);
1191 struct hci_cp_le_scan_phy_params *phy_param;
1192
1193 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1194
1195 if (status)
1196 return;
1197
1198 cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS);
1199 if (!cp)
1200 return;
1201
1202 phy_param = (void *)cp->data;
1203
1204 hci_dev_lock(hdev);
1205
1206 hdev->le_scan_type = phy_param->type;
1207
1208 hci_dev_unlock(hdev);
1209}
1210
1100static bool has_pending_adv_report(struct hci_dev *hdev) 1211static bool has_pending_adv_report(struct hci_dev *hdev)
1101{ 1212{
1102 struct discovery_state *d = &hdev->discovery; 1213 struct discovery_state *d = &hdev->discovery;
@@ -1126,24 +1237,11 @@ static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr,
1126 d->last_adv_data_len = len; 1237 d->last_adv_data_len = len;
1127} 1238}
1128 1239
1129static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, 1240static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable)
1130 struct sk_buff *skb)
1131{ 1241{
1132 struct hci_cp_le_set_scan_enable *cp;
1133 __u8 status = *((__u8 *) skb->data);
1134
1135 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1136
1137 if (status)
1138 return;
1139
1140 cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE);
1141 if (!cp)
1142 return;
1143
1144 hci_dev_lock(hdev); 1242 hci_dev_lock(hdev);
1145 1243
1146 switch (cp->enable) { 1244 switch (enable) {
1147 case LE_SCAN_ENABLE: 1245 case LE_SCAN_ENABLE:
1148 hci_dev_set_flag(hdev, HCI_LE_SCAN); 1246 hci_dev_set_flag(hdev, HCI_LE_SCAN);
1149 if (hdev->le_scan_type == LE_SCAN_ACTIVE) 1247 if (hdev->le_scan_type == LE_SCAN_ACTIVE)
@@ -1189,13 +1287,63 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
1189 1287
1190 default: 1288 default:
1191 bt_dev_err(hdev, "use of reserved LE_Scan_Enable param %d", 1289 bt_dev_err(hdev, "use of reserved LE_Scan_Enable param %d",
1192 cp->enable); 1290 enable);
1193 break; 1291 break;
1194 } 1292 }
1195 1293
1196 hci_dev_unlock(hdev); 1294 hci_dev_unlock(hdev);
1197} 1295}
1198 1296
1297static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
1298 struct sk_buff *skb)
1299{
1300 struct hci_cp_le_set_scan_enable *cp;
1301 __u8 status = *((__u8 *) skb->data);
1302
1303 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1304
1305 if (status)
1306 return;
1307
1308 cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE);
1309 if (!cp)
1310 return;
1311
1312 le_set_scan_enable_complete(hdev, cp->enable);
1313}
1314
1315static void hci_cc_le_set_ext_scan_enable(struct hci_dev *hdev,
1316 struct sk_buff *skb)
1317{
1318 struct hci_cp_le_set_ext_scan_enable *cp;
1319 __u8 status = *((__u8 *) skb->data);
1320
1321 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1322
1323 if (status)
1324 return;
1325
1326 cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE);
1327 if (!cp)
1328 return;
1329
1330 le_set_scan_enable_complete(hdev, cp->enable);
1331}
1332
1333static void hci_cc_le_read_num_adv_sets(struct hci_dev *hdev,
1334 struct sk_buff *skb)
1335{
1336 struct hci_rp_le_read_num_supported_adv_sets *rp = (void *) skb->data;
1337
1338 BT_DBG("%s status 0x%2.2x No of Adv sets %u", hdev->name, rp->status,
1339 rp->num_of_sets);
1340
1341 if (rp->status)
1342 return;
1343
1344 hdev->le_num_of_adv_sets = rp->num_of_sets;
1345}
1346
1199static void hci_cc_le_read_white_list_size(struct hci_dev *hdev, 1347static void hci_cc_le_read_white_list_size(struct hci_dev *hdev,
1200 struct sk_buff *skb) 1348 struct sk_buff *skb)
1201{ 1349{
@@ -1306,6 +1454,56 @@ static void hci_cc_le_write_def_data_len(struct hci_dev *hdev,
1306 hdev->le_def_tx_time = le16_to_cpu(sent->tx_time); 1454 hdev->le_def_tx_time = le16_to_cpu(sent->tx_time);
1307} 1455}
1308 1456
1457static void hci_cc_le_clear_resolv_list(struct hci_dev *hdev,
1458 struct sk_buff *skb)
1459{
1460 __u8 status = *((__u8 *) skb->data);
1461
1462 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1463
1464 if (status)
1465 return;
1466
1467 hci_bdaddr_list_clear(&hdev->le_resolv_list);
1468}
1469
1470static void hci_cc_le_read_resolv_list_size(struct hci_dev *hdev,
1471 struct sk_buff *skb)
1472{
1473 struct hci_rp_le_read_resolv_list_size *rp = (void *) skb->data;
1474
1475 BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size);
1476
1477 if (rp->status)
1478 return;
1479
1480 hdev->le_resolv_list_size = rp->size;
1481}
1482
1483static void hci_cc_le_set_addr_resolution_enable(struct hci_dev *hdev,
1484 struct sk_buff *skb)
1485{
1486 __u8 *sent, status = *((__u8 *) skb->data);
1487
1488 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1489
1490 if (status)
1491 return;
1492
1493 sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE);
1494 if (!sent)
1495 return;
1496
1497 hci_dev_lock(hdev);
1498
1499 if (*sent)
1500 hci_dev_set_flag(hdev, HCI_LL_RPA_RESOLUTION);
1501 else
1502 hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);
1503
1504 hci_dev_unlock(hdev);
1505}
1506
1309static void hci_cc_le_read_max_data_len(struct hci_dev *hdev, 1507static void hci_cc_le_read_max_data_len(struct hci_dev *hdev,
1310 struct sk_buff *skb) 1508 struct sk_buff *skb)
1311{ 1509{
@@ -1375,6 +1573,37 @@ static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
1375 hci_dev_unlock(hdev); 1573 hci_dev_unlock(hdev);
1376} 1574}
1377 1575
1576static void hci_cc_set_ext_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
1577{
1578 struct hci_rp_le_set_ext_adv_params *rp = (void *) skb->data;
1579 struct hci_cp_le_set_ext_adv_params *cp;
1580 struct adv_info *adv_instance;
1581
1582 BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
1583
1584 if (rp->status)
1585 return;
1586
1587 cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS);
1588 if (!cp)
1589 return;
1590
1591 hci_dev_lock(hdev);
1592 hdev->adv_addr_type = cp->own_addr_type;
1593 if (!hdev->cur_adv_instance) {
1594 /* Store in hdev for instance 0 */
1595 hdev->adv_tx_power = rp->tx_power;
1596 } else {
1597 adv_instance = hci_find_adv_instance(hdev,
1598 hdev->cur_adv_instance);
1599 if (adv_instance)
1600 adv_instance->tx_power = rp->tx_power;
1601 }
1602 /* Update adv data as tx power is known now */
1603 hci_req_update_adv_data(hdev, hdev->cur_adv_instance);
1604 hci_dev_unlock(hdev);
1605}
1606
1378static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb) 1607static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb)
1379{ 1608{
1380 struct hci_rp_read_rssi *rp = (void *) skb->data; 1609 struct hci_rp_read_rssi *rp = (void *) skb->data;
@@ -1896,10 +2125,44 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
1896 hci_dev_unlock(hdev); 2125 hci_dev_unlock(hdev);
1897} 2126}
1898 2127
2128static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr,
2129 u8 peer_addr_type, u8 own_address_type,
2130 u8 filter_policy)
2131{
2132 struct hci_conn *conn;
2133
2134 conn = hci_conn_hash_lookup_le(hdev, peer_addr,
2135 peer_addr_type);
2136 if (!conn)
2137 return;
2138
2139 /* Store the initiator and responder address information which
2140 * is needed for SMP. These values will not change during the
2141 * lifetime of the connection.
2142 */
2143 conn->init_addr_type = own_address_type;
2144 if (own_address_type == ADDR_LE_DEV_RANDOM)
2145 bacpy(&conn->init_addr, &hdev->random_addr);
2146 else
2147 bacpy(&conn->init_addr, &hdev->bdaddr);
2148
2149 conn->resp_addr_type = peer_addr_type;
2150 bacpy(&conn->resp_addr, peer_addr);
2151
2152 /* We don't want the connection attempt to stick around
2153 * indefinitely since LE doesn't have a page timeout concept
2154 * like BR/EDR. Set a timer for any connection that doesn't use
2155 * the white list for connecting.
2156 */
2157 if (filter_policy == HCI_LE_USE_PEER_ADDR)
2158 queue_delayed_work(conn->hdev->workqueue,
2159 &conn->le_conn_timeout,
2160 conn->conn_timeout);
2161}
2162
1899static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) 2163static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
1900{ 2164{
1901 struct hci_cp_le_create_conn *cp; 2165 struct hci_cp_le_create_conn *cp;
1902 struct hci_conn *conn;
1903 2166
1904 BT_DBG("%s status 0x%2.2x", hdev->name, status); 2167 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1905 2168
@@ -1916,35 +2179,34 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
1916 2179
1917 hci_dev_lock(hdev); 2180 hci_dev_lock(hdev);
1918 2181
1919 conn = hci_conn_hash_lookup_le(hdev, &cp->peer_addr, 2182 cs_le_create_conn(hdev, &cp->peer_addr, cp->peer_addr_type,
1920 cp->peer_addr_type); 2183 cp->own_address_type, cp->filter_policy);
1921 if (!conn)
1922 goto unlock;
1923 2184
1924 /* Store the initiator and responder address information which 2185 hci_dev_unlock(hdev);
1925 * is needed for SMP. These values will not change during the 2186}
1926 * lifetime of the connection.
1927 */
1928 conn->init_addr_type = cp->own_address_type;
1929 if (cp->own_address_type == ADDR_LE_DEV_RANDOM)
1930 bacpy(&conn->init_addr, &hdev->random_addr);
1931 else
1932 bacpy(&conn->init_addr, &hdev->bdaddr);
1933 2187
1934 conn->resp_addr_type = cp->peer_addr_type; 2188static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status)
1935 bacpy(&conn->resp_addr, &cp->peer_addr); 2189{
2190 struct hci_cp_le_ext_create_conn *cp;
1936 2191
1937 /* We don't want the connection attempt to stick around 2192 BT_DBG("%s status 0x%2.2x", hdev->name, status);
1938 * indefinitely since LE doesn't have a page timeout concept 2193
1939 * like BR/EDR. Set a timer for any connection that doesn't use 2194 /* All connection failure handling is taken care of by the
1940 * the white list for connecting. 2195 * hci_le_conn_failed function which is triggered by the HCI
2196 * request completion callbacks used for connecting.
1941 */ 2197 */
1942 if (cp->filter_policy == HCI_LE_USE_PEER_ADDR) 2198 if (status)
1943 queue_delayed_work(conn->hdev->workqueue, 2199 return;
1944 &conn->le_conn_timeout, 2200
1945 conn->conn_timeout); 2201 cp = hci_sent_cmd_data(hdev, HCI_OP_LE_EXT_CREATE_CONN);
2202 if (!cp)
2203 return;
2204
2205 hci_dev_lock(hdev);
2206
2207 cs_le_create_conn(hdev, &cp->peer_addr, cp->peer_addr_type,
2208 cp->own_addr_type, cp->filter_policy);
1946 2209
1947unlock:
1948 hci_dev_unlock(hdev); 2210 hci_dev_unlock(hdev);
1949} 2211}
1950 2212
@@ -2618,8 +2880,10 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
2618 /* We should disregard the current RPA and generate a new one 2880 /* We should disregard the current RPA and generate a new one
2619 * whenever the encryption procedure fails. 2881 * whenever the encryption procedure fails.
2620 */ 2882 */
2621 if (ev->status && conn->type == LE_LINK) 2883 if (ev->status && conn->type == LE_LINK) {
2622 hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); 2884 hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
2885 hci_adv_instances_set_rpa_expired(hdev, true);
2886 }
2623 2887
2624 clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); 2888 clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
2625 2889
@@ -3015,6 +3279,18 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
3015 hci_cc_le_write_def_data_len(hdev, skb); 3279 hci_cc_le_write_def_data_len(hdev, skb);
3016 break; 3280 break;
3017 3281
3282 case HCI_OP_LE_CLEAR_RESOLV_LIST:
3283 hci_cc_le_clear_resolv_list(hdev, skb);
3284 break;
3285
3286 case HCI_OP_LE_READ_RESOLV_LIST_SIZE:
3287 hci_cc_le_read_resolv_list_size(hdev, skb);
3288 break;
3289
3290 case HCI_OP_LE_SET_ADDR_RESOLV_ENABLE:
3291 hci_cc_le_set_addr_resolution_enable(hdev, skb);
3292 break;
3293
3018 case HCI_OP_LE_READ_MAX_DATA_LEN: 3294 case HCI_OP_LE_READ_MAX_DATA_LEN:
3019 hci_cc_le_read_max_data_len(hdev, skb); 3295 hci_cc_le_read_max_data_len(hdev, skb);
3020 break; 3296 break;
@@ -3039,6 +3315,34 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
3039 hci_cc_write_ssp_debug_mode(hdev, skb); 3315 hci_cc_write_ssp_debug_mode(hdev, skb);
3040 break; 3316 break;
3041 3317
3318 case HCI_OP_LE_SET_EXT_SCAN_PARAMS:
3319 hci_cc_le_set_ext_scan_param(hdev, skb);
3320 break;
3321
3322 case HCI_OP_LE_SET_EXT_SCAN_ENABLE:
3323 hci_cc_le_set_ext_scan_enable(hdev, skb);
3324 break;
3325
3326 case HCI_OP_LE_SET_DEFAULT_PHY:
3327 hci_cc_le_set_default_phy(hdev, skb);
3328 break;
3329
3330 case HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS:
3331 hci_cc_le_read_num_adv_sets(hdev, skb);
3332 break;
3333
3334 case HCI_OP_LE_SET_EXT_ADV_PARAMS:
3335 hci_cc_set_ext_adv_param(hdev, skb);
3336 break;
3337
3338 case HCI_OP_LE_SET_EXT_ADV_ENABLE:
3339 hci_cc_le_set_ext_adv_enable(hdev, skb);
3340 break;
3341
3342 case HCI_OP_LE_SET_ADV_SET_RAND_ADDR:
3343 hci_cc_le_set_adv_set_random_addr(hdev, skb);
3344 break;
3345
3042 default: 3346 default:
3043 BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); 3347 BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode);
3044 break; 3348 break;
@@ -3134,6 +3438,10 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb,
3134 hci_cs_le_start_enc(hdev, ev->status); 3438 hci_cs_le_start_enc(hdev, ev->status);
3135 break; 3439 break;
3136 3440
3441 case HCI_OP_LE_EXT_CREATE_CONN:
3442 hci_cs_le_ext_create_conn(hdev, ev->status);
3443 break;
3444
3137 default: 3445 default:
3138 BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); 3446 BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode);
3139 break; 3447 break;
@@ -4460,16 +4768,15 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev,
4460} 4768}
4461#endif 4769#endif
4462 4770
4463static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) 4771static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
4772 bdaddr_t *bdaddr, u8 bdaddr_type, u8 role, u16 handle,
4773 u16 interval, u16 latency, u16 supervision_timeout)
4464{ 4774{
4465 struct hci_ev_le_conn_complete *ev = (void *) skb->data;
4466 struct hci_conn_params *params; 4775 struct hci_conn_params *params;
4467 struct hci_conn *conn; 4776 struct hci_conn *conn;
4468 struct smp_irk *irk; 4777 struct smp_irk *irk;
4469 u8 addr_type; 4778 u8 addr_type;
4470 4779
4471 BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
4472
4473 hci_dev_lock(hdev); 4780 hci_dev_lock(hdev);
4474 4781
4475 /* All controllers implicitly stop advertising in the event of a 4782 /* All controllers implicitly stop advertising in the event of a
@@ -4479,13 +4786,13 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
4479 4786
4480 conn = hci_lookup_le_connect(hdev); 4787 conn = hci_lookup_le_connect(hdev);
4481 if (!conn) { 4788 if (!conn) {
4482 conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role); 4789 conn = hci_conn_add(hdev, LE_LINK, bdaddr, role);
4483 if (!conn) { 4790 if (!conn) {
4484 bt_dev_err(hdev, "no memory for new connection"); 4791 bt_dev_err(hdev, "no memory for new connection");
4485 goto unlock; 4792 goto unlock;
4486 } 4793 }
4487 4794
4488 conn->dst_type = ev->bdaddr_type; 4795 conn->dst_type = bdaddr_type;
4489 4796
4490 /* If we didn't have a hci_conn object previously 4797 /* If we didn't have a hci_conn object previously
4491 * but we're in master role this must be something 4798 * but we're in master role this must be something
@@ -4496,8 +4803,8 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
4496 * initiator address based on the HCI_PRIVACY flag. 4803 * initiator address based on the HCI_PRIVACY flag.
4497 */ 4804 */
4498 if (conn->out) { 4805 if (conn->out) {
4499 conn->resp_addr_type = ev->bdaddr_type; 4806 conn->resp_addr_type = bdaddr_type;
4500 bacpy(&conn->resp_addr, &ev->bdaddr); 4807 bacpy(&conn->resp_addr, bdaddr);
4501 if (hci_dev_test_flag(hdev, HCI_PRIVACY)) { 4808 if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
4502 conn->init_addr_type = ADDR_LE_DEV_RANDOM; 4809 conn->init_addr_type = ADDR_LE_DEV_RANDOM;
4503 bacpy(&conn->init_addr, &hdev->rpa); 4810 bacpy(&conn->init_addr, &hdev->rpa);
@@ -4516,13 +4823,18 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
4516 * the advertising address type. 4823 * the advertising address type.
4517 */ 4824 */
4518 conn->resp_addr_type = hdev->adv_addr_type; 4825 conn->resp_addr_type = hdev->adv_addr_type;
4519 if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) 4826 if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) {
4520 bacpy(&conn->resp_addr, &hdev->random_addr); 4827 /* In case of ext adv, resp_addr will be updated in
4521 else 4828 * Adv Terminated event.
4829 */
4830 if (!ext_adv_capable(hdev))
4831 bacpy(&conn->resp_addr, &hdev->random_addr);
4832 } else {
4522 bacpy(&conn->resp_addr, &hdev->bdaddr); 4833 bacpy(&conn->resp_addr, &hdev->bdaddr);
4834 }
4523 4835
4524 conn->init_addr_type = ev->bdaddr_type; 4836 conn->init_addr_type = bdaddr_type;
4525 bacpy(&conn->init_addr, &ev->bdaddr); 4837 bacpy(&conn->init_addr, bdaddr);
4526 4838
4527 /* For incoming connections, set the default minimum 4839 /* For incoming connections, set the default minimum
4528 * and maximum connection interval. They will be used 4840 * and maximum connection interval. They will be used
@@ -4548,8 +4860,8 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
4548 conn->dst_type = irk->addr_type; 4860 conn->dst_type = irk->addr_type;
4549 } 4861 }
4550 4862
4551 if (ev->status) { 4863 if (status) {
4552 hci_le_conn_failed(conn, ev->status); 4864 hci_le_conn_failed(conn, status);
4553 goto unlock; 4865 goto unlock;
4554 } 4866 }
4555 4867
@@ -4568,17 +4880,17 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
4568 mgmt_device_connected(hdev, conn, 0, NULL, 0); 4880 mgmt_device_connected(hdev, conn, 0, NULL, 0);
4569 4881
4570 conn->sec_level = BT_SECURITY_LOW; 4882 conn->sec_level = BT_SECURITY_LOW;
4571 conn->handle = __le16_to_cpu(ev->handle); 4883 conn->handle = handle;
4572 conn->state = BT_CONFIG; 4884 conn->state = BT_CONFIG;
4573 4885
4574 conn->le_conn_interval = le16_to_cpu(ev->interval); 4886 conn->le_conn_interval = interval;
4575 conn->le_conn_latency = le16_to_cpu(ev->latency); 4887 conn->le_conn_latency = latency;
4576 conn->le_supv_timeout = le16_to_cpu(ev->supervision_timeout); 4888 conn->le_supv_timeout = supervision_timeout;
4577 4889
4578 hci_debugfs_create_conn(conn); 4890 hci_debugfs_create_conn(conn);
4579 hci_conn_add_sysfs(conn); 4891 hci_conn_add_sysfs(conn);
4580 4892
4581 if (!ev->status) { 4893 if (!status) {
4582 /* The remote features procedure is defined for master 4894 /* The remote features procedure is defined for master
4583 * role only. So only in case of an initiated connection 4895 * role only. So only in case of an initiated connection
4584 * request the remote features. 4896 * request the remote features.
@@ -4600,10 +4912,10 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
4600 hci_conn_hold(conn); 4912 hci_conn_hold(conn);
4601 } else { 4913 } else {
4602 conn->state = BT_CONNECTED; 4914 conn->state = BT_CONNECTED;
4603 hci_connect_cfm(conn, ev->status); 4915 hci_connect_cfm(conn, status);
4604 } 4916 }
4605 } else { 4917 } else {
4606 hci_connect_cfm(conn, ev->status); 4918 hci_connect_cfm(conn, status);
4607 } 4919 }
4608 4920
4609 params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, 4921 params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst,
@@ -4622,6 +4934,61 @@ unlock:
4622 hci_dev_unlock(hdev); 4934 hci_dev_unlock(hdev);
4623} 4935}
4624 4936
4937static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
4938{
4939 struct hci_ev_le_conn_complete *ev = (void *) skb->data;
4940
4941 BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
4942
4943 le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type,
4944 ev->role, le16_to_cpu(ev->handle),
4945 le16_to_cpu(ev->interval),
4946 le16_to_cpu(ev->latency),
4947 le16_to_cpu(ev->supervision_timeout));
4948}
4949
4950static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev,
4951 struct sk_buff *skb)
4952{
4953 struct hci_ev_le_enh_conn_complete *ev = (void *) skb->data;
4954
4955 BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
4956
4957 le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type,
4958 ev->role, le16_to_cpu(ev->handle),
4959 le16_to_cpu(ev->interval),
4960 le16_to_cpu(ev->latency),
4961 le16_to_cpu(ev->supervision_timeout));
4962}
4963
4964static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb)
4965{
4966 struct hci_evt_le_ext_adv_set_term *ev = (void *) skb->data;
4967 struct hci_conn *conn;
4968
4969 BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
4970
4971 if (ev->status)
4972 return;
4973
4974 conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->conn_handle));
4975 if (conn) {
4976 struct adv_info *adv_instance;
4977
4978 if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM)
4979 return;
4980
4981 if (!hdev->cur_adv_instance) {
4982 bacpy(&conn->resp_addr, &hdev->random_addr);
4983 return;
4984 }
4985
4986 adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
4987 if (adv_instance)
4988 bacpy(&conn->resp_addr, &adv_instance->random_addr);
4989 }
4990}
4991
4625static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, 4992static void hci_le_conn_update_complete_evt(struct hci_dev *hdev,
4626 struct sk_buff *skb) 4993 struct sk_buff *skb)
4627{ 4994{
@@ -4957,6 +5324,78 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
4957 hci_dev_unlock(hdev); 5324 hci_dev_unlock(hdev);
4958} 5325}
4959 5326
5327static u8 ext_evt_type_to_legacy(u16 evt_type)
5328{
5329 if (evt_type & LE_EXT_ADV_LEGACY_PDU) {
5330 switch (evt_type) {
5331 case LE_LEGACY_ADV_IND:
5332 return LE_ADV_IND;
5333 case LE_LEGACY_ADV_DIRECT_IND:
5334 return LE_ADV_DIRECT_IND;
5335 case LE_LEGACY_ADV_SCAN_IND:
5336 return LE_ADV_SCAN_IND;
5337 case LE_LEGACY_NONCONN_IND:
5338 return LE_ADV_NONCONN_IND;
5339 case LE_LEGACY_SCAN_RSP_ADV:
5340 case LE_LEGACY_SCAN_RSP_ADV_SCAN:
5341 return LE_ADV_SCAN_RSP;
5342 }
5343
5344 BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
5345 evt_type);
5346
5347 return LE_ADV_INVALID;
5348 }
5349
5350 if (evt_type & LE_EXT_ADV_CONN_IND) {
5351 if (evt_type & LE_EXT_ADV_DIRECT_IND)
5352 return LE_ADV_DIRECT_IND;
5353
5354 return LE_ADV_IND;
5355 }
5356
5357 if (evt_type & LE_EXT_ADV_SCAN_RSP)
5358 return LE_ADV_SCAN_RSP;
5359
5360 if (evt_type & LE_EXT_ADV_SCAN_IND)
5361 return LE_ADV_SCAN_IND;
5362
5363 if (evt_type == LE_EXT_ADV_NON_CONN_IND ||
5364 evt_type & LE_EXT_ADV_DIRECT_IND)
5365 return LE_ADV_NONCONN_IND;
5366
5367 BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
5368 evt_type);
5369
5370 return LE_ADV_INVALID;
5371}
5372
5373static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
5374{
5375 u8 num_reports = skb->data[0];
5376 void *ptr = &skb->data[1];
5377
5378 hci_dev_lock(hdev);
5379
5380 while (num_reports--) {
5381 struct hci_ev_le_ext_adv_report *ev = ptr;
5382 u8 legacy_evt_type;
5383 u16 evt_type;
5384
5385 evt_type = __le16_to_cpu(ev->evt_type);
5386 legacy_evt_type = ext_evt_type_to_legacy(evt_type);
5387 if (legacy_evt_type != LE_ADV_INVALID) {
5388 process_adv_report(hdev, legacy_evt_type, &ev->bdaddr,
5389 ev->bdaddr_type, NULL, 0, ev->rssi,
5390 ev->data, ev->length);
5391 }
5392
5393 ptr += sizeof(*ev) + ev->length + 1;
5394 }
5395
5396 hci_dev_unlock(hdev);
5397}
5398
4960static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, 5399static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev,
4961 struct sk_buff *skb) 5400 struct sk_buff *skb)
4962{ 5401{
@@ -5189,6 +5628,18 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb)
5189 hci_le_direct_adv_report_evt(hdev, skb); 5628 hci_le_direct_adv_report_evt(hdev, skb);
5190 break; 5629 break;
5191 5630
5631 case HCI_EV_LE_EXT_ADV_REPORT:
5632 hci_le_ext_adv_report_evt(hdev, skb);
5633 break;
5634
5635 case HCI_EV_LE_ENHANCED_CONN_COMPLETE:
5636 hci_le_enh_conn_complete_evt(hdev, skb);
5637 break;
5638
5639 case HCI_EV_LE_EXT_ADV_SET_TERM:
5640 hci_le_ext_adv_term_evt(hdev, skb);
5641 break;
5642
5192 default: 5643 default:
5193 break; 5644 break;
5194 } 5645 }
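
The extended advertising report handler added above (hci_le_ext_adv_report_evt) reuses the existing legacy report path by first collapsing the 16-bit extended event type into one of the legacy PDU types via ext_evt_type_to_legacy(). A minimal user-space sketch of that decision order follows; the bit values are illustrative placeholders, since the real LE_EXT_ADV_* and LE_LEGACY_* definitions live in include/net/bluetooth/hci.h.

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins for the LE_EXT_ADV_* flags in hci.h. */
#define EXT_ADV_CONN_IND   0x0001  /* connectable */
#define EXT_ADV_SCAN_IND   0x0002  /* scannable */
#define EXT_ADV_DIRECT_IND 0x0004  /* directed */
#define EXT_ADV_SCAN_RSP   0x0008  /* scan response */
#define EXT_ADV_LEGACY_PDU 0x0010  /* report was generated from a legacy PDU */

static const char *to_legacy(uint16_t evt_type)
{
	/* Legacy PDUs carry an exact type and are matched by the switch in
	 * the patch; everything else is derived from the property bits.
	 */
	if (evt_type & EXT_ADV_LEGACY_PDU)
		return "exact legacy combination (see the switch above)";
	if (evt_type & EXT_ADV_CONN_IND)
		return (evt_type & EXT_ADV_DIRECT_IND) ? "ADV_DIRECT_IND"
						       : "ADV_IND";
	if (evt_type & EXT_ADV_SCAN_RSP)
		return "SCAN_RSP";
	if (evt_type & EXT_ADV_SCAN_IND)
		return "ADV_SCAN_IND";
	return "ADV_NONCONN_IND";
}

int main(void)
{
	/* A connectable, undirected extended advertisement becomes ADV_IND. */
	printf("%s\n", to_legacy(EXT_ADV_CONN_IND));
	return 0;
}

Reports whose type cannot be mapped are dropped as LE_ADV_INVALID rather than forwarded, which is what the rate-limited error in the patch covers.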
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index e44d34734834..e8c9ef1e1922 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -647,11 +647,22 @@ void __hci_req_update_eir(struct hci_request *req)
647 647
648void hci_req_add_le_scan_disable(struct hci_request *req) 648void hci_req_add_le_scan_disable(struct hci_request *req)
649{ 649{
650 struct hci_cp_le_set_scan_enable cp; 650 struct hci_dev *hdev = req->hdev;
651 651
652 memset(&cp, 0, sizeof(cp)); 652 if (use_ext_scan(hdev)) {
653 cp.enable = LE_SCAN_DISABLE; 653 struct hci_cp_le_set_ext_scan_enable cp;
654 hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); 654
655 memset(&cp, 0, sizeof(cp));
656 cp.enable = LE_SCAN_DISABLE;
657 hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(cp),
658 &cp);
659 } else {
660 struct hci_cp_le_set_scan_enable cp;
661
662 memset(&cp, 0, sizeof(cp));
663 cp.enable = LE_SCAN_DISABLE;
664 hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
665 }
655} 666}
656 667
657static void add_to_white_list(struct hci_request *req, 668static void add_to_white_list(struct hci_request *req,
@@ -767,10 +778,86 @@ static bool scan_use_rpa(struct hci_dev *hdev)
767 return hci_dev_test_flag(hdev, HCI_PRIVACY); 778 return hci_dev_test_flag(hdev, HCI_PRIVACY);
768} 779}
769 780
781static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval,
782 u16 window, u8 own_addr_type, u8 filter_policy)
783{
784 struct hci_dev *hdev = req->hdev;
785
786 /* Use extended scanning if the controller supports both the extended
787 * scan parameters and extended scan enable commands
788 */
789 if (use_ext_scan(hdev)) {
790 struct hci_cp_le_set_ext_scan_params *ext_param_cp;
791 struct hci_cp_le_set_ext_scan_enable ext_enable_cp;
792 struct hci_cp_le_scan_phy_params *phy_params;
793 u8 data[sizeof(*ext_param_cp) + sizeof(*phy_params) * 2];
794 u32 plen;
795
796 ext_param_cp = (void *)data;
797 phy_params = (void *)ext_param_cp->data;
798
799 memset(ext_param_cp, 0, sizeof(*ext_param_cp));
800 ext_param_cp->own_addr_type = own_addr_type;
801 ext_param_cp->filter_policy = filter_policy;
802
803 plen = sizeof(*ext_param_cp);
804
805 if (scan_1m(hdev) || scan_2m(hdev)) {
806 ext_param_cp->scanning_phys |= LE_SCAN_PHY_1M;
807
808 memset(phy_params, 0, sizeof(*phy_params));
809 phy_params->type = type;
810 phy_params->interval = cpu_to_le16(interval);
811 phy_params->window = cpu_to_le16(window);
812
813 plen += sizeof(*phy_params);
814 phy_params++;
815 }
816
817 if (scan_coded(hdev)) {
818 ext_param_cp->scanning_phys |= LE_SCAN_PHY_CODED;
819
820 memset(phy_params, 0, sizeof(*phy_params));
821 phy_params->type = type;
822 phy_params->interval = cpu_to_le16(interval);
823 phy_params->window = cpu_to_le16(window);
824
825 plen += sizeof(*phy_params);
826 phy_params++;
827 }
828
829 hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_PARAMS,
830 plen, ext_param_cp);
831
832 memset(&ext_enable_cp, 0, sizeof(ext_enable_cp));
833 ext_enable_cp.enable = LE_SCAN_ENABLE;
834 ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
835
836 hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
837 sizeof(ext_enable_cp), &ext_enable_cp);
838 } else {
839 struct hci_cp_le_set_scan_param param_cp;
840 struct hci_cp_le_set_scan_enable enable_cp;
841
842 memset(&param_cp, 0, sizeof(param_cp));
843 param_cp.type = type;
844 param_cp.interval = cpu_to_le16(interval);
845 param_cp.window = cpu_to_le16(window);
846 param_cp.own_address_type = own_addr_type;
847 param_cp.filter_policy = filter_policy;
848 hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
849 &param_cp);
850
851 memset(&enable_cp, 0, sizeof(enable_cp));
852 enable_cp.enable = LE_SCAN_ENABLE;
853 enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
854 hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
855 &enable_cp);
856 }
857}
858
770void hci_req_add_le_passive_scan(struct hci_request *req) 859void hci_req_add_le_passive_scan(struct hci_request *req)
771{ 860{
772 struct hci_cp_le_set_scan_param param_cp;
773 struct hci_cp_le_set_scan_enable enable_cp;
774 struct hci_dev *hdev = req->hdev; 861 struct hci_dev *hdev = req->hdev;
775 u8 own_addr_type; 862 u8 own_addr_type;
776 u8 filter_policy; 863 u8 filter_policy;
@@ -804,20 +891,26 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
804 (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)) 891 (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY))
805 filter_policy |= 0x02; 892 filter_policy |= 0x02;
806 893
807 memset(&param_cp, 0, sizeof(param_cp)); 894 hci_req_start_scan(req, LE_SCAN_PASSIVE, hdev->le_scan_interval,
808 param_cp.type = LE_SCAN_PASSIVE; 895 hdev->le_scan_window, own_addr_type, filter_policy);
809 param_cp.interval = cpu_to_le16(hdev->le_scan_interval); 896}
810 param_cp.window = cpu_to_le16(hdev->le_scan_window); 897
811 param_cp.own_address_type = own_addr_type; 898static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance)
812 param_cp.filter_policy = filter_policy; 899{
813 hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), 900 struct adv_info *adv_instance;
814 &param_cp); 901
902 /* Ignore instance 0 */
903 if (instance == 0x00)
904 return 0;
905
906 adv_instance = hci_find_adv_instance(hdev, instance);
907 if (!adv_instance)
908 return 0;
815 909
816 memset(&enable_cp, 0, sizeof(enable_cp)); 910 /* TODO: Take into account the "appearance" and "local-name" flags here.
817 enable_cp.enable = LE_SCAN_ENABLE; 911 * These are currently being ignored as they are not supported.
818 enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; 912 */
819 hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), 913 return adv_instance->scan_rsp_len;
820 &enable_cp);
821} 914}
822 915
823static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) 916static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
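
hci_req_start_scan() above introduces the variable-length command layout used for HCI_OP_LE_SET_EXT_SCAN_PARAMS: a fixed header followed by one phy_params block per PHY named in scanning_phys, with plen tracking the bytes actually filled in. The same header-plus-array construction is used later for HCI_OP_LE_SET_EXT_ADV_ENABLE. A self-contained sketch of the pattern, with simplified stand-in structs and byte-order conversion omitted:

#include <stdint.h>
#include <string.h>

struct ext_scan_hdr {            /* stand-in for hci_cp_le_set_ext_scan_params */
	uint8_t own_addr_type;
	uint8_t filter_policy;
	uint8_t scanning_phys;   /* bitmask: one bit per phy_params block below */
	uint8_t data[];
};

struct phy_params {              /* stand-in for hci_cp_le_scan_phy_params */
	uint8_t  type;
	uint16_t interval;
	uint16_t window;
} __attribute__((packed));

/* Fill buf and return the total length to hand to hci_req_add(). */
static size_t build_ext_scan(uint8_t *buf, int use_1m, int use_coded,
			     uint8_t type, uint16_t interval, uint16_t window)
{
	struct ext_scan_hdr *hdr = (void *)buf;
	struct phy_params *phy = (void *)hdr->data;
	size_t plen = sizeof(*hdr);

	memset(hdr, 0, sizeof(*hdr));

	if (use_1m) {
		hdr->scanning_phys |= 0x01;      /* illustrative LE_SCAN_PHY_1M */
		phy->type = type;
		phy->interval = interval;
		phy->window = window;
		plen += sizeof(*phy);
		phy++;
	}

	if (use_coded) {
		hdr->scanning_phys |= 0x04;      /* illustrative LE_SCAN_PHY_CODED */
		phy->type = type;
		phy->interval = interval;
		phy->window = window;
		plen += sizeof(*phy);
		phy++;
	}

	return plen;
}

The caller sizes buf for the worst case (header plus two PHY blocks), exactly as the on-stack data[] array in hci_req_start_scan() does.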
@@ -841,9 +934,19 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
841 934
842void __hci_req_disable_advertising(struct hci_request *req) 935void __hci_req_disable_advertising(struct hci_request *req)
843{ 936{
844 u8 enable = 0x00; 937 if (ext_adv_capable(req->hdev)) {
938 struct hci_cp_le_set_ext_adv_enable cp;
845 939
846 hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); 940 cp.enable = 0x00;
941 /* Disable all sets since we only support one set at the moment */
942 cp.num_of_sets = 0x00;
943
944 hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp), &cp);
945 } else {
946 u8 enable = 0x00;
947
948 hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
949 }
847} 950}
848 951
849static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) 952static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
@@ -1081,29 +1184,58 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance,
1081void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) 1184void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance)
1082{ 1185{
1083 struct hci_dev *hdev = req->hdev; 1186 struct hci_dev *hdev = req->hdev;
1084 struct hci_cp_le_set_scan_rsp_data cp;
1085 u8 len; 1187 u8 len;
1086 1188
1087 if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) 1189 if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
1088 return; 1190 return;
1089 1191
1090 memset(&cp, 0, sizeof(cp)); 1192 if (ext_adv_capable(hdev)) {
1193 struct hci_cp_le_set_ext_scan_rsp_data cp;
1091 1194
1092 if (instance) 1195 memset(&cp, 0, sizeof(cp));
1093 len = create_instance_scan_rsp_data(hdev, instance, cp.data);
1094 else
1095 len = create_default_scan_rsp_data(hdev, cp.data);
1096 1196
1097 if (hdev->scan_rsp_data_len == len && 1197 if (instance)
1098 !memcmp(cp.data, hdev->scan_rsp_data, len)) 1198 len = create_instance_scan_rsp_data(hdev, instance,
1099 return; 1199 cp.data);
1200 else
1201 len = create_default_scan_rsp_data(hdev, cp.data);
1202
1203 if (hdev->scan_rsp_data_len == len &&
1204 !memcmp(cp.data, hdev->scan_rsp_data, len))
1205 return;
1100 1206
1101 memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); 1207 memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
1102 hdev->scan_rsp_data_len = len; 1208 hdev->scan_rsp_data_len = len;
1103 1209
1104 cp.length = len; 1210 cp.handle = 0;
1211 cp.length = len;
1212 cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
1213 cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
1214
1215 hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, sizeof(cp),
1216 &cp);
1217 } else {
1218 struct hci_cp_le_set_scan_rsp_data cp;
1219
1220 memset(&cp, 0, sizeof(cp));
1221
1222 if (instance)
1223 len = create_instance_scan_rsp_data(hdev, instance,
1224 cp.data);
1225 else
1226 len = create_default_scan_rsp_data(hdev, cp.data);
1227
1228 if (hdev->scan_rsp_data_len == len &&
1229 !memcmp(cp.data, hdev->scan_rsp_data, len))
1230 return;
1105 1231
1106 hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp); 1232 memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
1233 hdev->scan_rsp_data_len = len;
1234
1235 cp.length = len;
1236
1237 hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp);
1238 }
1107} 1239}
1108 1240
1109static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) 1241static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
@@ -1160,15 +1292,27 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
1160 ptr += adv_instance->adv_data_len; 1292 ptr += adv_instance->adv_data_len;
1161 } 1293 }
1162 1294
1163 /* Provide Tx Power only if we can provide a valid value for it */ 1295 if (instance_flags & MGMT_ADV_FLAG_TX_POWER) {
1164 if (hdev->adv_tx_power != HCI_TX_POWER_INVALID && 1296 s8 adv_tx_power;
1165 (instance_flags & MGMT_ADV_FLAG_TX_POWER)) {
1166 ptr[0] = 0x02;
1167 ptr[1] = EIR_TX_POWER;
1168 ptr[2] = (u8)hdev->adv_tx_power;
1169 1297
1170 ad_len += 3; 1298 if (ext_adv_capable(hdev)) {
1171 ptr += 3; 1299 if (adv_instance)
1300 adv_tx_power = adv_instance->tx_power;
1301 else
1302 adv_tx_power = hdev->adv_tx_power;
1303 } else {
1304 adv_tx_power = hdev->adv_tx_power;
1305 }
1306
1307 /* Provide Tx Power only if we can provide a valid value for it */
1308 if (adv_tx_power != HCI_TX_POWER_INVALID) {
1309 ptr[0] = 0x02;
1310 ptr[1] = EIR_TX_POWER;
1311 ptr[2] = (u8)adv_tx_power;
1312
1313 ad_len += 3;
1314 ptr += 3;
1315 }
1172 } 1316 }
1173 1317
1174 return ad_len; 1318 return ad_len;
@@ -1177,27 +1321,51 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
1177void __hci_req_update_adv_data(struct hci_request *req, u8 instance) 1321void __hci_req_update_adv_data(struct hci_request *req, u8 instance)
1178{ 1322{
1179 struct hci_dev *hdev = req->hdev; 1323 struct hci_dev *hdev = req->hdev;
1180 struct hci_cp_le_set_adv_data cp;
1181 u8 len; 1324 u8 len;
1182 1325
1183 if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) 1326 if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
1184 return; 1327 return;
1185 1328
1186 memset(&cp, 0, sizeof(cp)); 1329 if (ext_adv_capable(hdev)) {
1330 struct hci_cp_le_set_ext_adv_data cp;
1187 1331
1188 len = create_instance_adv_data(hdev, instance, cp.data); 1332 memset(&cp, 0, sizeof(cp));
1189 1333
1190 /* There's nothing to do if the data hasn't changed */ 1334 len = create_instance_adv_data(hdev, instance, cp.data);
1191 if (hdev->adv_data_len == len && 1335
1192 memcmp(cp.data, hdev->adv_data, len) == 0) 1336 /* There's nothing to do if the data hasn't changed */
1193 return; 1337 if (hdev->adv_data_len == len &&
1338 memcmp(cp.data, hdev->adv_data, len) == 0)
1339 return;
1340
1341 memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
1342 hdev->adv_data_len = len;
1343
1344 cp.length = len;
1345 cp.handle = 0;
1346 cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
1347 cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
1348
1349 hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA, sizeof(cp), &cp);
1350 } else {
1351 struct hci_cp_le_set_adv_data cp;
1352
1353 memset(&cp, 0, sizeof(cp));
1354
1355 len = create_instance_adv_data(hdev, instance, cp.data);
1356
1357 /* There's nothing to do if the data hasn't changed */
1358 if (hdev->adv_data_len == len &&
1359 memcmp(cp.data, hdev->adv_data, len) == 0)
1360 return;
1194 1361
1195 memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); 1362 memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
1196 hdev->adv_data_len = len; 1363 hdev->adv_data_len = len;
1197 1364
1198 cp.length = len; 1365 cp.length = len;
1199 1366
1200 hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); 1367 hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp);
1368 }
1201} 1369}
1202 1370
1203int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance) 1371int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance)
@@ -1229,9 +1397,13 @@ void hci_req_reenable_advertising(struct hci_dev *hdev)
1229 __hci_req_schedule_adv_instance(&req, hdev->cur_adv_instance, 1397 __hci_req_schedule_adv_instance(&req, hdev->cur_adv_instance,
1230 true); 1398 true);
1231 } else { 1399 } else {
1232 __hci_req_update_adv_data(&req, 0x00); 1400 if (ext_adv_capable(hdev)) {
1233 __hci_req_update_scan_rsp_data(&req, 0x00); 1401 __hci_req_start_ext_adv(&req, 0x00);
1234 __hci_req_enable_advertising(&req); 1402 } else {
1403 __hci_req_update_adv_data(&req, 0x00);
1404 __hci_req_update_scan_rsp_data(&req, 0x00);
1405 __hci_req_enable_advertising(&req);
1406 }
1235 } 1407 }
1236 1408
1237 hci_req_run(&req, adv_enable_complete); 1409 hci_req_run(&req, adv_enable_complete);
@@ -1268,6 +1440,245 @@ unlock:
1268 hci_dev_unlock(hdev); 1440 hci_dev_unlock(hdev);
1269} 1441}
1270 1442
1443int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
1444 bool use_rpa, struct adv_info *adv_instance,
1445 u8 *own_addr_type, bdaddr_t *rand_addr)
1446{
1447 int err;
1448
1449 bacpy(rand_addr, BDADDR_ANY);
1450
1451 /* If privacy is enabled use a resolvable private address. If
1452 * current RPA has expired then generate a new one.
1453 */
1454 if (use_rpa) {
1455 int to;
1456
1457 *own_addr_type = ADDR_LE_DEV_RANDOM;
1458
1459 if (adv_instance) {
1460 if (!adv_instance->rpa_expired &&
1461 !bacmp(&adv_instance->random_addr, &hdev->rpa))
1462 return 0;
1463
1464 adv_instance->rpa_expired = false;
1465 } else {
1466 if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) &&
1467 !bacmp(&hdev->random_addr, &hdev->rpa))
1468 return 0;
1469 }
1470
1471 err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
1472 if (err < 0) {
1473 BT_ERR("%s failed to generate new RPA", hdev->name);
1474 return err;
1475 }
1476
1477 bacpy(rand_addr, &hdev->rpa);
1478
1479 to = msecs_to_jiffies(hdev->rpa_timeout * 1000);
1480 if (adv_instance)
1481 queue_delayed_work(hdev->workqueue,
1482 &adv_instance->rpa_expired_cb, to);
1483 else
1484 queue_delayed_work(hdev->workqueue,
1485 &hdev->rpa_expired, to);
1486
1487 return 0;
1488 }
1489
1490 /* In case of required privacy without resolvable private address,
1491 * use a non-resolvable private address. This is useful for
1492 * non-connectable advertising.
1493 */
1494 if (require_privacy) {
1495 bdaddr_t nrpa;
1496
1497 while (true) {
1498 /* The non-resolvable private address is generated
1499 * from six random bytes with the two most significant
1500 * bits cleared.
1501 */
1502 get_random_bytes(&nrpa, 6);
1503 nrpa.b[5] &= 0x3f;
1504
1505 /* The non-resolvable private address shall not be
1506 * equal to the public address.
1507 */
1508 if (bacmp(&hdev->bdaddr, &nrpa))
1509 break;
1510 }
1511
1512 *own_addr_type = ADDR_LE_DEV_RANDOM;
1513 bacpy(rand_addr, &nrpa);
1514
1515 return 0;
1516 }
1517
1518 /* No privacy so use a public address. */
1519 *own_addr_type = ADDR_LE_DEV_PUBLIC;
1520
1521 return 0;
1522}
1523
1524void __hci_req_clear_ext_adv_sets(struct hci_request *req)
1525{
1526 hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL);
1527}
1528
1529int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
1530{
1531 struct hci_cp_le_set_ext_adv_params cp;
1532 struct hci_dev *hdev = req->hdev;
1533 bool connectable;
1534 u32 flags;
1535 bdaddr_t random_addr;
1536 u8 own_addr_type;
1537 int err;
1538 struct adv_info *adv_instance;
1539 bool secondary_adv;
1540 /* In ext adv set param interval is 3 octets */
1541 const u8 adv_interval[3] = { 0x00, 0x08, 0x00 };
1542
1543 if (instance > 0) {
1544 adv_instance = hci_find_adv_instance(hdev, instance);
1545 if (!adv_instance)
1546 return -EINVAL;
1547 } else {
1548 adv_instance = NULL;
1549 }
1550
1551 flags = get_adv_instance_flags(hdev, instance);
1552
1553 /* If the "connectable" instance flag was not set, then choose between
1554 * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
1555 */
1556 connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
1557 mgmt_get_connectable(hdev);
1558
1559 if (!is_advertising_allowed(hdev, connectable))
1560 return -EPERM;
1561
1562 /* Set require_privacy to true only when non-connectable
1563 * advertising is used. In that case it is fine to use a
1564 * non-resolvable private address.
1565 */
1566 err = hci_get_random_address(hdev, !connectable,
1567 adv_use_rpa(hdev, flags), adv_instance,
1568 &own_addr_type, &random_addr);
1569 if (err < 0)
1570 return err;
1571
1572 memset(&cp, 0, sizeof(cp));
1573
1574 memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval));
1575 memcpy(cp.max_interval, adv_interval, sizeof(cp.max_interval));
1576
1577 secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK);
1578
1579 if (connectable) {
1580 if (secondary_adv)
1581 cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND);
1582 else
1583 cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND);
1584 } else if (get_adv_instance_scan_rsp_len(hdev, instance)) {
1585 if (secondary_adv)
1586 cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND);
1587 else
1588 cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND);
1589 } else {
1590 if (secondary_adv)
1591 cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND);
1592 else
1593 cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND);
1594 }
1595
1596 cp.own_addr_type = own_addr_type;
1597 cp.channel_map = hdev->le_adv_channel_map;
1598 cp.tx_power = 127;
1599 cp.handle = 0;
1600
1601 if (flags & MGMT_ADV_FLAG_SEC_2M) {
1602 cp.primary_phy = HCI_ADV_PHY_1M;
1603 cp.secondary_phy = HCI_ADV_PHY_2M;
1604 } else if (flags & MGMT_ADV_FLAG_SEC_CODED) {
1605 cp.primary_phy = HCI_ADV_PHY_CODED;
1606 cp.secondary_phy = HCI_ADV_PHY_CODED;
1607 } else {
1608 /* In all other cases use 1M */
1609 cp.primary_phy = HCI_ADV_PHY_1M;
1610 cp.secondary_phy = HCI_ADV_PHY_1M;
1611 }
1612
1613 hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp);
1614
1615 if (own_addr_type == ADDR_LE_DEV_RANDOM &&
1616 bacmp(&random_addr, BDADDR_ANY)) {
1617 struct hci_cp_le_set_adv_set_rand_addr cp;
1618
1619 /* Check if the random address needs to be updated */
1620 if (adv_instance) {
1621 if (!bacmp(&random_addr, &adv_instance->random_addr))
1622 return 0;
1623 } else {
1624 if (!bacmp(&random_addr, &hdev->random_addr))
1625 return 0;
1626 }
1627
1628 memset(&cp, 0, sizeof(cp));
1629
1630 cp.handle = 0;
1631 bacpy(&cp.bdaddr, &random_addr);
1632
1633 hci_req_add(req,
1634 HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
1635 sizeof(cp), &cp);
1636 }
1637
1638 return 0;
1639}
1640
1641void __hci_req_enable_ext_advertising(struct hci_request *req)
1642{
1643 struct hci_cp_le_set_ext_adv_enable *cp;
1644 struct hci_cp_ext_adv_set *adv_set;
1645 u8 data[sizeof(*cp) + sizeof(*adv_set) * 1];
1646
1647 cp = (void *) data;
1648 adv_set = (void *) cp->data;
1649
1650 memset(cp, 0, sizeof(*cp));
1651
1652 cp->enable = 0x01;
1653 cp->num_of_sets = 0x01;
1654
1655 memset(adv_set, 0, sizeof(*adv_set));
1656
1657 adv_set->handle = 0;
1658
1659 hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE,
1660 sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets,
1661 data);
1662}
1663
1664int __hci_req_start_ext_adv(struct hci_request *req, u8 instance)
1665{
1666 struct hci_dev *hdev = req->hdev;
1667 int err;
1668
1669 if (hci_dev_test_flag(hdev, HCI_LE_ADV))
1670 __hci_req_disable_advertising(req);
1671
1672 err = __hci_req_setup_ext_adv_instance(req, instance);
1673 if (err < 0)
1674 return err;
1675
1676 __hci_req_update_scan_rsp_data(req, instance);
1677 __hci_req_enable_ext_advertising(req);
1678
1679 return 0;
1680}
1681
1271int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, 1682int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
1272 bool force) 1683 bool force)
1273{ 1684{
@@ -1321,9 +1732,13 @@ int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
1321 return 0; 1732 return 0;
1322 1733
1323 hdev->cur_adv_instance = instance; 1734 hdev->cur_adv_instance = instance;
1324 __hci_req_update_adv_data(req, instance); 1735 if (ext_adv_capable(hdev)) {
1325 __hci_req_update_scan_rsp_data(req, instance); 1736 __hci_req_start_ext_adv(req, instance);
1326 __hci_req_enable_advertising(req); 1737 } else {
1738 __hci_req_update_adv_data(req, instance);
1739 __hci_req_update_scan_rsp_data(req, instance);
1740 __hci_req_enable_advertising(req);
1741 }
1327 1742
1328 return 0; 1743 return 0;
1329} 1744}
@@ -1594,8 +2009,12 @@ static int connectable_update(struct hci_request *req, unsigned long opt)
1594 2009
1595 /* Update the advertising parameters if necessary */ 2010 /* Update the advertising parameters if necessary */
1596 if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || 2011 if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
1597 !list_empty(&hdev->adv_instances)) 2012 !list_empty(&hdev->adv_instances)) {
1598 __hci_req_enable_advertising(req); 2013 if (ext_adv_capable(hdev))
2014 __hci_req_start_ext_adv(req, hdev->cur_adv_instance);
2015 else
2016 __hci_req_enable_advertising(req);
2017 }
1599 2018
1600 __hci_update_background_scan(req); 2019 __hci_update_background_scan(req);
1601 2020
@@ -1704,8 +2123,12 @@ static int discoverable_update(struct hci_request *req, unsigned long opt)
1704 /* Discoverable mode affects the local advertising 2123 /* Discoverable mode affects the local advertising
1705 * address in limited privacy mode. 2124 * address in limited privacy mode.
1706 */ 2125 */
1707 if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) 2126 if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) {
1708 __hci_req_enable_advertising(req); 2127 if (ext_adv_capable(hdev))
2128 __hci_req_start_ext_adv(req, 0x00);
2129 else
2130 __hci_req_enable_advertising(req);
2131 }
1709 } 2132 }
1710 2133
1711 hci_dev_unlock(hdev); 2134 hci_dev_unlock(hdev);
@@ -1940,7 +2363,6 @@ discov_stopped:
1940static int le_scan_restart(struct hci_request *req, unsigned long opt) 2363static int le_scan_restart(struct hci_request *req, unsigned long opt)
1941{ 2364{
1942 struct hci_dev *hdev = req->hdev; 2365 struct hci_dev *hdev = req->hdev;
1943 struct hci_cp_le_set_scan_enable cp;
1944 2366
1945 /* If controller is not scanning we are done. */ 2367 /* If controller is not scanning we are done. */
1946 if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) 2368 if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
@@ -1948,10 +2370,23 @@ static int le_scan_restart(struct hci_request *req, unsigned long opt)
1948 2370
1949 hci_req_add_le_scan_disable(req); 2371 hci_req_add_le_scan_disable(req);
1950 2372
1951 memset(&cp, 0, sizeof(cp)); 2373 if (use_ext_scan(hdev)) {
1952 cp.enable = LE_SCAN_ENABLE; 2374 struct hci_cp_le_set_ext_scan_enable ext_enable_cp;
1953 cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; 2375
1954 hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); 2376 memset(&ext_enable_cp, 0, sizeof(ext_enable_cp));
2377 ext_enable_cp.enable = LE_SCAN_ENABLE;
2378 ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
2379
2380 hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
2381 sizeof(ext_enable_cp), &ext_enable_cp);
2382 } else {
2383 struct hci_cp_le_set_scan_enable cp;
2384
2385 memset(&cp, 0, sizeof(cp));
2386 cp.enable = LE_SCAN_ENABLE;
2387 cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
2388 hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
2389 }
1955 2390
1956 return 0; 2391 return 0;
1957} 2392}
@@ -2010,8 +2445,6 @@ static int active_scan(struct hci_request *req, unsigned long opt)
2010{ 2445{
2011 uint16_t interval = opt; 2446 uint16_t interval = opt;
2012 struct hci_dev *hdev = req->hdev; 2447 struct hci_dev *hdev = req->hdev;
2013 struct hci_cp_le_set_scan_param param_cp;
2014 struct hci_cp_le_set_scan_enable enable_cp;
2015 u8 own_addr_type; 2448 u8 own_addr_type;
2016 int err; 2449 int err;
2017 2450
@@ -2050,22 +2483,8 @@ static int active_scan(struct hci_request *req, unsigned long opt)
2050 if (err < 0) 2483 if (err < 0)
2051 own_addr_type = ADDR_LE_DEV_PUBLIC; 2484 own_addr_type = ADDR_LE_DEV_PUBLIC;
2052 2485
2053 memset(&param_cp, 0, sizeof(param_cp)); 2486 hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, DISCOV_LE_SCAN_WIN,
2054 param_cp.type = LE_SCAN_ACTIVE; 2487 own_addr_type, 0);
2055 param_cp.interval = cpu_to_le16(interval);
2056 param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN);
2057 param_cp.own_address_type = own_addr_type;
2058
2059 hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
2060 &param_cp);
2061
2062 memset(&enable_cp, 0, sizeof(enable_cp));
2063 enable_cp.enable = LE_SCAN_ENABLE;
2064 enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
2065
2066 hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
2067 &enable_cp);
2068
2069 return 0; 2488 return 0;
2070} 2489}
2071 2490
@@ -2302,11 +2721,26 @@ static int powered_update_hci(struct hci_request *req, unsigned long opt)
2302 */ 2721 */
2303 if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || 2722 if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
2304 list_empty(&hdev->adv_instances)) { 2723 list_empty(&hdev->adv_instances)) {
2305 __hci_req_update_adv_data(req, 0x00); 2724 int err;
2306 __hci_req_update_scan_rsp_data(req, 0x00); 2725
2307 2726 if (ext_adv_capable(hdev)) {
2308 if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) 2727 err = __hci_req_setup_ext_adv_instance(req,
2309 __hci_req_enable_advertising(req); 2728 0x00);
2729 if (!err)
2730 __hci_req_update_scan_rsp_data(req,
2731 0x00);
2732 } else {
2733 err = 0;
2734 __hci_req_update_adv_data(req, 0x00);
2735 __hci_req_update_scan_rsp_data(req, 0x00);
2736 }
2737
2738 if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
2739 if (!ext_adv_capable(hdev))
2740 __hci_req_enable_advertising(req);
2741 else if (!err)
2742 __hci_req_enable_ext_advertising(req);
2743 }
2310 } else if (!list_empty(&hdev->adv_instances)) { 2744 } else if (!list_empty(&hdev->adv_instances)) {
2311 struct adv_info *adv_instance; 2745 struct adv_info *adv_instance;
2312 2746
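
hci_get_random_address() above centralises address selection for the new extended advertising code: reuse the current RPA while it is still valid, generate a fresh one when it has expired, fall back to a non-resolvable private address (NRPA) when privacy is required but RPAs are not in use, and otherwise report the public address. The NRPA rule itself is small; a stand-alone sketch, where getentropy() stands in for the kernel's get_random_bytes():

#include <string.h>
#include <unistd.h>

typedef struct { unsigned char b[6]; } bdaddr;   /* b[5] is the MSB, as in bdaddr_t */

static int make_nrpa(bdaddr *nrpa, const bdaddr *public_addr)
{
	do {
		if (getentropy(nrpa->b, sizeof(nrpa->b)) < 0)
			return -1;
		nrpa->b[5] &= 0x3f;      /* clear the two most significant bits */
	} while (memcmp(nrpa->b, public_addr->b, 6) == 0);  /* must differ from the public address */

	return 0;
}

The per-instance rpa_expired flag and the rpa_expired_cb delayed work queued here are what let RPA expiry be tracked per advertising instance on the hdev->rpa_timeout schedule.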
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 702beb140d9f..692cc8b13368 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -80,6 +80,14 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
80 struct hci_request *req, u8 instance, 80 struct hci_request *req, u8 instance,
81 bool force); 81 bool force);
82 82
83int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance);
84int __hci_req_start_ext_adv(struct hci_request *req, u8 instance);
85void __hci_req_enable_ext_advertising(struct hci_request *req);
86void __hci_req_clear_ext_adv_sets(struct hci_request *req);
87int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
88 bool use_rpa, struct adv_info *adv_instance,
89 u8 *own_addr_type, bdaddr_t *rand_addr);
90
83void __hci_req_update_class(struct hci_request *req); 91void __hci_req_update_class(struct hci_request *req);
84 92
85/* Returns true if HCI commands were queued */ 93/* Returns true if HCI commands were queued */
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index d6c099861538..1506e1632394 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1975,7 +1975,7 @@ static const struct proto_ops hci_sock_ops = {
1975 .sendmsg = hci_sock_sendmsg, 1975 .sendmsg = hci_sock_sendmsg,
1976 .recvmsg = hci_sock_recvmsg, 1976 .recvmsg = hci_sock_recvmsg,
1977 .ioctl = hci_sock_ioctl, 1977 .ioctl = hci_sock_ioctl,
1978 .poll_mask = datagram_poll_mask, 1978 .poll = datagram_poll,
1979 .listen = sock_no_listen, 1979 .listen = sock_no_listen,
1980 .shutdown = sock_no_shutdown, 1980 .shutdown = sock_no_shutdown,
1981 .setsockopt = hci_sock_setsockopt, 1981 .setsockopt = hci_sock_setsockopt,
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 1036e4fa1ea2..253975cce943 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -431,8 +431,8 @@ static void hidp_del_timer(struct hidp_session *session)
431 del_timer(&session->timer); 431 del_timer(&session->timer);
432} 432}
433 433
434static void hidp_process_report(struct hidp_session *session, 434static void hidp_process_report(struct hidp_session *session, int type,
435 int type, const u8 *data, int len, int intr) 435 const u8 *data, unsigned int len, int intr)
436{ 436{
437 if (len > HID_MAX_BUFFER_SIZE) 437 if (len > HID_MAX_BUFFER_SIZE)
438 len = HID_MAX_BUFFER_SIZE; 438 len = HID_MAX_BUFFER_SIZE;
@@ -775,7 +775,7 @@ static int hidp_setup_hid(struct hidp_session *session,
775 hid->version = req->version; 775 hid->version = req->version;
776 hid->country = req->country; 776 hid->country = req->country;
777 777
778 strncpy(hid->name, req->name, sizeof(req->name) - 1); 778 strncpy(hid->name, req->name, sizeof(hid->name));
779 779
780 snprintf(hid->phys, sizeof(hid->phys), "%pMR", 780 snprintf(hid->phys, sizeof(hid->phys), "%pMR",
781 &l2cap_pi(session->ctrl_sock->sk)->chan->src); 781 &l2cap_pi(session->ctrl_sock->sk)->chan->src);
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 742a190034e6..686bdc6b35b0 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1653,7 +1653,7 @@ static const struct proto_ops l2cap_sock_ops = {
1653 .getname = l2cap_sock_getname, 1653 .getname = l2cap_sock_getname,
1654 .sendmsg = l2cap_sock_sendmsg, 1654 .sendmsg = l2cap_sock_sendmsg,
1655 .recvmsg = l2cap_sock_recvmsg, 1655 .recvmsg = l2cap_sock_recvmsg,
1656 .poll_mask = bt_sock_poll_mask, 1656 .poll = bt_sock_poll,
1657 .ioctl = bt_sock_ioctl, 1657 .ioctl = bt_sock_ioctl,
1658 .mmap = sock_no_mmap, 1658 .mmap = sock_no_mmap,
1659 .socketpair = sock_no_socketpair, 1659 .socketpair = sock_no_socketpair,
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
index cb670b5594eb..6d59a5023231 100644
--- a/net/bluetooth/leds.c
+++ b/net/bluetooth/leds.c
@@ -43,7 +43,7 @@ void hci_leds_update_powered(struct hci_dev *hdev, bool enabled)
43 led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF); 43 led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF);
44} 44}
45 45
46static void power_activate(struct led_classdev *led_cdev) 46static int power_activate(struct led_classdev *led_cdev)
47{ 47{
48 struct hci_basic_led_trigger *htrig; 48 struct hci_basic_led_trigger *htrig;
49 bool powered; 49 bool powered;
@@ -52,10 +52,12 @@ static void power_activate(struct led_classdev *led_cdev)
52 powered = test_bit(HCI_UP, &htrig->hdev->flags); 52 powered = test_bit(HCI_UP, &htrig->hdev->flags);
53 53
54 led_trigger_event(led_cdev->trigger, powered ? LED_FULL : LED_OFF); 54 led_trigger_event(led_cdev->trigger, powered ? LED_FULL : LED_OFF);
55
56 return 0;
55} 57}
56 58
57static struct led_trigger *led_allocate_basic(struct hci_dev *hdev, 59static struct led_trigger *led_allocate_basic(struct hci_dev *hdev,
58 void (*activate)(struct led_classdev *led_cdev), 60 int (*activate)(struct led_classdev *led_cdev),
59 const char *name) 61 const char *name)
60{ 62{
61 struct hci_basic_led_trigger *htrig; 63 struct hci_basic_led_trigger *htrig;
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 8a80d48d89c4..3bdc8f3ca259 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -617,6 +617,127 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev,
617 &rp, sizeof(rp)); 617 &rp, sizeof(rp));
618} 618}
619 619
620static u32 get_supported_phys(struct hci_dev *hdev)
621{
622 u32 supported_phys = 0;
623
624 if (lmp_bredr_capable(hdev)) {
625 supported_phys |= MGMT_PHY_BR_1M_1SLOT;
626
627 if (hdev->features[0][0] & LMP_3SLOT)
628 supported_phys |= MGMT_PHY_BR_1M_3SLOT;
629
630 if (hdev->features[0][0] & LMP_5SLOT)
631 supported_phys |= MGMT_PHY_BR_1M_5SLOT;
632
633 if (lmp_edr_2m_capable(hdev)) {
634 supported_phys |= MGMT_PHY_EDR_2M_1SLOT;
635
636 if (lmp_edr_3slot_capable(hdev))
637 supported_phys |= MGMT_PHY_EDR_2M_3SLOT;
638
639 if (lmp_edr_5slot_capable(hdev))
640 supported_phys |= MGMT_PHY_EDR_2M_5SLOT;
641
642 if (lmp_edr_3m_capable(hdev)) {
643 supported_phys |= MGMT_PHY_EDR_3M_1SLOT;
644
645 if (lmp_edr_3slot_capable(hdev))
646 supported_phys |= MGMT_PHY_EDR_3M_3SLOT;
647
648 if (lmp_edr_5slot_capable(hdev))
649 supported_phys |= MGMT_PHY_EDR_3M_5SLOT;
650 }
651 }
652 }
653
654 if (lmp_le_capable(hdev)) {
655 supported_phys |= MGMT_PHY_LE_1M_TX;
656 supported_phys |= MGMT_PHY_LE_1M_RX;
657
658 if (hdev->le_features[1] & HCI_LE_PHY_2M) {
659 supported_phys |= MGMT_PHY_LE_2M_TX;
660 supported_phys |= MGMT_PHY_LE_2M_RX;
661 }
662
663 if (hdev->le_features[1] & HCI_LE_PHY_CODED) {
664 supported_phys |= MGMT_PHY_LE_CODED_TX;
665 supported_phys |= MGMT_PHY_LE_CODED_RX;
666 }
667 }
668
669 return supported_phys;
670}
671
672static u32 get_selected_phys(struct hci_dev *hdev)
673{
674 u32 selected_phys = 0;
675
676 if (lmp_bredr_capable(hdev)) {
677 selected_phys |= MGMT_PHY_BR_1M_1SLOT;
678
679 if (hdev->pkt_type & (HCI_DM3 | HCI_DH3))
680 selected_phys |= MGMT_PHY_BR_1M_3SLOT;
681
682 if (hdev->pkt_type & (HCI_DM5 | HCI_DH5))
683 selected_phys |= MGMT_PHY_BR_1M_5SLOT;
684
685 if (lmp_edr_2m_capable(hdev)) {
686 if (!(hdev->pkt_type & HCI_2DH1))
687 selected_phys |= MGMT_PHY_EDR_2M_1SLOT;
688
689 if (lmp_edr_3slot_capable(hdev) &&
690 !(hdev->pkt_type & HCI_2DH3))
691 selected_phys |= MGMT_PHY_EDR_2M_3SLOT;
692
693 if (lmp_edr_5slot_capable(hdev) &&
694 !(hdev->pkt_type & HCI_2DH5))
695 selected_phys |= MGMT_PHY_EDR_2M_5SLOT;
696
697 if (lmp_edr_3m_capable(hdev)) {
698 if (!(hdev->pkt_type & HCI_3DH1))
699 selected_phys |= MGMT_PHY_EDR_3M_1SLOT;
700
701 if (lmp_edr_3slot_capable(hdev) &&
702 !(hdev->pkt_type & HCI_3DH3))
703 selected_phys |= MGMT_PHY_EDR_3M_3SLOT;
704
705 if (lmp_edr_5slot_capable(hdev) &&
706 !(hdev->pkt_type & HCI_3DH5))
707 selected_phys |= MGMT_PHY_EDR_3M_5SLOT;
708 }
709 }
710 }
711
712 if (lmp_le_capable(hdev)) {
713 if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_1M)
714 selected_phys |= MGMT_PHY_LE_1M_TX;
715
716 if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_1M)
717 selected_phys |= MGMT_PHY_LE_1M_RX;
718
719 if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_2M)
720 selected_phys |= MGMT_PHY_LE_2M_TX;
721
722 if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_2M)
723 selected_phys |= MGMT_PHY_LE_2M_RX;
724
725 if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_CODED)
726 selected_phys |= MGMT_PHY_LE_CODED_TX;
727
728 if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_CODED)
729 selected_phys |= MGMT_PHY_LE_CODED_RX;
730 }
731
732 return selected_phys;
733}
734
735static u32 get_configurable_phys(struct hci_dev *hdev)
736{
737 return (get_supported_phys(hdev) & ~MGMT_PHY_BR_1M_1SLOT &
738 ~MGMT_PHY_LE_1M_TX & ~MGMT_PHY_LE_1M_RX);
739}
740
620static u32 get_supported_settings(struct hci_dev *hdev) 741static u32 get_supported_settings(struct hci_dev *hdev)
621{ 742{
622 u32 settings = 0; 743 u32 settings = 0;
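
The three helpers above feed the new PHY configuration interface added further down in this mgmt.c diff: get_supported_phys() reports everything the controller can do, get_selected_phys() reports what is currently in use, and get_configurable_phys() masks out the PHYs that can never be turned off (BR 1M single slot and LE 1M TX/RX). set_phy_configuration() then applies two checks to the requested bitmask, sketched here with illustrative MGMT_PHY_* values (the real ones are in mgmt.h):

#include <stdint.h>
#include <stdbool.h>

/* Illustrative stand-ins for a few MGMT_PHY_* bits. */
#define PHY_BR_1M_1SLOT 0x0001   /* mandatory whenever BR/EDR is supported */
#define PHY_BR_1M_3SLOT 0x0002
#define PHY_LE_1M_TX    0x0200   /* mandatory whenever LE is supported */
#define PHY_LE_1M_RX    0x0400
#define PHY_LE_2M_TX    0x0800
#define PHY_LE_2M_RX    0x1000

static bool phy_request_valid(uint32_t supported, uint32_t configurable,
			      uint32_t selected)
{
	uint32_t unconfigurable = supported & ~configurable;

	if (selected & ~supported)          /* asks for a PHY the controller lacks */
		return false;

	if ((selected & unconfigurable) != unconfigurable)
		return false;               /* tries to drop a mandatory PHY */

	return true;
}

For example, with supported equal to the six bits above and configurable equal to everything except the 1M entries, a request that omits PHY_LE_1M_RX is rejected, while one that merely clears PHY_LE_2M_TX is accepted and later translated into HCI_OP_LE_SET_DEFAULT_PHY and packet-type changes.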
@@ -654,6 +775,8 @@ static u32 get_supported_settings(struct hci_dev *hdev)
654 hdev->set_bdaddr) 775 hdev->set_bdaddr)
655 settings |= MGMT_SETTING_CONFIGURATION; 776 settings |= MGMT_SETTING_CONFIGURATION;
656 777
778 settings |= MGMT_SETTING_PHY_CONFIGURATION;
779
657 return settings; 780 return settings;
658} 781}
659 782
@@ -817,7 +940,10 @@ static void rpa_expired(struct work_struct *work)
817 * function. 940 * function.
818 */ 941 */
819 hci_req_init(&req, hdev); 942 hci_req_init(&req, hdev);
820 __hci_req_enable_advertising(&req); 943 if (ext_adv_capable(hdev))
944 __hci_req_start_ext_adv(&req, hdev->cur_adv_instance);
945 else
946 __hci_req_enable_advertising(&req);
821 hci_req_run(&req, NULL); 947 hci_req_run(&req, NULL);
822} 948}
823 949
@@ -1721,10 +1847,17 @@ static void le_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
1721 */ 1847 */
1722 if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { 1848 if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
1723 struct hci_request req; 1849 struct hci_request req;
1724
1725 hci_req_init(&req, hdev); 1850 hci_req_init(&req, hdev);
1726 __hci_req_update_adv_data(&req, 0x00); 1851 if (ext_adv_capable(hdev)) {
1727 __hci_req_update_scan_rsp_data(&req, 0x00); 1852 int err;
1853
1854 err = __hci_req_setup_ext_adv_instance(&req, 0x00);
1855 if (!err)
1856 __hci_req_update_scan_rsp_data(&req, 0x00);
1857 } else {
1858 __hci_req_update_adv_data(&req, 0x00);
1859 __hci_req_update_scan_rsp_data(&req, 0x00);
1860 }
1728 hci_req_run(&req, NULL); 1861 hci_req_run(&req, NULL);
1729 hci_update_background_scan(hdev); 1862 hci_update_background_scan(hdev);
1730 } 1863 }
@@ -1823,6 +1956,9 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
1823 } else { 1956 } else {
1824 if (hci_dev_test_flag(hdev, HCI_LE_ADV)) 1957 if (hci_dev_test_flag(hdev, HCI_LE_ADV))
1825 __hci_req_disable_advertising(&req); 1958 __hci_req_disable_advertising(&req);
1959
1960 if (ext_adv_capable(hdev))
1961 __hci_req_clear_ext_adv_sets(&req);
1826 } 1962 }
1827 1963
1828 hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(hci_cp), 1964 hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(hci_cp),
@@ -3184,6 +3320,225 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
3184 return err; 3320 return err;
3185} 3321}
3186 3322
3323static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev,
3324 void *data, u16 len)
3325{
3326 struct mgmt_rp_get_phy_confguration rp;
3327
3328 BT_DBG("sock %p %s", sk, hdev->name);
3329
3330 hci_dev_lock(hdev);
3331
3332 memset(&rp, 0, sizeof(rp));
3333
3334 rp.supported_phys = cpu_to_le32(get_supported_phys(hdev));
3335 rp.selected_phys = cpu_to_le32(get_selected_phys(hdev));
3336 rp.configurable_phys = cpu_to_le32(get_configurable_phys(hdev));
3337
3338 hci_dev_unlock(hdev);
3339
3340 return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_PHY_CONFIGURATION, 0,
3341 &rp, sizeof(rp));
3342}
3343
3344int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip)
3345{
3346 struct mgmt_ev_phy_configuration_changed ev;
3347
3348 memset(&ev, 0, sizeof(ev));
3349
3350 ev.selected_phys = cpu_to_le32(get_selected_phys(hdev));
3351
3352 return mgmt_event(MGMT_EV_PHY_CONFIGURATION_CHANGED, hdev, &ev,
3353 sizeof(ev), skip);
3354}
3355
3356static void set_default_phy_complete(struct hci_dev *hdev, u8 status,
3357 u16 opcode, struct sk_buff *skb)
3358{
3359 struct mgmt_pending_cmd *cmd;
3360
3361 BT_DBG("status 0x%02x", status);
3362
3363 hci_dev_lock(hdev);
3364
3365 cmd = pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev);
3366 if (!cmd)
3367 goto unlock;
3368
3369 if (status) {
3370 mgmt_cmd_status(cmd->sk, hdev->id,
3371 MGMT_OP_SET_PHY_CONFIGURATION,
3372 mgmt_status(status));
3373 } else {
3374 mgmt_cmd_complete(cmd->sk, hdev->id,
3375 MGMT_OP_SET_PHY_CONFIGURATION, 0,
3376 NULL, 0);
3377
3378 mgmt_phy_configuration_changed(hdev, cmd->sk);
3379 }
3380
3381 mgmt_pending_remove(cmd);
3382
3383unlock:
3384 hci_dev_unlock(hdev);
3385}
3386
3387static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev,
3388 void *data, u16 len)
3389{
3390 struct mgmt_cp_set_phy_confguration *cp = data;
3391 struct hci_cp_le_set_default_phy cp_phy;
3392 struct mgmt_pending_cmd *cmd;
3393 struct hci_request req;
3394 u32 selected_phys, configurable_phys, supported_phys, unconfigure_phys;
3395 u16 pkt_type = (HCI_DH1 | HCI_DM1);
3396 bool changed = false;
3397 int err;
3398
3399 BT_DBG("sock %p %s", sk, hdev->name);
3400
3401 configurable_phys = get_configurable_phys(hdev);
3402 supported_phys = get_supported_phys(hdev);
3403 selected_phys = __le32_to_cpu(cp->selected_phys);
3404
3405 if (selected_phys & ~supported_phys)
3406 return mgmt_cmd_status(sk, hdev->id,
3407 MGMT_OP_SET_PHY_CONFIGURATION,
3408 MGMT_STATUS_INVALID_PARAMS);
3409
3410 unconfigure_phys = supported_phys & ~configurable_phys;
3411
3412 if ((selected_phys & unconfigure_phys) != unconfigure_phys)
3413 return mgmt_cmd_status(sk, hdev->id,
3414 MGMT_OP_SET_PHY_CONFIGURATION,
3415 MGMT_STATUS_INVALID_PARAMS);
3416
3417 if (selected_phys == get_selected_phys(hdev))
3418 return mgmt_cmd_complete(sk, hdev->id,
3419 MGMT_OP_SET_PHY_CONFIGURATION,
3420 0, NULL, 0);
3421
3422 hci_dev_lock(hdev);
3423
3424 if (!hdev_is_powered(hdev)) {
3425 err = mgmt_cmd_status(sk, hdev->id,
3426 MGMT_OP_SET_PHY_CONFIGURATION,
3427 MGMT_STATUS_REJECTED);
3428 goto unlock;
3429 }
3430
3431 if (pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) {
3432 err = mgmt_cmd_status(sk, hdev->id,
3433 MGMT_OP_SET_PHY_CONFIGURATION,
3434 MGMT_STATUS_BUSY);
3435 goto unlock;
3436 }
3437
3438 if (selected_phys & MGMT_PHY_BR_1M_3SLOT)
3439 pkt_type |= (HCI_DH3 | HCI_DM3);
3440 else
3441 pkt_type &= ~(HCI_DH3 | HCI_DM3);
3442
3443 if (selected_phys & MGMT_PHY_BR_1M_5SLOT)
3444 pkt_type |= (HCI_DH5 | HCI_DM5);
3445 else
3446 pkt_type &= ~(HCI_DH5 | HCI_DM5);
3447
3448 if (selected_phys & MGMT_PHY_EDR_2M_1SLOT)
3449 pkt_type &= ~HCI_2DH1;
3450 else
3451 pkt_type |= HCI_2DH1;
3452
3453 if (selected_phys & MGMT_PHY_EDR_2M_3SLOT)
3454 pkt_type &= ~HCI_2DH3;
3455 else
3456 pkt_type |= HCI_2DH3;
3457
3458 if (selected_phys & MGMT_PHY_EDR_2M_5SLOT)
3459 pkt_type &= ~HCI_2DH5;
3460 else
3461 pkt_type |= HCI_2DH5;
3462
3463 if (selected_phys & MGMT_PHY_EDR_3M_1SLOT)
3464 pkt_type &= ~HCI_3DH1;
3465 else
3466 pkt_type |= HCI_3DH1;
3467
3468 if (selected_phys & MGMT_PHY_EDR_3M_3SLOT)
3469 pkt_type &= ~HCI_3DH3;
3470 else
3471 pkt_type |= HCI_3DH3;
3472
3473 if (selected_phys & MGMT_PHY_EDR_3M_5SLOT)
3474 pkt_type &= ~HCI_3DH5;
3475 else
3476 pkt_type |= HCI_3DH5;
3477
3478 if (pkt_type != hdev->pkt_type) {
3479 hdev->pkt_type = pkt_type;
3480 changed = true;
3481 }
3482
3483 if ((selected_phys & MGMT_PHY_LE_MASK) ==
3484 (get_selected_phys(hdev) & MGMT_PHY_LE_MASK)) {
3485 if (changed)
3486 mgmt_phy_configuration_changed(hdev, sk);
3487
3488 err = mgmt_cmd_complete(sk, hdev->id,
3489 MGMT_OP_SET_PHY_CONFIGURATION,
3490 0, NULL, 0);
3491
3492 goto unlock;
3493 }
3494
3495 cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data,
3496 len);
3497 if (!cmd) {
3498 err = -ENOMEM;
3499 goto unlock;
3500 }
3501
3502 hci_req_init(&req, hdev);
3503
3504 memset(&cp_phy, 0, sizeof(cp_phy));
3505
3506 if (!(selected_phys & MGMT_PHY_LE_TX_MASK))
3507 cp_phy.all_phys |= 0x01;
3508
3509 if (!(selected_phys & MGMT_PHY_LE_RX_MASK))
3510 cp_phy.all_phys |= 0x02;
3511
3512 if (selected_phys & MGMT_PHY_LE_1M_TX)
3513 cp_phy.tx_phys |= HCI_LE_SET_PHY_1M;
3514
3515 if (selected_phys & MGMT_PHY_LE_2M_TX)
3516 cp_phy.tx_phys |= HCI_LE_SET_PHY_2M;
3517
3518 if (selected_phys & MGMT_PHY_LE_CODED_TX)
3519 cp_phy.tx_phys |= HCI_LE_SET_PHY_CODED;
3520
3521 if (selected_phys & MGMT_PHY_LE_1M_RX)
3522 cp_phy.rx_phys |= HCI_LE_SET_PHY_1M;
3523
3524 if (selected_phys & MGMT_PHY_LE_2M_RX)
3525 cp_phy.rx_phys |= HCI_LE_SET_PHY_2M;
3526
3527 if (selected_phys & MGMT_PHY_LE_CODED_RX)
3528 cp_phy.rx_phys |= HCI_LE_SET_PHY_CODED;
3529
3530 hci_req_add(&req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp_phy), &cp_phy);
3531
3532 err = hci_req_run_skb(&req, set_default_phy_complete);
3533 if (err < 0)
3534 mgmt_pending_remove(cmd);
3535
3536unlock:
3537 hci_dev_unlock(hdev);
3538
3539 return err;
3540}
3541
3187static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, 3542static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
3188 u16 opcode, struct sk_buff *skb) 3543 u16 opcode, struct sk_buff *skb)
3189{ 3544{
@@ -4037,9 +4392,14 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
4037 * HCI_ADVERTISING flag is not yet set. 4392 * HCI_ADVERTISING flag is not yet set.
4038 */ 4393 */
4039 hdev->cur_adv_instance = 0x00; 4394 hdev->cur_adv_instance = 0x00;
4040 __hci_req_update_adv_data(&req, 0x00); 4395
4041 __hci_req_update_scan_rsp_data(&req, 0x00); 4396 if (ext_adv_capable(hdev)) {
4042 __hci_req_enable_advertising(&req); 4397 __hci_req_start_ext_adv(&req, 0x00);
4398 } else {
4399 __hci_req_update_adv_data(&req, 0x00);
4400 __hci_req_update_scan_rsp_data(&req, 0x00);
4401 __hci_req_enable_advertising(&req);
4402 }
4043 } else { 4403 } else {
4044 __hci_req_disable_advertising(&req); 4404 __hci_req_disable_advertising(&req);
4045 } 4405 }
@@ -4609,6 +4969,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
4609 changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY); 4969 changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY);
4610 memcpy(hdev->irk, cp->irk, sizeof(hdev->irk)); 4970 memcpy(hdev->irk, cp->irk, sizeof(hdev->irk));
4611 hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); 4971 hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
4972 hci_adv_instances_set_rpa_expired(hdev, true);
4612 if (cp->privacy == 0x02) 4973 if (cp->privacy == 0x02)
4613 hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY); 4974 hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY);
4614 else 4975 else
@@ -4617,6 +4978,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
4617 changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY); 4978 changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY);
4618 memset(hdev->irk, 0, sizeof(hdev->irk)); 4979 memset(hdev->irk, 0, sizeof(hdev->irk));
4619 hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED); 4980 hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED);
4981 hci_adv_instances_set_rpa_expired(hdev, false);
4620 hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY); 4982 hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY);
4621 } 4983 }
4622 4984
@@ -5967,9 +6329,23 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
5967 flags |= MGMT_ADV_FLAG_APPEARANCE; 6329 flags |= MGMT_ADV_FLAG_APPEARANCE;
5968 flags |= MGMT_ADV_FLAG_LOCAL_NAME; 6330 flags |= MGMT_ADV_FLAG_LOCAL_NAME;
5969 6331
5970	if (hdev->adv_tx_power != HCI_TX_POWER_INVALID)	 6332	/* In extended advertising, the TX_POWER returned from Set Adv Param
	 6333	 * will always be valid.
6334 */
6335 if ((hdev->adv_tx_power != HCI_TX_POWER_INVALID) ||
6336 ext_adv_capable(hdev))
5971 flags |= MGMT_ADV_FLAG_TX_POWER; 6337 flags |= MGMT_ADV_FLAG_TX_POWER;
5972 6338
6339 if (ext_adv_capable(hdev)) {
6340 flags |= MGMT_ADV_FLAG_SEC_1M;
6341
6342 if (hdev->le_features[1] & HCI_LE_PHY_2M)
6343 flags |= MGMT_ADV_FLAG_SEC_2M;
6344
6345 if (hdev->le_features[1] & HCI_LE_PHY_CODED)
6346 flags |= MGMT_ADV_FLAG_SEC_CODED;
6347 }
6348
5973 return flags; 6349 return flags;
5974} 6350}
5975 6351
@@ -6175,7 +6551,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
6175 struct mgmt_cp_add_advertising *cp = data; 6551 struct mgmt_cp_add_advertising *cp = data;
6176 struct mgmt_rp_add_advertising rp; 6552 struct mgmt_rp_add_advertising rp;
6177 u32 flags; 6553 u32 flags;
6178 u32 supported_flags; 6554 u32 supported_flags, phy_flags;
6179 u8 status; 6555 u8 status;
6180 u16 timeout, duration; 6556 u16 timeout, duration;
6181 unsigned int prev_instance_cnt = hdev->adv_instance_cnt; 6557 unsigned int prev_instance_cnt = hdev->adv_instance_cnt;
@@ -6205,10 +6581,12 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
6205 duration = __le16_to_cpu(cp->duration); 6581 duration = __le16_to_cpu(cp->duration);
6206 6582
6207 /* The current implementation only supports a subset of the specified 6583 /* The current implementation only supports a subset of the specified
6208	 * flags.	 6584	 * flags. Also check that the sec (secondary PHY) flags are mutually exclusive.
6209 */ 6585 */
6210 supported_flags = get_supported_adv_flags(hdev); 6586 supported_flags = get_supported_adv_flags(hdev);
6211 if (flags & ~supported_flags) 6587 phy_flags = flags & MGMT_ADV_FLAG_SEC_MASK;
6588 if (flags & ~supported_flags ||
6589 ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags)))))
6212 return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, 6590 return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
6213 MGMT_STATUS_INVALID_PARAMS); 6591 MGMT_STATUS_INVALID_PARAMS);
6214 6592
@@ -6544,6 +6922,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
6544 { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, 6922 { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE,
6545 HCI_MGMT_UNTRUSTED }, 6923 HCI_MGMT_UNTRUSTED },
6546 { set_appearance, MGMT_SET_APPEARANCE_SIZE }, 6924 { set_appearance, MGMT_SET_APPEARANCE_SIZE },
6925 { get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE },
6926 { set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE },
6547}; 6927};
6548 6928
6549void mgmt_index_added(struct hci_dev *hdev) 6929void mgmt_index_added(struct hci_dev *hdev)
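
In the BR/EDR packet-type mask built by set_phy_configuration() above, the basic-rate DM/DH bits mean "may use", while the EDR bits (HCI_2DH1 ... HCI_3DH5) are "shall not be used" flags, which is why selecting an EDR PHY/slot combination clears its bit and deselecting it sets the bit. A minimal sketch of that mapping, assuming the in-kernel HCI_* and MGMT_PHY_* constants; mgmt_phys_to_pkt_type() is a hypothetical helper, not part of the patch:

/* Sketch only: derive an ACL packet-type mask from MGMT PHY selection bits.
 * mgmt_phys_to_pkt_type() is a hypothetical helper; the constants come from
 * <net/bluetooth/hci.h> and <net/bluetooth/mgmt.h>.
 */
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci.h>
#include <net/bluetooth/mgmt.h>

static u16 mgmt_phys_to_pkt_type(u32 selected_phys)
{
	/* DM1/DH1 always allowed; start with every EDR rate disabled. */
	u16 pkt_type = HCI_DM1 | HCI_DH1 |
		       HCI_2DH1 | HCI_2DH3 | HCI_2DH5 |
		       HCI_3DH1 | HCI_3DH3 | HCI_3DH5;

	if (selected_phys & MGMT_PHY_BR_1M_3SLOT)
		pkt_type |= HCI_DM3 | HCI_DH3;
	if (selected_phys & MGMT_PHY_BR_1M_5SLOT)
		pkt_type |= HCI_DM5 | HCI_DH5;

	/* EDR bits are inverted: clearing a bit enables that packet type. */
	if (selected_phys & MGMT_PHY_EDR_2M_1SLOT)
		pkt_type &= ~HCI_2DH1;
	if (selected_phys & MGMT_PHY_EDR_2M_3SLOT)
		pkt_type &= ~HCI_2DH3;
	if (selected_phys & MGMT_PHY_EDR_2M_5SLOT)
		pkt_type &= ~HCI_2DH5;
	if (selected_phys & MGMT_PHY_EDR_3M_1SLOT)
		pkt_type &= ~HCI_3DH1;
	if (selected_phys & MGMT_PHY_EDR_3M_3SLOT)
		pkt_type &= ~HCI_3DH3;
	if (selected_phys & MGMT_PHY_EDR_3M_5SLOT)
		pkt_type &= ~HCI_3DH5;

	return pkt_type;
}
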
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 1cf57622473a..d606e9212291 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -1049,7 +1049,7 @@ static const struct proto_ops rfcomm_sock_ops = {
1049 .setsockopt = rfcomm_sock_setsockopt, 1049 .setsockopt = rfcomm_sock_setsockopt,
1050 .getsockopt = rfcomm_sock_getsockopt, 1050 .getsockopt = rfcomm_sock_getsockopt,
1051 .ioctl = rfcomm_sock_ioctl, 1051 .ioctl = rfcomm_sock_ioctl,
1052 .poll_mask = bt_sock_poll_mask, 1052 .poll = bt_sock_poll,
1053 .socketpair = sock_no_socketpair, 1053 .socketpair = sock_no_socketpair,
1054 .mmap = sock_no_mmap 1054 .mmap = sock_no_mmap
1055}; 1055};
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index d60dbc61d170..8f0f9279eac9 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -393,7 +393,8 @@ static void sco_sock_cleanup_listen(struct sock *parent)
393 */ 393 */
394static void sco_sock_kill(struct sock *sk) 394static void sco_sock_kill(struct sock *sk)
395{ 395{
396 if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) 396 if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket ||
397 sock_flag(sk, SOCK_DEAD))
397 return; 398 return;
398 399
399 BT_DBG("sk %p state %d", sk, sk->sk_state); 400 BT_DBG("sk %p state %d", sk, sk->sk_state);
@@ -1197,7 +1198,7 @@ static const struct proto_ops sco_sock_ops = {
1197 .getname = sco_sock_getname, 1198 .getname = sco_sock_getname,
1198 .sendmsg = sco_sock_sendmsg, 1199 .sendmsg = sco_sock_sendmsg,
1199 .recvmsg = sco_sock_recvmsg, 1200 .recvmsg = sco_sock_recvmsg,
1200 .poll_mask = bt_sock_poll_mask, 1201 .poll = bt_sock_poll,
1201 .ioctl = bt_sock_ioctl, 1202 .ioctl = bt_sock_ioctl,
1202 .mmap = sock_no_mmap, 1203 .mmap = sock_no_mmap,
1203 .socketpair = sock_no_socketpair, 1204 .socketpair = sock_no_socketpair,
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 68c3578343b4..f4078830ea50 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -11,12 +11,14 @@
11#include <linux/filter.h> 11#include <linux/filter.h>
12#include <linux/sched/signal.h> 12#include <linux/sched/signal.h>
13 13
14static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx) 14static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
15 struct bpf_cgroup_storage *storage)
15{ 16{
16 u32 ret; 17 u32 ret;
17 18
18 preempt_disable(); 19 preempt_disable();
19 rcu_read_lock(); 20 rcu_read_lock();
21 bpf_cgroup_storage_set(storage);
20 ret = BPF_PROG_RUN(prog, ctx); 22 ret = BPF_PROG_RUN(prog, ctx);
21 rcu_read_unlock(); 23 rcu_read_unlock();
22 preempt_enable(); 24 preempt_enable();
@@ -26,14 +28,19 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx)
26 28
27static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) 29static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
28{ 30{
31 struct bpf_cgroup_storage *storage = NULL;
29 u64 time_start, time_spent = 0; 32 u64 time_start, time_spent = 0;
30 u32 ret = 0, i; 33 u32 ret = 0, i;
31 34
35 storage = bpf_cgroup_storage_alloc(prog);
36 if (IS_ERR(storage))
37 return PTR_ERR(storage);
38
32 if (!repeat) 39 if (!repeat)
33 repeat = 1; 40 repeat = 1;
34 time_start = ktime_get_ns(); 41 time_start = ktime_get_ns();
35 for (i = 0; i < repeat; i++) { 42 for (i = 0; i < repeat; i++) {
36 ret = bpf_test_run_one(prog, ctx); 43 ret = bpf_test_run_one(prog, ctx, storage);
37 if (need_resched()) { 44 if (need_resched()) {
38 if (signal_pending(current)) 45 if (signal_pending(current))
39 break; 46 break;
@@ -46,6 +53,8 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
46 do_div(time_spent, repeat); 53 do_div(time_spent, repeat);
47 *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; 54 *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
48 55
56 bpf_cgroup_storage_free(storage);
57
49 return ret; 58 return ret;
50} 59}
51 60
@@ -96,6 +105,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
96 u32 size = kattr->test.data_size_in; 105 u32 size = kattr->test.data_size_in;
97 u32 repeat = kattr->test.repeat; 106 u32 repeat = kattr->test.repeat;
98 u32 retval, duration; 107 u32 retval, duration;
108 int hh_len = ETH_HLEN;
99 struct sk_buff *skb; 109 struct sk_buff *skb;
100 void *data; 110 void *data;
101 int ret; 111 int ret;
@@ -131,12 +141,22 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
131 skb_reset_network_header(skb); 141 skb_reset_network_header(skb);
132 142
133 if (is_l2) 143 if (is_l2)
134 __skb_push(skb, ETH_HLEN); 144 __skb_push(skb, hh_len);
135 if (is_direct_pkt_access) 145 if (is_direct_pkt_access)
136 bpf_compute_data_pointers(skb); 146 bpf_compute_data_pointers(skb);
137 retval = bpf_test_run(prog, skb, repeat, &duration); 147 retval = bpf_test_run(prog, skb, repeat, &duration);
138 if (!is_l2) 148 if (!is_l2) {
139 __skb_push(skb, ETH_HLEN); 149 if (skb_headroom(skb) < hh_len) {
150 int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
151
152 if (pskb_expand_head(skb, nhead, 0, GFP_USER)) {
153 kfree_skb(skb);
154 return -ENOMEM;
155 }
156 }
157 memset(__skb_push(skb, hh_len), 0, hh_len);
158 }
159
140 size = skb->len; 160 size = skb->len;
141 /* bpf program can never convert linear skb to non-linear */ 161 /* bpf program can never convert linear skb to non-linear */
142 if (WARN_ON_ONCE(skb_is_nonlinear(skb))) 162 if (WARN_ON_ONCE(skb_is_nonlinear(skb)))
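
The bpf_test_run() changes above follow the per-run cgroup local-storage lifecycle: allocate storage for the program, publish it for the current CPU before each BPF_PROG_RUN(), and free it after the repeat loop. Condensed into a single-run sketch (kernel context, same helpers as in the hunk; run_prog_with_storage() is a hypothetical name):

/* Sketch only: run @prog once with cgroup local storage attached.
 * Mirrors the alloc/set/free pattern used by bpf_test_run() above.
 */
#include <linux/bpf-cgroup.h>
#include <linux/err.h>
#include <linux/filter.h>

static int run_prog_with_storage(struct bpf_prog *prog, void *ctx, u32 *retval)
{
	struct bpf_cgroup_storage *storage;

	storage = bpf_cgroup_storage_alloc(prog);
	if (IS_ERR(storage))
		return PTR_ERR(storage);

	preempt_disable();
	rcu_read_lock();
	bpf_cgroup_storage_set(storage);	/* visible to the program via helpers */
	*retval = BPF_PROG_RUN(prog, ctx);
	rcu_read_unlock();
	preempt_enable();

	bpf_cgroup_storage_free(storage);
	return 0;
}
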
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
index a948b072c28f..e558b46596c4 100644
--- a/net/bpfilter/Kconfig
+++ b/net/bpfilter/Kconfig
@@ -1,6 +1,5 @@
1menuconfig BPFILTER 1menuconfig BPFILTER
2 bool "BPF based packet filtering framework (BPFILTER)" 2 bool "BPF based packet filtering framework (BPFILTER)"
3 default n
4 depends on NET && BPF && INET 3 depends on NET && BPF && INET
5 help 4 help
6 This builds experimental bpfilter framework that is aiming to 5 This builds experimental bpfilter framework that is aiming to
@@ -9,8 +8,8 @@ menuconfig BPFILTER
9if BPFILTER 8if BPFILTER
10config BPFILTER_UMH 9config BPFILTER_UMH
11 tristate "bpfilter kernel module with user mode helper" 10 tristate "bpfilter kernel module with user mode helper"
11 depends on $(success,$(srctree)/scripts/cc-can-link.sh $(CC))
12 default m 12 default m
13 help 13 help
14 This builds bpfilter kernel module with embedded user mode helper 14 This builds bpfilter kernel module with embedded user mode helper
15endif 15endif
16
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
index dd86b022eff0..0947ee7f70d5 100644
--- a/net/bpfilter/Makefile
+++ b/net/bpfilter/Makefile
@@ -5,30 +5,17 @@
5 5
6hostprogs-y := bpfilter_umh 6hostprogs-y := bpfilter_umh
7bpfilter_umh-objs := main.o 7bpfilter_umh-objs := main.o
8HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi 8KBUILD_HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi
9HOSTCC := $(CC) 9HOSTCC := $(CC)
10 10
11ifeq ($(CONFIG_BPFILTER_UMH), y) 11ifeq ($(CONFIG_BPFILTER_UMH), y)
12# builtin bpfilter_umh should be compiled with -static 12# builtin bpfilter_umh should be compiled with -static
13# since rootfs isn't mounted at the time of __init 13# since rootfs isn't mounted at the time of __init
14# function is called and do_execv won't find elf interpreter 14# function is called and do_execv won't find elf interpreter
15HOSTLDFLAGS += -static 15KBUILD_HOSTLDFLAGS += -static
16endif 16endif
17 17
18# a bit of elf magic to convert bpfilter_umh binary into a binary blob 18$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh
19# inside bpfilter_umh.o elf file referenced by
20# _binary_net_bpfilter_bpfilter_umh_start symbol
21# which bpfilter_kern.c passes further into umh blob loader at run-time
22quiet_cmd_copy_umh = GEN $@
23 cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
24 $(OBJCOPY) -I binary \
25 `LC_ALL=C objdump -f net/bpfilter/bpfilter_umh \
26 |awk -F' |,' '/file format/{print "-O",$$NF} \
27 /^architecture:/{print "-B",$$2}'` \
28 --rename-section .data=.init.rodata $< $@
29
30$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
31 $(call cmd,copy_umh)
32 19
33obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o 20obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
34bpfilter-objs += bpfilter_kern.o bpfilter_umh.o 21bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 09522573f611..f0fc182d3db7 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -10,11 +10,8 @@
10#include <linux/file.h> 10#include <linux/file.h>
11#include "msgfmt.h" 11#include "msgfmt.h"
12 12
13#define UMH_start _binary_net_bpfilter_bpfilter_umh_start 13extern char bpfilter_umh_start;
14#define UMH_end _binary_net_bpfilter_bpfilter_umh_end 14extern char bpfilter_umh_end;
15
16extern char UMH_start;
17extern char UMH_end;
18 15
19static struct umh_info info; 16static struct umh_info info;
20/* since ip_getsockopt() can run in parallel, serialize access to umh */ 17/* since ip_getsockopt() can run in parallel, serialize access to umh */
@@ -93,7 +90,9 @@ static int __init load_umh(void)
93 int err; 90 int err;
94 91
95 /* fork usermode process */ 92 /* fork usermode process */
96 err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info); 93 err = fork_usermode_blob(&bpfilter_umh_start,
94 &bpfilter_umh_end - &bpfilter_umh_start,
95 &info);
97 if (err) 96 if (err)
98 return err; 97 return err;
99 pr_info("Loaded bpfilter_umh pid %d\n", info.pid); 98 pr_info("Loaded bpfilter_umh pid %d\n", info.pid);
diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S
new file mode 100644
index 000000000000..40311d10d2f2
--- /dev/null
+++ b/net/bpfilter/bpfilter_umh_blob.S
@@ -0,0 +1,7 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2 .section .init.rodata, "a"
3 .global bpfilter_umh_start
4bpfilter_umh_start:
5 .incbin "net/bpfilter/bpfilter_umh"
6 .global bpfilter_umh_end
7bpfilter_umh_end:
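
The new bpfilter_umh_blob.S replaces the objcopy/awk post-processing: .incbin embeds the freshly built bpfilter_umh executable into .init.rodata, and the two global labels give C code portable start/end symbols regardless of architecture or object format. On the C side the blob is bracketed simply by taking the addresses of those labels, as load_umh() does above; a reduced sketch (bpfilter_umh_size() is a hypothetical helper):

/* Sketch only: consuming start/end labels emitted by an .incbin stub.
 * Symbol names match bpfilter_umh_blob.S.
 */
#include <linux/types.h>

extern char bpfilter_umh_start;
extern char bpfilter_umh_end;

static size_t bpfilter_umh_size(void)
{
	return &bpfilter_umh_end - &bpfilter_umh_start;
}
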
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 9019f326fe81..5372e2042adf 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -142,7 +142,20 @@ static int deliver_clone(const struct net_bridge_port *prev,
142void br_forward(const struct net_bridge_port *to, 142void br_forward(const struct net_bridge_port *to,
143 struct sk_buff *skb, bool local_rcv, bool local_orig) 143 struct sk_buff *skb, bool local_rcv, bool local_orig)
144{ 144{
145 if (to && should_deliver(to, skb)) { 145 if (unlikely(!to))
146 goto out;
147
148 /* redirect to backup link if the destination port is down */
149 if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) {
150 struct net_bridge_port *backup_port;
151
152 backup_port = rcu_dereference(to->backup_port);
153 if (unlikely(!backup_port))
154 goto out;
155 to = backup_port;
156 }
157
158 if (should_deliver(to, skb)) {
146 if (local_rcv) 159 if (local_rcv)
147 deliver_clone(to, skb, local_orig); 160 deliver_clone(to, skb, local_orig);
148 else 161 else
@@ -150,6 +163,7 @@ void br_forward(const struct net_bridge_port *to,
150 return; 163 return;
151 } 164 }
152 165
166out:
153 if (!local_rcv) 167 if (!local_rcv)
154 kfree_skb(skb); 168 kfree_skb(skb);
155} 169}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 05e42d86882d..0363f1bdc401 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -26,6 +26,7 @@
26#include <net/sock.h> 26#include <net/sock.h>
27#include <linux/if_vlan.h> 27#include <linux/if_vlan.h>
28#include <net/switchdev.h> 28#include <net/switchdev.h>
29#include <net/net_namespace.h>
29 30
30#include "br_private.h" 31#include "br_private.h"
31 32
@@ -169,6 +170,58 @@ void br_manage_promisc(struct net_bridge *br)
169 } 170 }
170} 171}
171 172
173int nbp_backup_change(struct net_bridge_port *p,
174 struct net_device *backup_dev)
175{
176 struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port);
177 struct net_bridge_port *backup_p = NULL;
178
179 ASSERT_RTNL();
180
181 if (backup_dev) {
182 if (!br_port_exists(backup_dev))
183 return -ENOENT;
184
185 backup_p = br_port_get_rtnl(backup_dev);
186 if (backup_p->br != p->br)
187 return -EINVAL;
188 }
189
190 if (p == backup_p)
191 return -EINVAL;
192
193 if (old_backup == backup_p)
194 return 0;
195
196 /* if the backup link is already set, clear it */
197 if (old_backup)
198 old_backup->backup_redirected_cnt--;
199
200 if (backup_p)
201 backup_p->backup_redirected_cnt++;
202 rcu_assign_pointer(p->backup_port, backup_p);
203
204 return 0;
205}
206
207static void nbp_backup_clear(struct net_bridge_port *p)
208{
209 nbp_backup_change(p, NULL);
210 if (p->backup_redirected_cnt) {
211 struct net_bridge_port *cur_p;
212
213 list_for_each_entry(cur_p, &p->br->port_list, list) {
214 struct net_bridge_port *backup_p;
215
216 backup_p = rtnl_dereference(cur_p->backup_port);
217 if (backup_p == p)
218 nbp_backup_change(cur_p, NULL);
219 }
220 }
221
222 WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt);
223}
224
172static void nbp_update_port_count(struct net_bridge *br) 225static void nbp_update_port_count(struct net_bridge *br)
173{ 226{
174 struct net_bridge_port *p; 227 struct net_bridge_port *p;
@@ -204,11 +257,19 @@ static void release_nbp(struct kobject *kobj)
204 kfree(p); 257 kfree(p);
205} 258}
206 259
260static void brport_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
261{
262 struct net_bridge_port *p = kobj_to_brport(kobj);
263
264 net_ns_get_ownership(dev_net(p->dev), uid, gid);
265}
266
207static struct kobj_type brport_ktype = { 267static struct kobj_type brport_ktype = {
208#ifdef CONFIG_SYSFS 268#ifdef CONFIG_SYSFS
209 .sysfs_ops = &brport_sysfs_ops, 269 .sysfs_ops = &brport_sysfs_ops,
210#endif 270#endif
211 .release = release_nbp, 271 .release = release_nbp,
272 .get_ownership = brport_get_ownership,
212}; 273};
213 274
214static void destroy_nbp(struct net_bridge_port *p) 275static void destroy_nbp(struct net_bridge_port *p)
@@ -286,6 +347,7 @@ static void del_nbp(struct net_bridge_port *p)
286 nbp_vlan_flush(p); 347 nbp_vlan_flush(p);
287 br_fdb_delete_by_port(br, p, 0, 1); 348 br_fdb_delete_by_port(br, p, 0, 1);
288 switchdev_deferred_process(); 349 switchdev_deferred_process();
350 nbp_backup_clear(p);
289 351
290 nbp_update_port_count(br); 352 nbp_update_port_count(br);
291 353
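
nbp_backup_change() is the RTNL-protected writer for the new per-port backup_port pointer; br_forward() and the netlink/sysfs dump paths read it under RCU, and backup_redirected_cnt lets nbp_backup_clear() unhook every port that still points at a port being deleted. The publish/read pattern, reduced to a sketch (kernel context, bridge-private types; helper names are illustrative only):

/* Sketch only: RCU-published backup-port pointer.
 * The writer runs under rtnl_lock(); the reader runs in the forwarding
 * path under rcu_read_lock() (implicit in the bridge RX path).
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include "br_private.h"

static void publish_backup(struct net_bridge_port *p,
			   struct net_bridge_port *new_backup)
{
	ASSERT_RTNL();
	rcu_assign_pointer(p->backup_port, new_backup);	/* may be NULL */
}

static struct net_bridge_port *pick_dst(struct net_bridge_port *to)
{
	if (netif_carrier_ok(to->dev))
		return to;

	/* NULL means "no backup configured": the caller drops the skb. */
	return rcu_dereference(to->backup_port);
}
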
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 920665dd92db..20ed7adcf1cc 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1423,10 +1423,10 @@ static void br_multicast_query_received(struct net_bridge *br,
1423 br_multicast_mark_router(br, port); 1423 br_multicast_mark_router(br, port);
1424} 1424}
1425 1425
1426static int br_ip4_multicast_query(struct net_bridge *br, 1426static void br_ip4_multicast_query(struct net_bridge *br,
1427 struct net_bridge_port *port, 1427 struct net_bridge_port *port,
1428 struct sk_buff *skb, 1428 struct sk_buff *skb,
1429 u16 vid) 1429 u16 vid)
1430{ 1430{
1431 const struct iphdr *iph = ip_hdr(skb); 1431 const struct iphdr *iph = ip_hdr(skb);
1432 struct igmphdr *ih = igmp_hdr(skb); 1432 struct igmphdr *ih = igmp_hdr(skb);
@@ -1439,7 +1439,6 @@ static int br_ip4_multicast_query(struct net_bridge *br,
1439 unsigned long now = jiffies; 1439 unsigned long now = jiffies;
1440 unsigned int offset = skb_transport_offset(skb); 1440 unsigned int offset = skb_transport_offset(skb);
1441 __be32 group; 1441 __be32 group;
1442 int err = 0;
1443 1442
1444 spin_lock(&br->multicast_lock); 1443 spin_lock(&br->multicast_lock);
1445 if (!netif_running(br->dev) || 1444 if (!netif_running(br->dev) ||
@@ -1498,7 +1497,6 @@ static int br_ip4_multicast_query(struct net_bridge *br,
1498 1497
1499out: 1498out:
1500 spin_unlock(&br->multicast_lock); 1499 spin_unlock(&br->multicast_lock);
1501 return err;
1502} 1500}
1503 1501
1504#if IS_ENABLED(CONFIG_IPV6) 1502#if IS_ENABLED(CONFIG_IPV6)
@@ -1828,7 +1826,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
1828 err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); 1826 err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
1829 break; 1827 break;
1830 case IGMP_HOST_MEMBERSHIP_QUERY: 1828 case IGMP_HOST_MEMBERSHIP_QUERY:
1831 err = br_ip4_multicast_query(br, port, skb_trimmed, vid); 1829 br_ip4_multicast_query(br, port, skb_trimmed, vid);
1832 break; 1830 break;
1833 case IGMP_HOST_LEAVE_MESSAGE: 1831 case IGMP_HOST_LEAVE_MESSAGE:
1834 br_ip4_multicast_leave_group(br, port, ih->group, vid, src); 1832 br_ip4_multicast_leave_group(br, port, ih->group, vid, src);
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 9b16eaf33819..6e0dc6bcd32a 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -26,6 +26,7 @@
26#include <linux/if_pppox.h> 26#include <linux/if_pppox.h>
27#include <linux/ppp_defs.h> 27#include <linux/ppp_defs.h>
28#include <linux/netfilter_bridge.h> 28#include <linux/netfilter_bridge.h>
29#include <uapi/linux/netfilter_bridge.h>
29#include <linux/netfilter_ipv4.h> 30#include <linux/netfilter_ipv4.h>
30#include <linux/netfilter_ipv6.h> 31#include <linux/netfilter_ipv6.h>
31#include <linux/netfilter_arp.h> 32#include <linux/netfilter_arp.h>
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 9f5eb05b0373..ec2b58a09f76 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -169,13 +169,15 @@ static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask)
169 + nla_total_size(1) /* IFLA_OPERSTATE */ 169 + nla_total_size(1) /* IFLA_OPERSTATE */
170 + nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */ 170 + nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */
171 + nla_total_size(br_get_link_af_size_filtered(dev, 171 + nla_total_size(br_get_link_af_size_filtered(dev,
172 filter_mask)); /* IFLA_AF_SPEC */ 172 filter_mask)) /* IFLA_AF_SPEC */
173 + nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */
173} 174}
174 175
175static int br_port_fill_attrs(struct sk_buff *skb, 176static int br_port_fill_attrs(struct sk_buff *skb,
176 const struct net_bridge_port *p) 177 const struct net_bridge_port *p)
177{ 178{
178 u8 mode = !!(p->flags & BR_HAIRPIN_MODE); 179 u8 mode = !!(p->flags & BR_HAIRPIN_MODE);
180 struct net_bridge_port *backup_p;
179 u64 timerval; 181 u64 timerval;
180 182
181 if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) || 183 if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) ||
@@ -237,6 +239,14 @@ static int br_port_fill_attrs(struct sk_buff *skb,
237 return -EMSGSIZE; 239 return -EMSGSIZE;
238#endif 240#endif
239 241
242 /* we might be called only with br->lock */
243 rcu_read_lock();
244 backup_p = rcu_dereference(p->backup_port);
245 if (backup_p)
246 nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT,
247 backup_p->dev->ifindex);
248 rcu_read_unlock();
249
240 return 0; 250 return 0;
241} 251}
242 252
@@ -663,6 +673,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
663 [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, 673 [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
664 [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, 674 [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
665 [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, 675 [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 },
676 [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
666}; 677};
667 678
668/* Change the state of the port and notify spanning tree */ 679/* Change the state of the port and notify spanning tree */
@@ -817,6 +828,23 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
817 if (err) 828 if (err)
818 return err; 829 return err;
819 830
831 if (tb[IFLA_BRPORT_BACKUP_PORT]) {
832 struct net_device *backup_dev = NULL;
833 u32 backup_ifindex;
834
835 backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]);
836 if (backup_ifindex) {
837 backup_dev = __dev_get_by_index(dev_net(p->dev),
838 backup_ifindex);
839 if (!backup_dev)
840 return -ENOENT;
841 }
842
843 err = nbp_backup_change(p, backup_dev);
844 if (err)
845 return err;
846 }
847
820 br_port_flags_change(p, old_flags ^ p->flags); 848 br_port_flags_change(p, old_flags ^ p->flags);
821 return 0; 849 return 0;
822} 850}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 5216a524b537..11ed2029985f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -237,6 +237,7 @@ struct net_bridge_port {
237#ifdef CONFIG_BRIDGE_VLAN_FILTERING 237#ifdef CONFIG_BRIDGE_VLAN_FILTERING
238 struct net_bridge_vlan_group __rcu *vlgrp; 238 struct net_bridge_vlan_group __rcu *vlgrp;
239#endif 239#endif
240 struct net_bridge_port __rcu *backup_port;
240 241
241 /* STP */ 242 /* STP */
242 u8 priority; 243 u8 priority;
@@ -281,8 +282,11 @@ struct net_bridge_port {
281 int offload_fwd_mark; 282 int offload_fwd_mark;
282#endif 283#endif
283 u16 group_fwd_mask; 284 u16 group_fwd_mask;
285 u16 backup_redirected_cnt;
284}; 286};
285 287
288#define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
289
286#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) 290#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
287#define br_promisc_port(p) ((p)->flags & BR_PROMISC) 291#define br_promisc_port(p) ((p)->flags & BR_PROMISC)
288 292
@@ -595,6 +599,7 @@ netdev_features_t br_features_recompute(struct net_bridge *br,
595 netdev_features_t features); 599 netdev_features_t features);
596void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); 600void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
597void br_manage_promisc(struct net_bridge *br); 601void br_manage_promisc(struct net_bridge *br);
602int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev);
598 603
599/* br_input.c */ 604/* br_input.c */
600int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); 605int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index f99c5bf5c906..7c87a2fe5248 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -25,6 +25,15 @@ struct brport_attribute {
25 struct attribute attr; 25 struct attribute attr;
26 ssize_t (*show)(struct net_bridge_port *, char *); 26 ssize_t (*show)(struct net_bridge_port *, char *);
27 int (*store)(struct net_bridge_port *, unsigned long); 27 int (*store)(struct net_bridge_port *, unsigned long);
28 int (*store_raw)(struct net_bridge_port *, char *);
29};
30
31#define BRPORT_ATTR_RAW(_name, _mode, _show, _store) \
32const struct brport_attribute brport_attr_##_name = { \
33 .attr = {.name = __stringify(_name), \
34 .mode = _mode }, \
35 .show = _show, \
36 .store_raw = _store, \
28}; 37};
29 38
30#define BRPORT_ATTR(_name, _mode, _show, _store) \ 39#define BRPORT_ATTR(_name, _mode, _show, _store) \
@@ -182,6 +191,38 @@ static int store_group_fwd_mask(struct net_bridge_port *p,
182static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask, 191static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask,
183 store_group_fwd_mask); 192 store_group_fwd_mask);
184 193
194static ssize_t show_backup_port(struct net_bridge_port *p, char *buf)
195{
196 struct net_bridge_port *backup_p;
197 int ret = 0;
198
199 rcu_read_lock();
200 backup_p = rcu_dereference(p->backup_port);
201 if (backup_p)
202 ret = sprintf(buf, "%s\n", backup_p->dev->name);
203 rcu_read_unlock();
204
205 return ret;
206}
207
208static int store_backup_port(struct net_bridge_port *p, char *buf)
209{
210 struct net_device *backup_dev = NULL;
211 char *nl = strchr(buf, '\n');
212
213 if (nl)
214 *nl = '\0';
215
216 if (strlen(buf) > 0) {
217 backup_dev = __dev_get_by_name(dev_net(p->dev), buf);
218 if (!backup_dev)
219 return -ENOENT;
220 }
221
222 return nbp_backup_change(p, backup_dev);
223}
224static BRPORT_ATTR_RAW(backup_port, 0644, show_backup_port, store_backup_port);
225
185BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); 226BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
186BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); 227BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD);
187BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); 228BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK);
@@ -245,17 +286,17 @@ static const struct brport_attribute *brport_attrs[] = {
245 &brport_attr_group_fwd_mask, 286 &brport_attr_group_fwd_mask,
246 &brport_attr_neigh_suppress, 287 &brport_attr_neigh_suppress,
247 &brport_attr_isolated, 288 &brport_attr_isolated,
289 &brport_attr_backup_port,
248 NULL 290 NULL
249}; 291};
250 292
251#define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr) 293#define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr)
252#define to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
253 294
254static ssize_t brport_show(struct kobject *kobj, 295static ssize_t brport_show(struct kobject *kobj,
255 struct attribute *attr, char *buf) 296 struct attribute *attr, char *buf)
256{ 297{
257 struct brport_attribute *brport_attr = to_brport_attr(attr); 298 struct brport_attribute *brport_attr = to_brport_attr(attr);
258 struct net_bridge_port *p = to_brport(kobj); 299 struct net_bridge_port *p = kobj_to_brport(kobj);
259 300
260 if (!brport_attr->show) 301 if (!brport_attr->show)
261 return -EINVAL; 302 return -EINVAL;
@@ -268,29 +309,48 @@ static ssize_t brport_store(struct kobject *kobj,
268 const char *buf, size_t count) 309 const char *buf, size_t count)
269{ 310{
270 struct brport_attribute *brport_attr = to_brport_attr(attr); 311 struct brport_attribute *brport_attr = to_brport_attr(attr);
271 struct net_bridge_port *p = to_brport(kobj); 312 struct net_bridge_port *p = kobj_to_brport(kobj);
272 ssize_t ret = -EINVAL; 313 ssize_t ret = -EINVAL;
273 char *endp;
274 unsigned long val; 314 unsigned long val;
315 char *endp;
275 316
276 if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN)) 317 if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN))
277 return -EPERM; 318 return -EPERM;
278 319
279 val = simple_strtoul(buf, &endp, 0); 320 if (!rtnl_trylock())
280 if (endp != buf) { 321 return restart_syscall();
281 if (!rtnl_trylock()) 322
282 return restart_syscall(); 323 if (!p->dev || !p->br)
283 if (p->dev && p->br && brport_attr->store) { 324 goto out_unlock;
284 spin_lock_bh(&p->br->lock); 325
285 ret = brport_attr->store(p, val); 326 if (brport_attr->store_raw) {
286 spin_unlock_bh(&p->br->lock); 327 char *buf_copy;
287 if (!ret) { 328
288 br_ifinfo_notify(RTM_NEWLINK, NULL, p); 329 buf_copy = kstrndup(buf, count, GFP_KERNEL);
289 ret = count; 330 if (!buf_copy) {
290 } 331 ret = -ENOMEM;
332 goto out_unlock;
291 } 333 }
292 rtnl_unlock(); 334 spin_lock_bh(&p->br->lock);
335 ret = brport_attr->store_raw(p, buf_copy);
336 spin_unlock_bh(&p->br->lock);
337 kfree(buf_copy);
338 } else if (brport_attr->store) {
339 val = simple_strtoul(buf, &endp, 0);
340 if (endp == buf)
341 goto out_unlock;
342 spin_lock_bh(&p->br->lock);
343 ret = brport_attr->store(p, val);
344 spin_unlock_bh(&p->br->lock);
293 } 345 }
346
347 if (!ret) {
348 br_ifinfo_notify(RTM_NEWLINK, NULL, p);
349 ret = count;
350 }
351out_unlock:
352 rtnl_unlock();
353
294 return ret; 354 return ret;
295} 355}
296 356
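
Because backup_port uses the new store_raw hook, the sysfs attribute takes a device name rather than a number, and an empty write clears the setting. A small userspace sketch, assuming a bridge port eth0 whose backup should be eth1 (paths and interface names are examples only):

/* Sketch only: set eth1 as the backup port for bridge port eth0 via sysfs.
 * Error handling kept minimal.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *attr = "/sys/class/net/eth0/brport/backup_port";
	const char *backup = "eth1\n";
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, backup, strlen(backup)) < 0)
		perror("write");
	close(fd);
	return 0;
}
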
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index c41da5fac84f..550324c516ee 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/netfilter_bridge/ebtables.h> 11#include <linux/netfilter_bridge/ebtables.h>
12#include <uapi/linux/netfilter_bridge.h>
12#include <linux/module.h> 13#include <linux/module.h>
13 14
14#define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \ 15#define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 08df7406ecb3..c0fb3ca518af 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/netfilter_bridge/ebtables.h> 11#include <linux/netfilter_bridge/ebtables.h>
12#include <uapi/linux/netfilter_bridge.h>
12#include <linux/module.h> 13#include <linux/module.h>
13 14
14#define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \ 15#define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 6de981270566..08cbed7d940e 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -89,8 +89,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net,
89 niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, 89 niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
90 net->ipv4.sysctl_ip_default_ttl); 90 net->ipv4.sysctl_ip_default_ttl);
91 nf_reject_ip_tcphdr_put(nskb, oldskb, oth); 91 nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
92 niph->ttl = net->ipv4.sysctl_ip_default_ttl; 92 niph->tot_len = htons(nskb->len);
93 niph->tot_len = htons(nskb->len);
94 ip_send_check(niph); 93 ip_send_check(niph);
95 94
96 nft_reject_br_push_etherhdr(oldskb, nskb); 95 nft_reject_br_push_etherhdr(oldskb, nskb);
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index e0adcd123f48..711d7156efd8 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -131,8 +131,10 @@ static void caif_flow_cb(struct sk_buff *skb)
131 caifd = caif_get(skb->dev); 131 caifd = caif_get(skb->dev);
132 132
133 WARN_ON(caifd == NULL); 133 WARN_ON(caifd == NULL);
134 if (caifd == NULL) 134 if (!caifd) {
135 rcu_read_unlock();
135 return; 136 return;
137 }
136 138
137 caifd_hold(caifd); 139 caifd_hold(caifd);
138 rcu_read_unlock(); 140 rcu_read_unlock();
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index c7991867d622..d18965f3291f 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -934,11 +934,15 @@ static int caif_release(struct socket *sock)
934} 934}
935 935
936/* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ 936/* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */
937static __poll_t caif_poll_mask(struct socket *sock, __poll_t events) 937static __poll_t caif_poll(struct file *file,
938 struct socket *sock, poll_table *wait)
938{ 939{
939 struct sock *sk = sock->sk; 940 struct sock *sk = sock->sk;
941 __poll_t mask;
940 struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); 942 struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
941 __poll_t mask = 0; 943
944 sock_poll_wait(file, wait);
945 mask = 0;
942 946
943 /* exceptional events? */ 947 /* exceptional events? */
944 if (sk->sk_err) 948 if (sk->sk_err)
@@ -972,7 +976,7 @@ static const struct proto_ops caif_seqpacket_ops = {
972 .socketpair = sock_no_socketpair, 976 .socketpair = sock_no_socketpair,
973 .accept = sock_no_accept, 977 .accept = sock_no_accept,
974 .getname = sock_no_getname, 978 .getname = sock_no_getname,
975 .poll_mask = caif_poll_mask, 979 .poll = caif_poll,
976 .ioctl = sock_no_ioctl, 980 .ioctl = sock_no_ioctl,
977 .listen = sock_no_listen, 981 .listen = sock_no_listen,
978 .shutdown = sock_no_shutdown, 982 .shutdown = sock_no_shutdown,
@@ -993,7 +997,7 @@ static const struct proto_ops caif_stream_ops = {
993 .socketpair = sock_no_socketpair, 997 .socketpair = sock_no_socketpair,
994 .accept = sock_no_accept, 998 .accept = sock_no_accept,
995 .getname = sock_no_getname, 999 .getname = sock_no_getname,
996 .poll_mask = caif_poll_mask, 1000 .poll = caif_poll,
997 .ioctl = sock_no_ioctl, 1001 .ioctl = sock_no_ioctl,
998 .listen = sock_no_listen, 1002 .listen = sock_no_listen,
999 .shutdown = sock_no_shutdown, 1003 .shutdown = sock_no_shutdown,
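
The .poll_mask to .poll conversions in this merge restore the classic proto_ops poll interface: the handler takes the file and poll_table again and must call sock_poll_wait() itself before computing the ready mask, as caif_poll() does above. The minimal shape of such a handler, as a sketch with a hypothetical protocol name:

/* Sketch only: classic proto_ops ->poll handler shape after the revert. */
#include <linux/poll.h>
#include <net/sock.h>

static __poll_t demo_sock_poll(struct file *file, struct socket *sock,
			       poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;

	sock_poll_wait(file, wait);		/* register for wakeups */

	if (sk->sk_err)
		mask |= EPOLLERR;
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sock_writeable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM;

	return mask;
}
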
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 9393f25df08d..0af8f0db892a 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1660,7 +1660,7 @@ static const struct proto_ops bcm_ops = {
1660 .socketpair = sock_no_socketpair, 1660 .socketpair = sock_no_socketpair,
1661 .accept = sock_no_accept, 1661 .accept = sock_no_accept,
1662 .getname = sock_no_getname, 1662 .getname = sock_no_getname,
1663 .poll_mask = datagram_poll_mask, 1663 .poll = datagram_poll,
1664 .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ 1664 .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
1665 .listen = sock_no_listen, 1665 .listen = sock_no_listen,
1666 .shutdown = sock_no_shutdown, 1666 .shutdown = sock_no_shutdown,
diff --git a/net/can/raw.c b/net/can/raw.c
index fd7e2f49ea6a..1051eee82581 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -843,7 +843,7 @@ static const struct proto_ops raw_ops = {
843 .socketpair = sock_no_socketpair, 843 .socketpair = sock_no_socketpair,
844 .accept = sock_no_accept, 844 .accept = sock_no_accept,
845 .getname = raw_getname, 845 .getname = raw_getname,
846 .poll_mask = datagram_poll_mask, 846 .poll = datagram_poll,
847 .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ 847 .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
848 .listen = sock_no_listen, 848 .listen = sock_no_listen,
849 .shutdown = sock_no_shutdown, 849 .shutdown = sock_no_shutdown,
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
index f8cceb99e732..cd2d5b9301a1 100644
--- a/net/ceph/Kconfig
+++ b/net/ceph/Kconfig
@@ -41,4 +41,3 @@ config CEPH_LIB_USE_DNS_RESOLVER
41 Documentation/networking/dns_resolver.txt 41 Documentation/networking/dns_resolver.txt
42 42
43 If unsure, say N. 43 If unsure, say N.
44
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index 12bf49772d24..db09defe27d0 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -15,4 +15,3 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
15 auth_x.o \ 15 auth_x.o \
16 ceph_fs.o ceph_strings.o ceph_hash.o \ 16 ceph_fs.o ceph_strings.o ceph_hash.o \
17 pagevec.o snapshot.o string_table.o 17 pagevec.o snapshot.o string_table.o
18
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index dbde2b3c3c15..fbeee068ea14 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -315,6 +315,22 @@ int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
315} 315}
316EXPORT_SYMBOL(ceph_auth_update_authorizer); 316EXPORT_SYMBOL(ceph_auth_update_authorizer);
317 317
318int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
319 struct ceph_authorizer *a,
320 void *challenge_buf,
321 int challenge_buf_len)
322{
323 int ret = 0;
324
325 mutex_lock(&ac->mutex);
326 if (ac->ops && ac->ops->add_authorizer_challenge)
327 ret = ac->ops->add_authorizer_challenge(ac, a, challenge_buf,
328 challenge_buf_len);
329 mutex_unlock(&ac->mutex);
330 return ret;
331}
332EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge);
333
318int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, 334int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
319 struct ceph_authorizer *a) 335 struct ceph_authorizer *a)
320{ 336{
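
ceph_auth_add_authorizer_challenge() is the new generic entry point: when a peer answers an authorizer with a cephx challenge instead of accepting it, the connection code passes the challenge blob back so the auth backend can fold it into a re-encrypted authorizer (the cephx implementation follows later in this diff). A hedged sketch of a caller; handle_auth_challenge() is a hypothetical wrapper, and the buffer and length are assumed to come straight off the wire:

/* Sketch only: feed a server challenge back into the authorizer and retry.
 * handle_auth_challenge() is not a libceph API; it just shows the call shape.
 */
#include <linux/ceph/auth.h>

static int handle_auth_challenge(struct ceph_auth_client *ac,
				 struct ceph_auth_handshake *auth,
				 void *challenge_buf, int challenge_buf_len)
{
	int ret;

	ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer,
						 challenge_buf,
						 challenge_buf_len);
	if (ret)
		return ret;

	/* The (now re-encrypted) authorizer buffer is resent as-is. */
	return 0;
}
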
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index 41d2a0c72236..edb7042479ed 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -142,4 +142,3 @@ int ceph_auth_none_init(struct ceph_auth_client *ac)
142 ac->ops = &ceph_auth_none_ops; 142 ac->ops = &ceph_auth_none_ops;
143 return 0; 143 return 0;
144} 144}
145
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
index 860ed9875791..4158f064302e 100644
--- a/net/ceph/auth_none.h
+++ b/net/ceph/auth_none.h
@@ -26,4 +26,3 @@ struct ceph_auth_none_info {
26int ceph_auth_none_init(struct ceph_auth_client *ac); 26int ceph_auth_none_init(struct ceph_auth_client *ac);
27 27
28#endif 28#endif
29
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 2f4a1baf5f52..b52732337ca6 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/ceph/decode.h> 10#include <linux/ceph/decode.h>
11#include <linux/ceph/auth.h> 11#include <linux/ceph/auth.h>
12#include <linux/ceph/ceph_features.h>
12#include <linux/ceph/libceph.h> 13#include <linux/ceph/libceph.h>
13#include <linux/ceph/messenger.h> 14#include <linux/ceph/messenger.h>
14 15
@@ -70,25 +71,40 @@ static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf,
70 return sizeof(u32) + ciphertext_len; 71 return sizeof(u32) + ciphertext_len;
71} 72}
72 73
74static int __ceph_x_decrypt(struct ceph_crypto_key *secret, void *p,
75 int ciphertext_len)
76{
77 struct ceph_x_encrypt_header *hdr = p;
78 int plaintext_len;
79 int ret;
80
81 ret = ceph_crypt(secret, false, p, ciphertext_len, ciphertext_len,
82 &plaintext_len);
83 if (ret)
84 return ret;
85
86 if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) {
87 pr_err("%s bad magic\n", __func__);
88 return -EINVAL;
89 }
90
91 return plaintext_len - sizeof(*hdr);
92}
93
73static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end) 94static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end)
74{ 95{
75 struct ceph_x_encrypt_header *hdr = *p + sizeof(u32); 96 int ciphertext_len;
76 int ciphertext_len, plaintext_len;
77 int ret; 97 int ret;
78 98
79 ceph_decode_32_safe(p, end, ciphertext_len, e_inval); 99 ceph_decode_32_safe(p, end, ciphertext_len, e_inval);
80 ceph_decode_need(p, end, ciphertext_len, e_inval); 100 ceph_decode_need(p, end, ciphertext_len, e_inval);
81 101
82 ret = ceph_crypt(secret, false, *p, end - *p, ciphertext_len, 102 ret = __ceph_x_decrypt(secret, *p, ciphertext_len);
83 &plaintext_len); 103 if (ret < 0)
84 if (ret)
85 return ret; 104 return ret;
86 105
87 if (hdr->struct_v != 1 || le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC)
88 return -EPERM;
89
90 *p += ciphertext_len; 106 *p += ciphertext_len;
91 return plaintext_len - sizeof(struct ceph_x_encrypt_header); 107 return ret;
92 108
93e_inval: 109e_inval:
94 return -EINVAL; 110 return -EINVAL;
@@ -149,12 +165,12 @@ static int process_one_ticket(struct ceph_auth_client *ac,
149 void *dp, *dend; 165 void *dp, *dend;
150 int dlen; 166 int dlen;
151 char is_enc; 167 char is_enc;
152 struct timespec validity; 168 struct timespec64 validity;
153 void *tp, *tpend; 169 void *tp, *tpend;
154 void **ptp; 170 void **ptp;
155 struct ceph_crypto_key new_session_key = { 0 }; 171 struct ceph_crypto_key new_session_key = { 0 };
156 struct ceph_buffer *new_ticket_blob; 172 struct ceph_buffer *new_ticket_blob;
157 unsigned long new_expires, new_renew_after; 173 time64_t new_expires, new_renew_after;
158 u64 new_secret_id; 174 u64 new_secret_id;
159 int ret; 175 int ret;
160 176
@@ -189,11 +205,11 @@ static int process_one_ticket(struct ceph_auth_client *ac,
189 if (ret) 205 if (ret)
190 goto out; 206 goto out;
191 207
192 ceph_decode_timespec(&validity, dp); 208 ceph_decode_timespec64(&validity, dp);
193 dp += sizeof(struct ceph_timespec); 209 dp += sizeof(struct ceph_timespec);
194 new_expires = get_seconds() + validity.tv_sec; 210 new_expires = ktime_get_real_seconds() + validity.tv_sec;
195 new_renew_after = new_expires - (validity.tv_sec / 4); 211 new_renew_after = new_expires - (validity.tv_sec / 4);
196 dout(" expires=%lu renew_after=%lu\n", new_expires, 212 dout(" expires=%llu renew_after=%llu\n", new_expires,
197 new_renew_after); 213 new_renew_after);
198 214
199 /* ticket blob for service */ 215 /* ticket blob for service */
@@ -275,6 +291,51 @@ bad:
275 return -EINVAL; 291 return -EINVAL;
276} 292}
277 293
294/*
295 * Encode and encrypt the second part (ceph_x_authorize_b) of the
296 * authorizer. The first part (ceph_x_authorize_a) should already be
297 * encoded.
298 */
299static int encrypt_authorizer(struct ceph_x_authorizer *au,
300 u64 *server_challenge)
301{
302 struct ceph_x_authorize_a *msg_a;
303 struct ceph_x_authorize_b *msg_b;
304 void *p, *end;
305 int ret;
306
307 msg_a = au->buf->vec.iov_base;
308 WARN_ON(msg_a->ticket_blob.secret_id != cpu_to_le64(au->secret_id));
309 p = (void *)(msg_a + 1) + le32_to_cpu(msg_a->ticket_blob.blob_len);
310 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
311
312 msg_b = p + ceph_x_encrypt_offset();
313 msg_b->struct_v = 2;
314 msg_b->nonce = cpu_to_le64(au->nonce);
315 if (server_challenge) {
316 msg_b->have_challenge = 1;
317 msg_b->server_challenge_plus_one =
318 cpu_to_le64(*server_challenge + 1);
319 } else {
320 msg_b->have_challenge = 0;
321 msg_b->server_challenge_plus_one = 0;
322 }
323
324 ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b));
325 if (ret < 0)
326 return ret;
327
328 p += ret;
329 if (server_challenge) {
330 WARN_ON(p != end);
331 } else {
332 WARN_ON(p > end);
333 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
334 }
335
336 return 0;
337}
338
278static void ceph_x_authorizer_cleanup(struct ceph_x_authorizer *au) 339static void ceph_x_authorizer_cleanup(struct ceph_x_authorizer *au)
279{ 340{
280 ceph_crypto_key_destroy(&au->session_key); 341 ceph_crypto_key_destroy(&au->session_key);
@@ -291,7 +352,6 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
291 int maxlen; 352 int maxlen;
292 struct ceph_x_authorize_a *msg_a; 353 struct ceph_x_authorize_a *msg_a;
293 struct ceph_x_authorize_b *msg_b; 354 struct ceph_x_authorize_b *msg_b;
294 void *p, *end;
295 int ret; 355 int ret;
296 int ticket_blob_len = 356 int ticket_blob_len =
297 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0); 357 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
@@ -335,21 +395,13 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
335 dout(" th %p secret_id %lld %lld\n", th, th->secret_id, 395 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
336 le64_to_cpu(msg_a->ticket_blob.secret_id)); 396 le64_to_cpu(msg_a->ticket_blob.secret_id));
337 397
338 p = msg_a + 1;
339 p += ticket_blob_len;
340 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
341
342 msg_b = p + ceph_x_encrypt_offset();
343 msg_b->struct_v = 1;
344 get_random_bytes(&au->nonce, sizeof(au->nonce)); 398 get_random_bytes(&au->nonce, sizeof(au->nonce));
345 msg_b->nonce = cpu_to_le64(au->nonce); 399 ret = encrypt_authorizer(au, NULL);
346 ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b)); 400 if (ret) {
347 if (ret < 0) 401 pr_err("failed to encrypt authorizer: %d", ret);
348 goto out_au; 402 goto out_au;
403 }
349 404
350 p += ret;
351 WARN_ON(p > end);
352 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
353 dout(" built authorizer nonce %llx len %d\n", au->nonce, 405 dout(" built authorizer nonce %llx len %d\n", au->nonce,
354 (int)au->buf->vec.iov_len); 406 (int)au->buf->vec.iov_len);
355 return 0; 407 return 0;
@@ -385,13 +437,13 @@ static bool need_key(struct ceph_x_ticket_handler *th)
385 if (!th->have_key) 437 if (!th->have_key)
386 return true; 438 return true;
387 439
388 return get_seconds() >= th->renew_after; 440 return ktime_get_real_seconds() >= th->renew_after;
389} 441}
390 442
391static bool have_key(struct ceph_x_ticket_handler *th) 443static bool have_key(struct ceph_x_ticket_handler *th)
392{ 444{
393 if (th->have_key) { 445 if (th->have_key) {
394 if (get_seconds() >= th->expires) 446 if (ktime_get_real_seconds() >= th->expires)
395 th->have_key = false; 447 th->have_key = false;
396 } 448 }
397 449
@@ -626,6 +678,54 @@ static int ceph_x_update_authorizer(
626 return 0; 678 return 0;
627} 679}
628 680
681static int decrypt_authorize_challenge(struct ceph_x_authorizer *au,
682 void *challenge_buf,
683 int challenge_buf_len,
684 u64 *server_challenge)
685{
686 struct ceph_x_authorize_challenge *ch =
687 challenge_buf + sizeof(struct ceph_x_encrypt_header);
688 int ret;
689
690 /* no leading len */
691 ret = __ceph_x_decrypt(&au->session_key, challenge_buf,
692 challenge_buf_len);
693 if (ret < 0)
694 return ret;
695 if (ret < sizeof(*ch)) {
696 pr_err("bad size %d for ceph_x_authorize_challenge\n", ret);
697 return -EINVAL;
698 }
699
700 *server_challenge = le64_to_cpu(ch->server_challenge);
701 return 0;
702}
703
704static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac,
705 struct ceph_authorizer *a,
706 void *challenge_buf,
707 int challenge_buf_len)
708{
709 struct ceph_x_authorizer *au = (void *)a;
710 u64 server_challenge;
711 int ret;
712
713 ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len,
714 &server_challenge);
715 if (ret) {
716 pr_err("failed to decrypt authorize challenge: %d", ret);
717 return ret;
718 }
719
720 ret = encrypt_authorizer(au, &server_challenge);
721 if (ret) {
722 pr_err("failed to encrypt authorizer w/ challenge: %d", ret);
723 return ret;
724 }
725
726 return 0;
727}
728
629static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, 729static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
630 struct ceph_authorizer *a) 730 struct ceph_authorizer *a)
631{ 731{
@@ -637,8 +737,10 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
637 ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN); 737 ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN);
638 if (ret < 0) 738 if (ret < 0)
639 return ret; 739 return ret;
640 if (ret != sizeof(*reply)) 740 if (ret < sizeof(*reply)) {
641 return -EPERM; 741 pr_err("bad size %d for ceph_x_authorize_reply\n", ret);
742 return -EINVAL;
743 }
642 744
643 if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one)) 745 if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one))
644 ret = -EPERM; 746 ret = -EPERM;
@@ -704,26 +806,64 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg,
704 __le64 *psig) 806 __le64 *psig)
705{ 807{
706 void *enc_buf = au->enc_buf; 808 void *enc_buf = au->enc_buf;
707 struct {
708 __le32 len;
709 __le32 header_crc;
710 __le32 front_crc;
711 __le32 middle_crc;
712 __le32 data_crc;
713 } __packed *sigblock = enc_buf + ceph_x_encrypt_offset();
714 int ret; 809 int ret;
715 810
716 sigblock->len = cpu_to_le32(4*sizeof(u32)); 811 if (!CEPH_HAVE_FEATURE(msg->con->peer_features, CEPHX_V2)) {
717 sigblock->header_crc = msg->hdr.crc; 812 struct {
718 sigblock->front_crc = msg->footer.front_crc; 813 __le32 len;
719 sigblock->middle_crc = msg->footer.middle_crc; 814 __le32 header_crc;
720 sigblock->data_crc = msg->footer.data_crc; 815 __le32 front_crc;
721 ret = ceph_x_encrypt(&au->session_key, enc_buf, CEPHX_AU_ENC_BUF_LEN, 816 __le32 middle_crc;
722 sizeof(*sigblock)); 817 __le32 data_crc;
723 if (ret < 0) 818 } __packed *sigblock = enc_buf + ceph_x_encrypt_offset();
724 return ret; 819
820 sigblock->len = cpu_to_le32(4*sizeof(u32));
821 sigblock->header_crc = msg->hdr.crc;
822 sigblock->front_crc = msg->footer.front_crc;
823 sigblock->middle_crc = msg->footer.middle_crc;
824 sigblock->data_crc = msg->footer.data_crc;
825
826 ret = ceph_x_encrypt(&au->session_key, enc_buf,
827 CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock));
828 if (ret < 0)
829 return ret;
830
831 *psig = *(__le64 *)(enc_buf + sizeof(u32));
832 } else {
833 struct {
834 __le32 header_crc;
835 __le32 front_crc;
836 __le32 front_len;
837 __le32 middle_crc;
838 __le32 middle_len;
839 __le32 data_crc;
840 __le32 data_len;
841 __le32 seq_lower_word;
842 } __packed *sigblock = enc_buf;
843 struct {
844 __le64 a, b, c, d;
845 } __packed *penc = enc_buf;
846 int ciphertext_len;
847
848 sigblock->header_crc = msg->hdr.crc;
849 sigblock->front_crc = msg->footer.front_crc;
850 sigblock->front_len = msg->hdr.front_len;
851 sigblock->middle_crc = msg->footer.middle_crc;
852 sigblock->middle_len = msg->hdr.middle_len;
853 sigblock->data_crc = msg->footer.data_crc;
854 sigblock->data_len = msg->hdr.data_len;
855 sigblock->seq_lower_word = *(__le32 *)&msg->hdr.seq;
856
857 /* no leading len, no ceph_x_encrypt_header */
858 ret = ceph_crypt(&au->session_key, true, enc_buf,
859 CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock),
860 &ciphertext_len);
861 if (ret)
862 return ret;
863
864 *psig = penc->a ^ penc->b ^ penc->c ^ penc->d;
865 }
725 866
726 *psig = *(__le64 *)(enc_buf + sizeof(u32));
727 return 0; 867 return 0;
728} 868}
729 869
@@ -778,6 +918,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = {
778 .handle_reply = ceph_x_handle_reply, 918 .handle_reply = ceph_x_handle_reply,
779 .create_authorizer = ceph_x_create_authorizer, 919 .create_authorizer = ceph_x_create_authorizer,
780 .update_authorizer = ceph_x_update_authorizer, 920 .update_authorizer = ceph_x_update_authorizer,
921 .add_authorizer_challenge = ceph_x_add_authorizer_challenge,
781 .verify_authorizer_reply = ceph_x_verify_authorizer_reply, 922 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
782 .invalidate_authorizer = ceph_x_invalidate_authorizer, 923 .invalidate_authorizer = ceph_x_invalidate_authorizer,
783 .reset = ceph_x_reset, 924 .reset = ceph_x_reset,
@@ -823,5 +964,3 @@ out_nomem:
823out: 964out:
824 return ret; 965 return ret;
825} 966}
826
827
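
Side note on the CEPHX_V2 branch of calc_signature() above: with no length prefix and no ceph_x_encrypt_header, the first 32 bytes of ciphertext are folded into the 64-bit message signature by XOR-ing the four little-endian words (the "penc->a ^ penc->b ^ penc->c ^ penc->d" step). A stand-alone restatement of that fold, for illustration only (the helper name is made up, not part of the patch):

/* Fold the first 32 bytes of cephx v2 ciphertext into the signature,
 * mirroring the XOR step in calc_signature() above. */
static __le64 demo_cephx_v2_fold_sig(const void *ciphertext)
{
        const __le64 *q = ciphertext;   /* at least 32 bytes assumed */

        return q[0] ^ q[1] ^ q[2] ^ q[3];
}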
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index 454cb54568af..c03735f96df9 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -22,7 +22,7 @@ struct ceph_x_ticket_handler {
22 u64 secret_id; 22 u64 secret_id;
23 struct ceph_buffer *ticket_blob; 23 struct ceph_buffer *ticket_blob;
24 24
25 unsigned long renew_after, expires; 25 time64_t renew_after, expires;
26}; 26};
27 27
28#define CEPHX_AU_ENC_BUF_LEN 128 /* big enough for encrypted blob */ 28#define CEPHX_AU_ENC_BUF_LEN 128 /* big enough for encrypted blob */
@@ -52,4 +52,3 @@ struct ceph_x_info {
52int ceph_x_init(struct ceph_auth_client *ac); 52int ceph_x_init(struct ceph_auth_client *ac);
53 53
54#endif 54#endif
55
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
index 32c13d763b9a..24b0b74564d0 100644
--- a/net/ceph/auth_x_protocol.h
+++ b/net/ceph/auth_x_protocol.h
@@ -70,6 +70,13 @@ struct ceph_x_authorize_a {
70struct ceph_x_authorize_b { 70struct ceph_x_authorize_b {
71 __u8 struct_v; 71 __u8 struct_v;
72 __le64 nonce; 72 __le64 nonce;
73 __u8 have_challenge;
74 __le64 server_challenge_plus_one;
75} __attribute__ ((packed));
76
77struct ceph_x_authorize_challenge {
78 __u8 struct_v;
79 __le64 server_challenge;
73} __attribute__ ((packed)); 80} __attribute__ ((packed));
74 81
75struct ceph_x_authorize_reply { 82struct ceph_x_authorize_reply {
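
For reference, the two new ceph_x_authorize_b fields are presumably filled as below when the client answers a server challenge; this is an assumption based on the field names and on decrypt_authorize_challenge()/encrypt_authorizer() above, not code lifted from the patch:

/* Hypothetical helper: populate authorize_b when replying to a
 * CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER challenge. */
static void demo_fill_authorize_b(struct ceph_x_authorize_b *b,
                                  u64 nonce, u64 server_challenge)
{
        b->struct_v = 1;
        b->nonce = cpu_to_le64(nonce);
        b->have_challenge = 1;
        b->server_challenge_plus_one = cpu_to_le64(server_challenge + 1);
}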
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 584fdbef2088..87afb9ec4c68 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -304,7 +304,7 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) {
304 struct ceph_crypto_key *ckey; 304 struct ceph_crypto_key *ckey;
305 305
306 ukey = request_key(&key_type_ceph, name, NULL); 306 ukey = request_key(&key_type_ceph, name, NULL);
307 if (!ukey || IS_ERR(ukey)) { 307 if (IS_ERR(ukey)) {
308 /* request_key errors don't map nicely to mount(2) 308 /* request_key errors don't map nicely to mount(2)
309 errors; don't even try, but still printk */ 309 errors; don't even try, but still printk */
310 key_err = PTR_ERR(ukey); 310 key_err = PTR_ERR(ukey);
@@ -379,7 +379,7 @@ ceph_parse_options(char *options, const char *dev_name,
379 379
380 /* parse mount options */ 380 /* parse mount options */
381 while ((c = strsep(&options, ",")) != NULL) { 381 while ((c = strsep(&options, ",")) != NULL) {
382 int token, intval, ret; 382 int token, intval;
383 if (!*c) 383 if (!*c)
384 continue; 384 continue;
385 err = -EINVAL; 385 err = -EINVAL;
@@ -394,11 +394,10 @@ ceph_parse_options(char *options, const char *dev_name,
394 continue; 394 continue;
395 } 395 }
396 if (token < Opt_last_int) { 396 if (token < Opt_last_int) {
397 ret = match_int(&argstr[0], &intval); 397 err = match_int(&argstr[0], &intval);
398 if (ret < 0) { 398 if (err < 0) {
399 pr_err("bad mount option arg (not int) " 399 pr_err("bad option arg (not int) at '%s'\n", c);
400 "at '%s'\n", c); 400 goto out;
401 continue;
402 } 401 }
403 dout("got int token %d val %d\n", token, intval); 402 dout("got int token %d val %d\n", token, intval);
404 } else if (token > Opt_last_int && token < Opt_last_string) { 403 } else if (token > Opt_last_int && token < Opt_last_string) {
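
The hunk above makes a bad integer argument fatal instead of silently skipped. A minimal sketch of the strsep()/match_token()/match_int() pattern used by ceph_parse_options(), with a hypothetical option table (needs <linux/parser.h> and <linux/string.h>):

enum { Opt_demo_retries, Opt_demo_err };

static const match_table_t demo_tokens = {
        { Opt_demo_retries, "retries=%d" },
        { Opt_demo_err, NULL }
};

static int demo_parse(char *options, int *retries)
{
        substring_t argstr[MAX_OPT_ARGS];
        char *c;

        while ((c = strsep(&options, ",")) != NULL) {
                int token, intval;

                if (!*c)
                        continue;

                token = match_token(c, demo_tokens, argstr);
                if (token == Opt_demo_retries) {
                        /* a non-integer argument is now a hard error,
                         * mirroring the change above */
                        if (match_int(&argstr[0], &intval) < 0)
                                return -EINVAL;
                        *retries = intval;
                }
        }
        return 0;
}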
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index 8d2032b2f225..2105a6eaa66c 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -32,7 +32,7 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
32 int desc_len = strlen(desc); 32 int desc_len = strlen(desc);
33 void *p, *end; 33 void *p, *end;
34 struct page *lock_op_page; 34 struct page *lock_op_page;
35 struct timespec mtime; 35 struct timespec64 mtime;
36 int ret; 36 int ret;
37 37
38 lock_op_buf_size = name_len + sizeof(__le32) + 38 lock_op_buf_size = name_len + sizeof(__le32) +
@@ -63,7 +63,7 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
63 ceph_encode_string(&p, end, desc, desc_len); 63 ceph_encode_string(&p, end, desc, desc_len);
64 /* only support infinite duration */ 64 /* only support infinite duration */
65 memset(&mtime, 0, sizeof(mtime)); 65 memset(&mtime, 0, sizeof(mtime));
66 ceph_encode_timespec(p, &mtime); 66 ceph_encode_timespec64(p, &mtime);
67 p += sizeof(struct ceph_timespec); 67 p += sizeof(struct ceph_timespec);
68 ceph_encode_8(&p, flags); 68 ceph_encode_8(&p, flags);
69 69
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 417df675c71b..3f323ed9df52 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -514,7 +514,7 @@ static int crush_choose_firstn(const struct crush_map *map,
514 in, work->work[-1-in->id], 514 in, work->work[-1-in->id],
515 x, r, 515 x, r,
516 (choose_args ? 516 (choose_args ?
517 &choose_args[-1-in->id] : 0), 517 &choose_args[-1-in->id] : NULL),
518 outpos); 518 outpos);
519 if (item >= map->max_devices) { 519 if (item >= map->max_devices) {
520 dprintk(" bad item %d\n", item); 520 dprintk(" bad item %d\n", item);
@@ -725,7 +725,7 @@ static void crush_choose_indep(const struct crush_map *map,
725 in, work->work[-1-in->id], 725 in, work->work[-1-in->id],
726 x, r, 726 x, r,
727 (choose_args ? 727 (choose_args ?
728 &choose_args[-1-in->id] : 0), 728 &choose_args[-1-in->id] : NULL),
729 outpos); 729 outpos);
730 if (item >= map->max_devices) { 730 if (item >= map->max_devices) {
731 dprintk(" bad item %d\n", item); 731 dprintk(" bad item %d\n", item);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index c6413c360771..0a187196aeed 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1417,11 +1417,11 @@ static void prepare_write_keepalive(struct ceph_connection *con)
1417 dout("prepare_write_keepalive %p\n", con); 1417 dout("prepare_write_keepalive %p\n", con);
1418 con_out_kvec_reset(con); 1418 con_out_kvec_reset(con);
1419 if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { 1419 if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
1420 struct timespec now; 1420 struct timespec64 now;
1421 1421
1422 ktime_get_real_ts(&now); 1422 ktime_get_real_ts64(&now);
1423 con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); 1423 con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
1424 ceph_encode_timespec(&con->out_temp_keepalive2, &now); 1424 ceph_encode_timespec64(&con->out_temp_keepalive2, &now);
1425 con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), 1425 con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
1426 &con->out_temp_keepalive2); 1426 &con->out_temp_keepalive2);
1427 } else { 1427 } else {
@@ -1434,24 +1434,26 @@ static void prepare_write_keepalive(struct ceph_connection *con)
1434 * Connection negotiation. 1434 * Connection negotiation.
1435 */ 1435 */
1436 1436
1437static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con, 1437static int get_connect_authorizer(struct ceph_connection *con)
1438 int *auth_proto)
1439{ 1438{
1440 struct ceph_auth_handshake *auth; 1439 struct ceph_auth_handshake *auth;
1440 int auth_proto;
1441 1441
1442 if (!con->ops->get_authorizer) { 1442 if (!con->ops->get_authorizer) {
1443 con->auth = NULL;
1443 con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; 1444 con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
1444 con->out_connect.authorizer_len = 0; 1445 con->out_connect.authorizer_len = 0;
1445 return NULL; 1446 return 0;
1446 } 1447 }
1447 1448
1448 auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); 1449 auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry);
1449 if (IS_ERR(auth)) 1450 if (IS_ERR(auth))
1450 return auth; 1451 return PTR_ERR(auth);
1451 1452
1452 con->auth_reply_buf = auth->authorizer_reply_buf; 1453 con->auth = auth;
1453 con->auth_reply_buf_len = auth->authorizer_reply_buf_len; 1454 con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
1454 return auth; 1455 con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len);
1456 return 0;
1455} 1457}
1456 1458
1457/* 1459/*
@@ -1467,12 +1469,22 @@ static void prepare_write_banner(struct ceph_connection *con)
1467 con_flag_set(con, CON_FLAG_WRITE_PENDING); 1469 con_flag_set(con, CON_FLAG_WRITE_PENDING);
1468} 1470}
1469 1471
1472static void __prepare_write_connect(struct ceph_connection *con)
1473{
1474 con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect);
1475 if (con->auth)
1476 con_out_kvec_add(con, con->auth->authorizer_buf_len,
1477 con->auth->authorizer_buf);
1478
1479 con->out_more = 0;
1480 con_flag_set(con, CON_FLAG_WRITE_PENDING);
1481}
1482
1470static int prepare_write_connect(struct ceph_connection *con) 1483static int prepare_write_connect(struct ceph_connection *con)
1471{ 1484{
1472 unsigned int global_seq = get_global_seq(con->msgr, 0); 1485 unsigned int global_seq = get_global_seq(con->msgr, 0);
1473 int proto; 1486 int proto;
1474 int auth_proto; 1487 int ret;
1475 struct ceph_auth_handshake *auth;
1476 1488
1477 switch (con->peer_name.type) { 1489 switch (con->peer_name.type) {
1478 case CEPH_ENTITY_TYPE_MON: 1490 case CEPH_ENTITY_TYPE_MON:
@@ -1499,24 +1511,11 @@ static int prepare_write_connect(struct ceph_connection *con)
1499 con->out_connect.protocol_version = cpu_to_le32(proto); 1511 con->out_connect.protocol_version = cpu_to_le32(proto);
1500 con->out_connect.flags = 0; 1512 con->out_connect.flags = 0;
1501 1513
1502 auth_proto = CEPH_AUTH_UNKNOWN; 1514 ret = get_connect_authorizer(con);
1503 auth = get_connect_authorizer(con, &auth_proto); 1515 if (ret)
1504 if (IS_ERR(auth)) 1516 return ret;
1505 return PTR_ERR(auth);
1506
1507 con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
1508 con->out_connect.authorizer_len = auth ?
1509 cpu_to_le32(auth->authorizer_buf_len) : 0;
1510
1511 con_out_kvec_add(con, sizeof (con->out_connect),
1512 &con->out_connect);
1513 if (auth && auth->authorizer_buf_len)
1514 con_out_kvec_add(con, auth->authorizer_buf_len,
1515 auth->authorizer_buf);
1516
1517 con->out_more = 0;
1518 con_flag_set(con, CON_FLAG_WRITE_PENDING);
1519 1517
1518 __prepare_write_connect(con);
1520 return 0; 1519 return 0;
1521} 1520}
1522 1521
@@ -1781,11 +1780,21 @@ static int read_partial_connect(struct ceph_connection *con)
1781 if (ret <= 0) 1780 if (ret <= 0)
1782 goto out; 1781 goto out;
1783 1782
1784 size = le32_to_cpu(con->in_reply.authorizer_len); 1783 if (con->auth) {
1785 end += size; 1784 size = le32_to_cpu(con->in_reply.authorizer_len);
1786 ret = read_partial(con, end, size, con->auth_reply_buf); 1785 if (size > con->auth->authorizer_reply_buf_len) {
1787 if (ret <= 0) 1786 pr_err("authorizer reply too big: %d > %zu\n", size,
1788 goto out; 1787 con->auth->authorizer_reply_buf_len);
1788 ret = -EINVAL;
1789 goto out;
1790 }
1791
1792 end += size;
1793 ret = read_partial(con, end, size,
1794 con->auth->authorizer_reply_buf);
1795 if (ret <= 0)
1796 goto out;
1797 }
1789 1798
1790 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", 1799 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
1791 con, (int)con->in_reply.tag, 1800 con, (int)con->in_reply.tag,
@@ -1793,7 +1802,6 @@ static int read_partial_connect(struct ceph_connection *con)
1793 le32_to_cpu(con->in_reply.global_seq)); 1802 le32_to_cpu(con->in_reply.global_seq));
1794out: 1803out:
1795 return ret; 1804 return ret;
1796
1797} 1805}
1798 1806
1799/* 1807/*
@@ -2076,12 +2084,27 @@ static int process_connect(struct ceph_connection *con)
2076 2084
2077 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 2085 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
2078 2086
2079 if (con->auth_reply_buf) { 2087 if (con->auth) {
2080 /* 2088 /*
2081 * Any connection that defines ->get_authorizer() 2089 * Any connection that defines ->get_authorizer()
2082 * should also define ->verify_authorizer_reply(). 2090 * should also define ->add_authorizer_challenge() and
2091 * ->verify_authorizer_reply().
2092 *
2083 * See get_connect_authorizer(). 2093 * See get_connect_authorizer().
2084 */ 2094 */
2095 if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
2096 ret = con->ops->add_authorizer_challenge(
2097 con, con->auth->authorizer_reply_buf,
2098 le32_to_cpu(con->in_reply.authorizer_len));
2099 if (ret < 0)
2100 return ret;
2101
2102 con_out_kvec_reset(con);
2103 __prepare_write_connect(con);
2104 prepare_read_connect(con);
2105 return 0;
2106 }
2107
2085 ret = con->ops->verify_authorizer_reply(con); 2108 ret = con->ops->verify_authorizer_reply(con);
2086 if (ret < 0) { 2109 if (ret < 0) {
2087 con->error_msg = "bad authorize reply"; 2110 con->error_msg = "bad authorize reply";
@@ -2555,7 +2578,7 @@ static int read_keepalive_ack(struct ceph_connection *con)
2555 int ret = read_partial(con, size, size, &ceph_ts); 2578 int ret = read_partial(con, size, size, &ceph_ts);
2556 if (ret <= 0) 2579 if (ret <= 0)
2557 return ret; 2580 return ret;
2558 ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts); 2581 ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
2559 prepare_read_tag(con); 2582 prepare_read_tag(con);
2560 return 1; 2583 return 1;
2561} 2584}
@@ -3223,12 +3246,12 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con,
3223{ 3246{
3224 if (interval > 0 && 3247 if (interval > 0 &&
3225 (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { 3248 (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
3226 struct timespec now; 3249 struct timespec64 now;
3227 struct timespec ts; 3250 struct timespec64 ts;
3228 ktime_get_real_ts(&now); 3251 ktime_get_real_ts64(&now);
3229 jiffies_to_timespec(interval, &ts); 3252 jiffies_to_timespec64(interval, &ts);
3230 ts = timespec_add(con->last_keepalive_ack, ts); 3253 ts = timespec64_add(con->last_keepalive_ack, ts);
3231 return timespec_compare(&now, &ts) >= 0; 3254 return timespec64_compare(&now, &ts) >= 0;
3232 } 3255 }
3233 return false; 3256 return false;
3234} 3257}
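
The keepalive-expiry check above now does all arithmetic in timespec64. A stand-alone restatement of the test in ceph_con_keepalive_expired(), for illustration only ('last_ack' is the last KEEPALIVE2 ack time, 'interval' is in jiffies):

static bool demo_keepalive_expired(const struct timespec64 *last_ack,
                                   unsigned long interval)
{
        struct timespec64 now, ts;

        ktime_get_real_ts64(&now);
        jiffies_to_timespec64(interval, &ts);
        ts = timespec64_add(*last_ack, ts);
        return timespec64_compare(&now, &ts) >= 0;
}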
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index d7a7a2330ef7..18deb3d889c4 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1249,7 +1249,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1249 if (monc->client->extra_mon_dispatch && 1249 if (monc->client->extra_mon_dispatch &&
1250 monc->client->extra_mon_dispatch(monc->client, msg) == 0) 1250 monc->client->extra_mon_dispatch(monc->client, msg) == 0)
1251 break; 1251 break;
1252 1252
1253 pr_err("received unknown message type %d %s\n", type, 1253 pr_err("received unknown message type %d %s\n", type,
1254 ceph_msg_type_name(type)); 1254 ceph_msg_type_name(type));
1255 } 1255 }
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index a00c74f1154e..60934bd8796c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1978,7 +1978,7 @@ static void encode_request_partial(struct ceph_osd_request *req,
1978 p += sizeof(struct ceph_blkin_trace_info); 1978 p += sizeof(struct ceph_blkin_trace_info);
1979 1979
1980 ceph_encode_32(&p, 0); /* client_inc, always 0 */ 1980 ceph_encode_32(&p, 0); /* client_inc, always 0 */
1981 ceph_encode_timespec(p, &req->r_mtime); 1981 ceph_encode_timespec64(p, &req->r_mtime);
1982 p += sizeof(struct ceph_timespec); 1982 p += sizeof(struct ceph_timespec);
1983 1983
1984 encode_oloc(&p, end, &req->r_t.target_oloc); 1984 encode_oloc(&p, end, &req->r_t.target_oloc);
@@ -4512,7 +4512,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
4512 ceph_oid_copy(&lreq->t.base_oid, oid); 4512 ceph_oid_copy(&lreq->t.base_oid, oid);
4513 ceph_oloc_copy(&lreq->t.base_oloc, oloc); 4513 ceph_oloc_copy(&lreq->t.base_oloc, oloc);
4514 lreq->t.flags = CEPH_OSD_FLAG_WRITE; 4514 lreq->t.flags = CEPH_OSD_FLAG_WRITE;
4515 ktime_get_real_ts(&lreq->mtime); 4515 ktime_get_real_ts64(&lreq->mtime);
4516 4516
4517 lreq->reg_req = alloc_linger_request(lreq); 4517 lreq->reg_req = alloc_linger_request(lreq);
4518 if (!lreq->reg_req) { 4518 if (!lreq->reg_req) {
@@ -4570,7 +4570,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
4570 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); 4570 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
4571 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); 4571 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
4572 req->r_flags = CEPH_OSD_FLAG_WRITE; 4572 req->r_flags = CEPH_OSD_FLAG_WRITE;
4573 ktime_get_real_ts(&req->r_mtime); 4573 ktime_get_real_ts64(&req->r_mtime);
4574 osd_req_op_watch_init(req, 0, lreq->linger_id, 4574 osd_req_op_watch_init(req, 0, lreq->linger_id,
4575 CEPH_OSD_WATCH_OP_UNWATCH); 4575 CEPH_OSD_WATCH_OP_UNWATCH);
4576 4576
@@ -4591,7 +4591,7 @@ EXPORT_SYMBOL(ceph_osdc_unwatch);
4591 4591
4592static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, 4592static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
4593 u64 notify_id, u64 cookie, void *payload, 4593 u64 notify_id, u64 cookie, void *payload,
4594 size_t payload_len) 4594 u32 payload_len)
4595{ 4595{
4596 struct ceph_osd_req_op *op; 4596 struct ceph_osd_req_op *op;
4597 struct ceph_pagelist *pl; 4597 struct ceph_pagelist *pl;
@@ -4628,7 +4628,7 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
4628 u64 notify_id, 4628 u64 notify_id,
4629 u64 cookie, 4629 u64 cookie,
4630 void *payload, 4630 void *payload,
4631 size_t payload_len) 4631 u32 payload_len)
4632{ 4632{
4633 struct ceph_osd_request *req; 4633 struct ceph_osd_request *req;
4634 int ret; 4634 int ret;
@@ -4661,7 +4661,7 @@ EXPORT_SYMBOL(ceph_osdc_notify_ack);
4661 4661
4662static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, 4662static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
4663 u64 cookie, u32 prot_ver, u32 timeout, 4663 u64 cookie, u32 prot_ver, u32 timeout,
4664 void *payload, size_t payload_len) 4664 void *payload, u32 payload_len)
4665{ 4665{
4666 struct ceph_osd_req_op *op; 4666 struct ceph_osd_req_op *op;
4667 struct ceph_pagelist *pl; 4667 struct ceph_pagelist *pl;
@@ -4701,7 +4701,7 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
4701 struct ceph_object_id *oid, 4701 struct ceph_object_id *oid,
4702 struct ceph_object_locator *oloc, 4702 struct ceph_object_locator *oloc,
4703 void *payload, 4703 void *payload,
4704 size_t payload_len, 4704 u32 payload_len,
4705 u32 timeout, 4705 u32 timeout,
4706 struct page ***preply_pages, 4706 struct page ***preply_pages,
4707 size_t *preply_len) 4707 size_t *preply_len)
@@ -5136,7 +5136,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
5136 struct ceph_snap_context *snapc, 5136 struct ceph_snap_context *snapc,
5137 u64 off, u64 len, 5137 u64 off, u64 len,
5138 u32 truncate_seq, u64 truncate_size, 5138 u32 truncate_seq, u64 truncate_size,
5139 struct timespec *mtime, 5139 struct timespec64 *mtime,
5140 struct page **pages, int num_pages) 5140 struct page **pages, int num_pages)
5141{ 5141{
5142 struct ceph_osd_request *req; 5142 struct ceph_osd_request *req;
@@ -5393,6 +5393,16 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
5393 return auth; 5393 return auth;
5394} 5394}
5395 5395
5396static int add_authorizer_challenge(struct ceph_connection *con,
5397 void *challenge_buf, int challenge_buf_len)
5398{
5399 struct ceph_osd *o = con->private;
5400 struct ceph_osd_client *osdc = o->o_osdc;
5401 struct ceph_auth_client *ac = osdc->client->monc.auth;
5402
5403 return ceph_auth_add_authorizer_challenge(ac, o->o_auth.authorizer,
5404 challenge_buf, challenge_buf_len);
5405}
5396 5406
5397static int verify_authorizer_reply(struct ceph_connection *con) 5407static int verify_authorizer_reply(struct ceph_connection *con)
5398{ 5408{
@@ -5442,6 +5452,7 @@ static const struct ceph_connection_operations osd_con_ops = {
5442 .put = put_osd_con, 5452 .put = put_osd_con,
5443 .dispatch = dispatch, 5453 .dispatch = dispatch,
5444 .get_authorizer = get_authorizer, 5454 .get_authorizer = get_authorizer,
5455 .add_authorizer_challenge = add_authorizer_challenge,
5445 .verify_authorizer_reply = verify_authorizer_reply, 5456 .verify_authorizer_reply = verify_authorizer_reply,
5446 .invalidate_authorizer = invalidate_authorizer, 5457 .invalidate_authorizer = invalidate_authorizer,
5447 .alloc_msg = alloc_msg, 5458 .alloc_msg = alloc_msg,
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index e560d3975f41..d3736f5bffec 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -197,4 +197,3 @@ void ceph_zero_page_vector_range(int off, int len, struct page **pages)
197 } 197 }
198} 198}
199EXPORT_SYMBOL(ceph_zero_page_vector_range); 199EXPORT_SYMBOL(ceph_zero_page_vector_range);
200
diff --git a/net/compat.c b/net/compat.c
index 7242cce5631b..3b2105f6549d 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -466,8 +466,7 @@ int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
466 466
467 ctv = (struct compat_timeval __user *) userstamp; 467 ctv = (struct compat_timeval __user *) userstamp;
468 err = -ENOENT; 468 err = -ENOENT;
469 if (!sock_flag(sk, SOCK_TIMESTAMP)) 469 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
470 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
471 tv = ktime_to_timeval(sk->sk_stamp); 470 tv = ktime_to_timeval(sk->sk_stamp);
472 if (tv.tv_sec == -1) 471 if (tv.tv_sec == -1)
473 return err; 472 return err;
@@ -494,8 +493,7 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta
494 493
495 ctv = (struct compat_timespec __user *) userstamp; 494 ctv = (struct compat_timespec __user *) userstamp;
496 err = -ENOENT; 495 err = -ENOENT;
497 if (!sock_flag(sk, SOCK_TIMESTAMP)) 496 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
498 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
499 ts = ktime_to_timespec(sk->sk_stamp); 497 ts = ktime_to_timespec(sk->sk_stamp);
500 if (ts.tv_sec == -1) 498 if (ts.tv_sec == -1)
501 return err; 499 return err;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f19bf3dc2bd6..9aac0d63d53e 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -819,8 +819,9 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
819 819
820/** 820/**
821 * datagram_poll - generic datagram poll 821 * datagram_poll - generic datagram poll
822 * @file: file struct
822 * @sock: socket 823 * @sock: socket
823 * @events to wait for 824 * @wait: poll table
824 * 825 *
825 * Datagram poll: Again totally generic. This also handles 826 * Datagram poll: Again totally generic. This also handles
826 * sequenced packet sockets providing the socket receive queue 827 * sequenced packet sockets providing the socket receive queue
@@ -830,10 +831,14 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
830 * and you use a different write policy from sock_writeable() 831 * and you use a different write policy from sock_writeable()
831 * then please supply your own write_space callback. 832 * then please supply your own write_space callback.
832 */ 833 */
833__poll_t datagram_poll_mask(struct socket *sock, __poll_t events) 834__poll_t datagram_poll(struct file *file, struct socket *sock,
835 poll_table *wait)
834{ 836{
835 struct sock *sk = sock->sk; 837 struct sock *sk = sock->sk;
836 __poll_t mask = 0; 838 __poll_t mask;
839
840 sock_poll_wait(file, wait);
841 mask = 0;
837 842
838 /* exceptional events? */ 843 /* exceptional events? */
839 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) 844 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
@@ -866,4 +871,4 @@ __poll_t datagram_poll_mask(struct socket *sock, __poll_t events)
866 871
867 return mask; 872 return mask;
868} 873}
869EXPORT_SYMBOL(datagram_poll_mask); 874EXPORT_SYMBOL(datagram_poll);
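
With the poll_mask() experiment reverted above, datagram protocols go back to plugging datagram_poll() directly into their proto_ops. A made-up fragment showing the hookup (most members elided; not taken from any particular protocol):

static const struct proto_ops demo_dgram_ops = {
        .owner = THIS_MODULE,
        .poll  = datagram_poll,   /* file/poll_table based again */
        /* .release, .sendmsg, .recvmsg, ... elided */
};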
diff --git a/net/core/dev.c b/net/core/dev.c
index a5aa1c7444e6..82114e1111e6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -93,7 +93,6 @@
93#include <linux/netdevice.h> 93#include <linux/netdevice.h>
94#include <linux/etherdevice.h> 94#include <linux/etherdevice.h>
95#include <linux/ethtool.h> 95#include <linux/ethtool.h>
96#include <linux/notifier.h>
97#include <linux/skbuff.h> 96#include <linux/skbuff.h>
98#include <linux/bpf.h> 97#include <linux/bpf.h>
99#include <linux/bpf_trace.h> 98#include <linux/bpf_trace.h>
@@ -149,7 +148,6 @@
149 148
150#include "net-sysfs.h" 149#include "net-sysfs.h"
151 150
152/* Instead of increasing this, you should create a hash table. */
153#define MAX_GRO_SKBS 8 151#define MAX_GRO_SKBS 8
154 152
155/* This should be increased if a protocol with a bigger head is added. */ 153/* This should be increased if a protocol with a bigger head is added. */
@@ -2068,11 +2066,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2068 struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; 2066 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2069 int i; 2067 int i;
2070 2068
2069 /* walk through the TCs and see if it falls into any of them */
2071 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { 2070 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2072 if ((txq - tc->offset) < tc->count) 2071 if ((txq - tc->offset) < tc->count)
2073 return i; 2072 return i;
2074 } 2073 }
2075 2074
2075 /* didn't find it, just return -1 to indicate no match */
2076 return -1; 2076 return -1;
2077 } 2077 }
2078 2078
@@ -2081,6 +2081,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2081EXPORT_SYMBOL(netdev_txq_to_tc); 2081EXPORT_SYMBOL(netdev_txq_to_tc);
2082 2082
2083#ifdef CONFIG_XPS 2083#ifdef CONFIG_XPS
2084struct static_key xps_needed __read_mostly;
2085EXPORT_SYMBOL(xps_needed);
2086struct static_key xps_rxqs_needed __read_mostly;
2087EXPORT_SYMBOL(xps_rxqs_needed);
2084static DEFINE_MUTEX(xps_map_mutex); 2088static DEFINE_MUTEX(xps_map_mutex);
2085#define xmap_dereference(P) \ 2089#define xmap_dereference(P) \
2086 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) 2090 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
@@ -2092,7 +2096,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2092 int pos; 2096 int pos;
2093 2097
2094 if (dev_maps) 2098 if (dev_maps)
2095 map = xmap_dereference(dev_maps->cpu_map[tci]); 2099 map = xmap_dereference(dev_maps->attr_map[tci]);
2096 if (!map) 2100 if (!map)
2097 return false; 2101 return false;
2098 2102
@@ -2105,7 +2109,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2105 break; 2109 break;
2106 } 2110 }
2107 2111
2108 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL); 2112 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2109 kfree_rcu(map, rcu); 2113 kfree_rcu(map, rcu);
2110 return false; 2114 return false;
2111 } 2115 }
@@ -2135,34 +2139,71 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
2135 return active; 2139 return active;
2136} 2140}
2137 2141
2142static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2143 struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2144 u16 offset, u16 count, bool is_rxqs_map)
2145{
2146 bool active = false;
2147 int i, j;
2148
2149 for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2150 j < nr_ids;)
2151 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2152 count);
2153 if (!active) {
2154 if (is_rxqs_map) {
2155 RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2156 } else {
2157 RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2158
2159 for (i = offset + (count - 1); count--; i--)
2160 netdev_queue_numa_node_write(
2161 netdev_get_tx_queue(dev, i),
2162 NUMA_NO_NODE);
2163 }
2164 kfree_rcu(dev_maps, rcu);
2165 }
2166}
2167
2138static void netif_reset_xps_queues(struct net_device *dev, u16 offset, 2168static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2139 u16 count) 2169 u16 count)
2140{ 2170{
2171 const unsigned long *possible_mask = NULL;
2141 struct xps_dev_maps *dev_maps; 2172 struct xps_dev_maps *dev_maps;
2142 int cpu, i; 2173 unsigned int nr_ids;
2143 bool active = false; 2174
2175 if (!static_key_false(&xps_needed))
2176 return;
2144 2177
2178 cpus_read_lock();
2145 mutex_lock(&xps_map_mutex); 2179 mutex_lock(&xps_map_mutex);
2146 dev_maps = xmap_dereference(dev->xps_maps);
2147 2180
2181 if (static_key_false(&xps_rxqs_needed)) {
2182 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2183 if (dev_maps) {
2184 nr_ids = dev->num_rx_queues;
2185 clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2186 offset, count, true);
2187 }
2188 }
2189
2190 dev_maps = xmap_dereference(dev->xps_cpus_map);
2148 if (!dev_maps) 2191 if (!dev_maps)
2149 goto out_no_maps; 2192 goto out_no_maps;
2150 2193
2151 for_each_possible_cpu(cpu) 2194 if (num_possible_cpus() > 1)
2152 active |= remove_xps_queue_cpu(dev, dev_maps, cpu, 2195 possible_mask = cpumask_bits(cpu_possible_mask);
2153 offset, count); 2196 nr_ids = nr_cpu_ids;
2154 2197 clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2155 if (!active) { 2198 false);
2156 RCU_INIT_POINTER(dev->xps_maps, NULL);
2157 kfree_rcu(dev_maps, rcu);
2158 }
2159
2160 for (i = offset + (count - 1); count--; i--)
2161 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2162 NUMA_NO_NODE);
2163 2199
2164out_no_maps: 2200out_no_maps:
2201 if (static_key_enabled(&xps_rxqs_needed))
2202 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2203
2204 static_key_slow_dec_cpuslocked(&xps_needed);
2165 mutex_unlock(&xps_map_mutex); 2205 mutex_unlock(&xps_map_mutex);
2206 cpus_read_unlock();
2166} 2207}
2167 2208
2168static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) 2209static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
@@ -2170,8 +2211,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2170 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); 2211 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2171} 2212}
2172 2213
2173static struct xps_map *expand_xps_map(struct xps_map *map, 2214static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2174 int cpu, u16 index) 2215 u16 index, bool is_rxqs_map)
2175{ 2216{
2176 struct xps_map *new_map; 2217 struct xps_map *new_map;
2177 int alloc_len = XPS_MIN_MAP_ALLOC; 2218 int alloc_len = XPS_MIN_MAP_ALLOC;
@@ -2183,7 +2224,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
2183 return map; 2224 return map;
2184 } 2225 }
2185 2226
2186 /* Need to add queue to this CPU's existing map */ 2227 /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2187 if (map) { 2228 if (map) {
2188 if (pos < map->alloc_len) 2229 if (pos < map->alloc_len)
2189 return map; 2230 return map;
@@ -2191,9 +2232,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
2191 alloc_len = map->alloc_len * 2; 2232 alloc_len = map->alloc_len * 2;
2192 } 2233 }
2193 2234
2194 /* Need to allocate new map to store queue on this CPU's map */ 2235 /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2195 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, 2236 * map
2196 cpu_to_node(cpu)); 2237 */
2238 if (is_rxqs_map)
2239 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2240 else
2241 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2242 cpu_to_node(attr_index));
2197 if (!new_map) 2243 if (!new_map)
2198 return NULL; 2244 return NULL;
2199 2245
@@ -2205,32 +2251,53 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
2205 return new_map; 2251 return new_map;
2206} 2252}
2207 2253
2208int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, 2254/* Must be called under cpus_read_lock */
2209 u16 index) 2255int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2256 u16 index, bool is_rxqs_map)
2210{ 2257{
2258 const unsigned long *online_mask = NULL, *possible_mask = NULL;
2211 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; 2259 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2212 int i, cpu, tci, numa_node_id = -2; 2260 int i, j, tci, numa_node_id = -2;
2213 int maps_sz, num_tc = 1, tc = 0; 2261 int maps_sz, num_tc = 1, tc = 0;
2214 struct xps_map *map, *new_map; 2262 struct xps_map *map, *new_map;
2215 bool active = false; 2263 bool active = false;
2264 unsigned int nr_ids;
2216 2265
2217 if (dev->num_tc) { 2266 if (dev->num_tc) {
2267 /* Do not allow XPS on subordinate device directly */
2218 num_tc = dev->num_tc; 2268 num_tc = dev->num_tc;
2269 if (num_tc < 0)
2270 return -EINVAL;
2271
2272 /* If queue belongs to subordinate dev use its map */
2273 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2274
2219 tc = netdev_txq_to_tc(dev, index); 2275 tc = netdev_txq_to_tc(dev, index);
2220 if (tc < 0) 2276 if (tc < 0)
2221 return -EINVAL; 2277 return -EINVAL;
2222 } 2278 }
2223 2279
2224 maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2225 if (maps_sz < L1_CACHE_BYTES)
2226 maps_sz = L1_CACHE_BYTES;
2227
2228 mutex_lock(&xps_map_mutex); 2280 mutex_lock(&xps_map_mutex);
2281 if (is_rxqs_map) {
2282 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2283 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2284 nr_ids = dev->num_rx_queues;
2285 } else {
2286 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2287 if (num_possible_cpus() > 1) {
2288 online_mask = cpumask_bits(cpu_online_mask);
2289 possible_mask = cpumask_bits(cpu_possible_mask);
2290 }
2291 dev_maps = xmap_dereference(dev->xps_cpus_map);
2292 nr_ids = nr_cpu_ids;
2293 }
2229 2294
2230 dev_maps = xmap_dereference(dev->xps_maps); 2295 if (maps_sz < L1_CACHE_BYTES)
2296 maps_sz = L1_CACHE_BYTES;
2231 2297
2232 /* allocate memory for queue storage */ 2298 /* allocate memory for queue storage */
2233 for_each_cpu_and(cpu, cpu_online_mask, mask) { 2299 for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2300 j < nr_ids;) {
2234 if (!new_dev_maps) 2301 if (!new_dev_maps)
2235 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); 2302 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2236 if (!new_dev_maps) { 2303 if (!new_dev_maps) {
@@ -2238,73 +2305,85 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2238 return -ENOMEM; 2305 return -ENOMEM;
2239 } 2306 }
2240 2307
2241 tci = cpu * num_tc + tc; 2308 tci = j * num_tc + tc;
2242 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) : 2309 map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2243 NULL; 2310 NULL;
2244 2311
2245 map = expand_xps_map(map, cpu, index); 2312 map = expand_xps_map(map, j, index, is_rxqs_map);
2246 if (!map) 2313 if (!map)
2247 goto error; 2314 goto error;
2248 2315
2249 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2316 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2250 } 2317 }
2251 2318
2252 if (!new_dev_maps) 2319 if (!new_dev_maps)
2253 goto out_no_new_maps; 2320 goto out_no_new_maps;
2254 2321
2255 for_each_possible_cpu(cpu) { 2322 static_key_slow_inc_cpuslocked(&xps_needed);
2323 if (is_rxqs_map)
2324 static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2325
2326 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2327 j < nr_ids;) {
2256 /* copy maps belonging to foreign traffic classes */ 2328 /* copy maps belonging to foreign traffic classes */
2257 for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) { 2329 for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2258 /* fill in the new device map from the old device map */ 2330 /* fill in the new device map from the old device map */
2259 map = xmap_dereference(dev_maps->cpu_map[tci]); 2331 map = xmap_dereference(dev_maps->attr_map[tci]);
2260 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2332 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2261 } 2333 }
2262 2334
2263 /* We need to explicitly update tci as prevous loop 2335 /* We need to explicitly update tci as prevous loop
2264 * could break out early if dev_maps is NULL. 2336 * could break out early if dev_maps is NULL.
2265 */ 2337 */
2266 tci = cpu * num_tc + tc; 2338 tci = j * num_tc + tc;
2267 2339
2268 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { 2340 if (netif_attr_test_mask(j, mask, nr_ids) &&
2269 /* add queue to CPU maps */ 2341 netif_attr_test_online(j, online_mask, nr_ids)) {
2342 /* add tx-queue to CPU/rx-queue maps */
2270 int pos = 0; 2343 int pos = 0;
2271 2344
2272 map = xmap_dereference(new_dev_maps->cpu_map[tci]); 2345 map = xmap_dereference(new_dev_maps->attr_map[tci]);
2273 while ((pos < map->len) && (map->queues[pos] != index)) 2346 while ((pos < map->len) && (map->queues[pos] != index))
2274 pos++; 2347 pos++;
2275 2348
2276 if (pos == map->len) 2349 if (pos == map->len)
2277 map->queues[map->len++] = index; 2350 map->queues[map->len++] = index;
2278#ifdef CONFIG_NUMA 2351#ifdef CONFIG_NUMA
2279 if (numa_node_id == -2) 2352 if (!is_rxqs_map) {
2280 numa_node_id = cpu_to_node(cpu); 2353 if (numa_node_id == -2)
2281 else if (numa_node_id != cpu_to_node(cpu)) 2354 numa_node_id = cpu_to_node(j);
2282 numa_node_id = -1; 2355 else if (numa_node_id != cpu_to_node(j))
2356 numa_node_id = -1;
2357 }
2283#endif 2358#endif
2284 } else if (dev_maps) { 2359 } else if (dev_maps) {
2285 /* fill in the new device map from the old device map */ 2360 /* fill in the new device map from the old device map */
2286 map = xmap_dereference(dev_maps->cpu_map[tci]); 2361 map = xmap_dereference(dev_maps->attr_map[tci]);
2287 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2362 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2288 } 2363 }
2289 2364
2290 /* copy maps belonging to foreign traffic classes */ 2365 /* copy maps belonging to foreign traffic classes */
2291 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { 2366 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2292 /* fill in the new device map from the old device map */ 2367 /* fill in the new device map from the old device map */
2293 map = xmap_dereference(dev_maps->cpu_map[tci]); 2368 map = xmap_dereference(dev_maps->attr_map[tci]);
2294 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2369 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2295 } 2370 }
2296 } 2371 }
2297 2372
2298 rcu_assign_pointer(dev->xps_maps, new_dev_maps); 2373 if (is_rxqs_map)
2374 rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2375 else
2376 rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2299 2377
2300 /* Cleanup old maps */ 2378 /* Cleanup old maps */
2301 if (!dev_maps) 2379 if (!dev_maps)
2302 goto out_no_old_maps; 2380 goto out_no_old_maps;
2303 2381
2304 for_each_possible_cpu(cpu) { 2382 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2305 for (i = num_tc, tci = cpu * num_tc; i--; tci++) { 2383 j < nr_ids;) {
2306 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); 2384 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2307 map = xmap_dereference(dev_maps->cpu_map[tci]); 2385 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2386 map = xmap_dereference(dev_maps->attr_map[tci]);
2308 if (map && map != new_map) 2387 if (map && map != new_map)
2309 kfree_rcu(map, rcu); 2388 kfree_rcu(map, rcu);
2310 } 2389 }
@@ -2317,19 +2396,23 @@ out_no_old_maps:
2317 active = true; 2396 active = true;
2318 2397
2319out_no_new_maps: 2398out_no_new_maps:
2320 /* update Tx queue numa node */ 2399 if (!is_rxqs_map) {
2321 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), 2400 /* update Tx queue numa node */
2322 (numa_node_id >= 0) ? numa_node_id : 2401 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2323 NUMA_NO_NODE); 2402 (numa_node_id >= 0) ?
2403 numa_node_id : NUMA_NO_NODE);
2404 }
2324 2405
2325 if (!dev_maps) 2406 if (!dev_maps)
2326 goto out_no_maps; 2407 goto out_no_maps;
2327 2408
2328 /* removes queue from unused CPUs */ 2409 /* removes tx-queue from unused CPUs/rx-queues */
2329 for_each_possible_cpu(cpu) { 2410 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2330 for (i = tc, tci = cpu * num_tc; i--; tci++) 2411 j < nr_ids;) {
2412 for (i = tc, tci = j * num_tc; i--; tci++)
2331 active |= remove_xps_queue(dev_maps, tci, index); 2413 active |= remove_xps_queue(dev_maps, tci, index);
2332 if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu)) 2414 if (!netif_attr_test_mask(j, mask, nr_ids) ||
2415 !netif_attr_test_online(j, online_mask, nr_ids))
2333 active |= remove_xps_queue(dev_maps, tci, index); 2416 active |= remove_xps_queue(dev_maps, tci, index);
2334 for (i = num_tc - tc, tci++; --i; tci++) 2417 for (i = num_tc - tc, tci++; --i; tci++)
2335 active |= remove_xps_queue(dev_maps, tci, index); 2418 active |= remove_xps_queue(dev_maps, tci, index);
@@ -2337,7 +2420,10 @@ out_no_new_maps:
2337 2420
2338 /* free map if not active */ 2421 /* free map if not active */
2339 if (!active) { 2422 if (!active) {
2340 RCU_INIT_POINTER(dev->xps_maps, NULL); 2423 if (is_rxqs_map)
2424 RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2425 else
2426 RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2341 kfree_rcu(dev_maps, rcu); 2427 kfree_rcu(dev_maps, rcu);
2342 } 2428 }
2343 2429
@@ -2347,11 +2433,12 @@ out_no_maps:
2347 return 0; 2433 return 0;
2348error: 2434error:
2349 /* remove any maps that we added */ 2435 /* remove any maps that we added */
2350 for_each_possible_cpu(cpu) { 2436 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2351 for (i = num_tc, tci = cpu * num_tc; i--; tci++) { 2437 j < nr_ids;) {
2352 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); 2438 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2439 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2353 map = dev_maps ? 2440 map = dev_maps ?
2354 xmap_dereference(dev_maps->cpu_map[tci]) : 2441 xmap_dereference(dev_maps->attr_map[tci]) :
2355 NULL; 2442 NULL;
2356 if (new_map && new_map != map) 2443 if (new_map && new_map != map)
2357 kfree(new_map); 2444 kfree(new_map);
@@ -2363,14 +2450,41 @@ error:
2363 kfree(new_dev_maps); 2450 kfree(new_dev_maps);
2364 return -ENOMEM; 2451 return -ENOMEM;
2365} 2452}
2453EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2454
2455int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2456 u16 index)
2457{
2458 int ret;
2459
2460 cpus_read_lock();
2461 ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2462 cpus_read_unlock();
2463
2464 return ret;
2465}
2366EXPORT_SYMBOL(netif_set_xps_queue); 2466EXPORT_SYMBOL(netif_set_xps_queue);
2367 2467
2368#endif 2468#endif
2469static void netdev_unbind_all_sb_channels(struct net_device *dev)
2470{
2471 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2472
2473 /* Unbind any subordinate channels */
2474 while (txq-- != &dev->_tx[0]) {
2475 if (txq->sb_dev)
2476 netdev_unbind_sb_channel(dev, txq->sb_dev);
2477 }
2478}
2479
2369void netdev_reset_tc(struct net_device *dev) 2480void netdev_reset_tc(struct net_device *dev)
2370{ 2481{
2371#ifdef CONFIG_XPS 2482#ifdef CONFIG_XPS
2372 netif_reset_xps_queues_gt(dev, 0); 2483 netif_reset_xps_queues_gt(dev, 0);
2373#endif 2484#endif
2485 netdev_unbind_all_sb_channels(dev);
2486
2487 /* Reset TC configuration of device */
2374 dev->num_tc = 0; 2488 dev->num_tc = 0;
2375 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); 2489 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2376 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); 2490 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
@@ -2399,11 +2513,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2399#ifdef CONFIG_XPS 2513#ifdef CONFIG_XPS
2400 netif_reset_xps_queues_gt(dev, 0); 2514 netif_reset_xps_queues_gt(dev, 0);
2401#endif 2515#endif
2516 netdev_unbind_all_sb_channels(dev);
2517
2402 dev->num_tc = num_tc; 2518 dev->num_tc = num_tc;
2403 return 0; 2519 return 0;
2404} 2520}
2405EXPORT_SYMBOL(netdev_set_num_tc); 2521EXPORT_SYMBOL(netdev_set_num_tc);
2406 2522
2523void netdev_unbind_sb_channel(struct net_device *dev,
2524 struct net_device *sb_dev)
2525{
2526 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2527
2528#ifdef CONFIG_XPS
2529 netif_reset_xps_queues_gt(sb_dev, 0);
2530#endif
2531 memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2532 memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2533
2534 while (txq-- != &dev->_tx[0]) {
2535 if (txq->sb_dev == sb_dev)
2536 txq->sb_dev = NULL;
2537 }
2538}
2539EXPORT_SYMBOL(netdev_unbind_sb_channel);
2540
2541int netdev_bind_sb_channel_queue(struct net_device *dev,
2542 struct net_device *sb_dev,
2543 u8 tc, u16 count, u16 offset)
2544{
2545 /* Make certain the sb_dev and dev are already configured */
2546 if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2547 return -EINVAL;
2548
2549 /* We cannot hand out queues we don't have */
2550 if ((offset + count) > dev->real_num_tx_queues)
2551 return -EINVAL;
2552
2553 /* Record the mapping */
2554 sb_dev->tc_to_txq[tc].count = count;
2555 sb_dev->tc_to_txq[tc].offset = offset;
2556
2557 /* Provide a way for Tx queue to find the tc_to_txq map or
2558 * XPS map for itself.
2559 */
2560 while (count--)
2561 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2562
2563 return 0;
2564}
2565EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2566
2567int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2568{
2569 /* Do not use a multiqueue device to represent a subordinate channel */
2570 if (netif_is_multiqueue(dev))
2571 return -ENODEV;
2572
2573 /* We allow channels 1 - 32767 to be used for subordinate channels.
2574 * Channel 0 is meant to be "native" mode and used only to represent
2575 * the main root device. We allow writing 0 to reset the device back
2576 * to normal mode after being used as a subordinate channel.
2577 */
2578 if (channel > S16_MAX)
2579 return -EINVAL;
2580
2581 dev->num_tc = -channel;
2582
2583 return 0;
2584}
2585EXPORT_SYMBOL(netdev_set_sb_channel);
2586
2407/* 2587/*
2408 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 2588 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2409 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. 2589 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
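
A sketch of the expected call sequence for the subordinate-channel helpers added above; 'lower' is the real multiqueue device, 'upper' the single-queue subordinate (e.g. a macvlan used for L2 forwarding offload). The caller is hypothetical and error handling minimal:

static int demo_bind_subordinate(struct net_device *lower,
                                 struct net_device *upper,
                                 u16 channel, u8 tc,
                                 u16 count, u16 offset)
{
        int err;

        /* mark 'upper' as subordinate channel 'channel' (1..32767);
         * it must be single-queue, see netdev_set_sb_channel() */
        err = netdev_set_sb_channel(upper, channel);
        if (err)
                return err;

        /* hand tx queues [offset, offset + count) of 'lower' to traffic
         * class 'tc' of the subordinate; 'lower' must already have its
         * traffic classes set up via netdev_set_num_tc() */
        return netdev_bind_sb_channel_queue(lower, upper, tc, count, offset);
}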
@@ -2615,24 +2795,26 @@ EXPORT_SYMBOL(netif_device_attach);
2615 * Returns a Tx hash based on the given packet descriptor a Tx queues' number 2795 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2616 * to be used as a distribution range. 2796 * to be used as a distribution range.
2617 */ 2797 */
2618static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb) 2798static u16 skb_tx_hash(const struct net_device *dev,
2799 const struct net_device *sb_dev,
2800 struct sk_buff *skb)
2619{ 2801{
2620 u32 hash; 2802 u32 hash;
2621 u16 qoffset = 0; 2803 u16 qoffset = 0;
2622 u16 qcount = dev->real_num_tx_queues; 2804 u16 qcount = dev->real_num_tx_queues;
2623 2805
2806 if (dev->num_tc) {
2807 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2808
2809 qoffset = sb_dev->tc_to_txq[tc].offset;
2810 qcount = sb_dev->tc_to_txq[tc].count;
2811 }
2812
2624 if (skb_rx_queue_recorded(skb)) { 2813 if (skb_rx_queue_recorded(skb)) {
2625 hash = skb_get_rx_queue(skb); 2814 hash = skb_get_rx_queue(skb);
2626 while (unlikely(hash >= qcount)) 2815 while (unlikely(hash >= qcount))
2627 hash -= qcount; 2816 hash -= qcount;
2628 return hash; 2817 return hash + qoffset;
2629 }
2630
2631 if (dev->num_tc) {
2632 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2633
2634 qoffset = dev->tc_to_txq[tc].offset;
2635 qcount = dev->tc_to_txq[tc].count;
2636 } 2818 }
2637 2819
2638 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; 2820 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
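
reciprocal_scale(hash, qcount) maps a 32-bit hash uniformly onto [0, qcount) with a multiply-and-shift instead of a modulo; adding the per-tc offset then gives the final queue, exactly as skb_tx_hash() does above. Stand-alone restatement for illustration:

static u16 demo_hash_to_txq(u32 hash, u16 qoffset, u16 qcount)
{
        return (u16)reciprocal_scale(hash, qcount) + qoffset;
}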
@@ -3376,32 +3558,64 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3376} 3558}
3377#endif /* CONFIG_NET_EGRESS */ 3559#endif /* CONFIG_NET_EGRESS */
3378 3560
3379static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) 3561#ifdef CONFIG_XPS
3562static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3563 struct xps_dev_maps *dev_maps, unsigned int tci)
3564{
3565 struct xps_map *map;
3566 int queue_index = -1;
3567
3568 if (dev->num_tc) {
3569 tci *= dev->num_tc;
3570 tci += netdev_get_prio_tc_map(dev, skb->priority);
3571 }
3572
3573 map = rcu_dereference(dev_maps->attr_map[tci]);
3574 if (map) {
3575 if (map->len == 1)
3576 queue_index = map->queues[0];
3577 else
3578 queue_index = map->queues[reciprocal_scale(
3579 skb_get_hash(skb), map->len)];
3580 if (unlikely(queue_index >= dev->real_num_tx_queues))
3581 queue_index = -1;
3582 }
3583 return queue_index;
3584}
3585#endif
3586
3587static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3588 struct sk_buff *skb)
3380{ 3589{
3381#ifdef CONFIG_XPS 3590#ifdef CONFIG_XPS
3382 struct xps_dev_maps *dev_maps; 3591 struct xps_dev_maps *dev_maps;
3383 struct xps_map *map; 3592 struct sock *sk = skb->sk;
3384 int queue_index = -1; 3593 int queue_index = -1;
3385 3594
3595 if (!static_key_false(&xps_needed))
3596 return -1;
3597
3386 rcu_read_lock(); 3598 rcu_read_lock();
3387 dev_maps = rcu_dereference(dev->xps_maps); 3599 if (!static_key_false(&xps_rxqs_needed))
3600 goto get_cpus_map;
3601
3602 dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
3388 if (dev_maps) { 3603 if (dev_maps) {
3389 unsigned int tci = skb->sender_cpu - 1; 3604 int tci = sk_rx_queue_get(sk);
3390 3605
3391 if (dev->num_tc) { 3606 if (tci >= 0 && tci < dev->num_rx_queues)
3392 tci *= dev->num_tc; 3607 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3393 tci += netdev_get_prio_tc_map(dev, skb->priority); 3608 tci);
3394 } 3609 }
3395 3610
3396 map = rcu_dereference(dev_maps->cpu_map[tci]); 3611get_cpus_map:
3397 if (map) { 3612 if (queue_index < 0) {
3398 if (map->len == 1) 3613 dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
3399 queue_index = map->queues[0]; 3614 if (dev_maps) {
3400 else 3615 unsigned int tci = skb->sender_cpu - 1;
3401 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), 3616
3402 map->len)]; 3617 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3403 if (unlikely(queue_index >= dev->real_num_tx_queues)) 3618 tci);
3404 queue_index = -1;
3405 } 3619 }
3406 } 3620 }
3407 rcu_read_unlock(); 3621 rcu_read_unlock();
@@ -3412,17 +3626,36 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3412#endif 3626#endif
3413} 3627}
3414 3628
3415static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) 3629u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3630 struct net_device *sb_dev,
3631 select_queue_fallback_t fallback)
3632{
3633 return 0;
3634}
3635EXPORT_SYMBOL(dev_pick_tx_zero);
3636
3637u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
3638 struct net_device *sb_dev,
3639 select_queue_fallback_t fallback)
3640{
3641 return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
3642}
3643EXPORT_SYMBOL(dev_pick_tx_cpu_id);
3644
3645static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
3646 struct net_device *sb_dev)
3416{ 3647{
3417 struct sock *sk = skb->sk; 3648 struct sock *sk = skb->sk;
3418 int queue_index = sk_tx_queue_get(sk); 3649 int queue_index = sk_tx_queue_get(sk);
3419 3650
3651 sb_dev = sb_dev ? : dev;
3652
3420 if (queue_index < 0 || skb->ooo_okay || 3653 if (queue_index < 0 || skb->ooo_okay ||
3421 queue_index >= dev->real_num_tx_queues) { 3654 queue_index >= dev->real_num_tx_queues) {
3422 int new_index = get_xps_queue(dev, skb); 3655 int new_index = get_xps_queue(dev, sb_dev, skb);
3423 3656
3424 if (new_index < 0) 3657 if (new_index < 0)
3425 new_index = skb_tx_hash(dev, skb); 3658 new_index = skb_tx_hash(dev, sb_dev, skb);
3426 3659
3427 if (queue_index != new_index && sk && 3660 if (queue_index != new_index && sk &&
3428 sk_fullsock(sk) && 3661 sk_fullsock(sk) &&
@@ -3437,7 +3670,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3437 3670
3438struct netdev_queue *netdev_pick_tx(struct net_device *dev, 3671struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3439 struct sk_buff *skb, 3672 struct sk_buff *skb,
3440 void *accel_priv) 3673 struct net_device *sb_dev)
3441{ 3674{
3442 int queue_index = 0; 3675 int queue_index = 0;
3443 3676
@@ -3452,10 +3685,10 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3452 const struct net_device_ops *ops = dev->netdev_ops; 3685 const struct net_device_ops *ops = dev->netdev_ops;
3453 3686
3454 if (ops->ndo_select_queue) 3687 if (ops->ndo_select_queue)
3455 queue_index = ops->ndo_select_queue(dev, skb, accel_priv, 3688 queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
3456 __netdev_pick_tx); 3689 __netdev_pick_tx);
3457 else 3690 else
3458 queue_index = __netdev_pick_tx(dev, skb); 3691 queue_index = __netdev_pick_tx(dev, skb, sb_dev);
3459 3692
3460 queue_index = netdev_cap_txqueue(dev, queue_index); 3693 queue_index = netdev_cap_txqueue(dev, queue_index);
3461 } 3694 }
@@ -3467,7 +3700,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3467/** 3700/**
3468 * __dev_queue_xmit - transmit a buffer 3701 * __dev_queue_xmit - transmit a buffer
3469 * @skb: buffer to transmit 3702 * @skb: buffer to transmit
3470 * @accel_priv: private data used for L2 forwarding offload 3703 * @sb_dev: suboordinate device used for L2 forwarding offload
3471 * 3704 *
3472 * Queue a buffer for transmission to a network device. The caller must 3705 * Queue a buffer for transmission to a network device. The caller must
3473 * have set the device and priority and built the buffer before calling 3706 * have set the device and priority and built the buffer before calling
@@ -3490,7 +3723,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3490 * the BH enable code must have IRQs enabled so that it will not deadlock. 3723 * the BH enable code must have IRQs enabled so that it will not deadlock.
3491 * --BLG 3724 * --BLG
3492 */ 3725 */
3493static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) 3726static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
3494{ 3727{
3495 struct net_device *dev = skb->dev; 3728 struct net_device *dev = skb->dev;
3496 struct netdev_queue *txq; 3729 struct netdev_queue *txq;
@@ -3529,7 +3762,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3529 else 3762 else
3530 skb_dst_force(skb); 3763 skb_dst_force(skb);
3531 3764
3532 txq = netdev_pick_tx(dev, skb, accel_priv); 3765 txq = netdev_pick_tx(dev, skb, sb_dev);
3533 q = rcu_dereference_bh(txq->qdisc); 3766 q = rcu_dereference_bh(txq->qdisc);
3534 3767
3535 trace_net_dev_queue(skb); 3768 trace_net_dev_queue(skb);
@@ -3603,9 +3836,9 @@ int dev_queue_xmit(struct sk_buff *skb)
3603} 3836}
3604EXPORT_SYMBOL(dev_queue_xmit); 3837EXPORT_SYMBOL(dev_queue_xmit);
3605 3838
3606int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) 3839int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
3607{ 3840{
3608 return __dev_queue_xmit(skb, accel_priv); 3841 return __dev_queue_xmit(skb, sb_dev);
3609} 3842}
3610EXPORT_SYMBOL(dev_queue_xmit_accel); 3843EXPORT_SYMBOL(dev_queue_xmit_accel);
3611 3844
@@ -4028,7 +4261,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4028 /* Reinjected packets coming from act_mirred or similar should 4261 /* Reinjected packets coming from act_mirred or similar should
4029 * not get XDP generic processing. 4262 * not get XDP generic processing.
4030 */ 4263 */
4031 if (skb_cloned(skb)) 4264 if (skb_cloned(skb) || skb_is_tc_redirected(skb))
4032 return XDP_PASS; 4265 return XDP_PASS;
4033 4266
4034 /* XDP packets must be linear and must have sufficient headroom 4267 /* XDP packets must be linear and must have sufficient headroom
@@ -4378,6 +4611,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4378 __skb_push(skb, skb->mac_len); 4611 __skb_push(skb, skb->mac_len);
4379 skb_do_redirect(skb); 4612 skb_do_redirect(skb);
4380 return NULL; 4613 return NULL;
4614 case TC_ACT_REINSERT:
4615 /* this does not scrub the packet, and updates stats on error */
4616 skb_tc_reinsert(skb, &cl_res);
4617 return NULL;
4381 default: 4618 default:
4382 break; 4619 break;
4383 } 4620 }
@@ -4494,7 +4731,8 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4494 return 0; 4731 return 0;
4495} 4732}
4496 4733
4497static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) 4734static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
4735 struct packet_type **ppt_prev)
4498{ 4736{
4499 struct packet_type *ptype, *pt_prev; 4737 struct packet_type *ptype, *pt_prev;
4500 rx_handler_func_t *rx_handler; 4738 rx_handler_func_t *rx_handler;
@@ -4624,8 +4862,7 @@ skip_classify:
4624 if (pt_prev) { 4862 if (pt_prev) {
4625 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) 4863 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
4626 goto drop; 4864 goto drop;
4627 else 4865 *ppt_prev = pt_prev;
4628 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4629 } else { 4866 } else {
4630drop: 4867drop:
4631 if (!deliver_exact) 4868 if (!deliver_exact)
@@ -4643,6 +4880,18 @@ out:
4643 return ret; 4880 return ret;
4644} 4881}
4645 4882
4883static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
4884{
4885 struct net_device *orig_dev = skb->dev;
4886 struct packet_type *pt_prev = NULL;
4887 int ret;
4888
4889 ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
4890 if (pt_prev)
4891 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4892 return ret;
4893}
4894
4646/** 4895/**
4647 * netif_receive_skb_core - special purpose version of netif_receive_skb 4896 * netif_receive_skb_core - special purpose version of netif_receive_skb
4648 * @skb: buffer to process 4897 * @skb: buffer to process
@@ -4663,13 +4912,72 @@ int netif_receive_skb_core(struct sk_buff *skb)
4663 int ret; 4912 int ret;
4664 4913
4665 rcu_read_lock(); 4914 rcu_read_lock();
4666 ret = __netif_receive_skb_core(skb, false); 4915 ret = __netif_receive_skb_one_core(skb, false);
4667 rcu_read_unlock(); 4916 rcu_read_unlock();
4668 4917
4669 return ret; 4918 return ret;
4670} 4919}
4671EXPORT_SYMBOL(netif_receive_skb_core); 4920EXPORT_SYMBOL(netif_receive_skb_core);
4672 4921
4922static inline void __netif_receive_skb_list_ptype(struct list_head *head,
4923 struct packet_type *pt_prev,
4924 struct net_device *orig_dev)
4925{
4926 struct sk_buff *skb, *next;
4927
4928 if (!pt_prev)
4929 return;
4930 if (list_empty(head))
4931 return;
4932 if (pt_prev->list_func != NULL)
4933 pt_prev->list_func(head, pt_prev, orig_dev);
4934 else
4935 list_for_each_entry_safe(skb, next, head, list)
4936 pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4937}
4938
4939static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
4940{
4941 /* Fast-path assumptions:
4942 * - There is no RX handler.
4943 * - Only one packet_type matches.
4944 * If either of these fails, we will end up doing some per-packet
4945 * processing in-line, then handling the 'last ptype' for the whole
4946 * sublist. This can't cause out-of-order delivery to any single ptype,
4947 * because the 'last ptype' must be constant across the sublist, and all
4948 * other ptypes are handled per-packet.
4949 */
4950 /* Current (common) ptype of sublist */
4951 struct packet_type *pt_curr = NULL;
4952 /* Current (common) orig_dev of sublist */
4953 struct net_device *od_curr = NULL;
4954 struct list_head sublist;
4955 struct sk_buff *skb, *next;
4956
4957 INIT_LIST_HEAD(&sublist);
4958 list_for_each_entry_safe(skb, next, head, list) {
4959 struct net_device *orig_dev = skb->dev;
4960 struct packet_type *pt_prev = NULL;
4961
4962 list_del(&skb->list);
4963 __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
4964 if (!pt_prev)
4965 continue;
4966 if (pt_curr != pt_prev || od_curr != orig_dev) {
4967 /* dispatch old sublist */
4968 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
4969 /* start new sublist */
4970 INIT_LIST_HEAD(&sublist);
4971 pt_curr = pt_prev;
4972 od_curr = orig_dev;
4973 }
4974 list_add_tail(&skb->list, &sublist);
4975 }
4976
4977 /* dispatch final sublist */
4978 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
4979}
4980
4673static int __netif_receive_skb(struct sk_buff *skb) 4981static int __netif_receive_skb(struct sk_buff *skb)
4674{ 4982{
4675 int ret; 4983 int ret;
@@ -4687,14 +4995,44 @@ static int __netif_receive_skb(struct sk_buff *skb)
4687 * context down to all allocation sites. 4995 * context down to all allocation sites.
4688 */ 4996 */
4689 noreclaim_flag = memalloc_noreclaim_save(); 4997 noreclaim_flag = memalloc_noreclaim_save();
4690 ret = __netif_receive_skb_core(skb, true); 4998 ret = __netif_receive_skb_one_core(skb, true);
4691 memalloc_noreclaim_restore(noreclaim_flag); 4999 memalloc_noreclaim_restore(noreclaim_flag);
4692 } else 5000 } else
4693 ret = __netif_receive_skb_core(skb, false); 5001 ret = __netif_receive_skb_one_core(skb, false);
4694 5002
4695 return ret; 5003 return ret;
4696} 5004}
4697 5005
5006static void __netif_receive_skb_list(struct list_head *head)
5007{
5008 unsigned long noreclaim_flag = 0;
5009 struct sk_buff *skb, *next;
5010 bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5011
5012 list_for_each_entry_safe(skb, next, head, list) {
5013 if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5014 struct list_head sublist;
5015
5016 /* Handle the previous sublist */
5017 list_cut_before(&sublist, head, &skb->list);
5018 if (!list_empty(&sublist))
5019 __netif_receive_skb_list_core(&sublist, pfmemalloc);
5020 pfmemalloc = !pfmemalloc;
5021 /* See comments in __netif_receive_skb */
5022 if (pfmemalloc)
5023 noreclaim_flag = memalloc_noreclaim_save();
5024 else
5025 memalloc_noreclaim_restore(noreclaim_flag);
5026 }
5027 }
5028 /* Handle the remaining sublist */
5029 if (!list_empty(head))
5030 __netif_receive_skb_list_core(head, pfmemalloc);
5031 /* Restore pflags */
5032 if (pfmemalloc)
5033 memalloc_noreclaim_restore(noreclaim_flag);
5034}
5035
4698static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) 5036static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
4699{ 5037{
4700 struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); 5038 struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
@@ -4717,7 +5055,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
4717 break; 5055 break;
4718 5056
4719 case XDP_QUERY_PROG: 5057 case XDP_QUERY_PROG:
4720 xdp->prog_attached = !!old;
4721 xdp->prog_id = old ? old->aux->id : 0; 5058 xdp->prog_id = old ? old->aux->id : 0;
4722 break; 5059 break;
4723 5060
@@ -4769,6 +5106,55 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
4769 return ret; 5106 return ret;
4770} 5107}
4771 5108
5109static void netif_receive_skb_list_internal(struct list_head *head)
5110{
5111 struct bpf_prog *xdp_prog = NULL;
5112 struct sk_buff *skb, *next;
5113 struct list_head sublist;
5114
5115 INIT_LIST_HEAD(&sublist);
5116 list_for_each_entry_safe(skb, next, head, list) {
5117 net_timestamp_check(netdev_tstamp_prequeue, skb);
5118 list_del(&skb->list);
5119 if (!skb_defer_rx_timestamp(skb))
5120 list_add_tail(&skb->list, &sublist);
5121 }
5122 list_splice_init(&sublist, head);
5123
5124 if (static_branch_unlikely(&generic_xdp_needed_key)) {
5125 preempt_disable();
5126 rcu_read_lock();
5127 list_for_each_entry_safe(skb, next, head, list) {
5128 xdp_prog = rcu_dereference(skb->dev->xdp_prog);
5129 list_del(&skb->list);
5130 if (do_xdp_generic(xdp_prog, skb) == XDP_PASS)
5131 list_add_tail(&skb->list, &sublist);
5132 }
5133 rcu_read_unlock();
5134 preempt_enable();
5135 /* Put passed packets back on main list */
5136 list_splice_init(&sublist, head);
5137 }
5138
5139 rcu_read_lock();
5140#ifdef CONFIG_RPS
5141 if (static_key_false(&rps_needed)) {
5142 list_for_each_entry_safe(skb, next, head, list) {
5143 struct rps_dev_flow voidflow, *rflow = &voidflow;
5144 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5145
5146 if (cpu >= 0) {
5147 /* Will be handled, remove from list */
5148 list_del(&skb->list);
5149 enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5150 }
5151 }
5152 }
5153#endif
5154 __netif_receive_skb_list(head);
5155 rcu_read_unlock();
5156}
5157
4772/** 5158/**
4773 * netif_receive_skb - process receive buffer from network 5159 * netif_receive_skb - process receive buffer from network
4774 * @skb: buffer to process 5160 * @skb: buffer to process
@@ -4792,6 +5178,28 @@ int netif_receive_skb(struct sk_buff *skb)
4792} 5178}
4793EXPORT_SYMBOL(netif_receive_skb); 5179EXPORT_SYMBOL(netif_receive_skb);
4794 5180
5181/**
5182 * netif_receive_skb_list - process many receive buffers from network
5183 * @head: list of skbs to process.
5184 *
5185 * Since return value of netif_receive_skb() is normally ignored, and
5186 * wouldn't be meaningful for a list, this function returns void.
5187 *
5188 * This function may only be called from softirq context and interrupts
5189 * should be enabled.
5190 */
5191void netif_receive_skb_list(struct list_head *head)
5192{
5193 struct sk_buff *skb;
5194
5195 if (list_empty(head))
5196 return;
5197 list_for_each_entry(skb, head, list)
5198 trace_netif_receive_skb_list_entry(skb);
5199 netif_receive_skb_list_internal(head);
5200}
5201EXPORT_SYMBOL(netif_receive_skb_list);
5202
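The intended caller is a driver's NAPI poll loop: gather the skbs from one poll into a list and deliver the whole batch at once. A rough sketch, with every foo_* name assumed for illustration:

	/* Illustrative only: batched RX delivery from a hypothetical poll. */
	static int foo_poll(struct napi_struct *napi, int budget)
	{
		struct sk_buff *skb;
		LIST_HEAD(rx_list);
		int work = 0;

		while (work < budget && (skb = foo_next_rx_skb(napi))) {
			skb->protocol = eth_type_trans(skb, skb->dev);
			list_add_tail(&skb->list, &rx_list);
			work++;
		}

		netif_receive_skb_list(&rx_list);	/* no-op if empty */

		if (work < budget)
			napi_complete_done(napi, work);
		return work;
	}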
4795DEFINE_PER_CPU(struct work_struct, flush_works); 5203DEFINE_PER_CPU(struct work_struct, flush_works);
4796 5204
4797/* Network device is going away, flush any packets still pending */ 5205/* Network device is going away, flush any packets still pending */
@@ -4875,42 +5283,50 @@ out:
4875 return netif_receive_skb_internal(skb); 5283 return netif_receive_skb_internal(skb);
4876} 5284}
4877 5285
4878/* napi->gro_list contains packets ordered by age. 5286static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
4879 * youngest packets at the head of it. 5287 bool flush_old)
4880 * Complete skbs in reverse order to reduce latencies.
4881 */
4882void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4883{ 5288{
4884 struct sk_buff *skb, *prev = NULL; 5289 struct list_head *head = &napi->gro_hash[index].list;
4885 5290 struct sk_buff *skb, *p;
4886 /* scan list and build reverse chain */
4887 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4888 skb->prev = prev;
4889 prev = skb;
4890 }
4891
4892 for (skb = prev; skb; skb = prev) {
4893 skb->next = NULL;
4894 5291
5292 list_for_each_entry_safe_reverse(skb, p, head, list) {
4895 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 5293 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4896 return; 5294 return;
4897 5295 list_del(&skb->list);
4898 prev = skb->prev; 5296 skb->next = NULL;
4899 napi_gro_complete(skb); 5297 napi_gro_complete(skb);
4900 napi->gro_count--; 5298 napi->gro_hash[index].count--;
4901 } 5299 }
4902 5300
4903 napi->gro_list = NULL; 5301 if (!napi->gro_hash[index].count)
5302 __clear_bit(index, &napi->gro_bitmask);
5303}
5304
5305/* napi->gro_hash[].list contains packets ordered by age.
5306 * youngest packets at the head of it.
5307 * Complete skbs in reverse order to reduce latencies.
5308 */
5309void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5310{
5311 u32 i;
5312
5313 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
5314 if (test_bit(i, &napi->gro_bitmask))
5315 __napi_gro_flush_chain(napi, i, flush_old);
5316 }
4904} 5317}
4905EXPORT_SYMBOL(napi_gro_flush); 5318EXPORT_SYMBOL(napi_gro_flush);
4906 5319
4907static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) 5320static struct list_head *gro_list_prepare(struct napi_struct *napi,
5321 struct sk_buff *skb)
4908{ 5322{
4909 struct sk_buff *p;
4910 unsigned int maclen = skb->dev->hard_header_len; 5323 unsigned int maclen = skb->dev->hard_header_len;
4911 u32 hash = skb_get_hash_raw(skb); 5324 u32 hash = skb_get_hash_raw(skb);
5325 struct list_head *head;
5326 struct sk_buff *p;
4912 5327
4913 for (p = napi->gro_list; p; p = p->next) { 5328 head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5329 list_for_each_entry(p, head, list) {
4914 unsigned long diffs; 5330 unsigned long diffs;
4915 5331
4916 NAPI_GRO_CB(p)->flush = 0; 5332 NAPI_GRO_CB(p)->flush = 0;
@@ -4933,6 +5349,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4933 maclen); 5349 maclen);
4934 NAPI_GRO_CB(p)->same_flow = !diffs; 5350 NAPI_GRO_CB(p)->same_flow = !diffs;
4935 } 5351 }
5352
5353 return head;
4936} 5354}
4937 5355
4938static void skb_gro_reset_offset(struct sk_buff *skb) 5356static void skb_gro_reset_offset(struct sk_buff *skb)
@@ -4975,20 +5393,41 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4975 } 5393 }
4976} 5394}
4977 5395
5396static void gro_flush_oldest(struct list_head *head)
5397{
5398 struct sk_buff *oldest;
5399
5400 oldest = list_last_entry(head, struct sk_buff, list);
5401
5402 /* We are called with head length >= MAX_GRO_SKBS, so this is
5403 * impossible.
5404 */
5405 if (WARN_ON_ONCE(!oldest))
5406 return;
5407
5408 /* Do not adjust napi->gro_hash[].count, caller is adding a new
5409 * SKB to the chain.
5410 */
5411 list_del(&oldest->list);
5412 napi_gro_complete(oldest);
5413}
5414
4978static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 5415static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4979{ 5416{
4980 struct sk_buff **pp = NULL; 5417 u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5418 struct list_head *head = &offload_base;
4981 struct packet_offload *ptype; 5419 struct packet_offload *ptype;
4982 __be16 type = skb->protocol; 5420 __be16 type = skb->protocol;
4983 struct list_head *head = &offload_base; 5421 struct list_head *gro_head;
4984 int same_flow; 5422 struct sk_buff *pp = NULL;
4985 enum gro_result ret; 5423 enum gro_result ret;
5424 int same_flow;
4986 int grow; 5425 int grow;
4987 5426
4988 if (netif_elide_gro(skb->dev)) 5427 if (netif_elide_gro(skb->dev))
4989 goto normal; 5428 goto normal;
4990 5429
4991 gro_list_prepare(napi, skb); 5430 gro_head = gro_list_prepare(napi, skb);
4992 5431
4993 rcu_read_lock(); 5432 rcu_read_lock();
4994 list_for_each_entry_rcu(ptype, head, list) { 5433 list_for_each_entry_rcu(ptype, head, list) {
@@ -5022,7 +5461,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
5022 NAPI_GRO_CB(skb)->csum_valid = 0; 5461 NAPI_GRO_CB(skb)->csum_valid = 0;
5023 } 5462 }
5024 5463
5025 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); 5464 pp = ptype->callbacks.gro_receive(gro_head, skb);
5026 break; 5465 break;
5027 } 5466 }
5028 rcu_read_unlock(); 5467 rcu_read_unlock();
@@ -5039,12 +5478,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
5039 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 5478 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5040 5479
5041 if (pp) { 5480 if (pp) {
5042 struct sk_buff *nskb = *pp; 5481 list_del(&pp->list);
5043 5482 pp->next = NULL;
5044 *pp = nskb->next; 5483 napi_gro_complete(pp);
5045 nskb->next = NULL; 5484 napi->gro_hash[hash].count--;
5046 napi_gro_complete(nskb);
5047 napi->gro_count--;
5048 } 5485 }
5049 5486
5050 if (same_flow) 5487 if (same_flow)
@@ -5053,26 +5490,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
5053 if (NAPI_GRO_CB(skb)->flush) 5490 if (NAPI_GRO_CB(skb)->flush)
5054 goto normal; 5491 goto normal;
5055 5492
5056 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { 5493 if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5057 struct sk_buff *nskb = napi->gro_list; 5494 gro_flush_oldest(gro_head);
5058
5059 /* locate the end of the list to select the 'oldest' flow */
5060 while (nskb->next) {
5061 pp = &nskb->next;
5062 nskb = *pp;
5063 }
5064 *pp = NULL;
5065 nskb->next = NULL;
5066 napi_gro_complete(nskb);
5067 } else { 5495 } else {
5068 napi->gro_count++; 5496 napi->gro_hash[hash].count++;
5069 } 5497 }
5070 NAPI_GRO_CB(skb)->count = 1; 5498 NAPI_GRO_CB(skb)->count = 1;
5071 NAPI_GRO_CB(skb)->age = jiffies; 5499 NAPI_GRO_CB(skb)->age = jiffies;
5072 NAPI_GRO_CB(skb)->last = skb; 5500 NAPI_GRO_CB(skb)->last = skb;
5073 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 5501 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5074 skb->next = napi->gro_list; 5502 list_add(&skb->list, gro_head);
5075 napi->gro_list = skb;
5076 ret = GRO_HELD; 5503 ret = GRO_HELD;
5077 5504
5078pull: 5505pull:
@@ -5080,6 +5507,13 @@ pull:
5080 if (grow > 0) 5507 if (grow > 0)
5081 gro_pull_from_frag0(skb, grow); 5508 gro_pull_from_frag0(skb, grow);
5082ok: 5509ok:
5510 if (napi->gro_hash[hash].count) {
5511 if (!test_bit(hash, &napi->gro_bitmask))
5512 __set_bit(hash, &napi->gro_bitmask);
5513 } else if (test_bit(hash, &napi->gro_bitmask)) {
5514 __clear_bit(hash, &napi->gro_bitmask);
5515 }
5516
5083 return ret; 5517 return ret;
5084 5518
5085normal: 5519normal:
@@ -5478,7 +5912,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
5478 NAPIF_STATE_IN_BUSY_POLL))) 5912 NAPIF_STATE_IN_BUSY_POLL)))
5479 return false; 5913 return false;
5480 5914
5481 if (n->gro_list) { 5915 if (n->gro_bitmask) {
5482 unsigned long timeout = 0; 5916 unsigned long timeout = 0;
5483 5917
5484 if (work_done) 5918 if (work_done)
@@ -5687,21 +6121,31 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5687 /* Note : we use a relaxed variant of napi_schedule_prep() not setting 6121 /* Note : we use a relaxed variant of napi_schedule_prep() not setting
5688 * NAPI_STATE_MISSED, since we do not react to a device IRQ. 6122 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5689 */ 6123 */
5690 if (napi->gro_list && !napi_disable_pending(napi) && 6124 if (napi->gro_bitmask && !napi_disable_pending(napi) &&
5691 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) 6125 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
5692 __napi_schedule_irqoff(napi); 6126 __napi_schedule_irqoff(napi);
5693 6127
5694 return HRTIMER_NORESTART; 6128 return HRTIMER_NORESTART;
5695} 6129}
5696 6130
6131static void init_gro_hash(struct napi_struct *napi)
6132{
6133 int i;
6134
6135 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6136 INIT_LIST_HEAD(&napi->gro_hash[i].list);
6137 napi->gro_hash[i].count = 0;
6138 }
6139 napi->gro_bitmask = 0;
6140}
6141
5697void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 6142void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5698 int (*poll)(struct napi_struct *, int), int weight) 6143 int (*poll)(struct napi_struct *, int), int weight)
5699{ 6144{
5700 INIT_LIST_HEAD(&napi->poll_list); 6145 INIT_LIST_HEAD(&napi->poll_list);
5701 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 6146 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5702 napi->timer.function = napi_watchdog; 6147 napi->timer.function = napi_watchdog;
5703 napi->gro_count = 0; 6148 init_gro_hash(napi);
5704 napi->gro_list = NULL;
5705 napi->skb = NULL; 6149 napi->skb = NULL;
5706 napi->poll = poll; 6150 napi->poll = poll;
5707 if (weight > NAPI_POLL_WEIGHT) 6151 if (weight > NAPI_POLL_WEIGHT)
@@ -5734,6 +6178,19 @@ void napi_disable(struct napi_struct *n)
5734} 6178}
5735EXPORT_SYMBOL(napi_disable); 6179EXPORT_SYMBOL(napi_disable);
5736 6180
6181static void flush_gro_hash(struct napi_struct *napi)
6182{
6183 int i;
6184
6185 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6186 struct sk_buff *skb, *n;
6187
6188 list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6189 kfree_skb(skb);
6190 napi->gro_hash[i].count = 0;
6191 }
6192}
6193
5737/* Must be called in process context */ 6194/* Must be called in process context */
5738void netif_napi_del(struct napi_struct *napi) 6195void netif_napi_del(struct napi_struct *napi)
5739{ 6196{
@@ -5743,9 +6200,8 @@ void netif_napi_del(struct napi_struct *napi)
5743 list_del_init(&napi->dev_list); 6200 list_del_init(&napi->dev_list);
5744 napi_free_frags(napi); 6201 napi_free_frags(napi);
5745 6202
5746 kfree_skb_list(napi->gro_list); 6203 flush_gro_hash(napi);
5747 napi->gro_list = NULL; 6204 napi->gro_bitmask = 0;
5748 napi->gro_count = 0;
5749} 6205}
5750EXPORT_SYMBOL(netif_napi_del); 6206EXPORT_SYMBOL(netif_napi_del);
5751 6207
@@ -5787,7 +6243,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5787 goto out_unlock; 6243 goto out_unlock;
5788 } 6244 }
5789 6245
5790 if (n->gro_list) { 6246 if (n->gro_bitmask) {
5791 /* flush too old packets 6247 /* flush too old packets
5792 * If HZ < 1000, flush all packets. 6248 * If HZ < 1000, flush all packets.
5793 */ 6249 */
@@ -7080,13 +7536,15 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu)
7080EXPORT_SYMBOL(__dev_set_mtu); 7536EXPORT_SYMBOL(__dev_set_mtu);
7081 7537
7082/** 7538/**
7083 * dev_set_mtu - Change maximum transfer unit 7539 * dev_set_mtu_ext - Change maximum transfer unit
7084 * @dev: device 7540 * @dev: device
7085 * @new_mtu: new transfer unit 7541 * @new_mtu: new transfer unit
7542 * @extack: netlink extended ack
7086 * 7543 *
7087 * Change the maximum transfer size of the network device. 7544 * Change the maximum transfer size of the network device.
7088 */ 7545 */
7089int dev_set_mtu(struct net_device *dev, int new_mtu) 7546int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
7547 struct netlink_ext_ack *extack)
7090{ 7548{
7091 int err, orig_mtu; 7549 int err, orig_mtu;
7092 7550
@@ -7095,14 +7553,12 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
7095 7553
7096 /* MTU must be positive, and in range */ 7554 /* MTU must be positive, and in range */
7097 if (new_mtu < 0 || new_mtu < dev->min_mtu) { 7555 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
7098 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n", 7556 NL_SET_ERR_MSG(extack, "mtu less than device minimum");
7099 dev->name, new_mtu, dev->min_mtu);
7100 return -EINVAL; 7557 return -EINVAL;
7101 } 7558 }
7102 7559
7103 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { 7560 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
7104 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n", 7561 NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
7105 dev->name, new_mtu, dev->max_mtu);
7106 return -EINVAL; 7562 return -EINVAL;
7107 } 7563 }
7108 7564
@@ -7130,6 +7586,18 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
7130 } 7586 }
7131 return err; 7587 return err;
7132} 7588}
7589
7590int dev_set_mtu(struct net_device *dev, int new_mtu)
7591{
7592 struct netlink_ext_ack extack;
7593 int err;
7594
7595 memset(&extack, 0, sizeof(extack));
7596 err = dev_set_mtu_ext(dev, new_mtu, &extack);
7597 if (err && extack._msg)
7598 net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
7599 return err;
7600}
7133EXPORT_SYMBOL(dev_set_mtu); 7601EXPORT_SYMBOL(dev_set_mtu);
7134 7602
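Callers that already carry a netlink extended ack (the rtnetlink path, for instance) can pass it straight through so userspace sees the reason string, while dev_set_mtu() keeps the old behaviour by rate-limited logging of the extack message. A hedged sketch of the extack-aware call (foo_* names assumed):

	/* Illustration: let a netlink caller see why the MTU change failed. */
	static int foo_change_mtu(struct net_device *dev, int new_mtu,
				  struct netlink_ext_ack *extack)
	{
		/* On error, extack->_msg carries e.g.
		 * "mtu greater than device maximum".
		 */
		return dev_set_mtu_ext(dev, new_mtu, extack);
	}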
7135/** 7603/**
@@ -7149,16 +7617,19 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
7149 dev->tx_queue_len = new_len; 7617 dev->tx_queue_len = new_len;
7150 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); 7618 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
7151 res = notifier_to_errno(res); 7619 res = notifier_to_errno(res);
7152 if (res) { 7620 if (res)
7153 netdev_err(dev, 7621 goto err_rollback;
7154 "refused to change device tx_queue_len\n"); 7622 res = dev_qdisc_change_tx_queue_len(dev);
7155 dev->tx_queue_len = orig_len; 7623 if (res)
7156 return res; 7624 goto err_rollback;
7157 }
7158 return dev_qdisc_change_tx_queue_len(dev);
7159 } 7625 }
7160 7626
7161 return 0; 7627 return 0;
7628
7629err_rollback:
7630 netdev_err(dev, "refused to change device tx_queue_len\n");
7631 dev->tx_queue_len = orig_len;
7632 return res;
7162} 7633}
7163 7634
7164/** 7635/**
@@ -7276,23 +7747,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
7276} 7747}
7277EXPORT_SYMBOL(dev_change_proto_down); 7748EXPORT_SYMBOL(dev_change_proto_down);
7278 7749
7279void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, 7750u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
7280 struct netdev_bpf *xdp) 7751 enum bpf_netdev_command cmd)
7281{ 7752{
7282 memset(xdp, 0, sizeof(*xdp)); 7753 struct netdev_bpf xdp;
7283 xdp->command = XDP_QUERY_PROG;
7284 7754
7285 /* Query must always succeed. */ 7755 if (!bpf_op)
7286 WARN_ON(bpf_op(dev, xdp) < 0); 7756 return 0;
7287}
7288 7757
7289static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) 7758 memset(&xdp, 0, sizeof(xdp));
7290{ 7759 xdp.command = cmd;
7291 struct netdev_bpf xdp;
7292 7760
7293 __dev_xdp_query(dev, bpf_op, &xdp); 7761 /* Query must always succeed. */
7762 WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
7294 7763
7295 return xdp.prog_attached; 7764 return xdp.prog_id;
7296} 7765}
7297 7766
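With prog_attached gone, "is an XDP program attached?" becomes "is the reported id non-zero?". A small hypothetical helper built on the reworked query (not part of this patch):

	/* Hypothetical: true if a native or offloaded XDP program is attached.
	 * __dev_xdp_query() already returns 0 when ndo_bpf is NULL.
	 */
	static bool foo_xdp_prog_attached(struct net_device *dev)
	{
		const struct net_device_ops *ops = dev->netdev_ops;

		return __dev_xdp_query(dev, ops->ndo_bpf, XDP_QUERY_PROG) ||
		       __dev_xdp_query(dev, ops->ndo_bpf, XDP_QUERY_PROG_HW);
	}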
7298static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, 7767static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
@@ -7326,12 +7795,19 @@ static void dev_xdp_uninstall(struct net_device *dev)
7326 if (!ndo_bpf) 7795 if (!ndo_bpf)
7327 return; 7796 return;
7328 7797
7329 __dev_xdp_query(dev, ndo_bpf, &xdp); 7798 memset(&xdp, 0, sizeof(xdp));
7330 if (xdp.prog_attached == XDP_ATTACHED_NONE) 7799 xdp.command = XDP_QUERY_PROG;
7331 return; 7800 WARN_ON(ndo_bpf(dev, &xdp));
7801 if (xdp.prog_id)
7802 WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
7803 NULL));
7332 7804
7333 /* Program removal should always succeed */ 7805 /* Remove HW offload */
7334 WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); 7806 memset(&xdp, 0, sizeof(xdp));
7807 xdp.command = XDP_QUERY_PROG_HW;
7808 if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
7809 WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
7810 NULL));
7335} 7811}
7336 7812
7337/** 7813/**
@@ -7347,12 +7823,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
7347 int fd, u32 flags) 7823 int fd, u32 flags)
7348{ 7824{
7349 const struct net_device_ops *ops = dev->netdev_ops; 7825 const struct net_device_ops *ops = dev->netdev_ops;
7826 enum bpf_netdev_command query;
7350 struct bpf_prog *prog = NULL; 7827 struct bpf_prog *prog = NULL;
7351 bpf_op_t bpf_op, bpf_chk; 7828 bpf_op_t bpf_op, bpf_chk;
7352 int err; 7829 int err;
7353 7830
7354 ASSERT_RTNL(); 7831 ASSERT_RTNL();
7355 7832
7833 query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
7834
7356 bpf_op = bpf_chk = ops->ndo_bpf; 7835 bpf_op = bpf_chk = ops->ndo_bpf;
7357 if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) 7836 if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
7358 return -EOPNOTSUPP; 7837 return -EOPNOTSUPP;
@@ -7362,10 +7841,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
7362 bpf_chk = generic_xdp_install; 7841 bpf_chk = generic_xdp_install;
7363 7842
7364 if (fd >= 0) { 7843 if (fd >= 0) {
7365 if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) 7844 if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
7845 __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
7366 return -EEXIST; 7846 return -EEXIST;
7367 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && 7847 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
7368 __dev_xdp_attached(dev, bpf_op)) 7848 __dev_xdp_query(dev, bpf_op, query))
7369 return -EBUSY; 7849 return -EBUSY;
7370 7850
7371 prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, 7851 prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
@@ -8834,6 +9314,9 @@ static struct hlist_head * __net_init netdev_create_hash(void)
8834/* Initialize per network namespace state */ 9314/* Initialize per network namespace state */
8835static int __net_init netdev_init(struct net *net) 9315static int __net_init netdev_init(struct net *net)
8836{ 9316{
9317 BUILD_BUG_ON(GRO_HASH_BUCKETS >
9318 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
9319
8837 if (net != &init_net) 9320 if (net != &init_net)
8838 INIT_LIST_HEAD(&net->dev_base_head); 9321 INIT_LIST_HEAD(&net->dev_base_head);
8839 9322
@@ -9104,6 +9587,7 @@ static int __init net_dev_init(void)
9104 sd->cpu = i; 9587 sd->cpu = i;
9105#endif 9588#endif
9106 9589
9590 init_gro_hash(&sd->backlog);
9107 sd->backlog.poll = process_backlog; 9591 sd->backlog.poll = process_backlog;
9108 sd->backlog.weight = weight_p; 9592 sd->backlog.weight = weight_p;
9109 } 9593 }
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index a04e1e88bf3a..90e8aa36881e 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -284,19 +284,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
284 case SIOCSIFTXQLEN: 284 case SIOCSIFTXQLEN:
285 if (ifr->ifr_qlen < 0) 285 if (ifr->ifr_qlen < 0)
286 return -EINVAL; 286 return -EINVAL;
287 if (dev->tx_queue_len ^ ifr->ifr_qlen) { 287 return dev_change_tx_queue_len(dev, ifr->ifr_qlen);
288 unsigned int orig_len = dev->tx_queue_len;
289
290 dev->tx_queue_len = ifr->ifr_qlen;
291 err = call_netdevice_notifiers(
292 NETDEV_CHANGE_TX_QUEUE_LEN, dev);
293 err = notifier_to_errno(err);
294 if (err) {
295 dev->tx_queue_len = orig_len;
296 return err;
297 }
298 }
299 return 0;
300 288
301 case SIOCSIFNAME: 289 case SIOCSIFNAME:
302 ifr->ifr_newname[IFNAMSIZ-1] = '\0'; 290 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 22099705cc41..65fc366a78a4 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -326,6 +326,57 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
326 pool_type, p_tc_index); 326 pool_type, p_tc_index);
327} 327}
328 328
329struct devlink_region {
330 struct devlink *devlink;
331 struct list_head list;
332 const char *name;
333 struct list_head snapshot_list;
334 u32 max_snapshots;
335 u32 cur_snapshots;
336 u64 size;
337};
338
339struct devlink_snapshot {
340 struct list_head list;
341 struct devlink_region *region;
342 devlink_snapshot_data_dest_t *data_destructor;
343 u64 data_len;
344 u8 *data;
345 u32 id;
346};
347
348static struct devlink_region *
349devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
350{
351 struct devlink_region *region;
352
353 list_for_each_entry(region, &devlink->region_list, list)
354 if (!strcmp(region->name, region_name))
355 return region;
356
357 return NULL;
358}
359
360static struct devlink_snapshot *
361devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
362{
363 struct devlink_snapshot *snapshot;
364
365 list_for_each_entry(snapshot, &region->snapshot_list, list)
366 if (snapshot->id == id)
367 return snapshot;
368
369 return NULL;
370}
371
372static void devlink_region_snapshot_del(struct devlink_snapshot *snapshot)
373{
374 snapshot->region->cur_snapshots--;
375 list_del(&snapshot->list);
376 (*snapshot->data_destructor)(snapshot->data);
377 kfree(snapshot);
378}
379
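These lookups and the teardown helper operate on regions and snapshots that drivers create through the driver-facing devlink API (not visible in this hunk). A heavily hedged sketch of that side, with the foo_* names, sizes and the exact devlink_region_create()/devlink_region_snapshot_create() usage assumed for illustration:

	/* Illustration only: driver side of a crash-dump region. */
	static int foo_register_dump_region(struct devlink *devlink)
	{
		struct devlink_region *region;

		region = devlink_region_create(devlink, "foo-crash-dump",
					       4 /* max snapshots */, 0x10000);
		if (IS_ERR(region))
			return PTR_ERR(region);

		/* Later, on a crash (data must remain valid until the
		 * destructor runs):
		 * devlink_region_snapshot_create(region, len, data, id, kfree);
		 */
		return 0;
	}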
329#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) 380#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0)
330#define DEVLINK_NL_FLAG_NEED_PORT BIT(1) 381#define DEVLINK_NL_FLAG_NEED_PORT BIT(1)
331#define DEVLINK_NL_FLAG_NEED_SB BIT(2) 382#define DEVLINK_NL_FLAG_NEED_SB BIT(2)
@@ -2604,6 +2655,919 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
2604 return devlink->ops->reload(devlink, info->extack); 2655 return devlink->ops->reload(devlink, info->extack);
2605} 2656}
2606 2657
2658static const struct devlink_param devlink_param_generic[] = {
2659 {
2660 .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
2661 .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME,
2662 .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE,
2663 },
2664 {
2665 .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
2666 .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME,
2667 .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE,
2668 },
2669 {
2670 .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
2671 .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME,
2672 .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE,
2673 },
2674 {
2675 .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
2676 .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME,
2677 .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE,
2678 },
2679};
2680
2681static int devlink_param_generic_verify(const struct devlink_param *param)
2682{
 2683 /* verify it matches a generic parameter by id and name */
2684 if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX)
2685 return -EINVAL;
2686 if (strcmp(param->name, devlink_param_generic[param->id].name))
2687 return -ENOENT;
2688
2689 WARN_ON(param->type != devlink_param_generic[param->id].type);
2690
2691 return 0;
2692}
2693
2694static int devlink_param_driver_verify(const struct devlink_param *param)
2695{
2696 int i;
2697
2698 if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX)
2699 return -EINVAL;
2700 /* verify no such name in generic params */
2701 for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++)
2702 if (!strcmp(param->name, devlink_param_generic[i].name))
2703 return -EEXIST;
2704
2705 return 0;
2706}
2707
2708static struct devlink_param_item *
2709devlink_param_find_by_name(struct list_head *param_list,
2710 const char *param_name)
2711{
2712 struct devlink_param_item *param_item;
2713
2714 list_for_each_entry(param_item, param_list, list)
2715 if (!strcmp(param_item->param->name, param_name))
2716 return param_item;
2717 return NULL;
2718}
2719
2720static struct devlink_param_item *
2721devlink_param_find_by_id(struct list_head *param_list, u32 param_id)
2722{
2723 struct devlink_param_item *param_item;
2724
2725 list_for_each_entry(param_item, param_list, list)
2726 if (param_item->param->id == param_id)
2727 return param_item;
2728 return NULL;
2729}
2730
2731static bool
2732devlink_param_cmode_is_supported(const struct devlink_param *param,
2733 enum devlink_param_cmode cmode)
2734{
2735 return test_bit(cmode, &param->supported_cmodes);
2736}
2737
2738static int devlink_param_get(struct devlink *devlink,
2739 const struct devlink_param *param,
2740 struct devlink_param_gset_ctx *ctx)
2741{
2742 if (!param->get)
2743 return -EOPNOTSUPP;
2744 return param->get(devlink, param->id, ctx);
2745}
2746
2747static int devlink_param_set(struct devlink *devlink,
2748 const struct devlink_param *param,
2749 struct devlink_param_gset_ctx *ctx)
2750{
2751 if (!param->set)
2752 return -EOPNOTSUPP;
2753 return param->set(devlink, param->id, ctx);
2754}
2755
2756static int
2757devlink_param_type_to_nla_type(enum devlink_param_type param_type)
2758{
2759 switch (param_type) {
2760 case DEVLINK_PARAM_TYPE_U8:
2761 return NLA_U8;
2762 case DEVLINK_PARAM_TYPE_U16:
2763 return NLA_U16;
2764 case DEVLINK_PARAM_TYPE_U32:
2765 return NLA_U32;
2766 case DEVLINK_PARAM_TYPE_STRING:
2767 return NLA_STRING;
2768 case DEVLINK_PARAM_TYPE_BOOL:
2769 return NLA_FLAG;
2770 default:
2771 return -EINVAL;
2772 }
2773}
2774
2775static int
2776devlink_nl_param_value_fill_one(struct sk_buff *msg,
2777 enum devlink_param_type type,
2778 enum devlink_param_cmode cmode,
2779 union devlink_param_value val)
2780{
2781 struct nlattr *param_value_attr;
2782
2783 param_value_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUE);
2784 if (!param_value_attr)
2785 goto nla_put_failure;
2786
2787 if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode))
2788 goto value_nest_cancel;
2789
2790 switch (type) {
2791 case DEVLINK_PARAM_TYPE_U8:
2792 if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8))
2793 goto value_nest_cancel;
2794 break;
2795 case DEVLINK_PARAM_TYPE_U16:
2796 if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16))
2797 goto value_nest_cancel;
2798 break;
2799 case DEVLINK_PARAM_TYPE_U32:
2800 if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32))
2801 goto value_nest_cancel;
2802 break;
2803 case DEVLINK_PARAM_TYPE_STRING:
2804 if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA,
2805 val.vstr))
2806 goto value_nest_cancel;
2807 break;
2808 case DEVLINK_PARAM_TYPE_BOOL:
2809 if (val.vbool &&
2810 nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA))
2811 goto value_nest_cancel;
2812 break;
2813 }
2814
2815 nla_nest_end(msg, param_value_attr);
2816 return 0;
2817
2818value_nest_cancel:
2819 nla_nest_cancel(msg, param_value_attr);
2820nla_put_failure:
2821 return -EMSGSIZE;
2822}
2823
2824static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
2825 struct devlink_param_item *param_item,
2826 enum devlink_command cmd,
2827 u32 portid, u32 seq, int flags)
2828{
2829 union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1];
2830 const struct devlink_param *param = param_item->param;
2831 struct devlink_param_gset_ctx ctx;
2832 struct nlattr *param_values_list;
2833 struct nlattr *param_attr;
2834 int nla_type;
2835 void *hdr;
2836 int err;
2837 int i;
2838
2839 /* Get value from driver part to driverinit configuration mode */
2840 for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
2841 if (!devlink_param_cmode_is_supported(param, i))
2842 continue;
2843 if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) {
2844 if (!param_item->driverinit_value_valid)
2845 return -EOPNOTSUPP;
2846 param_value[i] = param_item->driverinit_value;
2847 } else {
2848 ctx.cmode = i;
2849 err = devlink_param_get(devlink, param, &ctx);
2850 if (err)
2851 return err;
2852 param_value[i] = ctx.val;
2853 }
2854 }
2855
2856 hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
2857 if (!hdr)
2858 return -EMSGSIZE;
2859
2860 if (devlink_nl_put_handle(msg, devlink))
2861 goto genlmsg_cancel;
2862 param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM);
2863 if (!param_attr)
2864 goto genlmsg_cancel;
2865 if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name))
2866 goto param_nest_cancel;
2867 if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC))
2868 goto param_nest_cancel;
2869
2870 nla_type = devlink_param_type_to_nla_type(param->type);
2871 if (nla_type < 0)
2872 goto param_nest_cancel;
2873 if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type))
2874 goto param_nest_cancel;
2875
2876 param_values_list = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUES_LIST);
2877 if (!param_values_list)
2878 goto param_nest_cancel;
2879
2880 for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
2881 if (!devlink_param_cmode_is_supported(param, i))
2882 continue;
2883 err = devlink_nl_param_value_fill_one(msg, param->type,
2884 i, param_value[i]);
2885 if (err)
2886 goto values_list_nest_cancel;
2887 }
2888
2889 nla_nest_end(msg, param_values_list);
2890 nla_nest_end(msg, param_attr);
2891 genlmsg_end(msg, hdr);
2892 return 0;
2893
2894values_list_nest_cancel:
2895 nla_nest_end(msg, param_values_list);
2896param_nest_cancel:
2897 nla_nest_cancel(msg, param_attr);
2898genlmsg_cancel:
2899 genlmsg_cancel(msg, hdr);
2900 return -EMSGSIZE;
2901}
2902
2903static void devlink_param_notify(struct devlink *devlink,
2904 struct devlink_param_item *param_item,
2905 enum devlink_command cmd)
2906{
2907 struct sk_buff *msg;
2908 int err;
2909
2910 WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL);
2911
2912 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
2913 if (!msg)
2914 return;
2915 err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0);
2916 if (err) {
2917 nlmsg_free(msg);
2918 return;
2919 }
2920
2921 genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
2922 msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
2923}
2924
2925static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
2926 struct netlink_callback *cb)
2927{
2928 struct devlink_param_item *param_item;
2929 struct devlink *devlink;
2930 int start = cb->args[0];
2931 int idx = 0;
2932 int err;
2933
2934 mutex_lock(&devlink_mutex);
2935 list_for_each_entry(devlink, &devlink_list, list) {
2936 if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
2937 continue;
2938 mutex_lock(&devlink->lock);
2939 list_for_each_entry(param_item, &devlink->param_list, list) {
2940 if (idx < start) {
2941 idx++;
2942 continue;
2943 }
2944 err = devlink_nl_param_fill(msg, devlink, param_item,
2945 DEVLINK_CMD_PARAM_GET,
2946 NETLINK_CB(cb->skb).portid,
2947 cb->nlh->nlmsg_seq,
2948 NLM_F_MULTI);
2949 if (err) {
2950 mutex_unlock(&devlink->lock);
2951 goto out;
2952 }
2953 idx++;
2954 }
2955 mutex_unlock(&devlink->lock);
2956 }
2957out:
2958 mutex_unlock(&devlink_mutex);
2959
2960 cb->args[0] = idx;
2961 return msg->len;
2962}
2963
2964static int
2965devlink_param_type_get_from_info(struct genl_info *info,
2966 enum devlink_param_type *param_type)
2967{
2968 if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE])
2969 return -EINVAL;
2970
2971 switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) {
2972 case NLA_U8:
2973 *param_type = DEVLINK_PARAM_TYPE_U8;
2974 break;
2975 case NLA_U16:
2976 *param_type = DEVLINK_PARAM_TYPE_U16;
2977 break;
2978 case NLA_U32:
2979 *param_type = DEVLINK_PARAM_TYPE_U32;
2980 break;
2981 case NLA_STRING:
2982 *param_type = DEVLINK_PARAM_TYPE_STRING;
2983 break;
2984 case NLA_FLAG:
2985 *param_type = DEVLINK_PARAM_TYPE_BOOL;
2986 break;
2987 default:
2988 return -EINVAL;
2989 }
2990
2991 return 0;
2992}
2993
2994static int
2995devlink_param_value_get_from_info(const struct devlink_param *param,
2996 struct genl_info *info,
2997 union devlink_param_value *value)
2998{
2999 if (param->type != DEVLINK_PARAM_TYPE_BOOL &&
3000 !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])
3001 return -EINVAL;
3002
3003 switch (param->type) {
3004 case DEVLINK_PARAM_TYPE_U8:
3005 value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
3006 break;
3007 case DEVLINK_PARAM_TYPE_U16:
3008 value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
3009 break;
3010 case DEVLINK_PARAM_TYPE_U32:
3011 value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
3012 break;
3013 case DEVLINK_PARAM_TYPE_STRING:
3014 if (nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) >
3015 DEVLINK_PARAM_MAX_STRING_VALUE)
3016 return -EINVAL;
3017 value->vstr = nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
3018 break;
3019 case DEVLINK_PARAM_TYPE_BOOL:
3020 value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ?
3021 true : false;
3022 break;
3023 }
3024 return 0;
3025}
3026
3027static struct devlink_param_item *
3028devlink_param_get_from_info(struct devlink *devlink,
3029 struct genl_info *info)
3030{
3031 char *param_name;
3032
3033 if (!info->attrs[DEVLINK_ATTR_PARAM_NAME])
3034 return NULL;
3035
3036 param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
3037 return devlink_param_find_by_name(&devlink->param_list, param_name);
3038}
3039
3040static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
3041 struct genl_info *info)
3042{
3043 struct devlink *devlink = info->user_ptr[0];
3044 struct devlink_param_item *param_item;
3045 struct sk_buff *msg;
3046 int err;
3047
3048 param_item = devlink_param_get_from_info(devlink, info);
3049 if (!param_item)
3050 return -EINVAL;
3051
3052 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3053 if (!msg)
3054 return -ENOMEM;
3055
3056 err = devlink_nl_param_fill(msg, devlink, param_item,
3057 DEVLINK_CMD_PARAM_GET,
3058 info->snd_portid, info->snd_seq, 0);
3059 if (err) {
3060 nlmsg_free(msg);
3061 return err;
3062 }
3063
3064 return genlmsg_reply(msg, info);
3065}
3066
3067static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
3068 struct genl_info *info)
3069{
3070 struct devlink *devlink = info->user_ptr[0];
3071 enum devlink_param_type param_type;
3072 struct devlink_param_gset_ctx ctx;
3073 enum devlink_param_cmode cmode;
3074 struct devlink_param_item *param_item;
3075 const struct devlink_param *param;
3076 union devlink_param_value value;
3077 int err = 0;
3078
3079 param_item = devlink_param_get_from_info(devlink, info);
3080 if (!param_item)
3081 return -EINVAL;
3082 param = param_item->param;
3083 err = devlink_param_type_get_from_info(info, &param_type);
3084 if (err)
3085 return err;
3086 if (param_type != param->type)
3087 return -EINVAL;
3088 err = devlink_param_value_get_from_info(param, info, &value);
3089 if (err)
3090 return err;
3091 if (param->validate) {
3092 err = param->validate(devlink, param->id, value, info->extack);
3093 if (err)
3094 return err;
3095 }
3096
3097 if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE])
3098 return -EINVAL;
3099 cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]);
3100 if (!devlink_param_cmode_is_supported(param, cmode))
3101 return -EOPNOTSUPP;
3102
3103 if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) {
3104 param_item->driverinit_value = value;
3105 param_item->driverinit_value_valid = true;
3106 } else {
3107 if (!param->set)
3108 return -EOPNOTSUPP;
3109 ctx.val = value;
3110 ctx.cmode = cmode;
3111 err = devlink_param_set(devlink, param, &ctx);
3112 if (err)
3113 return err;
3114 }
3115
3116 devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
3117 return 0;
3118}
3119
3120static int devlink_param_register_one(struct devlink *devlink,
3121 const struct devlink_param *param)
3122{
3123 struct devlink_param_item *param_item;
3124
3125 if (devlink_param_find_by_name(&devlink->param_list,
3126 param->name))
3127 return -EEXIST;
3128
3129 if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT))
3130 WARN_ON(param->get || param->set);
3131 else
3132 WARN_ON(!param->get || !param->set);
3133
3134 param_item = kzalloc(sizeof(*param_item), GFP_KERNEL);
3135 if (!param_item)
3136 return -ENOMEM;
3137 param_item->param = param;
3138
3139 list_add_tail(&param_item->list, &devlink->param_list);
3140 devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
3141 return 0;
3142}
3143
3144static void devlink_param_unregister_one(struct devlink *devlink,
3145 const struct devlink_param *param)
3146{
3147 struct devlink_param_item *param_item;
3148
3149 param_item = devlink_param_find_by_name(&devlink->param_list,
3150 param->name);
3151 WARN_ON(!param_item);
3152 devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL);
3153 list_del(&param_item->list);
3154 kfree(param_item);
3155}
3156
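devlink_param_register_one()/_unregister_one() back the batch registration API that drivers call. A hedged sketch of publishing one generic parameter (the devlink_params_register() call, the runtime get/set handlers and all foo_* names are assumptions drawn from the matching driver API, which is outside this hunk):

	/* Illustration only: expose the generic "max_macs" parameter. */
	static int foo_max_macs_get(struct devlink *devlink, u32 id,
				    struct devlink_param_gset_ctx *ctx)
	{
		ctx->val.vu32 = 64;	/* assumed current value */
		return 0;
	}

	static int foo_max_macs_set(struct devlink *devlink, u32 id,
				    struct devlink_param_gset_ctx *ctx)
	{
		/* apply ctx->val.vu32 to the device (assumed) */
		return 0;
	}

	static const struct devlink_param foo_params[] = {
		DEVLINK_PARAM_GENERIC(MAX_MACS,
				      BIT(DEVLINK_PARAM_CMODE_RUNTIME),
				      foo_max_macs_get, foo_max_macs_set,
				      NULL),
	};

	/* devlink_params_register(devlink, foo_params, ARRAY_SIZE(foo_params)); */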
3157static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
3158 struct devlink *devlink,
3159 struct devlink_snapshot *snapshot)
3160{
3161 struct nlattr *snap_attr;
3162 int err;
3163
3164 snap_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOT);
3165 if (!snap_attr)
3166 return -EINVAL;
3167
3168 err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id);
3169 if (err)
3170 goto nla_put_failure;
3171
3172 nla_nest_end(msg, snap_attr);
3173 return 0;
3174
3175nla_put_failure:
3176 nla_nest_cancel(msg, snap_attr);
3177 return err;
3178}
3179
3180static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg,
3181 struct devlink *devlink,
3182 struct devlink_region *region)
3183{
3184 struct devlink_snapshot *snapshot;
3185 struct nlattr *snapshots_attr;
3186 int err;
3187
3188 snapshots_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOTS);
3189 if (!snapshots_attr)
3190 return -EINVAL;
3191
3192 list_for_each_entry(snapshot, &region->snapshot_list, list) {
3193 err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot);
3194 if (err)
3195 goto nla_put_failure;
3196 }
3197
3198 nla_nest_end(msg, snapshots_attr);
3199 return 0;
3200
3201nla_put_failure:
3202 nla_nest_cancel(msg, snapshots_attr);
3203 return err;
3204}
3205
3206static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
3207 enum devlink_command cmd, u32 portid,
3208 u32 seq, int flags,
3209 struct devlink_region *region)
3210{
3211 void *hdr;
3212 int err;
3213
3214 hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
3215 if (!hdr)
3216 return -EMSGSIZE;
3217
3218 err = devlink_nl_put_handle(msg, devlink);
3219 if (err)
3220 goto nla_put_failure;
3221
3222 err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name);
3223 if (err)
3224 goto nla_put_failure;
3225
3226 err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
3227 region->size,
3228 DEVLINK_ATTR_PAD);
3229 if (err)
3230 goto nla_put_failure;
3231
3232 err = devlink_nl_region_snapshots_id_put(msg, devlink, region);
3233 if (err)
3234 goto nla_put_failure;
3235
3236 genlmsg_end(msg, hdr);
3237 return 0;
3238
3239nla_put_failure:
3240 genlmsg_cancel(msg, hdr);
3241 return err;
3242}
3243
3244static void devlink_nl_region_notify(struct devlink_region *region,
3245 struct devlink_snapshot *snapshot,
3246 enum devlink_command cmd)
3247{
3248 struct devlink *devlink = region->devlink;
3249 struct sk_buff *msg;
3250 void *hdr;
3251 int err;
3252
3253 WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
3254
3255 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3256 if (!msg)
3257 return;
3258
3259 hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
3260 if (!hdr)
3261 goto out_free_msg;
3262
3263 err = devlink_nl_put_handle(msg, devlink);
3264 if (err)
3265 goto out_cancel_msg;
3266
3267 err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
3268 region->name);
3269 if (err)
3270 goto out_cancel_msg;
3271
3272 if (snapshot) {
3273 err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID,
3274 snapshot->id);
3275 if (err)
3276 goto out_cancel_msg;
3277 } else {
3278 err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
3279 region->size, DEVLINK_ATTR_PAD);
3280 if (err)
3281 goto out_cancel_msg;
3282 }
3283 genlmsg_end(msg, hdr);
3284
3285 genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
3286 msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
3287
3288 return;
3289
3290out_cancel_msg:
3291 genlmsg_cancel(msg, hdr);
3292out_free_msg:
3293 nlmsg_free(msg);
3294}
3295
3296static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
3297 struct genl_info *info)
3298{
3299 struct devlink *devlink = info->user_ptr[0];
3300 struct devlink_region *region;
3301 const char *region_name;
3302 struct sk_buff *msg;
3303 int err;
3304
3305 if (!info->attrs[DEVLINK_ATTR_REGION_NAME])
3306 return -EINVAL;
3307
3308 region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
3309 region = devlink_region_get_by_name(devlink, region_name);
3310 if (!region)
3311 return -EINVAL;
3312
3313 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3314 if (!msg)
3315 return -ENOMEM;
3316
3317 err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET,
3318 info->snd_portid, info->snd_seq, 0,
3319 region);
3320 if (err) {
3321 nlmsg_free(msg);
3322 return err;
3323 }
3324
3325 return genlmsg_reply(msg, info);
3326}
3327
3328static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
3329 struct netlink_callback *cb)
3330{
3331 struct devlink_region *region;
3332 struct devlink *devlink;
3333 int start = cb->args[0];
3334 int idx = 0;
3335 int err;
3336
3337 mutex_lock(&devlink_mutex);
3338 list_for_each_entry(devlink, &devlink_list, list) {
3339 if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
3340 continue;
3341
3342 mutex_lock(&devlink->lock);
3343 list_for_each_entry(region, &devlink->region_list, list) {
3344 if (idx < start) {
3345 idx++;
3346 continue;
3347 }
3348 err = devlink_nl_region_fill(msg, devlink,
3349 DEVLINK_CMD_REGION_GET,
3350 NETLINK_CB(cb->skb).portid,
3351 cb->nlh->nlmsg_seq,
3352 NLM_F_MULTI, region);
3353 if (err) {
3354 mutex_unlock(&devlink->lock);
3355 goto out;
3356 }
3357 idx++;
3358 }
3359 mutex_unlock(&devlink->lock);
3360 }
3361out:
3362 mutex_unlock(&devlink_mutex);
3363 cb->args[0] = idx;
3364 return msg->len;
3365}
3366
3367static int devlink_nl_cmd_region_del(struct sk_buff *skb,
3368 struct genl_info *info)
3369{
3370 struct devlink *devlink = info->user_ptr[0];
3371 struct devlink_snapshot *snapshot;
3372 struct devlink_region *region;
3373 const char *region_name;
3374 u32 snapshot_id;
3375
3376 if (!info->attrs[DEVLINK_ATTR_REGION_NAME] ||
3377 !info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID])
3378 return -EINVAL;
3379
3380 region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
3381 snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
3382
3383 region = devlink_region_get_by_name(devlink, region_name);
3384 if (!region)
3385 return -EINVAL;
3386
3387 snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
3388 if (!snapshot)
3389 return -EINVAL;
3390
3391 devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
3392 devlink_region_snapshot_del(snapshot);
3393 return 0;
3394}
3395
3396static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
3397 struct devlink *devlink,
3398 u8 *chunk, u32 chunk_size,
3399 u64 addr)
3400{
3401 struct nlattr *chunk_attr;
3402 int err;
3403
3404 chunk_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_CHUNK);
3405 if (!chunk_attr)
3406 return -EINVAL;
3407
3408 err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk);
3409 if (err)
3410 goto nla_put_failure;
3411
3412 err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr,
3413 DEVLINK_ATTR_PAD);
3414 if (err)
3415 goto nla_put_failure;
3416
3417 nla_nest_end(msg, chunk_attr);
3418 return 0;
3419
3420nla_put_failure:
3421 nla_nest_cancel(msg, chunk_attr);
3422 return err;
3423}
3424
3425#define DEVLINK_REGION_READ_CHUNK_SIZE 256
3426
3427static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
3428 struct devlink *devlink,
3429 struct devlink_region *region,
3430 struct nlattr **attrs,
3431 u64 start_offset,
3432 u64 end_offset,
3433 bool dump,
3434 u64 *new_offset)
3435{
3436 struct devlink_snapshot *snapshot;
3437 u64 curr_offset = start_offset;
3438 u32 snapshot_id;
3439 int err = 0;
3440
3441 *new_offset = start_offset;
3442
3443 snapshot_id = nla_get_u32(attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
3444 snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
3445 if (!snapshot)
3446 return -EINVAL;
3447
3448 if (end_offset > snapshot->data_len || dump)
3449 end_offset = snapshot->data_len;
3450
3451 while (curr_offset < end_offset) {
3452 u32 data_size;
3453 u8 *data;
3454
3455 if (end_offset - curr_offset < DEVLINK_REGION_READ_CHUNK_SIZE)
3456 data_size = end_offset - curr_offset;
3457 else
3458 data_size = DEVLINK_REGION_READ_CHUNK_SIZE;
3459
3460 data = &snapshot->data[curr_offset];
3461 err = devlink_nl_cmd_region_read_chunk_fill(skb, devlink,
3462 data, data_size,
3463 curr_offset);
3464 if (err)
3465 break;
3466
3467 curr_offset += data_size;
3468 }
3469 *new_offset = curr_offset;
3470
3471 return err;
3472}
3473
3474static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
3475 struct netlink_callback *cb)
3476{
3477 u64 ret_offset, start_offset, end_offset = 0;
3478 struct nlattr *attrs[DEVLINK_ATTR_MAX + 1];
3479 const struct genl_ops *ops = cb->data;
3480 struct devlink_region *region;
3481 struct nlattr *chunks_attr;
3482 const char *region_name;
3483 struct devlink *devlink;
3484 bool dump = true;
3485 void *hdr;
3486 int err;
3487
3488 start_offset = *((u64 *)&cb->args[0]);
3489
3490 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize,
3491 attrs, DEVLINK_ATTR_MAX, ops->policy, NULL);
3492 if (err)
3493 goto out;
3494
3495 devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
3496 if (IS_ERR(devlink))
3497 goto out;
3498
3499 mutex_lock(&devlink_mutex);
3500 mutex_lock(&devlink->lock);
3501
3502 if (!attrs[DEVLINK_ATTR_REGION_NAME] ||
3503 !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID])
3504 goto out_unlock;
3505
3506 region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);
3507 region = devlink_region_get_by_name(devlink, region_name);
3508 if (!region)
3509 goto out_unlock;
3510
3511 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3512 &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,
3513 DEVLINK_CMD_REGION_READ);
3514 if (!hdr)
3515 goto out_unlock;
3516
3517 err = devlink_nl_put_handle(skb, devlink);
3518 if (err)
3519 goto nla_put_failure;
3520
3521 err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
3522 if (err)
3523 goto nla_put_failure;
3524
3525 chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS);
3526 if (!chunks_attr)
3527 goto nla_put_failure;
3528
3529 if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
3530 attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
3531 if (!start_offset)
3532 start_offset =
3533 nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
3534
3535 end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
3536 end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
3537 dump = false;
3538 }
3539
3540 err = devlink_nl_region_read_snapshot_fill(skb, devlink,
3541 region, attrs,
3542 start_offset,
3543 end_offset, dump,
3544 &ret_offset);
3545
3546 if (err && err != -EMSGSIZE)
3547 goto nla_put_failure;
3548
3549	/* Check whether any progress was made, to prevent an infinite loop */
3550 if (ret_offset == start_offset)
3551 goto nla_put_failure;
3552
3553 *((u64 *)&cb->args[0]) = ret_offset;
3554
3555 nla_nest_end(skb, chunks_attr);
3556 genlmsg_end(skb, hdr);
3557 mutex_unlock(&devlink->lock);
3558 mutex_unlock(&devlink_mutex);
3559
3560 return skb->len;
3561
3562nla_put_failure:
3563 genlmsg_cancel(skb, hdr);
3564out_unlock:
3565 mutex_unlock(&devlink->lock);
3566 mutex_unlock(&devlink_mutex);
3567out:
3568 return 0;
3569}
3570
2607static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { 3571static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
2608 [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, 3572 [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
2609 [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, 3573 [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2624,6 +3588,11 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
2624 [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, 3588 [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 },
2625 [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, 3589 [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64},
2626 [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, 3590 [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64},
3591 [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING },
3592 [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 },
3593 [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
3594 [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
3595 [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
2627}; 3596};
2628 3597
2629static const struct genl_ops devlink_nl_ops[] = { 3598static const struct genl_ops devlink_nl_ops[] = {
@@ -2807,6 +3776,43 @@ static const struct genl_ops devlink_nl_ops[] = {
2807 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | 3776 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
2808 DEVLINK_NL_FLAG_NO_LOCK, 3777 DEVLINK_NL_FLAG_NO_LOCK,
2809 }, 3778 },
3779 {
3780 .cmd = DEVLINK_CMD_PARAM_GET,
3781 .doit = devlink_nl_cmd_param_get_doit,
3782 .dumpit = devlink_nl_cmd_param_get_dumpit,
3783 .policy = devlink_nl_policy,
3784 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
3785 /* can be retrieved by unprivileged users */
3786 },
3787 {
3788 .cmd = DEVLINK_CMD_PARAM_SET,
3789 .doit = devlink_nl_cmd_param_set_doit,
3790 .policy = devlink_nl_policy,
3791 .flags = GENL_ADMIN_PERM,
3792 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
3793 },
3794 {
3795 .cmd = DEVLINK_CMD_REGION_GET,
3796 .doit = devlink_nl_cmd_region_get_doit,
3797 .dumpit = devlink_nl_cmd_region_get_dumpit,
3798 .policy = devlink_nl_policy,
3799 .flags = GENL_ADMIN_PERM,
3800 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
3801 },
3802 {
3803 .cmd = DEVLINK_CMD_REGION_DEL,
3804 .doit = devlink_nl_cmd_region_del,
3805 .policy = devlink_nl_policy,
3806 .flags = GENL_ADMIN_PERM,
3807 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
3808 },
3809 {
3810 .cmd = DEVLINK_CMD_REGION_READ,
3811 .dumpit = devlink_nl_cmd_region_read_dumpit,
3812 .policy = devlink_nl_policy,
3813 .flags = GENL_ADMIN_PERM,
3814 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
3815 },
2810}; 3816};
2811 3817
2812static struct genl_family devlink_nl_family __ro_after_init = { 3818static struct genl_family devlink_nl_family __ro_after_init = {
@@ -2845,6 +3851,8 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
2845 INIT_LIST_HEAD(&devlink->sb_list); 3851 INIT_LIST_HEAD(&devlink->sb_list);
2846 INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); 3852 INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
2847 INIT_LIST_HEAD(&devlink->resource_list); 3853 INIT_LIST_HEAD(&devlink->resource_list);
3854 INIT_LIST_HEAD(&devlink->param_list);
3855 INIT_LIST_HEAD(&devlink->region_list);
2848 mutex_init(&devlink->lock); 3856 mutex_init(&devlink->lock);
2849 return devlink; 3857 return devlink;
2850} 3858}
@@ -3434,6 +4442,320 @@ out:
3434} 4442}
3435EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); 4443EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister);
3436 4444
4445/**
4446 * devlink_params_register - register configuration parameters
4447 *
4448 * @devlink: devlink
4449 * @params: configuration parameters array
4450 * @params_count: number of parameters provided
4451 *
4452 * Register the configuration parameters supported by the driver.
4453 */
4454int devlink_params_register(struct devlink *devlink,
4455 const struct devlink_param *params,
4456 size_t params_count)
4457{
4458 const struct devlink_param *param = params;
4459 int i;
4460 int err;
4461
4462 mutex_lock(&devlink->lock);
4463 for (i = 0; i < params_count; i++, param++) {
4464 if (!param || !param->name || !param->supported_cmodes) {
4465 err = -EINVAL;
4466 goto rollback;
4467 }
4468 if (param->generic) {
4469 err = devlink_param_generic_verify(param);
4470 if (err)
4471 goto rollback;
4472 } else {
4473 err = devlink_param_driver_verify(param);
4474 if (err)
4475 goto rollback;
4476 }
4477 err = devlink_param_register_one(devlink, param);
4478 if (err)
4479 goto rollback;
4480 }
4481
4482 mutex_unlock(&devlink->lock);
4483 return 0;
4484
4485rollback:
4486 if (!i)
4487 goto unlock;
4488 for (param--; i > 0; i--, param--)
4489 devlink_param_unregister_one(devlink, param);
4490unlock:
4491 mutex_unlock(&devlink->lock);
4492 return err;
4493}
4494EXPORT_SYMBOL_GPL(devlink_params_register);
4495
4496/**
4497 * devlink_params_unregister - unregister configuration parameters
4498 * @devlink: devlink
4499 * @params: configuration parameters to unregister
4500 * @params_count: number of parameters provided
4501 */
4502void devlink_params_unregister(struct devlink *devlink,
4503 const struct devlink_param *params,
4504 size_t params_count)
4505{
4506 const struct devlink_param *param = params;
4507 int i;
4508
4509 mutex_lock(&devlink->lock);
4510 for (i = 0; i < params_count; i++, param++)
4511 devlink_param_unregister_one(devlink, param);
4512 mutex_unlock(&devlink->lock);
4513}
4514EXPORT_SYMBOL_GPL(devlink_params_unregister);
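A minimal sketch of how a driver might consume the two calls above, assuming the DEVLINK_PARAM_GENERIC() helper and the generic MAX_MACS parameter ID added to include/net/devlink.h in the same series; the "foo" driver and its array are invented for illustration only:

	/* Illustrative only: register/unregister a single generic parameter.
	 * The "foo" names are hypothetical; DEVLINK_PARAM_GENERIC() and
	 * DEVLINK_PARAM_GENERIC_ID_MAX_MACS are assumed from devlink.h.
	 */
	static const struct devlink_param foo_devlink_params[] = {
		DEVLINK_PARAM_GENERIC(MAX_MACS,
				      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
				      NULL, NULL, NULL),
	};

	static int foo_devlink_params_init(struct devlink *devlink)
	{
		return devlink_params_register(devlink, foo_devlink_params,
					       ARRAY_SIZE(foo_devlink_params));
	}

	static void foo_devlink_params_fini(struct devlink *devlink)
	{
		devlink_params_unregister(devlink, foo_devlink_params,
					  ARRAY_SIZE(foo_devlink_params));
	}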
4515
4516/**
4517 * devlink_param_driverinit_value_get - get configuration parameter
4518 *	value for driver initialization
4519 *
4520 * @devlink: devlink
4521 * @param_id: parameter ID
4522 * @init_val: value of parameter in driverinit configuration mode
4523 *
4524 * This function should be used by the driver to get driverinit
4525 * configuration for initialization after the reload command.
4526 */
4527int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
4528 union devlink_param_value *init_val)
4529{
4530 struct devlink_param_item *param_item;
4531
4532 if (!devlink->ops || !devlink->ops->reload)
4533 return -EOPNOTSUPP;
4534
4535 param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
4536 if (!param_item)
4537 return -EINVAL;
4538
4539 if (!param_item->driverinit_value_valid ||
4540 !devlink_param_cmode_is_supported(param_item->param,
4541 DEVLINK_PARAM_CMODE_DRIVERINIT))
4542 return -EOPNOTSUPP;
4543
4544 *init_val = param_item->driverinit_value;
4545
4546 return 0;
4547}
4548EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get);
4549
4550/**
4551 * devlink_param_driverinit_value_set - set value of configuration
4552 * parameter for driverinit
4553 * configuration mode
4554 *
4555 * @devlink: devlink
4556 * @param_id: parameter ID
4557 * @init_val: value of parameter to set for driverinit configuration mode
4558 *
4559 * This function should be used by the driver to set the default value
4560 * for the driverinit configuration mode.
4561 */
4562int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
4563 union devlink_param_value init_val)
4564{
4565 struct devlink_param_item *param_item;
4566
4567 param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
4568 if (!param_item)
4569 return -EINVAL;
4570
4571 if (!devlink_param_cmode_is_supported(param_item->param,
4572 DEVLINK_PARAM_CMODE_DRIVERINIT))
4573 return -EOPNOTSUPP;
4574
4575 param_item->driverinit_value = init_val;
4576 param_item->driverinit_value_valid = true;
4577
4578 devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
4579 return 0;
4580}
4581EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set);
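A hedged sketch of the intended probe-time flow for the two driverinit helpers above: the driver seeds its default, then reads back whatever value userspace may have configured for the next reload. struct foo_priv, FOO_DEFAULT_MAX_MACS and the reuse of the generic MAX_MACS ID are assumptions for illustration only:

	/* Hypothetical probe path: publish a default, then honour a
	 * user-configured driverinit value if one is present.
	 */
	static void foo_devlink_set_init_values(struct devlink *devlink,
						struct foo_priv *priv)
	{
		union devlink_param_value value;

		value.vu32 = FOO_DEFAULT_MAX_MACS;
		devlink_param_driverinit_value_set(devlink,
						   DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
						   value);

		if (!devlink_param_driverinit_value_get(devlink,
							DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
							&value))
			priv->max_macs = value.vu32;
	}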
4582
4583/**
4584 * devlink_param_value_changed - notify devlink on a parameter's value
4585 * change. Should be called by the driver
4586 * right after the change.
4587 *
4588 * @devlink: devlink
4589 * @param_id: parameter ID
4590 *
4591 * This function should be used by the driver to notify devlink of a value
4592 * change, excluding the driverinit configuration mode.
4593 * For the driverinit configuration mode, the driver should use the function
4594 * devlink_param_driverinit_value_set() instead.
4595 */
4596void devlink_param_value_changed(struct devlink *devlink, u32 param_id)
4597{
4598 struct devlink_param_item *param_item;
4599
4600 param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
4601 WARN_ON(!param_item);
4602
4603 devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
4604}
4605EXPORT_SYMBOL_GPL(devlink_param_value_changed);
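As a short illustration of when a driver would call this (the parameter ID and helper below are made up): if the same setting is also reachable through another interface, devlink is told about the change so its cached value and notifications stay in sync:

	/* Illustrative only: propagate an out-of-band change of a
	 * driver-specific parameter to devlink.
	 */
	static void foo_coalesce_changed(struct foo_priv *priv)
	{
		devlink_param_value_changed(priv->devlink,
					    FOO_DEVLINK_PARAM_ID_COALESCE);
	}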
4606
4607/**
4608 * devlink_region_create - create a new address region
4609 *
4610 * @devlink: devlink
4611 * @region_name: region name
4612 * @region_max_snapshots: Maximum supported number of snapshots for region
4613 * @region_size: size of region
4614 */
4615struct devlink_region *devlink_region_create(struct devlink *devlink,
4616 const char *region_name,
4617 u32 region_max_snapshots,
4618 u64 region_size)
4619{
4620 struct devlink_region *region;
4621 int err = 0;
4622
4623 mutex_lock(&devlink->lock);
4624
4625 if (devlink_region_get_by_name(devlink, region_name)) {
4626 err = -EEXIST;
4627 goto unlock;
4628 }
4629
4630 region = kzalloc(sizeof(*region), GFP_KERNEL);
4631 if (!region) {
4632 err = -ENOMEM;
4633 goto unlock;
4634 }
4635
4636 region->devlink = devlink;
4637 region->max_snapshots = region_max_snapshots;
4638 region->name = region_name;
4639 region->size = region_size;
4640 INIT_LIST_HEAD(&region->snapshot_list);
4641 list_add_tail(&region->list, &devlink->region_list);
4642 devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
4643
4644 mutex_unlock(&devlink->lock);
4645 return region;
4646
4647unlock:
4648 mutex_unlock(&devlink->lock);
4649 return ERR_PTR(err);
4650}
4651EXPORT_SYMBOL_GPL(devlink_region_create);
4652
4653/**
4654 * devlink_region_destroy - destroy address region
4655 *
4656 * @region: devlink region to destroy
4657 */
4658void devlink_region_destroy(struct devlink_region *region)
4659{
4660 struct devlink *devlink = region->devlink;
4661 struct devlink_snapshot *snapshot, *ts;
4662
4663 mutex_lock(&devlink->lock);
4664
4665 /* Free all snapshots of region */
4666 list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list)
4667 devlink_region_snapshot_del(snapshot);
4668
4669 list_del(&region->list);
4670
4671 devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
4672 mutex_unlock(&devlink->lock);
4673 kfree(region);
4674}
4675EXPORT_SYMBOL_GPL(devlink_region_destroy);
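A rough sketch of region setup and teardown in a hypothetical driver; the "cr-space" name, snapshot limit and size below are invented and not part of this patch:

	#define FOO_CR_SPACE_LEN	0x40000
	#define FOO_MAX_SNAPSHOTS	8

	static int foo_regions_init(struct foo_priv *priv)
	{
		/* devlink_region_create() returns ERR_PTR() on failure */
		priv->cr_region = devlink_region_create(priv->devlink, "cr-space",
							FOO_MAX_SNAPSHOTS,
							FOO_CR_SPACE_LEN);
		return PTR_ERR_OR_ZERO(priv->cr_region);
	}

	static void foo_regions_fini(struct foo_priv *priv)
	{
		if (!IS_ERR_OR_NULL(priv->cr_region))
			devlink_region_destroy(priv->cr_region);
	}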
4676
4677/**
4678 * devlink_region_shapshot_id_get - get snapshot ID
4679 *
4680 * This callback should be called when adding a new snapshot.
4681 * The driver should use the same ID for multiple snapshots taken
4682 * on multiple regions at the same time or by the same trigger.
4683 *
4684 * @devlink: devlink
4685 */
4686u32 devlink_region_shapshot_id_get(struct devlink *devlink)
4687{
4688 u32 id;
4689
4690 mutex_lock(&devlink->lock);
4691 id = ++devlink->snapshot_id;
4692 mutex_unlock(&devlink->lock);
4693
4694 return id;
4695}
4696EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get);
4697
4698/**
4699 * devlink_region_snapshot_create - create a new snapshot
4700 * This will add a new snapshot of a region. The snapshot
4701 * will be stored on the region struct and can be accessed
4702 * from devlink. This is useful for later analysis of snapshots.
4703 * Multiple snapshots can be created on a region.
4704 * The @snapshot_id should be obtained using the getter function.
4705 *
4706 * @region: devlink region of the snapshot
4707 * @data_len: size of snapshot data
4708 * @data: snapshot data
4709 * @snapshot_id: snapshot id to be created
4710 * @data_destructor: pointer to destructor function to free data
4711 */
4712int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
4713 u8 *data, u32 snapshot_id,
4714 devlink_snapshot_data_dest_t *data_destructor)
4715{
4716 struct devlink *devlink = region->devlink;
4717 struct devlink_snapshot *snapshot;
4718 int err;
4719
4720 mutex_lock(&devlink->lock);
4721
4722 /* check if region can hold one more snapshot */
4723 if (region->cur_snapshots == region->max_snapshots) {
4724 err = -ENOMEM;
4725 goto unlock;
4726 }
4727
4728 if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
4729 err = -EEXIST;
4730 goto unlock;
4731 }
4732
4733 snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
4734 if (!snapshot) {
4735 err = -ENOMEM;
4736 goto unlock;
4737 }
4738
4739 snapshot->id = snapshot_id;
4740 snapshot->region = region;
4741 snapshot->data = data;
4742 snapshot->data_len = data_len;
4743 snapshot->data_destructor = data_destructor;
4744
4745 list_add_tail(&snapshot->list, &region->snapshot_list);
4746
4747 region->cur_snapshots++;
4748
4749 devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
4750 mutex_unlock(&devlink->lock);
4751 return 0;
4752
4753unlock:
4754 mutex_unlock(&devlink->lock);
4755 return err;
4756}
4757EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
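Tying the last two helpers together, a hedged sketch of a snapshot capture path; foo_read_cr_space() and the buffer size are invented, and kfree is passed as the destructor because the data comes from kmalloc():

	static void foo_take_cr_snapshot(struct foo_priv *priv)
	{
		u32 id = devlink_region_shapshot_id_get(priv->devlink);
		u8 *data;

		data = kmalloc(FOO_CR_SPACE_LEN, GFP_KERNEL);
		if (!data)
			return;

		foo_read_cr_space(priv, data, FOO_CR_SPACE_LEN);

		/* On success devlink owns the buffer and will call the
		 * destructor; on failure we free it ourselves.
		 */
		if (devlink_region_snapshot_create(priv->cr_region,
						   FOO_CR_SPACE_LEN, data,
						   id, kfree))
			kfree(data);
	}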
4758
3437static int __init devlink_module_init(void) 4759static int __init devlink_module_init(void)
3438{ 4760{
3439 return genl_register_family(&devlink_nl_family); 4761 return genl_register_family(&devlink_nl_family);
diff --git a/net/core/dst.c b/net/core/dst.c
index 2d9b37f8944a..81ccf20e2826 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -307,6 +307,7 @@ void metadata_dst_free(struct metadata_dst *md_dst)
307#endif 307#endif
308 kfree(md_dst); 308 kfree(md_dst);
309} 309}
310EXPORT_SYMBOL_GPL(metadata_dst_free);
310 311
311struct metadata_dst __percpu * 312struct metadata_dst __percpu *
312metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) 313metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e677a20180cf..c9993c6c2fd4 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -111,6 +111,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
111 [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", 111 [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
112 [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", 112 [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
113 [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", 113 [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
114 [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
114}; 115};
115 116
116static const char 117static const char
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 126ffc5bc630..0ff3953f64aa 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -416,6 +416,14 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
416 if (rule->mark && r->mark != rule->mark) 416 if (rule->mark && r->mark != rule->mark)
417 continue; 417 continue;
418 418
419 if (rule->suppress_ifgroup != -1 &&
420 r->suppress_ifgroup != rule->suppress_ifgroup)
421 continue;
422
423 if (rule->suppress_prefixlen != -1 &&
424 r->suppress_prefixlen != rule->suppress_prefixlen)
425 continue;
426
419 if (rule->mark_mask && r->mark_mask != rule->mark_mask) 427 if (rule->mark_mask && r->mark_mask != rule->mark_mask)
420 continue; 428 continue;
421 429
@@ -436,6 +444,9 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
436 if (rule->ip_proto && r->ip_proto != rule->ip_proto) 444 if (rule->ip_proto && r->ip_proto != rule->ip_proto)
437 continue; 445 continue;
438 446
447 if (rule->proto && r->proto != rule->proto)
448 continue;
449
439 if (fib_rule_port_range_set(&rule->sport_range) && 450 if (fib_rule_port_range_set(&rule->sport_range) &&
440 !fib_rule_port_range_compare(&r->sport_range, 451 !fib_rule_port_range_compare(&r->sport_range,
441 &rule->sport_range)) 452 &rule->sport_range))
@@ -645,6 +656,73 @@ errout:
645 return err; 656 return err;
646} 657}
647 658
659static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
660 struct nlattr **tb, struct fib_rule *rule)
661{
662 struct fib_rule *r;
663
664 list_for_each_entry(r, &ops->rules_list, list) {
665 if (r->action != rule->action)
666 continue;
667
668 if (r->table != rule->table)
669 continue;
670
671 if (r->pref != rule->pref)
672 continue;
673
674 if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
675 continue;
676
677 if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
678 continue;
679
680 if (r->mark != rule->mark)
681 continue;
682
683 if (r->suppress_ifgroup != rule->suppress_ifgroup)
684 continue;
685
686 if (r->suppress_prefixlen != rule->suppress_prefixlen)
687 continue;
688
689 if (r->mark_mask != rule->mark_mask)
690 continue;
691
692 if (r->tun_id != rule->tun_id)
693 continue;
694
695 if (r->fr_net != rule->fr_net)
696 continue;
697
698 if (r->l3mdev != rule->l3mdev)
699 continue;
700
701 if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
702 !uid_eq(r->uid_range.end, rule->uid_range.end))
703 continue;
704
705 if (r->ip_proto != rule->ip_proto)
706 continue;
707
708 if (r->proto != rule->proto)
709 continue;
710
711 if (!fib_rule_port_range_compare(&r->sport_range,
712 &rule->sport_range))
713 continue;
714
715 if (!fib_rule_port_range_compare(&r->dport_range,
716 &rule->dport_range))
717 continue;
718
719 if (!ops->compare(r, frh, tb))
720 continue;
721 return 1;
722 }
723 return 0;
724}
725
648int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, 726int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
649 struct netlink_ext_ack *extack) 727 struct netlink_ext_ack *extack)
650{ 728{
@@ -679,7 +757,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
679 goto errout; 757 goto errout;
680 758
681 if ((nlh->nlmsg_flags & NLM_F_EXCL) && 759 if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
682 rule_find(ops, frh, tb, rule, user_priority)) { 760 rule_exists(ops, frh, tb, rule)) {
683 err = -EEXIST; 761 err = -EEXIST;
684 goto errout_free; 762 goto errout_free;
685 } 763 }
@@ -846,8 +924,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
846 return 0; 924 return 0;
847 925
848errout: 926errout:
849 if (nlrule) 927 kfree(nlrule);
850 kfree(nlrule);
851 rules_ops_put(ops); 928 rules_ops_put(ops);
852 return err; 929 return err;
853} 930}
diff --git a/net/core/filter.c b/net/core/filter.c
index e7f12e9f598c..c25eb36f1320 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -459,11 +459,21 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
459 (!unaligned_ok && offset >= 0 && 459 (!unaligned_ok && offset >= 0 &&
460 offset + ip_align >= 0 && 460 offset + ip_align >= 0 &&
461 offset + ip_align % size == 0))) { 461 offset + ip_align % size == 0))) {
462 bool ldx_off_ok = offset <= S16_MAX;
463
462 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); 464 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
463 *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); 465 *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
464 *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); 466 *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
465 *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, 467 size, 2 + endian + (!ldx_off_ok * 2));
466 offset); 468 if (ldx_off_ok) {
469 *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
470 BPF_REG_D, offset);
471 } else {
472 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
473 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
474 *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
475 BPF_REG_TMP, 0);
476 }
467 if (endian) 477 if (endian)
468 *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); 478 *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
469 *insn++ = BPF_JMP_A(8); 479 *insn++ = BPF_JMP_A(8);
@@ -1443,30 +1453,6 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
1443 return 0; 1453 return 0;
1444} 1454}
1445 1455
1446static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
1447{
1448 struct bpf_prog *old_prog;
1449 int err;
1450
1451 if (bpf_prog_size(prog->len) > sysctl_optmem_max)
1452 return -ENOMEM;
1453
1454 if (sk_unhashed(sk) && sk->sk_reuseport) {
1455 err = reuseport_alloc(sk);
1456 if (err)
1457 return err;
1458 } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
1459 /* The socket wasn't bound with SO_REUSEPORT */
1460 return -EINVAL;
1461 }
1462
1463 old_prog = reuseport_attach_prog(sk, prog);
1464 if (old_prog)
1465 bpf_prog_destroy(old_prog);
1466
1467 return 0;
1468}
1469
1470static 1456static
1471struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) 1457struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
1472{ 1458{
@@ -1540,13 +1526,15 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1540 if (IS_ERR(prog)) 1526 if (IS_ERR(prog))
1541 return PTR_ERR(prog); 1527 return PTR_ERR(prog);
1542 1528
1543 err = __reuseport_attach_prog(prog, sk); 1529 if (bpf_prog_size(prog->len) > sysctl_optmem_max)
1544 if (err < 0) { 1530 err = -ENOMEM;
1531 else
1532 err = reuseport_attach_prog(sk, prog);
1533
1534 if (err)
1545 __bpf_prog_release(prog); 1535 __bpf_prog_release(prog);
1546 return err;
1547 }
1548 1536
1549 return 0; 1537 return err;
1550} 1538}
1551 1539
1552static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) 1540static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
@@ -1576,19 +1564,58 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
1576 1564
1577int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) 1565int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
1578{ 1566{
1579 struct bpf_prog *prog = __get_bpf(ufd, sk); 1567 struct bpf_prog *prog;
1580 int err; 1568 int err;
1581 1569
1570 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1571 return -EPERM;
1572
1573 prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1574 if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL)
1575 prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
1582 if (IS_ERR(prog)) 1576 if (IS_ERR(prog))
1583 return PTR_ERR(prog); 1577 return PTR_ERR(prog);
1584 1578
1585 err = __reuseport_attach_prog(prog, sk); 1579 if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
1586 if (err < 0) { 1580 /* Like other non BPF_PROG_TYPE_SOCKET_FILTER
1587 bpf_prog_put(prog); 1581 * bpf prog (e.g. sockmap). It depends on the
1588 return err; 1582 * limitation imposed by bpf_prog_load().
1583 * Hence, sysctl_optmem_max is not checked.
1584 */
1585 if ((sk->sk_type != SOCK_STREAM &&
1586 sk->sk_type != SOCK_DGRAM) ||
1587 (sk->sk_protocol != IPPROTO_UDP &&
1588 sk->sk_protocol != IPPROTO_TCP) ||
1589 (sk->sk_family != AF_INET &&
1590 sk->sk_family != AF_INET6)) {
1591 err = -ENOTSUPP;
1592 goto err_prog_put;
1593 }
1594 } else {
1595 /* BPF_PROG_TYPE_SOCKET_FILTER */
1596 if (bpf_prog_size(prog->len) > sysctl_optmem_max) {
1597 err = -ENOMEM;
1598 goto err_prog_put;
1599 }
1589 } 1600 }
1590 1601
1591 return 0; 1602 err = reuseport_attach_prog(sk, prog);
1603err_prog_put:
1604 if (err)
1605 bpf_prog_put(prog);
1606
1607 return err;
1608}
1609
1610void sk_reuseport_prog_free(struct bpf_prog *prog)
1611{
1612 if (!prog)
1613 return;
1614
1615 if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
1616 bpf_prog_put(prog);
1617 else
1618 bpf_prog_destroy(prog);
1592} 1619}
1593 1620
1594struct bpf_scratchpad { 1621struct bpf_scratchpad {
@@ -1702,24 +1729,26 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1702BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, 1729BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
1703 u32, offset, void *, to, u32, len, u32, start_header) 1730 u32, offset, void *, to, u32, len, u32, start_header)
1704{ 1731{
1732 u8 *end = skb_tail_pointer(skb);
1733 u8 *net = skb_network_header(skb);
1734 u8 *mac = skb_mac_header(skb);
1705 u8 *ptr; 1735 u8 *ptr;
1706 1736
1707 if (unlikely(offset > 0xffff || len > skb_headlen(skb))) 1737 if (unlikely(offset > 0xffff || len > (end - mac)))
1708 goto err_clear; 1738 goto err_clear;
1709 1739
1710 switch (start_header) { 1740 switch (start_header) {
1711 case BPF_HDR_START_MAC: 1741 case BPF_HDR_START_MAC:
1712 ptr = skb_mac_header(skb) + offset; 1742 ptr = mac + offset;
1713 break; 1743 break;
1714 case BPF_HDR_START_NET: 1744 case BPF_HDR_START_NET:
1715 ptr = skb_network_header(skb) + offset; 1745 ptr = net + offset;
1716 break; 1746 break;
1717 default: 1747 default:
1718 goto err_clear; 1748 goto err_clear;
1719 } 1749 }
1720 1750
1721 if (likely(ptr >= skb_mac_header(skb) && 1751 if (likely(ptr >= mac && ptr + len <= end)) {
1722 ptr + len <= skb_tail_pointer(skb))) {
1723 memcpy(to, ptr, len); 1752 memcpy(to, ptr, len);
1724 return 0; 1753 return 0;
1725 } 1754 }
@@ -1762,6 +1791,37 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = {
1762 .arg2_type = ARG_ANYTHING, 1791 .arg2_type = ARG_ANYTHING,
1763}; 1792};
1764 1793
1794static inline int sk_skb_try_make_writable(struct sk_buff *skb,
1795 unsigned int write_len)
1796{
1797 int err = __bpf_try_make_writable(skb, write_len);
1798
1799 bpf_compute_data_end_sk_skb(skb);
1800 return err;
1801}
1802
1803BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
1804{
1805 /* Idea is the following: should the needed direct read/write
1806	 * test fail during runtime, we can pull in more data and redo
1807	 * the check, since implicitly, we invalidate previous checks here.
1808 *
1809 * Or, since we know how much we need to make read/writeable,
1810 * this can be done once at the program beginning for direct
1811 * access case. By this we overcome limitations of only current
1812 * headroom being accessible.
1813 */
1814 return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb));
1815}
1816
1817static const struct bpf_func_proto sk_skb_pull_data_proto = {
1818 .func = sk_skb_pull_data,
1819 .gpl_only = false,
1820 .ret_type = RET_INTEGER,
1821 .arg1_type = ARG_PTR_TO_CTX,
1822 .arg2_type = ARG_ANYTHING,
1823};
1824
1765BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, 1825BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
1766 u64, from, u64, to, u64, flags) 1826 u64, from, u64, to, u64, flags)
1767{ 1827{
@@ -2039,19 +2099,12 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
2039 .arg3_type = ARG_ANYTHING, 2099 .arg3_type = ARG_ANYTHING,
2040}; 2100};
2041 2101
2042struct redirect_info { 2102DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
2043 u32 ifindex; 2103EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
2044 u32 flags;
2045 struct bpf_map *map;
2046 struct bpf_map *map_to_flush;
2047 unsigned long map_owner;
2048};
2049
2050static DEFINE_PER_CPU(struct redirect_info, redirect_info);
2051 2104
2052BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) 2105BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
2053{ 2106{
2054 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2107 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2055 2108
2056 if (unlikely(flags & ~(BPF_F_INGRESS))) 2109 if (unlikely(flags & ~(BPF_F_INGRESS)))
2057 return TC_ACT_SHOT; 2110 return TC_ACT_SHOT;
@@ -2064,7 +2117,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
2064 2117
2065int skb_do_redirect(struct sk_buff *skb) 2118int skb_do_redirect(struct sk_buff *skb)
2066{ 2119{
2067 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2120 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2068 struct net_device *dev; 2121 struct net_device *dev;
2069 2122
2070 dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); 2123 dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
@@ -2779,7 +2832,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
2779 2832
2780static u32 __bpf_skb_max_len(const struct sk_buff *skb) 2833static u32 __bpf_skb_max_len(const struct sk_buff *skb)
2781{ 2834{
2782 return skb->dev->mtu + skb->dev->hard_header_len; 2835 return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
2836 SKB_MAX_ALLOC;
2783} 2837}
2784 2838
2785static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) 2839static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
@@ -2863,8 +2917,8 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
2863 return __skb_trim_rcsum(skb, new_len); 2917 return __skb_trim_rcsum(skb, new_len);
2864} 2918}
2865 2919
2866BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, 2920static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
2867 u64, flags) 2921 u64 flags)
2868{ 2922{
2869 u32 max_len = __bpf_skb_max_len(skb); 2923 u32 max_len = __bpf_skb_max_len(skb);
2870 u32 min_len = __bpf_skb_min_len(skb); 2924 u32 min_len = __bpf_skb_min_len(skb);
@@ -2900,6 +2954,13 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
2900 if (!ret && skb_is_gso(skb)) 2954 if (!ret && skb_is_gso(skb))
2901 skb_gso_reset(skb); 2955 skb_gso_reset(skb);
2902 } 2956 }
2957 return ret;
2958}
2959
2960BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
2961 u64, flags)
2962{
2963 int ret = __bpf_skb_change_tail(skb, new_len, flags);
2903 2964
2904 bpf_compute_data_pointers(skb); 2965 bpf_compute_data_pointers(skb);
2905 return ret; 2966 return ret;
@@ -2914,9 +2975,27 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
2914 .arg3_type = ARG_ANYTHING, 2975 .arg3_type = ARG_ANYTHING,
2915}; 2976};
2916 2977
2917BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, 2978BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
2918 u64, flags) 2979 u64, flags)
2919{ 2980{
2981 int ret = __bpf_skb_change_tail(skb, new_len, flags);
2982
2983 bpf_compute_data_end_sk_skb(skb);
2984 return ret;
2985}
2986
2987static const struct bpf_func_proto sk_skb_change_tail_proto = {
2988 .func = sk_skb_change_tail,
2989 .gpl_only = false,
2990 .ret_type = RET_INTEGER,
2991 .arg1_type = ARG_PTR_TO_CTX,
2992 .arg2_type = ARG_ANYTHING,
2993 .arg3_type = ARG_ANYTHING,
2994};
2995
2996static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
2997 u64 flags)
2998{
2920 u32 max_len = __bpf_skb_max_len(skb); 2999 u32 max_len = __bpf_skb_max_len(skb);
2921 u32 new_len = skb->len + head_room; 3000 u32 new_len = skb->len + head_room;
2922 int ret; 3001 int ret;
@@ -2941,8 +3020,16 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
2941 skb_reset_mac_header(skb); 3020 skb_reset_mac_header(skb);
2942 } 3021 }
2943 3022
3023 return ret;
3024}
3025
3026BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
3027 u64, flags)
3028{
3029 int ret = __bpf_skb_change_head(skb, head_room, flags);
3030
2944 bpf_compute_data_pointers(skb); 3031 bpf_compute_data_pointers(skb);
2945 return 0; 3032 return ret;
2946} 3033}
2947 3034
2948static const struct bpf_func_proto bpf_skb_change_head_proto = { 3035static const struct bpf_func_proto bpf_skb_change_head_proto = {
@@ -2954,6 +3041,23 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
2954 .arg3_type = ARG_ANYTHING, 3041 .arg3_type = ARG_ANYTHING,
2955}; 3042};
2956 3043
3044BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
3045 u64, flags)
3046{
3047 int ret = __bpf_skb_change_head(skb, head_room, flags);
3048
3049 bpf_compute_data_end_sk_skb(skb);
3050 return ret;
3051}
3052
3053static const struct bpf_func_proto sk_skb_change_head_proto = {
3054 .func = sk_skb_change_head,
3055 .gpl_only = false,
3056 .ret_type = RET_INTEGER,
3057 .arg1_type = ARG_PTR_TO_CTX,
3058 .arg2_type = ARG_ANYTHING,
3059 .arg3_type = ARG_ANYTHING,
3060};
2957static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) 3061static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
2958{ 3062{
2959 return xdp_data_meta_unsupported(xdp) ? 0 : 3063 return xdp_data_meta_unsupported(xdp) ? 0 :
@@ -3046,12 +3150,16 @@ static int __bpf_tx_xdp(struct net_device *dev,
3046 u32 index) 3150 u32 index)
3047{ 3151{
3048 struct xdp_frame *xdpf; 3152 struct xdp_frame *xdpf;
3049 int sent; 3153 int err, sent;
3050 3154
3051 if (!dev->netdev_ops->ndo_xdp_xmit) { 3155 if (!dev->netdev_ops->ndo_xdp_xmit) {
3052 return -EOPNOTSUPP; 3156 return -EOPNOTSUPP;
3053 } 3157 }
3054 3158
3159 err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
3160 if (unlikely(err))
3161 return err;
3162
3055 xdpf = convert_to_xdp_frame(xdp); 3163 xdpf = convert_to_xdp_frame(xdp);
3056 if (unlikely(!xdpf)) 3164 if (unlikely(!xdpf))
3057 return -EOVERFLOW; 3165 return -EOVERFLOW;
@@ -3102,7 +3210,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
3102 3210
3103void xdp_do_flush_map(void) 3211void xdp_do_flush_map(void)
3104{ 3212{
3105 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3213 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3106 struct bpf_map *map = ri->map_to_flush; 3214 struct bpf_map *map = ri->map_to_flush;
3107 3215
3108 ri->map_to_flush = NULL; 3216 ri->map_to_flush = NULL;
@@ -3138,31 +3246,33 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
3138 } 3246 }
3139} 3247}
3140 3248
3141static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, 3249void bpf_clear_redirect_map(struct bpf_map *map)
3142 unsigned long aux)
3143{ 3250{
3144 return (unsigned long)xdp_prog->aux != aux; 3251 struct bpf_redirect_info *ri;
3252 int cpu;
3253
3254 for_each_possible_cpu(cpu) {
3255 ri = per_cpu_ptr(&bpf_redirect_info, cpu);
3256 /* Avoid polluting remote cacheline due to writes if
3257 * not needed. Once we pass this test, we need the
3258 * cmpxchg() to make sure it hasn't been changed in
3259 * the meantime by remote CPU.
3260 */
3261 if (unlikely(READ_ONCE(ri->map) == map))
3262 cmpxchg(&ri->map, map, NULL);
3263 }
3145} 3264}
3146 3265
3147static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, 3266static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
3148 struct bpf_prog *xdp_prog) 3267 struct bpf_prog *xdp_prog, struct bpf_map *map)
3149{ 3268{
3150 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3269 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3151 unsigned long map_owner = ri->map_owner;
3152 struct bpf_map *map = ri->map;
3153 u32 index = ri->ifindex; 3270 u32 index = ri->ifindex;
3154 void *fwd = NULL; 3271 void *fwd = NULL;
3155 int err; 3272 int err;
3156 3273
3157 ri->ifindex = 0; 3274 ri->ifindex = 0;
3158 ri->map = NULL; 3275 WRITE_ONCE(ri->map, NULL);
3159 ri->map_owner = 0;
3160
3161 if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
3162 err = -EFAULT;
3163 map = NULL;
3164 goto err;
3165 }
3166 3276
3167 fwd = __xdp_map_lookup_elem(map, index); 3277 fwd = __xdp_map_lookup_elem(map, index);
3168 if (!fwd) { 3278 if (!fwd) {
@@ -3187,13 +3297,14 @@ err:
3187int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, 3297int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
3188 struct bpf_prog *xdp_prog) 3298 struct bpf_prog *xdp_prog)
3189{ 3299{
3190 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3300 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3301 struct bpf_map *map = READ_ONCE(ri->map);
3191 struct net_device *fwd; 3302 struct net_device *fwd;
3192 u32 index = ri->ifindex; 3303 u32 index = ri->ifindex;
3193 int err; 3304 int err;
3194 3305
3195 if (ri->map) 3306 if (map)
3196 return xdp_do_redirect_map(dev, xdp, xdp_prog); 3307 return xdp_do_redirect_map(dev, xdp, xdp_prog, map);
3197 3308
3198 fwd = dev_get_by_index_rcu(dev_net(dev), index); 3309 fwd = dev_get_by_index_rcu(dev_net(dev), index);
3199 ri->ifindex = 0; 3310 ri->ifindex = 0;
@@ -3217,24 +3328,17 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect);
3217static int xdp_do_generic_redirect_map(struct net_device *dev, 3328static int xdp_do_generic_redirect_map(struct net_device *dev,
3218 struct sk_buff *skb, 3329 struct sk_buff *skb,
3219 struct xdp_buff *xdp, 3330 struct xdp_buff *xdp,
3220 struct bpf_prog *xdp_prog) 3331 struct bpf_prog *xdp_prog,
3332 struct bpf_map *map)
3221{ 3333{
3222 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3334 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3223 unsigned long map_owner = ri->map_owner;
3224 struct bpf_map *map = ri->map;
3225 u32 index = ri->ifindex; 3335 u32 index = ri->ifindex;
3226 void *fwd = NULL; 3336 void *fwd = NULL;
3227 int err = 0; 3337 int err = 0;
3228 3338
3229 ri->ifindex = 0; 3339 ri->ifindex = 0;
3230 ri->map = NULL; 3340 WRITE_ONCE(ri->map, NULL);
3231 ri->map_owner = 0;
3232 3341
3233 if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
3234 err = -EFAULT;
3235 map = NULL;
3236 goto err;
3237 }
3238 fwd = __xdp_map_lookup_elem(map, index); 3342 fwd = __xdp_map_lookup_elem(map, index);
3239 if (unlikely(!fwd)) { 3343 if (unlikely(!fwd)) {
3240 err = -EINVAL; 3344 err = -EINVAL;
@@ -3270,14 +3374,15 @@ err:
3270int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, 3374int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
3271 struct xdp_buff *xdp, struct bpf_prog *xdp_prog) 3375 struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
3272{ 3376{
3273 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3377 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3378 struct bpf_map *map = READ_ONCE(ri->map);
3274 u32 index = ri->ifindex; 3379 u32 index = ri->ifindex;
3275 struct net_device *fwd; 3380 struct net_device *fwd;
3276 int err = 0; 3381 int err = 0;
3277 3382
3278 if (ri->map) 3383 if (map)
3279 return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); 3384 return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
3280 3385 map);
3281 ri->ifindex = 0; 3386 ri->ifindex = 0;
3282 fwd = dev_get_by_index_rcu(dev_net(dev), index); 3387 fwd = dev_get_by_index_rcu(dev_net(dev), index);
3283 if (unlikely(!fwd)) { 3388 if (unlikely(!fwd)) {
@@ -3285,7 +3390,8 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
3285 goto err; 3390 goto err;
3286 } 3391 }
3287 3392
3288 if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) 3393 err = xdp_ok_fwd_dev(fwd, skb->len);
3394 if (unlikely(err))
3289 goto err; 3395 goto err;
3290 3396
3291 skb->dev = fwd; 3397 skb->dev = fwd;
@@ -3300,15 +3406,14 @@ EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
3300 3406
3301BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) 3407BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
3302{ 3408{
3303 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3409 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3304 3410
3305 if (unlikely(flags)) 3411 if (unlikely(flags))
3306 return XDP_ABORTED; 3412 return XDP_ABORTED;
3307 3413
3308 ri->ifindex = ifindex; 3414 ri->ifindex = ifindex;
3309 ri->flags = flags; 3415 ri->flags = flags;
3310 ri->map = NULL; 3416 WRITE_ONCE(ri->map, NULL);
3311 ri->map_owner = 0;
3312 3417
3313 return XDP_REDIRECT; 3418 return XDP_REDIRECT;
3314} 3419}
@@ -3321,25 +3426,21 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
3321 .arg2_type = ARG_ANYTHING, 3426 .arg2_type = ARG_ANYTHING,
3322}; 3427};
3323 3428
3324BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, 3429BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
3325 unsigned long, map_owner) 3430 u64, flags)
3326{ 3431{
3327 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3432 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3328 3433
3329 if (unlikely(flags)) 3434 if (unlikely(flags))
3330 return XDP_ABORTED; 3435 return XDP_ABORTED;
3331 3436
3332 ri->ifindex = ifindex; 3437 ri->ifindex = ifindex;
3333 ri->flags = flags; 3438 ri->flags = flags;
3334 ri->map = map; 3439 WRITE_ONCE(ri->map, map);
3335 ri->map_owner = map_owner;
3336 3440
3337 return XDP_REDIRECT; 3441 return XDP_REDIRECT;
3338} 3442}
3339 3443
3340/* Note, arg4 is hidden from users and populated by the verifier
3341 * with the right pointer.
3342 */
3343static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { 3444static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
3344 .func = bpf_xdp_redirect_map, 3445 .func = bpf_xdp_redirect_map,
3345 .gpl_only = false, 3446 .gpl_only = false,
@@ -3582,7 +3683,7 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
3582 if (unlikely(size > IP_TUNNEL_OPTS_MAX)) 3683 if (unlikely(size > IP_TUNNEL_OPTS_MAX))
3583 return -ENOMEM; 3684 return -ENOMEM;
3584 3685
3585 ip_tunnel_info_opts_set(info, from, size); 3686 ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
3586 3687
3587 return 0; 3688 return 0;
3588} 3689}
@@ -3669,6 +3770,32 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
3669 .ret_type = RET_INTEGER, 3770 .ret_type = RET_INTEGER,
3670 .arg1_type = ARG_PTR_TO_CTX, 3771 .arg1_type = ARG_PTR_TO_CTX,
3671}; 3772};
3773
3774BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
3775 ancestor_level)
3776{
3777 struct sock *sk = skb_to_full_sk(skb);
3778 struct cgroup *ancestor;
3779 struct cgroup *cgrp;
3780
3781 if (!sk || !sk_fullsock(sk))
3782 return 0;
3783
3784 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
3785 ancestor = cgroup_ancestor(cgrp, ancestor_level);
3786 if (!ancestor)
3787 return 0;
3788
3789 return ancestor->kn->id.id;
3790}
3791
3792static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
3793 .func = bpf_skb_ancestor_cgroup_id,
3794 .gpl_only = false,
3795 .ret_type = RET_INTEGER,
3796 .arg1_type = ARG_PTR_TO_CTX,
3797 .arg2_type = ARG_ANYTHING,
3798};
3672#endif 3799#endif
3673 3800
3674static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, 3801static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
@@ -3715,6 +3842,30 @@ static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
3715 .arg1_type = ARG_PTR_TO_CTX, 3842 .arg1_type = ARG_PTR_TO_CTX,
3716}; 3843};
3717 3844
3845BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
3846{
3847 return sock_gen_cookie(ctx->sk);
3848}
3849
3850static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
3851 .func = bpf_get_socket_cookie_sock_addr,
3852 .gpl_only = false,
3853 .ret_type = RET_INTEGER,
3854 .arg1_type = ARG_PTR_TO_CTX,
3855};
3856
3857BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
3858{
3859 return sock_gen_cookie(ctx->sk);
3860}
3861
3862static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
3863 .func = bpf_get_socket_cookie_sock_ops,
3864 .gpl_only = false,
3865 .ret_type = RET_INTEGER,
3866 .arg1_type = ARG_PTR_TO_CTX,
3867};
3868
3718BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) 3869BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
3719{ 3870{
3720 struct sock *sk = sk_to_full_sk(skb->sk); 3871 struct sock *sk = sk_to_full_sk(skb->sk);
@@ -4073,8 +4224,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
4073 memcpy(params->smac, dev->dev_addr, ETH_ALEN); 4224 memcpy(params->smac, dev->dev_addr, ETH_ALEN);
4074 params->h_vlan_TCI = 0; 4225 params->h_vlan_TCI = 0;
4075 params->h_vlan_proto = 0; 4226 params->h_vlan_proto = 0;
4227 params->ifindex = dev->ifindex;
4076 4228
4077 return dev->ifindex; 4229 return 0;
4078} 4230}
4079#endif 4231#endif
4080 4232
@@ -4098,7 +4250,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4098 /* verify forwarding is enabled on this interface */ 4250 /* verify forwarding is enabled on this interface */
4099 in_dev = __in_dev_get_rcu(dev); 4251 in_dev = __in_dev_get_rcu(dev);
4100 if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) 4252 if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
4101 return 0; 4253 return BPF_FIB_LKUP_RET_FWD_DISABLED;
4102 4254
4103 if (flags & BPF_FIB_LOOKUP_OUTPUT) { 4255 if (flags & BPF_FIB_LOOKUP_OUTPUT) {
4104 fl4.flowi4_iif = 1; 4256 fl4.flowi4_iif = 1;
@@ -4123,7 +4275,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4123 4275
4124 tb = fib_get_table(net, tbid); 4276 tb = fib_get_table(net, tbid);
4125 if (unlikely(!tb)) 4277 if (unlikely(!tb))
4126 return 0; 4278 return BPF_FIB_LKUP_RET_NOT_FWDED;
4127 4279
4128 err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); 4280 err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
4129 } else { 4281 } else {
@@ -4135,8 +4287,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4135 err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); 4287 err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
4136 } 4288 }
4137 4289
4138 if (err || res.type != RTN_UNICAST) 4290 if (err) {
4139 return 0; 4291 /* map fib lookup errors to RTN_ type */
4292 if (err == -EINVAL)
4293 return BPF_FIB_LKUP_RET_BLACKHOLE;
4294 if (err == -EHOSTUNREACH)
4295 return BPF_FIB_LKUP_RET_UNREACHABLE;
4296 if (err == -EACCES)
4297 return BPF_FIB_LKUP_RET_PROHIBIT;
4298
4299 return BPF_FIB_LKUP_RET_NOT_FWDED;
4300 }
4301
4302 if (res.type != RTN_UNICAST)
4303 return BPF_FIB_LKUP_RET_NOT_FWDED;
4140 4304
4141 if (res.fi->fib_nhs > 1) 4305 if (res.fi->fib_nhs > 1)
4142 fib_select_path(net, &res, &fl4, NULL); 4306 fib_select_path(net, &res, &fl4, NULL);
@@ -4144,19 +4308,16 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4144 if (check_mtu) { 4308 if (check_mtu) {
4145 mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); 4309 mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
4146 if (params->tot_len > mtu) 4310 if (params->tot_len > mtu)
4147 return 0; 4311 return BPF_FIB_LKUP_RET_FRAG_NEEDED;
4148 } 4312 }
4149 4313
4150 nh = &res.fi->fib_nh[res.nh_sel]; 4314 nh = &res.fi->fib_nh[res.nh_sel];
4151 4315
4152 /* do not handle lwt encaps right now */ 4316 /* do not handle lwt encaps right now */
4153 if (nh->nh_lwtstate) 4317 if (nh->nh_lwtstate)
4154 return 0; 4318 return BPF_FIB_LKUP_RET_UNSUPP_LWT;
4155 4319
4156 dev = nh->nh_dev; 4320 dev = nh->nh_dev;
4157 if (unlikely(!dev))
4158 return 0;
4159
4160 if (nh->nh_gw) 4321 if (nh->nh_gw)
4161 params->ipv4_dst = nh->nh_gw; 4322 params->ipv4_dst = nh->nh_gw;
4162 4323
@@ -4166,10 +4327,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4166 * rcu_read_lock_bh is not needed here 4327 * rcu_read_lock_bh is not needed here
4167 */ 4328 */
4168 neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); 4329 neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
4169 if (neigh) 4330 if (!neigh)
4170 return bpf_fib_set_fwd_params(params, neigh, dev); 4331 return BPF_FIB_LKUP_RET_NO_NEIGH;
4171 4332
4172 return 0; 4333 return bpf_fib_set_fwd_params(params, neigh, dev);
4173} 4334}
4174#endif 4335#endif
4175 4336
@@ -4190,7 +4351,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4190 4351
4191 /* link local addresses are never forwarded */ 4352 /* link local addresses are never forwarded */
4192 if (rt6_need_strict(dst) || rt6_need_strict(src)) 4353 if (rt6_need_strict(dst) || rt6_need_strict(src))
4193 return 0; 4354 return BPF_FIB_LKUP_RET_NOT_FWDED;
4194 4355
4195 dev = dev_get_by_index_rcu(net, params->ifindex); 4356 dev = dev_get_by_index_rcu(net, params->ifindex);
4196 if (unlikely(!dev)) 4357 if (unlikely(!dev))
@@ -4198,7 +4359,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4198 4359
4199 idev = __in6_dev_get_safely(dev); 4360 idev = __in6_dev_get_safely(dev);
4200 if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) 4361 if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
4201 return 0; 4362 return BPF_FIB_LKUP_RET_FWD_DISABLED;
4202 4363
4203 if (flags & BPF_FIB_LOOKUP_OUTPUT) { 4364 if (flags & BPF_FIB_LOOKUP_OUTPUT) {
4204 fl6.flowi6_iif = 1; 4365 fl6.flowi6_iif = 1;
@@ -4225,7 +4386,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4225 4386
4226 tb = ipv6_stub->fib6_get_table(net, tbid); 4387 tb = ipv6_stub->fib6_get_table(net, tbid);
4227 if (unlikely(!tb)) 4388 if (unlikely(!tb))
4228 return 0; 4389 return BPF_FIB_LKUP_RET_NOT_FWDED;
4229 4390
4230 f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); 4391 f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
4231 } else { 4392 } else {
@@ -4238,11 +4399,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4238 } 4399 }
4239 4400
4240 if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) 4401 if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
4241 return 0; 4402 return BPF_FIB_LKUP_RET_NOT_FWDED;
4403
4404 if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
4405 switch (f6i->fib6_type) {
4406 case RTN_BLACKHOLE:
4407 return BPF_FIB_LKUP_RET_BLACKHOLE;
4408 case RTN_UNREACHABLE:
4409 return BPF_FIB_LKUP_RET_UNREACHABLE;
4410 case RTN_PROHIBIT:
4411 return BPF_FIB_LKUP_RET_PROHIBIT;
4412 default:
4413 return BPF_FIB_LKUP_RET_NOT_FWDED;
4414 }
4415 }
4242 4416
4243 if (unlikely(f6i->fib6_flags & RTF_REJECT || 4417 if (f6i->fib6_type != RTN_UNICAST)
4244 f6i->fib6_type != RTN_UNICAST)) 4418 return BPF_FIB_LKUP_RET_NOT_FWDED;
4245 return 0;
4246 4419
4247 if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) 4420 if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
4248 f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, 4421 f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
@@ -4252,11 +4425,11 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4252 if (check_mtu) { 4425 if (check_mtu) {
4253 mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); 4426 mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
4254 if (params->tot_len > mtu) 4427 if (params->tot_len > mtu)
4255 return 0; 4428 return BPF_FIB_LKUP_RET_FRAG_NEEDED;
4256 } 4429 }
4257 4430
4258 if (f6i->fib6_nh.nh_lwtstate) 4431 if (f6i->fib6_nh.nh_lwtstate)
4259 return 0; 4432 return BPF_FIB_LKUP_RET_UNSUPP_LWT;
4260 4433
4261 if (f6i->fib6_flags & RTF_GATEWAY) 4434 if (f6i->fib6_flags & RTF_GATEWAY)
4262 *dst = f6i->fib6_nh.nh_gw; 4435 *dst = f6i->fib6_nh.nh_gw;
@@ -4270,10 +4443,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4270 */ 4443 */
4271 neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, 4444 neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
4272 ndisc_hashfn, dst, dev); 4445 ndisc_hashfn, dst, dev);
4273 if (neigh) 4446 if (!neigh)
4274 return bpf_fib_set_fwd_params(params, neigh, dev); 4447 return BPF_FIB_LKUP_RET_NO_NEIGH;
4275 4448
4276 return 0; 4449 return bpf_fib_set_fwd_params(params, neigh, dev);
4277} 4450}
4278#endif 4451#endif
4279 4452
@@ -4315,7 +4488,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
4315 struct bpf_fib_lookup *, params, int, plen, u32, flags) 4488 struct bpf_fib_lookup *, params, int, plen, u32, flags)
4316{ 4489{
4317 struct net *net = dev_net(skb->dev); 4490 struct net *net = dev_net(skb->dev);
4318 int index = -EAFNOSUPPORT; 4491 int rc = -EAFNOSUPPORT;
4319 4492
4320 if (plen < sizeof(*params)) 4493 if (plen < sizeof(*params))
4321 return -EINVAL; 4494 return -EINVAL;
@@ -4326,25 +4499,25 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
4326 switch (params->family) { 4499 switch (params->family) {
4327#if IS_ENABLED(CONFIG_INET) 4500#if IS_ENABLED(CONFIG_INET)
4328 case AF_INET: 4501 case AF_INET:
4329 index = bpf_ipv4_fib_lookup(net, params, flags, false); 4502 rc = bpf_ipv4_fib_lookup(net, params, flags, false);
4330 break; 4503 break;
4331#endif 4504#endif
4332#if IS_ENABLED(CONFIG_IPV6) 4505#if IS_ENABLED(CONFIG_IPV6)
4333 case AF_INET6: 4506 case AF_INET6:
4334 index = bpf_ipv6_fib_lookup(net, params, flags, false); 4507 rc = bpf_ipv6_fib_lookup(net, params, flags, false);
4335 break; 4508 break;
4336#endif 4509#endif
4337 } 4510 }
4338 4511
4339 if (index > 0) { 4512 if (!rc) {
4340 struct net_device *dev; 4513 struct net_device *dev;
4341 4514
4342 dev = dev_get_by_index_rcu(net, index); 4515 dev = dev_get_by_index_rcu(net, params->ifindex);
4343 if (!is_skb_forwardable(dev, skb)) 4516 if (!is_skb_forwardable(dev, skb))
4344 index = 0; 4517 rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
4345 } 4518 }
4346 4519
4347 return index; 4520 return rc;
4348} 4521}
4349 4522
4350static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { 4523static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
@@ -4417,38 +4590,37 @@ static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
4417 .arg4_type = ARG_CONST_SIZE 4590 .arg4_type = ARG_CONST_SIZE
4418}; 4591};
4419 4592
4593#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4420BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, 4594BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
4421 const void *, from, u32, len) 4595 const void *, from, u32, len)
4422{ 4596{
4423#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4424 struct seg6_bpf_srh_state *srh_state = 4597 struct seg6_bpf_srh_state *srh_state =
4425 this_cpu_ptr(&seg6_bpf_srh_states); 4598 this_cpu_ptr(&seg6_bpf_srh_states);
4599 struct ipv6_sr_hdr *srh = srh_state->srh;
4426 void *srh_tlvs, *srh_end, *ptr; 4600 void *srh_tlvs, *srh_end, *ptr;
4427 struct ipv6_sr_hdr *srh;
4428 int srhoff = 0; 4601 int srhoff = 0;
4429 4602
4430 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) 4603 if (srh == NULL)
4431 return -EINVAL; 4604 return -EINVAL;
4432 4605
4433 srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
4434 srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); 4606 srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
4435 srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); 4607 srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
4436 4608
4437 ptr = skb->data + offset; 4609 ptr = skb->data + offset;
4438 if (ptr >= srh_tlvs && ptr + len <= srh_end) 4610 if (ptr >= srh_tlvs && ptr + len <= srh_end)
4439 srh_state->valid = 0; 4611 srh_state->valid = false;
4440 else if (ptr < (void *)&srh->flags || 4612 else if (ptr < (void *)&srh->flags ||
4441 ptr + len > (void *)&srh->segments) 4613 ptr + len > (void *)&srh->segments)
4442 return -EFAULT; 4614 return -EFAULT;
4443 4615
4444 if (unlikely(bpf_try_make_writable(skb, offset + len))) 4616 if (unlikely(bpf_try_make_writable(skb, offset + len)))
4445 return -EFAULT; 4617 return -EFAULT;
4618 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
4619 return -EINVAL;
4620 srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
4446 4621
4447 memcpy(skb->data + offset, from, len); 4622 memcpy(skb->data + offset, from, len);
4448 return 0; 4623 return 0;
4449#else /* CONFIG_IPV6_SEG6_BPF */
4450 return -EOPNOTSUPP;
4451#endif
4452} 4624}
4453 4625
4454static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { 4626static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
@@ -4461,60 +4633,82 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
4461 .arg4_type = ARG_CONST_SIZE 4633 .arg4_type = ARG_CONST_SIZE
4462}; 4634};
4463 4635
4464BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, 4636static void bpf_update_srh_state(struct sk_buff *skb)
4465 u32, action, void *, param, u32, param_len)
4466{ 4637{
4467#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4468 struct seg6_bpf_srh_state *srh_state = 4638 struct seg6_bpf_srh_state *srh_state =
4469 this_cpu_ptr(&seg6_bpf_srh_states); 4639 this_cpu_ptr(&seg6_bpf_srh_states);
4470 struct ipv6_sr_hdr *srh;
4471 int srhoff = 0; 4640 int srhoff = 0;
4472 int err;
4473
4474 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
4475 return -EINVAL;
4476 srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
4477 4641
4478 if (!srh_state->valid) { 4642 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
4479 if (unlikely((srh_state->hdrlen & 7) != 0)) 4643 srh_state->srh = NULL;
4480 return -EBADMSG; 4644 } else {
4481 4645 srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
4482 srh->hdrlen = (u8)(srh_state->hdrlen >> 3); 4646 srh_state->hdrlen = srh_state->srh->hdrlen << 3;
4483 if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) 4647 srh_state->valid = true;
4484 return -EBADMSG;
4485
4486 srh_state->valid = 1;
4487 } 4648 }
4649}
4650
4651BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
4652 u32, action, void *, param, u32, param_len)
4653{
4654 struct seg6_bpf_srh_state *srh_state =
4655 this_cpu_ptr(&seg6_bpf_srh_states);
4656 int hdroff = 0;
4657 int err;
4488 4658
4489 switch (action) { 4659 switch (action) {
4490 case SEG6_LOCAL_ACTION_END_X: 4660 case SEG6_LOCAL_ACTION_END_X:
4661 if (!seg6_bpf_has_valid_srh(skb))
4662 return -EBADMSG;
4491 if (param_len != sizeof(struct in6_addr)) 4663 if (param_len != sizeof(struct in6_addr))
4492 return -EINVAL; 4664 return -EINVAL;
4493 return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); 4665 return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
4494 case SEG6_LOCAL_ACTION_END_T: 4666 case SEG6_LOCAL_ACTION_END_T:
4667 if (!seg6_bpf_has_valid_srh(skb))
4668 return -EBADMSG;
4669 if (param_len != sizeof(int))
4670 return -EINVAL;
4671 return seg6_lookup_nexthop(skb, NULL, *(int *)param);
4672 case SEG6_LOCAL_ACTION_END_DT6:
4673 if (!seg6_bpf_has_valid_srh(skb))
4674 return -EBADMSG;
4495 if (param_len != sizeof(int)) 4675 if (param_len != sizeof(int))
4496 return -EINVAL; 4676 return -EINVAL;
4677
4678 if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0)
4679 return -EBADMSG;
4680 if (!pskb_pull(skb, hdroff))
4681 return -EBADMSG;
4682
4683 skb_postpull_rcsum(skb, skb_network_header(skb), hdroff);
4684 skb_reset_network_header(skb);
4685 skb_reset_transport_header(skb);
4686 skb->encapsulation = 0;
4687
4688 bpf_compute_data_pointers(skb);
4689 bpf_update_srh_state(skb);
4497 return seg6_lookup_nexthop(skb, NULL, *(int *)param); 4690 return seg6_lookup_nexthop(skb, NULL, *(int *)param);
4498 case SEG6_LOCAL_ACTION_END_B6: 4691 case SEG6_LOCAL_ACTION_END_B6:
4692 if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
4693 return -EBADMSG;
4499 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, 4694 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
4500 param, param_len); 4695 param, param_len);
4501 if (!err) 4696 if (!err)
4502 srh_state->hdrlen = 4697 bpf_update_srh_state(skb);
4503 ((struct ipv6_sr_hdr *)param)->hdrlen << 3; 4698
4504 return err; 4699 return err;
4505 case SEG6_LOCAL_ACTION_END_B6_ENCAP: 4700 case SEG6_LOCAL_ACTION_END_B6_ENCAP:
4701 if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
4702 return -EBADMSG;
4506 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, 4703 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
4507 param, param_len); 4704 param, param_len);
4508 if (!err) 4705 if (!err)
4509 srh_state->hdrlen = 4706 bpf_update_srh_state(skb);
4510 ((struct ipv6_sr_hdr *)param)->hdrlen << 3; 4707
4511 return err; 4708 return err;
4512 default: 4709 default:
4513 return -EINVAL; 4710 return -EINVAL;
4514 } 4711 }
4515#else /* CONFIG_IPV6_SEG6_BPF */
4516 return -EOPNOTSUPP;
4517#endif
4518} 4712}
4519 4713
4520static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { 4714static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
@@ -4530,18 +4724,16 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
4530BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, 4724BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
4531 s32, len) 4725 s32, len)
4532{ 4726{
4533#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4534 struct seg6_bpf_srh_state *srh_state = 4727 struct seg6_bpf_srh_state *srh_state =
4535 this_cpu_ptr(&seg6_bpf_srh_states); 4728 this_cpu_ptr(&seg6_bpf_srh_states);
4729 struct ipv6_sr_hdr *srh = srh_state->srh;
4536 void *srh_end, *srh_tlvs, *ptr; 4730 void *srh_end, *srh_tlvs, *ptr;
4537 struct ipv6_sr_hdr *srh;
4538 struct ipv6hdr *hdr; 4731 struct ipv6hdr *hdr;
4539 int srhoff = 0; 4732 int srhoff = 0;
4540 int ret; 4733 int ret;
4541 4734
4542 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) 4735 if (unlikely(srh == NULL))
4543 return -EINVAL; 4736 return -EINVAL;
4544 srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
4545 4737
4546 srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + 4738 srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
4547 ((srh->first_segment + 1) << 4)); 4739 ((srh->first_segment + 1) << 4));
@@ -4571,12 +4763,12 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
4571 hdr = (struct ipv6hdr *)skb->data; 4763 hdr = (struct ipv6hdr *)skb->data;
4572 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); 4764 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
4573 4765
4766 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
4767 return -EINVAL;
4768 srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
4574 srh_state->hdrlen += len; 4769 srh_state->hdrlen += len;
4575 srh_state->valid = 0; 4770 srh_state->valid = false;
4576 return 0; 4771 return 0;
4577#else /* CONFIG_IPV6_SEG6_BPF */
4578 return -EOPNOTSUPP;
4579#endif
4580} 4772}
4581 4773
4582static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { 4774static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
@@ -4587,6 +4779,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
4587 .arg2_type = ARG_ANYTHING, 4779 .arg2_type = ARG_ANYTHING,
4588 .arg3_type = ARG_ANYTHING, 4780 .arg3_type = ARG_ANYTHING,
4589}; 4781};
4782#endif /* CONFIG_IPV6_SEG6_BPF */
4590 4783
4591bool bpf_helper_changes_pkt_data(void *func) 4784bool bpf_helper_changes_pkt_data(void *func)
4592{ 4785{
@@ -4595,9 +4788,12 @@ bool bpf_helper_changes_pkt_data(void *func)
4595 func == bpf_skb_store_bytes || 4788 func == bpf_skb_store_bytes ||
4596 func == bpf_skb_change_proto || 4789 func == bpf_skb_change_proto ||
4597 func == bpf_skb_change_head || 4790 func == bpf_skb_change_head ||
4791 func == sk_skb_change_head ||
4598 func == bpf_skb_change_tail || 4792 func == bpf_skb_change_tail ||
4793 func == sk_skb_change_tail ||
4599 func == bpf_skb_adjust_room || 4794 func == bpf_skb_adjust_room ||
4600 func == bpf_skb_pull_data || 4795 func == bpf_skb_pull_data ||
4796 func == sk_skb_pull_data ||
4601 func == bpf_clone_redirect || 4797 func == bpf_clone_redirect ||
4602 func == bpf_l3_csum_replace || 4798 func == bpf_l3_csum_replace ||
4603 func == bpf_l4_csum_replace || 4799 func == bpf_l4_csum_replace ||
@@ -4605,11 +4801,12 @@ bool bpf_helper_changes_pkt_data(void *func)
4605 func == bpf_xdp_adjust_meta || 4801 func == bpf_xdp_adjust_meta ||
4606 func == bpf_msg_pull_data || 4802 func == bpf_msg_pull_data ||
4607 func == bpf_xdp_adjust_tail || 4803 func == bpf_xdp_adjust_tail ||
4608 func == bpf_lwt_push_encap || 4804#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4609 func == bpf_lwt_seg6_store_bytes || 4805 func == bpf_lwt_seg6_store_bytes ||
4610 func == bpf_lwt_seg6_adjust_srh || 4806 func == bpf_lwt_seg6_adjust_srh ||
4611 func == bpf_lwt_seg6_action 4807 func == bpf_lwt_seg6_action ||
4612 ) 4808#endif
4809 func == bpf_lwt_push_encap)
4613 return true; 4810 return true;
4614 4811
4615 return false; 4812 return false;
@@ -4638,6 +4835,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
4638 case BPF_FUNC_trace_printk: 4835 case BPF_FUNC_trace_printk:
4639 if (capable(CAP_SYS_ADMIN)) 4836 if (capable(CAP_SYS_ADMIN))
4640 return bpf_get_trace_printk_proto(); 4837 return bpf_get_trace_printk_proto();
4838 /* else: fall through */
4641 default: 4839 default:
4642 return NULL; 4840 return NULL;
4643 } 4841 }
@@ -4652,6 +4850,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4652 */ 4850 */
4653 case BPF_FUNC_get_current_uid_gid: 4851 case BPF_FUNC_get_current_uid_gid:
4654 return &bpf_get_current_uid_gid_proto; 4852 return &bpf_get_current_uid_gid_proto;
4853 case BPF_FUNC_get_local_storage:
4854 return &bpf_get_local_storage_proto;
4655 default: 4855 default:
4656 return bpf_base_func_proto(func_id); 4856 return bpf_base_func_proto(func_id);
4657 } 4857 }
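
The bpf_get_local_storage() helper wired into the cgroup-attached program types in these hunks is backed by BPF_MAP_TYPE_CGROUP_STORAGE. A hypothetical cgroup/skb byte counter using it could look like the following; the bpf_map_def-style map definition, section name and include paths are assumptions, not taken from this patch:

/* Hypothetical sketch, not part of this patch: per-cgroup egress byte count. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct bpf_map_def SEC("maps") byte_cnt = {
	.type = BPF_MAP_TYPE_CGROUP_STORAGE,
	.key_size = sizeof(struct bpf_cgroup_storage_key),
	.value_size = sizeof(__u64),
};

SEC("cgroup_skb/egress")
int count_bytes(struct __sk_buff *skb)
{
	__u64 *bytes = bpf_get_local_storage(&byte_cnt, 0);

	__sync_fetch_and_add(bytes, skb->len);
	return 1;	/* 1 = allow the packet, 0 = drop */
}

char _license[] SEC("license") = "GPL";
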
@@ -4674,6 +4874,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4674 default: 4874 default:
4675 return NULL; 4875 return NULL;
4676 } 4876 }
4877 case BPF_FUNC_get_socket_cookie:
4878 return &bpf_get_socket_cookie_sock_addr_proto;
4879 case BPF_FUNC_get_local_storage:
4880 return &bpf_get_local_storage_proto;
4677 default: 4881 default:
4678 return bpf_base_func_proto(func_id); 4882 return bpf_base_func_proto(func_id);
4679 } 4883 }
@@ -4697,6 +4901,17 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4697} 4901}
4698 4902
4699static const struct bpf_func_proto * 4903static const struct bpf_func_proto *
4904cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4905{
4906 switch (func_id) {
4907 case BPF_FUNC_get_local_storage:
4908 return &bpf_get_local_storage_proto;
4909 default:
4910 return sk_filter_func_proto(func_id, prog);
4911 }
4912}
4913
4914static const struct bpf_func_proto *
4700tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 4915tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4701{ 4916{
4702 switch (func_id) { 4917 switch (func_id) {
@@ -4769,6 +4984,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4769#ifdef CONFIG_SOCK_CGROUP_DATA 4984#ifdef CONFIG_SOCK_CGROUP_DATA
4770 case BPF_FUNC_skb_cgroup_id: 4985 case BPF_FUNC_skb_cgroup_id:
4771 return &bpf_skb_cgroup_id_proto; 4986 return &bpf_skb_cgroup_id_proto;
4987 case BPF_FUNC_skb_ancestor_cgroup_id:
4988 return &bpf_skb_ancestor_cgroup_id_proto;
4772#endif 4989#endif
4773 default: 4990 default:
4774 return bpf_base_func_proto(func_id); 4991 return bpf_base_func_proto(func_id);
@@ -4816,6 +5033,10 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4816 return &bpf_sock_map_update_proto; 5033 return &bpf_sock_map_update_proto;
4817 case BPF_FUNC_sock_hash_update: 5034 case BPF_FUNC_sock_hash_update:
4818 return &bpf_sock_hash_update_proto; 5035 return &bpf_sock_hash_update_proto;
5036 case BPF_FUNC_get_socket_cookie:
5037 return &bpf_get_socket_cookie_sock_ops_proto;
5038 case BPF_FUNC_get_local_storage:
5039 return &bpf_get_local_storage_proto;
4819 default: 5040 default:
4820 return bpf_base_func_proto(func_id); 5041 return bpf_base_func_proto(func_id);
4821 } 5042 }
@@ -4835,6 +5056,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4835 return &bpf_msg_cork_bytes_proto; 5056 return &bpf_msg_cork_bytes_proto;
4836 case BPF_FUNC_msg_pull_data: 5057 case BPF_FUNC_msg_pull_data:
4837 return &bpf_msg_pull_data_proto; 5058 return &bpf_msg_pull_data_proto;
5059 case BPF_FUNC_get_local_storage:
5060 return &bpf_get_local_storage_proto;
4838 default: 5061 default:
4839 return bpf_base_func_proto(func_id); 5062 return bpf_base_func_proto(func_id);
4840 } 5063 }
@@ -4849,11 +5072,11 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4849 case BPF_FUNC_skb_load_bytes: 5072 case BPF_FUNC_skb_load_bytes:
4850 return &bpf_skb_load_bytes_proto; 5073 return &bpf_skb_load_bytes_proto;
4851 case BPF_FUNC_skb_pull_data: 5074 case BPF_FUNC_skb_pull_data:
4852 return &bpf_skb_pull_data_proto; 5075 return &sk_skb_pull_data_proto;
4853 case BPF_FUNC_skb_change_tail: 5076 case BPF_FUNC_skb_change_tail:
4854 return &bpf_skb_change_tail_proto; 5077 return &sk_skb_change_tail_proto;
4855 case BPF_FUNC_skb_change_head: 5078 case BPF_FUNC_skb_change_head:
4856 return &bpf_skb_change_head_proto; 5079 return &sk_skb_change_head_proto;
4857 case BPF_FUNC_get_socket_cookie: 5080 case BPF_FUNC_get_socket_cookie:
4858 return &bpf_get_socket_cookie_proto; 5081 return &bpf_get_socket_cookie_proto;
4859 case BPF_FUNC_get_socket_uid: 5082 case BPF_FUNC_get_socket_uid:
@@ -4862,6 +5085,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4862 return &bpf_sk_redirect_map_proto; 5085 return &bpf_sk_redirect_map_proto;
4863 case BPF_FUNC_sk_redirect_hash: 5086 case BPF_FUNC_sk_redirect_hash:
4864 return &bpf_sk_redirect_hash_proto; 5087 return &bpf_sk_redirect_hash_proto;
5088 case BPF_FUNC_get_local_storage:
5089 return &bpf_get_local_storage_proto;
4865 default: 5090 default:
4866 return bpf_base_func_proto(func_id); 5091 return bpf_base_func_proto(func_id);
4867 } 5092 }
@@ -4944,12 +5169,14 @@ static const struct bpf_func_proto *
4944lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 5169lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
4945{ 5170{
4946 switch (func_id) { 5171 switch (func_id) {
5172#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
4947 case BPF_FUNC_lwt_seg6_store_bytes: 5173 case BPF_FUNC_lwt_seg6_store_bytes:
4948 return &bpf_lwt_seg6_store_bytes_proto; 5174 return &bpf_lwt_seg6_store_bytes_proto;
4949 case BPF_FUNC_lwt_seg6_action: 5175 case BPF_FUNC_lwt_seg6_action:
4950 return &bpf_lwt_seg6_action_proto; 5176 return &bpf_lwt_seg6_action_proto;
4951 case BPF_FUNC_lwt_seg6_adjust_srh: 5177 case BPF_FUNC_lwt_seg6_adjust_srh:
4952 return &bpf_lwt_seg6_adjust_srh_proto; 5178 return &bpf_lwt_seg6_adjust_srh_proto;
5179#endif
4953 default: 5180 default:
4954 return lwt_out_func_proto(func_id, prog); 5181 return lwt_out_func_proto(func_id, prog);
4955 } 5182 }
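
With the seg6 helpers now compiled only under CONFIG_IPV6_SEG6_BPF, a hypothetical End.BPF (lwt_seg6local) program calling bpf_lwt_seg6_action() might look like the sketch below; the section name, includes and the End.T parameter are assumptions, not taken from this patch:

/* Hypothetical sketch, not part of this patch: perform an End.T lookup
 * in table 254 and redirect along the resulting route.
 */
#include <linux/bpf.h>
#include <linux/seg6_local.h>
#include <bpf/bpf_helpers.h>

SEC("lwt_seg6local")
int end_t(struct __sk_buff *skb)
{
	int table = 254;	/* main routing table */

	if (bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_T,
				&table, sizeof(table)) < 0)
		return BPF_DROP;

	return BPF_REDIRECT;	/* use the dst installed by seg6_lookup_nexthop() */
}

char _license[] SEC("license") = "GPL";
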
@@ -6664,7 +6891,7 @@ const struct bpf_prog_ops xdp_prog_ops = {
6664}; 6891};
6665 6892
6666const struct bpf_verifier_ops cg_skb_verifier_ops = { 6893const struct bpf_verifier_ops cg_skb_verifier_ops = {
6667 .get_func_proto = sk_filter_func_proto, 6894 .get_func_proto = cg_skb_func_proto,
6668 .is_valid_access = sk_filter_is_valid_access, 6895 .is_valid_access = sk_filter_is_valid_access,
6669 .convert_ctx_access = bpf_convert_ctx_access, 6896 .convert_ctx_access = bpf_convert_ctx_access,
6670}; 6897};
@@ -6823,3 +7050,271 @@ out:
6823 release_sock(sk); 7050 release_sock(sk);
6824 return ret; 7051 return ret;
6825} 7052}
7053
7054#ifdef CONFIG_INET
7055struct sk_reuseport_kern {
7056 struct sk_buff *skb;
7057 struct sock *sk;
7058 struct sock *selected_sk;
7059 void *data_end;
7060 u32 hash;
7061 u32 reuseport_id;
7062 bool bind_inany;
7063};
7064
7065static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
7066 struct sock_reuseport *reuse,
7067 struct sock *sk, struct sk_buff *skb,
7068 u32 hash)
7069{
7070 reuse_kern->skb = skb;
7071 reuse_kern->sk = sk;
7072 reuse_kern->selected_sk = NULL;
7073 reuse_kern->data_end = skb->data + skb_headlen(skb);
7074 reuse_kern->hash = hash;
7075 reuse_kern->reuseport_id = reuse->reuseport_id;
7076 reuse_kern->bind_inany = reuse->bind_inany;
7077}
7078
7079struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
7080 struct bpf_prog *prog, struct sk_buff *skb,
7081 u32 hash)
7082{
7083 struct sk_reuseport_kern reuse_kern;
7084 enum sk_action action;
7085
7086 bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
7087 action = BPF_PROG_RUN(prog, &reuse_kern);
7088
7089 if (action == SK_PASS)
7090 return reuse_kern.selected_sk;
7091 else
7092 return ERR_PTR(-ECONNREFUSED);
7093}
7094
7095BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
7096 struct bpf_map *, map, void *, key, u32, flags)
7097{
7098 struct sock_reuseport *reuse;
7099 struct sock *selected_sk;
7100
7101 selected_sk = map->ops->map_lookup_elem(map, key);
7102 if (!selected_sk)
7103 return -ENOENT;
7104
7105 reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
7106 if (!reuse)
7107 /* selected_sk is unhashed (e.g. by close()) after the
 7108		 * above map_lookup_elem(). Treat selected_sk as if it has
 7109		 * already been removed from the map.
7110 */
7111 return -ENOENT;
7112
7113 if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
7114 struct sock *sk;
7115
7116 if (unlikely(!reuse_kern->reuseport_id))
7117 /* There is a small race between adding the
7118 * sk to the map and setting the
7119 * reuse_kern->reuseport_id.
 7120			 * Treat it as if the sk has not been added to
7121 * the bpf map yet.
7122 */
7123 return -ENOENT;
7124
7125 sk = reuse_kern->sk;
7126 if (sk->sk_protocol != selected_sk->sk_protocol)
7127 return -EPROTOTYPE;
7128 else if (sk->sk_family != selected_sk->sk_family)
7129 return -EAFNOSUPPORT;
7130
7131 /* Catch all. Likely bound to a different sockaddr. */
7132 return -EBADFD;
7133 }
7134
7135 reuse_kern->selected_sk = selected_sk;
7136
7137 return 0;
7138}
7139
7140static const struct bpf_func_proto sk_select_reuseport_proto = {
7141 .func = sk_select_reuseport,
7142 .gpl_only = false,
7143 .ret_type = RET_INTEGER,
7144 .arg1_type = ARG_PTR_TO_CTX,
7145 .arg2_type = ARG_CONST_MAP_PTR,
7146 .arg3_type = ARG_PTR_TO_MAP_KEY,
7147 .arg4_type = ARG_ANYTHING,
7148};
7149
7150BPF_CALL_4(sk_reuseport_load_bytes,
7151 const struct sk_reuseport_kern *, reuse_kern, u32, offset,
7152 void *, to, u32, len)
7153{
7154 return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len);
7155}
7156
7157static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
7158 .func = sk_reuseport_load_bytes,
7159 .gpl_only = false,
7160 .ret_type = RET_INTEGER,
7161 .arg1_type = ARG_PTR_TO_CTX,
7162 .arg2_type = ARG_ANYTHING,
7163 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
7164 .arg4_type = ARG_CONST_SIZE,
7165};
7166
7167BPF_CALL_5(sk_reuseport_load_bytes_relative,
7168 const struct sk_reuseport_kern *, reuse_kern, u32, offset,
7169 void *, to, u32, len, u32, start_header)
7170{
7171 return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to,
7172 len, start_header);
7173}
7174
7175static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
7176 .func = sk_reuseport_load_bytes_relative,
7177 .gpl_only = false,
7178 .ret_type = RET_INTEGER,
7179 .arg1_type = ARG_PTR_TO_CTX,
7180 .arg2_type = ARG_ANYTHING,
7181 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
7182 .arg4_type = ARG_CONST_SIZE,
7183 .arg5_type = ARG_ANYTHING,
7184};
7185
7186static const struct bpf_func_proto *
7187sk_reuseport_func_proto(enum bpf_func_id func_id,
7188 const struct bpf_prog *prog)
7189{
7190 switch (func_id) {
7191 case BPF_FUNC_sk_select_reuseport:
7192 return &sk_select_reuseport_proto;
7193 case BPF_FUNC_skb_load_bytes:
7194 return &sk_reuseport_load_bytes_proto;
7195 case BPF_FUNC_skb_load_bytes_relative:
7196 return &sk_reuseport_load_bytes_relative_proto;
7197 default:
7198 return bpf_base_func_proto(func_id);
7199 }
7200}
7201
7202static bool
7203sk_reuseport_is_valid_access(int off, int size,
7204 enum bpf_access_type type,
7205 const struct bpf_prog *prog,
7206 struct bpf_insn_access_aux *info)
7207{
7208 const u32 size_default = sizeof(__u32);
7209
7210 if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
7211 off % size || type != BPF_READ)
7212 return false;
7213
7214 switch (off) {
7215 case offsetof(struct sk_reuseport_md, data):
7216 info->reg_type = PTR_TO_PACKET;
7217 return size == sizeof(__u64);
7218
7219 case offsetof(struct sk_reuseport_md, data_end):
7220 info->reg_type = PTR_TO_PACKET_END;
7221 return size == sizeof(__u64);
7222
7223 case offsetof(struct sk_reuseport_md, hash):
7224 return size == size_default;
7225
7226 /* Fields that allow narrowing */
7227 case offsetof(struct sk_reuseport_md, eth_protocol):
7228 if (size < FIELD_SIZEOF(struct sk_buff, protocol))
7229 return false;
7230 /* fall through */
7231 case offsetof(struct sk_reuseport_md, ip_protocol):
7232 case offsetof(struct sk_reuseport_md, bind_inany):
7233 case offsetof(struct sk_reuseport_md, len):
7234 bpf_ctx_record_field_size(info, size_default);
7235 return bpf_ctx_narrow_access_ok(off, size, size_default);
7236
7237 default:
7238 return false;
7239 }
7240}
7241
7242#define SK_REUSEPORT_LOAD_FIELD(F) ({ \
7243 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
7244 si->dst_reg, si->src_reg, \
7245 bpf_target_off(struct sk_reuseport_kern, F, \
7246 FIELD_SIZEOF(struct sk_reuseport_kern, F), \
7247 target_size)); \
7248 })
7249
7250#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \
7251 SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
7252 struct sk_buff, \
7253 skb, \
7254 SKB_FIELD)
7255
7256#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \
7257 SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \
7258 struct sock, \
7259 sk, \
7260 SK_FIELD, BPF_SIZE, EXTRA_OFF)
7261
7262static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
7263 const struct bpf_insn *si,
7264 struct bpf_insn *insn_buf,
7265 struct bpf_prog *prog,
7266 u32 *target_size)
7267{
7268 struct bpf_insn *insn = insn_buf;
7269
7270 switch (si->off) {
7271 case offsetof(struct sk_reuseport_md, data):
7272 SK_REUSEPORT_LOAD_SKB_FIELD(data);
7273 break;
7274
7275 case offsetof(struct sk_reuseport_md, len):
7276 SK_REUSEPORT_LOAD_SKB_FIELD(len);
7277 break;
7278
7279 case offsetof(struct sk_reuseport_md, eth_protocol):
7280 SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
7281 break;
7282
7283 case offsetof(struct sk_reuseport_md, ip_protocol):
7284 BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
7285 SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset,
7286 BPF_W, 0);
7287 *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
7288 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
7289 SK_FL_PROTO_SHIFT);
7290 /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian
7291 * aware. No further narrowing or masking is needed.
7292 */
7293 *target_size = 1;
7294 break;
7295
7296 case offsetof(struct sk_reuseport_md, data_end):
7297 SK_REUSEPORT_LOAD_FIELD(data_end);
7298 break;
7299
7300 case offsetof(struct sk_reuseport_md, hash):
7301 SK_REUSEPORT_LOAD_FIELD(hash);
7302 break;
7303
7304 case offsetof(struct sk_reuseport_md, bind_inany):
7305 SK_REUSEPORT_LOAD_FIELD(bind_inany);
7306 break;
7307 }
7308
7309 return insn - insn_buf;
7310}
7311
7312const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
7313 .get_func_proto = sk_reuseport_func_proto,
7314 .is_valid_access = sk_reuseport_is_valid_access,
7315 .convert_ctx_access = sk_reuseport_convert_ctx_access,
7316};
7317
7318const struct bpf_prog_ops sk_reuseport_prog_ops = {
7319};
7320#endif /* CONFIG_INET */
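
The block above adds the BPF_PROG_TYPE_SK_REUSEPORT plumbing (context conversion, helpers and verifier ops). A minimal, hypothetical program using the new bpf_sk_select_reuseport() helper is sketched below; the libbpf-style map definition, names and the hash-modulo policy are assumptions, not taken from this patch:

/* Hypothetical sketch, not part of this patch: pick a listener from a
 * REUSEPORT_SOCKARRAY keyed by the 4-tuple hash.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 8);
	__type(key, __u32);
	__type(value, __u64);
} socks SEC(".maps");

SEC("sk_reuseport")
int select_by_hash(struct sk_reuseport_md *md)
{
	__u32 key = md->hash % 8;

	/* bpf_sk_select_reuseport() returns 0 once a socket is selected */
	if (bpf_sk_select_reuseport(md, &socks, &key, 0) == 0)
		return SK_PASS;

	return SK_DROP;	/* e.g. -ENOENT: slot not populated yet */
}

char _license[] SEC("license") = "GPL";
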
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 53f96e4f7bf5..ce9eeeb7c024 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -152,7 +152,11 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
152 !dissector_uses_key(flow_dissector, 152 !dissector_uses_key(flow_dissector,
153 FLOW_DISSECTOR_KEY_ENC_CONTROL) && 153 FLOW_DISSECTOR_KEY_ENC_CONTROL) &&
154 !dissector_uses_key(flow_dissector, 154 !dissector_uses_key(flow_dissector,
155 FLOW_DISSECTOR_KEY_ENC_PORTS)) 155 FLOW_DISSECTOR_KEY_ENC_PORTS) &&
156 !dissector_uses_key(flow_dissector,
157 FLOW_DISSECTOR_KEY_ENC_IP) &&
158 !dissector_uses_key(flow_dissector,
159 FLOW_DISSECTOR_KEY_ENC_OPTS))
156 return; 160 return;
157 161
158 info = skb_tunnel_info(skb); 162 info = skb_tunnel_info(skb);
@@ -212,6 +216,31 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
212 tp->src = key->tp_src; 216 tp->src = key->tp_src;
213 tp->dst = key->tp_dst; 217 tp->dst = key->tp_dst;
214 } 218 }
219
220 if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) {
221 struct flow_dissector_key_ip *ip;
222
223 ip = skb_flow_dissector_target(flow_dissector,
224 FLOW_DISSECTOR_KEY_ENC_IP,
225 target_container);
226 ip->tos = key->tos;
227 ip->ttl = key->ttl;
228 }
229
230 if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) {
231 struct flow_dissector_key_enc_opts *enc_opt;
232
233 enc_opt = skb_flow_dissector_target(flow_dissector,
234 FLOW_DISSECTOR_KEY_ENC_OPTS,
235 target_container);
236
237 if (info->options_len) {
238 enc_opt->len = info->options_len;
239 ip_tunnel_info_opts_get(enc_opt->data, info);
240 enc_opt->dst_opt_type = info->key.tun_flags &
241 TUNNEL_OPTIONS_PRESENT;
242 }
243 }
215} 244}
216EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); 245EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
217 246
@@ -589,7 +618,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
589 struct flow_dissector_key_tags *key_tags; 618 struct flow_dissector_key_tags *key_tags;
590 struct flow_dissector_key_vlan *key_vlan; 619 struct flow_dissector_key_vlan *key_vlan;
591 enum flow_dissect_ret fdret; 620 enum flow_dissect_ret fdret;
592 bool skip_vlan = false; 621 enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
593 int num_hdrs = 0; 622 int num_hdrs = 0;
594 u8 ip_proto = 0; 623 u8 ip_proto = 0;
595 bool ret; 624 bool ret;
@@ -748,14 +777,14 @@ proto_again:
748 } 777 }
749 case htons(ETH_P_8021AD): 778 case htons(ETH_P_8021AD):
750 case htons(ETH_P_8021Q): { 779 case htons(ETH_P_8021Q): {
751 const struct vlan_hdr *vlan; 780 const struct vlan_hdr *vlan = NULL;
752 struct vlan_hdr _vlan; 781 struct vlan_hdr _vlan;
753 bool vlan_tag_present = skb && skb_vlan_tag_present(skb); 782 __be16 saved_vlan_tpid = proto;
754 783
755 if (vlan_tag_present) 784 if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX &&
785 skb && skb_vlan_tag_present(skb)) {
756 proto = skb->protocol; 786 proto = skb->protocol;
757 787 } else {
758 if (!vlan_tag_present || eth_type_vlan(skb->protocol)) {
759 vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), 788 vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
760 data, hlen, &_vlan); 789 data, hlen, &_vlan);
761 if (!vlan) { 790 if (!vlan) {
@@ -765,20 +794,23 @@ proto_again:
765 794
766 proto = vlan->h_vlan_encapsulated_proto; 795 proto = vlan->h_vlan_encapsulated_proto;
767 nhoff += sizeof(*vlan); 796 nhoff += sizeof(*vlan);
768 if (skip_vlan) {
769 fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
770 break;
771 }
772 } 797 }
773 798
774 skip_vlan = true; 799 if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) {
775 if (dissector_uses_key(flow_dissector, 800 dissector_vlan = FLOW_DISSECTOR_KEY_VLAN;
776 FLOW_DISSECTOR_KEY_VLAN)) { 801 } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) {
802 dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN;
803 } else {
804 fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
805 break;
806 }
807
808 if (dissector_uses_key(flow_dissector, dissector_vlan)) {
777 key_vlan = skb_flow_dissector_target(flow_dissector, 809 key_vlan = skb_flow_dissector_target(flow_dissector,
778 FLOW_DISSECTOR_KEY_VLAN, 810 dissector_vlan,
779 target_container); 811 target_container);
780 812
781 if (vlan_tag_present) { 813 if (!vlan) {
782 key_vlan->vlan_id = skb_vlan_tag_get_id(skb); 814 key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
783 key_vlan->vlan_priority = 815 key_vlan->vlan_priority =
784 (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); 816 (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT);
@@ -789,6 +821,7 @@ proto_again:
789 (ntohs(vlan->h_vlan_TCI) & 821 (ntohs(vlan->h_vlan_TCI) &
790 VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; 822 VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
791 } 823 }
824 key_vlan->vlan_tpid = saved_vlan_tpid;
792 } 825 }
793 826
794 fdret = FLOW_DISSECT_RET_PROTO_AGAIN; 827 fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 98fd12721221..e4e442d70c2d 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -112,7 +112,7 @@ static void est_timer(struct timer_list *t)
112 * @bstats: basic statistics 112 * @bstats: basic statistics
113 * @cpu_bstats: bstats per cpu 113 * @cpu_bstats: bstats per cpu
114 * @rate_est: rate estimator statistics 114 * @rate_est: rate estimator statistics
115 * @stats_lock: statistics lock 115 * @lock: lock for statistics and control path
116 * @running: qdisc running seqcount 116 * @running: qdisc running seqcount
117 * @opt: rate estimator configuration TLV 117 * @opt: rate estimator configuration TLV
118 * 118 *
@@ -128,7 +128,7 @@ static void est_timer(struct timer_list *t)
128int gen_new_estimator(struct gnet_stats_basic_packed *bstats, 128int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
129 struct gnet_stats_basic_cpu __percpu *cpu_bstats, 129 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
130 struct net_rate_estimator __rcu **rate_est, 130 struct net_rate_estimator __rcu **rate_est,
131 spinlock_t *stats_lock, 131 spinlock_t *lock,
132 seqcount_t *running, 132 seqcount_t *running,
133 struct nlattr *opt) 133 struct nlattr *opt)
134{ 134{
@@ -154,19 +154,22 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
154 seqcount_init(&est->seq); 154 seqcount_init(&est->seq);
155 intvl_log = parm->interval + 2; 155 intvl_log = parm->interval + 2;
156 est->bstats = bstats; 156 est->bstats = bstats;
157 est->stats_lock = stats_lock; 157 est->stats_lock = lock;
158 est->running = running; 158 est->running = running;
159 est->ewma_log = parm->ewma_log; 159 est->ewma_log = parm->ewma_log;
160 est->intvl_log = intvl_log; 160 est->intvl_log = intvl_log;
161 est->cpu_bstats = cpu_bstats; 161 est->cpu_bstats = cpu_bstats;
162 162
163 if (stats_lock) 163 if (lock)
164 local_bh_disable(); 164 local_bh_disable();
165 est_fetch_counters(est, &b); 165 est_fetch_counters(est, &b);
166 if (stats_lock) 166 if (lock)
167 local_bh_enable(); 167 local_bh_enable();
168 est->last_bytes = b.bytes; 168 est->last_bytes = b.bytes;
169 est->last_packets = b.packets; 169 est->last_packets = b.packets;
170
171 if (lock)
172 spin_lock_bh(lock);
170 old = rcu_dereference_protected(*rate_est, 1); 173 old = rcu_dereference_protected(*rate_est, 1);
171 if (old) { 174 if (old) {
172 del_timer_sync(&old->timer); 175 del_timer_sync(&old->timer);
@@ -179,6 +182,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
179 mod_timer(&est->timer, est->next_jiffies); 182 mod_timer(&est->timer, est->next_jiffies);
180 183
181 rcu_assign_pointer(*rate_est, est); 184 rcu_assign_pointer(*rate_est, est);
185 if (lock)
186 spin_unlock_bh(lock);
182 if (old) 187 if (old)
183 kfree_rcu(old, rcu); 188 kfree_rcu(old, rcu);
184 return 0; 189 return 0;
@@ -209,7 +214,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
209 * @bstats: basic statistics 214 * @bstats: basic statistics
210 * @cpu_bstats: bstats per cpu 215 * @cpu_bstats: bstats per cpu
211 * @rate_est: rate estimator statistics 216 * @rate_est: rate estimator statistics
212 * @stats_lock: statistics lock 217 * @lock: lock for statistics and control path
213 * @running: qdisc running seqcount (might be NULL) 218 * @running: qdisc running seqcount (might be NULL)
214 * @opt: rate estimator configuration TLV 219 * @opt: rate estimator configuration TLV
215 * 220 *
@@ -221,11 +226,11 @@ EXPORT_SYMBOL(gen_kill_estimator);
221int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, 226int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
222 struct gnet_stats_basic_cpu __percpu *cpu_bstats, 227 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
223 struct net_rate_estimator __rcu **rate_est, 228 struct net_rate_estimator __rcu **rate_est,
224 spinlock_t *stats_lock, 229 spinlock_t *lock,
225 seqcount_t *running, struct nlattr *opt) 230 seqcount_t *running, struct nlattr *opt)
226{ 231{
227 return gen_new_estimator(bstats, cpu_bstats, rate_est, 232 return gen_new_estimator(bstats, cpu_bstats, rate_est,
228 stats_lock, running, opt); 233 lock, running, opt);
229} 234}
230EXPORT_SYMBOL(gen_replace_estimator); 235EXPORT_SYMBOL(gen_replace_estimator);
231 236
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index b2b2323bdc84..188d693cb251 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -77,8 +77,20 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
77 d->lock = lock; 77 d->lock = lock;
78 spin_lock_bh(lock); 78 spin_lock_bh(lock);
79 } 79 }
80 if (d->tail) 80 if (d->tail) {
81 return gnet_stats_copy(d, type, NULL, 0, padattr); 81 int ret = gnet_stats_copy(d, type, NULL, 0, padattr);
82
83 /* The initial attribute added in gnet_stats_copy() may be
84 * preceded by a padding attribute, in which case d->tail will
85 * end up pointing at the padding instead of the real attribute.
86 * Fix this so gnet_stats_finish_copy() adjusts the length of
87 * the right attribute.
88 */
89 if (ret == 0 && d->tail->nla_type == padattr)
90 d->tail = (struct nlattr *)((char *)d->tail +
91 NLA_ALIGN(d->tail->nla_len));
92 return ret;
93 }
82 94
83 return 0; 95 return 0;
84} 96}
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index e7e626fb87bb..3e85437f7106 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -50,10 +50,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
50 * mixing with BH RCU lock doesn't work. 50 * mixing with BH RCU lock doesn't work.
51 */ 51 */
52 preempt_disable(); 52 preempt_disable();
53 rcu_read_lock();
54 bpf_compute_data_pointers(skb); 53 bpf_compute_data_pointers(skb);
55 ret = bpf_prog_run_save_cb(lwt->prog, skb); 54 ret = bpf_prog_run_save_cb(lwt->prog, skb);
56 rcu_read_unlock();
57 55
58 switch (ret) { 56 switch (ret) {
59 case BPF_OK: 57 case BPF_OK:
@@ -217,7 +215,7 @@ static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
217 if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) 215 if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
218 return -EINVAL; 216 return -EINVAL;
219 217
220 prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); 218 prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
221 if (!prog->name) 219 if (!prog->name)
222 return -ENOMEM; 220 return -ENOMEM;
223 221
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8e3fda9e725c..aa19d86937af 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1148,7 +1148,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
1148 neigh->nud_state = new; 1148 neigh->nud_state = new;
1149 err = 0; 1149 err = 0;
1150 notify = old & NUD_VALID; 1150 notify = old & NUD_VALID;
1151 if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && 1151 if (((old & (NUD_INCOMPLETE | NUD_PROBE)) ||
1152 (flags & NEIGH_UPDATE_F_ADMIN)) &&
1152 (new & NUD_FAILED)) { 1153 (new & NUD_FAILED)) {
1153 neigh_invalidate(neigh); 1154 neigh_invalidate(neigh);
1154 notify = 1; 1155 notify = 1;
@@ -3273,4 +3274,3 @@ static int __init neigh_init(void)
3273} 3274}
3274 3275
3275subsys_initcall(neigh_init); 3276subsys_initcall(neigh_init);
3276
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index bb7e80f4ced3..bd67c4d0fcfd 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -26,6 +26,7 @@
26#include <linux/pm_runtime.h> 26#include <linux/pm_runtime.h>
27#include <linux/of.h> 27#include <linux/of.h>
28#include <linux/of_net.h> 28#include <linux/of_net.h>
29#include <linux/cpu.h>
29 30
30#include "net-sysfs.h" 31#include "net-sysfs.h"
31 32
@@ -905,11 +906,20 @@ static const void *rx_queue_namespace(struct kobject *kobj)
905 return ns; 906 return ns;
906} 907}
907 908
909static void rx_queue_get_ownership(struct kobject *kobj,
910 kuid_t *uid, kgid_t *gid)
911{
912 const struct net *net = rx_queue_namespace(kobj);
913
914 net_ns_get_ownership(net, uid, gid);
915}
916
908static struct kobj_type rx_queue_ktype __ro_after_init = { 917static struct kobj_type rx_queue_ktype __ro_after_init = {
909 .sysfs_ops = &rx_queue_sysfs_ops, 918 .sysfs_ops = &rx_queue_sysfs_ops,
910 .release = rx_queue_release, 919 .release = rx_queue_release,
911 .default_attrs = rx_queue_default_attrs, 920 .default_attrs = rx_queue_default_attrs,
912 .namespace = rx_queue_namespace 921 .namespace = rx_queue_namespace,
922 .get_ownership = rx_queue_get_ownership,
913}; 923};
914 924
915static int rx_queue_add_kobject(struct net_device *dev, int index) 925static int rx_queue_add_kobject(struct net_device *dev, int index)
@@ -1047,13 +1057,30 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
1047 char *buf) 1057 char *buf)
1048{ 1058{
1049 struct net_device *dev = queue->dev; 1059 struct net_device *dev = queue->dev;
1050 int index = get_netdev_queue_index(queue); 1060 int index;
1051 int tc = netdev_txq_to_tc(dev, index); 1061 int tc;
1062
1063 if (!netif_is_multiqueue(dev))
1064 return -ENOENT;
1052 1065
1066 index = get_netdev_queue_index(queue);
1067
1068 /* If queue belongs to subordinate dev use its TC mapping */
1069 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
1070
1071 tc = netdev_txq_to_tc(dev, index);
1053 if (tc < 0) 1072 if (tc < 0)
1054 return -EINVAL; 1073 return -EINVAL;
1055 1074
1056 return sprintf(buf, "%u\n", tc); 1075 /* We can report the traffic class one of two ways:
1076 * Subordinate device traffic classes are reported with the traffic
1077 * class first, and then the subordinate class so for example TC0 on
1078 * subordinate device 2 will be reported as "0-2". If the queue
1079 * belongs to the root device it will be reported with just the
1080 * traffic class, so just "0" for TC 0 for example.
1081 */
1082 return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) :
1083 sprintf(buf, "%u\n", tc);
1057} 1084}
1058 1085
1059#ifdef CONFIG_XPS 1086#ifdef CONFIG_XPS
@@ -1070,6 +1097,9 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue,
1070 int err, index = get_netdev_queue_index(queue); 1097 int err, index = get_netdev_queue_index(queue);
1071 u32 rate = 0; 1098 u32 rate = 0;
1072 1099
1100 if (!capable(CAP_NET_ADMIN))
1101 return -EPERM;
1102
1073 err = kstrtou32(buf, 10, &rate); 1103 err = kstrtou32(buf, 10, &rate);
1074 if (err < 0) 1104 if (err < 0)
1075 return err; 1105 return err;
@@ -1214,10 +1244,20 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
1214 cpumask_var_t mask; 1244 cpumask_var_t mask;
1215 unsigned long index; 1245 unsigned long index;
1216 1246
1247 if (!netif_is_multiqueue(dev))
1248 return -ENOENT;
1249
1217 index = get_netdev_queue_index(queue); 1250 index = get_netdev_queue_index(queue);
1218 1251
1219 if (dev->num_tc) { 1252 if (dev->num_tc) {
1253 /* Do not allow XPS on subordinate device directly */
1220 num_tc = dev->num_tc; 1254 num_tc = dev->num_tc;
1255 if (num_tc < 0)
1256 return -EINVAL;
1257
1258 /* If queue belongs to subordinate dev use its map */
1259 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
1260
1221 tc = netdev_txq_to_tc(dev, index); 1261 tc = netdev_txq_to_tc(dev, index);
1222 if (tc < 0) 1262 if (tc < 0)
1223 return -EINVAL; 1263 return -EINVAL;
@@ -1227,13 +1267,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
1227 return -ENOMEM; 1267 return -ENOMEM;
1228 1268
1229 rcu_read_lock(); 1269 rcu_read_lock();
1230 dev_maps = rcu_dereference(dev->xps_maps); 1270 dev_maps = rcu_dereference(dev->xps_cpus_map);
1231 if (dev_maps) { 1271 if (dev_maps) {
1232 for_each_possible_cpu(cpu) { 1272 for_each_possible_cpu(cpu) {
1233 int i, tci = cpu * num_tc + tc; 1273 int i, tci = cpu * num_tc + tc;
1234 struct xps_map *map; 1274 struct xps_map *map;
1235 1275
1236 map = rcu_dereference(dev_maps->cpu_map[tci]); 1276 map = rcu_dereference(dev_maps->attr_map[tci]);
1237 if (!map) 1277 if (!map)
1238 continue; 1278 continue;
1239 1279
@@ -1260,6 +1300,9 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
1260 cpumask_var_t mask; 1300 cpumask_var_t mask;
1261 int err; 1301 int err;
1262 1302
1303 if (!netif_is_multiqueue(dev))
1304 return -ENOENT;
1305
1263 if (!capable(CAP_NET_ADMIN)) 1306 if (!capable(CAP_NET_ADMIN))
1264 return -EPERM; 1307 return -EPERM;
1265 1308
@@ -1283,6 +1326,91 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
1283 1326
1284static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init 1327static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
1285 = __ATTR_RW(xps_cpus); 1328 = __ATTR_RW(xps_cpus);
1329
1330static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
1331{
1332 struct net_device *dev = queue->dev;
1333 struct xps_dev_maps *dev_maps;
1334 unsigned long *mask, index;
1335 int j, len, num_tc = 1, tc = 0;
1336
1337 index = get_netdev_queue_index(queue);
1338
1339 if (dev->num_tc) {
1340 num_tc = dev->num_tc;
1341 tc = netdev_txq_to_tc(dev, index);
1342 if (tc < 0)
1343 return -EINVAL;
1344 }
1345 mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
1346 GFP_KERNEL);
1347 if (!mask)
1348 return -ENOMEM;
1349
1350 rcu_read_lock();
1351 dev_maps = rcu_dereference(dev->xps_rxqs_map);
1352 if (!dev_maps)
1353 goto out_no_maps;
1354
1355 for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
1356 j < dev->num_rx_queues;) {
1357 int i, tci = j * num_tc + tc;
1358 struct xps_map *map;
1359
1360 map = rcu_dereference(dev_maps->attr_map[tci]);
1361 if (!map)
1362 continue;
1363
1364 for (i = map->len; i--;) {
1365 if (map->queues[i] == index) {
1366 set_bit(j, mask);
1367 break;
1368 }
1369 }
1370 }
1371out_no_maps:
1372 rcu_read_unlock();
1373
1374 len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
1375 kfree(mask);
1376
1377 return len < PAGE_SIZE ? len : -EINVAL;
1378}
1379
1380static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
1381 size_t len)
1382{
1383 struct net_device *dev = queue->dev;
1384 struct net *net = dev_net(dev);
1385 unsigned long *mask, index;
1386 int err;
1387
1388 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1389 return -EPERM;
1390
1391 mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
1392 GFP_KERNEL);
1393 if (!mask)
1394 return -ENOMEM;
1395
1396 index = get_netdev_queue_index(queue);
1397
1398 err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
1399 if (err) {
1400 kfree(mask);
1401 return err;
1402 }
1403
1404 cpus_read_lock();
1405 err = __netif_set_xps_queue(dev, mask, index, true);
1406 cpus_read_unlock();
1407
1408 kfree(mask);
1409 return err ? : len;
1410}
1411
1412static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
1413 = __ATTR_RW(xps_rxqs);
1286#endif /* CONFIG_XPS */ 1414#endif /* CONFIG_XPS */
1287 1415
1288static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { 1416static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
@@ -1290,6 +1418,7 @@ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
1290 &queue_traffic_class.attr, 1418 &queue_traffic_class.attr,
1291#ifdef CONFIG_XPS 1419#ifdef CONFIG_XPS
1292 &xps_cpus_attribute.attr, 1420 &xps_cpus_attribute.attr,
1421 &xps_rxqs_attribute.attr,
1293 &queue_tx_maxrate.attr, 1422 &queue_tx_maxrate.attr,
1294#endif 1423#endif
1295 NULL 1424 NULL
@@ -1315,11 +1444,20 @@ static const void *netdev_queue_namespace(struct kobject *kobj)
1315 return ns; 1444 return ns;
1316} 1445}
1317 1446
1447static void netdev_queue_get_ownership(struct kobject *kobj,
1448 kuid_t *uid, kgid_t *gid)
1449{
1450 const struct net *net = netdev_queue_namespace(kobj);
1451
1452 net_ns_get_ownership(net, uid, gid);
1453}
1454
1318static struct kobj_type netdev_queue_ktype __ro_after_init = { 1455static struct kobj_type netdev_queue_ktype __ro_after_init = {
1319 .sysfs_ops = &netdev_queue_sysfs_ops, 1456 .sysfs_ops = &netdev_queue_sysfs_ops,
1320 .release = netdev_queue_release, 1457 .release = netdev_queue_release,
1321 .default_attrs = netdev_queue_default_attrs, 1458 .default_attrs = netdev_queue_default_attrs,
1322 .namespace = netdev_queue_namespace, 1459 .namespace = netdev_queue_namespace,
1460 .get_ownership = netdev_queue_get_ownership,
1323}; 1461};
1324 1462
1325static int netdev_queue_add_kobject(struct net_device *dev, int index) 1463static int netdev_queue_add_kobject(struct net_device *dev, int index)
@@ -1509,6 +1647,14 @@ static const void *net_namespace(struct device *d)
1509 return dev_net(dev); 1647 return dev_net(dev);
1510} 1648}
1511 1649
1650static void net_get_ownership(struct device *d, kuid_t *uid, kgid_t *gid)
1651{
1652 struct net_device *dev = to_net_dev(d);
1653 const struct net *net = dev_net(dev);
1654
1655 net_ns_get_ownership(net, uid, gid);
1656}
1657
1512static struct class net_class __ro_after_init = { 1658static struct class net_class __ro_after_init = {
1513 .name = "net", 1659 .name = "net",
1514 .dev_release = netdev_release, 1660 .dev_release = netdev_release,
@@ -1516,6 +1662,7 @@ static struct class net_class __ro_after_init = {
1516 .dev_uevent = netdev_uevent, 1662 .dev_uevent = netdev_uevent,
1517 .ns_type = &net_ns_type_operations, 1663 .ns_type = &net_ns_type_operations,
1518 .namespace = net_namespace, 1664 .namespace = net_namespace,
1665 .get_ownership = net_get_ownership,
1519}; 1666};
1520 1667
1521#ifdef CONFIG_OF_NET 1668#ifdef CONFIG_OF_NET
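
The new per-tx-queue xps_rxqs attribute added above takes a bitmap of receive queues, parsed with bitmap_parse() and applied via __netif_set_xps_queue(). A hypothetical userspace snippet exercising it follows; the device name, queue index and mask are assumptions, and writing requires CAP_NET_ADMIN:

/* Hypothetical sketch, not part of this patch: steer transmits for tx-0
 * of eth0 based on traffic received on rx queue 0.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/class/net/eth0/queues/tx-0/xps_rxqs";
	const char *mask = "1\n";	/* bitmap: rx queue 0 only */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open xps_rxqs");
		return 1;
	}
	if (write(fd, mask, strlen(mask)) < 0)
		perror("write xps_rxqs");	/* -EPERM without CAP_NET_ADMIN */
	close(fd);
	return 0;
}
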
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a11e03f920d3..670c84b1bfc2 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -17,6 +17,7 @@
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/net_namespace.h> 18#include <linux/net_namespace.h>
19#include <linux/sched/task.h> 19#include <linux/sched/task.h>
20#include <linux/uidgid.h>
20 21
21#include <net/sock.h> 22#include <net/sock.h>
22#include <net/netlink.h> 23#include <net/netlink.h>
@@ -448,6 +449,33 @@ dec_ucounts:
448 return net; 449 return net;
449} 450}
450 451
452/**
453 * net_ns_get_ownership - get sysfs ownership data for @net
454 * @net: network namespace in question (can be NULL)
455 * @uid: kernel user ID for sysfs objects
456 * @gid: kernel group ID for sysfs objects
457 *
458 * Returns the uid/gid pair of root in the user namespace associated with the
459 * given network namespace.
460 */
461void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
462{
463 if (net) {
464 kuid_t ns_root_uid = make_kuid(net->user_ns, 0);
465 kgid_t ns_root_gid = make_kgid(net->user_ns, 0);
466
467 if (uid_valid(ns_root_uid))
468 *uid = ns_root_uid;
469
470 if (gid_valid(ns_root_gid))
471 *gid = ns_root_gid;
472 } else {
473 *uid = GLOBAL_ROOT_UID;
474 *gid = GLOBAL_ROOT_GID;
475 }
476}
477EXPORT_SYMBOL_GPL(net_ns_get_ownership);
478
451static void unhash_nsid(struct net *net, struct net *last) 479static void unhash_nsid(struct net *net, struct net *last)
452{ 480{
453 struct net *tmp; 481 struct net *tmp;
@@ -973,22 +1001,18 @@ static int register_pernet_operations(struct list_head *list,
973 int error; 1001 int error;
974 1002
975 if (ops->id) { 1003 if (ops->id) {
976again: 1004 error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
977 error = ida_get_new_above(&net_generic_ids, MIN_PERNET_OPS_ID, ops->id); 1005 GFP_KERNEL);
978 if (error < 0) { 1006 if (error < 0)
979 if (error == -EAGAIN) {
980 ida_pre_get(&net_generic_ids, GFP_KERNEL);
981 goto again;
982 }
983 return error; 1007 return error;
984 } 1008 *ops->id = error;
985 max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1); 1009 max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
986 } 1010 }
987 error = __register_pernet_operations(list, ops); 1011 error = __register_pernet_operations(list, ops);
988 if (error) { 1012 if (error) {
989 rcu_barrier(); 1013 rcu_barrier();
990 if (ops->id) 1014 if (ops->id)
991 ida_remove(&net_generic_ids, *ops->id); 1015 ida_free(&net_generic_ids, *ops->id);
992 } 1016 }
993 1017
994 return error; 1018 return error;
@@ -999,7 +1023,7 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
999 __unregister_pernet_operations(ops); 1023 __unregister_pernet_operations(ops);
1000 rcu_barrier(); 1024 rcu_barrier();
1001 if (ops->id) 1025 if (ops->id)
1002 ida_remove(&net_generic_ids, *ops->id); 1026 ida_free(&net_generic_ids, *ops->id);
1003} 1027}
1004 1028
1005/** 1029/**
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 68bf07206744..43a932cb609b 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -269,7 +269,7 @@ static void __page_pool_empty_ring(struct page_pool *pool)
269 struct page *page; 269 struct page *page;
270 270
271 /* Empty recycle ring */ 271 /* Empty recycle ring */
272 while ((page = ptr_ring_consume(&pool->ring))) { 272 while ((page = ptr_ring_consume_bh(&pool->ring))) {
273 /* Verify the refcnt invariant of cached pages */ 273 /* Verify the refcnt invariant of cached pages */
274 if (!(page_ref_count(page) == 1)) 274 if (!(page_ref_count(page) == 1))
275 pr_crit("%s() page_pool refcnt %d violation\n", 275 pr_crit("%s() page_pool refcnt %d violation\n",
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 49368e21d228..7f6938405fa1 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1265,7 +1265,7 @@ static ssize_t pktgen_if_write(struct file *file,
1265 buf[len] = 0; 1265 buf[len] = 0;
1266 if (strcmp(buf, pkt_dev->dst_min) != 0) { 1266 if (strcmp(buf, pkt_dev->dst_min) != 0) {
1267 memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); 1267 memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
1268 strncpy(pkt_dev->dst_min, buf, len); 1268 strcpy(pkt_dev->dst_min, buf);
1269 pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); 1269 pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
1270 pkt_dev->cur_daddr = pkt_dev->daddr_min; 1270 pkt_dev->cur_daddr = pkt_dev->daddr_min;
1271 } 1271 }
@@ -1280,14 +1280,12 @@ static ssize_t pktgen_if_write(struct file *file,
1280 if (len < 0) 1280 if (len < 0)
1281 return len; 1281 return len;
1282 1282
1283
1284 if (copy_from_user(buf, &user_buffer[i], len)) 1283 if (copy_from_user(buf, &user_buffer[i], len))
1285 return -EFAULT; 1284 return -EFAULT;
1286
1287 buf[len] = 0; 1285 buf[len] = 0;
1288 if (strcmp(buf, pkt_dev->dst_max) != 0) { 1286 if (strcmp(buf, pkt_dev->dst_max) != 0) {
1289 memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); 1287 memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
1290 strncpy(pkt_dev->dst_max, buf, len); 1288 strcpy(pkt_dev->dst_max, buf);
1291 pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); 1289 pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
1292 pkt_dev->cur_daddr = pkt_dev->daddr_max; 1290 pkt_dev->cur_daddr = pkt_dev->daddr_max;
1293 } 1291 }
@@ -1396,7 +1394,7 @@ static ssize_t pktgen_if_write(struct file *file,
1396 buf[len] = 0; 1394 buf[len] = 0;
1397 if (strcmp(buf, pkt_dev->src_min) != 0) { 1395 if (strcmp(buf, pkt_dev->src_min) != 0) {
1398 memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); 1396 memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
1399 strncpy(pkt_dev->src_min, buf, len); 1397 strcpy(pkt_dev->src_min, buf);
1400 pkt_dev->saddr_min = in_aton(pkt_dev->src_min); 1398 pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
1401 pkt_dev->cur_saddr = pkt_dev->saddr_min; 1399 pkt_dev->cur_saddr = pkt_dev->saddr_min;
1402 } 1400 }
@@ -1416,7 +1414,7 @@ static ssize_t pktgen_if_write(struct file *file,
1416 buf[len] = 0; 1414 buf[len] = 0;
1417 if (strcmp(buf, pkt_dev->src_max) != 0) { 1415 if (strcmp(buf, pkt_dev->src_max) != 0) {
1418 memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); 1416 memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
1419 strncpy(pkt_dev->src_max, buf, len); 1417 strcpy(pkt_dev->src_max, buf);
1420 pkt_dev->saddr_max = in_aton(pkt_dev->src_max); 1418 pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
1421 pkt_dev->cur_saddr = pkt_dev->saddr_max; 1419 pkt_dev->cur_saddr = pkt_dev->saddr_max;
1422 } 1420 }
@@ -2255,7 +2253,7 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
2255 x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); 2253 x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
2256 } else { 2254 } else {
2257 /* slow path: we dont already have xfrm_state */ 2255 /* slow path: we dont already have xfrm_state */
2258 x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 2256 x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0,
2259 (xfrm_address_t *)&pkt_dev->cur_daddr, 2257 (xfrm_address_t *)&pkt_dev->cur_daddr,
2260 (xfrm_address_t *)&pkt_dev->cur_saddr, 2258 (xfrm_address_t *)&pkt_dev->cur_saddr,
2261 AF_INET, 2259 AF_INET,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5ef61222fdef..24431e578310 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -964,7 +964,8 @@ static size_t rtnl_xdp_size(void)
964{ 964{
965 size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ 965 size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
966 nla_total_size(1) + /* XDP_ATTACHED */ 966 nla_total_size(1) + /* XDP_ATTACHED */
967 nla_total_size(4); /* XDP_PROG_ID */ 967 nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */
968 nla_total_size(4); /* XDP_<mode>_PROG_ID */
968 969
969 return xdp_size; 970 return xdp_size;
970} 971}
@@ -1014,6 +1015,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
1014 + nla_total_size(4) /* IFLA_IF_NETNSID */ 1015 + nla_total_size(4) /* IFLA_IF_NETNSID */
1015 + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ 1016 + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */
1016 + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ 1017 + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */
1018 + nla_total_size(4) /* IFLA_MIN_MTU */
1019 + nla_total_size(4) /* IFLA_MAX_MTU */
1017 + 0; 1020 + 0;
1018} 1021}
1019 1022
@@ -1353,27 +1356,51 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
1353 return 0; 1356 return 0;
1354} 1357}
1355 1358
1356static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) 1359static u32 rtnl_xdp_prog_skb(struct net_device *dev)
1357{ 1360{
1358 const struct net_device_ops *ops = dev->netdev_ops;
1359 const struct bpf_prog *generic_xdp_prog; 1361 const struct bpf_prog *generic_xdp_prog;
1360 struct netdev_bpf xdp;
1361 1362
1362 ASSERT_RTNL(); 1363 ASSERT_RTNL();
1363 1364
1364 *prog_id = 0;
1365 generic_xdp_prog = rtnl_dereference(dev->xdp_prog); 1365 generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
1366 if (generic_xdp_prog) { 1366 if (!generic_xdp_prog)
1367 *prog_id = generic_xdp_prog->aux->id; 1367 return 0;
1368 return XDP_ATTACHED_SKB; 1368 return generic_xdp_prog->aux->id;
1369 } 1369}
1370 if (!ops->ndo_bpf) 1370
1371 return XDP_ATTACHED_NONE; 1371static u32 rtnl_xdp_prog_drv(struct net_device *dev)
1372{
1373 return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
1374}
1375
1376static u32 rtnl_xdp_prog_hw(struct net_device *dev)
1377{
1378 return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf,
1379 XDP_QUERY_PROG_HW);
1380}
1381
1382static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
1383 u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr,
1384 u32 (*get_prog_id)(struct net_device *dev))
1385{
1386 u32 curr_id;
1387 int err;
1388
1389 curr_id = get_prog_id(dev);
1390 if (!curr_id)
1391 return 0;
1372 1392
1373 __dev_xdp_query(dev, ops->ndo_bpf, &xdp); 1393 *prog_id = curr_id;
1374 *prog_id = xdp.prog_id; 1394 err = nla_put_u32(skb, attr, curr_id);
1395 if (err)
1396 return err;
1397
1398 if (*mode != XDP_ATTACHED_NONE)
1399 *mode = XDP_ATTACHED_MULTI;
1400 else
1401 *mode = tgt_mode;
1375 1402
1376 return xdp.prog_attached; 1403 return 0;
1377} 1404}
1378 1405
1379static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) 1406static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1381,17 +1408,32 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
1381 struct nlattr *xdp; 1408 struct nlattr *xdp;
1382 u32 prog_id; 1409 u32 prog_id;
1383 int err; 1410 int err;
1411 u8 mode;
1384 1412
1385 xdp = nla_nest_start(skb, IFLA_XDP); 1413 xdp = nla_nest_start(skb, IFLA_XDP);
1386 if (!xdp) 1414 if (!xdp)
1387 return -EMSGSIZE; 1415 return -EMSGSIZE;
1388 1416
1389 err = nla_put_u8(skb, IFLA_XDP_ATTACHED, 1417 prog_id = 0;
1390 rtnl_xdp_attached_mode(dev, &prog_id)); 1418 mode = XDP_ATTACHED_NONE;
1419 err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB,
1420 IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb);
1421 if (err)
1422 goto err_cancel;
1423 err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV,
1424 IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv);
1425 if (err)
1426 goto err_cancel;
1427 err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW,
1428 IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw);
1429 if (err)
1430 goto err_cancel;
1431
1432 err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
1391 if (err) 1433 if (err)
1392 goto err_cancel; 1434 goto err_cancel;
1393 1435
1394 if (prog_id) { 1436 if (prog_id && mode != XDP_ATTACHED_MULTI) {
1395 err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); 1437 err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
1396 if (err) 1438 if (err)
1397 goto err_cancel; 1439 goto err_cancel;
@@ -1561,6 +1603,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
1561 netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || 1603 netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
1562 nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || 1604 nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
1563 nla_put_u32(skb, IFLA_MTU, dev->mtu) || 1605 nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
1606 nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) ||
1607 nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) ||
1564 nla_put_u32(skb, IFLA_GROUP, dev->group) || 1608 nla_put_u32(skb, IFLA_GROUP, dev->group) ||
1565 nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || 1609 nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
1566 nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || 1610 nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
@@ -1692,6 +1736,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
1692 [IFLA_IF_NETNSID] = { .type = NLA_S32 }, 1736 [IFLA_IF_NETNSID] = { .type = NLA_S32 },
1693 [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, 1737 [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 },
1694 [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, 1738 [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
1739 [IFLA_MIN_MTU] = { .type = NLA_U32 },
1740 [IFLA_MAX_MTU] = { .type = NLA_U32 },
1695}; 1741};
1696 1742
1697static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { 1743static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2336,7 +2382,7 @@ static int do_setlink(const struct sk_buff *skb,
2336 } 2382 }
2337 2383
2338 if (tb[IFLA_MTU]) { 2384 if (tb[IFLA_MTU]) {
2339 err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); 2385 err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
2340 if (err < 0) 2386 if (err < 0)
2341 goto errout; 2387 goto errout;
2342 status |= DO_SETLINK_MODIFIED; 2388 status |= DO_SETLINK_MODIFIED;
@@ -2759,9 +2805,12 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
2759 return err; 2805 return err;
2760 } 2806 }
2761 2807
2762 dev->rtnl_link_state = RTNL_LINK_INITIALIZED; 2808 if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
2763 2809 __dev_notify_flags(dev, old_flags, 0U);
2764 __dev_notify_flags(dev, old_flags, ~0U); 2810 } else {
2811 dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
2812 __dev_notify_flags(dev, old_flags, ~0U);
2813 }
2765 return 0; 2814 return 0;
2766} 2815}
2767EXPORT_SYMBOL(rtnl_configure_link); 2816EXPORT_SYMBOL(rtnl_configure_link);
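The rtnl_xdp_fill() rework above reports one program ID per attachment mode (IFLA_XDP_SKB_PROG_ID, IFLA_XDP_DRV_PROG_ID, IFLA_XDP_HW_PROG_ID) and sets IFLA_XDP_ATTACHED to XDP_ATTACHED_MULTI when more than one mode is active. A minimal userspace sketch, not part of this patch, of walking the resulting IFLA_XDP nest with the standard rtattr macros (assumes a 4.19-era <linux/if_link.h>):

#include <stdio.h>
#include <linux/if_link.h>
#include <linux/rtnetlink.h>

/* xdp_nest points at the IFLA_XDP attribute of an RTM_NEWLINK message */
static void dump_xdp_nest(const struct rtattr *xdp_nest)
{
        struct rtattr *a = RTA_DATA(xdp_nest);
        int rem = RTA_PAYLOAD(xdp_nest);

        for (; RTA_OK(a, rem); a = RTA_NEXT(a, rem)) {
                switch (a->rta_type) {
                case IFLA_XDP_ATTACHED: /* XDP_ATTACHED_{NONE,DRV,SKB,HW,MULTI} */
                        printf("attached mode: %u\n", *(__u8 *)RTA_DATA(a));
                        break;
                case IFLA_XDP_PROG_ID:  /* legacy single-mode attribute */
                case IFLA_XDP_SKB_PROG_ID:
                case IFLA_XDP_DRV_PROG_ID:
                case IFLA_XDP_HW_PROG_ID:
                        printf("attr %u: prog id %u\n",
                               (unsigned int)a->rta_type, *(__u32 *)RTA_DATA(a));
                        break;
                }
        }
}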
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 7232274de334..af6ad467ed61 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -140,6 +140,7 @@ u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
140 &net_secret); 140 &net_secret);
141 return seq_scale(hash); 141 return seq_scale(hash);
142} 142}
143EXPORT_SYMBOL_GPL(secure_tcp_seq);
143 144
144u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) 145u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
145{ 146{
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c642304f178c..c996c09d095f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -858,6 +858,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
858 n->cloned = 1; 858 n->cloned = 1;
859 n->nohdr = 0; 859 n->nohdr = 0;
860 n->peeked = 0; 860 n->peeked = 0;
861 C(pfmemalloc);
861 n->destructor = NULL; 862 n->destructor = NULL;
862 C(tail); 863 C(tail);
863 C(end); 864 C(end);
@@ -1290,7 +1291,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
1290} 1291}
1291EXPORT_SYMBOL(skb_clone); 1292EXPORT_SYMBOL(skb_clone);
1292 1293
1293static void skb_headers_offset_update(struct sk_buff *skb, int off) 1294void skb_headers_offset_update(struct sk_buff *skb, int off)
1294{ 1295{
1295 /* Only adjust this if it actually is csum_start rather than csum */ 1296 /* Only adjust this if it actually is csum_start rather than csum */
1296 if (skb->ip_summed == CHECKSUM_PARTIAL) 1297 if (skb->ip_summed == CHECKSUM_PARTIAL)
@@ -1304,6 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
1304 skb->inner_network_header += off; 1305 skb->inner_network_header += off;
1305 skb->inner_mac_header += off; 1306 skb->inner_mac_header += off;
1306} 1307}
1308EXPORT_SYMBOL(skb_headers_offset_update);
1307 1309
1308void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) 1310void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
1309{ 1311{
@@ -1714,7 +1716,7 @@ void *skb_push(struct sk_buff *skb, unsigned int len)
1714{ 1716{
1715 skb->data -= len; 1717 skb->data -= len;
1716 skb->len += len; 1718 skb->len += len;
1717 if (unlikely(skb->data<skb->head)) 1719 if (unlikely(skb->data < skb->head))
1718 skb_under_panic(skb, len, __builtin_return_address(0)); 1720 skb_under_panic(skb, len, __builtin_return_address(0));
1719 return skb->data; 1721 return skb->data;
1720} 1722}
@@ -2857,23 +2859,27 @@ EXPORT_SYMBOL(skb_queue_purge);
2857/** 2859/**
2858 * skb_rbtree_purge - empty a skb rbtree 2860 * skb_rbtree_purge - empty a skb rbtree
2859 * @root: root of the rbtree to empty 2861 * @root: root of the rbtree to empty
2862 * Return value: the sum of truesizes of all purged skbs.
2860 * 2863 *
2861 * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from 2864 * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
2862 * the list and one reference dropped. This function does not take 2865 * the list and one reference dropped. This function does not take
2863 * any lock. Synchronization should be handled by the caller (e.g., TCP 2866 * any lock. Synchronization should be handled by the caller (e.g., TCP
2864 * out-of-order queue is protected by the socket lock). 2867 * out-of-order queue is protected by the socket lock).
2865 */ 2868 */
2866void skb_rbtree_purge(struct rb_root *root) 2869unsigned int skb_rbtree_purge(struct rb_root *root)
2867{ 2870{
2868 struct rb_node *p = rb_first(root); 2871 struct rb_node *p = rb_first(root);
2872 unsigned int sum = 0;
2869 2873
2870 while (p) { 2874 while (p) {
2871 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); 2875 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
2872 2876
2873 p = rb_next(p); 2877 p = rb_next(p);
2874 rb_erase(&skb->rbnode, root); 2878 rb_erase(&skb->rbnode, root);
2879 sum += skb->truesize;
2875 kfree_skb(skb); 2880 kfree_skb(skb);
2876 } 2881 }
2882 return sum;
2877} 2883}
2878 2884
2879/** 2885/**
@@ -3719,6 +3725,7 @@ normal:
3719 net_warn_ratelimited( 3725 net_warn_ratelimited(
3720 "skb_segment: too many frags: %u %u\n", 3726 "skb_segment: too many frags: %u %u\n",
3721 pos, mss); 3727 pos, mss);
3728 err = -EINVAL;
3722 goto err; 3729 goto err;
3723 } 3730 }
3724 3731
@@ -3752,11 +3759,10 @@ skip_fraglist:
3752 3759
3753perform_csum_check: 3760perform_csum_check:
3754 if (!csum) { 3761 if (!csum) {
3755 if (skb_has_shared_frag(nskb)) { 3762 if (skb_has_shared_frag(nskb) &&
3756 err = __skb_linearize(nskb); 3763 __skb_linearize(nskb))
3757 if (err) 3764 goto err;
3758 goto err; 3765
3759 }
3760 if (!nskb->remcsum_offload) 3766 if (!nskb->remcsum_offload)
3761 nskb->ip_summed = CHECKSUM_NONE; 3767 nskb->ip_summed = CHECKSUM_NONE;
3762 SKB_GSO_CB(nskb)->csum = 3768 SKB_GSO_CB(nskb)->csum =
@@ -3815,14 +3821,14 @@ err:
3815} 3821}
3816EXPORT_SYMBOL_GPL(skb_segment); 3822EXPORT_SYMBOL_GPL(skb_segment);
3817 3823
3818int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) 3824int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
3819{ 3825{
3820 struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); 3826 struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
3821 unsigned int offset = skb_gro_offset(skb); 3827 unsigned int offset = skb_gro_offset(skb);
3822 unsigned int headlen = skb_headlen(skb); 3828 unsigned int headlen = skb_headlen(skb);
3823 unsigned int len = skb_gro_len(skb); 3829 unsigned int len = skb_gro_len(skb);
3824 struct sk_buff *lp, *p = *head;
3825 unsigned int delta_truesize; 3830 unsigned int delta_truesize;
3831 struct sk_buff *lp;
3826 3832
3827 if (unlikely(p->len + len >= 65536)) 3833 if (unlikely(p->len + len >= 65536))
3828 return -E2BIG; 3834 return -E2BIG;
@@ -4898,7 +4904,6 @@ EXPORT_SYMBOL(skb_try_coalesce);
4898 */ 4904 */
4899void skb_scrub_packet(struct sk_buff *skb, bool xnet) 4905void skb_scrub_packet(struct sk_buff *skb, bool xnet)
4900{ 4906{
4901 skb->tstamp = 0;
4902 skb->pkt_type = PACKET_HOST; 4907 skb->pkt_type = PACKET_HOST;
4903 skb->skb_iif = 0; 4908 skb->skb_iif = 0;
4904 skb->ignore_df = 0; 4909 skb->ignore_df = 0;
@@ -4911,8 +4916,8 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
4911 return; 4916 return;
4912 4917
4913 ipvs_reset(skb); 4918 ipvs_reset(skb);
4914 skb_orphan(skb);
4915 skb->mark = 0; 4919 skb->mark = 0;
4920 skb->tstamp = 0;
4916} 4921}
4917EXPORT_SYMBOL_GPL(skb_scrub_packet); 4922EXPORT_SYMBOL_GPL(skb_scrub_packet);
4918 4923
@@ -5276,8 +5281,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
5276 if (npages >= 1 << order) { 5281 if (npages >= 1 << order) {
5277 page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | 5282 page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
5278 __GFP_COMP | 5283 __GFP_COMP |
5279 __GFP_NOWARN | 5284 __GFP_NOWARN,
5280 __GFP_NORETRY,
5281 order); 5285 order);
5282 if (page) 5286 if (page)
5283 goto fill_page; 5287 goto fill_page;
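skb_rbtree_purge() above now returns the total truesize of the freed skbs, so callers can see how much memory a purge released. A purely illustrative caller (only skb_rbtree_purge() is real here, the rest is made up):

/* Hypothetical helper: drop an out-of-order rb-tree queue and trace how
 * much skb truesize was released by the purge.
 */
static void my_flush_ooo_queue(struct rb_root *ooo_queue)
{
        unsigned int freed = skb_rbtree_purge(ooo_queue);

        if (freed)
                pr_debug("released %u bytes of skb truesize\n", freed);
}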
diff --git a/net/core/sock.c b/net/core/sock.c
index bcc41829a16d..3730eb855095 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -91,6 +91,7 @@
91 91
92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 93
94#include <asm/unaligned.h>
94#include <linux/capability.h> 95#include <linux/capability.h>
95#include <linux/errno.h> 96#include <linux/errno.h>
96#include <linux/errqueue.h> 97#include <linux/errqueue.h>
@@ -249,58 +250,13 @@ static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
249 _sock_locks("k-clock-") 250 _sock_locks("k-clock-")
250}; 251};
251static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 252static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
252 "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" , 253 _sock_locks("rlock-")
253 "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK",
254 "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" ,
255 "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" ,
256 "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" ,
257 "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" ,
258 "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" ,
259 "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" ,
260 "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" ,
261 "rlock-27" , "rlock-28" , "rlock-AF_CAN" ,
262 "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" ,
263 "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" ,
264 "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" ,
265 "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" ,
266 "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" ,
267 "rlock-AF_MAX"
268}; 254};
269static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 255static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
270 "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , 256 _sock_locks("wlock-")
271 "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK",
272 "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" ,
273 "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" ,
274 "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" ,
275 "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" ,
276 "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" ,
277 "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" ,
278 "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" ,
279 "wlock-27" , "wlock-28" , "wlock-AF_CAN" ,
280 "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" ,
281 "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" ,
282 "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" ,
283 "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" ,
284 "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" ,
285 "wlock-AF_MAX"
286}; 257};
287static const char *const af_family_elock_key_strings[AF_MAX+1] = { 258static const char *const af_family_elock_key_strings[AF_MAX+1] = {
288 "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , 259 _sock_locks("elock-")
289 "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK",
290 "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" ,
291 "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" ,
292 "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" ,
293 "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" ,
294 "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" ,
295 "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" ,
296 "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" ,
297 "elock-27" , "elock-28" , "elock-AF_CAN" ,
298 "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" ,
299 "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" ,
300 "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" ,
301 "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" ,
302 "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" ,
303 "elock-AF_MAX"
304}; 260};
305 261
306/* 262/*
@@ -697,6 +653,7 @@ EXPORT_SYMBOL(sk_mc_loop);
697int sock_setsockopt(struct socket *sock, int level, int optname, 653int sock_setsockopt(struct socket *sock, int level, int optname,
698 char __user *optval, unsigned int optlen) 654 char __user *optval, unsigned int optlen)
699{ 655{
656 struct sock_txtime sk_txtime;
700 struct sock *sk = sock->sk; 657 struct sock *sk = sock->sk;
701 int val; 658 int val;
702 int valbool; 659 int valbool;
@@ -1070,6 +1027,26 @@ set_rcvbuf:
1070 } 1027 }
1071 break; 1028 break;
1072 1029
1030 case SO_TXTIME:
1031 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1032 ret = -EPERM;
1033 } else if (optlen != sizeof(struct sock_txtime)) {
1034 ret = -EINVAL;
1035 } else if (copy_from_user(&sk_txtime, optval,
1036 sizeof(struct sock_txtime))) {
1037 ret = -EFAULT;
1038 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1039 ret = -EINVAL;
1040 } else {
1041 sock_valbool_flag(sk, SOCK_TXTIME, true);
1042 sk->sk_clockid = sk_txtime.clockid;
1043 sk->sk_txtime_deadline_mode =
1044 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1045 sk->sk_txtime_report_errors =
1046 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1047 }
1048 break;
1049
1073 default: 1050 default:
1074 ret = -ENOPROTOOPT; 1051 ret = -ENOPROTOOPT;
1075 break; 1052 break;
@@ -1115,6 +1092,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
1115 u64 val64; 1092 u64 val64;
1116 struct linger ling; 1093 struct linger ling;
1117 struct timeval tm; 1094 struct timeval tm;
1095 struct sock_txtime txtime;
1118 } v; 1096 } v;
1119 1097
1120 int lv = sizeof(int); 1098 int lv = sizeof(int);
@@ -1403,6 +1381,15 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
1403 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1381 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1404 break; 1382 break;
1405 1383
1384 case SO_TXTIME:
1385 lv = sizeof(v.txtime);
1386 v.txtime.clockid = sk->sk_clockid;
1387 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1388 SOF_TXTIME_DEADLINE_MODE : 0;
1389 v.txtime.flags |= sk->sk_txtime_report_errors ?
1390 SOF_TXTIME_REPORT_ERRORS : 0;
1391 break;
1392
1406 default: 1393 default:
1407 /* We implement the SO_SNDLOWAT etc to not be settable 1394 /* We implement the SO_SNDLOWAT etc to not be settable
1408 * (1003.1g 7). 1395 * (1003.1g 7).
@@ -2137,6 +2124,13 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2137 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2124 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2138 sockc->tsflags |= tsflags; 2125 sockc->tsflags |= tsflags;
2139 break; 2126 break;
2127 case SCM_TXTIME:
2128 if (!sock_flag(sk, SOCK_TXTIME))
2129 return -EINVAL;
2130 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2131 return -EINVAL;
2132 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2133 break;
2140 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 2134 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2141 case SCM_RIGHTS: 2135 case SCM_RIGHTS:
2142 case SCM_CREDENTIALS: 2136 case SCM_CREDENTIALS:
@@ -2277,9 +2271,9 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2277 pfrag->offset += use; 2271 pfrag->offset += use;
2278 2272
2279 sge = sg + sg_curr - 1; 2273 sge = sg + sg_curr - 1;
2280 if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page && 2274 if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2281 sg->offset + sg->length == orig_offset) { 2275 sge->offset + sge->length == orig_offset) {
2282 sg->length += use; 2276 sge->length += use;
2283 } else { 2277 } else {
2284 sge = sg + sg_curr; 2278 sge = sg + sg_curr;
2285 sg_unmark_end(sge); 2279 sg_unmark_end(sge);
@@ -2401,9 +2395,10 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2401{ 2395{
2402 struct proto *prot = sk->sk_prot; 2396 struct proto *prot = sk->sk_prot;
2403 long allocated = sk_memory_allocated_add(sk, amt); 2397 long allocated = sk_memory_allocated_add(sk, amt);
2398 bool charged = true;
2404 2399
2405 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2400 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2406 !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) 2401 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2407 goto suppress_allocation; 2402 goto suppress_allocation;
2408 2403
2409 /* Under limit. */ 2404 /* Under limit. */
@@ -2461,7 +2456,8 @@ suppress_allocation:
2461 return 1; 2456 return 1;
2462 } 2457 }
2463 2458
2464 trace_sock_exceed_buf_limit(sk, prot, allocated); 2459 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2460 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2465 2461
2466 sk_memory_allocated_sub(sk, amt); 2462 sk_memory_allocated_sub(sk, amt);
2467 2463
@@ -2818,6 +2814,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
2818 sk->sk_pacing_rate = ~0U; 2814 sk->sk_pacing_rate = ~0U;
2819 sk->sk_pacing_shift = 10; 2815 sk->sk_pacing_shift = 10;
2820 sk->sk_incoming_cpu = -1; 2816 sk->sk_incoming_cpu = -1;
2817
2818 sk_rx_queue_clear(sk);
2821 /* 2819 /*
2822 * Before updating sk_refcnt, we must commit prior changes to memory 2820 * Before updating sk_refcnt, we must commit prior changes to memory
2823 * (Documentation/RCU/rculist_nulls.txt for details) 2821 * (Documentation/RCU/rculist_nulls.txt for details)
@@ -2902,8 +2900,8 @@ EXPORT_SYMBOL(lock_sock_fast);
2902int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) 2900int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2903{ 2901{
2904 struct timeval tv; 2902 struct timeval tv;
2905 if (!sock_flag(sk, SOCK_TIMESTAMP)) 2903
2906 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 2904 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2907 tv = ktime_to_timeval(sk->sk_stamp); 2905 tv = ktime_to_timeval(sk->sk_stamp);
2908 if (tv.tv_sec == -1) 2906 if (tv.tv_sec == -1)
2909 return -ENOENT; 2907 return -ENOENT;
@@ -2918,8 +2916,8 @@ EXPORT_SYMBOL(sock_get_timestamp);
2918int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) 2916int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2919{ 2917{
2920 struct timespec ts; 2918 struct timespec ts;
2921 if (!sock_flag(sk, SOCK_TIMESTAMP)) 2919
2922 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 2920 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2923 ts = ktime_to_timespec(sk->sk_stamp); 2921 ts = ktime_to_timespec(sk->sk_stamp);
2924 if (ts.tv_sec == -1) 2922 if (ts.tv_sec == -1)
2925 return -ENOENT; 2923 return -ENOENT;
@@ -3243,7 +3241,8 @@ static int req_prot_init(const struct proto *prot)
3243 3241
3244 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3242 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3245 rsk_prot->obj_size, 0, 3243 rsk_prot->obj_size, 0,
3246 prot->slab_flags, NULL); 3244 SLAB_ACCOUNT | prot->slab_flags,
3245 NULL);
3247 3246
3248 if (!rsk_prot->slab) { 3247 if (!rsk_prot->slab) {
3249 pr_crit("%s: Can't create request sock SLAB cache!\n", 3248 pr_crit("%s: Can't create request sock SLAB cache!\n",
@@ -3258,7 +3257,8 @@ int proto_register(struct proto *prot, int alloc_slab)
3258 if (alloc_slab) { 3257 if (alloc_slab) {
3259 prot->slab = kmem_cache_create_usercopy(prot->name, 3258 prot->slab = kmem_cache_create_usercopy(prot->name,
3260 prot->obj_size, 0, 3259 prot->obj_size, 0,
3261 SLAB_HWCACHE_ALIGN | prot->slab_flags, 3260 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3261 prot->slab_flags,
3262 prot->useroffset, prot->usersize, 3262 prot->useroffset, prot->usersize,
3263 NULL); 3263 NULL);
3264 3264
@@ -3281,6 +3281,7 @@ int proto_register(struct proto *prot, int alloc_slab)
3281 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 3281 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3282 prot->twsk_prot->twsk_obj_size, 3282 prot->twsk_prot->twsk_obj_size,
3283 0, 3283 0,
3284 SLAB_ACCOUNT |
3284 prot->slab_flags, 3285 prot->slab_flags,
3285 NULL); 3286 NULL);
3286 if (prot->twsk_prot->twsk_slab == NULL) 3287 if (prot->twsk_prot->twsk_slab == NULL)
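The SO_TXTIME/SCM_TXTIME additions to sock_setsockopt() and __sock_cmsg_send() above let a CAP_NET_ADMIN sender pick a clock once and then stamp each packet with a transmit time via a cmsg. A minimal userspace sketch, not taken from the patch and with error handling elided (assumes uapi headers that already define SO_TXTIME, SCM_TXTIME and struct sock_txtime):

#include <string.h>
#include <time.h>
#include <sys/socket.h>
#include <linux/types.h>
#include <linux/net_tstamp.h>

static void send_at(int fd, const void *buf, size_t len, __u64 txtime_ns)
{
        struct sock_txtime so_txtime = {
                .clockid = CLOCK_TAI,
                .flags = SOF_TXTIME_REPORT_ERRORS,
        };
        char control[CMSG_SPACE(sizeof(__u64))] = {};
        struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = control, .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cm;

        /* enable SOCK_TXTIME first; SCM_TXTIME is rejected otherwise */
        setsockopt(fd, SOL_SOCKET, SO_TXTIME, &so_txtime, sizeof(so_txtime));

        cm = CMSG_FIRSTHDR(&msg);
        cm->cmsg_level = SOL_SOCKET;
        cm->cmsg_type = SCM_TXTIME;
        cm->cmsg_len = CMSG_LEN(sizeof(__u64));
        memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));

        sendmsg(fd, &msg, 0);
}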
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index c37b5be7c5e4..3312a5849a97 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -10,6 +10,7 @@
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/tcp.h> 11#include <linux/tcp.h>
12#include <linux/workqueue.h> 12#include <linux/workqueue.h>
13#include <linux/nospec.h>
13 14
14#include <linux/inet_diag.h> 15#include <linux/inet_diag.h>
15#include <linux/sock_diag.h> 16#include <linux/sock_diag.h>
@@ -218,6 +219,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
218 219
219 if (req->sdiag_family >= AF_MAX) 220 if (req->sdiag_family >= AF_MAX)
220 return -EINVAL; 221 return -EINVAL;
222 req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
221 223
222 if (sock_diag_handlers[req->sdiag_family] == NULL) 224 if (sock_diag_handlers[req->sdiag_family] == NULL)
223 sock_load_diag_module(req->sdiag_family, 0); 225 sock_load_diag_module(req->sdiag_family, 0);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 064acb04be0f..ba5cba56f574 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -8,11 +8,34 @@
8 8
9#include <net/sock_reuseport.h> 9#include <net/sock_reuseport.h>
10#include <linux/bpf.h> 10#include <linux/bpf.h>
11#include <linux/idr.h>
12#include <linux/filter.h>
11#include <linux/rcupdate.h> 13#include <linux/rcupdate.h>
12 14
13#define INIT_SOCKS 128 15#define INIT_SOCKS 128
14 16
15static DEFINE_SPINLOCK(reuseport_lock); 17DEFINE_SPINLOCK(reuseport_lock);
18
19#define REUSEPORT_MIN_ID 1
20static DEFINE_IDA(reuseport_ida);
21
22int reuseport_get_id(struct sock_reuseport *reuse)
23{
24 int id;
25
26 if (reuse->reuseport_id)
27 return reuse->reuseport_id;
28
29 id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
30 /* Called under reuseport_lock */
31 GFP_ATOMIC);
32 if (id < 0)
33 return id;
34
35 reuse->reuseport_id = id;
36
37 return reuse->reuseport_id;
38}
16 39
17static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) 40static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
18{ 41{
@@ -29,7 +52,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
29 return reuse; 52 return reuse;
30} 53}
31 54
32int reuseport_alloc(struct sock *sk) 55int reuseport_alloc(struct sock *sk, bool bind_inany)
33{ 56{
34 struct sock_reuseport *reuse; 57 struct sock_reuseport *reuse;
35 58
@@ -41,9 +64,17 @@ int reuseport_alloc(struct sock *sk)
41 /* Allocation attempts can occur concurrently via the setsockopt path 64 /* Allocation attempts can occur concurrently via the setsockopt path
42 * and the bind/hash path. Nothing to do when we lose the race. 65 * and the bind/hash path. Nothing to do when we lose the race.
43 */ 66 */
44 if (rcu_dereference_protected(sk->sk_reuseport_cb, 67 reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
45 lockdep_is_held(&reuseport_lock))) 68 lockdep_is_held(&reuseport_lock));
69 if (reuse) {
70 /* Only set reuse->bind_inany if the bind_inany is true.
71 * Otherwise, it will overwrite the reuse->bind_inany
72 * which was set by the bind/hash path.
73 */
74 if (bind_inany)
75 reuse->bind_inany = bind_inany;
46 goto out; 76 goto out;
77 }
47 78
48 reuse = __reuseport_alloc(INIT_SOCKS); 79 reuse = __reuseport_alloc(INIT_SOCKS);
49 if (!reuse) { 80 if (!reuse) {
@@ -53,6 +84,7 @@ int reuseport_alloc(struct sock *sk)
53 84
54 reuse->socks[0] = sk; 85 reuse->socks[0] = sk;
55 reuse->num_socks = 1; 86 reuse->num_socks = 1;
87 reuse->bind_inany = bind_inany;
56 rcu_assign_pointer(sk->sk_reuseport_cb, reuse); 88 rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
57 89
58out: 90out:
@@ -78,9 +110,12 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
78 more_reuse->max_socks = more_socks_size; 110 more_reuse->max_socks = more_socks_size;
79 more_reuse->num_socks = reuse->num_socks; 111 more_reuse->num_socks = reuse->num_socks;
80 more_reuse->prog = reuse->prog; 112 more_reuse->prog = reuse->prog;
113 more_reuse->reuseport_id = reuse->reuseport_id;
114 more_reuse->bind_inany = reuse->bind_inany;
81 115
82 memcpy(more_reuse->socks, reuse->socks, 116 memcpy(more_reuse->socks, reuse->socks,
83 reuse->num_socks * sizeof(struct sock *)); 117 reuse->num_socks * sizeof(struct sock *));
118 more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
84 119
85 for (i = 0; i < reuse->num_socks; ++i) 120 for (i = 0; i < reuse->num_socks; ++i)
86 rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, 121 rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
@@ -99,8 +134,9 @@ static void reuseport_free_rcu(struct rcu_head *head)
99 struct sock_reuseport *reuse; 134 struct sock_reuseport *reuse;
100 135
101 reuse = container_of(head, struct sock_reuseport, rcu); 136 reuse = container_of(head, struct sock_reuseport, rcu);
102 if (reuse->prog) 137 sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
103 bpf_prog_destroy(reuse->prog); 138 if (reuse->reuseport_id)
139 ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
104 kfree(reuse); 140 kfree(reuse);
105} 141}
106 142
@@ -110,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head)
110 * @sk2: Socket belonging to the existing reuseport group. 146 * @sk2: Socket belonging to the existing reuseport group.
111 * May return ENOMEM and not add socket to group under memory pressure. 147 * May return ENOMEM and not add socket to group under memory pressure.
112 */ 148 */
113int reuseport_add_sock(struct sock *sk, struct sock *sk2) 149int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
114{ 150{
115 struct sock_reuseport *old_reuse, *reuse; 151 struct sock_reuseport *old_reuse, *reuse;
116 152
117 if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { 153 if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
118 int err = reuseport_alloc(sk2); 154 int err = reuseport_alloc(sk2, bind_inany);
119 155
120 if (err) 156 if (err)
121 return err; 157 return err;
@@ -160,6 +196,14 @@ void reuseport_detach_sock(struct sock *sk)
160 spin_lock_bh(&reuseport_lock); 196 spin_lock_bh(&reuseport_lock);
161 reuse = rcu_dereference_protected(sk->sk_reuseport_cb, 197 reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
162 lockdep_is_held(&reuseport_lock)); 198 lockdep_is_held(&reuseport_lock));
199
200 /* At least one of the sk in this reuseport group is added to
201 * a bpf map. Notify the bpf side. The bpf map logic will
202 * remove the sk if it is indeed added to a bpf map.
203 */
204 if (reuse->reuseport_id)
205 bpf_sk_reuseport_detach(sk);
206
163 rcu_assign_pointer(sk->sk_reuseport_cb, NULL); 207 rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
164 208
165 for (i = 0; i < reuse->num_socks; i++) { 209 for (i = 0; i < reuse->num_socks; i++) {
@@ -175,9 +219,9 @@ void reuseport_detach_sock(struct sock *sk)
175} 219}
176EXPORT_SYMBOL(reuseport_detach_sock); 220EXPORT_SYMBOL(reuseport_detach_sock);
177 221
178static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, 222static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
179 struct bpf_prog *prog, struct sk_buff *skb, 223 struct bpf_prog *prog, struct sk_buff *skb,
180 int hdr_len) 224 int hdr_len)
181{ 225{
182 struct sk_buff *nskb = NULL; 226 struct sk_buff *nskb = NULL;
183 u32 index; 227 u32 index;
@@ -238,9 +282,15 @@ struct sock *reuseport_select_sock(struct sock *sk,
238 /* paired with smp_wmb() in reuseport_add_sock() */ 282 /* paired with smp_wmb() in reuseport_add_sock() */
239 smp_rmb(); 283 smp_rmb();
240 284
241 if (prog && skb) 285 if (!prog || !skb)
242 sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); 286 goto select_by_hash;
287
288 if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
289 sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
290 else
291 sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
243 292
293select_by_hash:
244 /* no bpf or invalid bpf result: fall back to hash usage */ 294 /* no bpf or invalid bpf result: fall back to hash usage */
245 if (!sk2) 295 if (!sk2)
246 sk2 = reuse->socks[reciprocal_scale(hash, socks)]; 296 sk2 = reuse->socks[reciprocal_scale(hash, socks)];
@@ -252,12 +302,21 @@ out:
252} 302}
253EXPORT_SYMBOL(reuseport_select_sock); 303EXPORT_SYMBOL(reuseport_select_sock);
254 304
255struct bpf_prog * 305int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
256reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
257{ 306{
258 struct sock_reuseport *reuse; 307 struct sock_reuseport *reuse;
259 struct bpf_prog *old_prog; 308 struct bpf_prog *old_prog;
260 309
310 if (sk_unhashed(sk) && sk->sk_reuseport) {
311 int err = reuseport_alloc(sk, false);
312
313 if (err)
314 return err;
315 } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
316 /* The socket wasn't bound with SO_REUSEPORT */
317 return -EINVAL;
318 }
319
261 spin_lock_bh(&reuseport_lock); 320 spin_lock_bh(&reuseport_lock);
262 reuse = rcu_dereference_protected(sk->sk_reuseport_cb, 321 reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
263 lockdep_is_held(&reuseport_lock)); 322 lockdep_is_held(&reuseport_lock));
@@ -266,6 +325,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
266 rcu_assign_pointer(reuse->prog, prog); 325 rcu_assign_pointer(reuse->prog, prog);
267 spin_unlock_bh(&reuseport_lock); 326 spin_unlock_bh(&reuseport_lock);
268 327
269 return old_prog; 328 sk_reuseport_prog_free(old_prog);
329 return 0;
270} 330}
271EXPORT_SYMBOL(reuseport_attach_prog); 331EXPORT_SYMBOL(reuseport_attach_prog);
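reuseport_attach_prog() now frees the previous program itself and returns an error code, and it will allocate the reuseport group on a not-yet-hashed socket as long as SO_REUSEPORT is already set, so the selector program can be attached before bind(). A userspace sketch of building such a group (not from the patch; assumes a libc that exposes SO_ATTACH_REUSEPORT_EBPF, and prog_fd is a previously loaded BPF program fd):

#include <netinet/in.h>
#include <sys/socket.h>

static int make_group_member(int prog_fd, const struct sockaddr_in *addr)
{
        int one = 1;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
        /* attach may now precede bind(); it lands in reuseport_attach_prog() */
        setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
                   &prog_fd, sizeof(prog_fd));
        bind(fd, (const struct sockaddr *)addr, sizeof(*addr));
        return fd;
}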
diff --git a/net/core/utils.c b/net/core/utils.c
index d47863b07a60..2a597ac7808e 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -397,7 +397,7 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
397 break; 397 break;
398 default: 398 default:
399 pr_err("unexpected address family %d\n", af); 399 pr_err("unexpected address family %d\n", af);
400 }; 400 }
401 401
402 return ret; 402 return ret;
403} 403}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 9d1f22072d5d..89b6785cef2a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -3,8 +3,11 @@
3 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. 3 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
4 * Released under terms in GPL version 2. See COPYING. 4 * Released under terms in GPL version 2. See COPYING.
5 */ 5 */
6#include <linux/bpf.h>
7#include <linux/filter.h>
6#include <linux/types.h> 8#include <linux/types.h>
7#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/netdevice.h>
8#include <linux/slab.h> 11#include <linux/slab.h>
9#include <linux/idr.h> 12#include <linux/idr.h>
10#include <linux/rhashtable.h> 13#include <linux/rhashtable.h>
@@ -45,8 +48,8 @@ static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
45 BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) 48 BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
46 != sizeof(u32)); 49 != sizeof(u32));
47 50
48 /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */ 51 /* Use cyclic increasing ID as direct hash key */
49 return key << RHT_HASH_RESERVED_SPACE; 52 return key;
50} 53}
51 54
52static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, 55static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
@@ -95,23 +98,15 @@ static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
95{ 98{
96 struct xdp_mem_allocator *xa; 99 struct xdp_mem_allocator *xa;
97 int id = xdp_rxq->mem.id; 100 int id = xdp_rxq->mem.id;
98 int err;
99 101
100 if (id == 0) 102 if (id == 0)
101 return; 103 return;
102 104
103 mutex_lock(&mem_id_lock); 105 mutex_lock(&mem_id_lock);
104 106
105 xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); 107 xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
106 if (!xa) { 108 if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
107 mutex_unlock(&mem_id_lock); 109 call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
108 return;
109 }
110
111 err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params);
112 WARN_ON(err);
113
114 call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
115 110
116 mutex_unlock(&mem_id_lock); 111 mutex_unlock(&mem_id_lock);
117} 112}
@@ -327,10 +322,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
327 /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ 322 /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
328 xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); 323 xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
329 page = virt_to_head_page(data); 324 page = virt_to_head_page(data);
330 if (xa) 325 if (xa) {
326 napi_direct &= !xdp_return_frame_no_direct();
331 page_pool_put_page(xa->page_pool, page, napi_direct); 327 page_pool_put_page(xa->page_pool, page, napi_direct);
332 else 328 } else {
333 put_page(page); 329 put_page(page);
330 }
334 rcu_read_unlock(); 331 rcu_read_unlock();
335 break; 332 break;
336 case MEM_TYPE_PAGE_SHARED: 333 case MEM_TYPE_PAGE_SHARED:
@@ -370,3 +367,34 @@ void xdp_return_buff(struct xdp_buff *xdp)
370 __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); 367 __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
371} 368}
372EXPORT_SYMBOL_GPL(xdp_return_buff); 369EXPORT_SYMBOL_GPL(xdp_return_buff);
370
371int xdp_attachment_query(struct xdp_attachment_info *info,
372 struct netdev_bpf *bpf)
373{
374 bpf->prog_id = info->prog ? info->prog->aux->id : 0;
375 bpf->prog_flags = info->prog ? info->flags : 0;
376 return 0;
377}
378EXPORT_SYMBOL_GPL(xdp_attachment_query);
379
380bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
381 struct netdev_bpf *bpf)
382{
383 if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) {
384 NL_SET_ERR_MSG(bpf->extack,
385 "program loaded with different flags");
386 return false;
387 }
388 return true;
389}
390EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok);
391
392void xdp_attachment_setup(struct xdp_attachment_info *info,
393 struct netdev_bpf *bpf)
394{
395 if (info->prog)
396 bpf_prog_put(info->prog);
397 info->prog = bpf->prog;
398 info->flags = bpf->flags;
399}
400EXPORT_SYMBOL_GPL(xdp_attachment_setup);
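The new xdp_attachment_* helpers factor out the bookkeeping a driver does in its ndo_bpf() callback. A hedged sketch of the intended usage pattern (my_priv and my_ndo_bpf are made-up driver names; the helpers and netdev_bpf commands are real):

/* struct my_priv is assumed to embed a struct xdp_attachment_info named xdp */
static int my_ndo_bpf(struct net_device *dev, struct netdev_bpf *bpf)
{
        struct my_priv *priv = netdev_priv(dev);

        switch (bpf->command) {
        case XDP_SETUP_PROG:
                if (!xdp_attachment_flags_ok(&priv->xdp, bpf))
                        return -EBUSY;
                /* reprogram the datapath for bpf->prog (may be NULL) here */
                xdp_attachment_setup(&priv->xdp, bpf);
                return 0;
        case XDP_QUERY_PROG:
                return xdp_attachment_query(&priv->xdp, bpf);
        default:
                return -EINVAL;
        }
}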
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 2589a6b78aa1..a556cd708885 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1786,7 +1786,7 @@ static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app,
1786 if (itr->app.selector == app->selector && 1786 if (itr->app.selector == app->selector &&
1787 itr->app.protocol == app->protocol && 1787 itr->app.protocol == app->protocol &&
1788 itr->ifindex == ifindex && 1788 itr->ifindex == ifindex &&
1789 (!prio || itr->app.priority == prio)) 1789 ((prio == -1) || itr->app.priority == prio))
1790 return itr; 1790 return itr;
1791 } 1791 }
1792 1792
@@ -1821,7 +1821,8 @@ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app)
1821 u8 prio = 0; 1821 u8 prio = 0;
1822 1822
1823 spin_lock_bh(&dcb_lock); 1823 spin_lock_bh(&dcb_lock);
1824 if ((itr = dcb_app_lookup(app, dev->ifindex, 0))) 1824 itr = dcb_app_lookup(app, dev->ifindex, -1);
1825 if (itr)
1825 prio = itr->app.priority; 1826 prio = itr->app.priority;
1826 spin_unlock_bh(&dcb_lock); 1827 spin_unlock_bh(&dcb_lock);
1827 1828
@@ -1849,7 +1850,8 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new)
1849 1850
1850 spin_lock_bh(&dcb_lock); 1851 spin_lock_bh(&dcb_lock);
1851 /* Search for existing match and replace */ 1852 /* Search for existing match and replace */
1852 if ((itr = dcb_app_lookup(new, dev->ifindex, 0))) { 1853 itr = dcb_app_lookup(new, dev->ifindex, -1);
1854 if (itr) {
1853 if (new->priority) 1855 if (new->priority)
1854 itr->app.priority = new->priority; 1856 itr->app.priority = new->priority;
1855 else { 1857 else {
@@ -1882,7 +1884,8 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app)
1882 u8 prio = 0; 1884 u8 prio = 0;
1883 1885
1884 spin_lock_bh(&dcb_lock); 1886 spin_lock_bh(&dcb_lock);
1885 if ((itr = dcb_app_lookup(app, dev->ifindex, 0))) 1887 itr = dcb_app_lookup(app, dev->ifindex, -1);
1888 if (itr)
1886 prio |= 1 << itr->app.priority; 1889 prio |= 1 << itr->app.priority;
1887 spin_unlock_bh(&dcb_lock); 1890 spin_unlock_bh(&dcb_lock);
1888 1891
@@ -1955,6 +1958,92 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
1955} 1958}
1956EXPORT_SYMBOL(dcb_ieee_delapp); 1959EXPORT_SYMBOL(dcb_ieee_delapp);
1957 1960
1961/**
1962 * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from
1963 * priorities to the DSCP values assigned to that priority. Initialize p_map
1964 * such that each map element holds a bit mask of DSCP values configured for
1965 * that priority by APP entries.
1966 */
1967void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev,
1968 struct dcb_ieee_app_prio_map *p_map)
1969{
1970 int ifindex = dev->ifindex;
1971 struct dcb_app_type *itr;
1972 u8 prio;
1973
1974 memset(p_map->map, 0, sizeof(p_map->map));
1975
1976 spin_lock_bh(&dcb_lock);
1977 list_for_each_entry(itr, &dcb_app_list, list) {
1978 if (itr->ifindex == ifindex &&
1979 itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
1980 itr->app.protocol < 64 &&
1981 itr->app.priority < IEEE_8021QAZ_MAX_TCS) {
1982 prio = itr->app.priority;
1983 p_map->map[prio] |= 1ULL << itr->app.protocol;
1984 }
1985 }
1986 spin_unlock_bh(&dcb_lock);
1987}
1988EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map);
1989
1990/**
1991 * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from
1992 * DSCP values to the priorities assigned to that DSCP value. Initialize p_map
1993 * such that each map element holds a bit mask of priorities configured for a
1994 * given DSCP value by APP entries.
1995 */
1996void
1997dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev,
1998 struct dcb_ieee_app_dscp_map *p_map)
1999{
2000 int ifindex = dev->ifindex;
2001 struct dcb_app_type *itr;
2002
2003 memset(p_map->map, 0, sizeof(p_map->map));
2004
2005 spin_lock_bh(&dcb_lock);
2006 list_for_each_entry(itr, &dcb_app_list, list) {
2007 if (itr->ifindex == ifindex &&
2008 itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
2009 itr->app.protocol < 64 &&
2010 itr->app.priority < IEEE_8021QAZ_MAX_TCS)
2011 p_map->map[itr->app.protocol] |= 1 << itr->app.priority;
2012 }
2013 spin_unlock_bh(&dcb_lock);
2014}
2015EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map);
2016
2017/**
2018 * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet
2019 * type, with valid PID values >= 1536. A special meaning is then assigned to
2020 * protocol value of 0: "default priority. For use when priority is not
2021 * otherwise specified".
2022 *
2023 * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries
2024 * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default
2025 * priorities set by these entries.
2026 */
2027u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev)
2028{
2029 int ifindex = dev->ifindex;
2030 struct dcb_app_type *itr;
2031 u8 mask = 0;
2032
2033 spin_lock_bh(&dcb_lock);
2034 list_for_each_entry(itr, &dcb_app_list, list) {
2035 if (itr->ifindex == ifindex &&
2036 itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE &&
2037 itr->app.protocol == 0 &&
2038 itr->app.priority < IEEE_8021QAZ_MAX_TCS)
2039 mask |= 1 << itr->app.priority;
2040 }
2041 spin_unlock_bh(&dcb_lock);
2042
2043 return mask;
2044}
2045EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask);
2046
1958static int __init dcbnl_init(void) 2047static int __init dcbnl_init(void)
1959{ 2048{
1960 INIT_LIST_HEAD(&dcb_app_list); 2049 INIT_LIST_HEAD(&dcb_app_list);
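The three new dcb_ieee_getapp_* helpers above condense DSCP and default-priority APP entries into bit-mask maps that a switch or NIC driver can program into hardware. An illustrative, non-driver-specific walk of the DSCP-to-priority map (my_hw_set_dscp_prio() is a made-up hook; the dcbnl API is the one added here):

static void my_sync_dscp_map(struct net_device *dev)
{
        struct dcb_ieee_app_dscp_map dscp_map;
        int dscp;

        dcb_ieee_getapp_dscp_prio_mask_map(dev, &dscp_map);
        for (dscp = 0; dscp < 64; dscp++) {
                if (!dscp_map.map[dscp])
                        continue;
                /* program the highest priority configured for this DSCP value */
                my_hw_set_dscp_prio(dev, dscp, fls(dscp_map.map[dscp]) - 1);
        }
}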
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 2b75df469220..842a9c7c73a3 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -229,14 +229,16 @@ static void ccid2_cwnd_restart(struct sock *sk, const u32 now)
229 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 229 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
230 u32 cwnd = hc->tx_cwnd, restart_cwnd, 230 u32 cwnd = hc->tx_cwnd, restart_cwnd,
231 iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache); 231 iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache);
232 s32 delta = now - hc->tx_lsndtime;
232 233
233 hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2)); 234 hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2));
234 235
235 /* don't reduce cwnd below the initial window (IW) */ 236 /* don't reduce cwnd below the initial window (IW) */
236 restart_cwnd = min(cwnd, iwnd); 237 restart_cwnd = min(cwnd, iwnd);
237 cwnd >>= (now - hc->tx_lsndtime) / hc->tx_rto;
238 hc->tx_cwnd = max(cwnd, restart_cwnd);
239 238
239 while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd)
240 cwnd >>= 1;
241 hc->tx_cwnd = max(cwnd, restart_cwnd);
240 hc->tx_cwnd_stamp = now; 242 hc->tx_cwnd_stamp = now;
241 hc->tx_cwnd_used = 0; 243 hc->tx_cwnd_used = 0;
242 244
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 8b5ba6dffac7..12877a1514e7 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -600,7 +600,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
600{ 600{
601 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); 601 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
602 struct dccp_sock *dp = dccp_sk(sk); 602 struct dccp_sock *dp = dccp_sk(sk);
603 ktime_t now = ktime_get_real(); 603 ktime_t now = ktime_get();
604 s64 delta = 0; 604 s64 delta = 0;
605 605
606 switch (fbtype) { 606 switch (fbtype) {
@@ -625,15 +625,14 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
625 case CCID3_FBACK_PERIODIC: 625 case CCID3_FBACK_PERIODIC:
626 delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback); 626 delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback);
627 if (delta <= 0) 627 if (delta <= 0)
628 DCCP_BUG("delta (%ld) <= 0", (long)delta); 628 delta = 1;
629 else 629 hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta);
630 hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta);
631 break; 630 break;
632 default: 631 default:
633 return; 632 return;
634 } 633 }
635 634
636 ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, 635 ccid3_pr_debug("Interval %lldusec, X_recv=%u, 1/p=%u\n", delta,
637 hc->rx_x_recv, hc->rx_pinv); 636 hc->rx_x_recv, hc->rx_pinv);
638 637
639 hc->rx_tstamp_last_feedback = now; 638 hc->rx_tstamp_last_feedback = now;
@@ -680,7 +679,8 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
680static u32 ccid3_first_li(struct sock *sk) 679static u32 ccid3_first_li(struct sock *sk)
681{ 680{
682 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); 681 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
683 u32 x_recv, p, delta; 682 u32 x_recv, p;
683 s64 delta;
684 u64 fval; 684 u64 fval;
685 685
686 if (hc->rx_rtt == 0) { 686 if (hc->rx_rtt == 0) {
@@ -688,7 +688,9 @@ static u32 ccid3_first_li(struct sock *sk)
688 hc->rx_rtt = DCCP_FALLBACK_RTT; 688 hc->rx_rtt = DCCP_FALLBACK_RTT;
689 } 689 }
690 690
691 delta = ktime_to_us(net_timedelta(hc->rx_tstamp_last_feedback)); 691 delta = ktime_us_delta(ktime_get(), hc->rx_tstamp_last_feedback);
692 if (delta <= 0)
693 delta = 1;
692 x_recv = scaled_div32(hc->rx_bytes_recv, delta); 694 x_recv = scaled_div32(hc->rx_bytes_recv, delta);
693 if (x_recv == 0) { /* would also trigger divide-by-zero */ 695 if (x_recv == 0) { /* would also trigger divide-by-zero */
694 DCCP_WARN("X_recv==0\n"); 696 DCCP_WARN("X_recv==0\n");
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 0ea2ee56ac1b..f91e3816806b 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -316,7 +316,8 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
316 int flags, int *addr_len); 316 int flags, int *addr_len);
317void dccp_shutdown(struct sock *sk, int how); 317void dccp_shutdown(struct sock *sk, int how);
318int inet_dccp_listen(struct socket *sock, int backlog); 318int inet_dccp_listen(struct socket *sock, int backlog);
319__poll_t dccp_poll_mask(struct socket *sock, __poll_t events); 319__poll_t dccp_poll(struct file *file, struct socket *sock,
320 poll_table *wait);
320int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); 321int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
321void dccp_req_err(struct sock *sk, u64 seq); 322void dccp_req_err(struct sock *sk, u64 seq);
322 323
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index a9e478cd3787..b08feb219b44 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -984,7 +984,7 @@ static const struct proto_ops inet_dccp_ops = {
984 .accept = inet_accept, 984 .accept = inet_accept,
985 .getname = inet_getname, 985 .getname = inet_getname,
986 /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ 986 /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
987 .poll_mask = dccp_poll_mask, 987 .poll = dccp_poll,
988 .ioctl = inet_ioctl, 988 .ioctl = inet_ioctl,
989 /* FIXME: work on inet_listen to rename it to sock_common_listen */ 989 /* FIXME: work on inet_listen to rename it to sock_common_listen */
990 .listen = inet_dccp_listen, 990 .listen = inet_dccp_listen,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 17fc4e0166ba..6344f1b18a6a 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1070,7 +1070,7 @@ static const struct proto_ops inet6_dccp_ops = {
1070 .socketpair = sock_no_socketpair, 1070 .socketpair = sock_no_socketpair,
1071 .accept = inet_accept, 1071 .accept = inet_accept,
1072 .getname = inet6_getname, 1072 .getname = inet6_getname,
1073 .poll_mask = dccp_poll_mask, 1073 .poll = dccp_poll,
1074 .ioctl = inet6_ioctl, 1074 .ioctl = inet6_ioctl,
1075 .listen = inet_dccp_listen, 1075 .listen = inet_dccp_listen,
1076 .shutdown = inet_shutdown, 1076 .shutdown = inet_shutdown,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ca21c1c76da0..875858c8b059 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -312,11 +312,20 @@ int dccp_disconnect(struct sock *sk, int flags)
312 312
313EXPORT_SYMBOL_GPL(dccp_disconnect); 313EXPORT_SYMBOL_GPL(dccp_disconnect);
314 314
315__poll_t dccp_poll_mask(struct socket *sock, __poll_t events) 315/*
316 * Wait for a DCCP event.
317 *
318 * Note that we don't need to lock the socket, as the upper poll layers
319 * take care of normal races (between the test and the event) and we don't
320 * go look at any of the socket buffers directly.
321 */
322__poll_t dccp_poll(struct file *file, struct socket *sock,
323 poll_table *wait)
316{ 324{
317 __poll_t mask; 325 __poll_t mask;
318 struct sock *sk = sock->sk; 326 struct sock *sk = sock->sk;
319 327
328 sock_poll_wait(file, wait);
320 if (sk->sk_state == DCCP_LISTEN) 329 if (sk->sk_state == DCCP_LISTEN)
321 return inet_csk_listen_poll(sk); 330 return inet_csk_listen_poll(sk);
322 331
@@ -358,7 +367,7 @@ __poll_t dccp_poll_mask(struct socket *sock, __poll_t events)
358 return mask; 367 return mask;
359} 368}
360 369
361EXPORT_SYMBOL_GPL(dccp_poll_mask); 370EXPORT_SYMBOL_GPL(dccp_poll);
362 371
363int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) 372int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
364{ 373{
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
index f3393e154f0f..dcc74956badd 100644
--- a/net/decnet/Kconfig
+++ b/net/decnet/Kconfig
@@ -40,4 +40,3 @@ config DECNET_ROUTER
40 to work. 40 to work.
41 41
42 See <file:Documentation/networking/decnet.txt> for more information. 42 See <file:Documentation/networking/decnet.txt> for more information.
43
diff --git a/net/decnet/Makefile b/net/decnet/Makefile
index 9e38122d942b..07b38e441b2d 100644
--- a/net/decnet/Makefile
+++ b/net/decnet/Makefile
@@ -8,4 +8,3 @@ decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o
8decnet-y += sysctl_net_decnet.o 8decnet-y += sysctl_net_decnet.o
9 9
10obj-$(CONFIG_NETFILTER) += netfilter/ 10obj-$(CONFIG_NETFILTER) += netfilter/
11
diff --git a/net/decnet/TODO b/net/decnet/TODO
index ebb5ac69d128..358e9eb49016 100644
--- a/net/decnet/TODO
+++ b/net/decnet/TODO
@@ -16,14 +16,14 @@ Steve's quick list of things that need finishing off:
16 16
17 o Verify errors etc. against POSIX 1003.1g (draft) 17 o Verify errors etc. against POSIX 1003.1g (draft)
18 18
19 o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g) 19 o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g)
20 [maybe this should be done at socket level... the control data in the 20 [maybe this should be done at socket level... the control data in the
21 send/recvmsg() calls should simply be a vector of set/getsockopt() 21 send/recvmsg() calls should simply be a vector of set/getsockopt()
22 calls] 22 calls]
23 23
24 o check MSG_CTRUNC is set where it should be. 24 o check MSG_CTRUNC is set where it should be.
25 25
26 o Find all the commonality between DECnet and IPv4 routing code and extract 26 o Find all the commonality between DECnet and IPv4 routing code and extract
27 it into a small library of routines. [probably a project for 2.7.xx] 27 it into a small library of routines. [probably a project for 2.7.xx]
28 28
29 o Add perfect socket hashing - an idea suggested by Paul Koning. Currently 29 o Add perfect socket hashing - an idea suggested by Paul Koning. Currently
@@ -38,4 +38,3 @@ Steve's quick list of things that need finishing off:
38 o DECnet sendpages() function 38 o DECnet sendpages() function
39 39
40 o AIO for DECnet 40 o AIO for DECnet
41
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 9a686d890bfa..7d6ff983ba2c 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1207,11 +1207,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer)
1207} 1207}
1208 1208
1209 1209
1210static __poll_t dn_poll_mask(struct socket *sock, __poll_t events) 1210static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wait)
1211{ 1211{
1212 struct sock *sk = sock->sk; 1212 struct sock *sk = sock->sk;
1213 struct dn_scp *scp = DN_SK(sk); 1213 struct dn_scp *scp = DN_SK(sk);
1214 __poll_t mask = datagram_poll_mask(sock, events); 1214 __poll_t mask = datagram_poll(file, sock, wait);
1215 1215
1216 if (!skb_queue_empty(&scp->other_receive_queue)) 1216 if (!skb_queue_empty(&scp->other_receive_queue))
1217 mask |= EPOLLRDBAND; 1217 mask |= EPOLLRDBAND;
@@ -2331,7 +2331,7 @@ static const struct proto_ops dn_proto_ops = {
2331 .socketpair = sock_no_socketpair, 2331 .socketpair = sock_no_socketpair,
2332 .accept = dn_accept, 2332 .accept = dn_accept,
2333 .getname = dn_getname, 2333 .getname = dn_getname,
2334 .poll_mask = dn_poll_mask, 2334 .poll = dn_poll,
2335 .ioctl = dn_ioctl, 2335 .ioctl = dn_ioctl,
2336 .listen = dn_listen, 2336 .listen = dn_listen,
2337 .shutdown = dn_shutdown, 2337 .shutdown = dn_shutdown,
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index fce94cbd4378..f78fe58eafc8 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -797,5 +797,3 @@ void __init dn_fib_init(void)
797 rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE, 797 rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE,
798 dn_fib_rtm_delroute, NULL, 0); 798 dn_fib_rtm_delroute, NULL, 0);
799} 799}
800
801
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 1b2120645730..2fb5e055ba25 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -491,6 +491,7 @@ static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb)
491 break; 491 break;
492 case DN_RUN: 492 case DN_RUN:
493 sk->sk_shutdown |= SHUTDOWN_MASK; 493 sk->sk_shutdown |= SHUTDOWN_MASK;
494 /* fall through */
494 case DN_CC: 495 case DN_CC:
495 scp->state = DN_CN; 496 scp->state = DN_CN;
496 } 497 }
@@ -911,4 +912,3 @@ free_out:
911 912
912 return NET_RX_SUCCESS; 913 return NET_RX_SUCCESS;
913} 914}
914
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 56a52a004c56..a1779de6bd9c 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -701,4 +701,3 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
701 701
702 dn_nsp_send(skb); 702 dn_nsp_send(skb);
703} 703}
704
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index e74765024d88..1c002c0fb712 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -404,7 +404,7 @@ void dn_rt_cache_flush(int delay)
404 404
405 if (delay <= 0) { 405 if (delay <= 0) {
406 spin_unlock_bh(&dn_rt_flush_lock); 406 spin_unlock_bh(&dn_rt_flush_lock);
407 dn_run_flush(0); 407 dn_run_flush(NULL);
408 return; 408 return;
409 } 409 }
410 410
@@ -1920,9 +1920,8 @@ void __init dn_route_init(void)
1920void __exit dn_route_cleanup(void) 1920void __exit dn_route_cleanup(void)
1921{ 1921{
1922 del_timer(&dn_route_timer); 1922 del_timer(&dn_route_timer);
1923 dn_run_flush(0); 1923 dn_run_flush(NULL);
1924 1924
1925 remove_proc_entry("decnet_cache", init_net.proc_net); 1925 remove_proc_entry("decnet_cache", init_net.proc_net);
1926 dst_entries_destroy(&dn_dst_ops); 1926 dst_entries_destroy(&dn_dst_ops);
1927} 1927}
1928
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 72236695db3d..4a4e3c17740c 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -256,5 +256,3 @@ void __exit dn_fib_rules_cleanup(void)
256 rtnl_unlock(); 256 rtnl_unlock();
257 rcu_barrier(); 257 rcu_barrier();
258} 258}
259
260
diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile
index 255c1ae9daeb..b579e52130aa 100644
--- a/net/decnet/netfilter/Makefile
+++ b/net/decnet/netfilter/Makefile
@@ -3,4 +3,3 @@
3# 3#
4 4
5obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o 5obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o
6
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index ab395e55cd78..a4faacadd8a8 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -158,4 +158,3 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG);
158 158
159module_init(dn_rtmsg_init); 159module_init(dn_rtmsg_init);
160module_exit(dn_rtmsg_fini); 160module_exit(dn_rtmsg_fini);
161
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 40c851693f77..7f4534828f6c 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -86,35 +86,39 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
86 opt++; 86 opt++;
87 kdebug("options: '%s'", opt); 87 kdebug("options: '%s'", opt);
88 do { 88 do {
89 int opt_len, opt_nlen;
89 const char *eq; 90 const char *eq;
90 int opt_len, opt_nlen, opt_vlen, tmp; 91 char optval[128];
91 92
92 next_opt = memchr(opt, '#', end - opt) ?: end; 93 next_opt = memchr(opt, '#', end - opt) ?: end;
93 opt_len = next_opt - opt; 94 opt_len = next_opt - opt;
94 if (opt_len <= 0 || opt_len > 128) { 95 if (opt_len <= 0 || opt_len > sizeof(optval)) {
95 pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n", 96 pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n",
96 opt_len); 97 opt_len);
97 return -EINVAL; 98 return -EINVAL;
98 } 99 }
99 100
100 eq = memchr(opt, '=', opt_len) ?: end; 101 eq = memchr(opt, '=', opt_len);
101 opt_nlen = eq - opt; 102 if (eq) {
102 eq++; 103 opt_nlen = eq - opt;
103 opt_vlen = next_opt - eq; /* will be -1 if no value */ 104 eq++;
105 memcpy(optval, eq, next_opt - eq);
106 optval[next_opt - eq] = '\0';
107 } else {
108 opt_nlen = opt_len;
109 optval[0] = '\0';
110 }
104 111
105 tmp = opt_vlen >= 0 ? opt_vlen : 0; 112 kdebug("option '%*.*s' val '%s'",
106 kdebug("option '%*.*s' val '%*.*s'", 113 opt_nlen, opt_nlen, opt, optval);
107 opt_nlen, opt_nlen, opt, tmp, tmp, eq);
108 114
109 /* see if it's an error number representing a DNS error 115 /* see if it's an error number representing a DNS error
110 * that's to be recorded as the result in this key */ 116 * that's to be recorded as the result in this key */
111 if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 && 117 if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 &&
112 memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) { 118 memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) {
113 kdebug("dns error number option"); 119 kdebug("dns error number option");
114 if (opt_vlen <= 0)
115 goto bad_option_value;
116 120
117 ret = kstrtoul(eq, 10, &derrno); 121 ret = kstrtoul(optval, 10, &derrno);
118 if (ret < 0) 122 if (ret < 0)
119 goto bad_option_value; 123 goto bad_option_value;
120 124
@@ -316,4 +320,3 @@ static void __exit exit_dns_resolver(void)
316module_init(init_dns_resolver) 320module_init(init_dns_resolver)
317module_exit(exit_dns_resolver) 321module_exit(exit_dns_resolver)
318MODULE_LICENSE("GPL"); 322MODULE_LICENSE("GPL");
319
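
The dns_key.c hunk above stops doing pointer arithmetic on the raw option string and instead copies each option value into a bounded, NUL-terminated buffer before handing it to kstrtoul(). A minimal userspace sketch of that copy-and-terminate pattern is below; parse_opts and the "dnserror" handling are illustrative stand-ins, not kernel code.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Options are '#'-separated "name" or "name=value" tokens. Each value is
 * copied into a bounded, NUL-terminated buffer so strtoul() can parse it,
 * mirroring the optval[] handling in the hunk above. */
static int parse_opts(const char *opt, const char *end)
{
	while (opt < end) {
		const char *next_opt = memchr(opt, '#', end - opt);
		const char *eq;
		char optval[128];
		int opt_len, opt_nlen;

		if (!next_opt)
			next_opt = end;
		opt_len = (int)(next_opt - opt);
		if (opt_len <= 0 || opt_len >= (int)sizeof(optval))
			return -1;		/* reject oversized options */

		eq = memchr(opt, '=', opt_len);
		if (eq) {
			opt_nlen = (int)(eq - opt);
			eq++;
			memcpy(optval, eq, next_opt - eq);
			optval[next_opt - eq] = '\0';
		} else {
			opt_nlen = opt_len;
			optval[0] = '\0';	/* option has no value */
		}

		if (opt_nlen == 8 && memcmp(opt, "dnserror", 8) == 0) {
			char *p;
			unsigned long derrno = strtoul(optval, &p, 10);

			if (p == optval || *p != '\0')
				return -1;	/* missing or non-numeric value */
			printf("dns error number %lu\n", derrno);
		} else {
			printf("option '%.*s' val '%s'\n", opt_nlen, opt, optval);
		}

		opt = next_opt + 1;
	}
	return 0;
}

int main(void)
{
	const char s[] = "dnserror=11#ipv4";

	return parse_opts(s, s + strlen(s)) ? 1 : 0;
}
```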
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index dc5d9af3dc80..a1917025e155 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -775,6 +775,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
775 if (!ds) 775 if (!ds)
776 return NULL; 776 return NULL;
777 777
778 /* We avoid allocating memory outside dsa_switch
779 * if it is not needed.
780 */
781 if (n <= sizeof(ds->_bitmap) * 8) {
782 ds->bitmap = &ds->_bitmap;
783 } else {
784 ds->bitmap = devm_kcalloc(dev,
785 BITS_TO_LONGS(n),
786 sizeof(unsigned long),
787 GFP_KERNEL);
788 if (unlikely(!ds->bitmap))
789 return NULL;
790 }
791
778 ds->dev = dev; 792 ds->dev = dev;
779 ds->num_ports = n; 793 ds->num_ports = n;
780 794
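
The dsa2.c hunk above (together with the switch.c hunk further down) replaces variable-length on-stack DECLARE_BITMAP() usage with one bitmap allocated per switch, kept inline in the structure when the port count fits in a single word. A rough userspace sketch of that inline-or-heap choice follows; struct port_set and its helpers are invented names used only for illustration.

```c
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG	 (CHAR_BIT * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* One bitmap per object: point at the embedded word when it is big enough,
 * fall back to a heap allocation only for large port counts, as
 * dsa_switch_alloc() does above. */
struct port_set {
	size_t num_ports;
	unsigned long *bitmap;	/* points at _bitmap or at heap storage */
	unsigned long _bitmap;	/* inline storage for <= BITS_PER_LONG ports */
};

static int port_set_init(struct port_set *ps, size_t n)
{
	ps->num_ports = n;
	if (n <= BITS_PER_LONG) {
		ps->bitmap = &ps->_bitmap;
		ps->_bitmap = 0;
	} else {
		ps->bitmap = calloc(BITS_TO_LONGS(n), sizeof(unsigned long));
		if (!ps->bitmap)
			return -1;
	}
	return 0;
}

static void port_set_add(struct port_set *ps, size_t port)
{
	ps->bitmap[port / BITS_PER_LONG] |= 1UL << (port % BITS_PER_LONG);
}

static void port_set_free(struct port_set *ps)
{
	if (ps->bitmap != &ps->_bitmap)
		free(ps->bitmap);
}

int main(void)
{
	struct port_set small, large;

	if (port_set_init(&small, 12) || port_set_init(&large, 200))
		return 1;
	port_set_add(&small, 3);	/* lands in the embedded word */
	port_set_add(&large, 130);	/* lands in the heap bitmap */
	printf("small uses inline storage: %d\n", small.bitmap == &small._bitmap);
	port_set_free(&small);
	port_set_free(&large);
	return 0;
}
```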
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 1e3b6a6d8a40..1c45c1d6d241 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -639,7 +639,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
639 int ret; 639 int ret;
640 640
641 /* Port's PHY and MAC both need to be EEE capable */ 641 /* Port's PHY and MAC both need to be EEE capable */
642 if (!dev->phydev) 642 if (!dev->phydev && !dp->pl)
643 return -ENODEV; 643 return -ENODEV;
644 644
645 if (!ds->ops->set_mac_eee) 645 if (!ds->ops->set_mac_eee)
@@ -659,7 +659,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
659 int ret; 659 int ret;
660 660
661 /* Port's PHY and MAC both need to be EEE capable */ 661 /* Port's PHY and MAC both need to be EEE capable */
662 if (!dev->phydev) 662 if (!dev->phydev && !dp->pl)
663 return -ENODEV; 663 return -ENODEV;
664 664
665 if (!ds->ops->get_mac_eee) 665 if (!ds->ops->get_mac_eee)
@@ -767,7 +767,6 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev,
767 const struct tc_action *a; 767 const struct tc_action *a;
768 struct dsa_port *to_dp; 768 struct dsa_port *to_dp;
769 int err = -EOPNOTSUPP; 769 int err = -EOPNOTSUPP;
770 LIST_HEAD(actions);
771 770
772 if (!ds->ops->port_mirror_add) 771 if (!ds->ops->port_mirror_add)
773 return err; 772 return err;
@@ -775,8 +774,7 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev,
775 if (!tcf_exts_has_one_action(cls->exts)) 774 if (!tcf_exts_has_one_action(cls->exts))
776 return err; 775 return err;
777 776
778 tcf_exts_to_list(cls->exts, &actions); 777 a = tcf_exts_first_action(cls->exts);
779 a = list_first_entry(&actions, struct tc_action, list);
780 778
781 if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) { 779 if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
782 struct dsa_mall_mirror_tc_entry *mirror; 780 struct dsa_mall_mirror_tc_entry *mirror;
@@ -900,7 +898,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev,
900 898
901 switch (f->command) { 899 switch (f->command) {
902 case TC_BLOCK_BIND: 900 case TC_BLOCK_BIND:
903 return tcf_block_cb_register(f->block, cb, dev, dev); 901 return tcf_block_cb_register(f->block, cb, dev, dev, f->extack);
904 case TC_BLOCK_UNBIND: 902 case TC_BLOCK_UNBIND:
905 tcf_block_cb_unregister(f->block, cb, dev); 903 tcf_block_cb_unregister(f->block, cb, dev);
906 return 0; 904 return 0;
@@ -1248,6 +1246,9 @@ int dsa_slave_suspend(struct net_device *slave_dev)
1248{ 1246{
1249 struct dsa_port *dp = dsa_slave_to_port(slave_dev); 1247 struct dsa_port *dp = dsa_slave_to_port(slave_dev);
1250 1248
1249 if (!netif_running(slave_dev))
1250 return 0;
1251
1251 netif_device_detach(slave_dev); 1252 netif_device_detach(slave_dev);
1252 1253
1253 rtnl_lock(); 1254 rtnl_lock();
@@ -1261,6 +1262,9 @@ int dsa_slave_resume(struct net_device *slave_dev)
1261{ 1262{
1262 struct dsa_port *dp = dsa_slave_to_port(slave_dev); 1263 struct dsa_port *dp = dsa_slave_to_port(slave_dev);
1263 1264
1265 if (!netif_running(slave_dev))
1266 return 0;
1267
1264 netif_device_attach(slave_dev); 1268 netif_device_attach(slave_dev);
1265 1269
1266 rtnl_lock(); 1270 rtnl_lock();
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index b93511726069..142b294d3446 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -136,21 +136,20 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
136{ 136{
137 const struct switchdev_obj_port_mdb *mdb = info->mdb; 137 const struct switchdev_obj_port_mdb *mdb = info->mdb;
138 struct switchdev_trans *trans = info->trans; 138 struct switchdev_trans *trans = info->trans;
139 DECLARE_BITMAP(group, ds->num_ports);
140 int port; 139 int port;
141 140
142 /* Build a mask of Multicast group members */ 141 /* Build a mask of Multicast group members */
143 bitmap_zero(group, ds->num_ports); 142 bitmap_zero(ds->bitmap, ds->num_ports);
144 if (ds->index == info->sw_index) 143 if (ds->index == info->sw_index)
145 set_bit(info->port, group); 144 set_bit(info->port, ds->bitmap);
146 for (port = 0; port < ds->num_ports; port++) 145 for (port = 0; port < ds->num_ports; port++)
147 if (dsa_is_dsa_port(ds, port)) 146 if (dsa_is_dsa_port(ds, port))
148 set_bit(port, group); 147 set_bit(port, ds->bitmap);
149 148
150 if (switchdev_trans_ph_prepare(trans)) 149 if (switchdev_trans_ph_prepare(trans))
151 return dsa_switch_mdb_prepare_bitmap(ds, mdb, group); 150 return dsa_switch_mdb_prepare_bitmap(ds, mdb, ds->bitmap);
152 151
153 dsa_switch_mdb_add_bitmap(ds, mdb, group); 152 dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap);
154 153
155 return 0; 154 return 0;
156} 155}
@@ -204,21 +203,20 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
204{ 203{
205 const struct switchdev_obj_port_vlan *vlan = info->vlan; 204 const struct switchdev_obj_port_vlan *vlan = info->vlan;
206 struct switchdev_trans *trans = info->trans; 205 struct switchdev_trans *trans = info->trans;
207 DECLARE_BITMAP(members, ds->num_ports);
208 int port; 206 int port;
209 207
210 /* Build a mask of VLAN members */ 208 /* Build a mask of VLAN members */
211 bitmap_zero(members, ds->num_ports); 209 bitmap_zero(ds->bitmap, ds->num_ports);
212 if (ds->index == info->sw_index) 210 if (ds->index == info->sw_index)
213 set_bit(info->port, members); 211 set_bit(info->port, ds->bitmap);
214 for (port = 0; port < ds->num_ports; port++) 212 for (port = 0; port < ds->num_ports; port++)
215 if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) 213 if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
216 set_bit(port, members); 214 set_bit(port, ds->bitmap);
217 215
218 if (switchdev_trans_ph_prepare(trans)) 216 if (switchdev_trans_ph_prepare(trans))
219 return dsa_switch_vlan_prepare_bitmap(ds, vlan, members); 217 return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap);
220 218
221 dsa_switch_vlan_add_bitmap(ds, vlan, members); 219 dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap);
222 220
223 return 0; 221 return 0;
224} 222}
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index ee28440f57c5..fd8faa0dfa61 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -427,13 +427,13 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
427} 427}
428EXPORT_SYMBOL(sysfs_format_mac); 428EXPORT_SYMBOL(sysfs_format_mac);
429 429
430struct sk_buff **eth_gro_receive(struct sk_buff **head, 430struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)
431 struct sk_buff *skb)
432{ 431{
433 struct sk_buff *p, **pp = NULL;
434 struct ethhdr *eh, *eh2;
435 unsigned int hlen, off_eth;
436 const struct packet_offload *ptype; 432 const struct packet_offload *ptype;
433 unsigned int hlen, off_eth;
434 struct sk_buff *pp = NULL;
435 struct ethhdr *eh, *eh2;
436 struct sk_buff *p;
437 __be16 type; 437 __be16 type;
438 int flush = 1; 438 int flush = 1;
439 439
@@ -448,7 +448,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
448 448
449 flush = 0; 449 flush = 0;
450 450
451 for (p = *head; p; p = p->next) { 451 list_for_each_entry(p, head, list) {
452 if (!NAPI_GRO_CB(p)->same_flow) 452 if (!NAPI_GRO_CB(p)->same_flow)
453 continue; 453 continue;
454 454
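
The eth_gro_receive() change above is part of a tree-wide conversion of GRO receive handlers from a singly linked sk_buff chain (struct sk_buff **head, walked with p = p->next) to a struct list_head walked with list_for_each_entry(). The standalone sketch below shows the intrusive-list iteration shape being adopted; the list helpers are simplified reimplementations (using the GNU typeof extension), not the kernel's own.

```c
#include <stddef.h>
#include <stdio.h>

/* Simplified intrusive doubly linked list in the spirit of the kernel's
 * struct list_head that the GRO path now iterates. */
struct list_head {
	struct list_head *next, *prev;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

#define list_for_each_entry(pos, head, member)				\
	for (pos = container_of((head)->next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = container_of(pos->member.next, typeof(*pos), member))

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

/* Stand-in for the held packets a GRO handler scans for a mergeable flow. */
struct pkt {
	int flow_id;
	struct list_head list;	/* linked into the per-NAPI hold list */
};

int main(void)
{
	struct list_head gro_list;
	struct pkt a = { .flow_id = 1 }, b = { .flow_id = 2 }, *p;

	list_init(&gro_list);
	list_add_tail(&a.list, &gro_list);
	list_add_tail(&b.list, &gro_list);

	/* Same shape as list_for_each_entry(p, head, list) in eth_gro_receive(). */
	list_for_each_entry(p, &gro_list, list)
		printf("candidate flow %d\n", p->flow_id);
	return 0;
}
```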
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 275449b0d633..3297e7fa9945 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -90,12 +90,18 @@ static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n)
90 return 0; 90 return 0;
91} 91}
92 92
93static int lowpan_get_iflink(const struct net_device *dev)
94{
95 return lowpan_802154_dev(dev)->wdev->ifindex;
96}
97
93static const struct net_device_ops lowpan_netdev_ops = { 98static const struct net_device_ops lowpan_netdev_ops = {
94 .ndo_init = lowpan_dev_init, 99 .ndo_init = lowpan_dev_init,
95 .ndo_start_xmit = lowpan_xmit, 100 .ndo_start_xmit = lowpan_xmit,
96 .ndo_open = lowpan_open, 101 .ndo_open = lowpan_open,
97 .ndo_stop = lowpan_stop, 102 .ndo_stop = lowpan_stop,
98 .ndo_neigh_construct = lowpan_neigh_construct, 103 .ndo_neigh_construct = lowpan_neigh_construct,
104 .ndo_get_iflink = lowpan_get_iflink,
99}; 105};
100 106
101static void lowpan_setup(struct net_device *ldev) 107static void lowpan_setup(struct net_device *ldev)
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 2cc224106b69..e7857a8ac86d 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -25,7 +25,7 @@
25 25
26#include <net/ieee802154_netdev.h> 26#include <net/ieee802154_netdev.h>
27#include <net/6lowpan.h> 27#include <net/6lowpan.h>
28#include <net/ipv6.h> 28#include <net/ipv6_frag.h>
29#include <net/inet_frag.h> 29#include <net/inet_frag.h>
30 30
31#include "6lowpan_i.h" 31#include "6lowpan_i.h"
@@ -40,9 +40,6 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
40static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) 40static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
41{ 41{
42 const struct frag_lowpan_compare_key *key = a; 42 const struct frag_lowpan_compare_key *key = a;
43 struct lowpan_frag_queue *fq;
44
45 fq = container_of(q, struct lowpan_frag_queue, q);
46 43
47 BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); 44 BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
48 memcpy(&q->key, key, sizeof(*key)); 45 memcpy(&q->key, key, sizeof(*key));
@@ -52,10 +49,8 @@ static void lowpan_frag_expire(struct timer_list *t)
52{ 49{
53 struct inet_frag_queue *frag = from_timer(frag, t, timer); 50 struct inet_frag_queue *frag = from_timer(frag, t, timer);
54 struct frag_queue *fq; 51 struct frag_queue *fq;
55 struct net *net;
56 52
57 fq = container_of(frag, struct frag_queue, q); 53 fq = container_of(frag, struct frag_queue, q);
58 net = container_of(fq->q.net, struct net, ieee802154_lowpan.frags);
59 54
60 spin_lock(&fq->q.lock); 55 spin_lock(&fq->q.lock);
61 56
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
index e6ff5128e61a..ca53efa17be1 100644
--- a/net/ieee802154/6lowpan/tx.c
+++ b/net/ieee802154/6lowpan/tx.c
@@ -265,9 +265,24 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev)
265 /* We must take a copy of the skb before we modify/replace the ipv6 265 /* We must take a copy of the skb before we modify/replace the ipv6
266 * header as the header could be used elsewhere 266 * header as the header could be used elsewhere
267 */ 267 */
268 skb = skb_unshare(skb, GFP_ATOMIC); 268 if (unlikely(skb_headroom(skb) < ldev->needed_headroom ||
269 if (!skb) 269 skb_tailroom(skb) < ldev->needed_tailroom)) {
270 return NET_XMIT_DROP; 270 struct sk_buff *nskb;
271
272 nskb = skb_copy_expand(skb, ldev->needed_headroom,
273 ldev->needed_tailroom, GFP_ATOMIC);
274 if (likely(nskb)) {
275 consume_skb(skb);
276 skb = nskb;
277 } else {
278 kfree_skb(skb);
279 return NET_XMIT_DROP;
280 }
281 } else {
282 skb = skb_unshare(skb, GFP_ATOMIC);
283 if (!skb)
284 return NET_XMIT_DROP;
285 }
271 286
272 ret = lowpan_header(skb, ldev, &dgram_size, &dgram_offset); 287 ret = lowpan_header(skb, ldev, &dgram_size, &dgram_offset);
273 if (ret < 0) { 288 if (ret < 0) {
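
The lowpan_xmit() hunk above checks up front whether the skb already has the headroom and tailroom the device needs, and only copies it into a larger buffer when it does not, instead of always unsharing and reallocating later. A rough userspace analogue of that check-then-copy-expand idea is sketched below; struct buf and ensure_room() are invented names, not kernel APIs.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* A toy packet buffer with explicit head/tail slack, loosely modelled on
 * how an skb keeps headroom and tailroom around its payload. */
struct buf {
	unsigned char *storage;
	size_t headroom;	/* free bytes before the payload */
	size_t len;		/* payload bytes */
	size_t tailroom;	/* free bytes after the payload */
};

/* Make sure @b has at least the requested slack; if it already does,
 * nothing is copied -- the fast path the hunk above adds. */
static int ensure_room(struct buf *b, size_t need_head, size_t need_tail)
{
	unsigned char *bigger;

	if (b->headroom >= need_head && b->tailroom >= need_tail)
		return 0;			/* already roomy enough */

	bigger = malloc(need_head + b->len + need_tail);
	if (!bigger)
		return -1;
	memcpy(bigger + need_head, b->storage + b->headroom, b->len);
	free(b->storage);
	b->storage = bigger;
	b->headroom = need_head;
	b->tailroom = need_tail;
	return 0;
}

int main(void)
{
	struct buf b = { .headroom = 0, .len = 5, .tailroom = 0 };

	b.storage = malloc(b.len);
	if (!b.storage)
		return 1;
	memcpy(b.storage, "hello", b.len);

	/* Pretend link-layer encapsulation wants 10 bytes in front, 2 behind. */
	if (ensure_room(&b, 10, 2))
		return 1;
	printf("headroom=%zu len=%zu tailroom=%zu\n",
	       b.headroom, b.len, b.tailroom);
	free(b.storage);
	return 0;
}
```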
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index cb7176cd4cd6..fe225d9a1877 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -400,4 +400,3 @@ module_exit(wpan_phy_class_exit);
400MODULE_LICENSE("GPL v2"); 400MODULE_LICENSE("GPL v2");
401MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface"); 401MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface");
402MODULE_AUTHOR("Dmitry Eremin-Solenikov"); 402MODULE_AUTHOR("Dmitry Eremin-Solenikov");
403
diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c
index 35c432668454..78f6f1233194 100644
--- a/net/ieee802154/nl_policy.c
+++ b/net/ieee802154/nl_policy.c
@@ -75,4 +75,3 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = {
75 [IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] = { .type = NLA_U8, }, 75 [IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] = { .type = NLA_U8, },
76 [IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] = { .type = NLA_U8, }, 76 [IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] = { .type = NLA_U8, },
77}; 77};
78
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index a0768d2759b8..bc6b912603f1 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -25,6 +25,7 @@
25#include <linux/termios.h> /* For TIOCOUTQ/INQ */ 25#include <linux/termios.h> /* For TIOCOUTQ/INQ */
26#include <linux/list.h> 26#include <linux/list.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/socket.h>
28#include <net/datalink.h> 29#include <net/datalink.h>
29#include <net/psnap.h> 30#include <net/psnap.h>
30#include <net/sock.h> 31#include <net/sock.h>
@@ -423,7 +424,7 @@ static const struct proto_ops ieee802154_raw_ops = {
423 .socketpair = sock_no_socketpair, 424 .socketpair = sock_no_socketpair,
424 .accept = sock_no_accept, 425 .accept = sock_no_accept,
425 .getname = sock_no_getname, 426 .getname = sock_no_getname,
426 .poll_mask = datagram_poll_mask, 427 .poll = datagram_poll,
427 .ioctl = ieee802154_sock_ioctl, 428 .ioctl = ieee802154_sock_ioctl,
428 .listen = sock_no_listen, 429 .listen = sock_no_listen,
429 .shutdown = sock_no_shutdown, 430 .shutdown = sock_no_shutdown,
@@ -452,6 +453,7 @@ struct dgram_sock {
452 unsigned int bound:1; 453 unsigned int bound:1;
453 unsigned int connected:1; 454 unsigned int connected:1;
454 unsigned int want_ack:1; 455 unsigned int want_ack:1;
456 unsigned int want_lqi:1;
455 unsigned int secen:1; 457 unsigned int secen:1;
456 unsigned int secen_override:1; 458 unsigned int secen_override:1;
457 unsigned int seclevel:3; 459 unsigned int seclevel:3;
@@ -486,6 +488,7 @@ static int dgram_init(struct sock *sk)
486 struct dgram_sock *ro = dgram_sk(sk); 488 struct dgram_sock *ro = dgram_sk(sk);
487 489
488 ro->want_ack = 1; 490 ro->want_ack = 1;
491 ro->want_lqi = 0;
489 return 0; 492 return 0;
490} 493}
491 494
@@ -713,6 +716,7 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
713 size_t copied = 0; 716 size_t copied = 0;
714 int err = -EOPNOTSUPP; 717 int err = -EOPNOTSUPP;
715 struct sk_buff *skb; 718 struct sk_buff *skb;
719 struct dgram_sock *ro = dgram_sk(sk);
716 DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name); 720 DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name);
717 721
718 skb = skb_recv_datagram(sk, flags, noblock, &err); 722 skb = skb_recv_datagram(sk, flags, noblock, &err);
@@ -744,6 +748,13 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
744 *addr_len = sizeof(*saddr); 748 *addr_len = sizeof(*saddr);
745 } 749 }
746 750
751 if (ro->want_lqi) {
752 err = put_cmsg(msg, SOL_IEEE802154, WPAN_WANTLQI,
753 sizeof(uint8_t), &(mac_cb(skb)->lqi));
754 if (err)
755 goto done;
756 }
757
747 if (flags & MSG_TRUNC) 758 if (flags & MSG_TRUNC)
748 copied = skb->len; 759 copied = skb->len;
749done: 760done:
@@ -847,6 +858,9 @@ static int dgram_getsockopt(struct sock *sk, int level, int optname,
847 case WPAN_WANTACK: 858 case WPAN_WANTACK:
848 val = ro->want_ack; 859 val = ro->want_ack;
849 break; 860 break;
861 case WPAN_WANTLQI:
862 val = ro->want_lqi;
863 break;
850 case WPAN_SECURITY: 864 case WPAN_SECURITY:
851 if (!ro->secen_override) 865 if (!ro->secen_override)
852 val = WPAN_SECURITY_DEFAULT; 866 val = WPAN_SECURITY_DEFAULT;
@@ -892,6 +906,9 @@ static int dgram_setsockopt(struct sock *sk, int level, int optname,
892 case WPAN_WANTACK: 906 case WPAN_WANTACK:
893 ro->want_ack = !!val; 907 ro->want_ack = !!val;
894 break; 908 break;
909 case WPAN_WANTLQI:
910 ro->want_lqi = !!val;
911 break;
895 case WPAN_SECURITY: 912 case WPAN_SECURITY:
896 if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && 913 if (!ns_capable(net->user_ns, CAP_NET_ADMIN) &&
897 !ns_capable(net->user_ns, CAP_NET_RAW)) { 914 !ns_capable(net->user_ns, CAP_NET_RAW)) {
@@ -969,7 +986,7 @@ static const struct proto_ops ieee802154_dgram_ops = {
969 .socketpair = sock_no_socketpair, 986 .socketpair = sock_no_socketpair,
970 .accept = sock_no_accept, 987 .accept = sock_no_accept,
971 .getname = sock_no_getname, 988 .getname = sock_no_getname,
972 .poll_mask = datagram_poll_mask, 989 .poll = datagram_poll,
973 .ioctl = ieee802154_sock_ioctl, 990 .ioctl = ieee802154_sock_ioctl,
974 .listen = sock_no_listen, 991 .listen = sock_no_listen,
975 .shutdown = sock_no_shutdown, 992 .shutdown = sock_no_shutdown,
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 80dad301361d..32cae39cdff6 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -430,7 +430,7 @@ config INET_DIAG
430 Support for INET (TCP, DCCP, etc) socket monitoring interface used by 430 Support for INET (TCP, DCCP, etc) socket monitoring interface used by
431 native Linux tools such as ss. ss is included in iproute2, currently 431 native Linux tools such as ss. ss is included in iproute2, currently
432 downloadable at: 432 downloadable at:
433 433
434 http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2 434 http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
435 435
436 If unsure, say Y. 436 If unsure, say Y.
@@ -600,7 +600,7 @@ config TCP_CONG_VENO
600 distinguishing to circumvent the difficult judgment of the packet loss 600 distinguishing to circumvent the difficult judgment of the packet loss
601 type. TCP Veno cuts down less congestion window in response to random 601 type. TCP Veno cuts down less congestion window in response to random
602 loss packets. 602 loss packets.
603 See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> 603 See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
604 604
605config TCP_CONG_YEAH 605config TCP_CONG_YEAH
606 tristate "YeAH TCP" 606 tristate "YeAH TCP"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index eec9569ffa5c..7446b98661d8 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
43obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o 43obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
44obj-$(CONFIG_IP_PNP) += ipconfig.o 44obj-$(CONFIG_IP_PNP) += ipconfig.o
45obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ 45obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
46obj-$(CONFIG_INET_DIAG) += inet_diag.o 46obj-$(CONFIG_INET_DIAG) += inet_diag.o
47obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o 47obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
48obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o 48obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
49obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o 49obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 15e125558c76..20fda8fb8ffd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog)
229 err = inet_csk_listen_start(sk, backlog); 229 err = inet_csk_listen_start(sk, backlog);
230 if (err) 230 if (err)
231 goto out; 231 goto out;
232 tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
232 } 233 }
233 sk->sk_max_ack_backlog = backlog; 234 sk->sk_max_ack_backlog = backlog;
234 err = 0; 235 err = 0;
@@ -485,8 +486,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
485 * is temporarily down) 486 * is temporarily down)
486 */ 487 */
487 err = -EADDRNOTAVAIL; 488 err = -EADDRNOTAVAIL;
488 if (!net->ipv4.sysctl_ip_nonlocal_bind && 489 if (!inet_can_nonlocal_bind(net, inet) &&
489 !(inet->freebind || inet->transparent) &&
490 addr->sin_addr.s_addr != htonl(INADDR_ANY) && 490 addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
491 chk_addr_ret != RTN_LOCAL && 491 chk_addr_ret != RTN_LOCAL &&
492 chk_addr_ret != RTN_MULTICAST && 492 chk_addr_ret != RTN_MULTICAST &&
@@ -986,7 +986,7 @@ const struct proto_ops inet_stream_ops = {
986 .socketpair = sock_no_socketpair, 986 .socketpair = sock_no_socketpair,
987 .accept = inet_accept, 987 .accept = inet_accept,
988 .getname = inet_getname, 988 .getname = inet_getname,
989 .poll_mask = tcp_poll_mask, 989 .poll = tcp_poll,
990 .ioctl = inet_ioctl, 990 .ioctl = inet_ioctl,
991 .listen = inet_listen, 991 .listen = inet_listen,
992 .shutdown = inet_shutdown, 992 .shutdown = inet_shutdown,
@@ -1021,7 +1021,7 @@ const struct proto_ops inet_dgram_ops = {
1021 .socketpair = sock_no_socketpair, 1021 .socketpair = sock_no_socketpair,
1022 .accept = sock_no_accept, 1022 .accept = sock_no_accept,
1023 .getname = inet_getname, 1023 .getname = inet_getname,
1024 .poll_mask = udp_poll_mask, 1024 .poll = udp_poll,
1025 .ioctl = inet_ioctl, 1025 .ioctl = inet_ioctl,
1026 .listen = sock_no_listen, 1026 .listen = sock_no_listen,
1027 .shutdown = inet_shutdown, 1027 .shutdown = inet_shutdown,
@@ -1042,7 +1042,7 @@ EXPORT_SYMBOL(inet_dgram_ops);
1042 1042
1043/* 1043/*
1044 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without 1044 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
1045 * udp_poll_mask 1045 * udp_poll
1046 */ 1046 */
1047static const struct proto_ops inet_sockraw_ops = { 1047static const struct proto_ops inet_sockraw_ops = {
1048 .family = PF_INET, 1048 .family = PF_INET,
@@ -1053,7 +1053,7 @@ static const struct proto_ops inet_sockraw_ops = {
1053 .socketpair = sock_no_socketpair, 1053 .socketpair = sock_no_socketpair,
1054 .accept = sock_no_accept, 1054 .accept = sock_no_accept,
1055 .getname = inet_getname, 1055 .getname = inet_getname,
1056 .poll_mask = datagram_poll_mask, 1056 .poll = datagram_poll,
1057 .ioctl = inet_ioctl, 1057 .ioctl = inet_ioctl,
1058 .listen = sock_no_listen, 1058 .listen = sock_no_listen,
1059 .shutdown = inet_shutdown, 1059 .shutdown = inet_shutdown,
@@ -1384,12 +1384,12 @@ out:
1384} 1384}
1385EXPORT_SYMBOL(inet_gso_segment); 1385EXPORT_SYMBOL(inet_gso_segment);
1386 1386
1387struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) 1387struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
1388{ 1388{
1389 const struct net_offload *ops; 1389 const struct net_offload *ops;
1390 struct sk_buff **pp = NULL; 1390 struct sk_buff *pp = NULL;
1391 struct sk_buff *p;
1392 const struct iphdr *iph; 1391 const struct iphdr *iph;
1392 struct sk_buff *p;
1393 unsigned int hlen; 1393 unsigned int hlen;
1394 unsigned int off; 1394 unsigned int off;
1395 unsigned int id; 1395 unsigned int id;
@@ -1425,7 +1425,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
1425 flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); 1425 flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
1426 id >>= 16; 1426 id >>= 16;
1427 1427
1428 for (p = *head; p; p = p->next) { 1428 list_for_each_entry(p, head, list) {
1429 struct iphdr *iph2; 1429 struct iphdr *iph2;
1430 u16 flush_id; 1430 u16 flush_id;
1431 1431
@@ -1505,8 +1505,8 @@ out:
1505} 1505}
1506EXPORT_SYMBOL(inet_gro_receive); 1506EXPORT_SYMBOL(inet_gro_receive);
1507 1507
1508static struct sk_buff **ipip_gro_receive(struct sk_buff **head, 1508static struct sk_buff *ipip_gro_receive(struct list_head *head,
1509 struct sk_buff *skb) 1509 struct sk_buff *skb)
1510{ 1510{
1511 if (NAPI_GRO_CB(skb)->encap_mark) { 1511 if (NAPI_GRO_CB(skb)->encap_mark) {
1512 NAPI_GRO_CB(skb)->flush = 1; 1512 NAPI_GRO_CB(skb)->flush = 1;
@@ -1801,6 +1801,7 @@ static __net_init int inet_init_net(struct net *net)
1801 * We set them here, in case sysctl is not compiled. 1801 * We set them here, in case sysctl is not compiled.
1802 */ 1802 */
1803 net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; 1803 net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
1804 net->ipv4.sysctl_ip_fwd_update_priority = 1;
1804 net->ipv4.sysctl_ip_dynaddr = 0; 1805 net->ipv4.sysctl_ip_dynaddr = 0;
1805 net->ipv4.sysctl_ip_early_demux = 1; 1806 net->ipv4.sysctl_ip_early_demux = 1;
1806 net->ipv4.sysctl_udp_early_demux = 1; 1807 net->ipv4.sysctl_udp_early_demux = 1;
@@ -1882,6 +1883,7 @@ fs_initcall(ipv4_offload_init);
1882static struct packet_type ip_packet_type __read_mostly = { 1883static struct packet_type ip_packet_type __read_mostly = {
1883 .type = cpu_to_be16(ETH_P_IP), 1884 .type = cpu_to_be16(ETH_P_IP),
1884 .func = ip_rcv, 1885 .func = ip_rcv,
1886 .list_func = ip_list_rcv,
1885}; 1887};
1886 1888
1887static int __init inet_init(void) 1889static int __init inet_init(void)
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
index ce262d76cc48..e9e42f99725e 100644
--- a/net/ipv4/bpfilter/Makefile
+++ b/net/ipv4/bpfilter/Makefile
@@ -1,2 +1 @@
1obj-$(CONFIG_BPFILTER) += sockopt.o obj-$(CONFIG_BPFILTER) += sockopt.o
2
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab1a77a..ea4bd8a52422 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
1827 size += nla_total_size(4); 1827 size += nla_total_size(4);
1828 if (all || type == NETCONFA_MC_FORWARDING) 1828 if (all || type == NETCONFA_MC_FORWARDING)
1829 size += nla_total_size(4); 1829 size += nla_total_size(4);
1830 if (all || type == NETCONFA_BC_FORWARDING)
1831 size += nla_total_size(4);
1830 if (all || type == NETCONFA_PROXY_NEIGH) 1832 if (all || type == NETCONFA_PROXY_NEIGH)
1831 size += nla_total_size(4); 1833 size += nla_total_size(4);
1832 if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) 1834 if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
@@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
1873 nla_put_s32(skb, NETCONFA_MC_FORWARDING, 1875 nla_put_s32(skb, NETCONFA_MC_FORWARDING,
1874 IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) 1876 IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
1875 goto nla_put_failure; 1877 goto nla_put_failure;
1878 if ((all || type == NETCONFA_BC_FORWARDING) &&
1879 nla_put_s32(skb, NETCONFA_BC_FORWARDING,
1880 IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
1881 goto nla_put_failure;
1876 if ((all || type == NETCONFA_PROXY_NEIGH) && 1882 if ((all || type == NETCONFA_PROXY_NEIGH) &&
1877 nla_put_s32(skb, NETCONFA_PROXY_NEIGH, 1883 nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
1878 IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) 1884 IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
@@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
2143 if ((new_value == 0) && (old_value != 0)) 2149 if ((new_value == 0) && (old_value != 0))
2144 rt_cache_flush(net); 2150 rt_cache_flush(net);
2145 2151
2152 if (i == IPV4_DEVCONF_BC_FORWARDING - 1 &&
2153 new_value != old_value)
2154 rt_cache_flush(net);
2155
2146 if (i == IPV4_DEVCONF_RP_FILTER - 1 && 2156 if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
2147 new_value != old_value) { 2157 new_value != old_value) {
2148 ifindex = devinet_conf_ifindex(net, cnf); 2158 ifindex = devinet_conf_ifindex(net, cnf);
@@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table {
2259 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", 2269 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
2260 devinet_sysctl_forward), 2270 devinet_sysctl_forward),
2261 DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), 2271 DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
2272 DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
2262 2273
2263 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), 2274 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
2264 DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), 2275 DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 7cf755ef9efb..58834a10c0be 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -28,8 +28,8 @@
28#include <linux/spinlock.h> 28#include <linux/spinlock.h>
29#include <net/udp.h> 29#include <net/udp.h>
30 30
31static struct sk_buff **esp4_gro_receive(struct sk_buff **head, 31static struct sk_buff *esp4_gro_receive(struct list_head *head,
32 struct sk_buff *skb) 32 struct sk_buff *skb)
33{ 33{
34 int offset = skb_gro_offset(skb); 34 int offset = skb_gro_offset(skb);
35 struct xfrm_offload *xo; 35 struct xfrm_offload *xo;
@@ -135,8 +135,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
135 135
136 skb->encap_hdr_csum = 1; 136 skb->encap_hdr_csum = 1;
137 137
138 if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || 138 if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
139 (x->xso.dev != skb->dev))
140 esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); 139 esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
141 else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) 140 else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
142 esp_features = features & ~NETIF_F_CSUM_MASK; 141 esp_features = features & ~NETIF_F_CSUM_MASK;
@@ -179,8 +178,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
179 if (!xo) 178 if (!xo)
180 return -EINVAL; 179 return -EINVAL;
181 180
182 if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || 181 if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
183 (x->xso.dev != skb->dev)) {
184 xo->flags |= CRYPTO_FALLBACK; 182 xo->flags |= CRYPTO_FALLBACK;
185 hw_offload = false; 183 hw_offload = false;
186 } 184 }
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b21833651394..2998b0e47d4b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -292,18 +292,19 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
292 return ip_hdr(skb)->daddr; 292 return ip_hdr(skb)->daddr;
293 293
294 in_dev = __in_dev_get_rcu(dev); 294 in_dev = __in_dev_get_rcu(dev);
295 BUG_ON(!in_dev);
296 295
297 net = dev_net(dev); 296 net = dev_net(dev);
298 297
299 scope = RT_SCOPE_UNIVERSE; 298 scope = RT_SCOPE_UNIVERSE;
300 if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { 299 if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
300 bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
301 struct flowi4 fl4 = { 301 struct flowi4 fl4 = {
302 .flowi4_iif = LOOPBACK_IFINDEX, 302 .flowi4_iif = LOOPBACK_IFINDEX,
303 .flowi4_oif = l3mdev_master_ifindex_rcu(dev),
303 .daddr = ip_hdr(skb)->saddr, 304 .daddr = ip_hdr(skb)->saddr,
304 .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), 305 .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
305 .flowi4_scope = scope, 306 .flowi4_scope = scope,
306 .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0, 307 .flowi4_mark = vmark ? skb->mark : 0,
307 }; 308 };
308 if (!fib_lookup(net, &fl4, &res, 0)) 309 if (!fib_lookup(net, &fl4, &res, 0))
309 return FIB_RES_PREFSRC(net, res); 310 return FIB_RES_PREFSRC(net, res);
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 1540db65241a..500a59906b87 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -224,14 +224,14 @@ drop:
224 return 0; 224 return 0;
225} 225}
226 226
227static struct sk_buff **fou_gro_receive(struct sock *sk, 227static struct sk_buff *fou_gro_receive(struct sock *sk,
228 struct sk_buff **head, 228 struct list_head *head,
229 struct sk_buff *skb) 229 struct sk_buff *skb)
230{ 230{
231 const struct net_offload *ops;
232 struct sk_buff **pp = NULL;
233 u8 proto = fou_from_sock(sk)->protocol; 231 u8 proto = fou_from_sock(sk)->protocol;
234 const struct net_offload **offloads; 232 const struct net_offload **offloads;
233 const struct net_offload *ops;
234 struct sk_buff *pp = NULL;
235 235
236 /* We can clear the encap_mark for FOU as we are essentially doing 236 /* We can clear the encap_mark for FOU as we are essentially doing
237 * one of two possible things. We are either adding an L4 tunnel 237 * one of two possible things. We are either adding an L4 tunnel
@@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
305 return guehdr; 305 return guehdr;
306} 306}
307 307
308static struct sk_buff **gue_gro_receive(struct sock *sk, 308static struct sk_buff *gue_gro_receive(struct sock *sk,
309 struct sk_buff **head, 309 struct list_head *head,
310 struct sk_buff *skb) 310 struct sk_buff *skb)
311{ 311{
312 const struct net_offload **offloads; 312 const struct net_offload **offloads;
313 const struct net_offload *ops; 313 const struct net_offload *ops;
314 struct sk_buff **pp = NULL; 314 struct sk_buff *pp = NULL;
315 struct sk_buff *p; 315 struct sk_buff *p;
316 struct guehdr *guehdr; 316 struct guehdr *guehdr;
317 size_t len, optlen, hdrlen, off; 317 size_t len, optlen, hdrlen, off;
@@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk,
397 397
398 skb_gro_pull(skb, hdrlen); 398 skb_gro_pull(skb, hdrlen);
399 399
400 for (p = *head; p; p = p->next) { 400 list_for_each_entry(p, head, list) {
401 const struct guehdr *guehdr2; 401 const struct guehdr *guehdr2;
402 402
403 if (!NAPI_GRO_CB(p)->same_flow) 403 if (!NAPI_GRO_CB(p)->same_flow)
@@ -448,9 +448,7 @@ next_proto:
448out_unlock: 448out_unlock:
449 rcu_read_unlock(); 449 rcu_read_unlock();
450out: 450out:
451 NAPI_GRO_CB(skb)->flush |= flush; 451 skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
452 skb_gro_remcsum_cleanup(skb, &grc);
453 skb->remcsum_offload = 0;
454 452
455 return pp; 453 return pp;
456} 454}
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 1859c473b21a..6c63524f598a 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -108,10 +108,10 @@ out:
108 return segs; 108 return segs;
109} 109}
110 110
111static struct sk_buff **gre_gro_receive(struct sk_buff **head, 111static struct sk_buff *gre_gro_receive(struct list_head *head,
112 struct sk_buff *skb) 112 struct sk_buff *skb)
113{ 113{
114 struct sk_buff **pp = NULL; 114 struct sk_buff *pp = NULL;
115 struct sk_buff *p; 115 struct sk_buff *p;
116 const struct gre_base_hdr *greh; 116 const struct gre_base_hdr *greh;
117 unsigned int hlen, grehlen; 117 unsigned int hlen, grehlen;
@@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
182 null_compute_pseudo); 182 null_compute_pseudo);
183 } 183 }
184 184
185 for (p = *head; p; p = p->next) { 185 list_for_each_entry(p, head, list) {
186 const struct gre_base_hdr *greh2; 186 const struct gre_base_hdr *greh2;
187 187
188 if (!NAPI_GRO_CB(p)->same_flow) 188 if (!NAPI_GRO_CB(p)->same_flow)
@@ -223,7 +223,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
223out_unlock: 223out_unlock:
224 rcu_read_unlock(); 224 rcu_read_unlock();
225out: 225out:
226 NAPI_GRO_CB(skb)->flush |= flush; 226 skb_gro_flush_final(skb, pp, flush);
227 227
228 return pp; 228 return pp;
229} 229}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1617604c9284..695979b7ef6d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -429,14 +429,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
429 429
430 icmp_param->data.icmph.checksum = 0; 430 icmp_param->data.icmph.checksum = 0;
431 431
432 ipcm_init(&ipc);
432 inet->tos = ip_hdr(skb)->tos; 433 inet->tos = ip_hdr(skb)->tos;
433 sk->sk_mark = mark; 434 sk->sk_mark = mark;
434 daddr = ipc.addr = ip_hdr(skb)->saddr; 435 daddr = ipc.addr = ip_hdr(skb)->saddr;
435 saddr = fib_compute_spec_dst(skb); 436 saddr = fib_compute_spec_dst(skb);
436 ipc.opt = NULL;
437 ipc.tx_flags = 0;
438 ipc.ttl = 0;
439 ipc.tos = -1;
440 437
441 if (icmp_param->replyopts.opt.opt.optlen) { 438 if (icmp_param->replyopts.opt.opt.optlen) {
442 ipc.opt = &icmp_param->replyopts.opt; 439 ipc.opt = &icmp_param->replyopts.opt;
@@ -710,11 +707,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
710 icmp_param.offset = skb_network_offset(skb_in); 707 icmp_param.offset = skb_network_offset(skb_in);
711 inet_sk(sk)->tos = tos; 708 inet_sk(sk)->tos = tos;
712 sk->sk_mark = mark; 709 sk->sk_mark = mark;
710 ipcm_init(&ipc);
713 ipc.addr = iph->saddr; 711 ipc.addr = iph->saddr;
714 ipc.opt = &icmp_param.replyopts.opt; 712 ipc.opt = &icmp_param.replyopts.opt;
715 ipc.tx_flags = 0;
716 ipc.ttl = 0;
717 ipc.tos = -1;
718 713
719 rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, 714 rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
720 type, code, &icmp_param); 715 type, code, &icmp_param);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 85b617b655bc..cf75f8944b05 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1200,13 +1200,13 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
1200 spin_lock_bh(&im->lock); 1200 spin_lock_bh(&im->lock);
1201 if (pmc) { 1201 if (pmc) {
1202 im->interface = pmc->interface; 1202 im->interface = pmc->interface;
1203 im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; 1203 if (im->sfmode == MCAST_INCLUDE) {
1204 im->sfmode = pmc->sfmode;
1205 if (pmc->sfmode == MCAST_INCLUDE) {
1206 im->tomb = pmc->tomb; 1204 im->tomb = pmc->tomb;
1207 im->sources = pmc->sources; 1205 im->sources = pmc->sources;
1208 for (psf = im->sources; psf; psf = psf->sf_next) 1206 for (psf = im->sources; psf; psf = psf->sf_next)
1209 psf->sf_crcount = im->crcount; 1207 psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
1208 } else {
1209 im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
1210 } 1210 }
1211 in_dev_put(pmc->interface); 1211 in_dev_put(pmc->interface);
1212 kfree(pmc); 1212 kfree(pmc);
@@ -1316,7 +1316,13 @@ static void igmp_group_added(struct ip_mc_list *im)
1316 } 1316 }
1317 /* else, v3 */ 1317 /* else, v3 */
1318 1318
1319 im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; 1319 /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should
1320 * not send filter-mode change record as the mode should be from
1321 * IN() to IN(A).
1322 */
1323 if (im->sfmode == MCAST_EXCLUDE)
1324 im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
1325
1320 igmp_ifc_event(in_dev); 1326 igmp_ifc_event(in_dev);
1321#endif 1327#endif
1322} 1328}
@@ -1381,8 +1387,8 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
1381/* 1387/*
1382 * A socket has joined a multicast group on device dev. 1388 * A socket has joined a multicast group on device dev.
1383 */ 1389 */
1384 1390static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
1385void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) 1391 unsigned int mode)
1386{ 1392{
1387 struct ip_mc_list *im; 1393 struct ip_mc_list *im;
1388#ifdef CONFIG_IP_MULTICAST 1394#ifdef CONFIG_IP_MULTICAST
@@ -1394,7 +1400,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1394 for_each_pmc_rtnl(in_dev, im) { 1400 for_each_pmc_rtnl(in_dev, im) {
1395 if (im->multiaddr == addr) { 1401 if (im->multiaddr == addr) {
1396 im->users++; 1402 im->users++;
1397 ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); 1403 ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0);
1398 goto out; 1404 goto out;
1399 } 1405 }
1400 } 1406 }
@@ -1408,8 +1414,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1408 in_dev_hold(in_dev); 1414 in_dev_hold(in_dev);
1409 im->multiaddr = addr; 1415 im->multiaddr = addr;
1410 /* initial mode is (EX, empty) */ 1416 /* initial mode is (EX, empty) */
1411 im->sfmode = MCAST_EXCLUDE; 1417 im->sfmode = mode;
1412 im->sfcount[MCAST_EXCLUDE] = 1; 1418 im->sfcount[mode] = 1;
1413 refcount_set(&im->refcnt, 1); 1419 refcount_set(&im->refcnt, 1);
1414 spin_lock_init(&im->lock); 1420 spin_lock_init(&im->lock);
1415#ifdef CONFIG_IP_MULTICAST 1421#ifdef CONFIG_IP_MULTICAST
@@ -1432,6 +1438,11 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1432out: 1438out:
1433 return; 1439 return;
1434} 1440}
1441
1442void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1443{
1444 __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
1445}
1435EXPORT_SYMBOL(ip_mc_inc_group); 1446EXPORT_SYMBOL(ip_mc_inc_group);
1436 1447
1437static int ip_mc_check_iphdr(struct sk_buff *skb) 1448static int ip_mc_check_iphdr(struct sk_buff *skb)
@@ -2130,8 +2141,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
2130 2141
2131/* Join a multicast group 2142/* Join a multicast group
2132 */ 2143 */
2133 2144static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
2134int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) 2145 unsigned int mode)
2135{ 2146{
2136 __be32 addr = imr->imr_multiaddr.s_addr; 2147 __be32 addr = imr->imr_multiaddr.s_addr;
2137 struct ip_mc_socklist *iml, *i; 2148 struct ip_mc_socklist *iml, *i;
@@ -2172,15 +2183,30 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
2172 memcpy(&iml->multi, imr, sizeof(*imr)); 2183 memcpy(&iml->multi, imr, sizeof(*imr));
2173 iml->next_rcu = inet->mc_list; 2184 iml->next_rcu = inet->mc_list;
2174 iml->sflist = NULL; 2185 iml->sflist = NULL;
2175 iml->sfmode = MCAST_EXCLUDE; 2186 iml->sfmode = mode;
2176 rcu_assign_pointer(inet->mc_list, iml); 2187 rcu_assign_pointer(inet->mc_list, iml);
2177 ip_mc_inc_group(in_dev, addr); 2188 __ip_mc_inc_group(in_dev, addr, mode);
2178 err = 0; 2189 err = 0;
2179done: 2190done:
2180 return err; 2191 return err;
2181} 2192}
2193
2194/* Join ASM (Any-Source Multicast) group
2195 */
2196int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
2197{
2198 return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE);
2199}
2182EXPORT_SYMBOL(ip_mc_join_group); 2200EXPORT_SYMBOL(ip_mc_join_group);
2183 2201
2202/* Join SSM (Source-Specific Multicast) group
2203 */
2204int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
2205 unsigned int mode)
2206{
2207 return __ip_mc_join_group(sk, imr, mode);
2208}
2209
2184static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, 2210static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
2185 struct in_device *in_dev) 2211 struct in_device *in_dev)
2186{ 2212{
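
The igmp.c hunks above thread a filter mode through a shared static helper so that ip_mc_join_group() keeps its historical ASM behaviour (EXCLUDE mode) while the new ip_mc_join_group_ssm() can request an INCLUDE-mode join. A compressed sketch of that wrapper-over-common-helper shape follows; the join_group* names are placeholders, not the kernel functions.

```c
#include <stdio.h>

enum filter_mode { MCAST_INCLUDE = 1, MCAST_EXCLUDE = 2 };

/* All the real work lives in one helper parameterized by the initial
 * filter mode, as __ip_mc_join_group() is in the hunk above. */
static int join_group_common(unsigned int group, enum filter_mode mode)
{
	printf("join group %#x, initial mode %s\n", group,
	       mode == MCAST_EXCLUDE ? "EXCLUDE (ASM)" : "INCLUDE (SSM)");
	return 0;
}

/* Any-Source Multicast join: historical default, EXCLUDE with empty list. */
static int join_group(unsigned int group)
{
	return join_group_common(group, MCAST_EXCLUDE);
}

/* Source-Specific Multicast join: the caller picks the mode explicitly. */
static int join_group_ssm(unsigned int group, enum filter_mode mode)
{
	return join_group_common(group, mode);
}

int main(void)
{
	join_group(0xe0000101);				/* 224.0.1.1, ASM  */
	join_group_ssm(0xe8000001, MCAST_INCLUDE);	/* 232.0.0.1, SSM  */
	return 0;
}
```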
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 33a88e045efd..dfd5009f96ef 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
107} 107}
108EXPORT_SYMBOL(inet_rcv_saddr_equal); 108EXPORT_SYMBOL(inet_rcv_saddr_equal);
109 109
110bool inet_rcv_saddr_any(const struct sock *sk)
111{
112#if IS_ENABLED(CONFIG_IPV6)
113 if (sk->sk_family == AF_INET6)
114 return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
115#endif
116 return !sk->sk_rcv_saddr;
117}
118
110void inet_get_local_port_range(struct net *net, int *low, int *high) 119void inet_get_local_port_range(struct net *net, int *low, int *high)
111{ 120{
112 unsigned int seq; 121 unsigned int seq;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9e35b81d093..bcb11f3a27c0 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -20,6 +20,7 @@
20#include <linux/skbuff.h> 20#include <linux/skbuff.h>
21#include <linux/rtnetlink.h> 21#include <linux/rtnetlink.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/rhashtable.h>
23 24
24#include <net/sock.h> 25#include <net/sock.h>
25#include <net/inet_frag.h> 26#include <net/inet_frag.h>
@@ -90,7 +91,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
90 91
91void inet_frags_exit_net(struct netns_frags *nf) 92void inet_frags_exit_net(struct netns_frags *nf)
92{ 93{
93 nf->low_thresh = 0; /* prevent creation of new frags */ 94 nf->high_thresh = 0; /* prevent creation of new frags */
94 95
95 rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); 96 rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
96} 97}
@@ -136,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
136 fp = q->fragments; 137 fp = q->fragments;
137 nf = q->net; 138 nf = q->net;
138 f = nf->f; 139 f = nf->f;
139 while (fp) { 140 if (fp) {
140 struct sk_buff *xp = fp->next; 141 do {
141 142 struct sk_buff *xp = fp->next;
142 sum_truesize += fp->truesize; 143
143 kfree_skb(fp); 144 sum_truesize += fp->truesize;
144 fp = xp; 145 kfree_skb(fp);
146 fp = xp;
147 } while (fp);
148 } else {
149 sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
145 } 150 }
146 sum = sum_truesize + f->qsize; 151 sum = sum_truesize + f->qsize;
147 152
@@ -157,9 +162,6 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
157{ 162{
158 struct inet_frag_queue *q; 163 struct inet_frag_queue *q;
159 164
160 if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
161 return NULL;
162
163 q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); 165 q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
164 if (!q) 166 if (!q)
165 return NULL; 167 return NULL;
@@ -204,6 +206,9 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
204{ 206{
205 struct inet_frag_queue *fq; 207 struct inet_frag_queue *fq;
206 208
209 if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
210 return NULL;
211
207 rcu_read_lock(); 212 rcu_read_lock();
208 213
209 fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); 214 fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 3647167c8fa3..f5c9ef2586de 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -328,7 +328,7 @@ struct sock *__inet_lookup_listener(struct net *net,
328 saddr, sport, daddr, hnum, 328 saddr, sport, daddr, hnum,
329 dif, sdif); 329 dif, sdif);
330 if (result) 330 if (result)
331 return result; 331 goto done;
332 332
333 /* Lookup lhash2 with INADDR_ANY */ 333 /* Lookup lhash2 with INADDR_ANY */
334 334
@@ -337,9 +337,10 @@ struct sock *__inet_lookup_listener(struct net *net,
337 if (ilb2->count > ilb->count) 337 if (ilb2->count > ilb->count)
338 goto port_lookup; 338 goto port_lookup;
339 339
340 return inet_lhash2_lookup(net, ilb2, skb, doff, 340 result = inet_lhash2_lookup(net, ilb2, skb, doff,
341 saddr, sport, daddr, hnum, 341 saddr, sport, daddr, hnum,
342 dif, sdif); 342 dif, sdif);
343 goto done;
343 344
344port_lookup: 345port_lookup:
345 sk_for_each_rcu(sk, &ilb->head) { 346 sk_for_each_rcu(sk, &ilb->head) {
@@ -352,12 +353,15 @@ port_lookup:
352 result = reuseport_select_sock(sk, phash, 353 result = reuseport_select_sock(sk, phash,
353 skb, doff); 354 skb, doff);
354 if (result) 355 if (result)
355 return result; 356 goto done;
356 } 357 }
357 result = sk; 358 result = sk;
358 hiscore = score; 359 hiscore = score;
359 } 360 }
360 } 361 }
362done:
363 if (unlikely(IS_ERR(result)))
364 return NULL;
361 return result; 365 return result;
362} 366}
363EXPORT_SYMBOL_GPL(__inet_lookup_listener); 367EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -567,10 +571,11 @@ static int inet_reuseport_add_sock(struct sock *sk,
567 inet_csk(sk2)->icsk_bind_hash == tb && 571 inet_csk(sk2)->icsk_bind_hash == tb &&
568 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 572 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
569 inet_rcv_saddr_equal(sk, sk2, false)) 573 inet_rcv_saddr_equal(sk, sk2, false))
570 return reuseport_add_sock(sk, sk2); 574 return reuseport_add_sock(sk, sk2,
575 inet_rcv_saddr_any(sk));
571 } 576 }
572 577
573 return reuseport_alloc(sk); 578 return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
574} 579}
575 580
576int __inet_hash(struct sock *sk, struct sock *osk) 581int __inet_hash(struct sock *sk, struct sock *osk)
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index b54b948b0596..32662e9e5d21 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -143,7 +143,8 @@ int ip_forward(struct sk_buff *skb)
143 !skb_sec_path(skb)) 143 !skb_sec_path(skb))
144 ip_rt_send_redirect(skb); 144 ip_rt_send_redirect(skb);
145 145
146 skb->priority = rt_tos2priority(iph->tos); 146 if (net->ipv4.sysctl_ip_fwd_update_priority)
147 skb->priority = rt_tos2priority(iph->tos);
147 148
148 return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, 149 return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
149 net, NULL, skb, skb->dev, rt->dst.dev, 150 net, NULL, skb, skb->dev, rt->dst.dev,
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8e9528ebaa8e..88281fbce88c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -57,6 +57,57 @@
57 */ 57 */
58static const char ip_frag_cache_name[] = "ip4-frags"; 58static const char ip_frag_cache_name[] = "ip4-frags";
59 59
60/* Use skb->cb to track consecutive/adjacent fragments coming at
61 * the end of the queue. Nodes in the rb-tree queue will
62 * contain "runs" of one or more adjacent fragments.
63 *
64 * Invariants:
65 * - next_frag is NULL at the tail of a "run";
66 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
67 */
68struct ipfrag_skb_cb {
69 struct inet_skb_parm h;
70 struct sk_buff *next_frag;
71 int frag_run_len;
72};
73
74#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
75
76static void ip4_frag_init_run(struct sk_buff *skb)
77{
78 BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
79
80 FRAG_CB(skb)->next_frag = NULL;
81 FRAG_CB(skb)->frag_run_len = skb->len;
82}
83
84/* Append skb to the last "run". */
85static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
86 struct sk_buff *skb)
87{
88 RB_CLEAR_NODE(&skb->rbnode);
89 FRAG_CB(skb)->next_frag = NULL;
90
91 FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
92 FRAG_CB(q->fragments_tail)->next_frag = skb;
93 q->fragments_tail = skb;
94}
95
96/* Create a new "run" with the skb. */
97static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
98{
99 if (q->last_run_head)
100 rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
101 &q->last_run_head->rbnode.rb_right);
102 else
103 rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
104 rb_insert_color(&skb->rbnode, &q->rb_fragments);
105
106 ip4_frag_init_run(skb);
107 q->fragments_tail = skb;
108 q->last_run_head = skb;
109}
110
60/* Describe an entry in the "incomplete datagrams" queue. */ 111/* Describe an entry in the "incomplete datagrams" queue. */
61struct ipq { 112struct ipq {
62 struct inet_frag_queue q; 113 struct inet_frag_queue q;
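
The ipfrag_skb_cb comment in the hunk above introduces the key bookkeeping for the rbtree-based reassembly queue: consecutive fragments are grouped into "runs", only the head of a run sits in the tree, and frag_run_len caches the run's total length. The userspace sketch below shows just the append-or-start-new-run decision for the tail of the queue, deliberately ignoring the rb-tree and the overlap-discard policy; the frag/run_tail identifiers are invented.

```c
#include <stdio.h>

/* A received fragment: offset within the datagram plus payload length. */
struct frag {
	unsigned int offset;
	unsigned int len;
	struct frag *next_frag;		/* next member of the same run */
};

/* State for the last ("tail") run, mirroring q->last_run_head and
 * q->fragments_tail in the hunk above. */
struct run_tail {
	struct frag *run_head;		/* would be the rb-tree node */
	struct frag *last;		/* last fragment appended */
	unsigned int run_len;		/* cached total length of the run */
};

/* Append @f if it continues the tail run exactly; otherwise start a new
 * run (in the kernel this is where a new rb-tree node gets linked). */
static void queue_fragment(struct run_tail *t, struct frag *f)
{
	f->next_frag = NULL;
	if (t->run_head && f->offset == t->last->offset + t->last->len) {
		t->last->next_frag = f;
		t->last = f;
		t->run_len += f->len;
	} else {
		t->run_head = t->last = f;
		t->run_len = f->len;
	}
}

int main(void)
{
	struct frag a = { .offset = 0,    .len = 1200 };
	struct frag b = { .offset = 1200, .len = 1200 };	/* contiguous */
	struct frag c = { .offset = 4800, .len = 1200 };	/* gap: new run */
	struct run_tail t = { 0 };

	queue_fragment(&t, &a);
	queue_fragment(&t, &b);
	printf("tail run length after a+b: %u\n", t.run_len);	/* 2400 */
	queue_fragment(&t, &c);
	printf("tail run length after c:   %u\n", t.run_len);	/* 1200 */
	return 0;
}
```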
@@ -75,8 +126,8 @@ static u8 ip4_frag_ecn(u8 tos)
75 126
76static struct inet_frags ip4_frags; 127static struct inet_frags ip4_frags;
77 128
78static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, 129static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
79 struct net_device *dev); 130 struct sk_buff *prev_tail, struct net_device *dev);
80 131
81 132
82static void ip4_frag_init(struct inet_frag_queue *q, const void *a) 133static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -136,7 +187,7 @@ static void ip_expire(struct timer_list *t)
136{ 187{
137 struct inet_frag_queue *frag = from_timer(frag, t, timer); 188 struct inet_frag_queue *frag = from_timer(frag, t, timer);
138 const struct iphdr *iph; 189 const struct iphdr *iph;
139 struct sk_buff *head; 190 struct sk_buff *head = NULL;
140 struct net *net; 191 struct net *net;
141 struct ipq *qp; 192 struct ipq *qp;
142 int err; 193 int err;
@@ -152,14 +203,36 @@ static void ip_expire(struct timer_list *t)
152 203
153 ipq_kill(qp); 204 ipq_kill(qp);
154 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); 205 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
155
156 head = qp->q.fragments;
157
158 __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); 206 __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
159 207
160 if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) 208 if (!(qp->q.flags & INET_FRAG_FIRST_IN))
161 goto out; 209 goto out;
162 210
211 /* sk_buff::dev and sk_buff::rbnode are unionized. So we
212 * pull the head out of the tree in order to be able to
213 * deal with head->dev.
214 */
215 if (qp->q.fragments) {
216 head = qp->q.fragments;
217 qp->q.fragments = head->next;
218 } else {
219 head = skb_rb_first(&qp->q.rb_fragments);
220 if (!head)
221 goto out;
222 if (FRAG_CB(head)->next_frag)
223 rb_replace_node(&head->rbnode,
224 &FRAG_CB(head)->next_frag->rbnode,
225 &qp->q.rb_fragments);
226 else
227 rb_erase(&head->rbnode, &qp->q.rb_fragments);
228 memset(&head->rbnode, 0, sizeof(head->rbnode));
229 barrier();
230 }
231 if (head == qp->q.fragments_tail)
232 qp->q.fragments_tail = NULL;
233
234 sub_frag_mem_limit(qp->q.net, head->truesize);
235
163 head->dev = dev_get_by_index_rcu(net, qp->iif); 236 head->dev = dev_get_by_index_rcu(net, qp->iif);
164 if (!head->dev) 237 if (!head->dev)
165 goto out; 238 goto out;
@@ -179,16 +252,16 @@ static void ip_expire(struct timer_list *t)
179 (skb_rtable(head)->rt_type != RTN_LOCAL)) 252 (skb_rtable(head)->rt_type != RTN_LOCAL))
180 goto out; 253 goto out;
181 254
182 skb_get(head);
183 spin_unlock(&qp->q.lock); 255 spin_unlock(&qp->q.lock);
184 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); 256 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
185 kfree_skb(head);
186 goto out_rcu_unlock; 257 goto out_rcu_unlock;
187 258
188out: 259out:
189 spin_unlock(&qp->q.lock); 260 spin_unlock(&qp->q.lock);
190out_rcu_unlock: 261out_rcu_unlock:
191 rcu_read_unlock(); 262 rcu_read_unlock();
263 if (head)
264 kfree_skb(head);
192 ipq_put(qp); 265 ipq_put(qp);
193} 266}
194 267
@@ -231,7 +304,7 @@ static int ip_frag_too_far(struct ipq *qp)
231 end = atomic_inc_return(&peer->rid); 304 end = atomic_inc_return(&peer->rid);
232 qp->rid = end; 305 qp->rid = end;
233 306
234 rc = qp->q.fragments && (end - start) > max; 307 rc = qp->q.fragments_tail && (end - start) > max;
235 308
236 if (rc) { 309 if (rc) {
237 struct net *net; 310 struct net *net;
@@ -245,7 +318,6 @@ static int ip_frag_too_far(struct ipq *qp)
245 318
246static int ip_frag_reinit(struct ipq *qp) 319static int ip_frag_reinit(struct ipq *qp)
247{ 320{
248 struct sk_buff *fp;
249 unsigned int sum_truesize = 0; 321 unsigned int sum_truesize = 0;
250 322
251 if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { 323 if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@@ -253,21 +325,16 @@ static int ip_frag_reinit(struct ipq *qp)
253 return -ETIMEDOUT; 325 return -ETIMEDOUT;
254 } 326 }
255 327
256 fp = qp->q.fragments; 328 sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
257 do {
258 struct sk_buff *xp = fp->next;
259
260 sum_truesize += fp->truesize;
261 kfree_skb(fp);
262 fp = xp;
263 } while (fp);
264 sub_frag_mem_limit(qp->q.net, sum_truesize); 329 sub_frag_mem_limit(qp->q.net, sum_truesize);
265 330
266 qp->q.flags = 0; 331 qp->q.flags = 0;
267 qp->q.len = 0; 332 qp->q.len = 0;
268 qp->q.meat = 0; 333 qp->q.meat = 0;
269 qp->q.fragments = NULL; 334 qp->q.fragments = NULL;
335 qp->q.rb_fragments = RB_ROOT;
270 qp->q.fragments_tail = NULL; 336 qp->q.fragments_tail = NULL;
337 qp->q.last_run_head = NULL;
271 qp->iif = 0; 338 qp->iif = 0;
272 qp->ecn = 0; 339 qp->ecn = 0;
273 340
@@ -277,7 +344,9 @@ static int ip_frag_reinit(struct ipq *qp)
277/* Add new segment to existing queue. */ 344/* Add new segment to existing queue. */
278static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) 345static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
279{ 346{
280 struct sk_buff *prev, *next; 347 struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
348 struct rb_node **rbn, *parent;
349 struct sk_buff *skb1, *prev_tail;
281 struct net_device *dev; 350 struct net_device *dev;
282 unsigned int fragsize; 351 unsigned int fragsize;
283 int flags, offset; 352 int flags, offset;
@@ -340,95 +409,61 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
340 if (err) 409 if (err)
341 goto err; 410 goto err;
342 411
343 /* Find out which fragments are in front and at the back of us 412 /* Note : skb->rbnode and skb->dev share the same location. */
344 * in the chain of fragments so far. We must know where to put 413 dev = skb->dev;
345 * this fragment, right? 414 /* Makes sure compiler won't do silly aliasing games */
346 */ 415 barrier();
347 prev = qp->q.fragments_tail;
348 if (!prev || prev->ip_defrag_offset < offset) {
349 next = NULL;
350 goto found;
351 }
352 prev = NULL;
353 for (next = qp->q.fragments; next != NULL; next = next->next) {
354 if (next->ip_defrag_offset >= offset)
355 break; /* bingo! */
356 prev = next;
357 }
358 416
359found: 417 /* RFC5722, Section 4, amended by Errata ID : 3089
360 /* We found where to put this one. Check for overlap with 418 * When reassembling an IPv6 datagram, if
361 * preceding fragment, and, if needed, align things so that 419 * one or more of its constituent fragments is determined to be an
362 * any overlaps are eliminated. 420 * overlapping fragment, the entire datagram (and any constituent
421 * fragments) MUST be silently discarded.
422 *
423 * We do the same here for IPv4 (and increment an snmp counter).
363 */ 424 */
364 if (prev) {
365 int i = (prev->ip_defrag_offset + prev->len) - offset;
366
367 if (i > 0) {
368 offset += i;
369 err = -EINVAL;
370 if (end <= offset)
371 goto err;
372 err = -ENOMEM;
373 if (!pskb_pull(skb, i))
374 goto err;
375 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
376 skb->ip_summed = CHECKSUM_NONE;
377 }
378 }
379 425
380 err = -ENOMEM; 426 /* Find out where to put this fragment. */
381 427 prev_tail = qp->q.fragments_tail;
382 while (next && next->ip_defrag_offset < end) { 428 if (!prev_tail)
383 int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */ 429 ip4_frag_create_run(&qp->q, skb); /* First fragment. */
384 430 else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
385 if (i < next->len) { 431 /* This is the common case: skb goes to the end. */
386 /* Eat head of the next overlapped fragment 432 /* Detect and discard overlaps. */
387 * and leave the loop. The next ones cannot overlap. 433 if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
388 */ 434 goto discard_qp;
389 if (!pskb_pull(next, i)) 435 if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
390 goto err; 436 ip4_frag_append_to_last_run(&qp->q, skb);
391 next->ip_defrag_offset += i; 437 else
392 qp->q.meat -= i; 438 ip4_frag_create_run(&qp->q, skb);
393 if (next->ip_summed != CHECKSUM_UNNECESSARY) 439 } else {
394 next->ip_summed = CHECKSUM_NONE; 440 /* Binary search. Note that skb can become the first fragment,
395 break; 441 * but not the last (covered above).
396 } else { 442 */
397 struct sk_buff *free_it = next; 443 rbn = &qp->q.rb_fragments.rb_node;
398 444 do {
399 /* Old fragment is completely overridden with 445 parent = *rbn;
400 * new one drop it. 446 skb1 = rb_to_skb(parent);
401 */ 447 if (end <= skb1->ip_defrag_offset)
402 next = next->next; 448 rbn = &parent->rb_left;
403 449 else if (offset >= skb1->ip_defrag_offset +
404 if (prev) 450 FRAG_CB(skb1)->frag_run_len)
405 prev->next = next; 451 rbn = &parent->rb_right;
406 else 452 else /* Found an overlap with skb1. */
407 qp->q.fragments = next; 453 goto discard_qp;
408 454 } while (*rbn);
409 qp->q.meat -= free_it->len; 455 /* Here we have parent properly set, and rbn pointing to
410 sub_frag_mem_limit(qp->q.net, free_it->truesize); 456 * one of its NULL left/right children. Insert skb.
411 kfree_skb(free_it); 457 */
412 } 458 ip4_frag_init_run(skb);
459 rb_link_node(&skb->rbnode, parent, rbn);
460 rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
413 } 461 }
414 462
415 /* Note : skb->ip_defrag_offset and skb->dev share the same location */
416 dev = skb->dev;
417 if (dev) 463 if (dev)
418 qp->iif = dev->ifindex; 464 qp->iif = dev->ifindex;
419 /* Makes sure compiler wont do silly aliasing games */
420 barrier();
421 skb->ip_defrag_offset = offset; 465 skb->ip_defrag_offset = offset;
422 466
423 /* Insert this fragment in the chain of fragments. */
424 skb->next = next;
425 if (!next)
426 qp->q.fragments_tail = skb;
427 if (prev)
428 prev->next = skb;
429 else
430 qp->q.fragments = skb;
431
432 qp->q.stamp = skb->tstamp; 467 qp->q.stamp = skb->tstamp;
433 qp->q.meat += skb->len; 468 qp->q.meat += skb->len;
434 qp->ecn |= ecn; 469 qp->ecn |= ecn;
@@ -450,7 +485,7 @@ found:
450 unsigned long orefdst = skb->_skb_refdst; 485 unsigned long orefdst = skb->_skb_refdst;
451 486
452 skb->_skb_refdst = 0UL; 487 skb->_skb_refdst = 0UL;
453 err = ip_frag_reasm(qp, prev, dev); 488 err = ip_frag_reasm(qp, skb, prev_tail, dev);
454 skb->_skb_refdst = orefdst; 489 skb->_skb_refdst = orefdst;
455 return err; 490 return err;
456 } 491 }
@@ -458,20 +493,24 @@ found:
458 skb_dst_drop(skb); 493 skb_dst_drop(skb);
459 return -EINPROGRESS; 494 return -EINPROGRESS;
460 495
496discard_qp:
497 inet_frag_kill(&qp->q);
498 err = -EINVAL;
499 __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
461err: 500err:
462 kfree_skb(skb); 501 kfree_skb(skb);
463 return err; 502 return err;
464} 503}
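For the common in-order case handled above, the new ip_frag_queue() only has to look at the tail run: a fragment either extends it, opens a new run, or overlaps and kills the queue. A small userspace model of that bookkeeping (deliberately simplified: only the tail run is tracked, and the rbtree plus the out-of-order binary search are left out; every name below is invented for illustration):

#include <stdio.h>

struct run { int offset, run_len; };

/* Returns 0 for a new run, 1 for an append, -1 for an overlap, in which
 * case the kernel would discard the whole queue (discard_qp). */
static int queue_fragment(struct run *tail, int *have_tail, int offset, int len)
{
	int tail_end;

	if (!*have_tail) {                  /* first fragment: open a run */
		tail->offset = offset;
		tail->run_len = len;
		*have_tail = 1;
		return 0;
	}
	tail_end = tail->offset + tail->run_len;
	if (offset < tail_end)
		return -1;                  /* overlaps the tail run */
	if (offset == tail_end) {
		tail->run_len += len;       /* contiguous: grow the run */
		return 1;
	}
	tail->offset = offset;              /* gap: this becomes the new tail run */
	tail->run_len = len;                /* (the old run would stay in the tree) */
	return 0;
}

int main(void)
{
	struct run tail;
	int have_tail = 0;

	printf("%d\n", queue_fragment(&tail, &have_tail, 0, 1200));     /*  0 */
	printf("%d\n", queue_fragment(&tail, &have_tail, 1200, 1200));  /*  1 */
	printf("%d\n", queue_fragment(&tail, &have_tail, 2000, 400));   /* -1 */
	return 0;
}

The payoff of this shape is that in-order traffic touches only fragments_tail, while out-of-order fragments fall back to the O(log n) tree walk instead of the old linear list scan.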
465 504
466
467/* Build a new IP datagram from all its fragments. */ 505/* Build a new IP datagram from all its fragments. */
468 506static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
469static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, 507 struct sk_buff *prev_tail, struct net_device *dev)
470 struct net_device *dev)
471{ 508{
472 struct net *net = container_of(qp->q.net, struct net, ipv4.frags); 509 struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
473 struct iphdr *iph; 510 struct iphdr *iph;
474 struct sk_buff *fp, *head = qp->q.fragments; 511 struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
512 struct sk_buff **nextp; /* To build frag_list. */
513 struct rb_node *rbn;
475 int len; 514 int len;
476 int ihlen; 515 int ihlen;
477 int err; 516 int err;
@@ -485,25 +524,26 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
485 goto out_fail; 524 goto out_fail;
486 } 525 }
487 /* Make the one we just received the head. */ 526 /* Make the one we just received the head. */
488 if (prev) { 527 if (head != skb) {
489 head = prev->next; 528 fp = skb_clone(skb, GFP_ATOMIC);
490 fp = skb_clone(head, GFP_ATOMIC);
491 if (!fp) 529 if (!fp)
492 goto out_nomem; 530 goto out_nomem;
493 531 FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
494 fp->next = head->next; 532 if (RB_EMPTY_NODE(&skb->rbnode))
495 if (!fp->next) 533 FRAG_CB(prev_tail)->next_frag = fp;
534 else
535 rb_replace_node(&skb->rbnode, &fp->rbnode,
536 &qp->q.rb_fragments);
537 if (qp->q.fragments_tail == skb)
496 qp->q.fragments_tail = fp; 538 qp->q.fragments_tail = fp;
497 prev->next = fp; 539 skb_morph(skb, head);
498 540 FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
499 skb_morph(head, qp->q.fragments); 541 rb_replace_node(&head->rbnode, &skb->rbnode,
500 head->next = qp->q.fragments->next; 542 &qp->q.rb_fragments);
501 543 consume_skb(head);
502 consume_skb(qp->q.fragments); 544 head = skb;
503 qp->q.fragments = head;
504 } 545 }
505 546
506 WARN_ON(!head);
507 WARN_ON(head->ip_defrag_offset != 0); 547 WARN_ON(head->ip_defrag_offset != 0);
508 548
509 /* Allocate a new buffer for the datagram. */ 549 /* Allocate a new buffer for the datagram. */
@@ -528,35 +568,60 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
528 clone = alloc_skb(0, GFP_ATOMIC); 568 clone = alloc_skb(0, GFP_ATOMIC);
529 if (!clone) 569 if (!clone)
530 goto out_nomem; 570 goto out_nomem;
531 clone->next = head->next;
532 head->next = clone;
533 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; 571 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
534 skb_frag_list_init(head); 572 skb_frag_list_init(head);
535 for (i = 0; i < skb_shinfo(head)->nr_frags; i++) 573 for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
536 plen += skb_frag_size(&skb_shinfo(head)->frags[i]); 574 plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
537 clone->len = clone->data_len = head->data_len - plen; 575 clone->len = clone->data_len = head->data_len - plen;
538 head->data_len -= clone->len; 576 head->truesize += clone->truesize;
539 head->len -= clone->len;
540 clone->csum = 0; 577 clone->csum = 0;
541 clone->ip_summed = head->ip_summed; 578 clone->ip_summed = head->ip_summed;
542 add_frag_mem_limit(qp->q.net, clone->truesize); 579 add_frag_mem_limit(qp->q.net, clone->truesize);
580 skb_shinfo(head)->frag_list = clone;
581 nextp = &clone->next;
582 } else {
583 nextp = &skb_shinfo(head)->frag_list;
543 } 584 }
544 585
545 skb_shinfo(head)->frag_list = head->next;
546 skb_push(head, head->data - skb_network_header(head)); 586 skb_push(head, head->data - skb_network_header(head));
547 587
548 for (fp=head->next; fp; fp = fp->next) { 588 /* Traverse the tree in order, to build frag_list. */
549 head->data_len += fp->len; 589 fp = FRAG_CB(head)->next_frag;
550 head->len += fp->len; 590 rbn = rb_next(&head->rbnode);
551 if (head->ip_summed != fp->ip_summed) 591 rb_erase(&head->rbnode, &qp->q.rb_fragments);
552 head->ip_summed = CHECKSUM_NONE; 592 while (rbn || fp) {
553 else if (head->ip_summed == CHECKSUM_COMPLETE) 593 /* fp points to the next sk_buff in the current run;
554 head->csum = csum_add(head->csum, fp->csum); 594 * rbn points to the next run.
555 head->truesize += fp->truesize; 595 */
596 /* Go through the current run. */
597 while (fp) {
598 *nextp = fp;
599 nextp = &fp->next;
600 fp->prev = NULL;
601 memset(&fp->rbnode, 0, sizeof(fp->rbnode));
602 head->data_len += fp->len;
603 head->len += fp->len;
604 if (head->ip_summed != fp->ip_summed)
605 head->ip_summed = CHECKSUM_NONE;
606 else if (head->ip_summed == CHECKSUM_COMPLETE)
607 head->csum = csum_add(head->csum, fp->csum);
608 head->truesize += fp->truesize;
609 fp = FRAG_CB(fp)->next_frag;
610 }
611 /* Move to the next run. */
612 if (rbn) {
613 struct rb_node *rbnext = rb_next(rbn);
614
615 fp = rb_to_skb(rbn);
616 rb_erase(rbn, &qp->q.rb_fragments);
617 rbn = rbnext;
618 }
556 } 619 }
557 sub_frag_mem_limit(qp->q.net, head->truesize); 620 sub_frag_mem_limit(qp->q.net, head->truesize);
558 621
622 *nextp = NULL;
559 head->next = NULL; 623 head->next = NULL;
624 head->prev = NULL;
560 head->dev = dev; 625 head->dev = dev;
561 head->tstamp = qp->q.stamp; 626 head->tstamp = qp->q.stamp;
562 IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); 627 IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -584,7 +649,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
584 649
585 __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); 650 __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
586 qp->q.fragments = NULL; 651 qp->q.fragments = NULL;
652 qp->q.rb_fragments = RB_ROOT;
587 qp->q.fragments_tail = NULL; 653 qp->q.fragments_tail = NULL;
654 qp->q.last_run_head = NULL;
588 return 0; 655 return 0;
589 656
590out_nomem: 657out_nomem:
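The reassembly loop above rebuilds the flat frag_list by visiting runs in offset order and, inside each run, following next_frag, appending every buffer through the nextp cursor. A compact userspace model of that two-level walk (an array of run heads stands in for the rbtree; the struct and function names are invented for illustration):

#include <stdio.h>

struct frag {
	int offset, len;
	struct frag *next_frag;   /* next buffer inside the same run */
	struct frag *next;        /* final frag_list linkage         */
};

static struct frag *flatten(struct frag **runs, int nr_runs)
{
	struct frag *head = NULL, **nextp = &head;
	int i;

	for (i = 0; i < nr_runs; i++) {
		struct frag *fp = runs[i];

		while (fp) {                /* walk the current run */
			*nextp = fp;
			nextp = &fp->next;
			fp = fp->next_frag;
		}
	}
	*nextp = NULL;
	return head;
}

int main(void)
{
	struct frag a = { 0, 1200, NULL, NULL };
	struct frag b = { 1200, 1200, NULL, NULL };
	struct frag c = { 2400, 800, NULL, NULL };
	struct frag *runs[2];
	struct frag *fp;

	a.next_frag = &b;           /* run 0: two contiguous buffers */
	runs[0] = &a;
	runs[1] = &c;               /* run 1: single buffer          */

	for (fp = flatten(runs, 2); fp; fp = fp->next)
		printf("offset=%d len=%d\n", fp->offset, fp->len);
	return 0;
}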
@@ -666,6 +733,28 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
666} 733}
667EXPORT_SYMBOL(ip_check_defrag); 734EXPORT_SYMBOL(ip_check_defrag);
668 735
736unsigned int inet_frag_rbtree_purge(struct rb_root *root)
737{
738 struct rb_node *p = rb_first(root);
739 unsigned int sum = 0;
740
741 while (p) {
742 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
743
744 p = rb_next(p);
745 rb_erase(&skb->rbnode, root);
746 while (skb) {
747 struct sk_buff *next = FRAG_CB(skb)->next_frag;
748
749 sum += skb->truesize;
750 kfree_skb(skb);
751 skb = next;
752 }
753 }
754 return sum;
755}
756EXPORT_SYMBOL(inet_frag_rbtree_purge);
757
669#ifdef CONFIG_SYSCTL 758#ifdef CONFIG_SYSCTL
670static int dist_min; 759static int dist_min;
671 760
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2d8efeecf619..51a5d06085ac 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -587,6 +587,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
587 goto err_free_skb; 587 goto err_free_skb;
588 588
589 key = &tun_info->key; 589 key = &tun_info->key;
590 if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
591 goto err_free_rt;
590 md = ip_tunnel_info_opts(tun_info); 592 md = ip_tunnel_info_opts(tun_info);
591 if (!md) 593 if (!md)
592 goto err_free_rt; 594 goto err_free_rt;
@@ -983,7 +985,6 @@ static void ipgre_tunnel_setup(struct net_device *dev)
983static void __gre_tunnel_init(struct net_device *dev) 985static void __gre_tunnel_init(struct net_device *dev)
984{ 986{
985 struct ip_tunnel *tunnel; 987 struct ip_tunnel *tunnel;
986 int t_hlen;
987 988
988 tunnel = netdev_priv(dev); 989 tunnel = netdev_priv(dev);
989 tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); 990 tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
@@ -991,8 +992,6 @@ static void __gre_tunnel_init(struct net_device *dev)
991 992
992 tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; 993 tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
993 994
994 t_hlen = tunnel->hlen + sizeof(struct iphdr);
995
996 dev->features |= GRE_FEATURES; 995 dev->features |= GRE_FEATURES;
997 dev->hw_features |= GRE_FEATURES; 996 dev->hw_features |= GRE_FEATURES;
998 997
@@ -1302,13 +1301,11 @@ static const struct net_device_ops gre_tap_netdev_ops = {
1302static int erspan_tunnel_init(struct net_device *dev) 1301static int erspan_tunnel_init(struct net_device *dev)
1303{ 1302{
1304 struct ip_tunnel *tunnel = netdev_priv(dev); 1303 struct ip_tunnel *tunnel = netdev_priv(dev);
1305 int t_hlen;
1306 1304
1307 tunnel->tun_hlen = 8; 1305 tunnel->tun_hlen = 8;
1308 tunnel->parms.iph.protocol = IPPROTO_GRE; 1306 tunnel->parms.iph.protocol = IPPROTO_GRE;
1309 tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + 1307 tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
1310 erspan_hdr_len(tunnel->erspan_ver); 1308 erspan_hdr_len(tunnel->erspan_ver);
1311 t_hlen = tunnel->hlen + sizeof(struct iphdr);
1312 1309
1313 dev->features |= GRE_FEATURES; 1310 dev->features |= GRE_FEATURES;
1314 dev->hw_features |= GRE_FEATURES; 1311 dev->hw_features |= GRE_FEATURES;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7582713dd18f..3196cf58f418 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -307,7 +307,8 @@ drop:
307 return true; 307 return true;
308} 308}
309 309
310static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 310static int ip_rcv_finish_core(struct net *net, struct sock *sk,
311 struct sk_buff *skb)
311{ 312{
312 const struct iphdr *iph = ip_hdr(skb); 313 const struct iphdr *iph = ip_hdr(skb);
313 int (*edemux)(struct sk_buff *skb); 314 int (*edemux)(struct sk_buff *skb);
@@ -315,13 +316,6 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
315 struct rtable *rt; 316 struct rtable *rt;
316 int err; 317 int err;
317 318
318 /* if ingress device is enslaved to an L3 master device pass the
319 * skb to its handler for processing
320 */
321 skb = l3mdev_ip_rcv(skb);
322 if (!skb)
323 return NET_RX_SUCCESS;
324
325 if (net->ipv4.sysctl_ip_early_demux && 319 if (net->ipv4.sysctl_ip_early_demux &&
326 !skb_dst(skb) && 320 !skb_dst(skb) &&
327 !skb->sk && 321 !skb->sk &&
@@ -393,7 +387,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
393 goto drop; 387 goto drop;
394 } 388 }
395 389
396 return dst_input(skb); 390 return NET_RX_SUCCESS;
397 391
398drop: 392drop:
399 kfree_skb(skb); 393 kfree_skb(skb);
@@ -405,13 +399,29 @@ drop_error:
405 goto drop; 399 goto drop;
406} 400}
407 401
402static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
403{
404 int ret;
405
406 /* if ingress device is enslaved to an L3 master device pass the
407 * skb to its handler for processing
408 */
409 skb = l3mdev_ip_rcv(skb);
410 if (!skb)
411 return NET_RX_SUCCESS;
412
413 ret = ip_rcv_finish_core(net, sk, skb);
414 if (ret != NET_RX_DROP)
415 ret = dst_input(skb);
416 return ret;
417}
418
408/* 419/*
409 * Main IP Receive routine. 420 * Main IP Receive routine.
410 */ 421 */
411int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) 422static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
412{ 423{
413 const struct iphdr *iph; 424 const struct iphdr *iph;
414 struct net *net;
415 u32 len; 425 u32 len;
416 426
417 /* When the interface is in promisc. mode, drop all the crap 427 /* When the interface is in promisc. mode, drop all the crap
@@ -421,7 +431,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
421 goto drop; 431 goto drop;
422 432
423 433
424 net = dev_net(dev);
425 __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); 434 __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
426 435
427 skb = skb_share_check(skb, GFP_ATOMIC); 436 skb = skb_share_check(skb, GFP_ATOMIC);
@@ -489,9 +498,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
489 /* Must drop socket now because of tproxy. */ 498 /* Must drop socket now because of tproxy. */
490 skb_orphan(skb); 499 skb_orphan(skb);
491 500
492 return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, 501 return skb;
493 net, NULL, skb, dev, NULL,
494 ip_rcv_finish);
495 502
496csum_error: 503csum_error:
497 __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); 504 __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
@@ -500,5 +507,113 @@ inhdr_error:
500drop: 507drop:
501 kfree_skb(skb); 508 kfree_skb(skb);
502out: 509out:
503 return NET_RX_DROP; 510 return NULL;
511}
512
513/*
514 * IP receive entry point
515 */
516int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
517 struct net_device *orig_dev)
518{
519 struct net *net = dev_net(dev);
520
521 skb = ip_rcv_core(skb, net);
522 if (skb == NULL)
523 return NET_RX_DROP;
524 return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
525 net, NULL, skb, dev, NULL,
526 ip_rcv_finish);
527}
528
529static void ip_sublist_rcv_finish(struct list_head *head)
530{
531 struct sk_buff *skb, *next;
532
533 list_for_each_entry_safe(skb, next, head, list) {
534 list_del(&skb->list);
 535		/* Handle ip{6}_forward case, as sch_direct_xmit has
536 * another kind of SKB-list usage (see validate_xmit_skb_list)
537 */
538 skb->next = NULL;
539 dst_input(skb);
540 }
541}
542
543static void ip_list_rcv_finish(struct net *net, struct sock *sk,
544 struct list_head *head)
545{
546 struct dst_entry *curr_dst = NULL;
547 struct sk_buff *skb, *next;
548 struct list_head sublist;
549
550 INIT_LIST_HEAD(&sublist);
551 list_for_each_entry_safe(skb, next, head, list) {
552 struct dst_entry *dst;
553
554 list_del(&skb->list);
555 /* if ingress device is enslaved to an L3 master device pass the
556 * skb to its handler for processing
557 */
558 skb = l3mdev_ip_rcv(skb);
559 if (!skb)
560 continue;
561 if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP)
562 continue;
563
564 dst = skb_dst(skb);
565 if (curr_dst != dst) {
566 /* dispatch old sublist */
567 if (!list_empty(&sublist))
568 ip_sublist_rcv_finish(&sublist);
569 /* start new sublist */
570 INIT_LIST_HEAD(&sublist);
571 curr_dst = dst;
572 }
573 list_add_tail(&skb->list, &sublist);
574 }
575 /* dispatch final sublist */
576 ip_sublist_rcv_finish(&sublist);
577}
578
579static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
580 struct net *net)
581{
582 NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
583 head, dev, NULL, ip_rcv_finish);
584 ip_list_rcv_finish(net, NULL, head);
585}
586
587/* Receive a list of IP packets */
588void ip_list_rcv(struct list_head *head, struct packet_type *pt,
589 struct net_device *orig_dev)
590{
591 struct net_device *curr_dev = NULL;
592 struct net *curr_net = NULL;
593 struct sk_buff *skb, *next;
594 struct list_head sublist;
595
596 INIT_LIST_HEAD(&sublist);
597 list_for_each_entry_safe(skb, next, head, list) {
598 struct net_device *dev = skb->dev;
599 struct net *net = dev_net(dev);
600
601 list_del(&skb->list);
602 skb = ip_rcv_core(skb, net);
603 if (skb == NULL)
604 continue;
605
606 if (curr_dev != dev || curr_net != net) {
607 /* dispatch old sublist */
608 if (!list_empty(&sublist))
609 ip_sublist_rcv(&sublist, curr_dev, curr_net);
610 /* start new sublist */
611 INIT_LIST_HEAD(&sublist);
612 curr_dev = dev;
613 curr_net = net;
614 }
615 list_add_tail(&skb->list, &sublist);
616 }
617 /* dispatch final sublist */
618 ip_sublist_rcv(&sublist, curr_dev, curr_net);
504} 619}
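The list receive path added above is built around one pattern: walk the incoming list, keep consecutive packets that resolve to the same device/namespace (and, one stage later, the same dst) in a sublist, and hand each sublist to the next stage together. A stripped-down userspace sketch of that grouping (the packet structure and keys are hypothetical):

#include <stdio.h>

struct pkt { int key; struct pkt *next; };

static void deliver_sublist(struct pkt *first, struct pkt *last, int key)
{
	int n = 0;
	struct pkt *p;

	for (p = first; ; p = p->next) {
		n++;
		if (p == last)
			break;
	}
	printf("deliver %d packet(s) for key %d\n", n, key);
}

static void list_rcv(struct pkt *head)
{
	struct pkt *start = NULL, *prev = NULL, *p;
	int cur_key = 0;

	for (p = head; p; p = p->next) {
		if (!start || p->key != cur_key) {
			if (start)
				deliver_sublist(start, prev, cur_key);
			start = p;          /* start a new sublist */
			cur_key = p->key;
		}
		prev = p;
	}
	if (start)
		deliver_sublist(start, prev, cur_key);
}

int main(void)
{
	struct pkt d = { 2, NULL }, c = { 2, &d }, b = { 1, &c }, a = { 1, &b };

	list_rcv(&a);   /* two packets for key 1, then two for key 2 */
	return 0;
}

Batching consecutive same-key packets is what lets NF_HOOK_LIST and the per-dst dispatch amortize their setup cost over each group instead of paying it per packet.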
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b3308e9d9762..9c4e72e9c60a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -423,7 +423,8 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
423} 423}
424 424
425/* Note: skb->sk can be different from sk, in case of tunnels */ 425/* Note: skb->sk can be different from sk, in case of tunnels */
426int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) 426int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
427 __u8 tos)
427{ 428{
428 struct inet_sock *inet = inet_sk(sk); 429 struct inet_sock *inet = inet_sk(sk);
429 struct net *net = sock_net(sk); 430 struct net *net = sock_net(sk);
@@ -462,7 +463,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
462 inet->inet_dport, 463 inet->inet_dport,
463 inet->inet_sport, 464 inet->inet_sport,
464 sk->sk_protocol, 465 sk->sk_protocol,
465 RT_CONN_FLAGS(sk), 466 RT_CONN_FLAGS_TOS(sk, tos),
466 sk->sk_bound_dev_if); 467 sk->sk_bound_dev_if);
467 if (IS_ERR(rt)) 468 if (IS_ERR(rt))
468 goto no_route; 469 goto no_route;
@@ -478,7 +479,7 @@ packet_routed:
478 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); 479 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
479 skb_reset_network_header(skb); 480 skb_reset_network_header(skb);
480 iph = ip_hdr(skb); 481 iph = ip_hdr(skb);
481 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 482 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
482 if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) 483 if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
483 iph->frag_off = htons(IP_DF); 484 iph->frag_off = htons(IP_DF);
484 else 485 else
@@ -511,7 +512,7 @@ no_route:
511 kfree_skb(skb); 512 kfree_skb(skb);
512 return -EHOSTUNREACH; 513 return -EHOSTUNREACH;
513} 514}
514EXPORT_SYMBOL(ip_queue_xmit); 515EXPORT_SYMBOL(__ip_queue_xmit);
515 516
516static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) 517static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
517{ 518{
@@ -523,6 +524,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
523 to->dev = from->dev; 524 to->dev = from->dev;
524 to->mark = from->mark; 525 to->mark = from->mark;
525 526
527 skb_copy_hash(to, from);
528
526 /* Copy the flags to each fragment. */ 529 /* Copy the flags to each fragment. */
527 IPCB(to)->flags = IPCB(from)->flags; 530 IPCB(to)->flags = IPCB(from)->flags;
528 531
@@ -1145,14 +1148,15 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1145 cork->fragsize = ip_sk_use_pmtu(sk) ? 1148 cork->fragsize = ip_sk_use_pmtu(sk) ?
1146 dst_mtu(&rt->dst) : rt->dst.dev->mtu; 1149 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
1147 1150
1148 cork->gso_size = sk->sk_type == SOCK_DGRAM && 1151 cork->gso_size = ipc->gso_size;
1149 sk->sk_protocol == IPPROTO_UDP ? ipc->gso_size : 0;
1150 cork->dst = &rt->dst; 1152 cork->dst = &rt->dst;
1151 cork->length = 0; 1153 cork->length = 0;
1152 cork->ttl = ipc->ttl; 1154 cork->ttl = ipc->ttl;
1153 cork->tos = ipc->tos; 1155 cork->tos = ipc->tos;
1154 cork->priority = ipc->priority; 1156 cork->priority = ipc->priority;
1155 cork->tx_flags = ipc->tx_flags; 1157 cork->transmit_time = ipc->sockc.transmit_time;
1158 cork->tx_flags = 0;
1159 sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);
1156 1160
1157 return 0; 1161 return 0;
1158} 1162}
@@ -1413,6 +1417,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
1413 1417
1414 skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; 1418 skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
1415 skb->mark = sk->sk_mark; 1419 skb->mark = sk->sk_mark;
1420 skb->tstamp = cork->transmit_time;
1416 /* 1421 /*
1417 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec 1422 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1418 * on dst refcount 1423 * on dst refcount
@@ -1545,11 +1550,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1545 if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) 1550 if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
1546 return; 1551 return;
1547 1552
1553 ipcm_init(&ipc);
1548 ipc.addr = daddr; 1554 ipc.addr = daddr;
1549 ipc.opt = NULL;
1550 ipc.tx_flags = 0;
1551 ipc.ttl = 0;
1552 ipc.tos = -1;
1553 1555
1554 if (replyopts.opt.opt.optlen) { 1556 if (replyopts.opt.opt.optlen) {
1555 ipc.opt = &replyopts.opt; 1557 ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index fc32fdbeefa6..c0fe5ad996f2 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -150,15 +150,18 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
150{ 150{
151 struct sockaddr_in sin; 151 struct sockaddr_in sin;
152 const struct iphdr *iph = ip_hdr(skb); 152 const struct iphdr *iph = ip_hdr(skb);
153 __be16 *ports = (__be16 *)skb_transport_header(skb); 153 __be16 *ports;
154 int end;
154 155
155 if (skb_transport_offset(skb) + 4 > (int)skb->len) 156 end = skb_transport_offset(skb) + 4;
157 if (end > 0 && !pskb_may_pull(skb, end))
156 return; 158 return;
157 159
158 /* All current transport protocols have the port numbers in the 160 /* All current transport protocols have the port numbers in the
159 * first four bytes of the transport header and this function is 161 * first four bytes of the transport header and this function is
160 * written with this assumption in mind. 162 * written with this assumption in mind.
161 */ 163 */
164 ports = (__be16 *)skb_transport_header(skb);
162 165
163 sin.sin_family = AF_INET; 166 sin.sin_family = AF_INET;
164 sin.sin_addr.s_addr = iph->daddr; 167 sin.sin_addr.s_addr = iph->daddr;
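The fix above swaps a bare length comparison for pskb_may_pull(), so the four port bytes are guaranteed to be readable before the transport header pointer is dereferenced. In userspace terms the pattern looks roughly like this (the buffer layout and helper below are invented for illustration):

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>

struct buf { const uint8_t *data; size_t len; size_t transport_off; };

static int read_ports(const struct buf *b, uint16_t *sport, uint16_t *dport)
{
	size_t end = b->transport_off + 4;
	uint16_t ports[2];

	if (end > b->len)           /* ports not fully inside the packet */
		return -1;
	memcpy(ports, b->data + b->transport_off, sizeof(ports));
	*sport = ntohs(ports[0]);
	*dport = ntohs(ports[1]);
	return 0;
}

int main(void)
{
	uint8_t pkt[24] = { 0 };
	struct buf b = { pkt, sizeof(pkt), 20 };
	uint16_t s, d;

	pkt[20] = 0x00; pkt[21] = 0x35;     /* source port 53 */
	pkt[22] = 0xc0; pkt[23] = 0x00;     /* dest port 49152 */
	if (read_ports(&b, &s, &d) == 0)
		printf("sport=%u dport=%u\n", s, d);

	b.len = 22;                         /* truncated packet: refuse to read */
	printf("%d\n", read_ports(&b, &s, &d));
	return 0;
}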
@@ -984,7 +987,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
984 mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; 987 mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
985 mreq.imr_address.s_addr = mreqs.imr_interface; 988 mreq.imr_address.s_addr = mreqs.imr_interface;
986 mreq.imr_ifindex = 0; 989 mreq.imr_ifindex = 0;
987 err = ip_mc_join_group(sk, &mreq); 990 err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
988 if (err && err != -EADDRINUSE) 991 if (err && err != -EADDRINUSE)
989 break; 992 break;
990 omode = MCAST_INCLUDE; 993 omode = MCAST_INCLUDE;
@@ -1061,7 +1064,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
1061 mreq.imr_multiaddr = psin->sin_addr; 1064 mreq.imr_multiaddr = psin->sin_addr;
1062 mreq.imr_address.s_addr = 0; 1065 mreq.imr_address.s_addr = 0;
1063 mreq.imr_ifindex = greqs.gsr_interface; 1066 mreq.imr_ifindex = greqs.gsr_interface;
1064 err = ip_mc_join_group(sk, &mreq); 1067 err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
1065 if (err && err != -EADDRINUSE) 1068 if (err && err != -EADDRINUSE)
1066 break; 1069 break;
1067 greqs.gsr_interface = mreq.imr_ifindex; 1070 greqs.gsr_interface = mreq.imr_ifindex;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3f091ccad9af..f38cb21d773d 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -438,7 +438,8 @@ static int __net_init vti_init_net(struct net *net)
438 if (err) 438 if (err)
439 return err; 439 return err;
440 itn = net_generic(net, vti_net_id); 440 itn = net_generic(net, vti_net_id);
441 vti_fb_tunnel_init(itn->fb_tunnel_dev); 441 if (itn->fb_tunnel_dev)
442 vti_fb_tunnel_init(itn->fb_tunnel_dev);
442 return 0; 443 return 0;
443} 444}
444 445
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9f79b9803a16..5660adcf7a04 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -60,6 +60,7 @@
60#include <linux/netfilter_ipv4.h> 60#include <linux/netfilter_ipv4.h>
61#include <linux/compat.h> 61#include <linux/compat.h>
62#include <linux/export.h> 62#include <linux/export.h>
63#include <linux/rhashtable.h>
63#include <net/ip_tunnels.h> 64#include <net/ip_tunnels.h>
64#include <net/checksum.h> 65#include <net/checksum.h>
65#include <net/netlink.h> 66#include <net/netlink.h>
@@ -1051,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
1051 struct sk_buff *skb; 1052 struct sk_buff *skb;
1052 int ret; 1053 int ret;
1053 1054
1054 if (assert == IGMPMSG_WHOLEPKT) 1055 if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
1055 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); 1056 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
1056 else 1057 else
1057 skb = alloc_skb(128, GFP_ATOMIC); 1058 skb = alloc_skb(128, GFP_ATOMIC);
@@ -1059,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
1059 if (!skb) 1060 if (!skb)
1060 return -ENOBUFS; 1061 return -ENOBUFS;
1061 1062
1062 if (assert == IGMPMSG_WHOLEPKT) { 1063 if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) {
1063 /* Ugly, but we have no choice with this interface. 1064 /* Ugly, but we have no choice with this interface.
1064 * Duplicate old header, fix ihl, length etc. 1065 * Duplicate old header, fix ihl, length etc.
1065 * And all this only to mangle msg->im_msgtype and 1066 * And all this only to mangle msg->im_msgtype and
@@ -1070,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt,
1070 skb_reset_transport_header(skb); 1071 skb_reset_transport_header(skb);
1071 msg = (struct igmpmsg *)skb_network_header(skb); 1072 msg = (struct igmpmsg *)skb_network_header(skb);
1072 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); 1073 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
1073 msg->im_msgtype = IGMPMSG_WHOLEPKT; 1074 msg->im_msgtype = assert;
1074 msg->im_mbz = 0; 1075 msg->im_mbz = 0;
1075 msg->im_vif = mrt->mroute_reg_vif_num; 1076 if (assert == IGMPMSG_WRVIFWHOLE)
1077 msg->im_vif = vifi;
1078 else
1079 msg->im_vif = mrt->mroute_reg_vif_num;
1076 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; 1080 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
1077 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + 1081 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
1078 sizeof(struct iphdr)); 1082 sizeof(struct iphdr));
@@ -1371,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
1371 struct mr_table *mrt; 1375 struct mr_table *mrt;
1372 struct vifctl vif; 1376 struct vifctl vif;
1373 struct mfcctl mfc; 1377 struct mfcctl mfc;
1378 bool do_wrvifwhole;
1374 u32 uval; 1379 u32 uval;
1375 1380
1376 /* There's one exception to the lock - MRT_DONE which needs to unlock */ 1381 /* There's one exception to the lock - MRT_DONE which needs to unlock */
@@ -1501,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
1501 break; 1506 break;
1502 } 1507 }
1503 1508
1509 do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE);
1504 val = !!val; 1510 val = !!val;
1505 if (val != mrt->mroute_do_pim) { 1511 if (val != mrt->mroute_do_pim) {
1506 mrt->mroute_do_pim = val; 1512 mrt->mroute_do_pim = val;
1507 mrt->mroute_do_assert = val; 1513 mrt->mroute_do_assert = val;
1514 mrt->mroute_do_wrvifwhole = do_wrvifwhole;
1508 } 1515 }
1509 break; 1516 break;
1510 case MRT_TABLE: 1517 case MRT_TABLE:
@@ -1982,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
1982 MFC_ASSERT_THRESH)) { 1989 MFC_ASSERT_THRESH)) {
1983 c->_c.mfc_un.res.last_assert = jiffies; 1990 c->_c.mfc_un.res.last_assert = jiffies;
1984 ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); 1991 ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1992 if (mrt->mroute_do_wrvifwhole)
1993 ipmr_cache_report(mrt, skb, true_vifi,
1994 IGMPMSG_WRVIFWHOLE);
1985 } 1995 }
1986 goto dont_forward; 1996 goto dont_forward;
1987 } 1997 }
@@ -2658,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
2658 mrt->mroute_reg_vif_num) || 2668 mrt->mroute_reg_vif_num) ||
2659 nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, 2669 nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
2660 mrt->mroute_do_assert) || 2670 mrt->mroute_do_assert) ||
2661 nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim)) 2671 nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) ||
2672 nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
2673 mrt->mroute_do_wrvifwhole))
2662 return false; 2674 return false;
2663 2675
2664 return true; 2676 return true;
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index cafb0506c8c9..1ad9aa62a97b 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -2,6 +2,7 @@
2 * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation 2 * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation
3 */ 3 */
4 4
5#include <linux/rhashtable.h>
5#include <linux/mroute_base.h> 6#include <linux/mroute_base.h>
6 7
7/* Sets everything common except 'dev', since that is done under locking */ 8/* Sets everything common except 'dev', since that is done under locking */
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index e6774ccb7731..8d2e5dc9a827 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -98,59 +98,6 @@ int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
98} 98}
99EXPORT_SYMBOL_GPL(nf_ip_reroute); 99EXPORT_SYMBOL_GPL(nf_ip_reroute);
100 100
101__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
102 unsigned int dataoff, u_int8_t protocol)
103{
104 const struct iphdr *iph = ip_hdr(skb);
105 __sum16 csum = 0;
106
107 switch (skb->ip_summed) {
108 case CHECKSUM_COMPLETE:
109 if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
110 break;
111 if ((protocol == 0 && !csum_fold(skb->csum)) ||
112 !csum_tcpudp_magic(iph->saddr, iph->daddr,
113 skb->len - dataoff, protocol,
114 skb->csum)) {
115 skb->ip_summed = CHECKSUM_UNNECESSARY;
116 break;
117 }
118 /* fall through */
119 case CHECKSUM_NONE:
120 if (protocol == 0)
121 skb->csum = 0;
122 else
123 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
124 skb->len - dataoff,
125 protocol, 0);
126 csum = __skb_checksum_complete(skb);
127 }
128 return csum;
129}
130EXPORT_SYMBOL(nf_ip_checksum);
131
132__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
133 unsigned int dataoff, unsigned int len,
134 u_int8_t protocol)
135{
136 const struct iphdr *iph = ip_hdr(skb);
137 __sum16 csum = 0;
138
139 switch (skb->ip_summed) {
140 case CHECKSUM_COMPLETE:
141 if (len == skb->len - dataoff)
142 return nf_ip_checksum(skb, hook, dataoff, protocol);
143 /* fall through */
144 case CHECKSUM_NONE:
145 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
146 skb->len - dataoff, 0);
147 skb->ip_summed = CHECKSUM_NONE;
148 return __skb_checksum_complete_head(skb, dataoff + len);
149 }
150 return csum;
151}
152EXPORT_SYMBOL_GPL(nf_ip_checksum_partial);
153
154int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, 101int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
155 bool strict __always_unused) 102 bool strict __always_unused)
156{ 103{
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index bbfc356cb1b5..d9504adc47b3 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -9,22 +9,6 @@ config NF_DEFRAG_IPV4
9 tristate 9 tristate
10 default n 10 default n
11 11
12config NF_CONNTRACK_IPV4
13 tristate "IPv4 connection tracking support (required for NAT)"
14 depends on NF_CONNTRACK
15 default m if NETFILTER_ADVANCED=n
16 select NF_DEFRAG_IPV4
17 ---help---
18 Connection tracking keeps a record of what packets have passed
19 through your machine, in order to figure out how they are related
20 into connections.
21
22 This is IPv4 support on Layer 3 independent connection tracking.
23 Layer 3 independent connection tracking is experimental scheme
24 which generalize ip_conntrack to support other layer 3 protocols.
25
26 To compile it as a module, choose M here. If unsure, say N.
27
28config NF_SOCKET_IPV4 12config NF_SOCKET_IPV4
29 tristate "IPv4 socket lookup support" 13 tristate "IPv4 socket lookup support"
30 help 14 help
@@ -112,7 +96,7 @@ config NF_REJECT_IPV4
112 96
113config NF_NAT_IPV4 97config NF_NAT_IPV4
114 tristate "IPv4 NAT" 98 tristate "IPv4 NAT"
115 depends on NF_CONNTRACK_IPV4 99 depends on NF_CONNTRACK
116 default m if NETFILTER_ADVANCED=n 100 default m if NETFILTER_ADVANCED=n
117 select NF_NAT 101 select NF_NAT
118 help 102 help
@@ -279,7 +263,7 @@ config IP_NF_TARGET_SYNPROXY
279# NAT + specific targets: nf_conntrack 263# NAT + specific targets: nf_conntrack
280config IP_NF_NAT 264config IP_NF_NAT
281 tristate "iptables NAT support" 265 tristate "iptables NAT support"
282 depends on NF_CONNTRACK_IPV4 266 depends on NF_CONNTRACK
283 default m if NETFILTER_ADVANCED=n 267 default m if NETFILTER_ADVANCED=n
284 select NF_NAT 268 select NF_NAT
285 select NF_NAT_IPV4 269 select NF_NAT_IPV4
@@ -340,7 +324,7 @@ config IP_NF_MANGLE
340config IP_NF_TARGET_CLUSTERIP 324config IP_NF_TARGET_CLUSTERIP
341 tristate "CLUSTERIP target support" 325 tristate "CLUSTERIP target support"
342 depends on IP_NF_MANGLE 326 depends on IP_NF_MANGLE
343 depends on NF_CONNTRACK_IPV4 327 depends on NF_CONNTRACK
344 depends on NETFILTER_ADVANCED 328 depends on NETFILTER_ADVANCED
345 select NF_CONNTRACK_MARK 329 select NF_CONNTRACK_MARK
346 select NETFILTER_FAMILY_ARP 330 select NETFILTER_FAMILY_ARP
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 8394c17c269f..367993adf4d3 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,12 +3,6 @@
3# Makefile for the netfilter modules on top of IPv4. 3# Makefile for the netfilter modules on top of IPv4.
4# 4#
5 5
6# objects for l3 independent conntrack
7nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
8
9# connection tracking
10obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
11
12nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o 6nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
13nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o 7nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
14obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o 8obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ca0dad90803a..e77872c93c20 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1898,6 +1898,7 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = {
1898 .checkentry = icmp_checkentry, 1898 .checkentry = icmp_checkentry,
1899 .proto = IPPROTO_ICMP, 1899 .proto = IPPROTO_ICMP,
1900 .family = NFPROTO_IPV4, 1900 .family = NFPROTO_IPV4,
1901 .me = THIS_MODULE,
1901 }, 1902 },
1902}; 1903};
1903 1904
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
deleted file mode 100644
index 9db988f9a4d7..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ /dev/null
@@ -1,472 +0,0 @@
1
2/* (C) 1999-2001 Paul `Rusty' Russell
3 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
4 * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/types.h>
12#include <linux/ip.h>
13#include <linux/netfilter.h>
14#include <linux/module.h>
15#include <linux/skbuff.h>
16#include <linux/icmp.h>
17#include <linux/sysctl.h>
18#include <net/route.h>
19#include <net/ip.h>
20
21#include <linux/netfilter_ipv4.h>
22#include <net/netfilter/nf_conntrack.h>
23#include <net/netfilter/nf_conntrack_helper.h>
24#include <net/netfilter/nf_conntrack_l4proto.h>
25#include <net/netfilter/nf_conntrack_l3proto.h>
26#include <net/netfilter/nf_conntrack_zones.h>
27#include <net/netfilter/nf_conntrack_core.h>
28#include <net/netfilter/nf_conntrack_seqadj.h>
29#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
30#include <net/netfilter/nf_nat_helper.h>
31#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
32#include <net/netfilter/nf_log.h>
33
34static int conntrack4_net_id __read_mostly;
35static DEFINE_MUTEX(register_ipv4_hooks);
36
37struct conntrack4_net {
38 unsigned int users;
39};
40
41static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
42 struct nf_conntrack_tuple *tuple)
43{
44 const __be32 *ap;
45 __be32 _addrs[2];
46 ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
47 sizeof(u_int32_t) * 2, _addrs);
48 if (ap == NULL)
49 return false;
50
51 tuple->src.u3.ip = ap[0];
52 tuple->dst.u3.ip = ap[1];
53
54 return true;
55}
56
57static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
58 const struct nf_conntrack_tuple *orig)
59{
60 tuple->src.u3.ip = orig->dst.u3.ip;
61 tuple->dst.u3.ip = orig->src.u3.ip;
62
63 return true;
64}
65
66static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
67 unsigned int *dataoff, u_int8_t *protonum)
68{
69 const struct iphdr *iph;
70 struct iphdr _iph;
71
72 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
73 if (iph == NULL)
74 return -NF_ACCEPT;
75
76 /* Conntrack defragments packets, we might still see fragments
77 * inside ICMP packets though. */
78 if (iph->frag_off & htons(IP_OFFSET))
79 return -NF_ACCEPT;
80
81 *dataoff = nhoff + (iph->ihl << 2);
82 *protonum = iph->protocol;
83
84 /* Check bogus IP headers */
85 if (*dataoff > skb->len) {
86 pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
87 "nhoff %u, ihl %u, skblen %u\n",
88 nhoff, iph->ihl << 2, skb->len);
89 return -NF_ACCEPT;
90 }
91
92 return NF_ACCEPT;
93}
94
95static unsigned int ipv4_helper(void *priv,
96 struct sk_buff *skb,
97 const struct nf_hook_state *state)
98{
99 struct nf_conn *ct;
100 enum ip_conntrack_info ctinfo;
101 const struct nf_conn_help *help;
102 const struct nf_conntrack_helper *helper;
103
104 /* This is where we call the helper: as the packet goes out. */
105 ct = nf_ct_get(skb, &ctinfo);
106 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
107 return NF_ACCEPT;
108
109 help = nfct_help(ct);
110 if (!help)
111 return NF_ACCEPT;
112
113 /* rcu_read_lock()ed by nf_hook_thresh */
114 helper = rcu_dereference(help->helper);
115 if (!helper)
116 return NF_ACCEPT;
117
118 return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
119 ct, ctinfo);
120}
121
122static unsigned int ipv4_confirm(void *priv,
123 struct sk_buff *skb,
124 const struct nf_hook_state *state)
125{
126 struct nf_conn *ct;
127 enum ip_conntrack_info ctinfo;
128
129 ct = nf_ct_get(skb, &ctinfo);
130 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
131 goto out;
132
133 /* adjust seqs for loopback traffic only in outgoing direction */
134 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
135 !nf_is_loopback_packet(skb)) {
136 if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
137 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
138 return NF_DROP;
139 }
140 }
141out:
142 /* We've seen it coming out the other side: confirm it */
143 return nf_conntrack_confirm(skb);
144}
145
146static unsigned int ipv4_conntrack_in(void *priv,
147 struct sk_buff *skb,
148 const struct nf_hook_state *state)
149{
150 return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
151}
152
153static unsigned int ipv4_conntrack_local(void *priv,
154 struct sk_buff *skb,
155 const struct nf_hook_state *state)
156{
157 if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */
158 enum ip_conntrack_info ctinfo;
159 struct nf_conn *tmpl;
160
161 tmpl = nf_ct_get(skb, &ctinfo);
162 if (tmpl && nf_ct_is_template(tmpl)) {
163 /* when skipping ct, clear templates to avoid fooling
164 * later targets/matches
165 */
166 skb->_nfct = 0;
167 nf_ct_put(tmpl);
168 }
169 return NF_ACCEPT;
170 }
171
172 return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
173}
174
175/* Connection tracking may drop packets, but never alters them, so
176 make it the first hook. */
177static const struct nf_hook_ops ipv4_conntrack_ops[] = {
178 {
179 .hook = ipv4_conntrack_in,
180 .pf = NFPROTO_IPV4,
181 .hooknum = NF_INET_PRE_ROUTING,
182 .priority = NF_IP_PRI_CONNTRACK,
183 },
184 {
185 .hook = ipv4_conntrack_local,
186 .pf = NFPROTO_IPV4,
187 .hooknum = NF_INET_LOCAL_OUT,
188 .priority = NF_IP_PRI_CONNTRACK,
189 },
190 {
191 .hook = ipv4_helper,
192 .pf = NFPROTO_IPV4,
193 .hooknum = NF_INET_POST_ROUTING,
194 .priority = NF_IP_PRI_CONNTRACK_HELPER,
195 },
196 {
197 .hook = ipv4_confirm,
198 .pf = NFPROTO_IPV4,
199 .hooknum = NF_INET_POST_ROUTING,
200 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
201 },
202 {
203 .hook = ipv4_helper,
204 .pf = NFPROTO_IPV4,
205 .hooknum = NF_INET_LOCAL_IN,
206 .priority = NF_IP_PRI_CONNTRACK_HELPER,
207 },
208 {
209 .hook = ipv4_confirm,
210 .pf = NFPROTO_IPV4,
211 .hooknum = NF_INET_LOCAL_IN,
212 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
213 },
214};
215
216/* Fast function for those who don't want to parse /proc (and I don't
217 blame them). */
218/* Reversing the socket's dst/src point of view gives us the reply
219 mapping. */
220static int
221getorigdst(struct sock *sk, int optval, void __user *user, int *len)
222{
223 const struct inet_sock *inet = inet_sk(sk);
224 const struct nf_conntrack_tuple_hash *h;
225 struct nf_conntrack_tuple tuple;
226
227 memset(&tuple, 0, sizeof(tuple));
228
229 lock_sock(sk);
230 tuple.src.u3.ip = inet->inet_rcv_saddr;
231 tuple.src.u.tcp.port = inet->inet_sport;
232 tuple.dst.u3.ip = inet->inet_daddr;
233 tuple.dst.u.tcp.port = inet->inet_dport;
234 tuple.src.l3num = PF_INET;
235 tuple.dst.protonum = sk->sk_protocol;
236 release_sock(sk);
237
238 /* We only do TCP and SCTP at the moment: is there a better way? */
239 if (tuple.dst.protonum != IPPROTO_TCP &&
240 tuple.dst.protonum != IPPROTO_SCTP) {
241 pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
242 return -ENOPROTOOPT;
243 }
244
245 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
246 pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
247 *len, sizeof(struct sockaddr_in));
248 return -EINVAL;
249 }
250
251 h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
252 if (h) {
253 struct sockaddr_in sin;
254 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
255
256 sin.sin_family = AF_INET;
257 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
258 .tuple.dst.u.tcp.port;
259 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
260 .tuple.dst.u3.ip;
261 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
262
263 pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
264 &sin.sin_addr.s_addr, ntohs(sin.sin_port));
265 nf_ct_put(ct);
266 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
267 return -EFAULT;
268 else
269 return 0;
270 }
271 pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
272 &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
273 &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
274 return -ENOENT;
275}
276
277#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
278
279#include <linux/netfilter/nfnetlink.h>
280#include <linux/netfilter/nfnetlink_conntrack.h>
281
282static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
283 const struct nf_conntrack_tuple *tuple)
284{
285 if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
286 nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
287 goto nla_put_failure;
288 return 0;
289
290nla_put_failure:
291 return -1;
292}
293
294static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = {
295 [CTA_IP_V4_SRC] = { .type = NLA_U32 },
296 [CTA_IP_V4_DST] = { .type = NLA_U32 },
297};
298
299static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
300 struct nf_conntrack_tuple *t)
301{
302 if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
303 return -EINVAL;
304
305 t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
306 t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
307
308 return 0;
309}
310#endif
311
312static struct nf_sockopt_ops so_getorigdst = {
313 .pf = PF_INET,
314 .get_optmin = SO_ORIGINAL_DST,
315 .get_optmax = SO_ORIGINAL_DST+1,
316 .get = getorigdst,
317 .owner = THIS_MODULE,
318};
319
320static int ipv4_hooks_register(struct net *net)
321{
322 struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
323 int err = 0;
324
325 mutex_lock(&register_ipv4_hooks);
326
327 cnet->users++;
328 if (cnet->users > 1)
329 goto out_unlock;
330
331 err = nf_defrag_ipv4_enable(net);
332 if (err) {
333 cnet->users = 0;
334 goto out_unlock;
335 }
336
337 err = nf_register_net_hooks(net, ipv4_conntrack_ops,
338 ARRAY_SIZE(ipv4_conntrack_ops));
339
340 if (err)
341 cnet->users = 0;
342 out_unlock:
343 mutex_unlock(&register_ipv4_hooks);
344 return err;
345}
346
347static void ipv4_hooks_unregister(struct net *net)
348{
349 struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
350
351 mutex_lock(&register_ipv4_hooks);
352 if (cnet->users && (--cnet->users == 0))
353 nf_unregister_net_hooks(net, ipv4_conntrack_ops,
354 ARRAY_SIZE(ipv4_conntrack_ops));
355 mutex_unlock(&register_ipv4_hooks);
356}
357
358const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
359 .l3proto = PF_INET,
360 .pkt_to_tuple = ipv4_pkt_to_tuple,
361 .invert_tuple = ipv4_invert_tuple,
362 .get_l4proto = ipv4_get_l4proto,
363#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
364 .tuple_to_nlattr = ipv4_tuple_to_nlattr,
365 .nlattr_to_tuple = ipv4_nlattr_to_tuple,
366 .nla_policy = ipv4_nla_policy,
367 .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */
368 NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */
369#endif
370 .net_ns_get = ipv4_hooks_register,
371 .net_ns_put = ipv4_hooks_unregister,
372 .me = THIS_MODULE,
373};
374
375module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
376 &nf_conntrack_htable_size, 0600);
377
378MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
379MODULE_ALIAS("ip_conntrack");
380MODULE_LICENSE("GPL");
381
382static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = {
383 &nf_conntrack_l4proto_tcp4,
384 &nf_conntrack_l4proto_udp4,
385 &nf_conntrack_l4proto_icmp,
386#ifdef CONFIG_NF_CT_PROTO_DCCP
387 &nf_conntrack_l4proto_dccp4,
388#endif
389#ifdef CONFIG_NF_CT_PROTO_SCTP
390 &nf_conntrack_l4proto_sctp4,
391#endif
392#ifdef CONFIG_NF_CT_PROTO_UDPLITE
393 &nf_conntrack_l4proto_udplite4,
394#endif
395};
396
397static int ipv4_net_init(struct net *net)
398{
399 return nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
400 ARRAY_SIZE(builtin_l4proto4));
401}
402
403static void ipv4_net_exit(struct net *net)
404{
405 nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
406 ARRAY_SIZE(builtin_l4proto4));
407}
408
409static struct pernet_operations ipv4_net_ops = {
410 .init = ipv4_net_init,
411 .exit = ipv4_net_exit,
412 .id = &conntrack4_net_id,
413 .size = sizeof(struct conntrack4_net),
414};
415
416static int __init nf_conntrack_l3proto_ipv4_init(void)
417{
418 int ret = 0;
419
420 need_conntrack();
421
422#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
423 if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) !=
424 nf_conntrack_l3proto_ipv4.nla_size))
425 return -EINVAL;
426#endif
427 ret = nf_register_sockopt(&so_getorigdst);
428 if (ret < 0) {
429 pr_err("Unable to register netfilter socket option\n");
430 return ret;
431 }
432
433 ret = register_pernet_subsys(&ipv4_net_ops);
434 if (ret < 0) {
435 pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
436 goto cleanup_sockopt;
437 }
438
439 ret = nf_ct_l4proto_register(builtin_l4proto4,
440 ARRAY_SIZE(builtin_l4proto4));
441 if (ret < 0)
442 goto cleanup_pernet;
443
444 ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
445 if (ret < 0) {
446 pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
447 goto cleanup_l4proto;
448 }
449
450 return ret;
451cleanup_l4proto:
452 nf_ct_l4proto_unregister(builtin_l4proto4,
453 ARRAY_SIZE(builtin_l4proto4));
454 cleanup_pernet:
455 unregister_pernet_subsys(&ipv4_net_ops);
456 cleanup_sockopt:
457 nf_unregister_sockopt(&so_getorigdst);
458 return ret;
459}
460
461static void __exit nf_conntrack_l3proto_ipv4_fini(void)
462{
463 synchronize_net();
464 nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
465 nf_ct_l4proto_unregister(builtin_l4proto4,
466 ARRAY_SIZE(builtin_l4proto4));
467 unregister_pernet_subsys(&ipv4_net_ops);
468 nf_unregister_sockopt(&so_getorigdst);
469}
470
471module_init(nf_conntrack_l3proto_ipv4_init);
472module_exit(nf_conntrack_l3proto_ipv4_fini);
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 4388de0e5380..1e6f28c97d3a 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -35,7 +35,7 @@ static const struct nf_loginfo default_loginfo = {
35}; 35};
36 36
37/* One level of recursion won't kill us */ 37/* One level of recursion won't kill us */
38static void dump_ipv4_packet(struct nf_log_buf *m, 38static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
39 const struct nf_loginfo *info, 39 const struct nf_loginfo *info,
40 const struct sk_buff *skb, unsigned int iphoff) 40 const struct sk_buff *skb, unsigned int iphoff)
41{ 41{
@@ -183,7 +183,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
183 /* Max length: 3+maxlen */ 183 /* Max length: 3+maxlen */
184 if (!iphoff) { /* Only recurse once. */ 184 if (!iphoff) { /* Only recurse once. */
185 nf_log_buf_add(m, "["); 185 nf_log_buf_add(m, "[");
186 dump_ipv4_packet(m, info, skb, 186 dump_ipv4_packet(net, m, info, skb,
187 iphoff + ih->ihl*4+sizeof(_icmph)); 187 iphoff + ih->ihl*4+sizeof(_icmph));
188 nf_log_buf_add(m, "] "); 188 nf_log_buf_add(m, "] ");
189 } 189 }
@@ -251,7 +251,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
251 251
252 /* Max length: 15 "UID=4294967295 " */ 252 /* Max length: 15 "UID=4294967295 " */
253 if ((logflags & NF_LOG_UID) && !iphoff) 253 if ((logflags & NF_LOG_UID) && !iphoff)
254 nf_log_dump_sk_uid_gid(m, skb->sk); 254 nf_log_dump_sk_uid_gid(net, m, skb->sk);
255 255
256 /* Max length: 16 "MARK=0xFFFFFFFF " */ 256 /* Max length: 16 "MARK=0xFFFFFFFF " */
257 if (!iphoff && skb->mark) 257 if (!iphoff && skb->mark)
@@ -333,7 +333,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
333 if (in != NULL) 333 if (in != NULL)
334 dump_ipv4_mac_header(m, loginfo, skb); 334 dump_ipv4_mac_header(m, loginfo, skb);
335 335
336 dump_ipv4_packet(m, loginfo, skb, 0); 336 dump_ipv4_packet(net, m, loginfo, skb, 0);
337 337
338 nf_log_buf_close(m); 338 nf_log_buf_close(m);
339} 339}
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 805e83ec3ad9..164714104965 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -37,7 +37,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
37 * to a listener socket if there's one */ 37 * to a listener socket if there's one */
38 struct sock *sk2; 38 struct sock *sk2;
39 39
40 sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, 40 sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
41 iph->saddr, laddr ? laddr : iph->daddr, 41 iph->saddr, laddr ? laddr : iph->daddr,
42 hp->source, lport ? lport : hp->dest, 42 hp->source, lport ? lport : hp->dest,
43 skb->dev, NF_TPROXY_LOOKUP_LISTENER); 43 skb->dev, NF_TPROXY_LOOKUP_LISTENER);
@@ -71,7 +71,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
71EXPORT_SYMBOL_GPL(nf_tproxy_laddr4); 71EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
72 72
73struct sock * 73struct sock *
74nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, 74nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
75 const u8 protocol, 75 const u8 protocol,
76 const __be32 saddr, const __be32 daddr, 76 const __be32 saddr, const __be32 daddr,
77 const __be16 sport, const __be16 dport, 77 const __be16 sport, const __be16 dport,
@@ -79,16 +79,21 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
79 const enum nf_tproxy_lookup_t lookup_type) 79 const enum nf_tproxy_lookup_t lookup_type)
80{ 80{
81 struct sock *sk; 81 struct sock *sk;
82 struct tcphdr *tcph;
83 82
84 switch (protocol) { 83 switch (protocol) {
85 case IPPROTO_TCP: 84 case IPPROTO_TCP: {
85 struct tcphdr _hdr, *hp;
86
87 hp = skb_header_pointer(skb, ip_hdrlen(skb),
88 sizeof(struct tcphdr), &_hdr);
89 if (hp == NULL)
90 return NULL;
91
86 switch (lookup_type) { 92 switch (lookup_type) {
87 case NF_TPROXY_LOOKUP_LISTENER: 93 case NF_TPROXY_LOOKUP_LISTENER:
88 tcph = hp;
89 sk = inet_lookup_listener(net, &tcp_hashinfo, skb, 94 sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
90 ip_hdrlen(skb) + 95 ip_hdrlen(skb) +
91 __tcp_hdrlen(tcph), 96 __tcp_hdrlen(hp),
92 saddr, sport, 97 saddr, sport,
93 daddr, dport, 98 daddr, dport,
94 in->ifindex, 0); 99 in->ifindex, 0);
@@ -110,6 +115,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
110 BUG(); 115 BUG();
111 } 116 }
112 break; 117 break;
118 }
113 case IPPROTO_UDP: 119 case IPPROTO_UDP:
114 sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, 120 sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
115 in->ifindex); 121 in->ifindex);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2ed64bca54e3..8d7aaf118a30 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -320,8 +320,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
320 if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) 320 if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
321 chk_addr_ret = RTN_LOCAL; 321 chk_addr_ret = RTN_LOCAL;
322 322
323 if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && 323 if ((!inet_can_nonlocal_bind(net, isk) &&
324 isk->freebind == 0 && isk->transparent == 0 &&
325 chk_addr_ret != RTN_LOCAL) || 324 chk_addr_ret != RTN_LOCAL) ||
326 chk_addr_ret == RTN_MULTICAST || 325 chk_addr_ret == RTN_MULTICAST ||
327 chk_addr_ret == RTN_BROADCAST) 326 chk_addr_ret == RTN_BROADCAST)
@@ -361,8 +360,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
361 scoped); 360 scoped);
362 rcu_read_unlock(); 361 rcu_read_unlock();
363 362
364 if (!(net->ipv6.sysctl.ip_nonlocal_bind || 363 if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr ||
365 isk->freebind || isk->transparent || has_addr ||
366 addr_type == IPV6_ADDR_ANY)) 364 addr_type == IPV6_ADDR_ANY))
367 return -EADDRNOTAVAIL; 365 return -EADDRNOTAVAIL;
368 366
@@ -739,13 +737,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
739 /* no remote port */ 737 /* no remote port */
740 } 738 }
741 739
742 ipc.sockc.tsflags = sk->sk_tsflags; 740 ipcm_init_sk(&ipc, inet);
743 ipc.addr = inet->inet_saddr;
744 ipc.opt = NULL;
745 ipc.oif = sk->sk_bound_dev_if;
746 ipc.tx_flags = 0;
747 ipc.ttl = 0;
748 ipc.tos = -1;
749 741
750 if (msg->msg_controllen) { 742 if (msg->msg_controllen) {
751 err = ip_cmsg_send(sk, msg, &ipc, false); 743 err = ip_cmsg_send(sk, msg, &ipc, false);
@@ -769,8 +761,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
769 rcu_read_unlock(); 761 rcu_read_unlock();
770 } 762 }
771 763
772 sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
773
774 saddr = ipc.addr; 764 saddr = ipc.addr;
775 ipc.addr = faddr = daddr; 765 ipc.addr = faddr = daddr;
776 766
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 77350c1256ce..70289682a670 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -119,6 +119,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
119 SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), 119 SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
120 SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), 120 SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
121 SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), 121 SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
122 SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
122 SNMP_MIB_SENTINEL 123 SNMP_MIB_SENTINEL
123}; 124};
124 125
@@ -287,6 +288,8 @@ static const struct snmp_mib snmp4_net_list[] = {
287 SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), 288 SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
288 SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), 289 SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
289 SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), 290 SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
291 SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
292 SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
290 SNMP_MIB_SENTINEL 293 SNMP_MIB_SENTINEL
291}; 294};
292 295
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index abb3c9490c55..33df4d76db2d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
381 381
382 skb->priority = sk->sk_priority; 382 skb->priority = sk->sk_priority;
383 skb->mark = sk->sk_mark; 383 skb->mark = sk->sk_mark;
384 skb->tstamp = sockc->transmit_time;
384 skb_dst_set(skb, &rt->dst); 385 skb_dst_set(skb, &rt->dst);
385 *rtp = NULL; 386 *rtp = NULL;
386 387
@@ -561,13 +562,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
561 daddr = inet->inet_daddr; 562 daddr = inet->inet_daddr;
562 } 563 }
563 564
564 ipc.sockc.tsflags = sk->sk_tsflags; 565 ipcm_init_sk(&ipc, inet);
565 ipc.addr = inet->inet_saddr;
566 ipc.opt = NULL;
567 ipc.tx_flags = 0;
568 ipc.ttl = 0;
569 ipc.tos = -1;
570 ipc.oif = sk->sk_bound_dev_if;
571 566
572 if (msg->msg_controllen) { 567 if (msg->msg_controllen) {
573 err = ip_cmsg_send(sk, msg, &ipc, false); 568 err = ip_cmsg_send(sk, msg, &ipc, false);
@@ -670,8 +665,6 @@ back_from_confirm:
670 &rt, msg->msg_flags, &ipc.sockc); 665 &rt, msg->msg_flags, &ipc.sockc);
671 666
672 else { 667 else {
673 sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
674
675 if (!ipc.addr) 668 if (!ipc.addr)
676 ipc.addr = fl4.daddr; 669 ipc.addr = fl4.daddr;
677 lock_sock(sk); 670 lock_sock(sk);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1df6e97106d7..b678466da451 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1996 goto no_route; 1996 goto no_route;
1997 } 1997 }
1998 1998
1999 if (res->type == RTN_BROADCAST) 1999 if (res->type == RTN_BROADCAST) {
2000 if (IN_DEV_BFORWARD(in_dev))
2001 goto make_route;
2000 goto brd_input; 2002 goto brd_input;
2003 }
2001 2004
2002 if (res->type == RTN_LOCAL) { 2005 if (res->type == RTN_LOCAL) {
2003 err = fib_validate_source(skb, saddr, daddr, tos, 2006 err = fib_validate_source(skb, saddr, daddr, tos,
@@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2014 if (res->type != RTN_UNICAST) 2017 if (res->type != RTN_UNICAST)
2015 goto martian_destination; 2018 goto martian_destination;
2016 2019
2020make_route:
2017 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); 2021 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2018out: return err; 2022out: return err;
2019 2023
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d06247ba08b2..b92f422f2fa8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -189,8 +189,9 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
189 if (write && ret == 0) { 189 if (write && ret == 0) {
190 low = make_kgid(user_ns, urange[0]); 190 low = make_kgid(user_ns, urange[0]);
191 high = make_kgid(user_ns, urange[1]); 191 high = make_kgid(user_ns, urange[1]);
192 if (!gid_valid(low) || !gid_valid(high) || 192 if (!gid_valid(low) || !gid_valid(high))
193 (urange[1] < urange[0]) || gid_lt(high, low)) { 193 return -EINVAL;
194 if (urange[1] < urange[0] || gid_lt(high, low)) {
194 low = make_kgid(&init_user_ns, 1); 195 low = make_kgid(&init_user_ns, 1);
195 high = make_kgid(&init_user_ns, 0); 196 high = make_kgid(&init_user_ns, 0);
196 } 197 }
@@ -200,6 +201,23 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
200 return ret; 201 return ret;
201} 202}
202 203
204static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
205 void __user *buffer,
206 size_t *lenp, loff_t *ppos)
207{
208 struct net *net;
209 int ret;
210
211 net = container_of(table->data, struct net,
212 ipv4.sysctl_ip_fwd_update_priority);
213 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
214 if (write && ret == 0)
215 call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE,
216 net);
217
218 return ret;
219}
220
203static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, 221static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
204 void __user *buffer, size_t *lenp, loff_t *ppos) 222 void __user *buffer, size_t *lenp, loff_t *ppos)
205{ 223{
@@ -265,8 +283,9 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
265 ipv4.sysctl_tcp_fastopen); 283 ipv4.sysctl_tcp_fastopen);
266 struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; 284 struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
267 struct tcp_fastopen_context *ctxt; 285 struct tcp_fastopen_context *ctxt;
268 int ret;
269 u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ 286 u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
287 __le32 key[4];
288 int ret, i;
270 289
271 tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); 290 tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
272 if (!tbl.data) 291 if (!tbl.data)
@@ -275,11 +294,14 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
275 rcu_read_lock(); 294 rcu_read_lock();
276 ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); 295 ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
277 if (ctxt) 296 if (ctxt)
278 memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); 297 memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
279 else 298 else
280 memset(user_key, 0, sizeof(user_key)); 299 memset(key, 0, sizeof(key));
281 rcu_read_unlock(); 300 rcu_read_unlock();
282 301
302 for (i = 0; i < ARRAY_SIZE(key); i++)
303 user_key[i] = le32_to_cpu(key[i]);
304
283 snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", 305 snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
284 user_key[0], user_key[1], user_key[2], user_key[3]); 306 user_key[0], user_key[1], user_key[2], user_key[3]);
285 ret = proc_dostring(&tbl, write, buffer, lenp, ppos); 307 ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
@@ -290,13 +312,17 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
290 ret = -EINVAL; 312 ret = -EINVAL;
291 goto bad_key; 313 goto bad_key;
292 } 314 }
293 tcp_fastopen_reset_cipher(net, NULL, user_key, 315
316 for (i = 0; i < ARRAY_SIZE(user_key); i++)
317 key[i] = cpu_to_le32(user_key[i]);
318
319 tcp_fastopen_reset_cipher(net, NULL, key,
294 TCP_FASTOPEN_KEY_LENGTH); 320 TCP_FASTOPEN_KEY_LENGTH);
295 } 321 }
296 322
297bad_key: 323bad_key:
298 pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", 324 pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
299 user_key[0], user_key[1], user_key[2], user_key[3], 325 user_key[0], user_key[1], user_key[2], user_key[3],
300 (char *)tbl.data, ret); 326 (char *)tbl.data, ret);
301 kfree(tbl.data); 327 kfree(tbl.data);
302 return ret; 328 return ret;
@@ -655,6 +681,15 @@ static struct ctl_table ipv4_net_table[] = {
655 .proc_handler = proc_dointvec, 681 .proc_handler = proc_dointvec,
656 }, 682 },
657 { 683 {
684 .procname = "ip_forward_update_priority",
685 .data = &init_net.ipv4.sysctl_ip_fwd_update_priority,
686 .maxlen = sizeof(int),
687 .mode = 0644,
688 .proc_handler = ipv4_fwd_update_priority,
689 .extra1 = &zero,
690 .extra2 = &one,
691 },
692 {
658 .procname = "ip_nonlocal_bind", 693 .procname = "ip_nonlocal_bind",
659 .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, 694 .data = &init_net.ipv4.sysctl_ip_nonlocal_bind,
660 .maxlen = sizeof(int), 695 .maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 141acd92e58a..b8af2fec5ad5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -494,21 +494,32 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
494} 494}
495 495
496/* 496/*
497 * Socket is not locked. We are protected from async events by poll logic and 497 * Wait for a TCP event.
498 * correct handling of state changes made by other threads is impossible in 498 *
499 * any case. 499 * Note that we don't need to lock the socket, as the upper poll layers
500 * take care of normal races (between the test and the event) and we don't
501 * go look at any of the socket buffers directly.
500 */ 502 */
501__poll_t tcp_poll_mask(struct socket *sock, __poll_t events) 503__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
502{ 504{
505 __poll_t mask;
503 struct sock *sk = sock->sk; 506 struct sock *sk = sock->sk;
504 const struct tcp_sock *tp = tcp_sk(sk); 507 const struct tcp_sock *tp = tcp_sk(sk);
505 __poll_t mask = 0;
506 int state; 508 int state;
507 509
510 sock_poll_wait(file, wait);
511
508 state = inet_sk_state_load(sk); 512 state = inet_sk_state_load(sk);
509 if (state == TCP_LISTEN) 513 if (state == TCP_LISTEN)
510 return inet_csk_listen_poll(sk); 514 return inet_csk_listen_poll(sk);
511 515
516 /* Socket is not locked. We are protected from async events
517 * by poll logic and correct handling of state changes
518 * made by other threads is impossible in any case.
519 */
520
521 mask = 0;
522
512 /* 523 /*
513 * EPOLLHUP is certainly not done right. But poll() doesn't 524 * EPOLLHUP is certainly not done right. But poll() doesn't
514 * have a notion of HUP in just one direction, and for a 525 * have a notion of HUP in just one direction, and for a
@@ -589,7 +600,7 @@ __poll_t tcp_poll_mask(struct socket *sock, __poll_t events)
589 600
590 return mask; 601 return mask;
591} 602}
592EXPORT_SYMBOL(tcp_poll_mask); 603EXPORT_SYMBOL(tcp_poll);
593 604
594int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) 605int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
595{ 606{
@@ -806,8 +817,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
806 * This occurs when user tries to read 817 * This occurs when user tries to read
807 * from never connected socket. 818 * from never connected socket.
808 */ 819 */
809 if (!sock_flag(sk, SOCK_DONE)) 820 ret = -ENOTCONN;
810 ret = -ENOTCONN;
811 break; 821 break;
812 } 822 }
813 if (!timeo) { 823 if (!timeo) {
@@ -1230,7 +1240,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
1230 /* 'common' sending to sendq */ 1240 /* 'common' sending to sendq */
1231 } 1241 }
1232 1242
1233 sockc.tsflags = sk->sk_tsflags; 1243 sockcm_init(&sockc, sk);
1234 if (msg->msg_controllen) { 1244 if (msg->msg_controllen) {
1235 err = sock_cmsg_send(sk, msg, &sockc); 1245 err = sock_cmsg_send(sk, msg, &sockc);
1236 if (unlikely(err)) { 1246 if (unlikely(err)) {
@@ -1264,9 +1274,6 @@ restart:
1264 int linear; 1274 int linear;
1265 1275
1266new_segment: 1276new_segment:
1267 /* Allocate new segment. If the interface is SG,
1268 * allocate skb fitting to single page.
1269 */
1270 if (!sk_stream_memory_free(sk)) 1277 if (!sk_stream_memory_free(sk))
1271 goto wait_for_sndbuf; 1278 goto wait_for_sndbuf;
1272 1279
@@ -1987,7 +1994,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1987 * shouldn't happen. 1994 * shouldn't happen.
1988 */ 1995 */
1989 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), 1996 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1990 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n", 1997 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
1991 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, 1998 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1992 flags)) 1999 flags))
1993 break; 2000 break;
@@ -2002,7 +2009,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
2002 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 2009 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2003 goto found_fin_ok; 2010 goto found_fin_ok;
2004 WARN(!(flags & MSG_PEEK), 2011 WARN(!(flags & MSG_PEEK),
2005 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", 2012 "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2006 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); 2013 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2007 } 2014 }
2008 2015
@@ -2031,13 +2038,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
2031 break; 2038 break;
2032 2039
2033 if (sk->sk_state == TCP_CLOSE) { 2040 if (sk->sk_state == TCP_CLOSE) {
2034 if (!sock_flag(sk, SOCK_DONE)) { 2041 /* This occurs when user tries to read
2035 /* This occurs when user tries to read 2042 * from never connected socket.
2036 * from never connected socket. 2043 */
2037 */ 2044 copied = -ENOTCONN;
2038 copied = -ENOTCONN;
2039 break;
2040 }
2041 break; 2045 break;
2042 } 2046 }
2043 2047
@@ -2527,7 +2531,6 @@ int tcp_disconnect(struct sock *sk, int flags)
2527 struct inet_sock *inet = inet_sk(sk); 2531 struct inet_sock *inet = inet_sk(sk);
2528 struct inet_connection_sock *icsk = inet_csk(sk); 2532 struct inet_connection_sock *icsk = inet_csk(sk);
2529 struct tcp_sock *tp = tcp_sk(sk); 2533 struct tcp_sock *tp = tcp_sk(sk);
2530 int err = 0;
2531 int old_state = sk->sk_state; 2534 int old_state = sk->sk_state;
2532 2535
2533 if (old_state != TCP_CLOSE) 2536 if (old_state != TCP_CLOSE)
@@ -2551,6 +2554,8 @@ int tcp_disconnect(struct sock *sk, int flags)
2551 2554
2552 tcp_clear_xmit_timers(sk); 2555 tcp_clear_xmit_timers(sk);
2553 __skb_queue_purge(&sk->sk_receive_queue); 2556 __skb_queue_purge(&sk->sk_receive_queue);
2557 tp->copied_seq = tp->rcv_nxt;
2558 tp->urg_data = 0;
2554 tcp_write_queue_purge(sk); 2559 tcp_write_queue_purge(sk);
2555 tcp_fastopen_active_disable_ofo_check(sk); 2560 tcp_fastopen_active_disable_ofo_check(sk);
2556 skb_rbtree_purge(&tp->out_of_order_queue); 2561 skb_rbtree_purge(&tp->out_of_order_queue);
@@ -2563,6 +2568,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2563 sk->sk_shutdown = 0; 2568 sk->sk_shutdown = 0;
2564 sock_reset_flag(sk, SOCK_DONE); 2569 sock_reset_flag(sk, SOCK_DONE);
2565 tp->srtt_us = 0; 2570 tp->srtt_us = 0;
2571 tp->rcv_rtt_last_tsecr = 0;
2566 tp->write_seq += tp->max_window + 2; 2572 tp->write_seq += tp->max_window + 2;
2567 if (tp->write_seq == 0) 2573 if (tp->write_seq == 0)
2568 tp->write_seq = 1; 2574 tp->write_seq = 1;
@@ -2587,6 +2593,10 @@ int tcp_disconnect(struct sock *sk, int flags)
2587 sk->sk_rx_dst = NULL; 2593 sk->sk_rx_dst = NULL;
2588 tcp_saved_syn_free(tp); 2594 tcp_saved_syn_free(tp);
2589 tp->compressed_ack = 0; 2595 tp->compressed_ack = 0;
2596 tp->bytes_sent = 0;
2597 tp->bytes_retrans = 0;
2598 tp->dsack_dups = 0;
2599 tp->reord_seen = 0;
2590 2600
2591 /* Clean up fastopen related fields */ 2601 /* Clean up fastopen related fields */
2592 tcp_free_fastopen_req(tp); 2602 tcp_free_fastopen_req(tp);
@@ -2601,7 +2611,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2601 } 2611 }
2602 2612
2603 sk->sk_error_report(sk); 2613 sk->sk_error_report(sk);
2604 return err; 2614 return 0;
2605} 2615}
2606EXPORT_SYMBOL(tcp_disconnect); 2616EXPORT_SYMBOL(tcp_disconnect);
2607 2617
@@ -2810,14 +2820,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2810 case TCP_REPAIR: 2820 case TCP_REPAIR:
2811 if (!tcp_can_repair_sock(sk)) 2821 if (!tcp_can_repair_sock(sk))
2812 err = -EPERM; 2822 err = -EPERM;
2813 else if (val == 1) { 2823 else if (val == TCP_REPAIR_ON) {
2814 tp->repair = 1; 2824 tp->repair = 1;
2815 sk->sk_reuse = SK_FORCE_REUSE; 2825 sk->sk_reuse = SK_FORCE_REUSE;
2816 tp->repair_queue = TCP_NO_QUEUE; 2826 tp->repair_queue = TCP_NO_QUEUE;
2817 } else if (val == 0) { 2827 } else if (val == TCP_REPAIR_OFF) {
2818 tp->repair = 0; 2828 tp->repair = 0;
2819 sk->sk_reuse = SK_NO_REUSE; 2829 sk->sk_reuse = SK_NO_REUSE;
2820 tcp_send_window_probe(sk); 2830 tcp_send_window_probe(sk);
2831 } else if (val == TCP_REPAIR_OFF_NO_WP) {
2832 tp->repair = 0;
2833 sk->sk_reuse = SK_NO_REUSE;
2821 } else 2834 } else
2822 err = -EINVAL; 2835 err = -EINVAL;
2823 2836
@@ -2979,7 +2992,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2979 if (val < 0) 2992 if (val < 0)
2980 err = -EINVAL; 2993 err = -EINVAL;
2981 else 2994 else
2982 icsk->icsk_user_timeout = msecs_to_jiffies(val); 2995 icsk->icsk_user_timeout = val;
2983 break; 2996 break;
2984 2997
2985 case TCP_FASTOPEN: 2998 case TCP_FASTOPEN:
@@ -3191,10 +3204,41 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
3191 info->tcpi_delivery_rate = rate64; 3204 info->tcpi_delivery_rate = rate64;
3192 info->tcpi_delivered = tp->delivered; 3205 info->tcpi_delivered = tp->delivered;
3193 info->tcpi_delivered_ce = tp->delivered_ce; 3206 info->tcpi_delivered_ce = tp->delivered_ce;
3207 info->tcpi_bytes_sent = tp->bytes_sent;
3208 info->tcpi_bytes_retrans = tp->bytes_retrans;
3209 info->tcpi_dsack_dups = tp->dsack_dups;
3210 info->tcpi_reord_seen = tp->reord_seen;
3194 unlock_sock_fast(sk, slow); 3211 unlock_sock_fast(sk, slow);
3195} 3212}
3196EXPORT_SYMBOL_GPL(tcp_get_info); 3213EXPORT_SYMBOL_GPL(tcp_get_info);
3197 3214
3215static size_t tcp_opt_stats_get_size(void)
3216{
3217 return
3218 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
3219 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
3220 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
3221 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
3222 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
3223 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
3224 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
3225 nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
3226 nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
3227 nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
3228 nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
3229 nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
3230 nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
3231 nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
3232 nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
3233 nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
3234 nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
3235 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
3236 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
3237 nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
3238 nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
3239 0;
3240}
3241
3198struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) 3242struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3199{ 3243{
3200 const struct tcp_sock *tp = tcp_sk(sk); 3244 const struct tcp_sock *tp = tcp_sk(sk);
@@ -3203,9 +3247,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3203 u64 rate64; 3247 u64 rate64;
3204 u32 rate; 3248 u32 rate;
3205 3249
3206 stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + 3250 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3207 7 * nla_total_size(sizeof(u32)) +
3208 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
3209 if (!stats) 3251 if (!stats)
3210 return NULL; 3252 return NULL;
3211 3253
@@ -3241,6 +3283,13 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3241 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); 3283 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3242 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); 3284 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3243 3285
3286 nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
3287 TCP_NLA_PAD);
3288 nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
3289 TCP_NLA_PAD);
3290 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
3291 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3292
3244 return stats; 3293 return stats;
3245} 3294}
3246 3295
@@ -3435,7 +3484,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
3435 break; 3484 break;
3436 3485
3437 case TCP_USER_TIMEOUT: 3486 case TCP_USER_TIMEOUT:
3438 val = jiffies_to_msecs(icsk->icsk_user_timeout); 3487 val = icsk->icsk_user_timeout;
3439 break; 3488 break;
3440 3489
3441 case TCP_FASTOPEN: 3490 case TCP_FASTOPEN:
@@ -3709,8 +3758,7 @@ int tcp_abort(struct sock *sk, int err)
3709 struct request_sock *req = inet_reqsk(sk); 3758 struct request_sock *req = inet_reqsk(sk);
3710 3759
3711 local_bh_disable(); 3760 local_bh_disable();
3712 inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, 3761 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
3713 req);
3714 local_bh_enable(); 3762 local_bh_enable();
3715 return 0; 3763 return 0;
3716 } 3764 }
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 58e2f479ffb4..02ff2dde9609 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -95,11 +95,10 @@ struct bbr {
95 u32 mode:3, /* current bbr_mode in state machine */ 95 u32 mode:3, /* current bbr_mode in state machine */
96 prev_ca_state:3, /* CA state on previous ACK */ 96 prev_ca_state:3, /* CA state on previous ACK */
97 packet_conservation:1, /* use packet conservation? */ 97 packet_conservation:1, /* use packet conservation? */
98 restore_cwnd:1, /* decided to revert cwnd to old value */
99 round_start:1, /* start of packet-timed tx->ack round? */ 98 round_start:1, /* start of packet-timed tx->ack round? */
100 idle_restart:1, /* restarting after idle? */ 99 idle_restart:1, /* restarting after idle? */
101 probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ 100 probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
102 unused:12, 101 unused:13,
103 lt_is_sampling:1, /* taking long-term ("LT") samples now? */ 102 lt_is_sampling:1, /* taking long-term ("LT") samples now? */
104 lt_rtt_cnt:7, /* round trips in long-term interval */ 103 lt_rtt_cnt:7, /* round trips in long-term interval */
105 lt_use_bw:1; /* use lt_bw as our bw estimate? */ 104 lt_use_bw:1; /* use lt_bw as our bw estimate? */
@@ -175,6 +174,8 @@ static const u32 bbr_lt_bw_diff = 4000 / 8;
175/* If we estimate we're policed, use lt_bw for this many round trips: */ 174/* If we estimate we're policed, use lt_bw for this many round trips: */
176static const u32 bbr_lt_bw_max_rtts = 48; 175static const u32 bbr_lt_bw_max_rtts = 48;
177 176
177static void bbr_check_probe_rtt_done(struct sock *sk);
178
178/* Do we estimate that STARTUP filled the pipe? */ 179/* Do we estimate that STARTUP filled the pipe? */
179static bool bbr_full_bw_reached(const struct sock *sk) 180static bool bbr_full_bw_reached(const struct sock *sk)
180{ 181{
@@ -205,7 +206,11 @@ static u32 bbr_bw(const struct sock *sk)
205 */ 206 */
206static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) 207static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
207{ 208{
208 rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); 209 unsigned int mss = tcp_sk(sk)->mss_cache;
210
211 if (!tcp_needs_internal_pacing(sk))
212 mss = tcp_mss_to_mtu(sk, mss);
213 rate *= mss;
209 rate *= gain; 214 rate *= gain;
210 rate >>= BBR_SCALE; 215 rate >>= BBR_SCALE;
211 rate *= USEC_PER_SEC; 216 rate *= USEC_PER_SEC;
@@ -305,6 +310,8 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
305 */ 310 */
306 if (bbr->mode == BBR_PROBE_BW) 311 if (bbr->mode == BBR_PROBE_BW)
307 bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); 312 bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
313 else if (bbr->mode == BBR_PROBE_RTT)
314 bbr_check_probe_rtt_done(sk);
308 } 315 }
309} 316}
310 317
@@ -354,6 +361,10 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
354 /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ 361 /* Reduce delayed ACKs by rounding up cwnd to the next even number. */
355 cwnd = (cwnd + 1) & ~1U; 362 cwnd = (cwnd + 1) & ~1U;
356 363
364 /* Ensure gain cycling gets inflight above BDP even for small BDPs. */
365 if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
366 cwnd += 2;
367
357 return cwnd; 368 return cwnd;
358} 369}
359 370
@@ -388,17 +399,11 @@ static bool bbr_set_cwnd_to_recover_or_restore(
388 cwnd = tcp_packets_in_flight(tp) + acked; 399 cwnd = tcp_packets_in_flight(tp) + acked;
389 } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { 400 } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
390 /* Exiting loss recovery; restore cwnd saved before recovery. */ 401 /* Exiting loss recovery; restore cwnd saved before recovery. */
391 bbr->restore_cwnd = 1; 402 cwnd = max(cwnd, bbr->prior_cwnd);
392 bbr->packet_conservation = 0; 403 bbr->packet_conservation = 0;
393 } 404 }
394 bbr->prev_ca_state = state; 405 bbr->prev_ca_state = state;
395 406
396 if (bbr->restore_cwnd) {
397 /* Restore cwnd after exiting loss recovery or PROBE_RTT. */
398 cwnd = max(cwnd, bbr->prior_cwnd);
399 bbr->restore_cwnd = 0;
400 }
401
402 if (bbr->packet_conservation) { 407 if (bbr->packet_conservation) {
403 *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); 408 *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
404 return true; /* yes, using packet conservation */ 409 return true; /* yes, using packet conservation */
@@ -415,10 +420,10 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
415{ 420{
416 struct tcp_sock *tp = tcp_sk(sk); 421 struct tcp_sock *tp = tcp_sk(sk);
417 struct bbr *bbr = inet_csk_ca(sk); 422 struct bbr *bbr = inet_csk_ca(sk);
418 u32 cwnd = 0, target_cwnd = 0; 423 u32 cwnd = tp->snd_cwnd, target_cwnd = 0;
419 424
420 if (!acked) 425 if (!acked)
421 return; 426 goto done; /* no packet fully ACKed; just apply caps */
422 427
423 if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) 428 if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
424 goto done; 429 goto done;
@@ -740,6 +745,20 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
740 bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ 745 bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
741} 746}
742 747
748static void bbr_check_probe_rtt_done(struct sock *sk)
749{
750 struct tcp_sock *tp = tcp_sk(sk);
751 struct bbr *bbr = inet_csk_ca(sk);
752
753 if (!(bbr->probe_rtt_done_stamp &&
754 after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
755 return;
756
757 bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */
758 tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
759 bbr_reset_mode(sk);
760}
761
743/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and 762/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
744 * periodically drain the bottleneck queue, to converge to measure the true 763 * periodically drain the bottleneck queue, to converge to measure the true
745 * min_rtt (unloaded propagation delay). This allows the flows to keep queues 764 * min_rtt (unloaded propagation delay). This allows the flows to keep queues
@@ -798,12 +817,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
798 } else if (bbr->probe_rtt_done_stamp) { 817 } else if (bbr->probe_rtt_done_stamp) {
799 if (bbr->round_start) 818 if (bbr->round_start)
800 bbr->probe_rtt_round_done = 1; 819 bbr->probe_rtt_round_done = 1;
801 if (bbr->probe_rtt_round_done && 820 if (bbr->probe_rtt_round_done)
802 after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { 821 bbr_check_probe_rtt_done(sk);
803 bbr->min_rtt_stamp = tcp_jiffies32;
804 bbr->restore_cwnd = 1; /* snap to prior_cwnd */
805 bbr_reset_mode(sk);
806 }
807 } 822 }
808 } 823 }
809 /* Restart after idle ends only once we process a new S/ACK for data */ 824 /* Restart after idle ends only once we process a new S/ACK for data */
@@ -854,7 +869,6 @@ static void bbr_init(struct sock *sk)
854 bbr->has_seen_rtt = 0; 869 bbr->has_seen_rtt = 0;
855 bbr_init_pacing_rate_from_rtt(sk); 870 bbr_init_pacing_rate_from_rtt(sk);
856 871
857 bbr->restore_cwnd = 0;
858 bbr->round_start = 0; 872 bbr->round_start = 0;
859 bbr->idle_restart = 0; 873 bbr->idle_restart = 0;
860 bbr->full_bw_reached = 0; 874 bbr->full_bw_reached = 0;
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 5f5e5936760e..ca61e2a659e7 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -55,7 +55,6 @@ struct dctcp {
55 u32 dctcp_alpha; 55 u32 dctcp_alpha;
56 u32 next_seq; 56 u32 next_seq;
57 u32 ce_state; 57 u32 ce_state;
58 u32 delayed_ack_reserved;
59 u32 loss_cwnd; 58 u32 loss_cwnd;
60}; 59};
61 60
@@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk)
96 95
97 ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); 96 ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
98 97
99 ca->delayed_ack_reserved = 0;
100 ca->loss_cwnd = 0; 98 ca->loss_cwnd = 0;
101 ca->ce_state = 0; 99 ca->ce_state = 0;
102 100
@@ -131,23 +129,14 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
131 struct dctcp *ca = inet_csk_ca(sk); 129 struct dctcp *ca = inet_csk_ca(sk);
132 struct tcp_sock *tp = tcp_sk(sk); 130 struct tcp_sock *tp = tcp_sk(sk);
133 131
134 /* State has changed from CE=0 to CE=1 and delayed 132 if (!ca->ce_state) {
135 * ACK has not sent yet. 133 /* State has changed from CE=0 to CE=1, force an immediate
136 */ 134 * ACK to reflect the new CE state. If an ACK was delayed,
137 if (!ca->ce_state && ca->delayed_ack_reserved) { 135 * send that first to reflect the prior CE state.
138 u32 tmp_rcv_nxt; 136 */
139 137 if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
140 /* Save current rcv_nxt. */ 138 __tcp_send_ack(sk, ca->prior_rcv_nxt);
141 tmp_rcv_nxt = tp->rcv_nxt; 139 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
142
143 /* Generate previous ack with CE=0. */
144 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
145 tp->rcv_nxt = ca->prior_rcv_nxt;
146
147 tcp_send_ack(sk);
148
149 /* Recover current rcv_nxt. */
150 tp->rcv_nxt = tmp_rcv_nxt;
151 } 140 }
152 141
153 ca->prior_rcv_nxt = tp->rcv_nxt; 142 ca->prior_rcv_nxt = tp->rcv_nxt;
@@ -161,23 +150,14 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
161 struct dctcp *ca = inet_csk_ca(sk); 150 struct dctcp *ca = inet_csk_ca(sk);
162 struct tcp_sock *tp = tcp_sk(sk); 151 struct tcp_sock *tp = tcp_sk(sk);
163 152
164 /* State has changed from CE=1 to CE=0 and delayed 153 if (ca->ce_state) {
165 * ACK has not sent yet. 154 /* State has changed from CE=1 to CE=0, force an immediate
166 */ 155 * ACK to reflect the new CE state. If an ACK was delayed,
167 if (ca->ce_state && ca->delayed_ack_reserved) { 156 * send that first to reflect the prior CE state.
168 u32 tmp_rcv_nxt; 157 */
169 158 if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
170 /* Save current rcv_nxt. */ 159 __tcp_send_ack(sk, ca->prior_rcv_nxt);
171 tmp_rcv_nxt = tp->rcv_nxt; 160 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
172
173 /* Generate previous ack with CE=1. */
174 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
175 tp->rcv_nxt = ca->prior_rcv_nxt;
176
177 tcp_send_ack(sk);
178
179 /* Recover current rcv_nxt. */
180 tp->rcv_nxt = tmp_rcv_nxt;
181 } 161 }
182 162
183 ca->prior_rcv_nxt = tp->rcv_nxt; 163 ca->prior_rcv_nxt = tp->rcv_nxt;
@@ -248,25 +228,6 @@ static void dctcp_state(struct sock *sk, u8 new_state)
248 } 228 }
249} 229}
250 230
251static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
252{
253 struct dctcp *ca = inet_csk_ca(sk);
254
255 switch (ev) {
256 case CA_EVENT_DELAYED_ACK:
257 if (!ca->delayed_ack_reserved)
258 ca->delayed_ack_reserved = 1;
259 break;
260 case CA_EVENT_NON_DELAYED_ACK:
261 if (ca->delayed_ack_reserved)
262 ca->delayed_ack_reserved = 0;
263 break;
264 default:
265 /* Don't care for the rest. */
266 break;
267 }
268}
269
270static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) 231static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
271{ 232{
272 switch (ev) { 233 switch (ev) {
@@ -276,10 +237,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
276 case CA_EVENT_ECN_NO_CE: 237 case CA_EVENT_ECN_NO_CE:
277 dctcp_ce_state_1_to_0(sk); 238 dctcp_ce_state_1_to_0(sk);
278 break; 239 break;
279 case CA_EVENT_DELAYED_ACK:
280 case CA_EVENT_NON_DELAYED_ACK:
281 dctcp_update_ack_reserved(sk, ev);
282 break;
283 default: 240 default:
284 /* Don't care for the rest. */ 241 /* Don't care for the rest. */
285 break; 242 break;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 355d3dffd021..4c2dd9f863f7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,6 +78,7 @@
78#include <linux/errqueue.h> 78#include <linux/errqueue.h>
79#include <trace/events/tcp.h> 79#include <trace/events/tcp.h>
80#include <linux/static_key.h> 80#include <linux/static_key.h>
81#include <net/busy_poll.h>
81 82
82int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 83int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
83 84
@@ -215,7 +216,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
215 icsk->icsk_ack.quick = quickacks; 216 icsk->icsk_ack.quick = quickacks;
216} 217}
217 218
218static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) 219void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
219{ 220{
220 struct inet_connection_sock *icsk = inet_csk(sk); 221 struct inet_connection_sock *icsk = inet_csk(sk);
221 222
@@ -223,6 +224,7 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
223 icsk->icsk_ack.pingpong = 0; 224 icsk->icsk_ack.pingpong = 0;
224 icsk->icsk_ack.ato = TCP_ATO_MIN; 225 icsk->icsk_ack.ato = TCP_ATO_MIN;
225} 226}
227EXPORT_SYMBOL(tcp_enter_quickack_mode);
226 228
227/* Send ACKs quickly, if "quick" count is not exhausted 229/* Send ACKs quickly, if "quick" count is not exhausted
228 * and the session is not interactive. 230 * and the session is not interactive.
@@ -243,10 +245,17 @@ static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
243 tp->ecn_flags |= TCP_ECN_QUEUE_CWR; 245 tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
244} 246}
245 247
246static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) 248static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
247{ 249{
248 if (tcp_hdr(skb)->cwr) 250 if (tcp_hdr(skb)->cwr) {
249 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 251 tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
252
253 /* If the sender is telling us it has entered CWR, then its
254 * cwnd may be very low (even just 1 packet), so we should ACK
255 * immediately.
256 */
257 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
258 }
250} 259}
251 260
252static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) 261static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
@@ -265,7 +274,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
265 * it is probably a retransmit. 274 * it is probably a retransmit.
266 */ 275 */
267 if (tp->ecn_flags & TCP_ECN_SEEN) 276 if (tp->ecn_flags & TCP_ECN_SEEN)
268 tcp_enter_quickack_mode(sk, 1); 277 tcp_enter_quickack_mode(sk, 2);
269 break; 278 break;
270 case INET_ECN_CE: 279 case INET_ECN_CE:
271 if (tcp_ca_needs_ecn(sk)) 280 if (tcp_ca_needs_ecn(sk))
@@ -273,7 +282,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
273 282
274 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { 283 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
275 /* Better not delay acks, sender can have a very low cwnd */ 284 /* Better not delay acks, sender can have a very low cwnd */
276 tcp_enter_quickack_mode(sk, 1); 285 tcp_enter_quickack_mode(sk, 2);
277 tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 286 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
278 } 287 }
279 tp->ecn_flags |= TCP_ECN_SEEN; 288 tp->ecn_flags |= TCP_ECN_SEEN;
@@ -582,9 +591,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
582{ 591{
583 struct tcp_sock *tp = tcp_sk(sk); 592 struct tcp_sock *tp = tcp_sk(sk);
584 593
585 if (tp->rx_opt.rcv_tsecr && 594 if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
586 (TCP_SKB_CB(skb)->end_seq - 595 return;
587 TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { 596 tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
597
598 if (TCP_SKB_CB(skb)->end_seq -
599 TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
588 u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; 600 u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
589 u32 delta_us; 601 u32 delta_us;
590 602
@@ -869,6 +881,7 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
869{ 881{
870 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; 882 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
871 tp->rack.dsack_seen = 1; 883 tp->rack.dsack_seen = 1;
884 tp->dsack_dups++;
872} 885}
873 886
874/* It's reordering when higher sequence was delivered (i.e. sacked) before 887/* It's reordering when higher sequence was delivered (i.e. sacked) before
@@ -900,8 +913,8 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
900 sock_net(sk)->ipv4.sysctl_tcp_max_reordering); 913 sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
901 } 914 }
902 915
903 tp->rack.reord = 1;
904 /* This exciting event is worth to be remembered. 8) */ 916 /* This exciting event is worth to be remembered. 8) */
917 tp->reord_seen++;
905 NET_INC_STATS(sock_net(sk), 918 NET_INC_STATS(sock_net(sk),
906 ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); 919 ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
907} 920}
@@ -1865,6 +1878,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
1865 1878
1866 tp->reordering = min_t(u32, tp->packets_out + addend, 1879 tp->reordering = min_t(u32, tp->packets_out + addend,
1867 sock_net(sk)->ipv4.sysctl_tcp_max_reordering); 1880 sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
1881 tp->reord_seen++;
1868 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); 1882 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
1869} 1883}
1870 1884
@@ -3181,6 +3195,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
3181 3195
3182 if (tcp_is_reno(tp)) { 3196 if (tcp_is_reno(tp)) {
3183 tcp_remove_reno_sacks(sk, pkts_acked); 3197 tcp_remove_reno_sacks(sk, pkts_acked);
3198
3199 /* If any of the cumulatively ACKed segments was
3200 * retransmitted, non-SACK case cannot confirm that
3201 * progress was due to original transmission due to
3202 * lack of TCPCB_SACKED_ACKED bits even if some of
3203 * the packets may have been never retransmitted.
3204 */
3205 if (flag & FLAG_RETRANS_DATA_ACKED)
3206 flag &= ~FLAG_ORIG_SACK_ACKED;
3184 } else { 3207 } else {
3185 int delta; 3208 int delta;
3186 3209
@@ -3449,7 +3472,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
3449static void tcp_store_ts_recent(struct tcp_sock *tp) 3472static void tcp_store_ts_recent(struct tcp_sock *tp)
3450{ 3473{
3451 tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; 3474 tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
3452 tp->rx_opt.ts_recent_stamp = get_seconds(); 3475 tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
3453} 3476}
3454 3477
3455static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) 3478static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
@@ -4330,6 +4353,11 @@ static bool tcp_try_coalesce(struct sock *sk,
4330 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) 4353 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4331 return false; 4354 return false;
4332 4355
4356#ifdef CONFIG_TLS_DEVICE
4357 if (from->decrypted != to->decrypted)
4358 return false;
4359#endif
4360
4333 if (!skb_try_coalesce(to, from, fragstolen, &delta)) 4361 if (!skb_try_coalesce(to, from, fragstolen, &delta))
4334 return false; 4362 return false;
4335 4363
@@ -4348,6 +4376,23 @@ static bool tcp_try_coalesce(struct sock *sk,
4348 return true; 4376 return true;
4349} 4377}
4350 4378
4379static bool tcp_ooo_try_coalesce(struct sock *sk,
4380 struct sk_buff *to,
4381 struct sk_buff *from,
4382 bool *fragstolen)
4383{
4384 bool res = tcp_try_coalesce(sk, to, from, fragstolen);
4385
4386 /* In case tcp_drop() is called later, update to->gso_segs */
4387 if (res) {
4388 u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
4389 max_t(u16, 1, skb_shinfo(from)->gso_segs);
4390
4391 skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
4392 }
4393 return res;
4394}
4395
4351static void tcp_drop(struct sock *sk, struct sk_buff *skb) 4396static void tcp_drop(struct sock *sk, struct sk_buff *skb)
4352{ 4397{
4353 sk_drops_add(sk, skb); 4398 sk_drops_add(sk, skb);
@@ -4471,8 +4516,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4471 /* In the typical case, we are adding an skb to the end of the list. 4516 /* In the typical case, we are adding an skb to the end of the list.
4472 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. 4517 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
4473 */ 4518 */
4474 if (tcp_try_coalesce(sk, tp->ooo_last_skb, 4519 if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
4475 skb, &fragstolen)) { 4520 skb, &fragstolen)) {
4476coalesce_done: 4521coalesce_done:
4477 tcp_grow_window(sk, skb); 4522 tcp_grow_window(sk, skb);
4478 kfree_skb_partial(skb, fragstolen); 4523 kfree_skb_partial(skb, fragstolen);
@@ -4500,7 +4545,7 @@ coalesce_done:
4500 /* All the bits are present. Drop. */ 4545 /* All the bits are present. Drop. */
4501 NET_INC_STATS(sock_net(sk), 4546 NET_INC_STATS(sock_net(sk),
4502 LINUX_MIB_TCPOFOMERGE); 4547 LINUX_MIB_TCPOFOMERGE);
4503 __kfree_skb(skb); 4548 tcp_drop(sk, skb);
4504 skb = NULL; 4549 skb = NULL;
4505 tcp_dsack_set(sk, seq, end_seq); 4550 tcp_dsack_set(sk, seq, end_seq);
4506 goto add_sack; 4551 goto add_sack;
@@ -4519,11 +4564,11 @@ coalesce_done:
4519 TCP_SKB_CB(skb1)->end_seq); 4564 TCP_SKB_CB(skb1)->end_seq);
4520 NET_INC_STATS(sock_net(sk), 4565 NET_INC_STATS(sock_net(sk),
4521 LINUX_MIB_TCPOFOMERGE); 4566 LINUX_MIB_TCPOFOMERGE);
4522 __kfree_skb(skb1); 4567 tcp_drop(sk, skb1);
4523 goto merge_right; 4568 goto merge_right;
4524 } 4569 }
4525 } else if (tcp_try_coalesce(sk, skb1, 4570 } else if (tcp_ooo_try_coalesce(sk, skb1,
4526 skb, &fragstolen)) { 4571 skb, &fragstolen)) {
4527 goto coalesce_done; 4572 goto coalesce_done;
4528 } 4573 }
4529 p = &parent->rb_right; 4574 p = &parent->rb_right;
@@ -4608,8 +4653,10 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4608 skb->data_len = data_len; 4653 skb->data_len = data_len;
4609 skb->len = size; 4654 skb->len = size;
4610 4655
4611 if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) 4656 if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
4657 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
4612 goto err_free; 4658 goto err_free;
4659 }
4613 4660
4614 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 4661 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
4615 if (err) 4662 if (err)
@@ -4656,7 +4703,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4656 skb_dst_drop(skb); 4703 skb_dst_drop(skb);
4657 __skb_pull(skb, tcp_hdr(skb)->doff * 4); 4704 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
4658 4705
4659 tcp_ecn_accept_cwr(tp, skb); 4706 tcp_ecn_accept_cwr(sk, skb);
4660 4707
4661 tp->rx_opt.dsack = 0; 4708 tp->rx_opt.dsack = 0;
4662 4709
@@ -4665,18 +4712,21 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4665 * Out of sequence packets to the out_of_order_queue. 4712 * Out of sequence packets to the out_of_order_queue.
4666 */ 4713 */
4667 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { 4714 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
4668 if (tcp_receive_window(tp) == 0) 4715 if (tcp_receive_window(tp) == 0) {
4716 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
4669 goto out_of_window; 4717 goto out_of_window;
4718 }
4670 4719
4671 /* Ok. In sequence. In window. */ 4720 /* Ok. In sequence. In window. */
4672queue_and_out: 4721queue_and_out:
4673 if (skb_queue_len(&sk->sk_receive_queue) == 0) 4722 if (skb_queue_len(&sk->sk_receive_queue) == 0)
4674 sk_forced_mem_schedule(sk, skb->truesize); 4723 sk_forced_mem_schedule(sk, skb->truesize);
4675 else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) 4724 else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
4725 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
4676 goto drop; 4726 goto drop;
4727 }
4677 4728
4678 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); 4729 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
4679 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4680 if (skb->len) 4730 if (skb->len)
4681 tcp_event_data_recv(sk, skb); 4731 tcp_event_data_recv(sk, skb);
4682 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 4732 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4685,11 +4735,11 @@ queue_and_out:
4685 if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { 4735 if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4686 tcp_ofo_queue(sk); 4736 tcp_ofo_queue(sk);
4687 4737
4688 /* RFC2581. 4.2. SHOULD send immediate ACK, when 4738 /* RFC5681. 4.2. SHOULD send immediate ACK, when
4689 * gap in queue is filled. 4739 * gap in queue is filled.
4690 */ 4740 */
4691 if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) 4741 if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
4692 inet_csk(sk)->icsk_ack.pingpong = 0; 4742 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
4693 } 4743 }
4694 4744
4695 if (tp->rx_opt.num_sacks) 4745 if (tp->rx_opt.num_sacks)
@@ -4732,8 +4782,10 @@ drop:
4732 /* If window is closed, drop tail of packet. But after 4782 /* If window is closed, drop tail of packet. But after
4733 * remembering D-SACK for its head made in previous line. 4783 * remembering D-SACK for its head made in previous line.
4734 */ 4784 */
4735 if (!tcp_receive_window(tp)) 4785 if (!tcp_receive_window(tp)) {
4786 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
4736 goto out_of_window; 4787 goto out_of_window;
4788 }
4737 goto queue_and_out; 4789 goto queue_and_out;
4738 } 4790 }
4739 4791
@@ -4851,6 +4903,9 @@ restart:
4851 break; 4903 break;
4852 4904
4853 memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); 4905 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
4906#ifdef CONFIG_TLS_DEVICE
4907 nskb->decrypted = skb->decrypted;
4908#endif
4854 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; 4909 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
4855 if (list) 4910 if (list)
4856 __skb_queue_before(list, skb, nskb); 4911 __skb_queue_before(list, skb, nskb);
@@ -4878,6 +4933,10 @@ restart:
4878 skb == tail || 4933 skb == tail ||
4879 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) 4934 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4880 goto end; 4935 goto end;
4936#ifdef CONFIG_TLS_DEVICE
4937 if (skb->decrypted != nskb->decrypted)
4938 goto end;
4939#endif
4881 } 4940 }
4882 } 4941 }
4883 } 4942 }
@@ -4892,6 +4951,7 @@ end:
4892static void tcp_collapse_ofo_queue(struct sock *sk) 4951static void tcp_collapse_ofo_queue(struct sock *sk)
4893{ 4952{
4894 struct tcp_sock *tp = tcp_sk(sk); 4953 struct tcp_sock *tp = tcp_sk(sk);
4954 u32 range_truesize, sum_tiny = 0;
4895 struct sk_buff *skb, *head; 4955 struct sk_buff *skb, *head;
4896 u32 start, end; 4956 u32 start, end;
4897 4957
@@ -4903,6 +4963,7 @@ new_range:
4903 } 4963 }
4904 start = TCP_SKB_CB(skb)->seq; 4964 start = TCP_SKB_CB(skb)->seq;
4905 end = TCP_SKB_CB(skb)->end_seq; 4965 end = TCP_SKB_CB(skb)->end_seq;
4966 range_truesize = skb->truesize;
4906 4967
4907 for (head = skb;;) { 4968 for (head = skb;;) {
4908 skb = skb_rb_next(skb); 4969 skb = skb_rb_next(skb);
@@ -4913,11 +4974,20 @@ new_range:
4913 if (!skb || 4974 if (!skb ||
4914 after(TCP_SKB_CB(skb)->seq, end) || 4975 after(TCP_SKB_CB(skb)->seq, end) ||
4915 before(TCP_SKB_CB(skb)->end_seq, start)) { 4976 before(TCP_SKB_CB(skb)->end_seq, start)) {
4916 tcp_collapse(sk, NULL, &tp->out_of_order_queue, 4977 /* Do not attempt collapsing tiny skbs */
4917 head, skb, start, end); 4978 if (range_truesize != head->truesize ||
4979 end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
4980 tcp_collapse(sk, NULL, &tp->out_of_order_queue,
4981 head, skb, start, end);
4982 } else {
4983 sum_tiny += range_truesize;
4984 if (sum_tiny > sk->sk_rcvbuf >> 3)
4985 return;
4986 }
4918 goto new_range; 4987 goto new_range;
4919 } 4988 }
4920 4989
4990 range_truesize += skb->truesize;
4921 if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) 4991 if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
4922 start = TCP_SKB_CB(skb)->seq; 4992 start = TCP_SKB_CB(skb)->seq;
4923 if (after(TCP_SKB_CB(skb)->end_seq, end)) 4993 if (after(TCP_SKB_CB(skb)->end_seq, end))
@@ -4932,6 +5002,7 @@ new_range:
4932 * 2) not add too big latencies if thousands of packets sit there. 5002 * 2) not add too big latencies if thousands of packets sit there.
4933 * (But if application shrinks SO_RCVBUF, we could still end up 5003 * (But if application shrinks SO_RCVBUF, we could still end up
4934 * freeing whole queue here) 5004 * freeing whole queue here)
5005 * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
4935 * 5006 *
4936 * Return true if queue has shrunk. 5007 * Return true if queue has shrunk.
4937 */ 5008 */
@@ -4939,20 +5010,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
4939{ 5010{
4940 struct tcp_sock *tp = tcp_sk(sk); 5011 struct tcp_sock *tp = tcp_sk(sk);
4941 struct rb_node *node, *prev; 5012 struct rb_node *node, *prev;
5013 int goal;
4942 5014
4943 if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) 5015 if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
4944 return false; 5016 return false;
4945 5017
4946 NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); 5018 NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
5019 goal = sk->sk_rcvbuf >> 3;
4947 node = &tp->ooo_last_skb->rbnode; 5020 node = &tp->ooo_last_skb->rbnode;
4948 do { 5021 do {
4949 prev = rb_prev(node); 5022 prev = rb_prev(node);
4950 rb_erase(node, &tp->out_of_order_queue); 5023 rb_erase(node, &tp->out_of_order_queue);
5024 goal -= rb_to_skb(node)->truesize;
4951 tcp_drop(sk, rb_to_skb(node)); 5025 tcp_drop(sk, rb_to_skb(node));
4952 sk_mem_reclaim(sk); 5026 if (!prev || goal <= 0) {
4953 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && 5027 sk_mem_reclaim(sk);
4954 !tcp_under_memory_pressure(sk)) 5028 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
4955 break; 5029 !tcp_under_memory_pressure(sk))
5030 break;
5031 goal = sk->sk_rcvbuf >> 3;
5032 }
4956 node = prev; 5033 node = prev;
4957 } while (node); 5034 } while (node);
4958 tp->ooo_last_skb = rb_to_skb(prev); 5035 tp->ooo_last_skb = rb_to_skb(prev);
@@ -4987,6 +5064,9 @@ static int tcp_prune_queue(struct sock *sk)
4987 else if (tcp_under_memory_pressure(sk)) 5064 else if (tcp_under_memory_pressure(sk))
4988 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); 5065 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
4989 5066
5067 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
5068 return 0;
5069
4990 tcp_collapse_ofo_queue(sk); 5070 tcp_collapse_ofo_queue(sk);
4991 if (!skb_queue_empty(&sk->sk_receive_queue)) 5071 if (!skb_queue_empty(&sk->sk_receive_queue))
4992 tcp_collapse(sk, &sk->sk_receive_queue, NULL, 5072 tcp_collapse(sk, &sk->sk_receive_queue, NULL,
@@ -5099,7 +5179,9 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
5099 (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || 5179 (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
5100 __tcp_select_window(sk) >= tp->rcv_wnd)) || 5180 __tcp_select_window(sk) >= tp->rcv_wnd)) ||
5101 /* We ACK each frame or... */ 5181 /* We ACK each frame or... */
5102 tcp_in_quickack_mode(sk)) { 5182 tcp_in_quickack_mode(sk) ||
5183 /* Protocol state mandates a one-time immediate ACK */
5184 inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
5103send_now: 5185send_now:
5104 tcp_send_ack(sk); 5186 tcp_send_ack(sk);
5105 return; 5187 return;
@@ -5475,6 +5557,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
5475 tcp_ack(sk, skb, 0); 5557 tcp_ack(sk, skb, 0);
5476 __kfree_skb(skb); 5558 __kfree_skb(skb);
5477 tcp_data_snd_check(sk); 5559 tcp_data_snd_check(sk);
5560 /* When receiving pure ack in fast path, update
5561 * last ts ecr directly instead of calling
5562 * tcp_rcv_rtt_measure_ts()
5563 */
5564 tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
5478 return; 5565 return;
5479 } else { /* Header too small */ 5566 } else { /* Header too small */
5480 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 5567 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
@@ -5576,6 +5663,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5576 if (skb) { 5663 if (skb) {
5577 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); 5664 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
5578 security_inet_conn_established(sk, skb); 5665 security_inet_conn_established(sk, skb);
5666 sk_mark_napi_id(sk, skb);
5579 } 5667 }
5580 5668
5581 tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); 5669 tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
@@ -6404,6 +6492,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6404 tcp_rsk(req)->snt_isn = isn; 6492 tcp_rsk(req)->snt_isn = isn;
6405 tcp_rsk(req)->txhash = net_tx_rndhash(); 6493 tcp_rsk(req)->txhash = net_tx_rndhash();
6406 tcp_openreq_init_rwin(req, sk, dst); 6494 tcp_openreq_init_rwin(req, sk, dst);
6495 sk_rx_queue_set(req_to_sk(req), skb);
6407 if (!want_cookie) { 6496 if (!want_cookie) {
6408 tcp_reqsk_record_syn(sk, req, skb); 6497 tcp_reqsk_record_syn(sk, req, skb);
6409 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); 6498 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
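Taken together, sk_mark_napi_id() in tcp_finish_connect() (active open) and sk_rx_queue_set() in tcp_conn_request() (passive open) record the NAPI context and RX queue of the very first packet, so busy polling and queue-affinity hints are usable from the start of the connection rather than after later traffic. Userspace can observe the recorded NAPI ID with SO_INCOMING_NAPI_ID; a hedged example, assuming a listener on 127.0.0.1:8080 exists (loopback has no NAPI context, so a nonzero value needs a real NIC):

#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef SO_INCOMING_NAPI_ID
#define SO_INCOMING_NAPI_ID 56
#endif

int main(void)
{
        struct sockaddr_in dst = { .sin_family = AF_INET,
                                   .sin_port = htons(8080) };
        unsigned int napi_id = 0;
        socklen_t len = sizeof(napi_id);
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);
        if (fd < 0 || connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
                perror("connect");
                return 1;
        }
        if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len) == 0)
                printf("incoming NAPI id: %u\n", napi_id);
        close(fd);
        return 0;
}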
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bea17f1e8302..44c09eddbb78 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -155,12 +155,26 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
155 and use initial timestamp retrieved from peer table. 155 and use initial timestamp retrieved from peer table.
156 */ 156 */
157 if (tcptw->tw_ts_recent_stamp && 157 if (tcptw->tw_ts_recent_stamp &&
158 (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { 158 (!twp || (reuse && time_after32(ktime_get_seconds(),
159 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; 159 tcptw->tw_ts_recent_stamp)))) {
160 if (tp->write_seq == 0) 160 /* In case of repair and re-using TIME-WAIT sockets we still
161 tp->write_seq = 1; 161 * want to be sure that it is safe as above but honor the
162 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 162 * sequence numbers and time stamps set as part of the repair
163 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 163 * process.
164 *
165 * Without this check re-using a TIME-WAIT socket with TCP
166 * repair would accumulate a -1 on the repair assigned
167 * sequence number. The first time it is reused the sequence
168 * is -1, the second time -2, etc. This fixes that issue
169 * without appearing to create any others.
170 */
171 if (likely(!tp->repair)) {
172 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
173 if (tp->write_seq == 0)
174 tp->write_seq = 1;
175 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
176 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
177 }
164 sock_hold(sktw); 178 sock_hold(sktw);
165 return 1; 179 return 1;
166 } 180 }
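The tw_ts_recent_stamp bookkeeping moves from get_seconds() to ktime_get_seconds() with a time_after32() comparison, so the reuse test stays correct when the stored 32-bit stamp wraps, and the repair case keeps its sequence numbers untouched as the new comment explains. Conceptually the comparison is the usual wraparound-safe trick of looking at the sign of the 32-bit difference; a standalone sketch of that idea, not the kernel macro itself:

#include <stdint.h>
#include <stdio.h>

static int after32(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) > 0;       /* "a is later than b", modulo 2^32 */
}

int main(void)
{
        uint32_t near_wrap = 0xfffffff0u;

        printf("%d\n", after32(near_wrap + 20, near_wrap)); /* 1: wrapped but later */
        printf("%d\n", after32(100, 200));                   /* 0: plainly earlier   */
        return 0;
}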
@@ -2503,6 +2517,12 @@ static int __net_init tcp_sk_init(struct net *net)
2503 if (res) 2517 if (res)
2504 goto fail; 2518 goto fail;
2505 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2519 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2520
2521 /* Please enforce IP_DF and IPID==0 for RST and
2522 * ACK sent in SYN-RECV and TIME-WAIT state.
2523 */
2524 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2525
2506 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2526 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2507 } 2527 }
2508 2528
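The added lines force strict PMTU discovery on the per-CPU control sockets, so the RSTs and the ACKs generated from SYN-RECV and TIME-WAIT go out with DF set and IP ID 0, as the comment requests. The same behaviour is available per socket from userspace through IP_MTU_DISCOVER; a small example:

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        int val = IP_PMTUDISC_DO;          /* always set DF, never fragment locally */

        if (fd < 0 || setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
                                 &val, sizeof(val)) < 0) {
                perror("IP_MTU_DISCOVER");
                return 1;
        }
        puts("DF will be set on packets sent from this socket");
        close(fd);
        return 0;
}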
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1dda1341a223..75ef332a7caf 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
144 tw->tw_substate = TCP_TIME_WAIT; 144 tw->tw_substate = TCP_TIME_WAIT;
145 tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; 145 tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
146 if (tmp_opt.saw_tstamp) { 146 if (tmp_opt.saw_tstamp) {
147 tcptw->tw_ts_recent_stamp = get_seconds(); 147 tcptw->tw_ts_recent_stamp = ktime_get_seconds();
148 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 148 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
149 } 149 }
150 150
@@ -189,7 +189,7 @@ kill:
189 189
190 if (tmp_opt.saw_tstamp) { 190 if (tmp_opt.saw_tstamp) {
191 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 191 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
192 tcptw->tw_ts_recent_stamp = get_seconds(); 192 tcptw->tw_ts_recent_stamp = ktime_get_seconds();
193 } 193 }
194 194
195 inet_twsk_put(tw); 195 inet_twsk_put(tw);
@@ -449,119 +449,122 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
449 struct sk_buff *skb) 449 struct sk_buff *skb)
450{ 450{
451 struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); 451 struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
452 const struct inet_request_sock *ireq = inet_rsk(req);
453 struct tcp_request_sock *treq = tcp_rsk(req);
454 struct inet_connection_sock *newicsk;
455 struct tcp_sock *oldtp, *newtp;
452 456
453 if (newsk) { 457 if (!newsk)
454 const struct inet_request_sock *ireq = inet_rsk(req); 458 return NULL;
455 struct tcp_request_sock *treq = tcp_rsk(req); 459
456 struct inet_connection_sock *newicsk = inet_csk(newsk); 460 newicsk = inet_csk(newsk);
457 struct tcp_sock *newtp = tcp_sk(newsk); 461 newtp = tcp_sk(newsk);
458 struct tcp_sock *oldtp = tcp_sk(sk); 462 oldtp = tcp_sk(sk);
459 463
460 smc_check_reset_syn_req(oldtp, req, newtp); 464 smc_check_reset_syn_req(oldtp, req, newtp);
461 465
462 /* Now setup tcp_sock */ 466 /* Now setup tcp_sock */
463 newtp->pred_flags = 0; 467 newtp->pred_flags = 0;
464 468
465 newtp->rcv_wup = newtp->copied_seq = 469 newtp->rcv_wup = newtp->copied_seq =
466 newtp->rcv_nxt = treq->rcv_isn + 1; 470 newtp->rcv_nxt = treq->rcv_isn + 1;
467 newtp->segs_in = 1; 471 newtp->segs_in = 1;
468 472
469 newtp->snd_sml = newtp->snd_una = 473 newtp->snd_sml = newtp->snd_una =
470 newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; 474 newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
471 475
472 INIT_LIST_HEAD(&newtp->tsq_node); 476 INIT_LIST_HEAD(&newtp->tsq_node);
473 INIT_LIST_HEAD(&newtp->tsorted_sent_queue); 477 INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
474 478
475 tcp_init_wl(newtp, treq->rcv_isn); 479 tcp_init_wl(newtp, treq->rcv_isn);
476 480
477 newtp->srtt_us = 0; 481 newtp->srtt_us = 0;
478 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); 482 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
479 minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); 483 minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
480 newicsk->icsk_rto = TCP_TIMEOUT_INIT; 484 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
481 newicsk->icsk_ack.lrcvtime = tcp_jiffies32; 485 newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
482 486
483 newtp->packets_out = 0; 487 newtp->packets_out = 0;
484 newtp->retrans_out = 0; 488 newtp->retrans_out = 0;
485 newtp->sacked_out = 0; 489 newtp->sacked_out = 0;
486 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 490 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
487 newtp->tlp_high_seq = 0; 491 newtp->tlp_high_seq = 0;
488 newtp->lsndtime = tcp_jiffies32; 492 newtp->lsndtime = tcp_jiffies32;
489 newsk->sk_txhash = treq->txhash; 493 newsk->sk_txhash = treq->txhash;
490 newtp->last_oow_ack_time = 0; 494 newtp->last_oow_ack_time = 0;
491 newtp->total_retrans = req->num_retrans; 495 newtp->total_retrans = req->num_retrans;
492 496
493 /* So many TCP implementations out there (incorrectly) count the 497 /* So many TCP implementations out there (incorrectly) count the
494 * initial SYN frame in their delayed-ACK and congestion control 498 * initial SYN frame in their delayed-ACK and congestion control
495 * algorithms that we must have the following bandaid to talk 499 * algorithms that we must have the following bandaid to talk
496 * efficiently to them. -DaveM 500 * efficiently to them. -DaveM
497 */ 501 */
498 newtp->snd_cwnd = TCP_INIT_CWND; 502 newtp->snd_cwnd = TCP_INIT_CWND;
499 newtp->snd_cwnd_cnt = 0; 503 newtp->snd_cwnd_cnt = 0;
500 504
501 /* There's a bubble in the pipe until at least the first ACK. */ 505 /* There's a bubble in the pipe until at least the first ACK. */
502 newtp->app_limited = ~0U; 506 newtp->app_limited = ~0U;
503 507
504 tcp_init_xmit_timers(newsk); 508 tcp_init_xmit_timers(newsk);
505 newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; 509 newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
506 510
507 newtp->rx_opt.saw_tstamp = 0; 511 newtp->rx_opt.saw_tstamp = 0;
508 512
509 newtp->rx_opt.dsack = 0; 513 newtp->rx_opt.dsack = 0;
510 newtp->rx_opt.num_sacks = 0; 514 newtp->rx_opt.num_sacks = 0;
511 515
512 newtp->urg_data = 0; 516 newtp->urg_data = 0;
513 517
514 if (sock_flag(newsk, SOCK_KEEPOPEN)) 518 if (sock_flag(newsk, SOCK_KEEPOPEN))
515 inet_csk_reset_keepalive_timer(newsk, 519 inet_csk_reset_keepalive_timer(newsk,
516 keepalive_time_when(newtp)); 520 keepalive_time_when(newtp));
517 521
518 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; 522 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
519 newtp->rx_opt.sack_ok = ireq->sack_ok; 523 newtp->rx_opt.sack_ok = ireq->sack_ok;
520 newtp->window_clamp = req->rsk_window_clamp; 524 newtp->window_clamp = req->rsk_window_clamp;
521 newtp->rcv_ssthresh = req->rsk_rcv_wnd; 525 newtp->rcv_ssthresh = req->rsk_rcv_wnd;
522 newtp->rcv_wnd = req->rsk_rcv_wnd; 526 newtp->rcv_wnd = req->rsk_rcv_wnd;
523 newtp->rx_opt.wscale_ok = ireq->wscale_ok; 527 newtp->rx_opt.wscale_ok = ireq->wscale_ok;
524 if (newtp->rx_opt.wscale_ok) { 528 if (newtp->rx_opt.wscale_ok) {
525 newtp->rx_opt.snd_wscale = ireq->snd_wscale; 529 newtp->rx_opt.snd_wscale = ireq->snd_wscale;
526 newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; 530 newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
527 } else { 531 } else {
528 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; 532 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
529 newtp->window_clamp = min(newtp->window_clamp, 65535U); 533 newtp->window_clamp = min(newtp->window_clamp, 65535U);
530 } 534 }
531 newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << 535 newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
532 newtp->rx_opt.snd_wscale); 536 newtp->max_window = newtp->snd_wnd;
533 newtp->max_window = newtp->snd_wnd; 537
534 538 if (newtp->rx_opt.tstamp_ok) {
535 if (newtp->rx_opt.tstamp_ok) { 539 newtp->rx_opt.ts_recent = req->ts_recent;
536 newtp->rx_opt.ts_recent = req->ts_recent; 540 newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
537 newtp->rx_opt.ts_recent_stamp = get_seconds(); 541 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
538 newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 542 } else {
539 } else { 543 newtp->rx_opt.ts_recent_stamp = 0;
540 newtp->rx_opt.ts_recent_stamp = 0; 544 newtp->tcp_header_len = sizeof(struct tcphdr);
541 newtp->tcp_header_len = sizeof(struct tcphdr); 545 }
542 } 546 newtp->tsoffset = treq->ts_off;
543 newtp->tsoffset = treq->ts_off;
544#ifdef CONFIG_TCP_MD5SIG 547#ifdef CONFIG_TCP_MD5SIG
545 newtp->md5sig_info = NULL; /*XXX*/ 548 newtp->md5sig_info = NULL; /*XXX*/
546 if (newtp->af_specific->md5_lookup(sk, newsk)) 549 if (newtp->af_specific->md5_lookup(sk, newsk))
547 newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; 550 newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
548#endif 551#endif
549 if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) 552 if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
550 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 553 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
551 newtp->rx_opt.mss_clamp = req->mss; 554 newtp->rx_opt.mss_clamp = req->mss;
552 tcp_ecn_openreq_child(newtp, req); 555 tcp_ecn_openreq_child(newtp, req);
553 newtp->fastopen_req = NULL; 556 newtp->fastopen_req = NULL;
554 newtp->fastopen_rsk = NULL; 557 newtp->fastopen_rsk = NULL;
555 newtp->syn_data_acked = 0; 558 newtp->syn_data_acked = 0;
556 newtp->rack.mstamp = 0; 559 newtp->rack.mstamp = 0;
557 newtp->rack.advanced = 0; 560 newtp->rack.advanced = 0;
558 newtp->rack.reo_wnd_steps = 1; 561 newtp->rack.reo_wnd_steps = 1;
559 newtp->rack.last_delivered = 0; 562 newtp->rack.last_delivered = 0;
560 newtp->rack.reo_wnd_persist = 0; 563 newtp->rack.reo_wnd_persist = 0;
561 newtp->rack.dsack_seen = 0; 564 newtp->rack.dsack_seen = 0;
562 565
563 __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); 566 __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
564 } 567
565 return newsk; 568 return newsk;
566} 569}
567EXPORT_SYMBOL(tcp_create_openreq_child); 570EXPORT_SYMBOL(tcp_create_openreq_child);
@@ -600,7 +603,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
600 * it can be estimated (approximately) 603 * it can be estimated (approximately)
601 * from another data. 604 * from another data.
602 */ 605 */
603 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); 606 tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
604 paws_reject = tcp_paws_reject(&tmp_opt, th->rst); 607 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
605 } 608 }
606 } 609 }
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 8cc7c3487330..870b0a335061 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -180,9 +180,9 @@ out:
180 return segs; 180 return segs;
181} 181}
182 182
183struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) 183struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
184{ 184{
185 struct sk_buff **pp = NULL; 185 struct sk_buff *pp = NULL;
186 struct sk_buff *p; 186 struct sk_buff *p;
187 struct tcphdr *th; 187 struct tcphdr *th;
188 struct tcphdr *th2; 188 struct tcphdr *th2;
@@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
220 len = skb_gro_len(skb); 220 len = skb_gro_len(skb);
221 flags = tcp_flag_word(th); 221 flags = tcp_flag_word(th);
222 222
223 for (; (p = *head); head = &p->next) { 223 list_for_each_entry(p, head, list) {
224 if (!NAPI_GRO_CB(p)->same_flow) 224 if (!NAPI_GRO_CB(p)->same_flow)
225 continue; 225 continue;
226 226
@@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
233 233
234 goto found; 234 goto found;
235 } 235 }
236 236 p = NULL;
237 goto out_check_final; 237 goto out_check_final;
238 238
239found: 239found:
@@ -262,8 +262,11 @@ found:
262 262
263 flush |= (len - 1) >= mss; 263 flush |= (len - 1) >= mss;
264 flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); 264 flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
265#ifdef CONFIG_TLS_DEVICE
266 flush |= p->decrypted ^ skb->decrypted;
267#endif
265 268
266 if (flush || skb_gro_receive(head, skb)) { 269 if (flush || skb_gro_receive(p, skb)) {
267 mss = 1; 270 mss = 1;
268 goto out_check_final; 271 goto out_check_final;
269 } 272 }
@@ -277,7 +280,7 @@ out_check_final:
277 TCP_FLAG_FIN)); 280 TCP_FLAG_FIN));
278 281
279 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) 282 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
280 pp = head; 283 pp = p;
281 284
282out: 285out:
283 NAPI_GRO_CB(skb)->flush |= (flush != 0); 286 NAPI_GRO_CB(skb)->flush |= (flush != 0);
@@ -302,7 +305,7 @@ int tcp_gro_complete(struct sk_buff *skb)
302} 305}
303EXPORT_SYMBOL(tcp_gro_complete); 306EXPORT_SYMBOL(tcp_gro_complete);
304 307
305static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) 308static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
306{ 309{
307 /* Don't bother verifying checksum if we're going to flush anyway. */ 310 /* Don't bother verifying checksum if we're going to flush anyway. */
308 if (!NAPI_GRO_CB(skb)->flush && 311 if (!NAPI_GRO_CB(skb)->flush &&
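This is part of the tree-wide GRO conversion in this release: receive handlers now take a struct list_head of held packets instead of a struct sk_buff ** chain, the flow table is walked with list_for_each_entry(), the merge target is passed as the matched skb (skb_gro_receive(p, skb)), and the function returns that skb rather than a pointer into the old chain. A self-contained userspace sketch of an intrusive-list walk in that style; the list helpers are re-implemented here and the packet fields are invented for illustration:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
#define list_for_each_entry(pos, head, member)                          \
        for (pos = container_of((head)->next, typeof(*pos), member);    \
             &pos->member != (head);                                    \
             pos = container_of(pos->member.next, typeof(*pos), member))

struct pkt { unsigned int flow_hash; struct list_head list; };

static void list_add_tail(struct list_head *new, struct list_head *head)
{
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
}

int main(void)
{
        struct list_head gro_list = LIST_HEAD_INIT(gro_list);
        struct pkt a = { .flow_hash = 1 }, b = { .flow_hash = 2 };
        struct pkt *p, *match = NULL;

        list_add_tail(&a.list, &gro_list);
        list_add_tail(&b.list, &gro_list);

        list_for_each_entry(p, &gro_list, list)
                if (p->flow_hash == 2)
                        match = p;          /* like "pp = p" in the hunk above */

        printf("matched flow %u\n", match ? match->flow_hash : 0);
        return 0;
}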
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8e08b409c71e..597dbd749f05 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -160,7 +160,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
160} 160}
161 161
162/* Account for an ACK we sent. */ 162/* Account for an ACK we sent. */
163static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) 163static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
164 u32 rcv_nxt)
164{ 165{
165 struct tcp_sock *tp = tcp_sk(sk); 166 struct tcp_sock *tp = tcp_sk(sk);
166 167
@@ -171,6 +172,9 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
171 if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) 172 if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
172 __sock_put(sk); 173 __sock_put(sk);
173 } 174 }
175
176 if (unlikely(rcv_nxt != tp->rcv_nxt))
177 return; /* Special ACK sent by DCTCP to reflect ECN */
174 tcp_dec_quickack_mode(sk, pkts); 178 tcp_dec_quickack_mode(sk, pkts);
175 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 179 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
176} 180}
@@ -973,17 +977,6 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
973 return HRTIMER_NORESTART; 977 return HRTIMER_NORESTART;
974} 978}
975 979
976/* BBR congestion control needs pacing.
977 * Same remark for SO_MAX_PACING_RATE.
978 * sch_fq packet scheduler is efficiently handling pacing,
979 * but is not always installed/used.
980 * Return true if TCP stack should pace packets itself.
981 */
982static bool tcp_needs_internal_pacing(const struct sock *sk)
983{
984 return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
985}
986
987static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) 980static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
988{ 981{
989 u64 len_ns; 982 u64 len_ns;
@@ -995,9 +988,6 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
995 if (!rate || rate == ~0U) 988 if (!rate || rate == ~0U)
996 return; 989 return;
997 990
998 /* Should account for header sizes as sch_fq does,
999 * but lets make things simple.
1000 */
1001 len_ns = (u64)skb->len * NSEC_PER_SEC; 991 len_ns = (u64)skb->len * NSEC_PER_SEC;
1002 do_div(len_ns, rate); 992 do_div(len_ns, rate);
1003 hrtimer_start(&tcp_sk(sk)->pacing_timer, 993 hrtimer_start(&tcp_sk(sk)->pacing_timer,
@@ -1023,8 +1013,8 @@ static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
1023 * We are working here with either a clone of the original 1013 * We are working here with either a clone of the original
1024 * SKB, or a fresh unique copy made by the retransmit engine. 1014 * SKB, or a fresh unique copy made by the retransmit engine.
1025 */ 1015 */
1026static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, 1016static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
1027 gfp_t gfp_mask) 1017 int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
1028{ 1018{
1029 const struct inet_connection_sock *icsk = inet_csk(sk); 1019 const struct inet_connection_sock *icsk = inet_csk(sk);
1030 struct inet_sock *inet; 1020 struct inet_sock *inet;
@@ -1100,7 +1090,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1100 th->source = inet->inet_sport; 1090 th->source = inet->inet_sport;
1101 th->dest = inet->inet_dport; 1091 th->dest = inet->inet_dport;
1102 th->seq = htonl(tcb->seq); 1092 th->seq = htonl(tcb->seq);
1103 th->ack_seq = htonl(tp->rcv_nxt); 1093 th->ack_seq = htonl(rcv_nxt);
1104 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | 1094 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
1105 tcb->tcp_flags); 1095 tcb->tcp_flags);
1106 1096
@@ -1141,11 +1131,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1141 icsk->icsk_af_ops->send_check(sk, skb); 1131 icsk->icsk_af_ops->send_check(sk, skb);
1142 1132
1143 if (likely(tcb->tcp_flags & TCPHDR_ACK)) 1133 if (likely(tcb->tcp_flags & TCPHDR_ACK))
1144 tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); 1134 tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
1145 1135
1146 if (skb->len != tcp_header_size) { 1136 if (skb->len != tcp_header_size) {
1147 tcp_event_data_sent(tp, sk); 1137 tcp_event_data_sent(tp, sk);
1148 tp->data_segs_out += tcp_skb_pcount(skb); 1138 tp->data_segs_out += tcp_skb_pcount(skb);
1139 tp->bytes_sent += skb->len - tcp_header_size;
1149 tcp_internal_pacing(sk, skb); 1140 tcp_internal_pacing(sk, skb);
1150 } 1141 }
1151 1142
@@ -1178,6 +1169,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1178 return err; 1169 return err;
1179} 1170}
1180 1171
1172static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1173 gfp_t gfp_mask)
1174{
1175 return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
1176 tcp_sk(sk)->rcv_nxt);
1177}
1178
1181/* This routine just queues the buffer for sending. 1179/* This routine just queues the buffer for sending.
1182 * 1180 *
1183 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, 1181 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
@@ -2700,9 +2698,8 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2700{ 2698{
2701 struct tcp_sock *tp = tcp_sk(sk); 2699 struct tcp_sock *tp = tcp_sk(sk);
2702 struct sk_buff *next_skb = skb_rb_next(skb); 2700 struct sk_buff *next_skb = skb_rb_next(skb);
2703 int skb_size, next_skb_size; 2701 int next_skb_size;
2704 2702
2705 skb_size = skb->len;
2706 next_skb_size = next_skb->len; 2703 next_skb_size = next_skb->len;
2707 2704
2708 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); 2705 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
@@ -2873,6 +2870,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2873 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) 2870 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2874 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); 2871 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
2875 tp->total_retrans += segs; 2872 tp->total_retrans += segs;
2873 tp->bytes_retrans += skb->len;
2876 2874
2877 /* make sure skb->data is aligned on arches that require it 2875 /* make sure skb->data is aligned on arches that require it
2878 * and check if ack-trimming & collapsing extended the headroom 2876 * and check if ack-trimming & collapsing extended the headroom
@@ -3523,8 +3521,6 @@ void tcp_send_delayed_ack(struct sock *sk)
3523 int ato = icsk->icsk_ack.ato; 3521 int ato = icsk->icsk_ack.ato;
3524 unsigned long timeout; 3522 unsigned long timeout;
3525 3523
3526 tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
3527
3528 if (ato > TCP_DELACK_MIN) { 3524 if (ato > TCP_DELACK_MIN) {
3529 const struct tcp_sock *tp = tcp_sk(sk); 3525 const struct tcp_sock *tp = tcp_sk(sk);
3530 int max_ato = HZ / 2; 3526 int max_ato = HZ / 2;
@@ -3573,7 +3569,7 @@ void tcp_send_delayed_ack(struct sock *sk)
3573} 3569}
3574 3570
3575/* This routine sends an ack and also updates the window. */ 3571/* This routine sends an ack and also updates the window. */
3576void tcp_send_ack(struct sock *sk) 3572void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
3577{ 3573{
3578 struct sk_buff *buff; 3574 struct sk_buff *buff;
3579 3575
@@ -3581,8 +3577,6 @@ void tcp_send_ack(struct sock *sk)
3581 if (sk->sk_state == TCP_CLOSE) 3577 if (sk->sk_state == TCP_CLOSE)
3582 return; 3578 return;
3583 3579
3584 tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
3585
3586 /* We are not putting this on the write queue, so 3580 /* We are not putting this on the write queue, so
3587 * tcp_transmit_skb() will set the ownership to this 3581 * tcp_transmit_skb() will set the ownership to this
3588 * sock. 3582 * sock.
@@ -3608,9 +3602,14 @@ void tcp_send_ack(struct sock *sk)
3608 skb_set_tcp_pure_ack(buff); 3602 skb_set_tcp_pure_ack(buff);
3609 3603
3610 /* Send it off, this clears delayed acks for us. */ 3604 /* Send it off, this clears delayed acks for us. */
3611 tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0); 3605 __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
3606}
3607EXPORT_SYMBOL_GPL(__tcp_send_ack);
3608
3609void tcp_send_ack(struct sock *sk)
3610{
3611 __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
3612} 3612}
3613EXPORT_SYMBOL_GPL(tcp_send_ack);
3614 3613
3615/* This routine sends a packet with an out of date sequence 3614/* This routine sends a packet with an out of date sequence
3616 * number. It assumes the other end will try to ack it. 3615 * number. It assumes the other end will try to ack it.
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index c61240e43923..4dff40dad4dc 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -146,6 +146,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
146 rs->prior_mstamp); /* ack phase */ 146 rs->prior_mstamp); /* ack phase */
147 rs->interval_us = max(snd_us, ack_us); 147 rs->interval_us = max(snd_us, ack_us);
148 148
149 /* Record both segment send and ack receive intervals */
150 rs->snd_interval_us = snd_us;
151 rs->rcv_interval_us = ack_us;
152
149 /* Normally we expect interval_us >= min-rtt. 153 /* Normally we expect interval_us >= min-rtt.
150 * Note that rate may still be over-estimated when a spuriously 154 * Note that rate may still be over-estimated when a spuriously
151 * retransmistted skb was first (s)acked because "interval_us" 155 * retransmistted skb was first (s)acked because "interval_us"
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 71593e4400ab..c81aadff769b 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -25,7 +25,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
25{ 25{
26 struct tcp_sock *tp = tcp_sk(sk); 26 struct tcp_sock *tp = tcp_sk(sk);
27 27
28 if (!tp->rack.reord) { 28 if (!tp->reord_seen) {
29 /* If reordering has not been observed, be aggressive during 29 /* If reordering has not been observed, be aggressive during
30 * the recovery or starting the recovery by DUPACK threshold. 30 * the recovery or starting the recovery by DUPACK threshold.
31 */ 31 */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3b3611729928..7fdf222a0bdf 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,35 @@
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <net/tcp.h> 23#include <net/tcp.h>
24 24
25static u32 tcp_retransmit_stamp(const struct sock *sk)
26{
27 u32 start_ts = tcp_sk(sk)->retrans_stamp;
28
29 if (unlikely(!start_ts)) {
30 struct sk_buff *head = tcp_rtx_queue_head(sk);
31
32 if (!head)
33 return 0;
34 start_ts = tcp_skb_timestamp(head);
35 }
36 return start_ts;
37}
38
39static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
40{
41 struct inet_connection_sock *icsk = inet_csk(sk);
42 u32 elapsed, start_ts;
43
44 start_ts = tcp_retransmit_stamp(sk);
45 if (!icsk->icsk_user_timeout || !start_ts)
46 return icsk->icsk_rto;
47 elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
48 if (elapsed >= icsk->icsk_user_timeout)
49 return 1; /* user timeout has passed; fire ASAP */
50 else
51 return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed));
52}
53
25/** 54/**
26 * tcp_write_err() - close socket and save error info 55 * tcp_write_err() - close socket and save error info
27 * @sk: The socket the error has appeared on. 56 * @sk: The socket the error has appeared on.
@@ -166,14 +195,9 @@ static bool retransmits_timed_out(struct sock *sk,
166 if (!inet_csk(sk)->icsk_retransmits) 195 if (!inet_csk(sk)->icsk_retransmits)
167 return false; 196 return false;
168 197
169 start_ts = tcp_sk(sk)->retrans_stamp; 198 start_ts = tcp_retransmit_stamp(sk);
170 if (unlikely(!start_ts)) { 199 if (!start_ts)
171 struct sk_buff *head = tcp_rtx_queue_head(sk); 200 return false;
172
173 if (!head)
174 return false;
175 start_ts = tcp_skb_timestamp(head);
176 }
177 201
178 if (likely(timeout == 0)) { 202 if (likely(timeout == 0)) {
179 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); 203 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -183,8 +207,9 @@ static bool retransmits_timed_out(struct sock *sk,
183 else 207 else
184 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + 208 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
185 (boundary - linear_backoff_thresh) * TCP_RTO_MAX; 209 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
210 timeout = jiffies_to_msecs(timeout);
186 } 211 }
187 return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout); 212 return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout;
188} 213}
189 214
190/* A write timeout has occurred. Process the after effects. */ 215/* A write timeout has occurred. Process the after effects. */
@@ -337,8 +362,7 @@ static void tcp_probe_timer(struct sock *sk)
337 if (!start_ts) 362 if (!start_ts)
338 skb->skb_mstamp = tp->tcp_mstamp; 363 skb->skb_mstamp = tp->tcp_mstamp;
339 else if (icsk->icsk_user_timeout && 364 else if (icsk->icsk_user_timeout &&
340 (s32)(tcp_time_stamp(tp) - start_ts) > 365 (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
341 jiffies_to_msecs(icsk->icsk_user_timeout))
342 goto abort; 366 goto abort;
343 367
344 max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; 368 max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
@@ -535,7 +559,8 @@ out_reset_timer:
535 /* Use normal (exponential) backoff */ 559 /* Use normal (exponential) backoff */
536 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 560 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
537 } 561 }
538 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 562 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
563 tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
539 if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) 564 if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
540 __sk_dst_reset(sk); 565 __sk_dst_reset(sk);
541 566
@@ -672,7 +697,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
672 * to determine when to timeout instead. 697 * to determine when to timeout instead.
673 */ 698 */
674 if ((icsk->icsk_user_timeout != 0 && 699 if ((icsk->icsk_user_timeout != 0 &&
675 elapsed >= icsk->icsk_user_timeout && 700 elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
676 icsk->icsk_probes_out > 0) || 701 icsk->icsk_probes_out > 0) ||
677 (icsk->icsk_user_timeout == 0 && 702 (icsk->icsk_user_timeout == 0 &&
678 icsk->icsk_probes_out >= keepalive_probes(tp))) { 703 icsk->icsk_probes_out >= keepalive_probes(tp))) {
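The tcp_timer.c changes make TCP_USER_TIMEOUT behave consistently as a value in milliseconds: the keepalive path converts it with msecs_to_jiffies(), the zero-window probe path compares it directly against a millisecond timestamp delta, and the new tcp_clamp_rto_to_user_timeout() caps the next RTO so the connection aborts close to the configured deadline instead of overshooting by one more backed-off RTO. From userspace it is the familiar socket option:

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        unsigned int timeout_ms = 10000;   /* give up after ~10s without ACKed data */

        if (fd < 0 || setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
                                 &timeout_ms, sizeof(timeout_ms)) < 0) {
                perror("TCP_USER_TIMEOUT");
                return 1;
        }
        puts("user timeout armed; retransmissions abort the connection after ~10s");
        close(fd);
        return 0;
}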
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 622caa4039e0..a5995bb2eaca 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -51,7 +51,7 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
51#ifdef CONFIG_MODULES 51#ifdef CONFIG_MODULES
52 if (!ulp && capable(CAP_NET_ADMIN)) { 52 if (!ulp && capable(CAP_NET_ADMIN)) {
53 rcu_read_unlock(); 53 rcu_read_unlock();
54 request_module("%s", name); 54 request_module("tcp-ulp-%s", name);
55 rcu_read_lock(); 55 rcu_read_lock();
56 ulp = tcp_ulp_find(name); 56 ulp = tcp_ulp_find(name);
57 } 57 }
@@ -129,6 +129,8 @@ void tcp_cleanup_ulp(struct sock *sk)
129 if (icsk->icsk_ulp_ops->release) 129 if (icsk->icsk_ulp_ops->release)
130 icsk->icsk_ulp_ops->release(sk); 130 icsk->icsk_ulp_ops->release(sk);
131 module_put(icsk->icsk_ulp_ops->owner); 131 module_put(icsk->icsk_ulp_ops->owner);
132
133 icsk->icsk_ulp_ops = NULL;
132} 134}
133 135
134/* Change upper layer protocol for socket */ 136/* Change upper layer protocol for socket */
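Prefixing the autoload request with "tcp-ulp-" means that selecting an upper-layer protocol by name can only pull in modules that declare a matching alias, not an arbitrary module that happens to share the name; tcp_cleanup_ulp() also now clears icsk_ulp_ops so a ULP cannot be released twice. From userspace the ULP is still chosen with setsockopt(TCP_ULP), for example for kernel TLS; a hedged sketch (a real connect() must succeed first, so the call below is expected to fail):

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCP_ULP
#define TCP_ULP 31
#endif

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        /* ... connect(fd, ...) must be established before attaching a ULP ... */
        if (setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", strlen("tls")) < 0)
                perror("TCP_ULP");         /* expected here: socket not connected */
        return 0;
}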
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9bb27df4dac5..f4e35b2ff8b8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
221 (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 221 (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
222 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 222 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
223 inet_rcv_saddr_equal(sk, sk2, false)) { 223 inet_rcv_saddr_equal(sk, sk2, false)) {
224 return reuseport_add_sock(sk, sk2); 224 return reuseport_add_sock(sk, sk2,
225 inet_rcv_saddr_any(sk));
225 } 226 }
226 } 227 }
227 228
228 return reuseport_alloc(sk); 229 return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
229} 230}
230 231
231/** 232/**
@@ -498,6 +499,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
498 daddr, hnum, dif, sdif, 499 daddr, hnum, dif, sdif,
499 exact_dif, hslot2, skb); 500 exact_dif, hslot2, skb);
500 } 501 }
502 if (unlikely(IS_ERR(result)))
503 return NULL;
501 return result; 504 return result;
502 } 505 }
503begin: 506begin:
@@ -512,6 +515,8 @@ begin:
512 saddr, sport); 515 saddr, sport);
513 result = reuseport_select_sock(sk, hash, skb, 516 result = reuseport_select_sock(sk, hash, skb,
514 sizeof(struct udphdr)); 517 sizeof(struct udphdr));
518 if (unlikely(IS_ERR(result)))
519 return NULL;
515 if (result) 520 if (result)
516 return result; 521 return result;
517 } 522 }
@@ -926,11 +931,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
926 if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ 931 if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
927 return -EOPNOTSUPP; 932 return -EOPNOTSUPP;
928 933
929 ipc.opt = NULL;
930 ipc.tx_flags = 0;
931 ipc.ttl = 0;
932 ipc.tos = -1;
933
934 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; 934 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
935 935
936 fl4 = &inet->cork.fl.u.ip4; 936 fl4 = &inet->cork.fl.u.ip4;
@@ -977,9 +977,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
977 connected = 1; 977 connected = 1;
978 } 978 }
979 979
980 ipc.sockc.tsflags = sk->sk_tsflags; 980 ipcm_init_sk(&ipc, inet);
981 ipc.addr = inet->inet_saddr;
982 ipc.oif = sk->sk_bound_dev_if;
983 ipc.gso_size = up->gso_size; 981 ipc.gso_size = up->gso_size;
984 982
985 if (msg->msg_controllen) { 983 if (msg->msg_controllen) {
@@ -1027,8 +1025,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1027 saddr = ipc.addr; 1025 saddr = ipc.addr;
1028 ipc.addr = faddr = daddr; 1026 ipc.addr = faddr = daddr;
1029 1027
1030 sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
1031
1032 if (ipc.opt && ipc.opt->opt.srr) { 1028 if (ipc.opt && ipc.opt->opt.srr) {
1033 if (!daddr) { 1029 if (!daddr) {
1034 err = -EINVAL; 1030 err = -EINVAL;
@@ -2591,7 +2587,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,
2591 * udp_poll - wait for a UDP event. 2587 * udp_poll - wait for a UDP event.
2592 * @file - file struct 2588 * @file - file struct
2593 * @sock - socket 2589 * @sock - socket
2594 * @events - events to wait for 2590 * @wait - poll table
2595 * 2591 *
2596 * This is same as datagram poll, except for the special case of 2592 * This is same as datagram poll, except for the special case of
2597 * blocking sockets. If application is using a blocking fd 2593 * blocking sockets. If application is using a blocking fd
@@ -2600,23 +2596,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,
2600 * but then block when reading it. Add special case code 2596 * but then block when reading it. Add special case code
2601 * to work around these arguably broken applications. 2597 * to work around these arguably broken applications.
2602 */ 2598 */
2603__poll_t udp_poll_mask(struct socket *sock, __poll_t events) 2599__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
2604{ 2600{
2605 __poll_t mask = datagram_poll_mask(sock, events); 2601 __poll_t mask = datagram_poll(file, sock, wait);
2606 struct sock *sk = sock->sk; 2602 struct sock *sk = sock->sk;
2607 2603
2608 if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) 2604 if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
2609 mask |= EPOLLIN | EPOLLRDNORM; 2605 mask |= EPOLLIN | EPOLLRDNORM;
2610 2606
2611 /* Check for false positives due to checksum errors */ 2607 /* Check for false positives due to checksum errors */
2612 if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) && 2608 if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
2613 !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) 2609 !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
2614 mask &= ~(EPOLLIN | EPOLLRDNORM); 2610 mask &= ~(EPOLLIN | EPOLLRDNORM);
2615 2611
2616 return mask; 2612 return mask;
2617 2613
2618} 2614}
2619EXPORT_SYMBOL(udp_poll_mask); 2615EXPORT_SYMBOL(udp_poll);
2620 2616
2621int udp_abort(struct sock *sk, int err) 2617int udp_abort(struct sock *sk, int err)
2622{ 2618{
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 92dc9e5a7ff3..0c0522b79b43 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -343,10 +343,11 @@ out:
343 return segs; 343 return segs;
344} 344}
345 345
346struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, 346struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
347 struct udphdr *uh, udp_lookup_t lookup) 347 struct udphdr *uh, udp_lookup_t lookup)
348{ 348{
349 struct sk_buff *p, **pp = NULL; 349 struct sk_buff *pp = NULL;
350 struct sk_buff *p;
350 struct udphdr *uh2; 351 struct udphdr *uh2;
351 unsigned int off = skb_gro_offset(skb); 352 unsigned int off = skb_gro_offset(skb);
352 int flush = 1; 353 int flush = 1;
@@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
371unflush: 372unflush:
372 flush = 0; 373 flush = 0;
373 374
374 for (p = *head; p; p = p->next) { 375 list_for_each_entry(p, head, list) {
375 if (!NAPI_GRO_CB(p)->same_flow) 376 if (!NAPI_GRO_CB(p)->same_flow)
376 continue; 377 continue;
377 378
@@ -394,13 +395,13 @@ unflush:
394out_unlock: 395out_unlock:
395 rcu_read_unlock(); 396 rcu_read_unlock();
396out: 397out:
397 NAPI_GRO_CB(skb)->flush |= flush; 398 skb_gro_flush_final(skb, pp, flush);
398 return pp; 399 return pp;
399} 400}
400EXPORT_SYMBOL(udp_gro_receive); 401EXPORT_SYMBOL(udp_gro_receive);
401 402
402static struct sk_buff **udp4_gro_receive(struct sk_buff **head, 403static struct sk_buff *udp4_gro_receive(struct list_head *head,
403 struct sk_buff *skb) 404 struct sk_buff *skb)
404{ 405{
405 struct udphdr *uh = udp_gro_udphdr(skb); 406 struct udphdr *uh = udp_gro_udphdr(skb);
406 407
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 0eff75525da1..613282c65a10 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -15,7 +15,7 @@ menuconfig IPV6
15 Documentation/networking/ipv6.txt and read the HOWTO at 15 Documentation/networking/ipv6.txt and read the HOWTO at
16 <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/> 16 <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/>
17 17
18 To compile this protocol support as a module, choose M here: the 18 To compile this protocol support as a module, choose M here: the
19 module will be called ipv6. 19 module will be called ipv6.
20 20
21if IPV6 21if IPV6
@@ -108,6 +108,7 @@ config IPV6_MIP6
108config IPV6_ILA 108config IPV6_ILA
109 tristate "IPv6: Identifier Locator Addressing (ILA)" 109 tristate "IPv6: Identifier Locator Addressing (ILA)"
110 depends on NETFILTER 110 depends on NETFILTER
111 select DST_CACHE
111 select LWTUNNEL 112 select LWTUNNEL
112 ---help--- 113 ---help---
113 Support for IPv6 Identifier Locator Addressing (ILA). 114 Support for IPv6 Identifier Locator Addressing (ILA).
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c134286d6a41..d51a8c0b3372 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -385,8 +385,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
385 385
386 if (ndev->cnf.stable_secret.initialized) 386 if (ndev->cnf.stable_secret.initialized)
387 ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; 387 ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
388 else
389 ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode;
390 388
391 ndev->cnf.mtu6 = dev->mtu; 389 ndev->cnf.mtu6 = dev->mtu;
392 ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); 390 ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -2374,7 +2372,8 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
2374 continue; 2372 continue;
2375 if ((rt->fib6_flags & noflags) != 0) 2373 if ((rt->fib6_flags & noflags) != 0)
2376 continue; 2374 continue;
2377 fib6_info_hold(rt); 2375 if (!fib6_info_hold_safe(rt))
2376 continue;
2378 break; 2377 break;
2379 } 2378 }
2380out: 2379out:
@@ -2399,7 +2398,7 @@ static void addrconf_add_mroute(struct net_device *dev)
2399 2398
2400 ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); 2399 ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
2401 2400
2402 ip6_route_add(&cfg, GFP_ATOMIC, NULL); 2401 ip6_route_add(&cfg, GFP_KERNEL, NULL);
2403} 2402}
2404 2403
2405static struct inet6_dev *addrconf_add_dev(struct net_device *dev) 2404static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
@@ -3063,7 +3062,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
3063 if (addr.s6_addr32[3]) { 3062 if (addr.s6_addr32[3]) {
3064 add_addr(idev, &addr, plen, scope); 3063 add_addr(idev, &addr, plen, scope);
3065 addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags, 3064 addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
3066 GFP_ATOMIC); 3065 GFP_KERNEL);
3067 return; 3066 return;
3068 } 3067 }
3069 3068
@@ -3088,7 +3087,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
3088 3087
3089 add_addr(idev, &addr, plen, flag); 3088 add_addr(idev, &addr, plen, flag);
3090 addrconf_prefix_route(&addr, plen, 0, idev->dev, 3089 addrconf_prefix_route(&addr, plen, 0, idev->dev,
3091 0, pflags, GFP_ATOMIC); 3090 0, pflags, GFP_KERNEL);
3092 } 3091 }
3093 } 3092 }
3094 } 3093 }
@@ -4528,6 +4527,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
4528 unsigned long expires, u32 flags) 4527 unsigned long expires, u32 flags)
4529{ 4528{
4530 struct fib6_info *f6i; 4529 struct fib6_info *f6i;
4530 u32 prio;
4531 4531
4532 f6i = addrconf_get_prefix_route(&ifp->addr, 4532 f6i = addrconf_get_prefix_route(&ifp->addr,
4533 ifp->prefix_len, 4533 ifp->prefix_len,
@@ -4536,13 +4536,15 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
4536 if (!f6i) 4536 if (!f6i)
4537 return -ENOENT; 4537 return -ENOENT;
4538 4538
4539 if (f6i->fib6_metric != ifp->rt_priority) { 4539 prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF;
4540 if (f6i->fib6_metric != prio) {
4541 /* delete old one */
4542 ip6_del_rt(dev_net(ifp->idev->dev), f6i);
4543
4540 /* add new one */ 4544 /* add new one */
4541 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 4545 addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
4542 ifp->rt_priority, ifp->idev->dev, 4546 ifp->rt_priority, ifp->idev->dev,
4543 expires, flags, GFP_KERNEL); 4547 expires, flags, GFP_KERNEL);
4544 /* delete old one */
4545 ip6_del_rt(dev_net(ifp->idev->dev), f6i);
4546 } else { 4548 } else {
4547 if (!expires) 4549 if (!expires)
4548 fib6_clean_expires(f6i); 4550 fib6_clean_expires(f6i);
@@ -5207,7 +5209,9 @@ static inline size_t inet6_ifla6_size(void)
5207 + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ 5209 + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */
5208 + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ 5210 + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */
5209 + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ 5211 + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */
5210 + nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */ 5212 + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */
5213 + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */
5214 + 0;
5211} 5215}
5212 5216
5213static inline size_t inet6_if_nlmsg_size(void) 5217static inline size_t inet6_if_nlmsg_size(void)
@@ -5889,32 +5893,31 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
5889 loff_t *ppos) 5893 loff_t *ppos)
5890{ 5894{
5891 int ret = 0; 5895 int ret = 0;
5892 int new_val; 5896 u32 new_val;
5893 struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; 5897 struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
5894 struct net *net = (struct net *)ctl->extra2; 5898 struct net *net = (struct net *)ctl->extra2;
5899 struct ctl_table tmp = {
5900 .data = &new_val,
5901 .maxlen = sizeof(new_val),
5902 .mode = ctl->mode,
5903 };
5895 5904
5896 if (!rtnl_trylock()) 5905 if (!rtnl_trylock())
5897 return restart_syscall(); 5906 return restart_syscall();
5898 5907
5899 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 5908 new_val = *((u32 *)ctl->data);
5900 5909
5901 if (write) { 5910 ret = proc_douintvec(&tmp, write, buffer, lenp, ppos);
5902 new_val = *((int *)ctl->data); 5911 if (ret != 0)
5912 goto out;
5903 5913
5914 if (write) {
5904 if (check_addr_gen_mode(new_val) < 0) { 5915 if (check_addr_gen_mode(new_val) < 0) {
5905 ret = -EINVAL; 5916 ret = -EINVAL;
5906 goto out; 5917 goto out;
5907 } 5918 }
5908 5919
5909 /* request for default */ 5920 if (idev) {
5910 if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) {
5911 ipv6_devconf_dflt.addr_gen_mode = new_val;
5912
5913 /* request for individual net device */
5914 } else {
5915 if (!idev)
5916 goto out;
5917
5918 if (check_stable_privacy(idev, net, new_val) < 0) { 5921 if (check_stable_privacy(idev, net, new_val) < 0) {
5919 ret = -EINVAL; 5922 ret = -EINVAL;
5920 goto out; 5923 goto out;
@@ -5924,7 +5927,21 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
5924 idev->cnf.addr_gen_mode = new_val; 5927 idev->cnf.addr_gen_mode = new_val;
5925 addrconf_dev_config(idev->dev); 5928 addrconf_dev_config(idev->dev);
5926 } 5929 }
5930 } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) {
5931 struct net_device *dev;
5932
5933 net->ipv6.devconf_dflt->addr_gen_mode = new_val;
5934 for_each_netdev(net, dev) {
5935 idev = __in6_dev_get(dev);
5936 if (idev &&
5937 idev->cnf.addr_gen_mode != new_val) {
5938 idev->cnf.addr_gen_mode = new_val;
5939 addrconf_dev_config(idev->dev);
5940 }
5941 }
5927 } 5942 }
5943
5944 *((u32 *)ctl->data) = new_val;
5928 } 5945 }
5929 5946
5930out: 5947out:
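The rewritten addr_gen_mode handler validates input through a temporary ctl_table with proc_douintvec(), and a write to the "all" entry now also propagates the new mode to every existing interface and keeps the default in sync, instead of silently affecting only devices created later. The corresponding userspace action is just a sysctl write; a minimal illustration, assuming the standard proc path and root privileges (1 = "none", i.e. do not autogenerate a link-local address):

#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/net/ipv6/conf/all/addr_gen_mode";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return 1;
        }
        fprintf(f, "1\n");                 /* now applied to all existing devices too */
        fclose(f);
        return 0;
}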
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 74f2a261e8df..673bba31eb18 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -322,8 +322,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
322 /* Reproduce AF_INET checks to make the bindings consistent */ 322 /* Reproduce AF_INET checks to make the bindings consistent */
323 v4addr = addr->sin6_addr.s6_addr32[3]; 323 v4addr = addr->sin6_addr.s6_addr32[3];
324 chk_addr_ret = inet_addr_type(net, v4addr); 324 chk_addr_ret = inet_addr_type(net, v4addr);
325 if (!net->ipv4.sysctl_ip_nonlocal_bind && 325 if (!inet_can_nonlocal_bind(net, inet) &&
326 !(inet->freebind || inet->transparent) &&
327 v4addr != htonl(INADDR_ANY) && 326 v4addr != htonl(INADDR_ANY) &&
328 chk_addr_ret != RTN_LOCAL && 327 chk_addr_ret != RTN_LOCAL &&
329 chk_addr_ret != RTN_MULTICAST && 328 chk_addr_ret != RTN_MULTICAST &&
@@ -362,8 +361,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
362 */ 361 */
363 v4addr = LOOPBACK4_IPV6; 362 v4addr = LOOPBACK4_IPV6;
364 if (!(addr_type & IPV6_ADDR_MULTICAST)) { 363 if (!(addr_type & IPV6_ADDR_MULTICAST)) {
365 if (!net->ipv6.sysctl.ip_nonlocal_bind && 364 if (!ipv6_can_nonlocal_bind(net, inet) &&
366 !(inet->freebind || inet->transparent) &&
367 !ipv6_chk_addr(net, &addr->sin6_addr, 365 !ipv6_chk_addr(net, &addr->sin6_addr,
368 dev, 0)) { 366 dev, 0)) {
369 err = -EADDRNOTAVAIL; 367 err = -EADDRNOTAVAIL;
@@ -570,7 +568,7 @@ const struct proto_ops inet6_stream_ops = {
570 .socketpair = sock_no_socketpair, /* a do nothing */ 568 .socketpair = sock_no_socketpair, /* a do nothing */
571 .accept = inet_accept, /* ok */ 569 .accept = inet_accept, /* ok */
572 .getname = inet6_getname, 570 .getname = inet6_getname,
573 .poll_mask = tcp_poll_mask, /* ok */ 571 .poll = tcp_poll, /* ok */
574 .ioctl = inet6_ioctl, /* must change */ 572 .ioctl = inet6_ioctl, /* must change */
575 .listen = inet_listen, /* ok */ 573 .listen = inet_listen, /* ok */
576 .shutdown = inet_shutdown, /* ok */ 574 .shutdown = inet_shutdown, /* ok */
@@ -603,7 +601,7 @@ const struct proto_ops inet6_dgram_ops = {
603 .socketpair = sock_no_socketpair, /* a do nothing */ 601 .socketpair = sock_no_socketpair, /* a do nothing */
604 .accept = sock_no_accept, /* a do nothing */ 602 .accept = sock_no_accept, /* a do nothing */
605 .getname = inet6_getname, 603 .getname = inet6_getname,
606 .poll_mask = udp_poll_mask, /* ok */ 604 .poll = udp_poll, /* ok */
607 .ioctl = inet6_ioctl, /* must change */ 605 .ioctl = inet6_ioctl, /* must change */
608 .listen = sock_no_listen, /* ok */ 606 .listen = sock_no_listen, /* ok */
609 .shutdown = inet_shutdown, /* ok */ 607 .shutdown = inet_shutdown, /* ok */
@@ -764,6 +762,7 @@ EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
764static struct packet_type ipv6_packet_type __read_mostly = { 762static struct packet_type ipv6_packet_type __read_mostly = {
765 .type = cpu_to_be16(ETH_P_IPV6), 763 .type = cpu_to_be16(ETH_P_IPV6),
766 .func = ipv6_rcv, 764 .func = ipv6_rcv,
765 .list_func = ipv6_list_rcv,
767}; 766};
768 767
769static int __init ipv6_packet_init(void) 768static int __init ipv6_packet_init(void)
@@ -833,6 +832,7 @@ static int __net_init inet6_net_init(struct net *net)
833 832
834 net->ipv6.sysctl.bindv6only = 0; 833 net->ipv6.sysctl.bindv6only = 0;
835 net->ipv6.sysctl.icmpv6_time = 1*HZ; 834 net->ipv6.sysctl.icmpv6_time = 1*HZ;
835 net->ipv6.sysctl.icmpv6_echo_ignore_all = 0;
836 net->ipv6.sysctl.flowlabel_consistency = 1; 836 net->ipv6.sysctl.flowlabel_consistency = 1;
837 net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; 837 net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS;
838 net->ipv6.sysctl.idgen_retries = 3; 838 net->ipv6.sysctl.idgen_retries = 3;
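The open-coded sysctl/freebind/transparent tests in __inet6_bind() are folded into the inet_can_nonlocal_bind() and ipv6_can_nonlocal_bind() helpers, keeping the IPv4-mapped and native IPv6 paths consistent. The per-socket escape hatch those helpers honor is IP_FREEBIND (IPV6_FREEBIND on v6 sockets); for example, binding to an address the host does not own (192.0.2.1 from TEST-NET-1 is used here purely as an example):

#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int one = 1;
        struct sockaddr_in addr = { .sin_family = AF_INET,
                                    .sin_port = htons(8080) };

        inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr);
        if (fd < 0 || setsockopt(fd, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one)) < 0)
                perror("IP_FREEBIND");
        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                perror("bind");            /* would normally fail without freebind */
        else
                puts("bound to a non-local address");
        close(fd);
        return 0;
}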
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 1323b9679cf7..1c0bb9fb76e6 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -799,8 +799,7 @@ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop)
799{ 799{
800 struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; 800 struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts;
801 801
802 txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS, 802 txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop);
803 hop, hop ? ipv6_optlen(hop) : 0);
804 txopt_put(old); 803 txopt_put(old);
805 if (IS_ERR(txopts)) 804 if (IS_ERR(txopts))
806 return PTR_ERR(txopts); 805 return PTR_ERR(txopts);
@@ -1222,8 +1221,7 @@ static int calipso_req_setattr(struct request_sock *req,
1222 if (IS_ERR(new)) 1221 if (IS_ERR(new))
1223 return PTR_ERR(new); 1222 return PTR_ERR(new);
1224 1223
1225 txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, 1224 txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
1226 new, new ? ipv6_optlen(new) : 0);
1227 1225
1228 kfree(new); 1226 kfree(new);
1229 1227
@@ -1260,8 +1258,7 @@ static void calipso_req_delattr(struct request_sock *req)
1260 if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) 1258 if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new))
1261 return; /* Nothing to do */ 1259 return; /* Nothing to do */
1262 1260
1263 txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, 1261 txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
1264 new, new ? ipv6_optlen(new) : 0);
1265 1262
1266 if (!IS_ERR(txopts)) { 1263 if (!IS_ERR(txopts)) {
1267 txopts = xchg(&req_inet->ipv6_opt, txopts); 1264 txopts = xchg(&req_inet->ipv6_opt, txopts);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2ee08b6a86a4..1ede7a16a0be 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -700,13 +700,16 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
700 } 700 }
701 if (np->rxopt.bits.rxorigdstaddr) { 701 if (np->rxopt.bits.rxorigdstaddr) {
702 struct sockaddr_in6 sin6; 702 struct sockaddr_in6 sin6;
703 __be16 *ports = (__be16 *) skb_transport_header(skb); 703 __be16 *ports;
704 int end;
704 705
705 if (skb_transport_offset(skb) + 4 <= (int)skb->len) { 706 end = skb_transport_offset(skb) + 4;
707 if (end <= 0 || pskb_may_pull(skb, end)) {
706 /* All current transport protocols have the port numbers in the 708 /* All current transport protocols have the port numbers in the
707 * first four bytes of the transport header and this function is 709 * first four bytes of the transport header and this function is
708 * written with this assumption in mind. 710 * written with this assumption in mind.
709 */ 711 */
712 ports = (__be16 *)skb_transport_header(skb);
710 713
711 sin6.sin6_family = AF_INET6; 714 sin6.sin6_family = AF_INET6;
712 sin6.sin6_addr = ipv6_hdr(skb)->daddr; 715 sin6.sin6_addr = ipv6_hdr(skb)->daddr;
@@ -736,7 +739,7 @@ EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
736 739
737int ip6_datagram_send_ctl(struct net *net, struct sock *sk, 740int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
738 struct msghdr *msg, struct flowi6 *fl6, 741 struct msghdr *msg, struct flowi6 *fl6,
739 struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc) 742 struct ipcm6_cookie *ipc6)
740{ 743{
741 struct in6_pktinfo *src_info; 744 struct in6_pktinfo *src_info;
742 struct cmsghdr *cmsg; 745 struct cmsghdr *cmsg;
@@ -755,7 +758,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
755 } 758 }
756 759
757 if (cmsg->cmsg_level == SOL_SOCKET) { 760 if (cmsg->cmsg_level == SOL_SOCKET) {
758 err = __sock_cmsg_send(sk, msg, cmsg, sockc); 761 err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc);
759 if (err) 762 if (err)
760 return err; 763 return err;
761 continue; 764 continue;
@@ -800,7 +803,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
800 803
801 if (addr_type != IPV6_ADDR_ANY) { 804 if (addr_type != IPV6_ADDR_ANY) {
802 int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; 805 int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
803 if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) && 806 if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) &&
804 !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, 807 !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr,
805 dev, !strict, 0, 808 dev, !strict, 0,
806 IFA_F_TENTATIVE) && 809 IFA_F_TENTATIVE) &&
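Instead of trusting skb_transport_offset() against skb->len, the original-destination-address path now uses pskb_may_pull() so the four bytes holding the port numbers are guaranteed to be present and linear before they are read. A minimal userspace sketch of the same validate-before-read rule:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

static int read_ports(const uint8_t *buf, size_t len, size_t thoff,
                      uint16_t *sport, uint16_t *dport)
{
        if (thoff + 4 > len)               /* the equivalent of the pull check */
                return -1;
        memcpy(sport, buf + thoff, 2);
        memcpy(dport, buf + thoff + 2, 2);
        return 0;
}

int main(void)
{
        uint8_t pkt[8] = { 0, 0, 0, 0, 0x1f, 0x90, 0x00, 0x50 };
        uint16_t s, d;

        if (read_ports(pkt, sizeof(pkt), 4, &s, &d) == 0)
                printf("sport=%u dport=%u\n", ntohs(s), ntohs(d));
        if (read_ports(pkt, sizeof(pkt), 6, &s, &d) < 0)
                puts("short packet rejected");
        return 0;
}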
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 97513f35bcc5..88a7579c23bd 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -669,8 +669,10 @@ skip_cow:
669 669
670 sg_init_table(sg, nfrags); 670 sg_init_table(sg, nfrags);
671 ret = skb_to_sgvec(skb, sg, 0, skb->len); 671 ret = skb_to_sgvec(skb, sg, 0, skb->len);
672 if (unlikely(ret < 0)) 672 if (unlikely(ret < 0)) {
673 kfree(tmp);
673 goto out; 674 goto out;
675 }
674 676
675 skb->ip_summed = CHECKSUM_NONE; 677 skb->ip_summed = CHECKSUM_NONE;
676 678
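The esp6 change plugs a memory leak: when skb_to_sgvec() fails, the scratch buffer allocated earlier in the function is now freed before bailing out. The shape of the fix is the standard free-on-every-exit rule; a minimal userspace sketch with made-up function names:

#include <stdlib.h>
#include <string.h>

static int fill_vector(char *dst, size_t len)
{
        if (len == 0)                      /* stand-in for skb_to_sgvec() failing */
                return -1;
        memset(dst, 0, len);
        return 0;
}

static int process(size_t len)
{
        char *tmp = malloc(len ? len : 1);
        int ret;

        if (!tmp)
                return -1;
        ret = fill_vector(tmp, len);
        if (ret < 0) {
                free(tmp);                 /* the cleanup the patch adds */
                return ret;
        }
        /* ... use tmp ... */
        free(tmp);
        return 0;
}

int main(void)
{
        process(16);                       /* success path                        */
        process(0);                        /* failure path exercises the cleanup  */
        return 0;
}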
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 27f59b61f70f..6177e2171171 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -49,8 +49,8 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen)
49 return 0; 49 return 0;
50} 50}
51 51
52static struct sk_buff **esp6_gro_receive(struct sk_buff **head, 52static struct sk_buff *esp6_gro_receive(struct list_head *head,
53 struct sk_buff *skb) 53 struct sk_buff *skb)
54{ 54{
55 int offset = skb_gro_offset(skb); 55 int offset = skb_gro_offset(skb);
56 struct xfrm_offload *xo; 56 struct xfrm_offload *xo;
@@ -162,8 +162,7 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
162 162
163 skb->encap_hdr_csum = 1; 163 skb->encap_hdr_csum = 1;
164 164
165 if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || 165 if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
166 (x->xso.dev != skb->dev))
167 esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); 166 esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
168 else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) 167 else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
169 esp_features = features & ~NETIF_F_CSUM_MASK; 168 esp_features = features & ~NETIF_F_CSUM_MASK;
@@ -207,8 +206,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
207 if (!xo) 206 if (!xo)
208 return -EINVAL; 207 return -EINVAL;
209 208
210 if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || 209 if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
211 (x->xso.dev != skb->dev)) {
212 xo->flags |= CRYPTO_FALLBACK; 210 xo->flags |= CRYPTO_FALLBACK;
213 hw_offload = false; 211 hw_offload = false;
214 } 212 }
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 5bc2bf3733ab..20291c2036fc 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -1015,29 +1015,21 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
1015} 1015}
1016EXPORT_SYMBOL_GPL(ipv6_dup_options); 1016EXPORT_SYMBOL_GPL(ipv6_dup_options);
1017 1017
1018static int ipv6_renew_option(void *ohdr, 1018static void ipv6_renew_option(int renewtype,
1019 struct ipv6_opt_hdr __user *newopt, int newoptlen, 1019 struct ipv6_opt_hdr **dest,
1020 int inherit, 1020 struct ipv6_opt_hdr *old,
1021 struct ipv6_opt_hdr **hdr, 1021 struct ipv6_opt_hdr *new,
1022 char **p) 1022 int newtype, char **p)
1023{ 1023{
1024 if (inherit) { 1024 struct ipv6_opt_hdr *src;
1025 if (ohdr) { 1025
1026 memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr)); 1026 src = (renewtype == newtype ? new : old);
1027 *hdr = (struct ipv6_opt_hdr *)*p; 1027 if (!src)
1028 *p += CMSG_ALIGN(ipv6_optlen(*hdr)); 1028 return;
1029 } 1029
1030 } else { 1030 memcpy(*p, src, ipv6_optlen(src));
1031 if (newopt) { 1031 *dest = (struct ipv6_opt_hdr *)*p;
1032 if (copy_from_user(*p, newopt, newoptlen)) 1032 *p += CMSG_ALIGN(ipv6_optlen(*dest));
1033 return -EFAULT;
1034 *hdr = (struct ipv6_opt_hdr *)*p;
1035 if (ipv6_optlen(*hdr) > newoptlen)
1036 return -EINVAL;
1037 *p += CMSG_ALIGN(newoptlen);
1038 }
1039 }
1040 return 0;
1041} 1033}
1042 1034
1043/** 1035/**
@@ -1063,13 +1055,11 @@ static int ipv6_renew_option(void *ohdr,
1063 */ 1055 */
1064struct ipv6_txoptions * 1056struct ipv6_txoptions *
1065ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, 1057ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
1066 int newtype, 1058 int newtype, struct ipv6_opt_hdr *newopt)
1067 struct ipv6_opt_hdr __user *newopt, int newoptlen)
1068{ 1059{
1069 int tot_len = 0; 1060 int tot_len = 0;
1070 char *p; 1061 char *p;
1071 struct ipv6_txoptions *opt2; 1062 struct ipv6_txoptions *opt2;
1072 int err;
1073 1063
1074 if (opt) { 1064 if (opt) {
1075 if (newtype != IPV6_HOPOPTS && opt->hopopt) 1065 if (newtype != IPV6_HOPOPTS && opt->hopopt)
@@ -1082,8 +1072,8 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
1082 tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt)); 1072 tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt));
1083 } 1073 }
1084 1074
1085 if (newopt && newoptlen) 1075 if (newopt)
1086 tot_len += CMSG_ALIGN(newoptlen); 1076 tot_len += CMSG_ALIGN(ipv6_optlen(newopt));
1087 1077
1088 if (!tot_len) 1078 if (!tot_len)
1089 return NULL; 1079 return NULL;
@@ -1098,29 +1088,19 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
1098 opt2->tot_len = tot_len; 1088 opt2->tot_len = tot_len;
1099 p = (char *)(opt2 + 1); 1089 p = (char *)(opt2 + 1);
1100 1090
1101 err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen, 1091 ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt,
1102 newtype != IPV6_HOPOPTS, 1092 (opt ? opt->hopopt : NULL),
1103 &opt2->hopopt, &p); 1093 newopt, newtype, &p);
1104 if (err) 1094 ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt,
1105 goto out; 1095 (opt ? opt->dst0opt : NULL),
1106 1096 newopt, newtype, &p);
1107 err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen, 1097 ipv6_renew_option(IPV6_RTHDR,
1108 newtype != IPV6_RTHDRDSTOPTS, 1098 (struct ipv6_opt_hdr **)&opt2->srcrt,
1109 &opt2->dst0opt, &p); 1099 (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL),
1110 if (err) 1100 newopt, newtype, &p);
1111 goto out; 1101 ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt,
1112 1102 (opt ? opt->dst1opt : NULL),
1113 err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen, 1103 newopt, newtype, &p);
1114 newtype != IPV6_RTHDR,
1115 (struct ipv6_opt_hdr **)&opt2->srcrt, &p);
1116 if (err)
1117 goto out;
1118
1119 err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen,
1120 newtype != IPV6_DSTOPTS,
1121 &opt2->dst1opt, &p);
1122 if (err)
1123 goto out;
1124 1104
1125 opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) + 1105 opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) +
1126 (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) + 1106 (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) +
@@ -1128,37 +1108,6 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
1128 opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0); 1108 opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0);
1129 1109
1130 return opt2; 1110 return opt2;
1131out:
1132 sock_kfree_s(sk, opt2, opt2->tot_len);
1133 return ERR_PTR(err);
1134}
1135
1136/**
1137 * ipv6_renew_options_kern - replace a specific ext hdr with a new one.
1138 *
1139 * @sk: sock from which to allocate memory
1140 * @opt: original options
1141 * @newtype: option type to replace in @opt
1142 * @newopt: new option of type @newtype to replace (kernel-mem)
1143 * @newoptlen: length of @newopt
1144 *
1145 * See ipv6_renew_options(). The difference is that @newopt is
1146 * kernel memory, rather than user memory.
1147 */
1148struct ipv6_txoptions *
1149ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt,
1150 int newtype, struct ipv6_opt_hdr *newopt,
1151 int newoptlen)
1152{
1153 struct ipv6_txoptions *ret_val;
1154 const mm_segment_t old_fs = get_fs();
1155
1156 set_fs(KERNEL_DS);
1157 ret_val = ipv6_renew_options(sk, opt, newtype,
1158 (struct ipv6_opt_hdr __user *)newopt,
1159 newoptlen);
1160 set_fs(old_fs);
1161 return ret_val;
1162} 1111}
1163 1112
1164struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, 1113struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
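
The exthdrs.c rewrite drops the set_fs(KERNEL_DS) wrapper: ipv6_renew_options() now always takes a kernel-memory option header, so ipv6_renew_option() shrinks to "copy either the existing header or the replacement, depending on whether this slot is the type being renewed". A simplified stand-alone sketch of that per-slot choice is below; the toy opt_hdr layout is not the kernel structure.

	#include <stdio.h>
	#include <string.h>

	struct opt_hdr { unsigned char type; unsigned char len; };

	/* Append either the current header or its replacement to a linear buffer
	 * and record where it landed, mirroring the rewritten ipv6_renew_option().
	 */
	static void renew_option(int renewtype, int slottype,
				 struct opt_hdr **dest,
				 struct opt_hdr *old, struct opt_hdr *repl,
				 char **p)
	{
		struct opt_hdr *src = (renewtype == slottype) ? repl : old;

		if (!src)
			return;
		memcpy(*p, src, src->len);
		*dest = (struct opt_hdr *)*p;
		*p += src->len;
	}

	int main(void)
	{
		char buf[64];
		char *p = buf;
		struct opt_hdr old = { 1, sizeof(old) }, repl = { 1, sizeof(repl) };
		struct opt_hdr *slot = NULL;

		renew_option(1, 1, &slot, &old, &repl, &p);  /* slot 1 takes the replacement */
		printf("copied %zu bytes\n", (size_t)(p - buf));
		return 0;
	}
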
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index be491bf6ab6e..c9c53ade55c3 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -92,7 +92,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
92 struct net *net = dev_net(skb->dev); 92 struct net *net = dev_net(skb->dev);
93 93
94 if (type == ICMPV6_PKT_TOOBIG) 94 if (type == ICMPV6_PKT_TOOBIG)
95 ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); 95 ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL));
96 else if (type == NDISC_REDIRECT) 96 else if (type == NDISC_REDIRECT)
97 ip6_redirect(skb, net, skb->dev->ifindex, 0, 97 ip6_redirect(skb, net, skb->dev->ifindex, 0,
98 sock_net_uid(net, NULL)); 98 sock_net_uid(net, NULL));
@@ -402,9 +402,10 @@ static int icmp6_iif(const struct sk_buff *skb)
402 402
403 /* for local traffic to local address, skb dev is the loopback 403 /* for local traffic to local address, skb dev is the loopback
404 * device. Check if there is a dst attached to the skb and if so 404 * device. Check if there is a dst attached to the skb and if so
405 * get the real device index. 405 * get the real device index. Same is needed for replies to a link
406 * local address on a device enslaved to an L3 master device
406 */ 407 */
407 if (unlikely(iif == LOOPBACK_IFINDEX)) { 408 if (unlikely(iif == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) {
408 const struct rt6_info *rt6 = skb_rt6_info(skb); 409 const struct rt6_info *rt6 = skb_rt6_info(skb);
409 410
410 if (rt6) 411 if (rt6)
@@ -430,7 +431,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
430 struct icmp6hdr tmp_hdr; 431 struct icmp6hdr tmp_hdr;
431 struct flowi6 fl6; 432 struct flowi6 fl6;
432 struct icmpv6_msg msg; 433 struct icmpv6_msg msg;
433 struct sockcm_cookie sockc_unused = {0};
434 struct ipcm6_cookie ipc6; 434 struct ipcm6_cookie ipc6;
435 int iif = 0; 435 int iif = 0;
436 int addr_type = 0; 436 int addr_type = 0;
@@ -545,7 +545,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
545 else if (!fl6.flowi6_oif) 545 else if (!fl6.flowi6_oif)
546 fl6.flowi6_oif = np->ucast_oif; 546 fl6.flowi6_oif = np->ucast_oif;
547 547
548 ipc6.tclass = np->tclass; 548 ipcm6_init_sk(&ipc6, np);
549 fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); 549 fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
550 550
551 dst = icmpv6_route_lookup(net, skb, sk, &fl6); 551 dst = icmpv6_route_lookup(net, skb, sk, &fl6);
@@ -553,8 +553,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
553 goto out; 553 goto out;
554 554
555 ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); 555 ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
556 ipc6.dontfrag = np->dontfrag;
557 ipc6.opt = NULL;
558 556
559 msg.skb = skb; 557 msg.skb = skb;
560 msg.offset = skb_network_offset(skb); 558 msg.offset = skb_network_offset(skb);
@@ -575,7 +573,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
575 len + sizeof(struct icmp6hdr), 573 len + sizeof(struct icmp6hdr),
576 sizeof(struct icmp6hdr), 574 sizeof(struct icmp6hdr),
577 &ipc6, &fl6, (struct rt6_info *)dst, 575 &ipc6, &fl6, (struct rt6_info *)dst,
578 MSG_DONTWAIT, &sockc_unused)) { 576 MSG_DONTWAIT)) {
579 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); 577 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
580 ip6_flush_pending_frames(sk); 578 ip6_flush_pending_frames(sk);
581 } else { 579 } else {
@@ -679,7 +677,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
679 struct dst_entry *dst; 677 struct dst_entry *dst;
680 struct ipcm6_cookie ipc6; 678 struct ipcm6_cookie ipc6;
681 u32 mark = IP6_REPLY_MARK(net, skb->mark); 679 u32 mark = IP6_REPLY_MARK(net, skb->mark);
682 struct sockcm_cookie sockc_unused = {0};
683 680
684 saddr = &ipv6_hdr(skb)->daddr; 681 saddr = &ipv6_hdr(skb)->daddr;
685 682
@@ -726,16 +723,14 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
726 msg.offset = 0; 723 msg.offset = 0;
727 msg.type = ICMPV6_ECHO_REPLY; 724 msg.type = ICMPV6_ECHO_REPLY;
728 725
726 ipcm6_init_sk(&ipc6, np);
729 ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); 727 ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
730 ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); 728 ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
731 ipc6.dontfrag = np->dontfrag;
732 ipc6.opt = NULL;
733 729
734 if (ip6_append_data(sk, icmpv6_getfrag, &msg, 730 if (ip6_append_data(sk, icmpv6_getfrag, &msg,
735 skb->len + sizeof(struct icmp6hdr), 731 skb->len + sizeof(struct icmp6hdr),
736 sizeof(struct icmp6hdr), &ipc6, &fl6, 732 sizeof(struct icmp6hdr), &ipc6, &fl6,
737 (struct rt6_info *)dst, MSG_DONTWAIT, 733 (struct rt6_info *)dst, MSG_DONTWAIT)) {
738 &sockc_unused)) {
739 __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); 734 __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
740 ip6_flush_pending_frames(sk); 735 ip6_flush_pending_frames(sk);
741 } else { 736 } else {
@@ -799,6 +794,7 @@ out:
799 794
800static int icmpv6_rcv(struct sk_buff *skb) 795static int icmpv6_rcv(struct sk_buff *skb)
801{ 796{
797 struct net *net = dev_net(skb->dev);
802 struct net_device *dev = skb->dev; 798 struct net_device *dev = skb->dev;
803 struct inet6_dev *idev = __in6_dev_get(dev); 799 struct inet6_dev *idev = __in6_dev_get(dev);
804 const struct in6_addr *saddr, *daddr; 800 const struct in6_addr *saddr, *daddr;
@@ -848,7 +844,8 @@ static int icmpv6_rcv(struct sk_buff *skb)
848 844
849 switch (type) { 845 switch (type) {
850 case ICMPV6_ECHO_REQUEST: 846 case ICMPV6_ECHO_REQUEST:
851 icmpv6_echo_reply(skb); 847 if (!net->ipv6.sysctl.icmpv6_echo_ignore_all)
848 icmpv6_echo_reply(skb);
852 break; 849 break;
853 850
854 case ICMPV6_ECHO_REPLY: 851 case ICMPV6_ECHO_REPLY:
@@ -1109,6 +1106,13 @@ static struct ctl_table ipv6_icmp_table_template[] = {
1109 .mode = 0644, 1106 .mode = 0644,
1110 .proc_handler = proc_dointvec_ms_jiffies, 1107 .proc_handler = proc_dointvec_ms_jiffies,
1111 }, 1108 },
1109 {
1110 .procname = "echo_ignore_all",
1111 .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_all,
1112 .maxlen = sizeof(int),
1113 .mode = 0644,
1114 .proc_handler = proc_dointvec,
1115 },
1112 { }, 1116 { },
1113}; 1117};
1114 1118
@@ -1120,9 +1124,10 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
1120 sizeof(ipv6_icmp_table_template), 1124 sizeof(ipv6_icmp_table_template),
1121 GFP_KERNEL); 1125 GFP_KERNEL);
1122 1126
1123 if (table) 1127 if (table) {
1124 table[0].data = &net->ipv6.sysctl.icmpv6_time; 1128 table[0].data = &net->ipv6.sysctl.icmpv6_time;
1125 1129 table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all;
1130 }
1126 return table; 1131 return table;
1127} 1132}
1128#endif 1133#endif
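
Besides switching to ipcm6_init_sk() and passing the real ifindex to ip6_update_pmtu(), the icmp.c changes add a per-namespace echo_ignore_all sysctl that suppresses replies to ICMPv6 echo requests. A user-space probe of the knob is sketched below, assuming the usual procfs location of the ipv6 icmp sysctl table; on kernels without this change the file simply does not exist.

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/net/ipv6/icmp/echo_ignore_all", "r");
		int val;

		if (!f) {
			perror("echo_ignore_all");   /* kernel predates the sysctl */
			return 1;
		}
		if (fscanf(f, "%d", &val) == 1)
			printf("echo_ignore_all = %d (1 drops all echo requests)\n", val);
		fclose(f);
		return 0;
	}
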
diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile
index 4b32e5921e5c..b7739aba6e68 100644
--- a/net/ipv6/ila/Makefile
+++ b/net/ipv6/ila/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_IPV6_ILA) += ila.o 5obj-$(CONFIG_IPV6_ILA) += ila.o
6 6
7ila-objs := ila_common.o ila_lwt.o ila_xlat.o 7ila-objs := ila_main.o ila_common.o ila_lwt.o ila_xlat.o
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index 3c7a11b62334..1f747bcbec29 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -19,6 +19,7 @@
19#include <linux/skbuff.h> 19#include <linux/skbuff.h>
20#include <linux/types.h> 20#include <linux/types.h>
21#include <net/checksum.h> 21#include <net/checksum.h>
22#include <net/genetlink.h>
22#include <net/ip.h> 23#include <net/ip.h>
23#include <net/protocol.h> 24#include <net/protocol.h>
24#include <uapi/linux/ila.h> 25#include <uapi/linux/ila.h>
@@ -104,9 +105,31 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
104 105
105void ila_init_saved_csum(struct ila_params *p); 106void ila_init_saved_csum(struct ila_params *p);
106 107
108struct ila_net {
109 struct {
110 struct rhashtable rhash_table;
111 spinlock_t *locks; /* Bucket locks for entry manipulation */
112 unsigned int locks_mask;
113 bool hooks_registered;
114 } xlat;
115};
116
107int ila_lwt_init(void); 117int ila_lwt_init(void);
108void ila_lwt_fini(void); 118void ila_lwt_fini(void);
109int ila_xlat_init(void); 119
110void ila_xlat_fini(void); 120int ila_xlat_init_net(struct net *net);
121void ila_xlat_exit_net(struct net *net);
122
123int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info);
124int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info);
125int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info);
126int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info);
127int ila_xlat_nl_dump_start(struct netlink_callback *cb);
128int ila_xlat_nl_dump_done(struct netlink_callback *cb);
129int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb);
130
131extern unsigned int ila_net_id;
132
133extern struct genl_family ila_nl_family;
111 134
112#endif /* __ILA_H */ 135#endif /* __ILA_H */
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index 8c88ecf29b93..95e9146918cc 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -153,34 +153,3 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
153 /* Now change destination address */ 153 /* Now change destination address */
154 iaddr->loc = p->locator; 154 iaddr->loc = p->locator;
155} 155}
156
157static int __init ila_init(void)
158{
159 int ret;
160
161 ret = ila_lwt_init();
162
163 if (ret)
164 goto fail_lwt;
165
166 ret = ila_xlat_init();
167 if (ret)
168 goto fail_xlat;
169
170 return 0;
171fail_xlat:
172 ila_lwt_fini();
173fail_lwt:
174 return ret;
175}
176
177static void __exit ila_fini(void)
178{
179 ila_xlat_fini();
180 ila_lwt_fini();
181}
182
183module_init(ila_init);
184module_exit(ila_fini);
185MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
186MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c
new file mode 100644
index 000000000000..18fac76b9520
--- /dev/null
+++ b/net/ipv6/ila/ila_main.c
@@ -0,0 +1,121 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <net/genetlink.h>
3#include <net/ila.h>
4#include <net/netns/generic.h>
5#include <uapi/linux/genetlink.h>
6#include "ila.h"
7
8static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
9 [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
10 [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
11 [ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
12 [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
13 [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
14};
15
16static const struct genl_ops ila_nl_ops[] = {
17 {
18 .cmd = ILA_CMD_ADD,
19 .doit = ila_xlat_nl_cmd_add_mapping,
20 .policy = ila_nl_policy,
21 .flags = GENL_ADMIN_PERM,
22 },
23 {
24 .cmd = ILA_CMD_DEL,
25 .doit = ila_xlat_nl_cmd_del_mapping,
26 .policy = ila_nl_policy,
27 .flags = GENL_ADMIN_PERM,
28 },
29 {
30 .cmd = ILA_CMD_FLUSH,
31 .doit = ila_xlat_nl_cmd_flush,
32 .policy = ila_nl_policy,
33 .flags = GENL_ADMIN_PERM,
34 },
35 {
36 .cmd = ILA_CMD_GET,
37 .doit = ila_xlat_nl_cmd_get_mapping,
38 .start = ila_xlat_nl_dump_start,
39 .dumpit = ila_xlat_nl_dump,
40 .done = ila_xlat_nl_dump_done,
41 .policy = ila_nl_policy,
42 },
43};
44
45unsigned int ila_net_id;
46
47struct genl_family ila_nl_family __ro_after_init = {
48 .hdrsize = 0,
49 .name = ILA_GENL_NAME,
50 .version = ILA_GENL_VERSION,
51 .maxattr = ILA_ATTR_MAX,
52 .netnsok = true,
53 .parallel_ops = true,
54 .module = THIS_MODULE,
55 .ops = ila_nl_ops,
56 .n_ops = ARRAY_SIZE(ila_nl_ops),
57};
58
59static __net_init int ila_init_net(struct net *net)
60{
61 int err;
62
63 err = ila_xlat_init_net(net);
64 if (err)
65 goto ila_xlat_init_fail;
66
67 return 0;
68
69ila_xlat_init_fail:
70 return err;
71}
72
73static __net_exit void ila_exit_net(struct net *net)
74{
75 ila_xlat_exit_net(net);
76}
77
78static struct pernet_operations ila_net_ops = {
79 .init = ila_init_net,
80 .exit = ila_exit_net,
81 .id = &ila_net_id,
82 .size = sizeof(struct ila_net),
83};
84
85static int __init ila_init(void)
86{
87 int ret;
88
89 ret = register_pernet_device(&ila_net_ops);
90 if (ret)
91 goto register_device_fail;
92
93 ret = genl_register_family(&ila_nl_family);
94 if (ret)
95 goto register_family_fail;
96
97 ret = ila_lwt_init();
98 if (ret)
99 goto fail_lwt;
100
101 return 0;
102
103fail_lwt:
104 genl_unregister_family(&ila_nl_family);
105register_family_fail:
106 unregister_pernet_device(&ila_net_ops);
107register_device_fail:
108 return ret;
109}
110
111static void __exit ila_fini(void)
112{
113 ila_lwt_fini();
114 genl_unregister_family(&ila_nl_family);
115 unregister_pernet_device(&ila_net_ops);
116}
117
118module_init(ila_init);
119module_exit(ila_fini);
120MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
121MODULE_LICENSE("GPL");
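
The new ila_main.c concentrates the genetlink family, the pernet ops and module init in one place; ila_init() registers the pernet ops, then the netlink family, then the LWT hooks, and unwinds the already-registered pieces in reverse order if a later step fails. A stand-alone sketch of that register-then-unwind shape follows; the register and unregister helpers are dummies.

	#include <stdio.h>

	static int register_pernet(void)  { return 0; }
	static void unregister_pernet(void) { }
	static int register_genl(void)    { return 0; }
	static void unregister_genl(void) { }
	static int register_lwt(void)     { return -1; }   /* simulate a failure */

	/* Each successfully registered facility is torn down, newest first,
	 * when a later registration step fails.
	 */
	static int ila_like_init(void)
	{
		int ret;

		ret = register_pernet();
		if (ret)
			goto out;
		ret = register_genl();
		if (ret)
			goto undo_pernet;
		ret = register_lwt();
		if (ret)
			goto undo_genl;
		return 0;

	undo_genl:
		unregister_genl();
	undo_pernet:
		unregister_pernet();
	out:
		return ret;
	}

	int main(void)
	{
		printf("init: %d\n", ila_like_init());
		return 0;
	}
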
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 10ae13560b40..17c455ff69ff 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -22,36 +22,14 @@ struct ila_map {
22 struct rcu_head rcu; 22 struct rcu_head rcu;
23}; 23};
24 24
25static unsigned int ila_net_id; 25#define MAX_LOCKS 1024
26
27struct ila_net {
28 struct rhashtable rhash_table;
29 spinlock_t *locks; /* Bucket locks for entry manipulation */
30 unsigned int locks_mask;
31 bool hooks_registered;
32};
33
34#define LOCKS_PER_CPU 10 26#define LOCKS_PER_CPU 10
35 27
36static int alloc_ila_locks(struct ila_net *ilan) 28static int alloc_ila_locks(struct ila_net *ilan)
37{ 29{
38 unsigned int i, size; 30 return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask,
39 unsigned int nr_pcpus = num_possible_cpus(); 31 MAX_LOCKS, LOCKS_PER_CPU,
40 32 GFP_KERNEL);
41 nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
42 size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU);
43
44 if (sizeof(spinlock_t) != 0) {
45 ilan->locks = kvmalloc_array(size, sizeof(spinlock_t),
46 GFP_KERNEL);
47 if (!ilan->locks)
48 return -ENOMEM;
49 for (i = 0; i < size; i++)
50 spin_lock_init(&ilan->locks[i]);
51 }
52 ilan->locks_mask = size - 1;
53
54 return 0;
55} 33}
56 34
57static u32 hashrnd __read_mostly; 35static u32 hashrnd __read_mostly;
@@ -71,7 +49,7 @@ static inline u32 ila_locator_hash(struct ila_locator loc)
71static inline spinlock_t *ila_get_lock(struct ila_net *ilan, 49static inline spinlock_t *ila_get_lock(struct ila_net *ilan,
72 struct ila_locator loc) 50 struct ila_locator loc)
73{ 51{
74 return &ilan->locks[ila_locator_hash(loc) & ilan->locks_mask]; 52 return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask];
75} 53}
76 54
77static inline int ila_cmp_wildcards(struct ila_map *ila, 55static inline int ila_cmp_wildcards(struct ila_map *ila,
@@ -115,16 +93,6 @@ static const struct rhashtable_params rht_params = {
115 .obj_cmpfn = ila_cmpfn, 93 .obj_cmpfn = ila_cmpfn,
116}; 94};
117 95
118static struct genl_family ila_nl_family;
119
120static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
121 [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
122 [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
123 [ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
124 [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
125 [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
126};
127
128static int parse_nl_config(struct genl_info *info, 96static int parse_nl_config(struct genl_info *info,
129 struct ila_xlat_params *xp) 97 struct ila_xlat_params *xp)
130{ 98{
@@ -162,7 +130,7 @@ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr,
162{ 130{
163 struct ila_map *ila; 131 struct ila_map *ila;
164 132
165 ila = rhashtable_lookup_fast(&ilan->rhash_table, &iaddr->loc, 133 ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc,
166 rht_params); 134 rht_params);
167 while (ila) { 135 while (ila) {
168 if (!ila_cmp_wildcards(ila, iaddr, ifindex)) 136 if (!ila_cmp_wildcards(ila, iaddr, ifindex))
@@ -179,7 +147,7 @@ static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp,
179{ 147{
180 struct ila_map *ila; 148 struct ila_map *ila;
181 149
182 ila = rhashtable_lookup_fast(&ilan->rhash_table, 150 ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
183 &xp->ip.locator_match, 151 &xp->ip.locator_match,
184 rht_params); 152 rht_params);
185 while (ila) { 153 while (ila) {
@@ -196,9 +164,9 @@ static inline void ila_release(struct ila_map *ila)
196 kfree_rcu(ila, rcu); 164 kfree_rcu(ila, rcu);
197} 165}
198 166
199static void ila_free_cb(void *ptr, void *arg) 167static void ila_free_node(struct ila_map *ila)
200{ 168{
201 struct ila_map *ila = (struct ila_map *)ptr, *next; 169 struct ila_map *next;
202 170
203 /* Assume rcu_readlock held */ 171 /* Assume rcu_readlock held */
204 while (ila) { 172 while (ila) {
@@ -208,6 +176,11 @@ static void ila_free_cb(void *ptr, void *arg)
208 } 176 }
209} 177}
210 178
179static void ila_free_cb(void *ptr, void *arg)
180{
181 ila_free_node((struct ila_map *)ptr);
182}
183
211static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); 184static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila);
212 185
213static unsigned int 186static unsigned int
@@ -235,7 +208,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
235 spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); 208 spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match);
236 int err = 0, order; 209 int err = 0, order;
237 210
238 if (!ilan->hooks_registered) { 211 if (!ilan->xlat.hooks_registered) {
239 /* We defer registering net hooks in the namespace until the 212 /* We defer registering net hooks in the namespace until the
240 * first mapping is added. 213 * first mapping is added.
241 */ 214 */
@@ -244,7 +217,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
244 if (err) 217 if (err)
245 return err; 218 return err;
246 219
247 ilan->hooks_registered = true; 220 ilan->xlat.hooks_registered = true;
248 } 221 }
249 222
250 ila = kzalloc(sizeof(*ila), GFP_KERNEL); 223 ila = kzalloc(sizeof(*ila), GFP_KERNEL);
@@ -259,12 +232,12 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
259 232
260 spin_lock(lock); 233 spin_lock(lock);
261 234
262 head = rhashtable_lookup_fast(&ilan->rhash_table, 235 head = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
263 &xp->ip.locator_match, 236 &xp->ip.locator_match,
264 rht_params); 237 rht_params);
265 if (!head) { 238 if (!head) {
266 /* New entry for the rhash_table */ 239 /* New entry for the rhash_table */
267 err = rhashtable_lookup_insert_fast(&ilan->rhash_table, 240 err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table,
268 &ila->node, rht_params); 241 &ila->node, rht_params);
269 } else { 242 } else {
270 struct ila_map *tila = head, *prev = NULL; 243 struct ila_map *tila = head, *prev = NULL;
@@ -290,7 +263,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
290 } else { 263 } else {
291 /* Make this ila new head */ 264 /* Make this ila new head */
292 RCU_INIT_POINTER(ila->next, head); 265 RCU_INIT_POINTER(ila->next, head);
293 err = rhashtable_replace_fast(&ilan->rhash_table, 266 err = rhashtable_replace_fast(&ilan->xlat.rhash_table,
294 &head->node, 267 &head->node,
295 &ila->node, rht_params); 268 &ila->node, rht_params);
296 if (err) 269 if (err)
@@ -316,7 +289,7 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp)
316 289
317 spin_lock(lock); 290 spin_lock(lock);
318 291
319 head = rhashtable_lookup_fast(&ilan->rhash_table, 292 head = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
320 &xp->ip.locator_match, rht_params); 293 &xp->ip.locator_match, rht_params);
321 ila = head; 294 ila = head;
322 295
@@ -346,15 +319,15 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp)
346 * table 319 * table
347 */ 320 */
348 err = rhashtable_replace_fast( 321 err = rhashtable_replace_fast(
349 &ilan->rhash_table, &ila->node, 322 &ilan->xlat.rhash_table, &ila->node,
350 &head->node, rht_params); 323 &head->node, rht_params);
351 if (err) 324 if (err)
352 goto out; 325 goto out;
353 } else { 326 } else {
354 /* Entry no longer used */ 327 /* Entry no longer used */
355 err = rhashtable_remove_fast(&ilan->rhash_table, 328 err = rhashtable_remove_fast(
356 &ila->node, 329 &ilan->xlat.rhash_table,
357 rht_params); 330 &ila->node, rht_params);
358 } 331 }
359 } 332 }
360 333
@@ -369,7 +342,7 @@ out:
369 return err; 342 return err;
370} 343}
371 344
372static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) 345int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
373{ 346{
374 struct net *net = genl_info_net(info); 347 struct net *net = genl_info_net(info);
375 struct ila_xlat_params p; 348 struct ila_xlat_params p;
@@ -382,7 +355,7 @@ static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
382 return ila_add_mapping(net, &p); 355 return ila_add_mapping(net, &p);
383} 356}
384 357
385static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) 358int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
386{ 359{
387 struct net *net = genl_info_net(info); 360 struct net *net = genl_info_net(info);
388 struct ila_xlat_params xp; 361 struct ila_xlat_params xp;
@@ -397,6 +370,59 @@ static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
397 return 0; 370 return 0;
398} 371}
399 372
373static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan,
374 struct ila_map *ila)
375{
376 return ila_get_lock(ilan, ila->xp.ip.locator_match);
377}
378
379int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info)
380{
381 struct net *net = genl_info_net(info);
382 struct ila_net *ilan = net_generic(net, ila_net_id);
383 struct rhashtable_iter iter;
384 struct ila_map *ila;
385 spinlock_t *lock;
386 int ret;
387
388 ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL);
389 if (ret)
390 goto done;
391
392 rhashtable_walk_start(&iter);
393
394 for (;;) {
395 ila = rhashtable_walk_next(&iter);
396
397 if (IS_ERR(ila)) {
398 if (PTR_ERR(ila) == -EAGAIN)
399 continue;
400 ret = PTR_ERR(ila);
401 goto done;
402 } else if (!ila) {
403 break;
404 }
405
406 lock = lock_from_ila_map(ilan, ila);
407
408 spin_lock(lock);
409
410 ret = rhashtable_remove_fast(&ilan->xlat.rhash_table,
411 &ila->node, rht_params);
412 if (!ret)
413 ila_free_node(ila);
414
415 spin_unlock(lock);
416
417 if (ret)
418 break;
419 }
420
421done:
422 rhashtable_walk_stop(&iter);
423 return ret;
424}
425
400static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) 426static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
401{ 427{
402 if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, 428 if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR,
@@ -434,7 +460,7 @@ nla_put_failure:
434 return -EMSGSIZE; 460 return -EMSGSIZE;
435} 461}
436 462
437static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) 463int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
438{ 464{
439 struct net *net = genl_info_net(info); 465 struct net *net = genl_info_net(info);
440 struct ila_net *ilan = net_generic(net, ila_net_id); 466 struct ila_net *ilan = net_generic(net, ila_net_id);
@@ -475,27 +501,34 @@ out_free:
475 501
476struct ila_dump_iter { 502struct ila_dump_iter {
477 struct rhashtable_iter rhiter; 503 struct rhashtable_iter rhiter;
504 int skip;
478}; 505};
479 506
480static int ila_nl_dump_start(struct netlink_callback *cb) 507int ila_xlat_nl_dump_start(struct netlink_callback *cb)
481{ 508{
482 struct net *net = sock_net(cb->skb->sk); 509 struct net *net = sock_net(cb->skb->sk);
483 struct ila_net *ilan = net_generic(net, ila_net_id); 510 struct ila_net *ilan = net_generic(net, ila_net_id);
484 struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; 511 struct ila_dump_iter *iter;
512 int ret;
485 513
486 if (!iter) { 514 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
487 iter = kmalloc(sizeof(*iter), GFP_KERNEL); 515 if (!iter)
488 if (!iter) 516 return -ENOMEM;
489 return -ENOMEM;
490 517
491 cb->args[0] = (long)iter; 518 ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter->rhiter,
519 GFP_KERNEL);
520 if (ret) {
521 kfree(iter);
522 return ret;
492 } 523 }
493 524
494 return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter, 525 iter->skip = 0;
495 GFP_KERNEL); 526 cb->args[0] = (long)iter;
527
528 return ret;
496} 529}
497 530
498static int ila_nl_dump_done(struct netlink_callback *cb) 531int ila_xlat_nl_dump_done(struct netlink_callback *cb)
499{ 532{
500 struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; 533 struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
501 534
@@ -506,24 +539,49 @@ static int ila_nl_dump_done(struct netlink_callback *cb)
506 return 0; 539 return 0;
507} 540}
508 541
509static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) 542int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
510{ 543{
511 struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; 544 struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
512 struct rhashtable_iter *rhiter = &iter->rhiter; 545 struct rhashtable_iter *rhiter = &iter->rhiter;
546 int skip = iter->skip;
513 struct ila_map *ila; 547 struct ila_map *ila;
514 int ret; 548 int ret;
515 549
516 rhashtable_walk_start(rhiter); 550 rhashtable_walk_start(rhiter);
517 551
518 for (;;) { 552 /* Get first entry */
519 ila = rhashtable_walk_next(rhiter); 553 ila = rhashtable_walk_peek(rhiter);
554
555 if (ila && !IS_ERR(ila) && skip) {
556 /* Skip over visited entries */
557
558 while (ila && skip) {
559 /* Skip over any ila entries in this list that we
560 * have already dumped.
561 */
562 ila = rcu_access_pointer(ila->next);
563 skip--;
564 }
565 }
520 566
567 skip = 0;
568
569 for (;;) {
521 if (IS_ERR(ila)) { 570 if (IS_ERR(ila)) {
522 if (PTR_ERR(ila) == -EAGAIN)
523 continue;
524 ret = PTR_ERR(ila); 571 ret = PTR_ERR(ila);
525 goto done; 572 if (ret == -EAGAIN) {
573 /* Table has changed and iter has reset. Return
574 * -EAGAIN to the application even if we have
575 * written data to the skb. The application
576 * needs to deal with this.
577 */
578
579 goto out_ret;
580 } else {
581 break;
582 }
526 } else if (!ila) { 583 } else if (!ila) {
584 ret = 0;
527 break; 585 break;
528 } 586 }
529 587
@@ -532,90 +590,54 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
532 cb->nlh->nlmsg_seq, NLM_F_MULTI, 590 cb->nlh->nlmsg_seq, NLM_F_MULTI,
533 skb, ILA_CMD_GET); 591 skb, ILA_CMD_GET);
534 if (ret) 592 if (ret)
535 goto done; 593 goto out;
536 594
595 skip++;
537 ila = rcu_access_pointer(ila->next); 596 ila = rcu_access_pointer(ila->next);
538 } 597 }
598
599 skip = 0;
600 ila = rhashtable_walk_next(rhiter);
539 } 601 }
540 602
541 ret = skb->len; 603out:
604 iter->skip = skip;
605 ret = (skb->len ? : ret);
542 606
543done: 607out_ret:
544 rhashtable_walk_stop(rhiter); 608 rhashtable_walk_stop(rhiter);
545 return ret; 609 return ret;
546} 610}
547 611
548static const struct genl_ops ila_nl_ops[] = {
549 {
550 .cmd = ILA_CMD_ADD,
551 .doit = ila_nl_cmd_add_mapping,
552 .policy = ila_nl_policy,
553 .flags = GENL_ADMIN_PERM,
554 },
555 {
556 .cmd = ILA_CMD_DEL,
557 .doit = ila_nl_cmd_del_mapping,
558 .policy = ila_nl_policy,
559 .flags = GENL_ADMIN_PERM,
560 },
561 {
562 .cmd = ILA_CMD_GET,
563 .doit = ila_nl_cmd_get_mapping,
564 .start = ila_nl_dump_start,
565 .dumpit = ila_nl_dump,
566 .done = ila_nl_dump_done,
567 .policy = ila_nl_policy,
568 },
569};
570
571static struct genl_family ila_nl_family __ro_after_init = {
572 .hdrsize = 0,
573 .name = ILA_GENL_NAME,
574 .version = ILA_GENL_VERSION,
575 .maxattr = ILA_ATTR_MAX,
576 .netnsok = true,
577 .parallel_ops = true,
578 .module = THIS_MODULE,
579 .ops = ila_nl_ops,
580 .n_ops = ARRAY_SIZE(ila_nl_ops),
581};
582
583#define ILA_HASH_TABLE_SIZE 1024 612#define ILA_HASH_TABLE_SIZE 1024
584 613
585static __net_init int ila_init_net(struct net *net) 614int ila_xlat_init_net(struct net *net)
586{ 615{
587 int err;
588 struct ila_net *ilan = net_generic(net, ila_net_id); 616 struct ila_net *ilan = net_generic(net, ila_net_id);
617 int err;
589 618
590 err = alloc_ila_locks(ilan); 619 err = alloc_ila_locks(ilan);
591 if (err) 620 if (err)
592 return err; 621 return err;
593 622
594 rhashtable_init(&ilan->rhash_table, &rht_params); 623 rhashtable_init(&ilan->xlat.rhash_table, &rht_params);
595 624
596 return 0; 625 return 0;
597} 626}
598 627
599static __net_exit void ila_exit_net(struct net *net) 628void ila_xlat_exit_net(struct net *net)
600{ 629{
601 struct ila_net *ilan = net_generic(net, ila_net_id); 630 struct ila_net *ilan = net_generic(net, ila_net_id);
602 631
603 rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL); 632 rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL);
604 633
605 kvfree(ilan->locks); 634 free_bucket_spinlocks(ilan->xlat.locks);
606 635
607 if (ilan->hooks_registered) 636 if (ilan->xlat.hooks_registered)
608 nf_unregister_net_hooks(net, ila_nf_hook_ops, 637 nf_unregister_net_hooks(net, ila_nf_hook_ops,
609 ARRAY_SIZE(ila_nf_hook_ops)); 638 ARRAY_SIZE(ila_nf_hook_ops));
610} 639}
611 640
612static struct pernet_operations ila_net_ops = {
613 .init = ila_init_net,
614 .exit = ila_exit_net,
615 .id = &ila_net_id,
616 .size = sizeof(struct ila_net),
617};
618
619static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) 641static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
620{ 642{
621 struct ila_map *ila; 643 struct ila_map *ila;
@@ -641,29 +663,3 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
641 663
642 return 0; 664 return 0;
643} 665}
644
645int __init ila_xlat_init(void)
646{
647 int ret;
648
649 ret = register_pernet_device(&ila_net_ops);
650 if (ret)
651 goto exit;
652
653 ret = genl_register_family(&ila_nl_family);
654 if (ret < 0)
655 goto unregister;
656
657 return 0;
658
659unregister:
660 unregister_pernet_device(&ila_net_ops);
661exit:
662 return ret;
663}
664
665void ila_xlat_fini(void)
666{
667 genl_unregister_family(&ila_nl_family);
668 unregister_pernet_device(&ila_net_ops);
669}
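
On the xlat side the interesting functional change is the dump path: ila_dump_iter grows a skip counter, so a dump that runs out of skb space mid-chain can resume by re-peeking the same rhashtable slot and stepping over the entries it already emitted, instead of restarting or dropping them. The user-space sketch below models that skip/resume idea on a plain linked list; the data structures and the two-entry "message budget" are toys.

	#include <stdio.h>

	struct node { int val; struct node *next; };

	struct dump_iter { struct node *chain; int skip; };

	/* Emit up to budget entries, remembering how far we got so a later
	 * call resumes in the middle of the chain.
	 */
	static int dump_some(struct dump_iter *iter, int budget)
	{
		struct node *n = iter->chain;
		int skip = iter->skip, emitted = 0;

		while (n && skip--)              /* step over entries already dumped */
			n = n->next;

		while (n && emitted < budget) {
			printf("entry %d\n", n->val);
			iter->skip++;
			emitted++;
			n = n->next;
		}
		return emitted;
	}

	int main(void)
	{
		struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
		struct dump_iter it = { &a, 0 };

		while (dump_some(&it, 2) > 0)    /* two entries per "netlink message" */
			;
		return 0;
	}
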
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 595ad408dba0..3d7c7460a0c5 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -191,7 +191,7 @@ struct sock *inet6_lookup_listener(struct net *net,
191 saddr, sport, daddr, hnum, 191 saddr, sport, daddr, hnum,
192 dif, sdif); 192 dif, sdif);
193 if (result) 193 if (result)
194 return result; 194 goto done;
195 195
196 /* Lookup lhash2 with in6addr_any */ 196 /* Lookup lhash2 with in6addr_any */
197 197
@@ -200,9 +200,10 @@ struct sock *inet6_lookup_listener(struct net *net,
200 if (ilb2->count > ilb->count) 200 if (ilb2->count > ilb->count)
201 goto port_lookup; 201 goto port_lookup;
202 202
203 return inet6_lhash2_lookup(net, ilb2, skb, doff, 203 result = inet6_lhash2_lookup(net, ilb2, skb, doff,
204 saddr, sport, daddr, hnum, 204 saddr, sport, daddr, hnum,
205 dif, sdif); 205 dif, sdif);
206 goto done;
206 207
207port_lookup: 208port_lookup:
208 sk_for_each(sk, &ilb->head) { 209 sk_for_each(sk, &ilb->head) {
@@ -214,12 +215,15 @@ port_lookup:
214 result = reuseport_select_sock(sk, phash, 215 result = reuseport_select_sock(sk, phash,
215 skb, doff); 216 skb, doff);
216 if (result) 217 if (result)
217 return result; 218 goto done;
218 } 219 }
219 result = sk; 220 result = sk;
220 hiscore = score; 221 hiscore = score;
221 } 222 }
222 } 223 }
224done:
225 if (unlikely(IS_ERR(result)))
226 return NULL;
223 return result; 227 return result;
224} 228}
225EXPORT_SYMBOL_GPL(inet6_lookup_listener); 229EXPORT_SYMBOL_GPL(inet6_lookup_listener);
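
The inet6_lookup_listener() fix routes every return through a single done: label that turns an ERR_PTR-encoded result from reuseport_select_sock() into NULL, so callers can never dereference an error pointer. A minimal user-space rendition of that convention is below, with ERR_PTR and IS_ERR re-implemented only for the demo.

	#include <stdio.h>
	#include <errno.h>
	#include <stdint.h>

	#define MAX_ERRNO 4095
	static void *ERR_PTR(long err) { return (void *)err; }
	static int IS_ERR(const void *p) { return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO; }

	struct sock { int id; };

	static struct sock *select_sock(int fail)
	{
		static struct sock s = { 42 };
		return fail ? ERR_PTR(-ENOMEM) : &s;
	}

	static struct sock *lookup_listener(int fail)
	{
		struct sock *result = select_sock(fail);

		/* single exit: never leak an error-encoded pointer to the caller */
		if (IS_ERR(result))
			return NULL;
		return result;
	}

	int main(void)
	{
		printf("%p %p\n", (void *)lookup_listener(0), (void *)lookup_listener(1));
		return 0;
	}
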
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1fb2f3118d60..c861a6d4671d 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -198,6 +198,8 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
198 } 198 }
199 } 199 }
200 200
201 lwtstate_put(f6i->fib6_nh.nh_lwtstate);
202
201 if (f6i->fib6_nh.nh_dev) 203 if (f6i->fib6_nh.nh_dev)
202 dev_put(f6i->fib6_nh.nh_dev); 204 dev_put(f6i->fib6_nh.nh_dev);
203 205
@@ -935,20 +937,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
935{ 937{
936 struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, 938 struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
937 lockdep_is_held(&rt->fib6_table->tb6_lock)); 939 lockdep_is_held(&rt->fib6_table->tb6_lock));
938 enum fib_event_type event = FIB_EVENT_ENTRY_ADD; 940 struct fib6_info *iter = NULL;
939 struct fib6_info *iter = NULL, *match = NULL;
940 struct fib6_info __rcu **ins; 941 struct fib6_info __rcu **ins;
942 struct fib6_info __rcu **fallback_ins = NULL;
941 int replace = (info->nlh && 943 int replace = (info->nlh &&
942 (info->nlh->nlmsg_flags & NLM_F_REPLACE)); 944 (info->nlh->nlmsg_flags & NLM_F_REPLACE));
943 int append = (info->nlh &&
944 (info->nlh->nlmsg_flags & NLM_F_APPEND));
945 int add = (!info->nlh || 945 int add = (!info->nlh ||
946 (info->nlh->nlmsg_flags & NLM_F_CREATE)); 946 (info->nlh->nlmsg_flags & NLM_F_CREATE));
947 int found = 0; 947 int found = 0;
948 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
948 u16 nlflags = NLM_F_EXCL; 949 u16 nlflags = NLM_F_EXCL;
949 int err; 950 int err;
950 951
951 if (append) 952 if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
952 nlflags |= NLM_F_APPEND; 953 nlflags |= NLM_F_APPEND;
953 954
954 ins = &fn->leaf; 955 ins = &fn->leaf;
@@ -970,8 +971,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
970 971
971 nlflags &= ~NLM_F_EXCL; 972 nlflags &= ~NLM_F_EXCL;
972 if (replace) { 973 if (replace) {
973 found++; 974 if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
974 break; 975 found++;
976 break;
977 }
978 if (rt_can_ecmp)
979 fallback_ins = fallback_ins ?: ins;
980 goto next_iter;
975 } 981 }
976 982
977 if (rt6_duplicate_nexthop(iter, rt)) { 983 if (rt6_duplicate_nexthop(iter, rt)) {
@@ -986,51 +992,71 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
986 fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); 992 fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
987 return -EEXIST; 993 return -EEXIST;
988 } 994 }
989 995 /* If we have the same destination and the same metric,
990 /* first route that matches */ 996 * but not the same gateway, then the route we try to
991 if (!match) 997 * add is sibling to this route, increment our counter
992 match = iter; 998 * of siblings, and later we will add our route to the
999 * list.
1000 * Only static routes (which don't have flag
1001 * RTF_EXPIRES) are used for ECMPv6.
1002 *
1003 * To avoid long list, we only had siblings if the
1004 * route have a gateway.
1005 */
1006 if (rt_can_ecmp &&
1007 rt6_qualify_for_ecmp(iter))
1008 rt->fib6_nsiblings++;
993 } 1009 }
994 1010
995 if (iter->fib6_metric > rt->fib6_metric) 1011 if (iter->fib6_metric > rt->fib6_metric)
996 break; 1012 break;
997 1013
1014next_iter:
998 ins = &iter->fib6_next; 1015 ins = &iter->fib6_next;
999 } 1016 }
1000 1017
1018 if (fallback_ins && !found) {
1019 /* No ECMP-able route found, replace first non-ECMP one */
1020 ins = fallback_ins;
1021 iter = rcu_dereference_protected(*ins,
1022 lockdep_is_held(&rt->fib6_table->tb6_lock));
1023 found++;
1024 }
1025
1001 /* Reset round-robin state, if necessary */ 1026 /* Reset round-robin state, if necessary */
1002 if (ins == &fn->leaf) 1027 if (ins == &fn->leaf)
1003 fn->rr_ptr = NULL; 1028 fn->rr_ptr = NULL;
1004 1029
1005 /* Link this route to others same route. */ 1030 /* Link this route to others same route. */
1006 if (append && match) { 1031 if (rt->fib6_nsiblings) {
1032 unsigned int fib6_nsiblings;
1007 struct fib6_info *sibling, *temp_sibling; 1033 struct fib6_info *sibling, *temp_sibling;
1008 1034
1009 if (rt->fib6_flags & RTF_REJECT) { 1035 /* Find the first route that have the same metric */
1010 NL_SET_ERR_MSG(extack, 1036 sibling = leaf;
1011 "Can not append a REJECT route"); 1037 while (sibling) {
1012 return -EINVAL; 1038 if (sibling->fib6_metric == rt->fib6_metric &&
1013 } else if (match->fib6_flags & RTF_REJECT) { 1039 rt6_qualify_for_ecmp(sibling)) {
1014 NL_SET_ERR_MSG(extack, 1040 list_add_tail(&rt->fib6_siblings,
1015 "Can not append to a REJECT route"); 1041 &sibling->fib6_siblings);
1016 return -EINVAL; 1042 break;
1043 }
1044 sibling = rcu_dereference_protected(sibling->fib6_next,
1045 lockdep_is_held(&rt->fib6_table->tb6_lock));
1017 } 1046 }
1018 event = FIB_EVENT_ENTRY_APPEND;
1019 rt->fib6_nsiblings = match->fib6_nsiblings;
1020 list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
1021 match->fib6_nsiblings++;
1022
1023 /* For each sibling in the list, increment the counter of 1047 /* For each sibling in the list, increment the counter of
1024 * siblings. BUG() if counters does not match, list of siblings 1048 * siblings. BUG() if counters does not match, list of siblings
1025 * is broken! 1049 * is broken!
1026 */ 1050 */
1051 fib6_nsiblings = 0;
1027 list_for_each_entry_safe(sibling, temp_sibling, 1052 list_for_each_entry_safe(sibling, temp_sibling,
1028 &match->fib6_siblings, fib6_siblings) { 1053 &rt->fib6_siblings, fib6_siblings) {
1029 sibling->fib6_nsiblings++; 1054 sibling->fib6_nsiblings++;
1030 BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings); 1055 BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
1056 fib6_nsiblings++;
1031 } 1057 }
1032 1058 BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
1033 rt6_multipath_rebalance(match); 1059 rt6_multipath_rebalance(temp_sibling);
1034 } 1060 }
1035 1061
1036 /* 1062 /*
@@ -1043,8 +1069,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
1043add: 1069add:
1044 nlflags |= NLM_F_CREATE; 1070 nlflags |= NLM_F_CREATE;
1045 1071
1046 err = call_fib6_entry_notifiers(info->nl_net, event, rt, 1072 err = call_fib6_entry_notifiers(info->nl_net,
1047 extack); 1073 FIB_EVENT_ENTRY_ADD,
1074 rt, extack);
1048 if (err) 1075 if (err)
1049 return err; 1076 return err;
1050 1077
@@ -1062,7 +1089,7 @@ add:
1062 } 1089 }
1063 1090
1064 } else { 1091 } else {
1065 struct fib6_info *tmp; 1092 int nsiblings;
1066 1093
1067 if (!found) { 1094 if (!found) {
1068 if (add) 1095 if (add)
@@ -1077,57 +1104,48 @@ add:
1077 if (err) 1104 if (err)
1078 return err; 1105 return err;
1079 1106
1080 /* if route being replaced has siblings, set tmp to
1081 * last one, otherwise tmp is current route. this is
1082 * used to set fib6_next for new route
1083 */
1084 if (iter->fib6_nsiblings)
1085 tmp = list_last_entry(&iter->fib6_siblings,
1086 struct fib6_info,
1087 fib6_siblings);
1088 else
1089 tmp = iter;
1090
1091 /* insert new route */
1092 atomic_inc(&rt->fib6_ref); 1107 atomic_inc(&rt->fib6_ref);
1093 rcu_assign_pointer(rt->fib6_node, fn); 1108 rcu_assign_pointer(rt->fib6_node, fn);
1094 rt->fib6_next = tmp->fib6_next; 1109 rt->fib6_next = iter->fib6_next;
1095 rcu_assign_pointer(*ins, rt); 1110 rcu_assign_pointer(*ins, rt);
1096
1097 if (!info->skip_notify) 1111 if (!info->skip_notify)
1098 inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); 1112 inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
1099 if (!(fn->fn_flags & RTN_RTINFO)) { 1113 if (!(fn->fn_flags & RTN_RTINFO)) {
1100 info->nl_net->ipv6.rt6_stats->fib_route_nodes++; 1114 info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1101 fn->fn_flags |= RTN_RTINFO; 1115 fn->fn_flags |= RTN_RTINFO;
1102 } 1116 }
1117 nsiblings = iter->fib6_nsiblings;
1118 iter->fib6_node = NULL;
1119 fib6_purge_rt(iter, fn, info->nl_net);
1120 if (rcu_access_pointer(fn->rr_ptr) == iter)
1121 fn->rr_ptr = NULL;
1122 fib6_info_release(iter);
1103 1123
1104 /* delete old route */ 1124 if (nsiblings) {
1105 rt = iter;
1106
1107 if (rt->fib6_nsiblings) {
1108 struct fib6_info *tmp;
1109
1110 /* Replacing an ECMP route, remove all siblings */ 1125 /* Replacing an ECMP route, remove all siblings */
1111 list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings, 1126 ins = &rt->fib6_next;
1112 fib6_siblings) { 1127 iter = rcu_dereference_protected(*ins,
1113 iter->fib6_node = NULL; 1128 lockdep_is_held(&rt->fib6_table->tb6_lock));
1114 fib6_purge_rt(iter, fn, info->nl_net); 1129 while (iter) {
1115 if (rcu_access_pointer(fn->rr_ptr) == iter) 1130 if (iter->fib6_metric > rt->fib6_metric)
1116 fn->rr_ptr = NULL; 1131 break;
1117 fib6_info_release(iter); 1132 if (rt6_qualify_for_ecmp(iter)) {
1118 1133 *ins = iter->fib6_next;
1119 rt->fib6_nsiblings--; 1134 iter->fib6_node = NULL;
1120 info->nl_net->ipv6.rt6_stats->fib_rt_entries--; 1135 fib6_purge_rt(iter, fn, info->nl_net);
1136 if (rcu_access_pointer(fn->rr_ptr) == iter)
1137 fn->rr_ptr = NULL;
1138 fib6_info_release(iter);
1139 nsiblings--;
1140 info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
1141 } else {
1142 ins = &iter->fib6_next;
1143 }
1144 iter = rcu_dereference_protected(*ins,
1145 lockdep_is_held(&rt->fib6_table->tb6_lock));
1121 } 1146 }
1147 WARN_ON(nsiblings != 0);
1122 } 1148 }
1123
1124 WARN_ON(rt->fib6_nsiblings != 0);
1125
1126 rt->fib6_node = NULL;
1127 fib6_purge_rt(rt, fn, info->nl_net);
1128 if (rcu_access_pointer(fn->rr_ptr) == rt)
1129 fn->rr_ptr = NULL;
1130 fib6_info_release(rt);
1131 } 1149 }
1132 1150
1133 return 0; 1151 return 0;
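
The fib6_add_rt2node() change restores the earlier ECMP semantics: siblings are counted while walking routes of the same metric, NLM_F_REPLACE prefers an existing route whose ECMP eligibility matches the new one and otherwise falls back to the first non-ECMP entry it saw, and replacing an ECMP route purges all of its siblings. The sketch below is a deliberately simplified model of just the slot-selection part; real routes carry far more state than a metric and an ECMP flag.

	#include <stdio.h>
	#include <stdbool.h>

	struct route { int metric; bool ecmp; };

	/* Return the index to replace: an ECMP-compatible match if one exists,
	 * otherwise the first non-ECMP slot remembered as a fallback, else -1.
	 */
	static int pick_replace_slot(const struct route *tbl, int n,
				     const struct route *rt)
	{
		int fallback = -1;

		for (int i = 0; i < n; i++) {
			if (tbl[i].metric != rt->metric)
				continue;
			if (tbl[i].ecmp == rt->ecmp)
				return i;               /* exact ECMP-compatible match */
			if (rt->ecmp && fallback < 0)
				fallback = i;           /* remember first non-ECMP slot */
		}
		return fallback;
	}

	int main(void)
	{
		struct route tbl[] = { { 100, false }, { 100, false }, { 200, true } };
		struct route rt = { 100, true };

		printf("replace slot %d\n", pick_replace_slot(tbl, 3, &rt));
		return 0;
	}
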
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 3eee7637bdfe..cb54a8a3c273 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -373,7 +373,6 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
373 if (olen > 0) { 373 if (olen > 0) {
374 struct msghdr msg; 374 struct msghdr msg;
375 struct flowi6 flowi6; 375 struct flowi6 flowi6;
376 struct sockcm_cookie sockc_junk;
377 struct ipcm6_cookie ipc6; 376 struct ipcm6_cookie ipc6;
378 377
379 err = -ENOMEM; 378 err = -ENOMEM;
@@ -392,7 +391,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
392 memset(&flowi6, 0, sizeof(flowi6)); 391 memset(&flowi6, 0, sizeof(flowi6));
393 392
394 ipc6.opt = fl->opt; 393 ipc6.opt = fl->opt;
395 err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6, &sockc_junk); 394 err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6);
396 if (err) 395 if (err)
397 goto done; 396 goto done;
398 err = -EINVAL; 397 err = -EINVAL;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index c8cf2fdbb13b..18a3794b0f52 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -927,7 +927,6 @@ tx_err:
927static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, 927static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
928 struct net_device *dev) 928 struct net_device *dev)
929{ 929{
930 struct ipv6hdr *ipv6h = ipv6_hdr(skb);
931 struct ip6_tnl *t = netdev_priv(dev); 930 struct ip6_tnl *t = netdev_priv(dev);
932 struct dst_entry *dst = skb_dst(skb); 931 struct dst_entry *dst = skb_dst(skb);
933 struct net_device_stats *stats; 932 struct net_device_stats *stats;
@@ -990,6 +989,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
990 fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); 989 fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
991 990
992 dsfield = key->tos; 991 dsfield = key->tos;
992 if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
993 goto tx_err;
993 md = ip_tunnel_info_opts(tun_info); 994 md = ip_tunnel_info_opts(tun_info);
994 if (!md) 995 if (!md)
995 goto tx_err; 996 goto tx_err;
@@ -1010,6 +1011,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
1010 goto tx_err; 1011 goto tx_err;
1011 } 1012 }
1012 } else { 1013 } else {
1014 struct ipv6hdr *ipv6h = ipv6_hdr(skb);
1015
1013 switch (skb->protocol) { 1016 switch (skb->protocol) {
1014 case htons(ETH_P_IP): 1017 case htons(ETH_P_IP):
1015 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 1018 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1126,7 +1129,7 @@ static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu,
1126 return; 1129 return;
1127 1130
1128 if (rt->dst.dev) { 1131 if (rt->dst.dev) {
1129 dev->hard_header_len = rt->dst.dev->hard_header_len + 1132 dev->needed_headroom = rt->dst.dev->hard_header_len +
1130 t_hlen; 1133 t_hlen;
1131 1134
1132 if (set_mtu) { 1135 if (set_mtu) {
@@ -1152,7 +1155,7 @@ static int ip6gre_calc_hlen(struct ip6_tnl *tunnel)
1152 tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; 1155 tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
1153 1156
1154 t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); 1157 t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
1155 tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; 1158 tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
1156 return t_hlen; 1159 return t_hlen;
1157} 1160}
1158 1161
@@ -1822,7 +1825,7 @@ static int ip6erspan_calc_hlen(struct ip6_tnl *tunnel)
1822 erspan_hdr_len(tunnel->parms.erspan_ver); 1825 erspan_hdr_len(tunnel->parms.erspan_ver);
1823 1826
1824 t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); 1827 t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
1825 tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; 1828 tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
1826 return t_hlen; 1829 return t_hlen;
1827} 1830}
1828 1831
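
ip6gre and ip6erspan now advertise their encapsulation overhead through needed_headroom instead of inflating hard_header_len, so the stack reserves room for the tunnel and outer IPv6 headers without the device claiming a larger link-layer header. The toy program below only illustrates why that headroom is reserved up front; the sizes are illustrative and nothing here is the kernel API.

	#include <stdio.h>
	#include <string.h>

	#define TUNNEL_HLEN	8      /* e.g. a GRE header */
	#define IPV6_HLEN	40

	int main(void)
	{
		char buf[256];
		size_t headroom = TUNNEL_HLEN + IPV6_HLEN;
		char *data = buf + headroom;          /* payload starts after headroom */

		strcpy(data, "payload");
		data -= IPV6_HLEN;                    /* "push" outer IPv6 header */
		memset(data, 0, IPV6_HLEN);
		data -= TUNNEL_HLEN;                  /* "push" tunnel header */
		memset(data, 0, TUNNEL_HLEN);

		printf("headroom left: %zu bytes\n", (size_t)(data - buf));
		return 0;
	}
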
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index f08d34491ece..6242682be876 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -47,17 +47,11 @@
47#include <net/inet_ecn.h> 47#include <net/inet_ecn.h>
48#include <net/dst_metadata.h> 48#include <net/dst_metadata.h>
49 49
50int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 50static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
51 struct sk_buff *skb)
51{ 52{
52 void (*edemux)(struct sk_buff *skb); 53 void (*edemux)(struct sk_buff *skb);
53 54
54 /* if ingress device is enslaved to an L3 master device pass the
55 * skb to its handler for processing
56 */
57 skb = l3mdev_ip6_rcv(skb);
58 if (!skb)
59 return NET_RX_SUCCESS;
60
61 if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { 55 if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
62 const struct inet6_protocol *ipprot; 56 const struct inet6_protocol *ipprot;
63 57
@@ -67,20 +61,73 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
67 } 61 }
68 if (!skb_valid_dst(skb)) 62 if (!skb_valid_dst(skb))
69 ip6_route_input(skb); 63 ip6_route_input(skb);
64}
65
66int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
67{
68 /* if ingress device is enslaved to an L3 master device pass the
69 * skb to its handler for processing
70 */
71 skb = l3mdev_ip6_rcv(skb);
72 if (!skb)
73 return NET_RX_SUCCESS;
74 ip6_rcv_finish_core(net, sk, skb);
70 75
71 return dst_input(skb); 76 return dst_input(skb);
72} 77}
73 78
74int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) 79static void ip6_sublist_rcv_finish(struct list_head *head)
80{
81 struct sk_buff *skb, *next;
82
83 list_for_each_entry_safe(skb, next, head, list)
84 dst_input(skb);
85}
86
87static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
88 struct list_head *head)
89{
90 struct dst_entry *curr_dst = NULL;
91 struct sk_buff *skb, *next;
92 struct list_head sublist;
93
94 INIT_LIST_HEAD(&sublist);
95 list_for_each_entry_safe(skb, next, head, list) {
96 struct dst_entry *dst;
97
98 list_del(&skb->list);
99 /* if ingress device is enslaved to an L3 master device pass the
100 * skb to its handler for processing
101 */
102 skb = l3mdev_ip6_rcv(skb);
103 if (!skb)
104 continue;
105 ip6_rcv_finish_core(net, sk, skb);
106 dst = skb_dst(skb);
107 if (curr_dst != dst) {
108 /* dispatch old sublist */
109 if (!list_empty(&sublist))
110 ip6_sublist_rcv_finish(&sublist);
111 /* start new sublist */
112 INIT_LIST_HEAD(&sublist);
113 curr_dst = dst;
114 }
115 list_add_tail(&skb->list, &sublist);
116 }
117 /* dispatch final sublist */
118 ip6_sublist_rcv_finish(&sublist);
119}
120
121static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
122 struct net *net)
75{ 123{
76 const struct ipv6hdr *hdr; 124 const struct ipv6hdr *hdr;
77 u32 pkt_len; 125 u32 pkt_len;
78 struct inet6_dev *idev; 126 struct inet6_dev *idev;
79 struct net *net = dev_net(skb->dev);
80 127
81 if (skb->pkt_type == PACKET_OTHERHOST) { 128 if (skb->pkt_type == PACKET_OTHERHOST) {
82 kfree_skb(skb); 129 kfree_skb(skb);
83 return NET_RX_DROP; 130 return NULL;
84 } 131 }
85 132
86 rcu_read_lock(); 133 rcu_read_lock();
@@ -196,7 +243,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
196 if (ipv6_parse_hopopts(skb) < 0) { 243 if (ipv6_parse_hopopts(skb) < 0) {
197 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); 244 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
198 rcu_read_unlock(); 245 rcu_read_unlock();
199 return NET_RX_DROP; 246 return NULL;
200 } 247 }
201 } 248 }
202 249
@@ -205,15 +252,67 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
205 /* Must drop socket now because of tproxy. */ 252 /* Must drop socket now because of tproxy. */
206 skb_orphan(skb); 253 skb_orphan(skb);
207 254
208 return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, 255 return skb;
209 net, NULL, skb, dev, NULL,
210 ip6_rcv_finish);
211err: 256err:
212 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); 257 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
213drop: 258drop:
214 rcu_read_unlock(); 259 rcu_read_unlock();
215 kfree_skb(skb); 260 kfree_skb(skb);
216 return NET_RX_DROP; 261 return NULL;
262}
263
264int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
265{
266 struct net *net = dev_net(skb->dev);
267
268 skb = ip6_rcv_core(skb, dev, net);
269 if (skb == NULL)
270 return NET_RX_DROP;
271 return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
272 net, NULL, skb, dev, NULL,
273 ip6_rcv_finish);
274}
275
276static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev,
277 struct net *net)
278{
279 NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL,
280 head, dev, NULL, ip6_rcv_finish);
281 ip6_list_rcv_finish(net, NULL, head);
282}
283
284/* Receive a list of IPv6 packets */
285void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
286 struct net_device *orig_dev)
287{
288 struct net_device *curr_dev = NULL;
289 struct net *curr_net = NULL;
290 struct sk_buff *skb, *next;
291 struct list_head sublist;
292
293 INIT_LIST_HEAD(&sublist);
294 list_for_each_entry_safe(skb, next, head, list) {
295 struct net_device *dev = skb->dev;
296 struct net *net = dev_net(dev);
297
298 list_del(&skb->list);
299 skb = ip6_rcv_core(skb, dev, net);
300 if (skb == NULL)
301 continue;
302
303 if (curr_dev != dev || curr_net != net) {
304 /* dispatch old sublist */
305 if (!list_empty(&sublist))
306 ip6_sublist_rcv(&sublist, curr_dev, curr_net);
307 /* start new sublist */
308 INIT_LIST_HEAD(&sublist);
309 curr_dev = dev;
310 curr_net = net;
311 }
312 list_add_tail(&skb->list, &sublist);
313 }
314 /* dispatch final sublist */
315 ip6_sublist_rcv(&sublist, curr_dev, curr_net);
217} 316}
218 317
219/* 318/*
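
ip6_input.c gains a listified receive path: ipv6_list_rcv() validates each packet with ip6_rcv_core(), then groups consecutive packets that share a device and netns (and, after the netfilter hook, a dst) into sublists, pushing each sublist through the next stage in one call so the per-packet indirect-call cost is amortised. The user-space sketch below shows the grouping pattern, with an int key standing in for the device or dst.

	#include <stdio.h>

	struct pkt { int key; const char *name; };

	static void dispatch_sublist(struct pkt **sub, int n)
	{
		printf("dispatching %d packet(s) for key %d\n", n, sub[0]->key);
	}

	/* Batch consecutive packets that share a key and dispatch each batch
	 * in a single call, the way ip6_list_rcv_finish() dispatches sublists.
	 */
	static void list_rcv(struct pkt *pkts, int n)
	{
		struct pkt *sub[16];
		int cur_key = -1, cnt = 0;

		for (int i = 0; i < n; i++) {
			if (pkts[i].key != cur_key) {
				if (cnt)
					dispatch_sublist(sub, cnt);   /* flush old sublist */
				cur_key = pkts[i].key;
				cnt = 0;
			}
			sub[cnt++] = &pkts[i];
		}
		if (cnt)
			dispatch_sublist(sub, cnt);                   /* final sublist */
	}

	int main(void)
	{
		struct pkt pkts[] = { {1, "a"}, {1, "b"}, {2, "c"}, {1, "d"} };
		list_rcv(pkts, 4);
		return 0;
	}
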
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 5b3f2f89ef41..37ff4805b20c 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -163,11 +163,11 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph,
163 return len; 163 return len;
164} 164}
165 165
166static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, 166static struct sk_buff *ipv6_gro_receive(struct list_head *head,
167 struct sk_buff *skb) 167 struct sk_buff *skb)
168{ 168{
169 const struct net_offload *ops; 169 const struct net_offload *ops;
170 struct sk_buff **pp = NULL; 170 struct sk_buff *pp = NULL;
171 struct sk_buff *p; 171 struct sk_buff *p;
172 struct ipv6hdr *iph; 172 struct ipv6hdr *iph;
173 unsigned int nlen; 173 unsigned int nlen;
@@ -214,7 +214,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
214 flush--; 214 flush--;
215 nlen = skb_network_header_len(skb); 215 nlen = skb_network_header_len(skb);
216 216
217 for (p = *head; p; p = p->next) { 217 list_for_each_entry(p, head, list) {
218 const struct ipv6hdr *iph2; 218 const struct ipv6hdr *iph2;
219 __be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */ 219 __be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */
220 220
@@ -263,8 +263,8 @@ out:
263 return pp; 263 return pp;
264} 264}
265 265
266static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, 266static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head,
267 struct sk_buff *skb) 267 struct sk_buff *skb)
268{ 268{
269 /* Common GRO receive for SIT and IP6IP6 */ 269 /* Common GRO receive for SIT and IP6IP6 */
270 270
@@ -278,8 +278,8 @@ static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
278 return ipv6_gro_receive(head, skb); 278 return ipv6_gro_receive(head, skb);
279} 279}
280 280
281static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head, 281static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
282 struct sk_buff *skb) 282 struct sk_buff *skb)
283{ 283{
284 /* Common GRO receive for SIT and IP6IP6 */ 284 /* Common GRO receive for SIT and IP6IP6 */
285 285
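
The ip6_offload.c hunks change the GRO receive entry points from a hand-rolled chain of struct sk_buff pointers ("for (p = *head; p; p = p->next)") to an intrusive list_head walked with list_for_each_entry(). The sketch below reimplements just enough of that intrusive-list idiom in user space to show the iteration; the node and pkt structures and helper names are illustrative, not the kernel's.

    /* Minimal intrusive list plus the container_of-based iteration that
     * replaces the old next-pointer walk in ipv6_gro_receive().
     */
    #include <stdio.h>
    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    #define list_for_each_entry(pos, head, member)                        \
        for (pos = container_of((head)->next, typeof(*pos), member);      \
             &pos->member != (head);                                      \
             pos = container_of(pos->member.next, typeof(*pos), member))

    static void list_init(struct list_head *h) { h->next = h->prev = h; }

    static void list_add_tail(struct list_head *n, struct list_head *h)
    {
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
    }

    struct pkt {
        int id;
        struct list_head list;  /* embedded node, like sk_buff::list */
    };

    int main(void)
    {
        struct list_head head;
        struct pkt p[3] = { { .id = 1 }, { .id = 2 }, { .id = 3 } };
        struct pkt *pos;

        list_init(&head);
        for (int i = 0; i < 3; i++)
            list_add_tail(&p[i].list, &head);

        list_for_each_entry(pos, &head, list)
            printf("held packet %d\n", pos->id);
        return 0;
    }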
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a14fb4fcdf18..16f200f06500 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -570,6 +570,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
570 to->dev = from->dev; 570 to->dev = from->dev;
571 to->mark = from->mark; 571 to->mark = from->mark;
572 572
573 skb_copy_hash(to, from);
574
573#ifdef CONFIG_NET_SCHED 575#ifdef CONFIG_NET_SCHED
574 to->tc_index = from->tc_index; 576 to->tc_index = from->tc_index;
575#endif 577#endif
@@ -1219,13 +1221,16 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1219 if (mtu < IPV6_MIN_MTU) 1221 if (mtu < IPV6_MIN_MTU)
1220 return -EINVAL; 1222 return -EINVAL;
1221 cork->base.fragsize = mtu; 1223 cork->base.fragsize = mtu;
1222 cork->base.gso_size = sk->sk_type == SOCK_DGRAM && 1224 cork->base.gso_size = ipc6->gso_size;
1223 sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0; 1225 cork->base.tx_flags = 0;
1226 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1224 1227
1225 if (dst_allfrag(xfrm_dst_path(&rt->dst))) 1228 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1226 cork->base.flags |= IPCORK_ALLFRAG; 1229 cork->base.flags |= IPCORK_ALLFRAG;
1227 cork->base.length = 0; 1230 cork->base.length = 0;
1228 1231
1232 cork->base.transmit_time = ipc6->sockc.transmit_time;
1233
1229 return 0; 1234 return 0;
1230} 1235}
1231 1236
@@ -1238,8 +1243,7 @@ static int __ip6_append_data(struct sock *sk,
1238 int getfrag(void *from, char *to, int offset, 1243 int getfrag(void *from, char *to, int offset,
1239 int len, int odd, struct sk_buff *skb), 1244 int len, int odd, struct sk_buff *skb),
1240 void *from, int length, int transhdrlen, 1245 void *from, int length, int transhdrlen,
1241 unsigned int flags, struct ipcm6_cookie *ipc6, 1246 unsigned int flags, struct ipcm6_cookie *ipc6)
1242 const struct sockcm_cookie *sockc)
1243{ 1247{
1244 struct sk_buff *skb, *skb_prev = NULL; 1248 struct sk_buff *skb, *skb_prev = NULL;
1245 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; 1249 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
@@ -1249,7 +1253,6 @@ static int __ip6_append_data(struct sock *sk,
1249 int copy; 1253 int copy;
1250 int err; 1254 int err;
1251 int offset = 0; 1255 int offset = 0;
1252 __u8 tx_flags = 0;
1253 u32 tskey = 0; 1256 u32 tskey = 0;
1254 struct rt6_info *rt = (struct rt6_info *)cork->dst; 1257 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1255 struct ipv6_txoptions *opt = v6_cork->opt; 1258 struct ipv6_txoptions *opt = v6_cork->opt;
@@ -1268,6 +1271,10 @@ static int __ip6_append_data(struct sock *sk,
1268 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; 1271 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1269 orig_mtu = mtu; 1272 orig_mtu = mtu;
1270 1273
1274 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1275 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1276 tskey = sk->sk_tskey++;
1277
1271 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1278 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1272 1279
1273 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + 1280 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
@@ -1317,13 +1324,6 @@ emsgsize:
1317 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1324 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1318 csummode = CHECKSUM_PARTIAL; 1325 csummode = CHECKSUM_PARTIAL;
1319 1326
1320 if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1321 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1322 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1323 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1324 tskey = sk->sk_tskey++;
1325 }
1326
1327 /* 1327 /*
1328 * Let's try using as much space as possible. 1328 * Let's try using as much space as possible.
1329 * Use MTU if total length of the message fits into the MTU. 1329 * Use MTU if total length of the message fits into the MTU.
@@ -1442,8 +1442,8 @@ alloc_new_skb:
1442 dst_exthdrlen); 1442 dst_exthdrlen);
1443 1443
1444 /* Only the initial fragment is time stamped */ 1444 /* Only the initial fragment is time stamped */
1445 skb_shinfo(skb)->tx_flags = tx_flags; 1445 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1446 tx_flags = 0; 1446 cork->tx_flags = 0;
1447 skb_shinfo(skb)->tskey = tskey; 1447 skb_shinfo(skb)->tskey = tskey;
1448 tskey = 0; 1448 tskey = 0;
1449 1449
@@ -1560,8 +1560,7 @@ int ip6_append_data(struct sock *sk,
1560 int odd, struct sk_buff *skb), 1560 int odd, struct sk_buff *skb),
1561 void *from, int length, int transhdrlen, 1561 void *from, int length, int transhdrlen,
1562 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1562 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1563 struct rt6_info *rt, unsigned int flags, 1563 struct rt6_info *rt, unsigned int flags)
1564 const struct sockcm_cookie *sockc)
1565{ 1564{
1566 struct inet_sock *inet = inet_sk(sk); 1565 struct inet_sock *inet = inet_sk(sk);
1567 struct ipv6_pinfo *np = inet6_sk(sk); 1566 struct ipv6_pinfo *np = inet6_sk(sk);
@@ -1589,7 +1588,7 @@ int ip6_append_data(struct sock *sk,
1589 1588
1590 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base, 1589 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1591 &np->cork, sk_page_frag(sk), getfrag, 1590 &np->cork, sk_page_frag(sk), getfrag,
1592 from, length, transhdrlen, flags, ipc6, sockc); 1591 from, length, transhdrlen, flags, ipc6);
1593} 1592}
1594EXPORT_SYMBOL_GPL(ip6_append_data); 1593EXPORT_SYMBOL_GPL(ip6_append_data);
1595 1594
@@ -1673,6 +1672,8 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
1673 skb->priority = sk->sk_priority; 1672 skb->priority = sk->sk_priority;
1674 skb->mark = sk->sk_mark; 1673 skb->mark = sk->sk_mark;
1675 1674
1675 skb->tstamp = cork->base.transmit_time;
1676
1676 skb_dst_set(skb, dst_clone(&rt->dst)); 1677 skb_dst_set(skb, dst_clone(&rt->dst));
1677 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); 1678 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1678 if (proto == IPPROTO_ICMPV6) { 1679 if (proto == IPPROTO_ICMPV6) {
@@ -1747,8 +1748,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
1747 void *from, int length, int transhdrlen, 1748 void *from, int length, int transhdrlen,
1748 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1749 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1749 struct rt6_info *rt, unsigned int flags, 1750 struct rt6_info *rt, unsigned int flags,
1750 struct inet_cork_full *cork, 1751 struct inet_cork_full *cork)
1751 const struct sockcm_cookie *sockc)
1752{ 1752{
1753 struct inet6_cork v6_cork; 1753 struct inet6_cork v6_cork;
1754 struct sk_buff_head queue; 1754 struct sk_buff_head queue;
@@ -1776,7 +1776,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
1776 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork, 1776 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1777 &current->task_frag, getfrag, from, 1777 &current->task_frag, getfrag, from,
1778 length + exthdrlen, transhdrlen + exthdrlen, 1778 length + exthdrlen, transhdrlen + exthdrlen,
1779 flags, ipc6, sockc); 1779 flags, ipc6);
1780 if (err) { 1780 if (err) {
1781 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); 1781 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1782 return ERR_PTR(err); 1782 return ERR_PTR(err);
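
The ip6_output.c hunks move the TX timestamp setup (sock_tx_timestamp(), the tskey for SOF_TIMESTAMPING_OPT_ID) from __ip6_append_data() into ip6_setup_cork(), drop the separate sockcm_cookie parameter, and carry a transmit_time in the cork; only the first fragment still receives the tx_flags/tskey. From user space the interface is unchanged; the example below is an application-side illustration of requesting software TX timestamps on an IPv6 UDP socket, with the fallback define assumed to match the common asm-generic value.

    /* Application-side illustration only: ask for software TX timestamps
     * with per-send ids. The kernel change above only moves where these
     * per-send flags are captured (into the cork), not this API.
     */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <linux/net_tstamp.h>

    #ifndef SO_TIMESTAMPING
    #define SO_TIMESTAMPING 37      /* asm-generic value; assumption for old headers */
    #endif

    int main(void)
    {
        unsigned int flags = SOF_TIMESTAMPING_TX_SOFTWARE |
                             SOF_TIMESTAMPING_SOFTWARE |
                             SOF_TIMESTAMPING_OPT_ID;
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);

        if (fd < 0) {
            perror("socket");
            return 1;
        }
        /* every sendmsg() after this carries tx_flags/tskey derived from
         * these socket flags; completions arrive on the error queue */
        if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
                       &flags, sizeof(flags)) < 0)
            perror("setsockopt(SO_TIMESTAMPING)");

        close(fd);
        return 0;
    }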
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 00e138a44cbb..5df2a58d945c 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1113,7 +1113,7 @@ route_lookup:
1113 dst = NULL; 1113 dst = NULL;
1114 goto tx_err_link_failure; 1114 goto tx_err_link_failure;
1115 } 1115 }
1116 if (t->parms.collect_md && 1116 if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) &&
1117 ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, 1117 ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
1118 &fl6->daddr, 0, &fl6->saddr)) 1118 &fl6->daddr, 0, &fl6->saddr))
1119 goto tx_err_link_failure; 1119 goto tx_err_link_failure;
@@ -1133,12 +1133,8 @@ route_lookup:
1133 max_headroom += 8; 1133 max_headroom += 8;
1134 mtu -= 8; 1134 mtu -= 8;
1135 } 1135 }
1136 if (skb->protocol == htons(ETH_P_IPV6)) { 1136 mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ?
1137 if (mtu < IPV6_MIN_MTU) 1137 IPV6_MIN_MTU : IPV4_MIN_MTU);
1138 mtu = IPV6_MIN_MTU;
1139 } else if (mtu < 576) {
1140 mtu = 576;
1141 }
1142 1138
1143 skb_dst_update_pmtu(skb, mtu); 1139 skb_dst_update_pmtu(skb, mtu);
1144 if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { 1140 if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) {
@@ -1255,6 +1251,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
1255 key = &tun_info->key; 1251 key = &tun_info->key;
1256 memset(&fl6, 0, sizeof(fl6)); 1252 memset(&fl6, 0, sizeof(fl6));
1257 fl6.flowi6_proto = IPPROTO_IPIP; 1253 fl6.flowi6_proto = IPPROTO_IPIP;
1254 fl6.saddr = key->u.ipv6.src;
1258 fl6.daddr = key->u.ipv6.dst; 1255 fl6.daddr = key->u.ipv6.dst;
1259 fl6.flowlabel = key->label; 1256 fl6.flowlabel = key->label;
1260 dsfield = key->tos; 1257 dsfield = key->tos;
@@ -1326,6 +1323,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
1326 key = &tun_info->key; 1323 key = &tun_info->key;
1327 memset(&fl6, 0, sizeof(fl6)); 1324 memset(&fl6, 0, sizeof(fl6));
1328 fl6.flowi6_proto = IPPROTO_IPV6; 1325 fl6.flowi6_proto = IPPROTO_IPV6;
1326 fl6.saddr = key->u.ipv6.src;
1329 fl6.daddr = key->u.ipv6.dst; 1327 fl6.daddr = key->u.ipv6.dst;
1330 fl6.flowlabel = key->label; 1328 fl6.flowlabel = key->label;
1331 dsfield = key->tos; 1329 dsfield = key->tos;
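
In the ip6_tunnel.c hunk the per-protocol if/else chain that raised the tunnel MTU collapses to a single lower bound: IPV6_MIN_MTU for IPv6 payloads and IPV4_MIN_MTU (68, rather than the previous 576) otherwise. A stand-alone sketch of the clamp, with the constants mirroring the kernel definitions:

    /* The MTU floor applied before skb_dst_update_pmtu() above. */
    #include <stdio.h>

    #define IPV4_MIN_MTU    68      /* RFC 791 minimum */
    #define IPV6_MIN_MTU    1280    /* RFC 8200 minimum */

    static unsigned int clamp_tunnel_mtu(unsigned int mtu, int payload_is_ipv6)
    {
        unsigned int floor = payload_is_ipv6 ? IPV6_MIN_MTU : IPV4_MIN_MTU;

        return mtu > floor ? mtu : floor;   /* max(mtu, floor) */
    }

    int main(void)
    {
        printf("%u\n", clamp_tunnel_mtu(1000, 1)); /* 1280: raised to the IPv6 floor */
        printf("%u\n", clamp_tunnel_mtu(1000, 0)); /* 1000: already above the IPv4 floor */
        return 0;
    }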
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index b7f28deddaea..5095367c7204 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -480,10 +480,6 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
480 goto tx_err_dst_release; 480 goto tx_err_dst_release;
481 } 481 }
482 482
483 skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
484 skb_dst_set(skb, dst);
485 skb->dev = skb_dst(skb)->dev;
486
487 mtu = dst_mtu(dst); 483 mtu = dst_mtu(dst);
488 if (!skb->ignore_df && skb->len > mtu) { 484 if (!skb->ignore_df && skb->len > mtu) {
489 skb_dst_update_pmtu(skb, mtu); 485 skb_dst_update_pmtu(skb, mtu);
@@ -498,21 +494,18 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
498 htonl(mtu)); 494 htonl(mtu));
499 } 495 }
500 496
501 return -EMSGSIZE; 497 err = -EMSGSIZE;
498 goto tx_err_dst_release;
502 } 499 }
503 500
504 err = dst_output(t->net, skb->sk, skb); 501 skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
505 if (net_xmit_eval(err) == 0) { 502 skb_dst_set(skb, dst);
506 struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); 503 skb->dev = skb_dst(skb)->dev;
507 504
508 u64_stats_update_begin(&tstats->syncp); 505 err = dst_output(t->net, skb->sk, skb);
509 tstats->tx_bytes += pkt_len; 506 if (net_xmit_eval(err) == 0)
510 tstats->tx_packets++; 507 err = pkt_len;
511 u64_stats_update_end(&tstats->syncp); 508 iptunnel_xmit_stats(dev, err);
512 } else {
513 stats->tx_errors++;
514 stats->tx_aborted_errors++;
515 }
516 509
517 return 0; 510 return 0;
518tx_err_link_failure: 511tx_err_link_failure:
@@ -1101,7 +1094,8 @@ static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n,
1101 } 1094 }
1102 1095
1103 t = rtnl_dereference(ip6n->tnls_wc[0]); 1096 t = rtnl_dereference(ip6n->tnls_wc[0]);
1104 unregister_netdevice_queue(t->dev, list); 1097 if (t)
1098 unregister_netdevice_queue(t->dev, list);
1105} 1099}
1106 1100
1107static int __net_init vti6_init_net(struct net *net) 1101static int __net_init vti6_init_net(struct net *net)
@@ -1113,6 +1107,8 @@ static int __net_init vti6_init_net(struct net *net)
1113 ip6n->tnls[0] = ip6n->tnls_wc; 1107 ip6n->tnls[0] = ip6n->tnls_wc;
1114 ip6n->tnls[1] = ip6n->tnls_r_l; 1108 ip6n->tnls[1] = ip6n->tnls_r_l;
1115 1109
1110 if (!net_has_fallback_tunnels(net))
1111 return 0;
1116 err = -ENOMEM; 1112 err = -ENOMEM;
1117 ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0", 1113 ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0",
1118 NET_NAME_UNKNOWN, vti6_dev_setup); 1114 NET_NAME_UNKNOWN, vti6_dev_setup);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 0d0f0053bb11..d0b7e0249c13 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -32,6 +32,7 @@
32#include <linux/seq_file.h> 32#include <linux/seq_file.h>
33#include <linux/init.h> 33#include <linux/init.h>
34#include <linux/compat.h> 34#include <linux/compat.h>
35#include <linux/rhashtable.h>
35#include <net/protocol.h> 36#include <net/protocol.h>
36#include <linux/skbuff.h> 37#include <linux/skbuff.h>
37#include <net/raw.h> 38#include <net/raw.h>
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 4d780c7f0130..c0cac9cc3a28 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -398,6 +398,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
398 case IPV6_DSTOPTS: 398 case IPV6_DSTOPTS:
399 { 399 {
400 struct ipv6_txoptions *opt; 400 struct ipv6_txoptions *opt;
401 struct ipv6_opt_hdr *new = NULL;
402
403 /* hop-by-hop / destination options are privileged option */
404 retv = -EPERM;
405 if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
406 break;
401 407
402 /* remove any sticky options header with a zero option 408 /* remove any sticky options header with a zero option
403 * length, per RFC3542. 409 * length, per RFC3542.
@@ -409,17 +415,22 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
409 else if (optlen < sizeof(struct ipv6_opt_hdr) || 415 else if (optlen < sizeof(struct ipv6_opt_hdr) ||
410 optlen & 0x7 || optlen > 8 * 255) 416 optlen & 0x7 || optlen > 8 * 255)
411 goto e_inval; 417 goto e_inval;
412 418 else {
413 /* hop-by-hop / destination options are privileged option */ 419 new = memdup_user(optval, optlen);
414 retv = -EPERM; 420 if (IS_ERR(new)) {
415 if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) 421 retv = PTR_ERR(new);
416 break; 422 break;
423 }
424 if (unlikely(ipv6_optlen(new) > optlen)) {
425 kfree(new);
426 goto e_inval;
427 }
428 }
417 429
418 opt = rcu_dereference_protected(np->opt, 430 opt = rcu_dereference_protected(np->opt,
419 lockdep_sock_is_held(sk)); 431 lockdep_sock_is_held(sk));
420 opt = ipv6_renew_options(sk, opt, optname, 432 opt = ipv6_renew_options(sk, opt, optname, new);
421 (struct ipv6_opt_hdr __user *)optval, 433 kfree(new);
422 optlen);
423 if (IS_ERR(opt)) { 434 if (IS_ERR(opt)) {
424 retv = PTR_ERR(opt); 435 retv = PTR_ERR(opt);
425 break; 436 break;
@@ -489,7 +500,6 @@ sticky_done:
489 struct ipv6_txoptions *opt = NULL; 500 struct ipv6_txoptions *opt = NULL;
490 struct msghdr msg; 501 struct msghdr msg;
491 struct flowi6 fl6; 502 struct flowi6 fl6;
492 struct sockcm_cookie sockc_junk;
493 struct ipcm6_cookie ipc6; 503 struct ipcm6_cookie ipc6;
494 504
495 memset(&fl6, 0, sizeof(fl6)); 505 memset(&fl6, 0, sizeof(fl6));
@@ -522,7 +532,7 @@ sticky_done:
522 msg.msg_control = (void *)(opt+1); 532 msg.msg_control = (void *)(opt+1);
523 ipc6.opt = opt; 533 ipc6.opt = opt;
524 534
525 retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6, &sockc_junk); 535 retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6);
526 if (retv) 536 if (retv)
527 goto done; 537 goto done;
528update: 538update:
@@ -718,8 +728,9 @@ done:
718 struct sockaddr_in6 *psin6; 728 struct sockaddr_in6 *psin6;
719 729
720 psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; 730 psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
721 retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, 731 retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface,
722 &psin6->sin6_addr); 732 &psin6->sin6_addr,
733 MCAST_INCLUDE);
723 /* prior join w/ different source is ok */ 734 /* prior join w/ different source is ok */
724 if (retv && retv != -EADDRINUSE) 735 if (retv && retv != -EADDRINUSE)
725 break; 736 break;
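
The ipv6_sockglue.c hunks move the CAP_NET_RAW privilege check ahead of any parsing and copy the sticky-option blob with memdup_user() before handing it to ipv6_renew_options(); a buffer whose own extension-header length (ipv6_optlen(), i.e. (hdrlen + 1) * 8) claims more bytes than the caller supplied is now rejected. The sketch below replays the same length checks in user space; the struct layout stands in for struct ipv6_opt_hdr.

    /* Validation equivalent to the checks added above: the option blob's
     * self-declared length must not exceed what the caller supplied.
     */
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    struct opt_hdr {
        uint8_t nexthdr;
        uint8_t hdrlen;     /* in 8-octet units, not counting the first 8 */
    };

    static int sticky_opt_valid(const void *buf, size_t optlen)
    {
        struct opt_hdr h;

        if (optlen < sizeof(h) || (optlen & 0x7) || optlen > 8 * 255)
            return 0;
        memcpy(&h, buf, sizeof(h));
        /* reject a header claiming more bytes than were copied in */
        return (size_t)(h.hdrlen + 1) * 8 <= optlen;
    }

    int main(void)
    {
        uint8_t good[8] = { 0, 0 };     /* hdrlen = 0 -> claims 8 bytes, got 8 */
        uint8_t bad[8]  = { 0, 3 };     /* hdrlen = 3 -> claims 32 bytes, got 8 */

        printf("good: %d\n", sticky_opt_valid(good, sizeof(good)));
        printf("bad:  %d\n", sticky_opt_valid(bad, sizeof(bad)));
        return 0;
    }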
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 975021df7c1c..4ae54aaca373 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -95,6 +95,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
95 int delta); 95 int delta);
96static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, 96static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
97 struct inet6_dev *idev); 97 struct inet6_dev *idev);
98static int __ipv6_dev_mc_inc(struct net_device *dev,
99 const struct in6_addr *addr, unsigned int mode);
98 100
99#define MLD_QRV_DEFAULT 2 101#define MLD_QRV_DEFAULT 2
100/* RFC3810, 9.2. Query Interval */ 102/* RFC3810, 9.2. Query Interval */
@@ -132,7 +134,8 @@ static int unsolicited_report_interval(struct inet6_dev *idev)
132 return iv > 0 ? iv : 1; 134 return iv > 0 ? iv : 1;
133} 135}
134 136
135int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) 137static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
138 const struct in6_addr *addr, unsigned int mode)
136{ 139{
137 struct net_device *dev = NULL; 140 struct net_device *dev = NULL;
138 struct ipv6_mc_socklist *mc_lst; 141 struct ipv6_mc_socklist *mc_lst;
@@ -179,7 +182,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
179 } 182 }
180 183
181 mc_lst->ifindex = dev->ifindex; 184 mc_lst->ifindex = dev->ifindex;
182 mc_lst->sfmode = MCAST_EXCLUDE; 185 mc_lst->sfmode = mode;
183 rwlock_init(&mc_lst->sflock); 186 rwlock_init(&mc_lst->sflock);
184 mc_lst->sflist = NULL; 187 mc_lst->sflist = NULL;
185 188
@@ -187,7 +190,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
187 * now add/increase the group membership on the device 190 * now add/increase the group membership on the device
188 */ 191 */
189 192
190 err = ipv6_dev_mc_inc(dev, addr); 193 err = __ipv6_dev_mc_inc(dev, addr, mode);
191 194
192 if (err) { 195 if (err) {
193 sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); 196 sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
@@ -199,8 +202,19 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
199 202
200 return 0; 203 return 0;
201} 204}
205
206int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
207{
208 return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE);
209}
202EXPORT_SYMBOL(ipv6_sock_mc_join); 210EXPORT_SYMBOL(ipv6_sock_mc_join);
203 211
212int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
213 const struct in6_addr *addr, unsigned int mode)
214{
215 return __ipv6_sock_mc_join(sk, ifindex, addr, mode);
216}
217
204/* 218/*
205 * socket leave on multicast group 219 * socket leave on multicast group
206 */ 220 */
@@ -672,7 +686,13 @@ static void igmp6_group_added(struct ifmcaddr6 *mc)
672 } 686 }
673 /* else v2 */ 687 /* else v2 */
674 688
675 mc->mca_crcount = mc->idev->mc_qrv; 689 /* Based on RFC3810 6.1, for newly added INCLUDE SSM, we
690 * should not send filter-mode change record as the mode
691 * should be from IN() to IN(A).
692 */
693 if (mc->mca_sfmode == MCAST_EXCLUDE)
694 mc->mca_crcount = mc->idev->mc_qrv;
695
676 mld_ifc_event(mc->idev); 696 mld_ifc_event(mc->idev);
677} 697}
678 698
@@ -770,13 +790,13 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
770 spin_lock_bh(&im->mca_lock); 790 spin_lock_bh(&im->mca_lock);
771 if (pmc) { 791 if (pmc) {
772 im->idev = pmc->idev; 792 im->idev = pmc->idev;
773 im->mca_crcount = idev->mc_qrv; 793 if (im->mca_sfmode == MCAST_INCLUDE) {
774 im->mca_sfmode = pmc->mca_sfmode;
775 if (pmc->mca_sfmode == MCAST_INCLUDE) {
776 im->mca_tomb = pmc->mca_tomb; 794 im->mca_tomb = pmc->mca_tomb;
777 im->mca_sources = pmc->mca_sources; 795 im->mca_sources = pmc->mca_sources;
778 for (psf = im->mca_sources; psf; psf = psf->sf_next) 796 for (psf = im->mca_sources; psf; psf = psf->sf_next)
779 psf->sf_crcount = im->mca_crcount; 797 psf->sf_crcount = idev->mc_qrv;
798 } else {
799 im->mca_crcount = idev->mc_qrv;
780 } 800 }
781 in6_dev_put(pmc->idev); 801 in6_dev_put(pmc->idev);
782 kfree(pmc); 802 kfree(pmc);
@@ -831,7 +851,8 @@ static void ma_put(struct ifmcaddr6 *mc)
831} 851}
832 852
833static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, 853static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
834 const struct in6_addr *addr) 854 const struct in6_addr *addr,
855 unsigned int mode)
835{ 856{
836 struct ifmcaddr6 *mc; 857 struct ifmcaddr6 *mc;
837 858
@@ -849,9 +870,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
849 refcount_set(&mc->mca_refcnt, 1); 870 refcount_set(&mc->mca_refcnt, 1);
850 spin_lock_init(&mc->mca_lock); 871 spin_lock_init(&mc->mca_lock);
851 872
852 /* initial mode is (EX, empty) */ 873 mc->mca_sfmode = mode;
853 mc->mca_sfmode = MCAST_EXCLUDE; 874 mc->mca_sfcount[mode] = 1;
854 mc->mca_sfcount[MCAST_EXCLUDE] = 1;
855 875
856 if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || 876 if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) ||
857 IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) 877 IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
@@ -863,7 +883,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
863/* 883/*
864 * device multicast group inc (add if not found) 884 * device multicast group inc (add if not found)
865 */ 885 */
866int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) 886static int __ipv6_dev_mc_inc(struct net_device *dev,
887 const struct in6_addr *addr, unsigned int mode)
867{ 888{
868 struct ifmcaddr6 *mc; 889 struct ifmcaddr6 *mc;
869 struct inet6_dev *idev; 890 struct inet6_dev *idev;
@@ -887,14 +908,13 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
887 if (ipv6_addr_equal(&mc->mca_addr, addr)) { 908 if (ipv6_addr_equal(&mc->mca_addr, addr)) {
888 mc->mca_users++; 909 mc->mca_users++;
889 write_unlock_bh(&idev->lock); 910 write_unlock_bh(&idev->lock);
890 ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0, 911 ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0);
891 NULL, 0);
892 in6_dev_put(idev); 912 in6_dev_put(idev);
893 return 0; 913 return 0;
894 } 914 }
895 } 915 }
896 916
897 mc = mca_alloc(idev, addr); 917 mc = mca_alloc(idev, addr, mode);
898 if (!mc) { 918 if (!mc) {
899 write_unlock_bh(&idev->lock); 919 write_unlock_bh(&idev->lock);
900 in6_dev_put(idev); 920 in6_dev_put(idev);
@@ -916,6 +936,11 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
916 return 0; 936 return 0;
917} 937}
918 938
939int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
940{
941 return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE);
942}
943
919/* 944/*
920 * device multicast group del 945 * device multicast group del
921 */ 946 */
@@ -1751,7 +1776,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
1751 1776
1752 psf_next = psf->sf_next; 1777 psf_next = psf->sf_next;
1753 1778
1754 if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { 1779 if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) {
1755 psf_prev = psf; 1780 psf_prev = psf;
1756 continue; 1781 continue;
1757 } 1782 }
@@ -2066,7 +2091,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev)
2066 if (pmc->mca_sfcount[MCAST_EXCLUDE]) 2091 if (pmc->mca_sfcount[MCAST_EXCLUDE])
2067 type = MLD2_CHANGE_TO_EXCLUDE; 2092 type = MLD2_CHANGE_TO_EXCLUDE;
2068 else 2093 else
2069 type = MLD2_CHANGE_TO_INCLUDE; 2094 type = MLD2_ALLOW_NEW_SOURCES;
2070 skb = add_grec(skb, pmc, type, 0, 0, 1); 2095 skb = add_grec(skb, pmc, type, 0, 0, 1);
2071 spin_unlock_bh(&pmc->mca_lock); 2096 spin_unlock_bh(&pmc->mca_lock);
2072 } 2097 }
@@ -2082,7 +2107,8 @@ void ipv6_mc_dad_complete(struct inet6_dev *idev)
2082 mld_send_initial_cr(idev); 2107 mld_send_initial_cr(idev);
2083 idev->mc_dad_count--; 2108 idev->mc_dad_count--;
2084 if (idev->mc_dad_count) 2109 if (idev->mc_dad_count)
2085 mld_dad_start_timer(idev, idev->mc_maxdelay); 2110 mld_dad_start_timer(idev,
2111 unsolicited_report_interval(idev));
2086 } 2112 }
2087} 2113}
2088 2114
@@ -2094,7 +2120,8 @@ static void mld_dad_timer_expire(struct timer_list *t)
2094 if (idev->mc_dad_count) { 2120 if (idev->mc_dad_count) {
2095 idev->mc_dad_count--; 2121 idev->mc_dad_count--;
2096 if (idev->mc_dad_count) 2122 if (idev->mc_dad_count)
2097 mld_dad_start_timer(idev, idev->mc_maxdelay); 2123 mld_dad_start_timer(idev,
2124 unsolicited_report_interval(idev));
2098 } 2125 }
2099 in6_dev_put(idev); 2126 in6_dev_put(idev);
2100} 2127}
@@ -2452,7 +2479,8 @@ static void mld_ifc_timer_expire(struct timer_list *t)
2452 if (idev->mc_ifc_count) { 2479 if (idev->mc_ifc_count) {
2453 idev->mc_ifc_count--; 2480 idev->mc_ifc_count--;
2454 if (idev->mc_ifc_count) 2481 if (idev->mc_ifc_count)
2455 mld_ifc_start_timer(idev, idev->mc_maxdelay); 2482 mld_ifc_start_timer(idev,
2483 unsolicited_report_interval(idev));
2456 } 2484 }
2457 in6_dev_put(idev); 2485 in6_dev_put(idev);
2458} 2486}
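
With the mcast.c hunks, a source-specific join entering through ipv6_sock_mc_join_ssm() starts the group in MCAST_INCLUDE mode instead of EXCLUDE, so the initial MLDv2 report carries an ALLOW_NEW_SOURCES record rather than a filter-mode change. The example below shows the user-space join that reaches this path via MCAST_JOIN_SOURCE_GROUP; the group/source addresses and interface name are placeholders.

    /* Application-side example of an SSM join (INCLUDE({source})). */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <net/if.h>

    int main(void)
    {
        struct group_source_req gsr;
        struct sockaddr_in6 *grp = (struct sockaddr_in6 *)&gsr.gsr_group;
        struct sockaddr_in6 *src = (struct sockaddr_in6 *)&gsr.gsr_source;
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);

        if (fd < 0) {
            perror("socket");
            return 1;
        }
        memset(&gsr, 0, sizeof(gsr));
        gsr.gsr_interface = if_nametoindex("eth0");             /* placeholder */
        grp->sin6_family = AF_INET6;
        inet_pton(AF_INET6, "ff3e::1234", &grp->sin6_addr);     /* SSM group */
        src->sin6_family = AF_INET6;
        inet_pton(AF_INET6, "2001:db8::1", &src->sin6_addr);    /* allowed source */

        /* joins the group with an INCLUDE filter for the given source */
        if (setsockopt(fd, IPPROTO_IPV6, MCAST_JOIN_SOURCE_GROUP,
                       &gsr, sizeof(gsr)) < 0)
            perror("MCAST_JOIN_SOURCE_GROUP");

        close(fd);
        return 0;
    }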
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index e640d2f3c55c..0ec273997d1d 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -811,7 +811,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
811 return; 811 return;
812 } 812 }
813 } 813 }
814 if (ndopts.nd_opts_nonce) 814 if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1)
815 memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6); 815 memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);
816 816
817 inc = ipv6_addr_is_multicast(daddr); 817 inc = ipv6_addr_is_multicast(daddr);
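
The ndisc.c hunk only copies the 6-byte nonce when the option's length field is exactly 1: ND option lengths are counted in 8-octet units, so a well-formed nonce option is one unit, two bytes of type/length plus six bytes of nonce. A small sketch of that check, with an illustrative option layout:

    /* Reject nonce options whose declared length is not exactly one unit. */
    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    struct nd_opt {
        uint8_t type;       /* 14 = nonce option */
        uint8_t len;        /* in units of 8 octets */
        uint8_t data[6];
    };

    static int copy_nonce(uint8_t nonce[6], const struct nd_opt *opt)
    {
        if (opt->len != 1)  /* anything else is malformed for a nonce */
            return -1;
        memcpy(nonce, opt->data, 6);
        return 0;
    }

    int main(void)
    {
        struct nd_opt ok  = { .type = 14, .len = 1, .data = { 1, 2, 3, 4, 5, 6 } };
        struct nd_opt bad = { .type = 14, .len = 2 };
        uint8_t nonce[6];

        printf("ok:  %d\n", copy_nonce(nonce, &ok));
        printf("bad: %d\n", copy_nonce(nonce, &bad));
        return 0;
    }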
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 531d6957af36..5ae8e1c51079 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -15,7 +15,6 @@
15#include <net/ipv6.h> 15#include <net/ipv6.h>
16#include <net/ip6_route.h> 16#include <net/ip6_route.h>
17#include <net/xfrm.h> 17#include <net/xfrm.h>
18#include <net/ip6_checksum.h>
19#include <net/netfilter/nf_queue.h> 18#include <net/netfilter/nf_queue.h>
20 19
21int ip6_route_me_harder(struct net *net, struct sk_buff *skb) 20int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
@@ -106,71 +105,10 @@ static int nf_ip6_route(struct net *net, struct dst_entry **dst,
106 return err; 105 return err;
107} 106}
108 107
109__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
110 unsigned int dataoff, u_int8_t protocol)
111{
112 const struct ipv6hdr *ip6h = ipv6_hdr(skb);
113 __sum16 csum = 0;
114
115 switch (skb->ip_summed) {
116 case CHECKSUM_COMPLETE:
117 if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
118 break;
119 if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
120 skb->len - dataoff, protocol,
121 csum_sub(skb->csum,
122 skb_checksum(skb, 0,
123 dataoff, 0)))) {
124 skb->ip_summed = CHECKSUM_UNNECESSARY;
125 break;
126 }
127 /* fall through */
128 case CHECKSUM_NONE:
129 skb->csum = ~csum_unfold(
130 csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
131 skb->len - dataoff,
132 protocol,
133 csum_sub(0,
134 skb_checksum(skb, 0,
135 dataoff, 0))));
136 csum = __skb_checksum_complete(skb);
137 }
138 return csum;
139}
140EXPORT_SYMBOL(nf_ip6_checksum);
141
142static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
143 unsigned int dataoff, unsigned int len,
144 u_int8_t protocol)
145{
146 const struct ipv6hdr *ip6h = ipv6_hdr(skb);
147 __wsum hsum;
148 __sum16 csum = 0;
149
150 switch (skb->ip_summed) {
151 case CHECKSUM_COMPLETE:
152 if (len == skb->len - dataoff)
153 return nf_ip6_checksum(skb, hook, dataoff, protocol);
154 /* fall through */
155 case CHECKSUM_NONE:
156 hsum = skb_checksum(skb, 0, dataoff, 0);
157 skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
158 &ip6h->daddr,
159 skb->len - dataoff,
160 protocol,
161 csum_sub(0, hsum)));
162 skb->ip_summed = CHECKSUM_NONE;
163 return __skb_checksum_complete_head(skb, dataoff + len);
164 }
165 return csum;
166};
167
168static const struct nf_ipv6_ops ipv6ops = { 108static const struct nf_ipv6_ops ipv6ops = {
169 .chk_addr = ipv6_chk_addr, 109 .chk_addr = ipv6_chk_addr,
170 .route_input = ip6_route_input, 110 .route_input = ip6_route_input,
171 .fragment = ip6_fragment, 111 .fragment = ip6_fragment,
172 .checksum = nf_ip6_checksum,
173 .checksum_partial = nf_ip6_checksum_partial,
174 .route = nf_ip6_route, 112 .route = nf_ip6_route,
175 .reroute = nf_ip6_reroute, 113 .reroute = nf_ip6_reroute,
176}; 114};
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 37b14dc9d863..339d0762b027 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -5,26 +5,6 @@
5menu "IPv6: Netfilter Configuration" 5menu "IPv6: Netfilter Configuration"
6 depends on INET && IPV6 && NETFILTER 6 depends on INET && IPV6 && NETFILTER
7 7
8config NF_DEFRAG_IPV6
9 tristate
10 default n
11
12config NF_CONNTRACK_IPV6
13 tristate "IPv6 connection tracking support"
14 depends on INET && IPV6 && NF_CONNTRACK
15 default m if NETFILTER_ADVANCED=n
16 select NF_DEFRAG_IPV6
17 ---help---
18 Connection tracking keeps a record of what packets have passed
19 through your machine, in order to figure out how they are related
20 into connections.
21
22 This is IPv6 support on Layer 3 independent connection tracking.
23 Layer 3 independent connection tracking is experimental scheme
24 which generalize ip_conntrack to support other layer 3 protocols.
25
26 To compile it as a module, choose M here. If unsure, say N.
27
28config NF_SOCKET_IPV6 8config NF_SOCKET_IPV6
29 tristate "IPv6 socket lookup support" 9 tristate "IPv6 socket lookup support"
30 help 10 help
@@ -128,7 +108,7 @@ config NF_LOG_IPV6
128 108
129config NF_NAT_IPV6 109config NF_NAT_IPV6
130 tristate "IPv6 NAT" 110 tristate "IPv6 NAT"
131 depends on NF_CONNTRACK_IPV6 111 depends on NF_CONNTRACK
132 depends on NETFILTER_ADVANCED 112 depends on NETFILTER_ADVANCED
133 select NF_NAT 113 select NF_NAT
134 help 114 help
@@ -328,7 +308,7 @@ config IP6_NF_SECURITY
328 308
329config IP6_NF_NAT 309config IP6_NF_NAT
330 tristate "ip6tables NAT support" 310 tristate "ip6tables NAT support"
331 depends on NF_CONNTRACK_IPV6 311 depends on NF_CONNTRACK
332 depends on NETFILTER_ADVANCED 312 depends on NETFILTER_ADVANCED
333 select NF_NAT 313 select NF_NAT
334 select NF_NAT_IPV6 314 select NF_NAT_IPV6
@@ -365,6 +345,7 @@ config IP6_NF_TARGET_NPT
365endif # IP6_NF_NAT 345endif # IP6_NF_NAT
366 346
367endif # IP6_NF_IPTABLES 347endif # IP6_NF_IPTABLES
368
369endmenu 348endmenu
370 349
350config NF_DEFRAG_IPV6
351 tristate
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 10a5a1c87320..200c0c235565 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -11,12 +11,6 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
11obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o 11obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
12obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o 12obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o
13 13
14# objects for l3 independent conntrack
15nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
16
17# l3 independent conntrack
18obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
19
20nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o 14nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
21nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o 15nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
22obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o 16obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7eab959734bc..daf2e9e9193d 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1909,6 +1909,7 @@ static struct xt_match ip6t_builtin_mt[] __read_mostly = {
1909 .checkentry = icmp6_checkentry, 1909 .checkentry = icmp6_checkentry,
1910 .proto = IPPROTO_ICMPV6, 1910 .proto = IPPROTO_ICMPV6,
1911 .family = NFPROTO_IPV6, 1911 .family = NFPROTO_IPV6,
1912 .me = THIS_MODULE,
1912 }, 1913 },
1913}; 1914};
1914 1915
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index 0fe61ede77c6..c3c6b09acdc4 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -26,6 +26,12 @@ static bool rpfilter_addr_unicast(const struct in6_addr *addr)
26 return addr_type & IPV6_ADDR_UNICAST; 26 return addr_type & IPV6_ADDR_UNICAST;
27} 27}
28 28
29static bool rpfilter_addr_linklocal(const struct in6_addr *addr)
30{
31 int addr_type = ipv6_addr_type(addr);
32 return addr_type & IPV6_ADDR_LINKLOCAL;
33}
34
29static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, 35static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
30 const struct net_device *dev, u8 flags) 36 const struct net_device *dev, u8 flags)
31{ 37{
@@ -48,7 +54,11 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
48 } 54 }
49 55
50 fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; 56 fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
51 if ((flags & XT_RPFILTER_LOOSE) == 0) 57
58 if (rpfilter_addr_linklocal(&iph->saddr)) {
59 lookup_flags |= RT6_LOOKUP_F_IFACE;
60 fl6.flowi6_oif = dev->ifindex;
61 } else if ((flags & XT_RPFILTER_LOOSE) == 0)
52 fl6.flowi6_oif = dev->ifindex; 62 fl6.flowi6_oif = dev->ifindex;
53 63
54 rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags); 64 rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags);
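
The ip6t_rpfilter.c hunks force an interface-scoped reverse lookup (RT6_LOOKUP_F_IFACE with the incoming ifindex) whenever the source address is link-local, even in loose mode, since fe80::/10 sources are only meaningful relative to the receiving interface. The classification behind the new branch is just a prefix test:

    /* Link-local unicast is fe80::/10: the first 10 bits are 1111111010. */
    #include <stdio.h>
    #include <arpa/inet.h>
    #include <netinet/in.h>

    static int addr_is_linklocal(const struct in6_addr *a)
    {
        return a->s6_addr[0] == 0xfe && (a->s6_addr[1] & 0xc0) == 0x80;
    }

    int main(void)
    {
        struct in6_addr ll, global;

        inet_pton(AF_INET6, "fe80::1", &ll);
        inet_pton(AF_INET6, "2001:db8::1", &global);
        printf("fe80::1     -> %d\n", addr_is_linklocal(&ll));
        printf("2001:db8::1 -> %d\n", addr_is_linklocal(&global));
        return 0;
    }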
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
deleted file mode 100644
index 663827ee3cf8..000000000000
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ /dev/null
@@ -1,460 +0,0 @@
1/*
2 * Copyright (C)2004 USAGI/WIDE Project
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * Author:
9 * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
10 */
11
12#include <linux/types.h>
13#include <linux/ipv6.h>
14#include <linux/in6.h>
15#include <linux/netfilter.h>
16#include <linux/module.h>
17#include <linux/skbuff.h>
18#include <linux/icmp.h>
19#include <net/ipv6.h>
20#include <net/inet_frag.h>
21
22#include <linux/netfilter_bridge.h>
23#include <linux/netfilter_ipv6.h>
24#include <linux/netfilter_ipv6/ip6_tables.h>
25#include <net/netfilter/nf_conntrack.h>
26#include <net/netfilter/nf_conntrack_helper.h>
27#include <net/netfilter/nf_conntrack_l4proto.h>
28#include <net/netfilter/nf_conntrack_l3proto.h>
29#include <net/netfilter/nf_conntrack_core.h>
30#include <net/netfilter/nf_conntrack_zones.h>
31#include <net/netfilter/nf_conntrack_seqadj.h>
32#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
33#include <net/netfilter/nf_nat_helper.h>
34#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
35#include <net/netfilter/nf_log.h>
36
37static int conntrack6_net_id;
38static DEFINE_MUTEX(register_ipv6_hooks);
39
40struct conntrack6_net {
41 unsigned int users;
42};
43
44static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
45 struct nf_conntrack_tuple *tuple)
46{
47 const u_int32_t *ap;
48 u_int32_t _addrs[8];
49
50 ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr),
51 sizeof(_addrs), _addrs);
52 if (ap == NULL)
53 return false;
54
55 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
56 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
57
58 return true;
59}
60
61static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
62 const struct nf_conntrack_tuple *orig)
63{
64 memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6));
65 memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6));
66
67 return true;
68}
69
70static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
71 unsigned int *dataoff, u_int8_t *protonum)
72{
73 unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
74 __be16 frag_off;
75 int protoff;
76 u8 nexthdr;
77
78 if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
79 &nexthdr, sizeof(nexthdr)) != 0) {
80 pr_debug("ip6_conntrack_core: can't get nexthdr\n");
81 return -NF_ACCEPT;
82 }
83 protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
84 /*
85 * (protoff == skb->len) means the packet has not data, just
86 * IPv6 and possibly extensions headers, but it is tracked anyway
87 */
88 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
89 pr_debug("ip6_conntrack_core: can't find proto in pkt\n");
90 return -NF_ACCEPT;
91 }
92
93 *dataoff = protoff;
94 *protonum = nexthdr;
95 return NF_ACCEPT;
96}
97
98static unsigned int ipv6_helper(void *priv,
99 struct sk_buff *skb,
100 const struct nf_hook_state *state)
101{
102 struct nf_conn *ct;
103 const struct nf_conn_help *help;
104 const struct nf_conntrack_helper *helper;
105 enum ip_conntrack_info ctinfo;
106 __be16 frag_off;
107 int protoff;
108 u8 nexthdr;
109
110 /* This is where we call the helper: as the packet goes out. */
111 ct = nf_ct_get(skb, &ctinfo);
112 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
113 return NF_ACCEPT;
114
115 help = nfct_help(ct);
116 if (!help)
117 return NF_ACCEPT;
118 /* rcu_read_lock()ed by nf_hook_thresh */
119 helper = rcu_dereference(help->helper);
120 if (!helper)
121 return NF_ACCEPT;
122
123 nexthdr = ipv6_hdr(skb)->nexthdr;
124 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
125 &frag_off);
126 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
127 pr_debug("proto header not found\n");
128 return NF_ACCEPT;
129 }
130
131 return helper->help(skb, protoff, ct, ctinfo);
132}
133
134static unsigned int ipv6_confirm(void *priv,
135 struct sk_buff *skb,
136 const struct nf_hook_state *state)
137{
138 struct nf_conn *ct;
139 enum ip_conntrack_info ctinfo;
140 unsigned char pnum = ipv6_hdr(skb)->nexthdr;
141 int protoff;
142 __be16 frag_off;
143
144 ct = nf_ct_get(skb, &ctinfo);
145 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
146 goto out;
147
148 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
149 &frag_off);
150 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
151 pr_debug("proto header not found\n");
152 goto out;
153 }
154
155 /* adjust seqs for loopback traffic only in outgoing direction */
156 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
157 !nf_is_loopback_packet(skb)) {
158 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
159 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
160 return NF_DROP;
161 }
162 }
163out:
164 /* We've seen it coming out the other side: confirm it */
165 return nf_conntrack_confirm(skb);
166}
167
168static unsigned int ipv6_conntrack_in(void *priv,
169 struct sk_buff *skb,
170 const struct nf_hook_state *state)
171{
172 return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
173}
174
175static unsigned int ipv6_conntrack_local(void *priv,
176 struct sk_buff *skb,
177 const struct nf_hook_state *state)
178{
179 return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
180}
181
182static const struct nf_hook_ops ipv6_conntrack_ops[] = {
183 {
184 .hook = ipv6_conntrack_in,
185 .pf = NFPROTO_IPV6,
186 .hooknum = NF_INET_PRE_ROUTING,
187 .priority = NF_IP6_PRI_CONNTRACK,
188 },
189 {
190 .hook = ipv6_conntrack_local,
191 .pf = NFPROTO_IPV6,
192 .hooknum = NF_INET_LOCAL_OUT,
193 .priority = NF_IP6_PRI_CONNTRACK,
194 },
195 {
196 .hook = ipv6_helper,
197 .pf = NFPROTO_IPV6,
198 .hooknum = NF_INET_POST_ROUTING,
199 .priority = NF_IP6_PRI_CONNTRACK_HELPER,
200 },
201 {
202 .hook = ipv6_confirm,
203 .pf = NFPROTO_IPV6,
204 .hooknum = NF_INET_POST_ROUTING,
205 .priority = NF_IP6_PRI_LAST,
206 },
207 {
208 .hook = ipv6_helper,
209 .pf = NFPROTO_IPV6,
210 .hooknum = NF_INET_LOCAL_IN,
211 .priority = NF_IP6_PRI_CONNTRACK_HELPER,
212 },
213 {
214 .hook = ipv6_confirm,
215 .pf = NFPROTO_IPV6,
216 .hooknum = NF_INET_LOCAL_IN,
217 .priority = NF_IP6_PRI_LAST-1,
218 },
219};
220
221static int
222ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
223{
224 struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 };
225 const struct ipv6_pinfo *inet6 = inet6_sk(sk);
226 const struct inet_sock *inet = inet_sk(sk);
227 const struct nf_conntrack_tuple_hash *h;
228 struct sockaddr_in6 sin6;
229 struct nf_conn *ct;
230 __be32 flow_label;
231 int bound_dev_if;
232
233 lock_sock(sk);
234 tuple.src.u3.in6 = sk->sk_v6_rcv_saddr;
235 tuple.src.u.tcp.port = inet->inet_sport;
236 tuple.dst.u3.in6 = sk->sk_v6_daddr;
237 tuple.dst.u.tcp.port = inet->inet_dport;
238 tuple.dst.protonum = sk->sk_protocol;
239 bound_dev_if = sk->sk_bound_dev_if;
240 flow_label = inet6->flow_label;
241 release_sock(sk);
242
243 if (tuple.dst.protonum != IPPROTO_TCP &&
244 tuple.dst.protonum != IPPROTO_SCTP)
245 return -ENOPROTOOPT;
246
247 if (*len < 0 || (unsigned int) *len < sizeof(sin6))
248 return -EINVAL;
249
250 h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
251 if (!h) {
252 pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
253 &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
254 &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
255 return -ENOENT;
256 }
257
258 ct = nf_ct_tuplehash_to_ctrack(h);
259
260 sin6.sin6_family = AF_INET6;
261 sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
262 sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK;
263 memcpy(&sin6.sin6_addr,
264 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6,
265 sizeof(sin6.sin6_addr));
266
267 nf_ct_put(ct);
268 sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if);
269 return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0;
270}
271
272#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
273
274#include <linux/netfilter/nfnetlink.h>
275#include <linux/netfilter/nfnetlink_conntrack.h>
276
277static int ipv6_tuple_to_nlattr(struct sk_buff *skb,
278 const struct nf_conntrack_tuple *tuple)
279{
280 if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) ||
281 nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6))
282 goto nla_put_failure;
283 return 0;
284
285nla_put_failure:
286 return -1;
287}
288
289static const struct nla_policy ipv6_nla_policy[CTA_IP_MAX+1] = {
290 [CTA_IP_V6_SRC] = { .len = sizeof(u_int32_t)*4 },
291 [CTA_IP_V6_DST] = { .len = sizeof(u_int32_t)*4 },
292};
293
294static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
295 struct nf_conntrack_tuple *t)
296{
297 if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
298 return -EINVAL;
299
300 t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
301 t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
302
303 return 0;
304}
305#endif
306
307static int ipv6_hooks_register(struct net *net)
308{
309 struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
310 int err = 0;
311
312 mutex_lock(&register_ipv6_hooks);
313 cnet->users++;
314 if (cnet->users > 1)
315 goto out_unlock;
316
317 err = nf_defrag_ipv6_enable(net);
318 if (err < 0) {
319 cnet->users = 0;
320 goto out_unlock;
321 }
322
323 err = nf_register_net_hooks(net, ipv6_conntrack_ops,
324 ARRAY_SIZE(ipv6_conntrack_ops));
325 if (err)
326 cnet->users = 0;
327 out_unlock:
328 mutex_unlock(&register_ipv6_hooks);
329 return err;
330}
331
332static void ipv6_hooks_unregister(struct net *net)
333{
334 struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
335
336 mutex_lock(&register_ipv6_hooks);
337 if (cnet->users && (--cnet->users == 0))
338 nf_unregister_net_hooks(net, ipv6_conntrack_ops,
339 ARRAY_SIZE(ipv6_conntrack_ops));
340 mutex_unlock(&register_ipv6_hooks);
341}
342
343const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
344 .l3proto = PF_INET6,
345 .pkt_to_tuple = ipv6_pkt_to_tuple,
346 .invert_tuple = ipv6_invert_tuple,
347 .get_l4proto = ipv6_get_l4proto,
348#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
349 .tuple_to_nlattr = ipv6_tuple_to_nlattr,
350 .nlattr_to_tuple = ipv6_nlattr_to_tuple,
351 .nla_policy = ipv6_nla_policy,
352 .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])) +
353 NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])),
354#endif
355 .net_ns_get = ipv6_hooks_register,
356 .net_ns_put = ipv6_hooks_unregister,
357 .me = THIS_MODULE,
358};
359
360MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
361MODULE_LICENSE("GPL");
362MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>");
363
364static struct nf_sockopt_ops so_getorigdst6 = {
365 .pf = NFPROTO_IPV6,
366 .get_optmin = IP6T_SO_ORIGINAL_DST,
367 .get_optmax = IP6T_SO_ORIGINAL_DST + 1,
368 .get = ipv6_getorigdst,
369 .owner = THIS_MODULE,
370};
371
372static const struct nf_conntrack_l4proto * const builtin_l4proto6[] = {
373 &nf_conntrack_l4proto_tcp6,
374 &nf_conntrack_l4proto_udp6,
375 &nf_conntrack_l4proto_icmpv6,
376#ifdef CONFIG_NF_CT_PROTO_DCCP
377 &nf_conntrack_l4proto_dccp6,
378#endif
379#ifdef CONFIG_NF_CT_PROTO_SCTP
380 &nf_conntrack_l4proto_sctp6,
381#endif
382#ifdef CONFIG_NF_CT_PROTO_UDPLITE
383 &nf_conntrack_l4proto_udplite6,
384#endif
385};
386
387static int ipv6_net_init(struct net *net)
388{
389 return nf_ct_l4proto_pernet_register(net, builtin_l4proto6,
390 ARRAY_SIZE(builtin_l4proto6));
391}
392
393static void ipv6_net_exit(struct net *net)
394{
395 nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
396 ARRAY_SIZE(builtin_l4proto6));
397}
398
399static struct pernet_operations ipv6_net_ops = {
400 .init = ipv6_net_init,
401 .exit = ipv6_net_exit,
402 .id = &conntrack6_net_id,
403 .size = sizeof(struct conntrack6_net),
404};
405
406static int __init nf_conntrack_l3proto_ipv6_init(void)
407{
408 int ret = 0;
409
410 need_conntrack();
411
412#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
413 if (WARN_ON(nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1) !=
414 nf_conntrack_l3proto_ipv6.nla_size))
415 return -EINVAL;
416#endif
417
418 ret = nf_register_sockopt(&so_getorigdst6);
419 if (ret < 0) {
420 pr_err("Unable to register netfilter socket option\n");
421 return ret;
422 }
423
424 ret = register_pernet_subsys(&ipv6_net_ops);
425 if (ret < 0)
426 goto cleanup_sockopt;
427
428 ret = nf_ct_l4proto_register(builtin_l4proto6,
429 ARRAY_SIZE(builtin_l4proto6));
430 if (ret < 0)
431 goto cleanup_pernet;
432
433 ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6);
434 if (ret < 0) {
435 pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n");
436 goto cleanup_l4proto;
437 }
438 return ret;
439cleanup_l4proto:
440 nf_ct_l4proto_unregister(builtin_l4proto6,
441 ARRAY_SIZE(builtin_l4proto6));
442 cleanup_pernet:
443 unregister_pernet_subsys(&ipv6_net_ops);
444 cleanup_sockopt:
445 nf_unregister_sockopt(&so_getorigdst6);
446 return ret;
447}
448
449static void __exit nf_conntrack_l3proto_ipv6_fini(void)
450{
451 synchronize_net();
452 nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
453 nf_ct_l4proto_unregister(builtin_l4proto6,
454 ARRAY_SIZE(builtin_l4proto6));
455 unregister_pernet_subsys(&ipv6_net_ops);
456 nf_unregister_sockopt(&so_getorigdst6);
457}
458
459module_init(nf_conntrack_l3proto_ipv6_init);
460module_exit(nf_conntrack_l3proto_ipv6_fini);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 5e0332014c17..2a14d8b65924 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -33,9 +33,8 @@
33 33
34#include <net/sock.h> 34#include <net/sock.h>
35#include <net/snmp.h> 35#include <net/snmp.h>
36#include <net/inet_frag.h> 36#include <net/ipv6_frag.h>
37 37
38#include <net/ipv6.h>
39#include <net/protocol.h> 38#include <net/protocol.h>
40#include <net/transp_v6.h> 39#include <net/transp_v6.h>
41#include <net/rawv6.h> 40#include <net/rawv6.h>
@@ -107,7 +106,7 @@ static int nf_ct_frag6_sysctl_register(struct net *net)
107 if (hdr == NULL) 106 if (hdr == NULL)
108 goto err_reg; 107 goto err_reg;
109 108
110 net->nf_frag.sysctl.frags_hdr = hdr; 109 net->nf_frag_frags_hdr = hdr;
111 return 0; 110 return 0;
112 111
113err_reg: 112err_reg:
@@ -121,8 +120,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
121{ 120{
122 struct ctl_table *table; 121 struct ctl_table *table;
123 122
124 table = net->nf_frag.sysctl.frags_hdr->ctl_table_arg; 123 table = net->nf_frag_frags_hdr->ctl_table_arg;
125 unregister_net_sysctl_table(net->nf_frag.sysctl.frags_hdr); 124 unregister_net_sysctl_table(net->nf_frag_frags_hdr);
126 if (!net_eq(net, &init_net)) 125 if (!net_eq(net, &init_net))
127 kfree(table); 126 kfree(table);
128} 127}
@@ -151,7 +150,7 @@ static void nf_ct_frag6_expire(struct timer_list *t)
151 fq = container_of(frag, struct frag_queue, q); 150 fq = container_of(frag, struct frag_queue, q);
152 net = container_of(fq->q.net, struct net, nf_frag.frags); 151 net = container_of(fq->q.net, struct net, nf_frag.frags);
153 152
154 ip6_expire_frag_queue(net, fq); 153 ip6frag_expire_frag_queue(net, fq);
155} 154}
156 155
157/* Creation primitives. */ 156/* Creation primitives. */
@@ -464,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
464 head->csum); 463 head->csum);
465 464
466 fq->q.fragments = NULL; 465 fq->q.fragments = NULL;
466 fq->q.rb_fragments = RB_ROOT;
467 fq->q.fragments_tail = NULL; 467 fq->q.fragments_tail = NULL;
468 468
469 return true; 469 return true;
@@ -558,6 +558,10 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
558 hdr = ipv6_hdr(skb); 558 hdr = ipv6_hdr(skb);
559 fhdr = (struct frag_hdr *)skb_transport_header(skb); 559 fhdr = (struct frag_hdr *)skb_transport_header(skb);
560 560
561 if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
562 fhdr->frag_off & htons(IP6_MF))
563 return -EINVAL;
564
561 skb_orphan(skb); 565 skb_orphan(skb);
562 fq = fq_find(net, fhdr->identification, user, hdr, 566 fq = fq_find(net, fhdr->identification, user, hdr,
563 skb->dev ? skb->dev->ifindex : 0); 567 skb->dev ? skb->dev->ifindex : 0);
@@ -585,6 +589,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
585 fq->q.meat == fq->q.len && 589 fq->q.meat == fq->q.len &&
586 nf_ct_frag6_reasm(fq, skb, dev)) 590 nf_ct_frag6_reasm(fq, skb, dev))
587 ret = 0; 591 ret = 0;
592 else
593 skb_dst_drop(skb);
588 594
589out_unlock: 595out_unlock:
590 spin_unlock_bh(&fq->q.lock); 596 spin_unlock_bh(&fq->q.lock);
@@ -622,16 +628,24 @@ static struct pernet_operations nf_ct_net_ops = {
622 .exit = nf_ct_net_exit, 628 .exit = nf_ct_net_exit,
623}; 629};
624 630
631static const struct rhashtable_params nfct_rhash_params = {
632 .head_offset = offsetof(struct inet_frag_queue, node),
633 .hashfn = ip6frag_key_hashfn,
634 .obj_hashfn = ip6frag_obj_hashfn,
635 .obj_cmpfn = ip6frag_obj_cmpfn,
636 .automatic_shrinking = true,
637};
638
625int nf_ct_frag6_init(void) 639int nf_ct_frag6_init(void)
626{ 640{
627 int ret = 0; 641 int ret = 0;
628 642
629 nf_frags.constructor = ip6_frag_init; 643 nf_frags.constructor = ip6frag_init;
630 nf_frags.destructor = NULL; 644 nf_frags.destructor = NULL;
631 nf_frags.qsize = sizeof(struct frag_queue); 645 nf_frags.qsize = sizeof(struct frag_queue);
632 nf_frags.frag_expire = nf_ct_frag6_expire; 646 nf_frags.frag_expire = nf_ct_frag6_expire;
633 nf_frags.frags_cache_name = nf_frags_cache_name; 647 nf_frags.frags_cache_name = nf_frags_cache_name;
634 nf_frags.rhash_params = ip6_rhash_params; 648 nf_frags.rhash_params = nfct_rhash_params;
635 ret = inet_frags_init(&nf_frags); 649 ret = inet_frags_init(&nf_frags);
636 if (ret) 650 if (ret)
637 goto out; 651 goto out;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index c87b48359e8f..72dd3e202375 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -14,8 +14,7 @@
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/icmp.h> 15#include <linux/icmp.h>
16#include <linux/sysctl.h> 16#include <linux/sysctl.h>
17#include <net/ipv6.h> 17#include <net/ipv6_frag.h>
18#include <net/inet_frag.h>
19 18
20#include <linux/netfilter_ipv6.h> 19#include <linux/netfilter_ipv6.h>
21#include <linux/netfilter_bridge.h> 20#include <linux/netfilter_bridge.h>
@@ -23,7 +22,6 @@
23#include <net/netfilter/nf_conntrack.h> 22#include <net/netfilter/nf_conntrack.h>
24#include <net/netfilter/nf_conntrack_helper.h> 23#include <net/netfilter/nf_conntrack_helper.h>
25#include <net/netfilter/nf_conntrack_l4proto.h> 24#include <net/netfilter/nf_conntrack_l4proto.h>
26#include <net/netfilter/nf_conntrack_l3proto.h>
27#include <net/netfilter/nf_conntrack_core.h> 25#include <net/netfilter/nf_conntrack_core.h>
28#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> 26#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
29#endif 27#endif
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index b397a8fe88b9..c6bf580d0f33 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -36,7 +36,7 @@ static const struct nf_loginfo default_loginfo = {
36}; 36};
37 37
38/* One level of recursion won't kill us */ 38/* One level of recursion won't kill us */
39static void dump_ipv6_packet(struct nf_log_buf *m, 39static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
40 const struct nf_loginfo *info, 40 const struct nf_loginfo *info,
41 const struct sk_buff *skb, unsigned int ip6hoff, 41 const struct sk_buff *skb, unsigned int ip6hoff,
42 int recurse) 42 int recurse)
@@ -258,7 +258,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
258 /* Max length: 3+maxlen */ 258 /* Max length: 3+maxlen */
259 if (recurse) { 259 if (recurse) {
260 nf_log_buf_add(m, "["); 260 nf_log_buf_add(m, "[");
261 dump_ipv6_packet(m, info, skb, 261 dump_ipv6_packet(net, m, info, skb,
262 ptr + sizeof(_icmp6h), 0); 262 ptr + sizeof(_icmp6h), 0);
263 nf_log_buf_add(m, "] "); 263 nf_log_buf_add(m, "] ");
264 } 264 }
@@ -278,7 +278,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
278 278
279 /* Max length: 15 "UID=4294967295 " */ 279 /* Max length: 15 "UID=4294967295 " */
280 if ((logflags & NF_LOG_UID) && recurse) 280 if ((logflags & NF_LOG_UID) && recurse)
281 nf_log_dump_sk_uid_gid(m, skb->sk); 281 nf_log_dump_sk_uid_gid(net, m, skb->sk);
282 282
283 /* Max length: 16 "MARK=0xFFFFFFFF " */ 283 /* Max length: 16 "MARK=0xFFFFFFFF " */
284 if (recurse && skb->mark) 284 if (recurse && skb->mark)
@@ -365,7 +365,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
365 if (in != NULL) 365 if (in != NULL)
366 dump_ipv6_mac_header(m, loginfo, skb); 366 dump_ipv6_mac_header(m, loginfo, skb);
367 367
368 dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); 368 dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1);
369 369
370 nf_log_buf_close(m); 370 nf_log_buf_close(m);
371} 371}
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
index bf1d6c421e3b..5dfd33af6451 100644
--- a/net/ipv6/netfilter/nf_tproxy_ipv6.c
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -55,7 +55,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
55 * to a listener socket if there's one */ 55 * to a listener socket if there's one */
56 struct sock *sk2; 56 struct sock *sk2;
57 57
58 sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto, 58 sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, tproto,
59 &iph->saddr, 59 &iph->saddr,
60 nf_tproxy_laddr6(skb, laddr, &iph->daddr), 60 nf_tproxy_laddr6(skb, laddr, &iph->daddr),
61 hp->source, 61 hp->source,
@@ -72,7 +72,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
72EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6); 72EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6);
73 73
74struct sock * 74struct sock *
75nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, 75nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
76 const u8 protocol, 76 const u8 protocol,
77 const struct in6_addr *saddr, const struct in6_addr *daddr, 77 const struct in6_addr *saddr, const struct in6_addr *daddr,
78 const __be16 sport, const __be16 dport, 78 const __be16 sport, const __be16 dport,
@@ -80,15 +80,20 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
80 const enum nf_tproxy_lookup_t lookup_type) 80 const enum nf_tproxy_lookup_t lookup_type)
81{ 81{
82 struct sock *sk; 82 struct sock *sk;
83 struct tcphdr *tcph;
84 83
85 switch (protocol) { 84 switch (protocol) {
86 case IPPROTO_TCP: 85 case IPPROTO_TCP: {
86 struct tcphdr _hdr, *hp;
87
88 hp = skb_header_pointer(skb, thoff,
89 sizeof(struct tcphdr), &_hdr);
90 if (hp == NULL)
91 return NULL;
92
87 switch (lookup_type) { 93 switch (lookup_type) {
88 case NF_TPROXY_LOOKUP_LISTENER: 94 case NF_TPROXY_LOOKUP_LISTENER:
89 tcph = hp;
90 sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, 95 sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
91 thoff + __tcp_hdrlen(tcph), 96 thoff + __tcp_hdrlen(hp),
92 saddr, sport, 97 saddr, sport,
93 daddr, ntohs(dport), 98 daddr, ntohs(dport),
94 in->ifindex, 0); 99 in->ifindex, 0);
@@ -110,6 +115,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
110 BUG(); 115 BUG();
111 } 116 }
112 break; 117 break;
118 }
113 case IPPROTO_UDP: 119 case IPPROTO_UDP:
114 sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, 120 sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
115 in->ifindex); 121 in->ifindex);
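
The nf_tproxy change above stops trusting a caller-supplied header pointer and instead re-reads the TCP header with skb_header_pointer(), bailing out when the packet is too short. A minimal userspace sketch of that bounded-read pattern follows; it is only an illustration, all names are hypothetical, and unlike the kernel helper it always copies into the caller's buffer.

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct tcp_hdr_min {
	uint16_t source;
	uint16_t dest;
	uint32_t seq;
};

/* Copy hdr_len bytes at offset into buf and return buf, or NULL if the
 * packet is too short to contain the requested header. */
static const void *header_pointer(const uint8_t *pkt, size_t pkt_len,
				  size_t offset, size_t hdr_len, void *buf)
{
	if (offset + hdr_len > pkt_len)
		return NULL;            /* truncated: refuse to parse */
	memcpy(buf, pkt + offset, hdr_len);
	return buf;
}

int main(void)
{
	uint8_t pkt[64] = { 0 };
	struct tcp_hdr_min _hdr;
	const struct tcp_hdr_min *hp;

	hp = header_pointer(pkt, sizeof(pkt), 40, sizeof(_hdr), &_hdr);
	printf("header %s\n", hp ? "present" : "truncated");
	return 0;
}
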
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 96f56bf49a30..4c04bccc7417 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -62,7 +62,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
62 struct dst_entry *dst; 62 struct dst_entry *dst;
63 struct rt6_info *rt; 63 struct rt6_info *rt;
64 struct pingfakehdr pfh; 64 struct pingfakehdr pfh;
65 struct sockcm_cookie junk = {0};
66 struct ipcm6_cookie ipc6; 65 struct ipcm6_cookie ipc6;
67 66
68 pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); 67 pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
@@ -119,7 +118,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
119 fl6.fl6_icmp_code = user_icmph.icmp6_code; 118 fl6.fl6_icmp_code = user_icmph.icmp6_code;
120 security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); 119 security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
121 120
122 ipc6.tclass = np->tclass; 121 ipcm6_init_sk(&ipc6, np);
123 fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); 122 fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
124 123
125 dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); 124 dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false);
@@ -142,13 +141,11 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
142 pfh.family = AF_INET6; 141 pfh.family = AF_INET6;
143 142
144 ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); 143 ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
145 ipc6.dontfrag = np->dontfrag;
146 ipc6.opt = NULL;
147 144
148 lock_sock(sk); 145 lock_sock(sk);
149 err = ip6_append_data(sk, ping_getfrag, &pfh, len, 146 err = ip6_append_data(sk, ping_getfrag, &pfh, len,
150 0, &ipc6, &fl6, rt, 147 0, &ipc6, &fl6, rt,
151 MSG_DONTWAIT, &junk); 148 MSG_DONTWAIT);
152 149
153 if (err) { 150 if (err) {
154 ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, 151 ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ce6f0d15b5dd..413d98bf24f4 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -620,7 +620,7 @@ out:
620 620
621static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, 621static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
622 struct flowi6 *fl6, struct dst_entry **dstp, 622 struct flowi6 *fl6, struct dst_entry **dstp,
623 unsigned int flags) 623 unsigned int flags, const struct sockcm_cookie *sockc)
624{ 624{
625 struct ipv6_pinfo *np = inet6_sk(sk); 625 struct ipv6_pinfo *np = inet6_sk(sk);
626 struct net *net = sock_net(sk); 626 struct net *net = sock_net(sk);
@@ -650,6 +650,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
650 skb->protocol = htons(ETH_P_IPV6); 650 skb->protocol = htons(ETH_P_IPV6);
651 skb->priority = sk->sk_priority; 651 skb->priority = sk->sk_priority;
652 skb->mark = sk->sk_mark; 652 skb->mark = sk->sk_mark;
653 skb->tstamp = sockc->transmit_time;
653 skb_dst_set(skb, &rt->dst); 654 skb_dst_set(skb, &rt->dst);
654 *dstp = NULL; 655 *dstp = NULL;
655 656
@@ -766,7 +767,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
766 struct dst_entry *dst = NULL; 767 struct dst_entry *dst = NULL;
767 struct raw6_frag_vec rfv; 768 struct raw6_frag_vec rfv;
768 struct flowi6 fl6; 769 struct flowi6 fl6;
769 struct sockcm_cookie sockc;
770 struct ipcm6_cookie ipc6; 770 struct ipcm6_cookie ipc6;
771 int addr_len = msg->msg_namelen; 771 int addr_len = msg->msg_namelen;
772 u16 proto; 772 u16 proto;
@@ -790,10 +790,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
790 fl6.flowi6_mark = sk->sk_mark; 790 fl6.flowi6_mark = sk->sk_mark;
791 fl6.flowi6_uid = sk->sk_uid; 791 fl6.flowi6_uid = sk->sk_uid;
792 792
793 ipc6.hlimit = -1; 793 ipcm6_init(&ipc6);
794 ipc6.tclass = -1; 794 ipc6.sockc.tsflags = sk->sk_tsflags;
795 ipc6.dontfrag = -1;
796 ipc6.opt = NULL;
797 795
798 if (sin6) { 796 if (sin6) {
799 if (addr_len < SIN6_LEN_RFC2133) 797 if (addr_len < SIN6_LEN_RFC2133)
@@ -847,14 +845,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
847 if (fl6.flowi6_oif == 0) 845 if (fl6.flowi6_oif == 0)
848 fl6.flowi6_oif = sk->sk_bound_dev_if; 846 fl6.flowi6_oif = sk->sk_bound_dev_if;
849 847
850 sockc.tsflags = sk->sk_tsflags;
851 if (msg->msg_controllen) { 848 if (msg->msg_controllen) {
852 opt = &opt_space; 849 opt = &opt_space;
853 memset(opt, 0, sizeof(struct ipv6_txoptions)); 850 memset(opt, 0, sizeof(struct ipv6_txoptions));
854 opt->tot_len = sizeof(struct ipv6_txoptions); 851 opt->tot_len = sizeof(struct ipv6_txoptions);
855 ipc6.opt = opt; 852 ipc6.opt = opt;
856 853
857 err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc); 854 err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
858 if (err < 0) { 855 if (err < 0) {
859 fl6_sock_release(flowlabel); 856 fl6_sock_release(flowlabel);
860 return err; 857 return err;
@@ -921,13 +918,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
921 918
922back_from_confirm: 919back_from_confirm:
923 if (inet->hdrincl) 920 if (inet->hdrincl)
924 err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags); 921 err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst,
922 msg->msg_flags, &ipc6.sockc);
925 else { 923 else {
926 ipc6.opt = opt; 924 ipc6.opt = opt;
927 lock_sock(sk); 925 lock_sock(sk);
928 err = ip6_append_data(sk, raw6_getfrag, &rfv, 926 err = ip6_append_data(sk, raw6_getfrag, &rfv,
929 len, 0, &ipc6, &fl6, (struct rt6_info *)dst, 927 len, 0, &ipc6, &fl6, (struct rt6_info *)dst,
930 msg->msg_flags, &sockc); 928 msg->msg_flags);
931 929
932 if (err) 930 if (err)
933 ip6_flush_pending_frames(sk); 931 ip6_flush_pending_frames(sk);
@@ -1334,7 +1332,7 @@ void raw6_proc_exit(void)
1334} 1332}
1335#endif /* CONFIG_PROC_FS */ 1333#endif /* CONFIG_PROC_FS */
1336 1334
1337/* Same as inet6_dgram_ops, sans udp_poll_mask. */ 1335/* Same as inet6_dgram_ops, sans udp_poll. */
1338const struct proto_ops inet6_sockraw_ops = { 1336const struct proto_ops inet6_sockraw_ops = {
1339 .family = PF_INET6, 1337 .family = PF_INET6,
1340 .owner = THIS_MODULE, 1338 .owner = THIS_MODULE,
@@ -1344,7 +1342,7 @@ const struct proto_ops inet6_sockraw_ops = {
1344 .socketpair = sock_no_socketpair, /* a do nothing */ 1342 .socketpair = sock_no_socketpair, /* a do nothing */
1345 .accept = sock_no_accept, /* a do nothing */ 1343 .accept = sock_no_accept, /* a do nothing */
1346 .getname = inet6_getname, 1344 .getname = inet6_getname,
1347 .poll_mask = datagram_poll_mask, /* ok */ 1345 .poll = datagram_poll, /* ok */
1348 .ioctl = inet6_ioctl, /* must change */ 1346 .ioctl = inet6_ioctl, /* must change */
1349 .listen = sock_no_listen, /* ok */ 1347 .listen = sock_no_listen, /* ok */
1350 .shutdown = inet_shutdown, /* ok */ 1348 .shutdown = inet_shutdown, /* ok */
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index b939b94e7e91..5c5b4f79296e 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -57,7 +57,7 @@
57#include <net/rawv6.h> 57#include <net/rawv6.h>
58#include <net/ndisc.h> 58#include <net/ndisc.h>
59#include <net/addrconf.h> 59#include <net/addrconf.h>
60#include <net/inet_frag.h> 60#include <net/ipv6_frag.h>
61#include <net/inet_ecn.h> 61#include <net/inet_ecn.h>
62 62
63static const char ip6_frag_cache_name[] = "ip6-frags"; 63static const char ip6_frag_cache_name[] = "ip6-frags";
@@ -72,61 +72,6 @@ static struct inet_frags ip6_frags;
72static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, 72static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
73 struct net_device *dev); 73 struct net_device *dev);
74 74
75void ip6_frag_init(struct inet_frag_queue *q, const void *a)
76{
77 struct frag_queue *fq = container_of(q, struct frag_queue, q);
78 const struct frag_v6_compare_key *key = a;
79
80 q->key.v6 = *key;
81 fq->ecn = 0;
82}
83EXPORT_SYMBOL(ip6_frag_init);
84
85void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
86{
87 struct net_device *dev = NULL;
88 struct sk_buff *head;
89
90 rcu_read_lock();
91 spin_lock(&fq->q.lock);
92
93 if (fq->q.flags & INET_FRAG_COMPLETE)
94 goto out;
95
96 inet_frag_kill(&fq->q);
97
98 dev = dev_get_by_index_rcu(net, fq->iif);
99 if (!dev)
100 goto out;
101
102 __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
103 __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
104
105 /* Don't send error if the first segment did not arrive. */
106 head = fq->q.fragments;
107 if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
108 goto out;
109
110 /* But use as source device on which LAST ARRIVED
111 * segment was received. And do not use fq->dev
112 * pointer directly, device might already disappeared.
113 */
114 head->dev = dev;
115 skb_get(head);
116 spin_unlock(&fq->q.lock);
117
118 icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
119 kfree_skb(head);
120 goto out_rcu_unlock;
121
122out:
123 spin_unlock(&fq->q.lock);
124out_rcu_unlock:
125 rcu_read_unlock();
126 inet_frag_put(&fq->q);
127}
128EXPORT_SYMBOL(ip6_expire_frag_queue);
129
130static void ip6_frag_expire(struct timer_list *t) 75static void ip6_frag_expire(struct timer_list *t)
131{ 76{
132 struct inet_frag_queue *frag = from_timer(frag, t, timer); 77 struct inet_frag_queue *frag = from_timer(frag, t, timer);
@@ -136,7 +81,7 @@ static void ip6_frag_expire(struct timer_list *t)
136 fq = container_of(frag, struct frag_queue, q); 81 fq = container_of(frag, struct frag_queue, q);
137 net = container_of(fq->q.net, struct net, ipv6.frags); 82 net = container_of(fq->q.net, struct net, ipv6.frags);
138 83
139 ip6_expire_frag_queue(net, fq); 84 ip6frag_expire_frag_queue(net, fq);
140} 85}
141 86
142static struct frag_queue * 87static struct frag_queue *
@@ -460,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
460 __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); 405 __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
461 rcu_read_unlock(); 406 rcu_read_unlock();
462 fq->q.fragments = NULL; 407 fq->q.fragments = NULL;
408 fq->q.rb_fragments = RB_ROOT;
463 fq->q.fragments_tail = NULL; 409 fq->q.fragments_tail = NULL;
464 return 1; 410 return 1;
465 411
@@ -510,6 +456,10 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
510 return 1; 456 return 1;
511 } 457 }
512 458
459 if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
460 fhdr->frag_off & htons(IP6_MF))
461 goto fail_hdr;
462
513 iif = skb->dev ? skb->dev->ifindex : 0; 463 iif = skb->dev ? skb->dev->ifindex : 0;
514 fq = fq_find(net, fhdr->identification, hdr, iif); 464 fq = fq_find(net, fhdr->identification, hdr, iif);
515 if (fq) { 465 if (fq) {
@@ -696,42 +646,19 @@ static struct pernet_operations ip6_frags_ops = {
696 .exit = ipv6_frags_exit_net, 646 .exit = ipv6_frags_exit_net,
697}; 647};
698 648
699static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) 649static const struct rhashtable_params ip6_rhash_params = {
700{
701 return jhash2(data,
702 sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
703}
704
705static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
706{
707 const struct inet_frag_queue *fq = data;
708
709 return jhash2((const u32 *)&fq->key.v6,
710 sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
711}
712
713static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
714{
715 const struct frag_v6_compare_key *key = arg->key;
716 const struct inet_frag_queue *fq = ptr;
717
718 return !!memcmp(&fq->key, key, sizeof(*key));
719}
720
721const struct rhashtable_params ip6_rhash_params = {
722 .head_offset = offsetof(struct inet_frag_queue, node), 650 .head_offset = offsetof(struct inet_frag_queue, node),
723 .hashfn = ip6_key_hashfn, 651 .hashfn = ip6frag_key_hashfn,
724 .obj_hashfn = ip6_obj_hashfn, 652 .obj_hashfn = ip6frag_obj_hashfn,
725 .obj_cmpfn = ip6_obj_cmpfn, 653 .obj_cmpfn = ip6frag_obj_cmpfn,
726 .automatic_shrinking = true, 654 .automatic_shrinking = true,
727}; 655};
728EXPORT_SYMBOL(ip6_rhash_params);
729 656
730int __init ipv6_frag_init(void) 657int __init ipv6_frag_init(void)
731{ 658{
732 int ret; 659 int ret;
733 660
734 ip6_frags.constructor = ip6_frag_init; 661 ip6_frags.constructor = ip6frag_init;
735 ip6_frags.destructor = NULL; 662 ip6_frags.destructor = NULL;
736 ip6_frags.qsize = sizeof(struct frag_queue); 663 ip6_frags.qsize = sizeof(struct frag_queue);
737 ip6_frags.frag_expire = ip6_frag_expire; 664 ip6_frags.frag_expire = ip6_frag_expire;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 86a0e4333d42..c4ea13e8360b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -956,7 +956,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
956 rt->dst.error = 0; 956 rt->dst.error = 0;
957 rt->dst.output = ip6_output; 957 rt->dst.output = ip6_output;
958 958
959 if (ort->fib6_type == RTN_LOCAL) { 959 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
960 rt->dst.input = ip6_input; 960 rt->dst.input = ip6_input;
961 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { 961 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962 rt->dst.input = ip6_mc_input; 962 rt->dst.input = ip6_mc_input;
@@ -972,18 +972,15 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
972 rt->dst.lastuse = jiffies; 972 rt->dst.lastuse = jiffies;
973} 973}
974 974
975/* Caller must already hold reference to @from */
975static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) 976static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
976{ 977{
977 rt->rt6i_flags &= ~RTF_EXPIRES; 978 rt->rt6i_flags &= ~RTF_EXPIRES;
978 fib6_info_hold(from);
979 rcu_assign_pointer(rt->from, from); 979 rcu_assign_pointer(rt->from, from);
980 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); 980 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
981 if (from->fib6_metrics != &dst_default_metrics) {
982 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
983 refcount_inc(&from->fib6_metrics->refcnt);
984 }
985} 981}
986 982
983/* Caller must already hold reference to @ort */
987static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) 984static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
988{ 985{
989 struct net_device *dev = fib6_info_nh_dev(ort); 986 struct net_device *dev = fib6_info_nh_dev(ort);
@@ -1044,9 +1041,14 @@ static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1044 struct net_device *dev = rt->fib6_nh.nh_dev; 1041 struct net_device *dev = rt->fib6_nh.nh_dev;
1045 struct rt6_info *nrt; 1042 struct rt6_info *nrt;
1046 1043
1044 if (!fib6_info_hold_safe(rt))
1045 return NULL;
1046
1047 nrt = ip6_dst_alloc(dev_net(dev), dev, flags); 1047 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1048 if (nrt) 1048 if (nrt)
1049 ip6_rt_copy_init(nrt, rt); 1049 ip6_rt_copy_init(nrt, rt);
1050 else
1051 fib6_info_release(rt);
1050 1052
1051 return nrt; 1053 return nrt;
1052} 1054}
@@ -1178,10 +1180,15 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1178 * Clone the route. 1180 * Clone the route.
1179 */ 1181 */
1180 1182
1183 if (!fib6_info_hold_safe(ort))
1184 return NULL;
1185
1181 dev = ip6_rt_get_dev_rcu(ort); 1186 dev = ip6_rt_get_dev_rcu(ort);
1182 rt = ip6_dst_alloc(dev_net(dev), dev, 0); 1187 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1183 if (!rt) 1188 if (!rt) {
1189 fib6_info_release(ort);
1184 return NULL; 1190 return NULL;
1191 }
1185 1192
1186 ip6_rt_copy_init(rt, ort); 1193 ip6_rt_copy_init(rt, ort);
1187 rt->rt6i_flags |= RTF_CACHE; 1194 rt->rt6i_flags |= RTF_CACHE;
@@ -1210,12 +1217,17 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1210 struct net_device *dev; 1217 struct net_device *dev;
1211 struct rt6_info *pcpu_rt; 1218 struct rt6_info *pcpu_rt;
1212 1219
1220 if (!fib6_info_hold_safe(rt))
1221 return NULL;
1222
1213 rcu_read_lock(); 1223 rcu_read_lock();
1214 dev = ip6_rt_get_dev_rcu(rt); 1224 dev = ip6_rt_get_dev_rcu(rt);
1215 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); 1225 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1216 rcu_read_unlock(); 1226 rcu_read_unlock();
1217 if (!pcpu_rt) 1227 if (!pcpu_rt) {
1228 fib6_info_release(rt);
1218 return NULL; 1229 return NULL;
1230 }
1219 ip6_rt_copy_init(pcpu_rt, rt); 1231 ip6_rt_copy_init(pcpu_rt, rt);
1220 pcpu_rt->rt6i_flags |= RTF_PCPU; 1232 pcpu_rt->rt6i_flags |= RTF_PCPU;
1221 return pcpu_rt; 1233 return pcpu_rt;
@@ -2486,7 +2498,7 @@ restart:
2486 2498
2487out: 2499out:
2488 if (ret) 2500 if (ret)
2489 dst_hold(&ret->dst); 2501 ip6_hold_safe(net, &ret, true);
2490 else 2502 else
2491 ret = ip6_create_rt_rcu(rt); 2503 ret = ip6_create_rt_rcu(rt);
2492 2504
@@ -3303,7 +3315,8 @@ static int ip6_route_del(struct fib6_config *cfg,
3303 continue; 3315 continue;
3304 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) 3316 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3305 continue; 3317 continue;
3306 fib6_info_hold(rt); 3318 if (!fib6_info_hold_safe(rt))
3319 continue;
3307 rcu_read_unlock(); 3320 rcu_read_unlock();
3308 3321
3309 /* if gateway was specified only delete the one hop */ 3322 /* if gateway was specified only delete the one hop */
@@ -3409,6 +3422,9 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
3409 3422
3410 rcu_read_lock(); 3423 rcu_read_lock();
3411 from = rcu_dereference(rt->from); 3424 from = rcu_dereference(rt->from);
3425 /* This fib6_info_hold() is safe here because we hold reference to rt
3426 * and rt already holds reference to fib6_info.
3427 */
3412 fib6_info_hold(from); 3428 fib6_info_hold(from);
3413 rcu_read_unlock(); 3429 rcu_read_unlock();
3414 3430
@@ -3470,7 +3486,8 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
3470 continue; 3486 continue;
3471 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) 3487 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3472 continue; 3488 continue;
3473 fib6_info_hold(rt); 3489 if (!fib6_info_hold_safe(rt))
3490 continue;
3474 break; 3491 break;
3475 } 3492 }
3476out: 3493out:
@@ -3530,8 +3547,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
3530 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) 3547 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3531 break; 3548 break;
3532 } 3549 }
3533 if (rt) 3550 if (rt && !fib6_info_hold_safe(rt))
3534 fib6_info_hold(rt); 3551 rt = NULL;
3535 rcu_read_unlock(); 3552 rcu_read_unlock();
3536 return rt; 3553 return rt;
3537} 3554}
@@ -3579,8 +3596,8 @@ restart:
3579 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; 3596 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3580 3597
3581 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3598 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3582 (!idev || idev->cnf.accept_ra != 2)) { 3599 (!idev || idev->cnf.accept_ra != 2) &&
3583 fib6_info_hold(rt); 3600 fib6_info_hold_safe(rt)) {
3584 rcu_read_unlock(); 3601 rcu_read_unlock();
3585 ip6_del_rt(net, rt); 3602 ip6_del_rt(net, rt);
3586 goto restart; 3603 goto restart;
@@ -3842,7 +3859,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3842 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3859 lockdep_is_held(&rt->fib6_table->tb6_lock));
3843 while (iter) { 3860 while (iter) {
3844 if (iter->fib6_metric == rt->fib6_metric && 3861 if (iter->fib6_metric == rt->fib6_metric &&
3845 iter->fib6_nsiblings) 3862 rt6_qualify_for_ecmp(iter))
3846 return iter; 3863 return iter;
3847 iter = rcu_dereference_protected(iter->fib6_next, 3864 iter = rcu_dereference_protected(iter->fib6_next,
3848 lockdep_is_held(&rt->fib6_table->tb6_lock)); 3865 lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -4388,6 +4405,13 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
4388 rt = NULL; 4405 rt = NULL;
4389 goto cleanup; 4406 goto cleanup;
4390 } 4407 }
4408 if (!rt6_qualify_for_ecmp(rt)) {
4409 err = -EINVAL;
4410 NL_SET_ERR_MSG(extack,
4411 "Device only routes can not be added for IPv6 using the multipath API.");
4412 fib6_info_release(rt);
4413 goto cleanup;
4414 }
4391 4415
4392 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; 4416 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4393 4417
@@ -4439,7 +4463,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
4439 */ 4463 */
4440 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 4464 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4441 NLM_F_REPLACE); 4465 NLM_F_REPLACE);
4442 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
4443 nhn++; 4466 nhn++;
4444 } 4467 }
4445 4468
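
Several hunks in the route.c diff above replace fib6_info_hold() with fib6_info_hold_safe(), which only takes a reference while the count is still nonzero, and release it on the error paths. A rough userspace sketch of that inc-not-zero idea is below; it uses C11 atomics with hypothetical names and is not the kernel's refcount_t implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_int refcnt;
};

/* Take a reference only if the object is still live; a count of zero
 * means teardown has started and the object must not be revived. */
static bool obj_hold_safe(struct obj *o)
{
	int old = atomic_load(&o->refcnt);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
			return true;
	}
	return false;
}

int main(void)
{
	struct obj live = { .refcnt = 1 };
	struct obj dead = { .refcnt = 0 };

	printf("live: %d, dead: %d\n",
	       obj_hold_safe(&live), obj_hold_safe(&dead));
	return 0;
}
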
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 0fdf2a55e746..8d0ba757a46c 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -17,6 +17,7 @@
17#include <linux/net.h> 17#include <linux/net.h>
18#include <linux/in6.h> 18#include <linux/in6.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/rhashtable.h>
20 21
21#include <net/ipv6.h> 22#include <net/ipv6.h>
22#include <net/protocol.h> 23#include <net/protocol.h>
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 33fb35cbfac1..8546f94f30d4 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -22,6 +22,7 @@
22#include <linux/icmpv6.h> 22#include <linux/icmpv6.h>
23#include <linux/mroute6.h> 23#include <linux/mroute6.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/rhashtable.h>
25 26
26#include <linux/netfilter.h> 27#include <linux/netfilter.h>
27#include <linux/netfilter_ipv6.h> 28#include <linux/netfilter_ipv6.h>
@@ -373,7 +374,7 @@ static int seg6_hmac_init_algo(void)
373 return -ENOMEM; 374 return -ENOMEM;
374 375
375 for_each_possible_cpu(cpu) { 376 for_each_possible_cpu(cpu) {
376 tfm = crypto_alloc_shash(algo->name, 0, GFP_KERNEL); 377 tfm = crypto_alloc_shash(algo->name, 0, 0);
377 if (IS_ERR(tfm)) 378 if (IS_ERR(tfm))
378 return PTR_ERR(tfm); 379 return PTR_ERR(tfm);
379 p_tfm = per_cpu_ptr(algo->tfms, cpu); 380 p_tfm = per_cpu_ptr(algo->tfms, cpu);
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 19ccf0dc996c..a8854dd3e9c5 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -101,7 +101,7 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
101 101
102 if (do_flowlabel > 0) { 102 if (do_flowlabel > 0) {
103 hash = skb_get_hash(skb); 103 hash = skb_get_hash(skb);
104 rol32(hash, 16); 104 hash = rol32(hash, 16);
105 flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; 105 flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
106 } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) { 106 } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) {
107 flowlabel = ip6_flowlabel(inner_hdr); 107 flowlabel = ip6_flowlabel(inner_hdr);
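
The one-line seg6_iptunnel fix above matters because rol32() is a pure function: unless its result is assigned back, the flow-label hash is left unrotated. A small standalone sketch of the before/after behaviour; the rol32() here is a local stand-in, not the kernel's <linux/bitops.h> version.

#include <stdint.h>
#include <stdio.h>

static inline uint32_t rol32(uint32_t word, unsigned int shift)
{
	return (word << (shift & 31)) | (word >> ((-shift) & 31));
}

int main(void)
{
	uint32_t hash = 0x12345678;

	rol32(hash, 16);          /* no effect: return value discarded */
	hash = rol32(hash, 16);   /* correct: hash is now 0x56781234 */
	printf("0x%08x\n", hash);
	return 0;
}
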
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index cd6e4cab63f6..60325dbfe88b 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -459,36 +459,57 @@ drop:
459 459
460DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); 460DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
461 461
462bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
463{
464 struct seg6_bpf_srh_state *srh_state =
465 this_cpu_ptr(&seg6_bpf_srh_states);
466 struct ipv6_sr_hdr *srh = srh_state->srh;
467
468 if (unlikely(srh == NULL))
469 return false;
470
471 if (unlikely(!srh_state->valid)) {
472 if ((srh_state->hdrlen & 7) != 0)
473 return false;
474
475 srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
476 if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))
477 return false;
478
479 srh_state->valid = true;
480 }
481
482 return true;
483}
484
462static int input_action_end_bpf(struct sk_buff *skb, 485static int input_action_end_bpf(struct sk_buff *skb,
463 struct seg6_local_lwt *slwt) 486 struct seg6_local_lwt *slwt)
464{ 487{
465 struct seg6_bpf_srh_state *srh_state = 488 struct seg6_bpf_srh_state *srh_state =
466 this_cpu_ptr(&seg6_bpf_srh_states); 489 this_cpu_ptr(&seg6_bpf_srh_states);
467 struct seg6_bpf_srh_state local_srh_state;
468 struct ipv6_sr_hdr *srh; 490 struct ipv6_sr_hdr *srh;
469 int srhoff = 0;
470 int ret; 491 int ret;
471 492
472 srh = get_and_validate_srh(skb); 493 srh = get_and_validate_srh(skb);
473 if (!srh) 494 if (!srh) {
474 goto drop; 495 kfree_skb(skb);
496 return -EINVAL;
497 }
475 advance_nextseg(srh, &ipv6_hdr(skb)->daddr); 498 advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
476 499
477 /* preempt_disable is needed to protect the per-CPU buffer srh_state, 500 /* preempt_disable is needed to protect the per-CPU buffer srh_state,
478 * which is also accessed by the bpf_lwt_seg6_* helpers 501 * which is also accessed by the bpf_lwt_seg6_* helpers
479 */ 502 */
480 preempt_disable(); 503 preempt_disable();
504 srh_state->srh = srh;
481 srh_state->hdrlen = srh->hdrlen << 3; 505 srh_state->hdrlen = srh->hdrlen << 3;
482 srh_state->valid = 1; 506 srh_state->valid = true;
483 507
484 rcu_read_lock(); 508 rcu_read_lock();
485 bpf_compute_data_pointers(skb); 509 bpf_compute_data_pointers(skb);
486 ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb); 510 ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb);
487 rcu_read_unlock(); 511 rcu_read_unlock();
488 512
489 local_srh_state = *srh_state;
490 preempt_enable();
491
492 switch (ret) { 513 switch (ret) {
493 case BPF_OK: 514 case BPF_OK:
494 case BPF_REDIRECT: 515 case BPF_REDIRECT:
@@ -500,24 +521,17 @@ static int input_action_end_bpf(struct sk_buff *skb,
500 goto drop; 521 goto drop;
501 } 522 }
502 523
503 if (unlikely((local_srh_state.hdrlen & 7) != 0)) 524 if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
504 goto drop;
505
506 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
507 goto drop;
508 srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
509 srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3);
510
511 if (!local_srh_state.valid &&
512 unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
513 goto drop; 525 goto drop;
514 526
527 preempt_enable();
515 if (ret != BPF_REDIRECT) 528 if (ret != BPF_REDIRECT)
516 seg6_lookup_nexthop(skb, NULL, 0); 529 seg6_lookup_nexthop(skb, NULL, 0);
517 530
518 return dst_input(skb); 531 return dst_input(skb);
519 532
520drop: 533drop:
534 preempt_enable();
521 kfree_skb(skb); 535 kfree_skb(skb);
522 return -EINVAL; 536 return -EINVAL;
523} 537}
@@ -637,12 +651,10 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
637 if (!seg6_validate_srh(srh, len)) 651 if (!seg6_validate_srh(srh, len))
638 return -EINVAL; 652 return -EINVAL;
639 653
640 slwt->srh = kmalloc(len, GFP_KERNEL); 654 slwt->srh = kmemdup(srh, len, GFP_KERNEL);
641 if (!slwt->srh) 655 if (!slwt->srh)
642 return -ENOMEM; 656 return -ENOMEM;
643 657
644 memcpy(slwt->srh, srh, len);
645
646 slwt->headroom += len; 658 slwt->headroom += len;
647 659
648 return 0; 660 return 0;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7efa9fd7e109..03e6b7a2bc53 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -938,7 +938,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
938 &tcp_hashinfo, NULL, 0, 938 &tcp_hashinfo, NULL, 0,
939 &ipv6h->saddr, 939 &ipv6h->saddr,
940 th->source, &ipv6h->daddr, 940 th->source, &ipv6h->daddr,
941 ntohs(th->source), tcp_v6_iif(skb), 941 ntohs(th->source),
942 tcp_v6_iif_l3_slave(skb),
942 tcp_v6_sdif(skb)); 943 tcp_v6_sdif(skb));
943 if (!sk1) 944 if (!sk1)
944 goto out; 945 goto out;
@@ -1609,7 +1610,8 @@ do_time_wait:
1609 skb, __tcp_hdrlen(th), 1610 skb, __tcp_hdrlen(th),
1610 &ipv6_hdr(skb)->saddr, th->source, 1611 &ipv6_hdr(skb)->saddr, th->source,
1611 &ipv6_hdr(skb)->daddr, 1612 &ipv6_hdr(skb)->daddr,
1612 ntohs(th->dest), tcp_v6_iif(skb), 1613 ntohs(th->dest),
1614 tcp_v6_iif_l3_slave(skb),
1613 sdif); 1615 sdif);
1614 if (sk2) { 1616 if (sk2) {
1615 struct inet_timewait_sock *tw = inet_twsk(sk); 1617 struct inet_timewait_sock *tw = inet_twsk(sk);
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index 278e49cd67d4..e72947c99454 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -15,8 +15,8 @@
15#include <net/ip6_checksum.h> 15#include <net/ip6_checksum.h>
16#include "ip6_offload.h" 16#include "ip6_offload.h"
17 17
18static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, 18static struct sk_buff *tcp6_gro_receive(struct list_head *head,
19 struct sk_buff *skb) 19 struct sk_buff *skb)
20{ 20{
21 /* Don't bother verifying checksum if we're going to flush anyway. */ 21 /* Don't bother verifying checksum if we're going to flush anyway. */
22 if (!NAPI_GRO_CB(skb)->flush && 22 if (!NAPI_GRO_CB(skb)->flush &&
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e6645cae403e..83f4c77c79d8 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -235,6 +235,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
235 exact_dif, hslot2, 235 exact_dif, hslot2,
236 skb); 236 skb);
237 } 237 }
238 if (unlikely(IS_ERR(result)))
239 return NULL;
238 return result; 240 return result;
239 } 241 }
240begin: 242begin:
@@ -249,6 +251,8 @@ begin:
249 saddr, sport); 251 saddr, sport);
250 result = reuseport_select_sock(sk, hash, skb, 252 result = reuseport_select_sock(sk, hash, skb,
251 sizeof(struct udphdr)); 253 sizeof(struct udphdr));
254 if (unlikely(IS_ERR(result)))
255 return NULL;
252 if (result) 256 if (result)
253 return result; 257 return result;
254 } 258 }
@@ -1141,13 +1145,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1141 int err; 1145 int err;
1142 int is_udplite = IS_UDPLITE(sk); 1146 int is_udplite = IS_UDPLITE(sk);
1143 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); 1147 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
1144 struct sockcm_cookie sockc;
1145 1148
1146 ipc6.hlimit = -1; 1149 ipcm6_init(&ipc6);
1147 ipc6.tclass = -1;
1148 ipc6.dontfrag = -1;
1149 ipc6.gso_size = up->gso_size; 1150 ipc6.gso_size = up->gso_size;
1150 sockc.tsflags = sk->sk_tsflags; 1151 ipc6.sockc.tsflags = sk->sk_tsflags;
1151 1152
1152 /* destination address check */ 1153 /* destination address check */
1153 if (sin6) { 1154 if (sin6) {
@@ -1282,7 +1283,7 @@ do_udp_sendmsg:
1282 err = udp_cmsg_send(sk, msg, &ipc6.gso_size); 1283 err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
1283 if (err > 0) 1284 if (err > 0)
1284 err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, 1285 err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
1285 &ipc6, &sockc); 1286 &ipc6);
1286 if (err < 0) { 1287 if (err < 0) {
1287 fl6_sock_release(flowlabel); 1288 fl6_sock_release(flowlabel);
1288 return err; 1289 return err;
@@ -1376,7 +1377,7 @@ back_from_confirm:
1376 skb = ip6_make_skb(sk, getfrag, msg, ulen, 1377 skb = ip6_make_skb(sk, getfrag, msg, ulen,
1377 sizeof(struct udphdr), &ipc6, 1378 sizeof(struct udphdr), &ipc6,
1378 &fl6, (struct rt6_info *)dst, 1379 &fl6, (struct rt6_info *)dst,
1379 msg->msg_flags, &cork, &sockc); 1380 msg->msg_flags, &cork);
1380 err = PTR_ERR(skb); 1381 err = PTR_ERR(skb);
1381 if (!IS_ERR_OR_NULL(skb)) 1382 if (!IS_ERR_OR_NULL(skb))
1382 err = udp_v6_send_skb(skb, &fl6, &cork.base); 1383 err = udp_v6_send_skb(skb, &fl6, &cork.base);
@@ -1402,7 +1403,7 @@ do_append_data:
1402 up->len += ulen; 1403 up->len += ulen;
1403 err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), 1404 err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
1404 &ipc6, &fl6, (struct rt6_info *)dst, 1405 &ipc6, &fl6, (struct rt6_info *)dst,
1405 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, &sockc); 1406 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
1406 if (err) 1407 if (err)
1407 udp_v6_flush_pending_frames(sk); 1408 udp_v6_flush_pending_frames(sk);
1408 else if (!corkreq) 1409 else if (!corkreq)
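
The udp.c hunks above start treating an ERR_PTR value returned by reuseport_select_sock() as "no socket found". For orientation, here is a rough userspace sketch of the errno-in-pointer convention that IS_ERR()/PTR_ERR() rely on; it is simplified, the function names in main() are hypothetical, and the kernel's real definitions live in <linux/err.h>.

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

#define MAX_ERRNO 4095

/* Encode a negative errno in the last page of the pointer range so one
 * return value can carry either a valid pointer or an error code. */
static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

/* Hypothetical lookup that either succeeds or reports -ENOMEM. */
static void *lookup(int fail)
{
	static int real_object;

	return fail ? ERR_PTR(-ENOMEM) : (void *)&real_object;
}

int main(void)
{
	void *result = lookup(1);

	if (IS_ERR(result))
		printf("lookup failed: errno %ld\n", -PTR_ERR(result));
	else
		printf("got %p\n", result);
	return 0;
}
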
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 03a2ff3fe1e6..95dee9ca8d22 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -114,8 +114,8 @@ out:
114 return segs; 114 return segs;
115} 115}
116 116
117static struct sk_buff **udp6_gro_receive(struct sk_buff **head, 117static struct sk_buff *udp6_gro_receive(struct list_head *head,
118 struct sk_buff *skb) 118 struct sk_buff *skb)
119{ 119{
120 struct udphdr *uh = udp_gro_udphdr(skb); 120 struct udphdr *uh = udp_gro_udphdr(skb);
121 121
diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c
index 07d36573f50b..da28e4407b8f 100644
--- a/net/ipv6/xfrm6_mode_ro.c
+++ b/net/ipv6/xfrm6_mode_ro.c
@@ -55,7 +55,7 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
55 __skb_pull(skb, hdr_len); 55 __skb_pull(skb, hdr_len);
56 memmove(ipv6_hdr(skb), iph, hdr_len); 56 memmove(ipv6_hdr(skb), iph, hdr_len);
57 57
58 x->lastused = get_seconds(); 58 x->lastused = ktime_get_real_seconds();
59 59
60 return 0; 60 return 0;
61} 61}
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 68e86257a549..a21d8ed0a325 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -150,7 +150,6 @@ static int afiucv_pm_freeze(struct device *dev)
150{ 150{
151 struct iucv_sock *iucv; 151 struct iucv_sock *iucv;
152 struct sock *sk; 152 struct sock *sk;
153 int err = 0;
154 153
155#ifdef CONFIG_PM_DEBUG 154#ifdef CONFIG_PM_DEBUG
156 printk(KERN_WARNING "afiucv_pm_freeze\n"); 155 printk(KERN_WARNING "afiucv_pm_freeze\n");
@@ -175,7 +174,7 @@ static int afiucv_pm_freeze(struct device *dev)
175 skb_queue_purge(&iucv->backlog_skb_q); 174 skb_queue_purge(&iucv->backlog_skb_q);
176 } 175 }
177 read_unlock(&iucv_sk_list.lock); 176 read_unlock(&iucv_sk_list.lock);
178 return err; 177 return 0;
179} 178}
180 179
181/** 180/**
@@ -1488,11 +1487,14 @@ static inline __poll_t iucv_accept_poll(struct sock *parent)
1488 return 0; 1487 return 0;
1489} 1488}
1490 1489
1491static __poll_t iucv_sock_poll_mask(struct socket *sock, __poll_t events) 1490__poll_t iucv_sock_poll(struct file *file, struct socket *sock,
1491 poll_table *wait)
1492{ 1492{
1493 struct sock *sk = sock->sk; 1493 struct sock *sk = sock->sk;
1494 __poll_t mask = 0; 1494 __poll_t mask = 0;
1495 1495
1496 sock_poll_wait(file, wait);
1497
1496 if (sk->sk_state == IUCV_LISTEN) 1498 if (sk->sk_state == IUCV_LISTEN)
1497 return iucv_accept_poll(sk); 1499 return iucv_accept_poll(sk);
1498 1500
@@ -2385,7 +2387,7 @@ static const struct proto_ops iucv_sock_ops = {
2385 .getname = iucv_sock_getname, 2387 .getname = iucv_sock_getname,
2386 .sendmsg = iucv_sock_sendmsg, 2388 .sendmsg = iucv_sock_sendmsg,
2387 .recvmsg = iucv_sock_recvmsg, 2389 .recvmsg = iucv_sock_recvmsg,
2388 .poll_mask = iucv_sock_poll_mask, 2390 .poll = iucv_sock_poll,
2389 .ioctl = sock_no_ioctl, 2391 .ioctl = sock_no_ioctl,
2390 .mmap = sock_no_mmap, 2392 .mmap = sock_no_mmap,
2391 .socketpair = sock_no_socketpair, 2393 .socketpair = sock_no_socketpair,
@@ -2512,4 +2514,3 @@ MODULE_DESCRIPTION("IUCV Sockets ver " VERSION);
2512MODULE_VERSION(VERSION); 2514MODULE_VERSION(VERSION);
2513MODULE_LICENSE("GPL"); 2515MODULE_LICENSE("GPL");
2514MODULE_ALIAS_NETPROTO(PF_IUCV); 2516MODULE_ALIAS_NETPROTO(PF_IUCV);
2515
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
index 87fca36e6c47..9ca83f2ade6f 100644
--- a/net/kcm/Kconfig
+++ b/net/kcm/Kconfig
@@ -8,4 +8,3 @@ config AF_KCM
8 KCM (Kernel Connection Multiplexor) sockets provide a method 8 KCM (Kernel Connection Multiplexor) sockets provide a method
9 for multiplexing messages of a message based application 9 for multiplexing messages of a message based application
10 protocol over kernel connectons (e.g. TCP connections). 10 protocol over kernel connectons (e.g. TCP connections).
11
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 84b7d5c6fec8..571d824e4e24 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1336,9 +1336,9 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
1336 struct list_head *head; 1336 struct list_head *head;
1337 int index = 0; 1337 int index = 0;
1338 1338
1339 /* For SOCK_SEQPACKET sock type, datagram_poll_mask checks the sk_state, 1339 /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
1340 * so we set sk_state, otherwise epoll_wait always returns right away 1340 * we set sk_state, otherwise epoll_wait always returns right away with
1341 * with EPOLLHUP 1341 * EPOLLHUP
1342 */ 1342 */
1343 kcm->sk.sk_state = TCP_ESTABLISHED; 1343 kcm->sk.sk_state = TCP_ESTABLISHED;
1344 1344
@@ -1903,7 +1903,7 @@ static const struct proto_ops kcm_dgram_ops = {
1903 .socketpair = sock_no_socketpair, 1903 .socketpair = sock_no_socketpair,
1904 .accept = sock_no_accept, 1904 .accept = sock_no_accept,
1905 .getname = sock_no_getname, 1905 .getname = sock_no_getname,
1906 .poll_mask = datagram_poll_mask, 1906 .poll = datagram_poll,
1907 .ioctl = kcm_ioctl, 1907 .ioctl = kcm_ioctl,
1908 .listen = sock_no_listen, 1908 .listen = sock_no_listen,
1909 .shutdown = sock_no_shutdown, 1909 .shutdown = sock_no_shutdown,
@@ -1924,7 +1924,7 @@ static const struct proto_ops kcm_seqpacket_ops = {
1924 .socketpair = sock_no_socketpair, 1924 .socketpair = sock_no_socketpair,
1925 .accept = sock_no_accept, 1925 .accept = sock_no_accept,
1926 .getname = sock_no_getname, 1926 .getname = sock_no_getname,
1927 .poll_mask = datagram_poll_mask, 1927 .poll = datagram_poll,
1928 .ioctl = kcm_ioctl, 1928 .ioctl = kcm_ioctl,
1929 .listen = sock_no_listen, 1929 .listen = sock_no_listen,
1930 .shutdown = sock_no_shutdown, 1930 .shutdown = sock_no_shutdown,
@@ -2104,4 +2104,3 @@ module_exit(kcm_exit);
2104 2104
2105MODULE_LICENSE("GPL"); 2105MODULE_LICENSE("GPL");
2106MODULE_ALIAS_NETPROTO(PF_KCM); 2106MODULE_ALIAS_NETPROTO(PF_KCM);
2107
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 8bdc1cbe490a..9d61266526e7 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1383,7 +1383,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
1383 } 1383 }
1384 1384
1385 if (!x) 1385 if (!x)
1386 x = xfrm_find_acq(net, &dummy_mark, mode, reqid, proto, xdaddr, xsaddr, 1, family); 1386 x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family);
1387 1387
1388 if (x == NULL) 1388 if (x == NULL)
1389 return -ENOENT; 1389 return -ENOENT;
@@ -2414,7 +2414,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa
2414 return err; 2414 return err;
2415 } 2415 }
2416 2416
2417 xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN, 2417 xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
2418 pol->sadb_x_policy_dir - 1, &sel, pol_ctx, 2418 pol->sadb_x_policy_dir - 1, &sel, pol_ctx,
2419 1, &err); 2419 1, &err);
2420 security_xfrm_policy_free(pol_ctx); 2420 security_xfrm_policy_free(pol_ctx);
@@ -2663,7 +2663,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_
2663 return -EINVAL; 2663 return -EINVAL;
2664 2664
2665 delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2); 2665 delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2);
2666 xp = xfrm_policy_byid(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN, 2666 xp = xfrm_policy_byid(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
2667 dir, pol->sadb_x_policy_id, delete, &err); 2667 dir, pol->sadb_x_policy_id, delete, &err);
2668 if (xp == NULL) 2668 if (xp == NULL)
2669 return -ENOENT; 2669 return -ENOENT;
@@ -3751,7 +3751,7 @@ static const struct proto_ops pfkey_ops = {
3751 3751
3752 /* Now the operations that really occur. */ 3752 /* Now the operations that really occur. */
3753 .release = pfkey_release, 3753 .release = pfkey_release,
3754 .poll_mask = datagram_poll_mask, 3754 .poll = datagram_poll,
3755 .sendmsg = pfkey_sendmsg, 3755 .sendmsg = pfkey_sendmsg,
3756 .recvmsg = pfkey_recvmsg, 3756 .recvmsg = pfkey_recvmsg,
3757}; 3757};
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 40261cb68e83..82cdf9020b53 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -203,44 +203,44 @@ struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth)
203} 203}
204EXPORT_SYMBOL_GPL(l2tp_tunnel_get_nth); 204EXPORT_SYMBOL_GPL(l2tp_tunnel_get_nth);
205 205
206/* Lookup a session. A new reference is held on the returned session. */ 206struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel,
207struct l2tp_session *l2tp_session_get(const struct net *net, 207 u32 session_id)
208 struct l2tp_tunnel *tunnel,
209 u32 session_id)
210{ 208{
211 struct hlist_head *session_list; 209 struct hlist_head *session_list;
212 struct l2tp_session *session; 210 struct l2tp_session *session;
213 211
214 if (!tunnel) { 212 session_list = l2tp_session_id_hash(tunnel, session_id);
215 struct l2tp_net *pn = l2tp_pernet(net);
216
217 session_list = l2tp_session_id_hash_2(pn, session_id);
218 213
219 rcu_read_lock_bh(); 214 read_lock_bh(&tunnel->hlist_lock);
220 hlist_for_each_entry_rcu(session, session_list, global_hlist) { 215 hlist_for_each_entry(session, session_list, hlist)
221 if (session->session_id == session_id) { 216 if (session->session_id == session_id) {
222 l2tp_session_inc_refcount(session); 217 l2tp_session_inc_refcount(session);
223 rcu_read_unlock_bh(); 218 read_unlock_bh(&tunnel->hlist_lock);
224 219
225 return session; 220 return session;
226 }
227 } 221 }
228 rcu_read_unlock_bh(); 222 read_unlock_bh(&tunnel->hlist_lock);
229 223
230 return NULL; 224 return NULL;
231 } 225}
226EXPORT_SYMBOL_GPL(l2tp_tunnel_get_session);
232 227
233 session_list = l2tp_session_id_hash(tunnel, session_id); 228struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id)
234 read_lock_bh(&tunnel->hlist_lock); 229{
235 hlist_for_each_entry(session, session_list, hlist) { 230 struct hlist_head *session_list;
231 struct l2tp_session *session;
232
233 session_list = l2tp_session_id_hash_2(l2tp_pernet(net), session_id);
234
235 rcu_read_lock_bh();
236 hlist_for_each_entry_rcu(session, session_list, global_hlist)
236 if (session->session_id == session_id) { 237 if (session->session_id == session_id) {
237 l2tp_session_inc_refcount(session); 238 l2tp_session_inc_refcount(session);
238 read_unlock_bh(&tunnel->hlist_lock); 239 rcu_read_unlock_bh();
239 240
240 return session; 241 return session;
241 } 242 }
242 } 243 rcu_read_unlock_bh();
243 read_unlock_bh(&tunnel->hlist_lock);
244 244
245 return NULL; 245 return NULL;
246} 246}
@@ -322,8 +322,7 @@ int l2tp_session_register(struct l2tp_session *session,
322 322
323 if (tunnel->version == L2TP_HDR_VER_3) { 323 if (tunnel->version == L2TP_HDR_VER_3) {
324 pn = l2tp_pernet(tunnel->l2tp_net); 324 pn = l2tp_pernet(tunnel->l2tp_net);
325 g_head = l2tp_session_id_hash_2(l2tp_pernet(tunnel->l2tp_net), 325 g_head = l2tp_session_id_hash_2(pn, session->session_id);
326 session->session_id);
327 326
328 spin_lock_bh(&pn->l2tp_session_hlist_lock); 327 spin_lock_bh(&pn->l2tp_session_hlist_lock);
329 328
@@ -620,7 +619,7 @@ discard:
620 */ 619 */
621void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, 620void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
622 unsigned char *ptr, unsigned char *optr, u16 hdrflags, 621 unsigned char *ptr, unsigned char *optr, u16 hdrflags,
623 int length, int (*payload_hook)(struct sk_buff *skb)) 622 int length)
624{ 623{
625 struct l2tp_tunnel *tunnel = session->tunnel; 624 struct l2tp_tunnel *tunnel = session->tunnel;
626 int offset; 625 int offset;
@@ -741,13 +740,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
741 740
742 __skb_pull(skb, offset); 741 __skb_pull(skb, offset);
743 742
744 /* If caller wants to process the payload before we queue the
745 * packet, do so now.
746 */
747 if (payload_hook)
748 if ((*payload_hook)(skb))
749 goto discard;
750
751 /* Prepare skb for adding to the session's reorder_q. Hold 743 /* Prepare skb for adding to the session's reorder_q. Hold
752 * packets for max reorder_timeout or 1 second if not 744 * packets for max reorder_timeout or 1 second if not
753 * reordering. 745 * reordering.
@@ -783,7 +775,7 @@ EXPORT_SYMBOL(l2tp_recv_common);
783 775
784/* Drop skbs from the session's reorder_q 776/* Drop skbs from the session's reorder_q
785 */ 777 */
786int l2tp_session_queue_purge(struct l2tp_session *session) 778static int l2tp_session_queue_purge(struct l2tp_session *session)
787{ 779{
788 struct sk_buff *skb = NULL; 780 struct sk_buff *skb = NULL;
789 BUG_ON(!session); 781 BUG_ON(!session);
@@ -794,7 +786,6 @@ int l2tp_session_queue_purge(struct l2tp_session *session)
794 } 786 }
795 return 0; 787 return 0;
796} 788}
797EXPORT_SYMBOL_GPL(l2tp_session_queue_purge);
798 789
799/* Internal UDP receive frame. Do the real work of receiving an L2TP data frame 790/* Internal UDP receive frame. Do the real work of receiving an L2TP data frame
800 * here. The skb is not on a list when we get here. 791 * here. The skb is not on a list when we get here.
@@ -802,8 +793,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_queue_purge);
802 * Returns 1 if the packet was not a good data packet and could not be 793 * Returns 1 if the packet was not a good data packet and could not be
803 * forwarded. All such packets are passed up to userspace to deal with. 794 * forwarded. All such packets are passed up to userspace to deal with.
804 */ 795 */
805static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, 796static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
806 int (*payload_hook)(struct sk_buff *skb))
807{ 797{
808 struct l2tp_session *session = NULL; 798 struct l2tp_session *session = NULL;
809 unsigned char *ptr, *optr; 799 unsigned char *ptr, *optr;
@@ -882,7 +872,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
882 } 872 }
883 873
884 /* Find the session context */ 874 /* Find the session context */
885 session = l2tp_session_get(tunnel->l2tp_net, tunnel, session_id); 875 session = l2tp_tunnel_get_session(tunnel, session_id);
886 if (!session || !session->recv_skb) { 876 if (!session || !session->recv_skb) {
887 if (session) 877 if (session)
888 l2tp_session_dec_refcount(session); 878 l2tp_session_dec_refcount(session);
@@ -894,7 +884,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
894 goto error; 884 goto error;
895 } 885 }
896 886
897 l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook); 887 l2tp_recv_common(session, skb, ptr, optr, hdrflags, length);
898 l2tp_session_dec_refcount(session); 888 l2tp_session_dec_refcount(session);
899 889
900 return 0; 890 return 0;
@@ -923,7 +913,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
923 l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n", 913 l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n",
924 tunnel->name, skb->len); 914 tunnel->name, skb->len);
925 915
926 if (l2tp_udp_recv_core(tunnel, skb, tunnel->recv_payload_hook)) 916 if (l2tp_udp_recv_core(tunnel, skb))
927 goto pass_up; 917 goto pass_up;
928 918
929 return 0; 919 return 0;
@@ -1009,8 +999,8 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
1009 return bufp - optr; 999 return bufp - optr;
1010} 1000}
1011 1001
1012static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, 1002static void l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
1013 struct flowi *fl, size_t data_len) 1003 struct flowi *fl, size_t data_len)
1014{ 1004{
1015 struct l2tp_tunnel *tunnel = session->tunnel; 1005 struct l2tp_tunnel *tunnel = session->tunnel;
1016 unsigned int len = skb->len; 1006 unsigned int len = skb->len;
@@ -1052,8 +1042,6 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
1052 atomic_long_inc(&tunnel->stats.tx_errors); 1042 atomic_long_inc(&tunnel->stats.tx_errors);
1053 atomic_long_inc(&session->stats.tx_errors); 1043 atomic_long_inc(&session->stats.tx_errors);
1054 } 1044 }
1055
1056 return 0;
1057} 1045}
1058 1046
1059/* If caller requires the skb to have a ppp header, the header must be 1047/* If caller requires the skb to have a ppp header, the header must be
@@ -1110,7 +1098,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
1110 1098
1111 /* Get routing info from the tunnel socket */ 1099 /* Get routing info from the tunnel socket */
1112 skb_dst_drop(skb); 1100 skb_dst_drop(skb);
1113 skb_dst_set(skb, dst_clone(__sk_dst_check(sk, 0))); 1101 skb_dst_set(skb, sk_dst_check(sk, 0));
1114 1102
1115 inet = inet_sk(sk); 1103 inet = inet_sk(sk);
1116 fl = &inet->cork.fl; 1104 fl = &inet->cork.fl;
@@ -1193,7 +1181,7 @@ end:
1193 1181
1194/* When the tunnel is closed, all the attached sessions need to go too. 1182/* When the tunnel is closed, all the attached sessions need to go too.
1195 */ 1183 */
1196void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) 1184static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
1197{ 1185{
1198 int hash; 1186 int hash;
1199 struct hlist_node *walk; 1187 struct hlist_node *walk;
@@ -1242,7 +1230,6 @@ again:
1242 } 1230 }
1243 write_unlock_bh(&tunnel->hlist_lock); 1231 write_unlock_bh(&tunnel->hlist_lock);
1244} 1232}
1245EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall);
1246 1233
1247/* Tunnel socket destroy hook for UDP encapsulation */ 1234/* Tunnel socket destroy hook for UDP encapsulation */
1248static void l2tp_udp_encap_destroy(struct sock *sk) 1235static void l2tp_udp_encap_destroy(struct sock *sk)
@@ -1687,8 +1674,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
1687 if (cfg) { 1674 if (cfg) {
1688 session->pwtype = cfg->pw_type; 1675 session->pwtype = cfg->pw_type;
1689 session->debug = cfg->debug; 1676 session->debug = cfg->debug;
1690 session->mtu = cfg->mtu;
1691 session->mru = cfg->mru;
1692 session->send_seq = cfg->send_seq; 1677 session->send_seq = cfg->send_seq;
1693 session->recv_seq = cfg->recv_seq; 1678 session->recv_seq = cfg->recv_seq;
1694 session->lns_mode = cfg->lns_mode; 1679 session->lns_mode = cfg->lns_mode;
@@ -1800,4 +1785,3 @@ MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
1800MODULE_DESCRIPTION("L2TP core"); 1785MODULE_DESCRIPTION("L2TP core");
1801MODULE_LICENSE("GPL"); 1786MODULE_LICENSE("GPL");
1802MODULE_VERSION(L2TP_DRV_VERSION); 1787MODULE_VERSION(L2TP_DRV_VERSION);
1803
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index c199020f8a8a..9c9afe94d389 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -12,6 +12,13 @@
12#ifndef _L2TP_CORE_H_ 12#ifndef _L2TP_CORE_H_
13#define _L2TP_CORE_H_ 13#define _L2TP_CORE_H_
14 14
15#include <net/dst.h>
16#include <net/sock.h>
17
18#ifdef CONFIG_XFRM
19#include <net/xfrm.h>
20#endif
21
15/* Just some random numbers */ 22/* Just some random numbers */
16#define L2TP_TUNNEL_MAGIC 0x42114DDA 23#define L2TP_TUNNEL_MAGIC 0x42114DDA
17#define L2TP_SESSION_MAGIC 0x0C04EB7D 24#define L2TP_SESSION_MAGIC 0x0C04EB7D
@@ -45,10 +52,6 @@ struct l2tp_tunnel;
45 */ 52 */
46struct l2tp_session_cfg { 53struct l2tp_session_cfg {
47 enum l2tp_pwtype pw_type; 54 enum l2tp_pwtype pw_type;
48 unsigned int data_seq:2; /* data sequencing level
49 * 0 => none, 1 => IP only,
50 * 2 => all
51 */
52 unsigned int recv_seq:1; /* expect receive packets with 55 unsigned int recv_seq:1; /* expect receive packets with
53 * sequence numbers? */ 56 * sequence numbers? */
54 unsigned int send_seq:1; /* send packets with sequence 57 unsigned int send_seq:1; /* send packets with sequence
@@ -58,7 +61,6 @@ struct l2tp_session_cfg {
58 * control of LNS. */ 61 * control of LNS. */
59 int debug; /* bitmask of debug message 62 int debug; /* bitmask of debug message
60 * categories */ 63 * categories */
61 u16 vlan_id; /* VLAN pseudowire only */
62 u16 l2specific_type; /* Layer 2 specific type */ 64 u16 l2specific_type; /* Layer 2 specific type */
63 u8 cookie[8]; /* optional cookie */ 65 u8 cookie[8]; /* optional cookie */
64 int cookie_len; /* 0, 4 or 8 bytes */ 66 int cookie_len; /* 0, 4 or 8 bytes */
@@ -66,8 +68,6 @@ struct l2tp_session_cfg {
66 int peer_cookie_len; /* 0, 4 or 8 bytes */ 68 int peer_cookie_len; /* 0, 4 or 8 bytes */
67 int reorder_timeout; /* configured reorder timeout 69 int reorder_timeout; /* configured reorder timeout
68 * (in jiffies) */ 70 * (in jiffies) */
69 int mtu;
70 int mru;
71 char *ifname; 71 char *ifname;
72}; 72};
73 73
@@ -99,10 +99,6 @@ struct l2tp_session {
99 99
100 char name[32]; /* for logging */ 100 char name[32]; /* for logging */
101 char ifname[IFNAMSIZ]; 101 char ifname[IFNAMSIZ];
102 unsigned int data_seq:2; /* data sequencing level
103 * 0 => none, 1 => IP only,
104 * 2 => all
105 */
106 unsigned int recv_seq:1; /* expect receive packets with 102 unsigned int recv_seq:1; /* expect receive packets with
107 * sequence numbers? */ 103 * sequence numbers? */
108 unsigned int send_seq:1; /* send packets with sequence 104 unsigned int send_seq:1; /* send packets with sequence
@@ -115,8 +111,6 @@ struct l2tp_session {
115 int reorder_timeout; /* configured reorder timeout 111 int reorder_timeout; /* configured reorder timeout
116 * (in jiffies) */ 112 * (in jiffies) */
117 int reorder_skip; /* set if skip to next nr */ 113 int reorder_skip; /* set if skip to next nr */
118 int mtu;
119 int mru;
120 enum l2tp_pwtype pwtype; 114 enum l2tp_pwtype pwtype;
121 struct l2tp_stats stats; 115 struct l2tp_stats stats;
122 struct hlist_node global_hlist; /* Global hash list node */ 116 struct hlist_node global_hlist; /* Global hash list node */
@@ -124,9 +118,7 @@ struct l2tp_session {
124 int (*build_header)(struct l2tp_session *session, void *buf); 118 int (*build_header)(struct l2tp_session *session, void *buf);
125 void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len); 119 void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len);
126 void (*session_close)(struct l2tp_session *session); 120 void (*session_close)(struct l2tp_session *session);
127#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
128 void (*show)(struct seq_file *m, void *priv); 121 void (*show)(struct seq_file *m, void *priv);
129#endif
130 uint8_t priv[0]; /* private data */ 122 uint8_t priv[0]; /* private data */
131}; 123};
132 124
@@ -180,18 +172,12 @@ struct l2tp_tunnel {
180 struct net *l2tp_net; /* the net we belong to */ 172 struct net *l2tp_net; /* the net we belong to */
181 173
182 refcount_t ref_count; 174 refcount_t ref_count;
183#ifdef CONFIG_DEBUG_FS
184 void (*show)(struct seq_file *m, void *arg);
185#endif
186 int (*recv_payload_hook)(struct sk_buff *skb);
187 void (*old_sk_destruct)(struct sock *); 175 void (*old_sk_destruct)(struct sock *);
188 struct sock *sock; /* Parent socket */ 176 struct sock *sock; /* Parent socket */
189 int fd; /* Parent fd, if tunnel socket 177 int fd; /* Parent fd, if tunnel socket
190 * was created by userspace */ 178 * was created by userspace */
191 179
192 struct work_struct del_work; 180 struct work_struct del_work;
193
194 uint8_t priv[0]; /* private data */
195}; 181};
196 182
197struct l2tp_nl_cmd_ops { 183struct l2tp_nl_cmd_ops {
@@ -201,11 +187,6 @@ struct l2tp_nl_cmd_ops {
201 int (*session_delete)(struct l2tp_session *session); 187 int (*session_delete)(struct l2tp_session *session);
202}; 188};
203 189
204static inline void *l2tp_tunnel_priv(struct l2tp_tunnel *tunnel)
205{
206 return &tunnel->priv[0];
207}
208
209static inline void *l2tp_session_priv(struct l2tp_session *session) 190static inline void *l2tp_session_priv(struct l2tp_session *session)
210{ 191{
211 return &session->priv[0]; 192 return &session->priv[0];
@@ -213,12 +194,12 @@ static inline void *l2tp_session_priv(struct l2tp_session *session)
213 194
214struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); 195struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id);
215struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth); 196struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth);
197struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel,
198 u32 session_id);
216 199
217void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); 200void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
218 201
219struct l2tp_session *l2tp_session_get(const struct net *net, 202struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id);
220 struct l2tp_tunnel *tunnel,
221 u32 session_id);
222struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); 203struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth);
223struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, 204struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net,
224 const char *ifname); 205 const char *ifname);
@@ -229,7 +210,6 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
229int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, 210int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
230 struct l2tp_tunnel_cfg *cfg); 211 struct l2tp_tunnel_cfg *cfg);
231 212
232void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
233void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); 213void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
234struct l2tp_session *l2tp_session_create(int priv_size, 214struct l2tp_session *l2tp_session_create(int priv_size,
235 struct l2tp_tunnel *tunnel, 215 struct l2tp_tunnel *tunnel,
@@ -243,8 +223,7 @@ int l2tp_session_delete(struct l2tp_session *session);
243void l2tp_session_free(struct l2tp_session *session); 223void l2tp_session_free(struct l2tp_session *session);
244void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, 224void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
245 unsigned char *ptr, unsigned char *optr, u16 hdrflags, 225 unsigned char *ptr, unsigned char *optr, u16 hdrflags,
246 int length, int (*payload_hook)(struct sk_buff *skb)); 226 int length);
247int l2tp_session_queue_purge(struct l2tp_session *session);
248int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); 227int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
249void l2tp_session_set_header_len(struct l2tp_session *session, int version); 228void l2tp_session_set_header_len(struct l2tp_session *session, int version);
250 229
@@ -292,6 +271,36 @@ static inline int l2tp_get_l2specific_len(struct l2tp_session *session)
292 } 271 }
293} 272}
294 273
274static inline u32 l2tp_tunnel_dst_mtu(const struct l2tp_tunnel *tunnel)
275{
276 struct dst_entry *dst;
277 u32 mtu;
278
279 dst = sk_dst_get(tunnel->sock);
280 if (!dst)
281 return 0;
282
283 mtu = dst_mtu(dst);
284 dst_release(dst);
285
286 return mtu;
287}
288
289#ifdef CONFIG_XFRM
290static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel)
291{
292 struct sock *sk = tunnel->sock;
293
294 return sk && (rcu_access_pointer(sk->sk_policy[0]) ||
295 rcu_access_pointer(sk->sk_policy[1]));
296}
297#else
298static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel)
299{
300 return false;
301}
302#endif
303
295#define l2tp_printk(ptr, type, func, fmt, ...) \ 304#define l2tp_printk(ptr, type, func, fmt, ...) \
296do { \ 305do { \
297 if (((ptr)->debug) & (type)) \ 306 if (((ptr)->debug) & (type)) \
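The l2tp.h hunk above adds two small inline helpers: l2tp_tunnel_dst_mtu() returns the path MTU cached on the tunnel socket's route (0 when no route is cached), and l2tp_tunnel_uses_xfrm() reports whether either IPsec policy slot on the socket is populated, replacing the open-coded #ifdef CONFIG_XFRM checks in callers. A minimal userspace analogy of those two return conventions, using hypothetical stand-in types rather than the kernel's struct sock and dst_entry:

/* Illustrative only: fake_sock/fake_dst are stand-ins, not kernel types. */
#include <stdbool.h>
#include <stdio.h>

struct fake_dst  { unsigned int mtu; };
struct fake_sock {
    struct fake_dst *dst;   /* cached route, may be NULL */
    void *policy[2];        /* IPsec policy slots, may be NULL */
};

/* Mirrors l2tp_tunnel_dst_mtu(): 0 means "no cached route, caller decides". */
static unsigned int tunnel_dst_mtu(const struct fake_sock *sk)
{
    return sk->dst ? sk->dst->mtu : 0;
}

/* Mirrors l2tp_tunnel_uses_xfrm(): true if either policy slot is set. */
static bool tunnel_uses_xfrm(const struct fake_sock *sk)
{
    return sk->policy[0] || sk->policy[1];
}

int main(void)
{
    struct fake_dst dst = { .mtu = 1450 };
    struct fake_sock sk = { .dst = &dst };

    printf("mtu=%u ipsec=%d\n", tunnel_dst_mtu(&sk), tunnel_uses_xfrm(&sk));
    return 0;
}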
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index e87686f7d63c..9821a1458555 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -177,9 +177,6 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
177 atomic_long_read(&tunnel->stats.rx_packets), 177 atomic_long_read(&tunnel->stats.rx_packets),
178 atomic_long_read(&tunnel->stats.rx_bytes), 178 atomic_long_read(&tunnel->stats.rx_bytes),
179 atomic_long_read(&tunnel->stats.rx_errors)); 179 atomic_long_read(&tunnel->stats.rx_errors));
180
181 if (tunnel->show != NULL)
182 tunnel->show(m, tunnel);
183} 180}
184 181
185static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) 182static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
@@ -194,12 +191,9 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
194 if (session->send_seq || session->recv_seq) 191 if (session->send_seq || session->recv_seq)
195 seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns); 192 seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns);
196 seq_printf(m, " refcnt %d\n", refcount_read(&session->ref_count)); 193 seq_printf(m, " refcnt %d\n", refcount_read(&session->ref_count));
197 seq_printf(m, " config %d/%d/%c/%c/%s/%s %08x %u\n", 194 seq_printf(m, " config 0/0/%c/%c/-/%s %08x %u\n",
198 session->mtu, session->mru,
199 session->recv_seq ? 'R' : '-', 195 session->recv_seq ? 'R' : '-',
200 session->send_seq ? 'S' : '-', 196 session->send_seq ? 'S' : '-',
201 session->data_seq == 1 ? "IPSEQ" :
202 session->data_seq == 2 ? "DATASEQ" : "-",
203 session->lns_mode ? "LNS" : "LAC", 197 session->lns_mode ? "LNS" : "LAC",
204 session->debug, 198 session->debug,
205 jiffies_to_msecs(session->reorder_timeout)); 199 jiffies_to_msecs(session->reorder_timeout));
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 5c366ecfa1cb..8aadc4f3bb9e 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -199,7 +199,6 @@ static void l2tp_eth_delete(struct l2tp_session *session)
199 } 199 }
200} 200}
201 201
202#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
203static void l2tp_eth_show(struct seq_file *m, void *arg) 202static void l2tp_eth_show(struct seq_file *m, void *arg)
204{ 203{
205 struct l2tp_session *session = arg; 204 struct l2tp_session *session = arg;
@@ -219,29 +218,25 @@ static void l2tp_eth_show(struct seq_file *m, void *arg)
219 218
220 dev_put(dev); 219 dev_put(dev);
221} 220}
222#endif
223 221
224static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel, 222static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel,
225 struct l2tp_session *session, 223 struct l2tp_session *session,
226 struct net_device *dev) 224 struct net_device *dev)
227{ 225{
228 unsigned int overhead = 0; 226 unsigned int overhead = 0;
229 struct dst_entry *dst;
230 u32 l3_overhead = 0; 227 u32 l3_overhead = 0;
228 u32 mtu;
231 229
232 /* if the encap is UDP, account for UDP header size */ 230 /* if the encap is UDP, account for UDP header size */
233 if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { 231 if (tunnel->encap == L2TP_ENCAPTYPE_UDP) {
234 overhead += sizeof(struct udphdr); 232 overhead += sizeof(struct udphdr);
235 dev->needed_headroom += sizeof(struct udphdr); 233 dev->needed_headroom += sizeof(struct udphdr);
236 } 234 }
237 if (session->mtu != 0) { 235
238 dev->mtu = session->mtu;
239 dev->needed_headroom += session->hdr_len;
240 return;
241 }
242 lock_sock(tunnel->sock); 236 lock_sock(tunnel->sock);
243 l3_overhead = kernel_sock_ip_overhead(tunnel->sock); 237 l3_overhead = kernel_sock_ip_overhead(tunnel->sock);
244 release_sock(tunnel->sock); 238 release_sock(tunnel->sock);
239
245 if (l3_overhead == 0) { 240 if (l3_overhead == 0) {
246 /* L3 Overhead couldn't be identified, this could be 241 /* L3 Overhead couldn't be identified, this could be
247 * because tunnel->sock was NULL or the socket's 242 * because tunnel->sock was NULL or the socket's
@@ -255,18 +250,12 @@ static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel,
255 */ 250 */
256 overhead += session->hdr_len + ETH_HLEN + l3_overhead; 251 overhead += session->hdr_len + ETH_HLEN + l3_overhead;
257 252
258 /* If PMTU discovery was enabled, use discovered MTU on L2TP device */ 253 mtu = l2tp_tunnel_dst_mtu(tunnel) - overhead;
259 dst = sk_dst_get(tunnel->sock); 254 if (mtu < dev->min_mtu || mtu > dev->max_mtu)
260 if (dst) { 255 dev->mtu = ETH_DATA_LEN - overhead;
261 /* dst_mtu will use PMTU if found, else fallback to intf MTU */ 256 else
262 u32 pmtu = dst_mtu(dst); 257 dev->mtu = mtu;
263 258
264 if (pmtu != 0)
265 dev->mtu = pmtu;
266 dst_release(dst);
267 }
268 session->mtu = dev->mtu - overhead;
269 dev->mtu = session->mtu;
270 dev->needed_headroom += session->hdr_len; 259 dev->needed_headroom += session->hdr_len;
271} 260}
272 261
@@ -314,9 +303,8 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
314 303
315 session->recv_skb = l2tp_eth_dev_recv; 304 session->recv_skb = l2tp_eth_dev_recv;
316 session->session_close = l2tp_eth_delete; 305 session->session_close = l2tp_eth_delete;
317#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) 306 if (IS_ENABLED(CONFIG_L2TP_DEBUGFS))
318 session->show = l2tp_eth_show; 307 session->show = l2tp_eth_show;
319#endif
320 308
321 spriv = l2tp_session_priv(session); 309 spriv = l2tp_session_priv(session);
322 310
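With session->mtu gone, l2tp_eth_adjust_mtu() reduces to arithmetic: subtract the per-packet overhead from the tunnel's path MTU and fall back to the Ethernet default when the result is unusable. A small sketch of that logic, assuming ETH_DATA_LEN is 1500 and using illustrative bounds in place of dev->min_mtu/dev->max_mtu:

#include <stdio.h>

#define ETH_DATA_LEN 1500u  /* default Ethernet payload size */

/* dst_mtu == 0 means "no cached route"; the unsigned underflow then fails
 * the range check and we fall back to ETH_DATA_LEN - overhead, matching
 * the behaviour of the patched l2tp_eth_adjust_mtu(). */
static unsigned int l2tp_eth_mtu(unsigned int dst_mtu, unsigned int overhead,
                                 unsigned int min_mtu, unsigned int max_mtu)
{
    unsigned int mtu = dst_mtu - overhead;

    if (mtu < min_mtu || mtu > max_mtu)
        return ETH_DATA_LEN - overhead;
    return mtu;
}

int main(void)
{
    /* path MTU known (1400), ~50 bytes of UDP/L2TP/Ethernet overhead */
    printf("%u\n", l2tp_eth_mtu(1400, 50, 68, 1500));
    /* no cached route: fall back to the Ethernet default minus overhead */
    printf("%u\n", l2tp_eth_mtu(0, 50, 68, 1500));
    return 0;
}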
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 181073bf6925..35f6f86d4dcc 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -144,7 +144,7 @@ static int l2tp_ip_recv(struct sk_buff *skb)
144 } 144 }
145 145
146 /* Ok, this is a data packet. Lookup the session. */ 146 /* Ok, this is a data packet. Lookup the session. */
147 session = l2tp_session_get(net, NULL, session_id); 147 session = l2tp_session_get(net, session_id);
148 if (!session) 148 if (!session)
149 goto discard; 149 goto discard;
150 150
@@ -165,7 +165,7 @@ static int l2tp_ip_recv(struct sk_buff *skb)
165 print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); 165 print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
166 } 166 }
167 167
168 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook); 168 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len);
169 l2tp_session_dec_refcount(session); 169 l2tp_session_dec_refcount(session);
170 170
171 return 0; 171 return 0;
@@ -613,7 +613,7 @@ static const struct proto_ops l2tp_ip_ops = {
613 .socketpair = sock_no_socketpair, 613 .socketpair = sock_no_socketpair,
614 .accept = sock_no_accept, 614 .accept = sock_no_accept,
615 .getname = l2tp_ip_getname, 615 .getname = l2tp_ip_getname,
616 .poll_mask = datagram_poll_mask, 616 .poll = datagram_poll,
617 .ioctl = inet_ioctl, 617 .ioctl = inet_ioctl,
618 .listen = sock_no_listen, 618 .listen = sock_no_listen,
619 .shutdown = inet_shutdown, 619 .shutdown = inet_shutdown,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 336e4c00abbc..237f1a4a0b0c 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -157,7 +157,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
157 } 157 }
158 158
159 /* Ok, this is a data packet. Lookup the session. */ 159 /* Ok, this is a data packet. Lookup the session. */
160 session = l2tp_session_get(net, NULL, session_id); 160 session = l2tp_session_get(net, session_id);
161 if (!session) 161 if (!session)
162 goto discard; 162 goto discard;
163 163
@@ -178,8 +178,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
178 print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); 178 print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
179 } 179 }
180 180
181 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, 181 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len);
182 tunnel->recv_payload_hook);
183 l2tp_session_dec_refcount(session); 182 l2tp_session_dec_refcount(session);
184 183
185 return 0; 184 return 0;
@@ -500,7 +499,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
500 struct ip6_flowlabel *flowlabel = NULL; 499 struct ip6_flowlabel *flowlabel = NULL;
501 struct dst_entry *dst = NULL; 500 struct dst_entry *dst = NULL;
502 struct flowi6 fl6; 501 struct flowi6 fl6;
503 struct sockcm_cookie sockc_unused = {0};
504 struct ipcm6_cookie ipc6; 502 struct ipcm6_cookie ipc6;
505 int addr_len = msg->msg_namelen; 503 int addr_len = msg->msg_namelen;
506 int transhdrlen = 4; /* zero session-id */ 504 int transhdrlen = 4; /* zero session-id */
@@ -525,9 +523,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
525 fl6.flowi6_mark = sk->sk_mark; 523 fl6.flowi6_mark = sk->sk_mark;
526 fl6.flowi6_uid = sk->sk_uid; 524 fl6.flowi6_uid = sk->sk_uid;
527 525
528 ipc6.hlimit = -1; 526 ipcm6_init(&ipc6);
529 ipc6.tclass = -1;
530 ipc6.dontfrag = -1;
531 527
532 if (lsa) { 528 if (lsa) {
533 if (addr_len < SIN6_LEN_RFC2133) 529 if (addr_len < SIN6_LEN_RFC2133)
@@ -575,8 +571,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
575 opt->tot_len = sizeof(struct ipv6_txoptions); 571 opt->tot_len = sizeof(struct ipv6_txoptions);
576 ipc6.opt = opt; 572 ipc6.opt = opt;
577 573
578 err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, 574 err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
579 &sockc_unused);
580 if (err < 0) { 575 if (err < 0) {
581 fl6_sock_release(flowlabel); 576 fl6_sock_release(flowlabel);
582 return err; 577 return err;
@@ -641,7 +636,7 @@ back_from_confirm:
641 err = ip6_append_data(sk, ip_generic_getfrag, msg, 636 err = ip6_append_data(sk, ip_generic_getfrag, msg,
642 ulen, transhdrlen, &ipc6, 637 ulen, transhdrlen, &ipc6,
643 &fl6, (struct rt6_info *)dst, 638 &fl6, (struct rt6_info *)dst,
644 msg->msg_flags, &sockc_unused); 639 msg->msg_flags);
645 if (err) 640 if (err)
646 ip6_flush_pending_frames(sk); 641 ip6_flush_pending_frames(sk);
647 else if (!(msg->msg_flags & MSG_MORE)) 642 else if (!(msg->msg_flags & MSG_MORE))
@@ -754,7 +749,7 @@ static const struct proto_ops l2tp_ip6_ops = {
754 .socketpair = sock_no_socketpair, 749 .socketpair = sock_no_socketpair,
755 .accept = sock_no_accept, 750 .accept = sock_no_accept,
756 .getname = l2tp_ip6_getname, 751 .getname = l2tp_ip6_getname,
757 .poll_mask = datagram_poll_mask, 752 .poll = datagram_poll,
758 .ioctl = inet6_ioctl, 753 .ioctl = inet6_ioctl,
759 .listen = sock_no_listen, 754 .listen = sock_no_listen,
760 .shutdown = inet_shutdown, 755 .shutdown = inet_shutdown,
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 5b9900889e31..edbd5d1fbcde 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -66,7 +66,7 @@ static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info)
66 session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); 66 session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]);
67 tunnel = l2tp_tunnel_get(net, tunnel_id); 67 tunnel = l2tp_tunnel_get(net, tunnel_id);
68 if (tunnel) { 68 if (tunnel) {
69 session = l2tp_session_get(net, tunnel, session_id); 69 session = l2tp_tunnel_get_session(tunnel, session_id);
70 l2tp_tunnel_dec_refcount(tunnel); 70 l2tp_tunnel_dec_refcount(tunnel);
71 } 71 }
72 } 72 }
@@ -560,9 +560,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
560 } 560 }
561 561
562 if (tunnel->version > 2) { 562 if (tunnel->version > 2) {
563 if (info->attrs[L2TP_ATTR_DATA_SEQ])
564 cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
565
566 if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) { 563 if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) {
567 cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]); 564 cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]);
568 if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT && 565 if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT &&
@@ -594,9 +591,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
594 } 591 }
595 if (info->attrs[L2TP_ATTR_IFNAME]) 592 if (info->attrs[L2TP_ATTR_IFNAME])
596 cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); 593 cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
597
598 if (info->attrs[L2TP_ATTR_VLAN_ID])
599 cfg.vlan_id = nla_get_u16(info->attrs[L2TP_ATTR_VLAN_ID]);
600 } 594 }
601 595
602 if (info->attrs[L2TP_ATTR_DEBUG]) 596 if (info->attrs[L2TP_ATTR_DEBUG])
@@ -614,12 +608,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
614 if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) 608 if (info->attrs[L2TP_ATTR_RECV_TIMEOUT])
615 cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); 609 cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
616 610
617 if (info->attrs[L2TP_ATTR_MTU])
618 cfg.mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
619
620 if (info->attrs[L2TP_ATTR_MRU])
621 cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
622
623#ifdef CONFIG_MODULES 611#ifdef CONFIG_MODULES
624 if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) { 612 if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) {
625 genl_unlock(); 613 genl_unlock();
@@ -639,7 +627,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
639 &cfg); 627 &cfg);
640 628
641 if (ret >= 0) { 629 if (ret >= 0) {
642 session = l2tp_session_get(net, tunnel, session_id); 630 session = l2tp_tunnel_get_session(tunnel, session_id);
643 if (session) { 631 if (session) {
644 ret = l2tp_session_notify(&l2tp_nl_family, info, session, 632 ret = l2tp_session_notify(&l2tp_nl_family, info, session,
645 L2TP_CMD_SESSION_CREATE); 633 L2TP_CMD_SESSION_CREATE);
@@ -693,9 +681,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
693 if (info->attrs[L2TP_ATTR_DEBUG]) 681 if (info->attrs[L2TP_ATTR_DEBUG])
694 session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); 682 session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
695 683
696 if (info->attrs[L2TP_ATTR_DATA_SEQ])
697 session->data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
698
699 if (info->attrs[L2TP_ATTR_RECV_SEQ]) 684 if (info->attrs[L2TP_ATTR_RECV_SEQ])
700 session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); 685 session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
701 686
@@ -710,12 +695,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
710 if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) 695 if (info->attrs[L2TP_ATTR_RECV_TIMEOUT])
711 session->reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); 696 session->reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
712 697
713 if (info->attrs[L2TP_ATTR_MTU])
714 session->mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
715
716 if (info->attrs[L2TP_ATTR_MRU])
717 session->mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
718
719 ret = l2tp_session_notify(&l2tp_nl_family, info, 698 ret = l2tp_session_notify(&l2tp_nl_family, info,
720 session, L2TP_CMD_SESSION_MODIFY); 699 session, L2TP_CMD_SESSION_MODIFY);
721 700
@@ -731,9 +710,6 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
731 void *hdr; 710 void *hdr;
732 struct nlattr *nest; 711 struct nlattr *nest;
733 struct l2tp_tunnel *tunnel = session->tunnel; 712 struct l2tp_tunnel *tunnel = session->tunnel;
734 struct sock *sk = NULL;
735
736 sk = tunnel->sock;
737 713
738 hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd); 714 hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd);
739 if (!hdr) 715 if (!hdr)
@@ -745,10 +721,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
745 nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, 721 nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID,
746 session->peer_session_id) || 722 session->peer_session_id) ||
747 nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) || 723 nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) ||
748 nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype) || 724 nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype))
749 nla_put_u16(skb, L2TP_ATTR_MTU, session->mtu) ||
750 (session->mru &&
751 nla_put_u16(skb, L2TP_ATTR_MRU, session->mru)))
752 goto nla_put_failure; 725 goto nla_put_failure;
753 726
754 if ((session->ifname[0] && 727 if ((session->ifname[0] &&
@@ -762,10 +735,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
762 nla_put_u8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq) || 735 nla_put_u8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq) ||
763 nla_put_u8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq) || 736 nla_put_u8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq) ||
764 nla_put_u8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode) || 737 nla_put_u8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode) ||
765#ifdef CONFIG_XFRM 738 (l2tp_tunnel_uses_xfrm(tunnel) &&
766 (((sk) && (sk->sk_policy[0] || sk->sk_policy[1])) &&
767 nla_put_u8(skb, L2TP_ATTR_USING_IPSEC, 1)) || 739 nla_put_u8(skb, L2TP_ATTR_USING_IPSEC, 1)) ||
768#endif
769 (session->reorder_timeout && 740 (session->reorder_timeout &&
770 nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT, 741 nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT,
771 session->reorder_timeout, L2TP_ATTR_PAD))) 742 session->reorder_timeout, L2TP_ATTR_PAD)))
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 55188382845c..04d9946dcdba 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -93,10 +93,8 @@
93#include <linux/nsproxy.h> 93#include <linux/nsproxy.h>
94#include <net/net_namespace.h> 94#include <net/net_namespace.h>
95#include <net/netns/generic.h> 95#include <net/netns/generic.h>
96#include <net/dst.h>
97#include <net/ip.h> 96#include <net/ip.h>
98#include <net/udp.h> 97#include <net/udp.h>
99#include <net/xfrm.h>
100#include <net/inet_common.h> 98#include <net/inet_common.h>
101 99
102#include <asm/byteorder.h> 100#include <asm/byteorder.h>
@@ -127,8 +125,6 @@ struct pppol2tp_session {
127 * PPPoX socket */ 125 * PPPoX socket */
128 struct sock *__sk; /* Copy of .sk, for cleanup */ 126 struct sock *__sk; /* Copy of .sk, for cleanup */
129 struct rcu_head rcu; /* For asynchronous release */ 127 struct rcu_head rcu; /* For asynchronous release */
130 int flags; /* accessed by PPPIOCGFLAGS.
131 * Unused. */
132}; 128};
133 129
134static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb); 130static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb);
@@ -183,25 +179,6 @@ out:
183 * Receive data handling 179 * Receive data handling
184 *****************************************************************************/ 180 *****************************************************************************/
185 181
186static int pppol2tp_recv_payload_hook(struct sk_buff *skb)
187{
188 /* Skip PPP header, if present. In testing, Microsoft L2TP clients
189 * don't send the PPP header (PPP header compression enabled), but
190 * other clients can include the header. So we cope with both cases
191 * here. The PPP header is always FF03 when using L2TP.
192 *
193 * Note that skb->data[] isn't dereferenced from a u16 ptr here since
194 * the field may be unaligned.
195 */
196 if (!pskb_may_pull(skb, 2))
197 return 1;
198
199 if ((skb->data[0] == PPP_ALLSTATIONS) && (skb->data[1] == PPP_UI))
200 skb_pull(skb, 2);
201
202 return 0;
203}
204
205/* Receive message. This is the recvmsg for the PPPoL2TP socket. 182/* Receive message. This is the recvmsg for the PPPoL2TP socket.
206 */ 183 */
207static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg, 184static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg,
@@ -248,6 +225,17 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
248 if (sk == NULL) 225 if (sk == NULL)
249 goto no_sock; 226 goto no_sock;
250 227
228 /* If the first two bytes are 0xFF03, consider that it is the PPP's
229 * Address and Control fields and skip them. The L2TP module has always
230 * worked this way, although, in theory, the use of these fields should
 231 * be negotiated and handled at the PPP layer. These fields are
232 * constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered
233 * Information command with Poll/Final bit set to zero (RFC 1662).
234 */
235 if (pskb_may_pull(skb, 2) && skb->data[0] == PPP_ALLSTATIONS &&
236 skb->data[1] == PPP_UI)
237 skb_pull(skb, 2);
238
251 if (sk->sk_state & PPPOX_BOUND) { 239 if (sk->sk_state & PPPOX_BOUND) {
252 struct pppox_sock *po; 240 struct pppox_sock *po;
253 241
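The Address/Control skip that previously lived in the tunnel's recv_payload_hook is now done inline in pppol2tp_recv(), as shown above. The test itself is two bytes of lookahead; a standalone sketch of the same check on a plain buffer (PPP_ALLSTATIONS is 0xff and PPP_UI is 0x03, per RFC 1662):

#include <stddef.h>
#include <stdio.h>

#define PPP_ALLSTATIONS 0xff  /* All-Stations address */
#define PPP_UI          0x03  /* Unnumbered Information */

/* Returns how many bytes to strip (0 or 2), mimicking the skb_pull()
 * performed when the optional PPP Address/Control fields are present. */
static size_t ppp_skip_addr_ctrl(const unsigned char *buf, size_t len)
{
    if (len >= 2 && buf[0] == PPP_ALLSTATIONS && buf[1] == PPP_UI)
        return 2;
    return 0;
}

int main(void)
{
    const unsigned char with_hdr[]    = { 0xff, 0x03, 0xc0, 0x21 };
    const unsigned char without_hdr[] = { 0xc0, 0x21 };

    printf("%zu %zu\n",
           ppp_skip_addr_ctrl(with_hdr, sizeof(with_hdr)),
           ppp_skip_addr_ctrl(without_hdr, sizeof(without_hdr)));
    return 0;
}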
@@ -424,12 +412,6 @@ static void pppol2tp_put_sk(struct rcu_head *head)
424 sock_put(ps->__sk); 412 sock_put(ps->__sk);
425} 413}
426 414
427/* Called by l2tp_core when a session socket is being closed.
428 */
429static void pppol2tp_session_close(struct l2tp_session *session)
430{
431}
432
433/* Really kill the session socket. (Called from sock_put() if 415/* Really kill the session socket. (Called from sock_put() if
434 * refcnt == 0.) 416 * refcnt == 0.)
435 */ 417 */
@@ -551,7 +533,6 @@ out:
551 return error; 533 return error;
552} 534}
553 535
554#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
555static void pppol2tp_show(struct seq_file *m, void *arg) 536static void pppol2tp_show(struct seq_file *m, void *arg)
556{ 537{
557 struct l2tp_session *session = arg; 538 struct l2tp_session *session = arg;
@@ -565,34 +546,118 @@ static void pppol2tp_show(struct seq_file *m, void *arg)
565 sock_put(sk); 546 sock_put(sk);
566 } 547 }
567} 548}
568#endif
569 549
570static void pppol2tp_session_init(struct l2tp_session *session) 550static void pppol2tp_session_init(struct l2tp_session *session)
571{ 551{
572 struct pppol2tp_session *ps; 552 struct pppol2tp_session *ps;
573 struct dst_entry *dst;
574 553
575 session->recv_skb = pppol2tp_recv; 554 session->recv_skb = pppol2tp_recv;
576 session->session_close = pppol2tp_session_close; 555 if (IS_ENABLED(CONFIG_L2TP_DEBUGFS))
577#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) 556 session->show = pppol2tp_show;
578 session->show = pppol2tp_show;
579#endif
580 557
581 ps = l2tp_session_priv(session); 558 ps = l2tp_session_priv(session);
582 mutex_init(&ps->sk_lock); 559 mutex_init(&ps->sk_lock);
583 ps->owner = current->pid; 560 ps->owner = current->pid;
561}
562
563struct l2tp_connect_info {
564 u8 version;
565 int fd;
566 u32 tunnel_id;
567 u32 peer_tunnel_id;
568 u32 session_id;
569 u32 peer_session_id;
570};
571
572static int pppol2tp_sockaddr_get_info(const void *sa, int sa_len,
573 struct l2tp_connect_info *info)
574{
575 switch (sa_len) {
576 case sizeof(struct sockaddr_pppol2tp):
577 {
578 const struct sockaddr_pppol2tp *sa_v2in4 = sa;
584 579
585 /* If PMTU discovery was enabled, use the MTU that was discovered */ 580 if (sa_v2in4->sa_protocol != PX_PROTO_OL2TP)
586 dst = sk_dst_get(session->tunnel->sock); 581 return -EINVAL;
587 if (dst) {
588 u32 pmtu = dst_mtu(dst);
589 582
590 if (pmtu) { 583 info->version = 2;
591 session->mtu = pmtu - PPPOL2TP_HEADER_OVERHEAD; 584 info->fd = sa_v2in4->pppol2tp.fd;
592 session->mru = pmtu - PPPOL2TP_HEADER_OVERHEAD; 585 info->tunnel_id = sa_v2in4->pppol2tp.s_tunnel;
593 } 586 info->peer_tunnel_id = sa_v2in4->pppol2tp.d_tunnel;
594 dst_release(dst); 587 info->session_id = sa_v2in4->pppol2tp.s_session;
588 info->peer_session_id = sa_v2in4->pppol2tp.d_session;
589
590 break;
595 } 591 }
592 case sizeof(struct sockaddr_pppol2tpv3):
593 {
594 const struct sockaddr_pppol2tpv3 *sa_v3in4 = sa;
595
596 if (sa_v3in4->sa_protocol != PX_PROTO_OL2TP)
597 return -EINVAL;
598
599 info->version = 3;
600 info->fd = sa_v3in4->pppol2tp.fd;
601 info->tunnel_id = sa_v3in4->pppol2tp.s_tunnel;
602 info->peer_tunnel_id = sa_v3in4->pppol2tp.d_tunnel;
603 info->session_id = sa_v3in4->pppol2tp.s_session;
604 info->peer_session_id = sa_v3in4->pppol2tp.d_session;
605
606 break;
607 }
608 case sizeof(struct sockaddr_pppol2tpin6):
609 {
610 const struct sockaddr_pppol2tpin6 *sa_v2in6 = sa;
611
612 if (sa_v2in6->sa_protocol != PX_PROTO_OL2TP)
613 return -EINVAL;
614
615 info->version = 2;
616 info->fd = sa_v2in6->pppol2tp.fd;
617 info->tunnel_id = sa_v2in6->pppol2tp.s_tunnel;
618 info->peer_tunnel_id = sa_v2in6->pppol2tp.d_tunnel;
619 info->session_id = sa_v2in6->pppol2tp.s_session;
620 info->peer_session_id = sa_v2in6->pppol2tp.d_session;
621
622 break;
623 }
624 case sizeof(struct sockaddr_pppol2tpv3in6):
625 {
626 const struct sockaddr_pppol2tpv3in6 *sa_v3in6 = sa;
627
628 if (sa_v3in6->sa_protocol != PX_PROTO_OL2TP)
629 return -EINVAL;
630
631 info->version = 3;
632 info->fd = sa_v3in6->pppol2tp.fd;
633 info->tunnel_id = sa_v3in6->pppol2tp.s_tunnel;
634 info->peer_tunnel_id = sa_v3in6->pppol2tp.d_tunnel;
635 info->session_id = sa_v3in6->pppol2tp.s_session;
636 info->peer_session_id = sa_v3in6->pppol2tp.d_session;
637
638 break;
639 }
640 default:
641 return -EINVAL;
642 }
643
644 return 0;
645}
646
647/* Rough estimation of the maximum payload size a tunnel can transmit without
648 * fragmenting at the lower IP layer. Assumes L2TPv2 with sequence
649 * numbers and no IP option. Not quite accurate, but the result is mostly
650 * unused anyway.
651 */
652static int pppol2tp_tunnel_mtu(const struct l2tp_tunnel *tunnel)
653{
654 int mtu;
655
656 mtu = l2tp_tunnel_dst_mtu(tunnel);
657 if (mtu <= PPPOL2TP_HEADER_OVERHEAD)
658 return 1500 - PPPOL2TP_HEADER_OVERHEAD;
659
660 return mtu - PPPOL2TP_HEADER_OVERHEAD;
596} 661}
597 662
598/* connect() handler. Attach a PPPoX socket to a tunnel UDP socket 663/* connect() handler. Attach a PPPoX socket to a tunnel UDP socket
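pppol2tp_tunnel_mtu() above is intentionally rough: it takes the tunnel's path MTU, subtracts the PPP-over-L2TP header overhead, and falls back to a 1500-byte base when no usable path MTU is cached, which matches the old hard-coded default. The same arithmetic as a standalone sketch, assuming the 40-byte PPPOL2TP_HEADER_OVERHEAD value defined in l2tp_ppp.c:

#include <stdio.h>

#define PPPOL2TP_HEADER_OVERHEAD 40  /* assumed value from l2tp_ppp.c */

/* dst_mtu of 0 (or anything not larger than the overhead) means the path
 * MTU is unknown or useless, so base the estimate on 1500 instead. */
static int pppol2tp_mtu(int dst_mtu)
{
    if (dst_mtu <= PPPOL2TP_HEADER_OVERHEAD)
        return 1500 - PPPOL2TP_HEADER_OVERHEAD;
    return dst_mtu - PPPOL2TP_HEADER_OVERHEAD;
}

int main(void)
{
    printf("%d %d\n", pppol2tp_mtu(1500), pppol2tp_mtu(0));
    return 0;
}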
@@ -601,34 +666,23 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
601 int sockaddr_len, int flags) 666 int sockaddr_len, int flags)
602{ 667{
603 struct sock *sk = sock->sk; 668 struct sock *sk = sock->sk;
604 struct sockaddr_pppol2tp *sp = (struct sockaddr_pppol2tp *) uservaddr;
605 struct pppox_sock *po = pppox_sk(sk); 669 struct pppox_sock *po = pppox_sk(sk);
606 struct l2tp_session *session = NULL; 670 struct l2tp_session *session = NULL;
671 struct l2tp_connect_info info;
607 struct l2tp_tunnel *tunnel; 672 struct l2tp_tunnel *tunnel;
608 struct pppol2tp_session *ps; 673 struct pppol2tp_session *ps;
609 struct l2tp_session_cfg cfg = { 0, }; 674 struct l2tp_session_cfg cfg = { 0, };
610 int error = 0;
611 u32 tunnel_id, peer_tunnel_id;
612 u32 session_id, peer_session_id;
613 bool drop_refcnt = false; 675 bool drop_refcnt = false;
614 bool drop_tunnel = false; 676 bool drop_tunnel = false;
615 bool new_session = false; 677 bool new_session = false;
616 bool new_tunnel = false; 678 bool new_tunnel = false;
617 int ver = 2; 679 int error;
618 int fd;
619
620 lock_sock(sk);
621
622 error = -EINVAL;
623 680
624 if (sockaddr_len != sizeof(struct sockaddr_pppol2tp) && 681 error = pppol2tp_sockaddr_get_info(uservaddr, sockaddr_len, &info);
625 sockaddr_len != sizeof(struct sockaddr_pppol2tpv3) && 682 if (error < 0)
626 sockaddr_len != sizeof(struct sockaddr_pppol2tpin6) && 683 return error;
627 sockaddr_len != sizeof(struct sockaddr_pppol2tpv3in6))
628 goto end;
629 684
630 if (sp->sa_protocol != PX_PROTO_OL2TP) 685 lock_sock(sk);
631 goto end;
632 686
633 /* Check for already bound sockets */ 687 /* Check for already bound sockets */
634 error = -EBUSY; 688 error = -EBUSY;
@@ -640,56 +694,12 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
640 if (sk->sk_user_data) 694 if (sk->sk_user_data)
641 goto end; /* socket is already attached */ 695 goto end; /* socket is already attached */
642 696
643 /* Get params from socket address. Handle L2TPv2 and L2TPv3.
644 * This is nasty because there are different sockaddr_pppol2tp
645 * structs for L2TPv2, L2TPv3, over IPv4 and IPv6. We use
646 * the sockaddr size to determine which structure the caller
647 * is using.
648 */
649 peer_tunnel_id = 0;
650 if (sockaddr_len == sizeof(struct sockaddr_pppol2tp)) {
651 fd = sp->pppol2tp.fd;
652 tunnel_id = sp->pppol2tp.s_tunnel;
653 peer_tunnel_id = sp->pppol2tp.d_tunnel;
654 session_id = sp->pppol2tp.s_session;
655 peer_session_id = sp->pppol2tp.d_session;
656 } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3)) {
657 struct sockaddr_pppol2tpv3 *sp3 =
658 (struct sockaddr_pppol2tpv3 *) sp;
659 ver = 3;
660 fd = sp3->pppol2tp.fd;
661 tunnel_id = sp3->pppol2tp.s_tunnel;
662 peer_tunnel_id = sp3->pppol2tp.d_tunnel;
663 session_id = sp3->pppol2tp.s_session;
664 peer_session_id = sp3->pppol2tp.d_session;
665 } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpin6)) {
666 struct sockaddr_pppol2tpin6 *sp6 =
667 (struct sockaddr_pppol2tpin6 *) sp;
668 fd = sp6->pppol2tp.fd;
669 tunnel_id = sp6->pppol2tp.s_tunnel;
670 peer_tunnel_id = sp6->pppol2tp.d_tunnel;
671 session_id = sp6->pppol2tp.s_session;
672 peer_session_id = sp6->pppol2tp.d_session;
673 } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3in6)) {
674 struct sockaddr_pppol2tpv3in6 *sp6 =
675 (struct sockaddr_pppol2tpv3in6 *) sp;
676 ver = 3;
677 fd = sp6->pppol2tp.fd;
678 tunnel_id = sp6->pppol2tp.s_tunnel;
679 peer_tunnel_id = sp6->pppol2tp.d_tunnel;
680 session_id = sp6->pppol2tp.s_session;
681 peer_session_id = sp6->pppol2tp.d_session;
682 } else {
683 error = -EINVAL;
684 goto end; /* bad socket address */
685 }
686
687 /* Don't bind if tunnel_id is 0 */ 697 /* Don't bind if tunnel_id is 0 */
688 error = -EINVAL; 698 error = -EINVAL;
689 if (tunnel_id == 0) 699 if (!info.tunnel_id)
690 goto end; 700 goto end;
691 701
692 tunnel = l2tp_tunnel_get(sock_net(sk), tunnel_id); 702 tunnel = l2tp_tunnel_get(sock_net(sk), info.tunnel_id);
693 if (tunnel) 703 if (tunnel)
694 drop_tunnel = true; 704 drop_tunnel = true;
695 705
@@ -697,7 +707,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
697 * peer_session_id is 0. Otherwise look up tunnel using supplied 707 * peer_session_id is 0. Otherwise look up tunnel using supplied
698 * tunnel id. 708 * tunnel id.
699 */ 709 */
700 if ((session_id == 0) && (peer_session_id == 0)) { 710 if (!info.session_id && !info.peer_session_id) {
701 if (tunnel == NULL) { 711 if (tunnel == NULL) {
702 struct l2tp_tunnel_cfg tcfg = { 712 struct l2tp_tunnel_cfg tcfg = {
703 .encap = L2TP_ENCAPTYPE_UDP, 713 .encap = L2TP_ENCAPTYPE_UDP,
@@ -707,12 +717,16 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
707 /* Prevent l2tp_tunnel_register() from trying to set up 717 /* Prevent l2tp_tunnel_register() from trying to set up
708 * a kernel socket. 718 * a kernel socket.
709 */ 719 */
710 if (fd < 0) { 720 if (info.fd < 0) {
711 error = -EBADF; 721 error = -EBADF;
712 goto end; 722 goto end;
713 } 723 }
714 724
715 error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel); 725 error = l2tp_tunnel_create(sock_net(sk), info.fd,
726 info.version,
727 info.tunnel_id,
728 info.peer_tunnel_id, &tcfg,
729 &tunnel);
716 if (error < 0) 730 if (error < 0)
717 goto end; 731 goto end;
718 732
@@ -737,13 +751,10 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
737 goto end; 751 goto end;
738 } 752 }
739 753
740 if (tunnel->recv_payload_hook == NULL)
741 tunnel->recv_payload_hook = pppol2tp_recv_payload_hook;
742
743 if (tunnel->peer_tunnel_id == 0) 754 if (tunnel->peer_tunnel_id == 0)
744 tunnel->peer_tunnel_id = peer_tunnel_id; 755 tunnel->peer_tunnel_id = info.peer_tunnel_id;
745 756
746 session = l2tp_session_get(sock_net(sk), tunnel, session_id); 757 session = l2tp_tunnel_get_session(tunnel, info.session_id);
747 if (session) { 758 if (session) {
748 drop_refcnt = true; 759 drop_refcnt = true;
749 760
@@ -766,14 +777,11 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
766 goto end; 777 goto end;
767 } 778 }
768 } else { 779 } else {
769 /* Default MTU must allow space for UDP/L2TP/PPP headers */
770 cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
771 cfg.mru = cfg.mtu;
772 cfg.pw_type = L2TP_PWTYPE_PPP; 780 cfg.pw_type = L2TP_PWTYPE_PPP;
773 781
774 session = l2tp_session_create(sizeof(struct pppol2tp_session), 782 session = l2tp_session_create(sizeof(struct pppol2tp_session),
775 tunnel, session_id, 783 tunnel, info.session_id,
776 peer_session_id, &cfg); 784 info.peer_session_id, &cfg);
777 if (IS_ERR(session)) { 785 if (IS_ERR(session)) {
778 error = PTR_ERR(session); 786 error = PTR_ERR(session);
779 goto end; 787 goto end;
@@ -813,7 +821,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
813 821
814 po->chan.private = sk; 822 po->chan.private = sk;
815 po->chan.ops = &pppol2tp_chan_ops; 823 po->chan.ops = &pppol2tp_chan_ops;
816 po->chan.mtu = session->mtu; 824 po->chan.mtu = pppol2tp_tunnel_mtu(tunnel);
817 825
818 error = ppp_register_net_channel(sock_net(sk), &po->chan); 826 error = ppp_register_net_channel(sock_net(sk), &po->chan);
819 if (error) { 827 if (error) {
@@ -869,12 +877,6 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel,
869 goto err; 877 goto err;
870 } 878 }
871 879
872 /* Default MTU values. */
873 if (cfg->mtu == 0)
874 cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
875 if (cfg->mru == 0)
876 cfg->mru = cfg->mtu;
877
878 /* Allocate and initialize a new session context. */ 880 /* Allocate and initialize a new session context. */
879 session = l2tp_session_create(sizeof(struct pppol2tp_session), 881 session = l2tp_session_create(sizeof(struct pppol2tp_session),
880 tunnel, session_id, 882 tunnel, session_id,
@@ -1021,8 +1023,10 @@ end:
1021 ****************************************************************************/ 1023 ****************************************************************************/
1022 1024
1023static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest, 1025static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest,
1024 struct l2tp_stats *stats) 1026 const struct l2tp_stats *stats)
1025{ 1027{
1028 memset(dest, 0, sizeof(*dest));
1029
1026 dest->tx_packets = atomic_long_read(&stats->tx_packets); 1030 dest->tx_packets = atomic_long_read(&stats->tx_packets);
1027 dest->tx_bytes = atomic_long_read(&stats->tx_bytes); 1031 dest->tx_bytes = atomic_long_read(&stats->tx_bytes);
1028 dest->tx_errors = atomic_long_read(&stats->tx_errors); 1032 dest->tx_errors = atomic_long_read(&stats->tx_errors);
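The new memset() in pppol2tp_copy_stats() matters because the structure is later copied wholesale to userspace via PPPIOCGL2TPSTATS; any byte the function does not set explicitly, padding included, must not carry leftover kernel stack contents. The pattern in isolation, with a hypothetical reduced stats layout:

#include <string.h>
#include <stdio.h>

/* hypothetical reduced stats layout, for illustration only */
struct ioc_stats {
    unsigned long tx_packets;
    unsigned long rx_packets;
    unsigned long reserved[4];   /* fields the copy never touches */
};

static void copy_stats(struct ioc_stats *dest, unsigned long tx, unsigned long rx)
{
    memset(dest, 0, sizeof(*dest));  /* no uninitialised bytes leak out */
    dest->tx_packets = tx;
    dest->rx_packets = rx;
}

int main(void)
{
    struct ioc_stats s;

    copy_stats(&s, 10, 20);
    printf("%lu %lu %lu\n", s.tx_packets, s.rx_packets, s.reserved[0]);
    return 0;
}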
@@ -1033,251 +1037,107 @@ static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest,
1033 dest->rx_errors = atomic_long_read(&stats->rx_errors); 1037 dest->rx_errors = atomic_long_read(&stats->rx_errors);
1034} 1038}
1035 1039
1036/* Session ioctl helper. 1040static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats,
1037 */ 1041 struct l2tp_tunnel *tunnel)
1038static int pppol2tp_session_ioctl(struct l2tp_session *session,
1039 unsigned int cmd, unsigned long arg)
1040{ 1042{
1041 struct ifreq ifr; 1043 struct l2tp_session *session;
1042 int err = 0;
1043 struct sock *sk;
1044 int val = (int) arg;
1045 struct pppol2tp_session *ps = l2tp_session_priv(session);
1046 struct l2tp_tunnel *tunnel = session->tunnel;
1047 struct pppol2tp_ioc_stats stats;
1048 1044
1049 l2tp_dbg(session, L2TP_MSG_CONTROL, 1045 if (!stats->session_id) {
1050 "%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n", 1046 pppol2tp_copy_stats(stats, &tunnel->stats);
1051 session->name, cmd, arg); 1047 return 0;
1048 }
1052 1049
1053 sk = pppol2tp_session_get_sock(session); 1050 /* If session_id is set, search the corresponding session in the
1054 if (!sk) 1051 * context of this tunnel and record the session's statistics.
1052 */
1053 session = l2tp_tunnel_get_session(tunnel, stats->session_id);
1054 if (!session)
1055 return -EBADR; 1055 return -EBADR;
1056 1056
1057 switch (cmd) { 1057 if (session->pwtype != L2TP_PWTYPE_PPP) {
1058 case SIOCGIFMTU: 1058 l2tp_session_dec_refcount(session);
1059 err = -ENXIO; 1059 return -EBADR;
1060 if (!(sk->sk_state & PPPOX_CONNECTED)) 1060 }
1061 break;
1062
1063 err = -EFAULT;
1064 if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq)))
1065 break;
1066 ifr.ifr_mtu = session->mtu;
1067 if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq)))
1068 break;
1069
1070 l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mtu=%d\n",
1071 session->name, session->mtu);
1072 err = 0;
1073 break;
1074
1075 case SIOCSIFMTU:
1076 err = -ENXIO;
1077 if (!(sk->sk_state & PPPOX_CONNECTED))
1078 break;
1079 1061
1080 err = -EFAULT; 1062 pppol2tp_copy_stats(stats, &session->stats);
1081 if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq))) 1063 l2tp_session_dec_refcount(session);
1082 break;
1083 1064
1084 session->mtu = ifr.ifr_mtu; 1065 return 0;
1066}
1085 1067
1086 l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mtu=%d\n", 1068static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
1087 session->name, session->mtu); 1069 unsigned long arg)
1088 err = 0; 1070{
1089 break; 1071 struct pppol2tp_ioc_stats stats;
1072 struct l2tp_session *session;
1073 int val;
1090 1074
1075 switch (cmd) {
1091 case PPPIOCGMRU: 1076 case PPPIOCGMRU:
1092 err = -ENXIO; 1077 case PPPIOCGFLAGS:
1093 if (!(sk->sk_state & PPPOX_CONNECTED)) 1078 session = sock->sk->sk_user_data;
1094 break; 1079 if (!session)
1080 return -ENOTCONN;
1095 1081
1096 err = -EFAULT; 1082 /* Not defined for tunnels */
1097 if (put_user(session->mru, (int __user *) arg)) 1083 if (!session->session_id && !session->peer_session_id)
1098 break; 1084 return -ENOSYS;
1099 1085
1100 l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mru=%d\n", 1086 if (put_user(0, (int __user *)arg))
1101 session->name, session->mru); 1087 return -EFAULT;
1102 err = 0;
1103 break; 1088 break;
1104 1089
1105 case PPPIOCSMRU: 1090 case PPPIOCSMRU:
1106 err = -ENXIO;
1107 if (!(sk->sk_state & PPPOX_CONNECTED))
1108 break;
1109
1110 err = -EFAULT;
1111 if (get_user(val, (int __user *) arg))
1112 break;
1113
1114 session->mru = val;
1115 l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mru=%d\n",
1116 session->name, session->mru);
1117 err = 0;
1118 break;
1119
1120 case PPPIOCGFLAGS:
1121 err = -EFAULT;
1122 if (put_user(ps->flags, (int __user *) arg))
1123 break;
1124
1125 l2tp_info(session, L2TP_MSG_CONTROL, "%s: get flags=%d\n",
1126 session->name, ps->flags);
1127 err = 0;
1128 break;
1129
1130 case PPPIOCSFLAGS: 1091 case PPPIOCSFLAGS:
1131 err = -EFAULT; 1092 session = sock->sk->sk_user_data;
1132 if (get_user(val, (int __user *) arg)) 1093 if (!session)
1133 break; 1094 return -ENOTCONN;
1134 ps->flags = val;
1135 l2tp_info(session, L2TP_MSG_CONTROL, "%s: set flags=%d\n",
1136 session->name, ps->flags);
1137 err = 0;
1138 break;
1139
1140 case PPPIOCGL2TPSTATS:
1141 err = -ENXIO;
1142 if (!(sk->sk_state & PPPOX_CONNECTED))
1143 break;
1144 1095
1145 memset(&stats, 0, sizeof(stats)); 1096 /* Not defined for tunnels */
1146 stats.tunnel_id = tunnel->tunnel_id; 1097 if (!session->session_id && !session->peer_session_id)
1147 stats.session_id = session->session_id; 1098 return -ENOSYS;
1148 pppol2tp_copy_stats(&stats, &session->stats);
1149 if (copy_to_user((void __user *) arg, &stats,
1150 sizeof(stats)))
1151 break;
1152 l2tp_info(session, L2TP_MSG_CONTROL, "%s: get L2TP stats\n",
1153 session->name);
1154 err = 0;
1155 break;
1156 1099
1157 default: 1100 if (get_user(val, (int __user *)arg))
1158 err = -ENOSYS; 1101 return -EFAULT;
1159 break; 1102 break;
1160 }
1161 1103
1162 sock_put(sk);
1163
1164 return err;
1165}
1166
1167/* Tunnel ioctl helper.
1168 *
1169 * Note the special handling for PPPIOCGL2TPSTATS below. If the ioctl data
1170 * specifies a session_id, the session ioctl handler is called. This allows an
1171 * application to retrieve session stats via a tunnel socket.
1172 */
1173static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
1174 unsigned int cmd, unsigned long arg)
1175{
1176 int err = 0;
1177 struct sock *sk;
1178 struct pppol2tp_ioc_stats stats;
1179
1180 l2tp_dbg(tunnel, L2TP_MSG_CONTROL,
1181 "%s: pppol2tp_tunnel_ioctl(cmd=%#x, arg=%#lx)\n",
1182 tunnel->name, cmd, arg);
1183
1184 sk = tunnel->sock;
1185 sock_hold(sk);
1186
1187 switch (cmd) {
1188 case PPPIOCGL2TPSTATS: 1104 case PPPIOCGL2TPSTATS:
1189 err = -ENXIO; 1105 session = sock->sk->sk_user_data;
1190 if (!(sk->sk_state & PPPOX_CONNECTED)) 1106 if (!session)
1191 break; 1107 return -ENOTCONN;
1192 1108
1193 if (copy_from_user(&stats, (void __user *) arg, 1109 /* Session 0 represents the parent tunnel */
1194 sizeof(stats))) { 1110 if (!session->session_id && !session->peer_session_id) {
1195 err = -EFAULT; 1111 u32 session_id;
1196 break; 1112 int err;
1113
1114 if (copy_from_user(&stats, (void __user *)arg,
1115 sizeof(stats)))
1116 return -EFAULT;
1117
1118 session_id = stats.session_id;
1119 err = pppol2tp_tunnel_copy_stats(&stats,
1120 session->tunnel);
1121 if (err < 0)
1122 return err;
1123
1124 stats.session_id = session_id;
1125 } else {
1126 pppol2tp_copy_stats(&stats, &session->stats);
1127 stats.session_id = session->session_id;
1197 } 1128 }
1198 if (stats.session_id != 0) { 1129 stats.tunnel_id = session->tunnel->tunnel_id;
1199 /* resend to session ioctl handler */ 1130 stats.using_ipsec = l2tp_tunnel_uses_xfrm(session->tunnel);
1200 struct l2tp_session *session =
1201 l2tp_session_get(sock_net(sk), tunnel,
1202 stats.session_id);
1203
1204 if (session && session->pwtype == L2TP_PWTYPE_PPP) {
1205 err = pppol2tp_session_ioctl(session, cmd,
1206 arg);
1207 l2tp_session_dec_refcount(session);
1208 } else {
1209 err = -EBADR;
1210 }
1211 break;
1212 }
1213#ifdef CONFIG_XFRM
1214 stats.using_ipsec = (sk->sk_policy[0] || sk->sk_policy[1]) ? 1 : 0;
1215#endif
1216 pppol2tp_copy_stats(&stats, &tunnel->stats);
1217 if (copy_to_user((void __user *) arg, &stats, sizeof(stats))) {
1218 err = -EFAULT;
1219 break;
1220 }
1221 l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get L2TP stats\n",
1222 tunnel->name);
1223 err = 0;
1224 break;
1225 1131
1226 default: 1132 if (copy_to_user((void __user *)arg, &stats, sizeof(stats)))
1227 err = -ENOSYS; 1133 return -EFAULT;
1228 break; 1134 break;
1229 }
1230 1135
1231 sock_put(sk); 1136 default:
1232 1137 return -ENOIOCTLCMD;
1233 return err;
1234}
1235
1236/* Main ioctl() handler.
1237 * Dispatch to tunnel or session helpers depending on the socket.
1238 */
1239static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
1240 unsigned long arg)
1241{
1242 struct sock *sk = sock->sk;
1243 struct l2tp_session *session;
1244 struct l2tp_tunnel *tunnel;
1245 int err;
1246
1247 if (!sk)
1248 return 0;
1249
1250 err = -EBADF;
1251 if (sock_flag(sk, SOCK_DEAD) != 0)
1252 goto end;
1253
1254 err = -ENOTCONN;
1255 if ((sk->sk_user_data == NULL) ||
1256 (!(sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND))))
1257 goto end;
1258
1259 /* Get session context from the socket */
1260 err = -EBADF;
1261 session = pppol2tp_sock_to_session(sk);
1262 if (session == NULL)
1263 goto end;
1264
1265 /* Special case: if session's session_id is zero, treat ioctl as a
1266 * tunnel ioctl
1267 */
1268 if ((session->session_id == 0) &&
1269 (session->peer_session_id == 0)) {
1270 tunnel = session->tunnel;
1271 err = pppol2tp_tunnel_ioctl(tunnel, cmd, arg);
1272 goto end_put_sess;
1273 } 1138 }
1274 1139
1275 err = pppol2tp_session_ioctl(session, cmd, arg); 1140 return 0;
1276
1277end_put_sess:
1278 sock_put(sk);
1279end:
1280 return err;
1281} 1141}
1282 1142
1283/***************************************************************************** 1143/*****************************************************************************
@@ -1717,8 +1577,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
1717 tunnel->peer_tunnel_id, 1577 tunnel->peer_tunnel_id,
1718 session->peer_session_id, 1578 session->peer_session_id,
1719 state, user_data_ok); 1579 state, user_data_ok);
1720 seq_printf(m, " %d/%d/%c/%c/%s %08x %u\n", 1580 seq_printf(m, " 0/0/%c/%c/%s %08x %u\n",
1721 session->mtu, session->mru,
1722 session->recv_seq ? 'R' : '-', 1581 session->recv_seq ? 'R' : '-',
1723 session->send_seq ? 'S' : '-', 1582 session->send_seq ? 'S' : '-',
1724 session->lns_mode ? "LNS" : "LAC", 1583 session->lns_mode ? "LNS" : "LAC",
@@ -1818,7 +1677,7 @@ static const struct proto_ops pppol2tp_ops = {
1818 .socketpair = sock_no_socketpair, 1677 .socketpair = sock_no_socketpair,
1819 .accept = sock_no_accept, 1678 .accept = sock_no_accept,
1820 .getname = pppol2tp_getname, 1679 .getname = pppol2tp_getname,
1821 .poll_mask = datagram_poll_mask, 1680 .poll = datagram_poll,
1822 .listen = sock_no_listen, 1681 .listen = sock_no_listen,
1823 .shutdown = sock_no_shutdown, 1682 .shutdown = sock_no_shutdown,
1824 .setsockopt = pppol2tp_setsockopt, 1683 .setsockopt = pppol2tp_setsockopt,
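The rewritten pppol2tp_ioctl() keeps the long-standing user-visible convention for PPPIOCGL2TPSTATS: issued on the special session 0 (tunnel management) socket it reports tunnel counters, unless the caller passes a non-zero session_id, in which case that session's counters are returned and a missing session yields -EBADR. A rough userspace sketch of that dispatch, using simplified stand-in types rather than the kernel structures:

#include <stdio.h>

/* simplified stand-ins; the real structs live in l2tp_core.h / if_pppol2tp.h */
struct stats   { unsigned long rx_packets; };
struct session { unsigned int id; struct stats stats; };
struct tunnel  {
    struct stats stats;
    struct session *sessions;
    unsigned int n_sessions;
};

static struct session *tunnel_get_session(struct tunnel *t, unsigned int id)
{
    for (unsigned int i = 0; i < t->n_sessions; i++)
        if (t->sessions[i].id == id)
            return &t->sessions[i];
    return NULL;
}

/* session_id == 0 selects the tunnel's own counters, mirroring
 * pppol2tp_tunnel_copy_stats() in the patch above. */
static int copy_l2tp_stats(struct tunnel *t, unsigned int session_id,
                           struct stats *out)
{
    if (!session_id) {
        *out = t->stats;
        return 0;
    }

    struct session *s = tunnel_get_session(t, session_id);
    if (!s)
        return -1;              /* the kernel returns -EBADR here */
    *out = s->stats;
    return 0;
}

int main(void)
{
    struct session sess = { .id = 7, .stats = { .rx_packets = 42 } };
    struct tunnel tun = { .stats = { .rx_packets = 100 },
                          .sessions = &sess, .n_sessions = 1 };
    struct stats out;

    copy_l2tp_stats(&tun, 0, &out);
    printf("tunnel rx=%lu\n", out.rx_packets);
    copy_l2tp_stats(&tun, 7, &out);
    printf("session rx=%lu\n", out.rx_packets);
    return 0;
}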
diff --git a/net/llc/Kconfig b/net/llc/Kconfig
index b91c65108162..176a6c1521a5 100644
--- a/net/llc/Kconfig
+++ b/net/llc/Kconfig
@@ -6,5 +6,5 @@ config LLC2
6 tristate "ANSI/IEEE 802.2 LLC type 2 Support" 6 tristate "ANSI/IEEE 802.2 LLC type 2 Support"
7 select LLC 7 select LLC
8 help 8 help
9 This is a Logical Link Layer type 2, connection oriented support. 9 This is a Logical Link Layer type 2, connection oriented support.
10 Select this if you want to have support for PF_LLC sockets. 10 Select this if you want to have support for PF_LLC sockets.
diff --git a/net/llc/Makefile b/net/llc/Makefile
index 4e260cff3c5d..5e0ef436daae 100644
--- a/net/llc/Makefile
+++ b/net/llc/Makefile
@@ -4,7 +4,7 @@
4# Copyright (c) 1997 by Procom Technology,Inc. 4# Copyright (c) 1997 by Procom Technology,Inc.
5# 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> 5# 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6# 6#
7# This program can be redistributed or modified under the terms of the 7# This program can be redistributed or modified under the terms of the
8# GNU General Public License as published by the Free Software Foundation. 8# GNU General Public License as published by the Free Software Foundation.
9# This program is distributed without any warranty or implied warranty 9# This program is distributed without any warranty or implied warranty
10# of merchantability or fitness for a particular purpose. 10# of merchantability or fitness for a particular purpose.
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 804de8490186..1beeea9549fa 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1192,7 +1192,7 @@ static const struct proto_ops llc_ui_ops = {
1192 .socketpair = sock_no_socketpair, 1192 .socketpair = sock_no_socketpair,
1193 .accept = llc_ui_accept, 1193 .accept = llc_ui_accept,
1194 .getname = llc_ui_getname, 1194 .getname = llc_ui_getname,
1195 .poll_mask = datagram_poll_mask, 1195 .poll = datagram_poll,
1196 .ioctl = llc_ui_ioctl, 1196 .ioctl = llc_ui_ioctl,
1197 .listen = llc_ui_listen, 1197 .listen = llc_ui_listen,
1198 .shutdown = llc_ui_shutdown, 1198 .shutdown = llc_ui_shutdown,
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index 89041260784c..260b3dc1b4a2 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -73,8 +73,8 @@ struct llc_sap *llc_sap_find(unsigned char sap_value)
73 73
74 rcu_read_lock_bh(); 74 rcu_read_lock_bh();
75 sap = __llc_sap_find(sap_value); 75 sap = __llc_sap_find(sap_value);
76 if (sap) 76 if (!sap || !llc_sap_hold_safe(sap))
77 llc_sap_hold(sap); 77 sap = NULL;
78 rcu_read_unlock_bh(); 78 rcu_read_unlock_bh();
79 return sap; 79 return sap;
80} 80}
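The llc_sap_find() change closes a race where the SAP's refcount could already have hit zero (object being torn down) by the time the lookup tried to take a reference. The underlying idiom is "take a reference only if the count is still non-zero", which llc_sap_hold_safe() provides; a userspace analogue with C11 atomics, using a simplified refcount:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only if the object is still live (count > 0), the kind
 * of check llc_sap_hold_safe() performs instead of a plain increment. */
static bool ref_get_unless_zero(atomic_uint *ref)
{
    unsigned int old = atomic_load(ref);

    while (old != 0) {
        if (atomic_compare_exchange_weak(ref, &old, old + 1))
            return true;        /* reference taken */
    }
    return false;               /* object already dying, do not touch it */
}

int main(void)
{
    atomic_uint live = 1, dying = 0;

    printf("live: %d dying: %d\n",
           ref_get_unless_zero(&live), ref_get_unless_zero(&dying));
    return 0;
}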
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index 6daf391b3e84..8db03c2d5440 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -151,4 +151,3 @@ out:
151 sock_put(sk); 151 sock_put(sk);
152 return rc; 152 return rc;
153} 153}
154
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index e3589ade62e0..bb707789ef2b 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -12,6 +12,7 @@ mac80211-y := \
12 scan.o offchannel.o \ 12 scan.o offchannel.o \
13 ht.o agg-tx.o agg-rx.o \ 13 ht.o agg-tx.o agg-rx.o \
14 vht.o \ 14 vht.o \
15 he.o \
15 ibss.o \ 16 ibss.o \
16 iface.o \ 17 iface.o \
17 rate.o \ 18 rate.o \
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index e83c19d4c292..6a4f154c99f6 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -245,6 +245,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
245 }; 245 };
246 int i, ret = -EOPNOTSUPP; 246 int i, ret = -EOPNOTSUPP;
247 u16 status = WLAN_STATUS_REQUEST_DECLINED; 247 u16 status = WLAN_STATUS_REQUEST_DECLINED;
248 u16 max_buf_size;
248 249
249 if (tid >= IEEE80211_FIRST_TSPEC_TSID) { 250 if (tid >= IEEE80211_FIRST_TSPEC_TSID) {
250 ht_dbg(sta->sdata, 251 ht_dbg(sta->sdata,
@@ -268,13 +269,18 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
268 goto end; 269 goto end;
269 } 270 }
270 271
272 if (sta->sta.he_cap.has_he)
273 max_buf_size = IEEE80211_MAX_AMPDU_BUF;
274 else
275 max_buf_size = IEEE80211_MAX_AMPDU_BUF_HT;
276
271 /* sanity check for incoming parameters: 277 /* sanity check for incoming parameters:
272 * check if configuration can support the BA policy 278 * check if configuration can support the BA policy
 273 * and if buffer size does not exceed max value */ 279 * and if buffer size does not exceed max value */
274 /* XXX: check own ht delayed BA capability?? */ 280 /* XXX: check own ht delayed BA capability?? */
275 if (((ba_policy != 1) && 281 if (((ba_policy != 1) &&
276 (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || 282 (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) ||
277 (buf_size > IEEE80211_MAX_AMPDU_BUF)) { 283 (buf_size > max_buf_size)) {
278 status = WLAN_STATUS_INVALID_QOS_PARAM; 284 status = WLAN_STATUS_INVALID_QOS_PARAM;
279 ht_dbg_ratelimited(sta->sdata, 285 ht_dbg_ratelimited(sta->sdata,
280 "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n", 286 "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n",
@@ -283,7 +289,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
283 } 289 }
284 /* determine default buffer size */ 290 /* determine default buffer size */
285 if (buf_size == 0) 291 if (buf_size == 0)
286 buf_size = IEEE80211_MAX_AMPDU_BUF; 292 buf_size = max_buf_size;
287 293
288 /* make sure the size doesn't exceed the maximum supported by the hw */ 294 /* make sure the size doesn't exceed the maximum supported by the hw */
289 if (buf_size > sta->sta.max_rx_aggregation_subframes) 295 if (buf_size > sta->sta.max_rx_aggregation_subframes)
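The agg-rx change above sizes the receive reorder window by capability: HE peers may use the larger 802.11ax A-MPDU limit while everyone else stays at the HT limit, an oversized request is still declined, and the station's advertised maximum bounds the final value. A compact sketch of that selection, assuming the IEEE80211_MAX_AMPDU_BUF_HT/IEEE80211_MAX_AMPDU_BUF values (0x40 and 0x100) from include/linux/ieee80211.h:

#include <stdbool.h>
#include <stdio.h>

#define IEEE80211_MAX_AMPDU_BUF_HT 0x40   /* 64 subframes (HT), assumed value */
#define IEEE80211_MAX_AMPDU_BUF    0x100  /* 256 subframes (HE), assumed value */

/* Returns the buffer size to use, or 0 when the request must be declined,
 * mirroring the checks in ___ieee80211_start_rx_ba_session(). */
static unsigned int ampdu_buf_size(bool has_he, unsigned int requested,
                                   unsigned int sta_max)
{
    unsigned int max = has_he ? IEEE80211_MAX_AMPDU_BUF
                              : IEEE80211_MAX_AMPDU_BUF_HT;

    if (requested > max)
        return 0;               /* bad params: request is declined */
    if (requested == 0)
        requested = max;        /* peer left the choice to us */
    if (requested > sta_max)
        requested = sta_max;    /* never exceed the advertised maximum */
    return requested;
}

int main(void)
{
    printf("%u\n", ampdu_buf_size(true, 0, 256));    /* HE default: 256 */
    printf("%u\n", ampdu_buf_size(false, 0, 256));   /* HT default: 64 */
    printf("%u\n", ampdu_buf_size(false, 100, 256)); /* >64 on HT: declined */
    return 0;
}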
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index ac4295296514..69e831bc317b 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -463,6 +463,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
463 .timeout = 0, 463 .timeout = 0,
464 }; 464 };
465 int ret; 465 int ret;
466 u16 buf_size;
466 467
467 tid_tx = rcu_dereference_protected_tid_tx(sta, tid); 468 tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
468 469
@@ -511,11 +512,22 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
511 sta->ampdu_mlme.addba_req_num[tid]++; 512 sta->ampdu_mlme.addba_req_num[tid]++;
512 spin_unlock_bh(&sta->lock); 513 spin_unlock_bh(&sta->lock);
513 514
515 if (sta->sta.he_cap.has_he) {
516 buf_size = local->hw.max_tx_aggregation_subframes;
517 } else {
518 /*
519 * We really should use what the driver told us it will
520 * transmit as the maximum, but certain APs (e.g. the
521 * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012)
522 * will crash when we use a lower number.
523 */
524 buf_size = IEEE80211_MAX_AMPDU_BUF_HT;
525 }
526
514 /* send AddBA request */ 527 /* send AddBA request */
515 ieee80211_send_addba_request(sdata, sta->sta.addr, tid, 528 ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
516 tid_tx->dialog_token, params.ssn, 529 tid_tx->dialog_token, params.ssn,
517 IEEE80211_MAX_AMPDU_BUF, 530 buf_size, tid_tx->timeout);
518 tid_tx->timeout);
519} 531}
520 532
521/* 533/*
@@ -905,8 +917,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
905{ 917{
906 struct tid_ampdu_tx *tid_tx; 918 struct tid_ampdu_tx *tid_tx;
907 struct ieee80211_txq *txq; 919 struct ieee80211_txq *txq;
908 u16 capab, tid; 920 u16 capab, tid, buf_size;
909 u8 buf_size;
910 bool amsdu; 921 bool amsdu;
911 922
912 capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); 923 capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index bdf6fa78d0d2..d25da0e66da1 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -495,7 +495,7 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
495 goto out_unlock; 495 goto out_unlock;
496 } 496 }
497 497
498 ieee80211_key_free(key, true); 498 ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION);
499 499
500 ret = 0; 500 ret = 0;
501 out_unlock: 501 out_unlock:
@@ -1412,6 +1412,11 @@ static int sta_apply_parameters(struct ieee80211_local *local,
1412 ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, 1412 ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
1413 params->vht_capa, sta); 1413 params->vht_capa, sta);
1414 1414
1415 if (params->he_capa)
1416 ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
1417 (void *)params->he_capa,
1418 params->he_capa_len, sta);
1419
1415 if (params->opmode_notif_used) { 1420 if (params->opmode_notif_used) {
1416 /* returned value is only needed for rc update, but the 1421 /* returned value is only needed for rc update, but the
1417 * rc isn't initialized here yet, so ignore it 1422 * rc isn't initialized here yet, so ignore it
@@ -3486,7 +3491,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
3486 } 3491 }
3487 3492
3488 local_bh_disable(); 3493 local_bh_disable();
3489 ieee80211_xmit(sdata, sta, skb); 3494 ieee80211_xmit(sdata, sta, skb, 0);
3490 local_bh_enable(); 3495 local_bh_enable();
3491 3496
3492 ret = 0; 3497 ret = 0;
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 690c142a7a44..5ac743816b59 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -116,16 +116,16 @@ static void ieee80211_get_stats(struct net_device *dev,
116 data[i++] = sta->sta_state; 116 data[i++] = sta->sta_state;
117 117
118 118
119 if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) 119 if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))
120 data[i] = 100000ULL * 120 data[i] = 100000ULL *
121 cfg80211_calculate_bitrate(&sinfo.txrate); 121 cfg80211_calculate_bitrate(&sinfo.txrate);
122 i++; 122 i++;
123 if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE)) 123 if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))
124 data[i] = 100000ULL * 124 data[i] = 100000ULL *
125 cfg80211_calculate_bitrate(&sinfo.rxrate); 125 cfg80211_calculate_bitrate(&sinfo.rxrate);
126 i++; 126 i++;
127 127
128 if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL_AVG)) 128 if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))
129 data[i] = (u8)sinfo.signal_avg; 129 data[i] = (u8)sinfo.signal_avg;
130 i++; 130 i++;
131 } else { 131 } else {
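The ethtool.c hunk only swaps BIT() for BIT_ULL() in the sinfo.filled tests. The reason is that the filled bitmap is 64 bits wide and the NL80211_STA_INFO_* indices have grown past 31, so the unsigned-long BIT() form is not safe on 32-bit builds. A tiny standalone illustration follows, with the two macros re-created locally so it compiles on its own (they mirror the kernel's BIT()/BIT_ULL()):

/*
 * Illustrative sketch, not part of the commit: with a 64-bit bitmap and
 * bit indices above 31, (1UL << n) is truncated or undefined where
 * unsigned long is 32 bits, while (1ULL << n) stays well defined.
 */
#include <stdint.h>
#include <stdio.h>

#define MY_BIT(nr)     (1UL << (nr))    /* like BIT()     */
#define MY_BIT_ULL(nr) (1ULL << (nr))   /* like BIT_ULL() */

int main(void)
{
    uint64_t filled = MY_BIT_ULL(40);   /* some attribute index above 31 */

    /* MY_BIT(40) would be undefined on a 32-bit unsigned long */
    printf("set: %d\n", !!(filled & MY_BIT_ULL(40)));
    return 0;
}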
diff --git a/net/mac80211/he.c b/net/mac80211/he.c
new file mode 100644
index 000000000000..769078ed5a12
--- /dev/null
+++ b/net/mac80211/he.c
@@ -0,0 +1,55 @@
1/*
2 * HE handling
3 *
4 * Copyright(c) 2017 Intel Deutschland GmbH
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include "ieee80211_i.h"
12
13void
14ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
15 struct ieee80211_supported_band *sband,
16 const u8 *he_cap_ie, u8 he_cap_len,
17 struct sta_info *sta)
18{
19 struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap;
20 struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie;
21 u8 he_ppe_size;
22 u8 mcs_nss_size;
23 u8 he_total_size;
24
25 memset(he_cap, 0, sizeof(*he_cap));
26
27 if (!he_cap_ie || !ieee80211_get_he_sta_cap(sband))
28 return;
29
30 /* Make sure size is OK */
31 mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap_ie_elem);
32 he_ppe_size =
33 ieee80211_he_ppe_size(he_cap_ie[sizeof(he_cap->he_cap_elem) +
34 mcs_nss_size],
35 he_cap_ie_elem->phy_cap_info);
36 he_total_size = sizeof(he_cap->he_cap_elem) + mcs_nss_size +
37 he_ppe_size;
38 if (he_cap_len < he_total_size)
39 return;
40
41 memcpy(&he_cap->he_cap_elem, he_cap_ie, sizeof(he_cap->he_cap_elem));
42
43 /* HE Tx/Rx HE MCS NSS Support Field */
44 memcpy(&he_cap->he_mcs_nss_supp,
45 &he_cap_ie[sizeof(he_cap->he_cap_elem)], mcs_nss_size);
46
47 /* Check if there are (optional) PPE Thresholds */
48 if (he_cap->he_cap_elem.phy_cap_info[6] &
49 IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT)
50 memcpy(he_cap->ppe_thres,
51 &he_cap_ie[sizeof(he_cap->he_cap_elem) + mcs_nss_size],
52 he_ppe_size);
53
54 he_cap->has_he = true;
55}
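The new he.c copies the peer's HE capabilities element in three pieces: the fixed capability fields, then a variable-length Tx/Rx MCS-NSS set, then optional PPE thresholds, and it refuses the element if the advertised length cannot cover all three. The standalone sketch below walks the same layout arithmetic on a raw byte buffer; the fixed length and the sizing helper are simplified placeholders for sizeof(he_cap->he_cap_elem), ieee80211_he_mcs_nss_size() and ieee80211_he_ppe_size(), not their real implementations.

/*
 * Illustrative sketch, not part of the commit: the three-part HE
 * capability element layout that the parser above validates.
 * FIXED_CAP_LEN and mcs_nss_size() are placeholders.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define FIXED_CAP_LEN 17    /* placeholder for sizeof(struct ieee80211_he_cap_elem) */

static uint8_t mcs_nss_size(bool has_160, bool has_80p80)
{
    /* 4 bytes for the <= 80 MHz map, plus 4 per extra supported width */
    return 4 + (has_160 ? 4 : 0) + (has_80p80 ? 4 : 0);
}

/* Returns true and fills the three slices if the element is long enough. */
static bool split_he_cap(const uint8_t *ie, uint8_t ie_len,
                         bool has_160, bool has_80p80, uint8_t ppe_len,
                         const uint8_t **fixed, const uint8_t **mcs_nss,
                         const uint8_t **ppe)
{
    uint8_t mcs_len = mcs_nss_size(has_160, has_80p80);

    if (ie_len < FIXED_CAP_LEN + mcs_len + ppe_len)
        return false;       /* same early-out as the kernel helper */

    *fixed = ie;
    *mcs_nss = ie + FIXED_CAP_LEN;
    *ppe = ppe_len ? ie + FIXED_CAP_LEN + mcs_len : NULL;
    return true;
}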
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 26a7ba3b698f..f849ea814993 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -352,7 +352,7 @@ void ieee80211_ba_session_work(struct work_struct *work)
352 test_and_clear_bit(tid, 352 test_and_clear_bit(tid,
353 sta->ampdu_mlme.tid_rx_manage_offl)) 353 sta->ampdu_mlme.tid_rx_manage_offl))
354 ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, 354 ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
355 IEEE80211_MAX_AMPDU_BUF, 355 IEEE80211_MAX_AMPDU_BUF_HT,
356 false, true); 356 false, true);
357 357
358 if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, 358 if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index d1978aa1c15d..172aeae21ae9 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -165,6 +165,7 @@ typedef unsigned __bitwise ieee80211_tx_result;
165#define TX_DROP ((__force ieee80211_tx_result) 1u) 165#define TX_DROP ((__force ieee80211_tx_result) 1u)
166#define TX_QUEUED ((__force ieee80211_tx_result) 2u) 166#define TX_QUEUED ((__force ieee80211_tx_result) 2u)
167 167
168#define IEEE80211_TX_NO_SEQNO BIT(0)
168#define IEEE80211_TX_UNICAST BIT(1) 169#define IEEE80211_TX_UNICAST BIT(1)
169#define IEEE80211_TX_PS_BUFFERED BIT(2) 170#define IEEE80211_TX_PS_BUFFERED BIT(2)
170 171
@@ -364,6 +365,7 @@ enum ieee80211_sta_flags {
364 IEEE80211_STA_DISABLE_160MHZ = BIT(13), 365 IEEE80211_STA_DISABLE_160MHZ = BIT(13),
365 IEEE80211_STA_DISABLE_WMM = BIT(14), 366 IEEE80211_STA_DISABLE_WMM = BIT(14),
366 IEEE80211_STA_ENABLE_RRM = BIT(15), 367 IEEE80211_STA_ENABLE_RRM = BIT(15),
368 IEEE80211_STA_DISABLE_HE = BIT(16),
367}; 369};
368 370
369struct ieee80211_mgd_auth_data { 371struct ieee80211_mgd_auth_data {
@@ -1453,6 +1455,10 @@ struct ieee802_11_elems {
1453 const struct ieee80211_vht_cap *vht_cap_elem; 1455 const struct ieee80211_vht_cap *vht_cap_elem;
1454 const struct ieee80211_vht_operation *vht_operation; 1456 const struct ieee80211_vht_operation *vht_operation;
1455 const struct ieee80211_meshconf_ie *mesh_config; 1457 const struct ieee80211_meshconf_ie *mesh_config;
1458 const u8 *he_cap;
1459 const struct ieee80211_he_operation *he_operation;
1460 const struct ieee80211_mu_edca_param_set *mu_edca_param_set;
1461 const u8 *uora_element;
1456 const u8 *mesh_id; 1462 const u8 *mesh_id;
1457 const u8 *peering; 1463 const u8 *peering;
1458 const __le16 *awake_window; 1464 const __le16 *awake_window;
@@ -1482,6 +1488,7 @@ struct ieee802_11_elems {
1482 u8 ext_supp_rates_len; 1488 u8 ext_supp_rates_len;
1483 u8 wmm_info_len; 1489 u8 wmm_info_len;
1484 u8 wmm_param_len; 1490 u8 wmm_param_len;
1491 u8 he_cap_len;
1485 u8 mesh_id_len; 1492 u8 mesh_id_len;
1486 u8 peering_len; 1493 u8 peering_len;
1487 u8 preq_len; 1494 u8 preq_len;
@@ -1824,6 +1831,13 @@ void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
1824enum nl80211_chan_width 1831enum nl80211_chan_width
1825ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta); 1832ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta);
1826 1833
1834/* HE */
1835void
1836ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
1837 struct ieee80211_supported_band *sband,
1838 const u8 *he_cap_ie, u8 he_cap_len,
1839 struct sta_info *sta);
1840
1827/* Spectrum management */ 1841/* Spectrum management */
1828void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, 1842void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
1829 struct ieee80211_mgmt *mgmt, 1843 struct ieee80211_mgmt *mgmt,
@@ -1880,19 +1894,20 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
1880void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, 1894void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
1881 bool bss_notify, bool enable_qos); 1895 bool bss_notify, bool enable_qos);
1882void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, 1896void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
1883 struct sta_info *sta, struct sk_buff *skb); 1897 struct sta_info *sta, struct sk_buff *skb,
1898 u32 txdata_flags);
1884 1899
1885void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, 1900void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
1886 struct sk_buff *skb, int tid, 1901 struct sk_buff *skb, int tid,
1887 enum nl80211_band band); 1902 enum nl80211_band band, u32 txdata_flags);
1888 1903
1889static inline void 1904static inline void
1890ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, 1905ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
1891 struct sk_buff *skb, int tid, 1906 struct sk_buff *skb, int tid,
1892 enum nl80211_band band) 1907 enum nl80211_band band, u32 txdata_flags)
1893{ 1908{
1894 rcu_read_lock(); 1909 rcu_read_lock();
1895 __ieee80211_tx_skb_tid_band(sdata, skb, tid, band); 1910 __ieee80211_tx_skb_tid_band(sdata, skb, tid, band, txdata_flags);
1896 rcu_read_unlock(); 1911 rcu_read_unlock();
1897} 1912}
1898 1913
@@ -1910,7 +1925,7 @@ static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
1910 } 1925 }
1911 1926
1912 __ieee80211_tx_skb_tid_band(sdata, skb, tid, 1927 __ieee80211_tx_skb_tid_band(sdata, skb, tid,
1913 chanctx_conf->def.chan->band); 1928 chanctx_conf->def.chan->band, 0);
1914 rcu_read_unlock(); 1929 rcu_read_unlock();
1915} 1930}
1916 1931
@@ -2031,26 +2046,27 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
2031void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, 2046void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
2032 const u8 *bssid, u16 stype, u16 reason, 2047 const u8 *bssid, u16 stype, u16 reason,
2033 bool send_frame, u8 *frame_buf); 2048 bool send_frame, u8 *frame_buf);
2049
2050enum {
2051 IEEE80211_PROBE_FLAG_DIRECTED = BIT(0),
2052 IEEE80211_PROBE_FLAG_MIN_CONTENT = BIT(1),
2053 IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2),
2054};
2055
2034int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, 2056int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
2035 size_t buffer_len, 2057 size_t buffer_len,
2036 struct ieee80211_scan_ies *ie_desc, 2058 struct ieee80211_scan_ies *ie_desc,
2037 const u8 *ie, size_t ie_len, 2059 const u8 *ie, size_t ie_len,
2038 u8 bands_used, u32 *rate_masks, 2060 u8 bands_used, u32 *rate_masks,
2039 struct cfg80211_chan_def *chandef); 2061 struct cfg80211_chan_def *chandef,
2062 u32 flags);
2040struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, 2063struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
2041 const u8 *src, const u8 *dst, 2064 const u8 *src, const u8 *dst,
2042 u32 ratemask, 2065 u32 ratemask,
2043 struct ieee80211_channel *chan, 2066 struct ieee80211_channel *chan,
2044 const u8 *ssid, size_t ssid_len, 2067 const u8 *ssid, size_t ssid_len,
2045 const u8 *ie, size_t ie_len, 2068 const u8 *ie, size_t ie_len,
2046 bool directed); 2069 u32 flags);
2047void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata,
2048 const u8 *src, const u8 *dst,
2049 const u8 *ssid, size_t ssid_len,
2050 const u8 *ie, size_t ie_len,
2051 u32 ratemask, bool directed, u32 tx_flags,
2052 struct ieee80211_channel *channel, bool scan);
2053
2054u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, 2070u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
2055 struct ieee802_11_elems *elems, 2071 struct ieee802_11_elems *elems,
2056 enum nl80211_band band, u32 *basic_rates); 2072 enum nl80211_band band, u32 *basic_rates);
@@ -2073,6 +2089,9 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
2073 u32 cap); 2089 u32 cap);
2074u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, 2090u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
2075 const struct cfg80211_chan_def *chandef); 2091 const struct cfg80211_chan_def *chandef);
2092u8 *ieee80211_ie_build_he_cap(u8 *pos,
2093 const struct ieee80211_sta_he_cap *he_cap,
2094 u8 *end);
2076int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, 2095int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
2077 const struct ieee80211_supported_band *sband, 2096 const struct ieee80211_supported_band *sband,
2078 const u8 *srates, int srates_len, u32 *rates); 2097 const u8 *srates, int srates_len, u32 *rates);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 555e389b7dfa..5e6cf2cee965 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1130,7 +1130,7 @@ static void ieee80211_uninit(struct net_device *dev)
1130 1130
1131static u16 ieee80211_netdev_select_queue(struct net_device *dev, 1131static u16 ieee80211_netdev_select_queue(struct net_device *dev,
1132 struct sk_buff *skb, 1132 struct sk_buff *skb,
1133 void *accel_priv, 1133 struct net_device *sb_dev,
1134 select_queue_fallback_t fallback) 1134 select_queue_fallback_t fallback)
1135{ 1135{
1136 return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); 1136 return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
@@ -1176,7 +1176,7 @@ static const struct net_device_ops ieee80211_dataif_ops = {
1176 1176
1177static u16 ieee80211_monitor_select_queue(struct net_device *dev, 1177static u16 ieee80211_monitor_select_queue(struct net_device *dev,
1178 struct sk_buff *skb, 1178 struct sk_buff *skb,
1179 void *accel_priv, 1179 struct net_device *sb_dev,
1180 select_queue_fallback_t fallback) 1180 select_queue_fallback_t fallback)
1181{ 1181{
1182 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); 1182 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index ee0d0cc8dc3b..c054ac85793c 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -656,11 +656,15 @@ int ieee80211_key_link(struct ieee80211_key *key,
656{ 656{
657 struct ieee80211_local *local = sdata->local; 657 struct ieee80211_local *local = sdata->local;
658 struct ieee80211_key *old_key; 658 struct ieee80211_key *old_key;
659 int idx, ret; 659 int idx = key->conf.keyidx;
660 bool pairwise; 660 bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
661 661 /*
662 pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; 662 * We want to delay tailroom updates only for station - in that
663 idx = key->conf.keyidx; 663 * case it helps roaming speed, but in other cases it hurts and
664 * can cause warnings to appear.
665 */
666 bool delay_tailroom = sdata->vif.type == NL80211_IFTYPE_STATION;
667 int ret;
664 668
665 mutex_lock(&sdata->local->key_mtx); 669 mutex_lock(&sdata->local->key_mtx);
666 670
@@ -688,14 +692,14 @@ int ieee80211_key_link(struct ieee80211_key *key,
688 increment_tailroom_need_count(sdata); 692 increment_tailroom_need_count(sdata);
689 693
690 ieee80211_key_replace(sdata, sta, pairwise, old_key, key); 694 ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
691 ieee80211_key_destroy(old_key, true); 695 ieee80211_key_destroy(old_key, delay_tailroom);
692 696
693 ieee80211_debugfs_key_add(key); 697 ieee80211_debugfs_key_add(key);
694 698
695 if (!local->wowlan) { 699 if (!local->wowlan) {
696 ret = ieee80211_key_enable_hw_accel(key); 700 ret = ieee80211_key_enable_hw_accel(key);
697 if (ret) 701 if (ret)
698 ieee80211_key_free(key, true); 702 ieee80211_key_free(key, delay_tailroom);
699 } else { 703 } else {
700 ret = 0; 704 ret = 0;
701 } 705 }
@@ -930,7 +934,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local,
930 ieee80211_key_replace(key->sdata, key->sta, 934 ieee80211_key_replace(key->sdata, key->sta,
931 key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, 935 key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
932 key, NULL); 936 key, NULL);
933 __ieee80211_key_destroy(key, true); 937 __ieee80211_key_destroy(key, key->sdata->vif.type ==
938 NL80211_IFTYPE_STATION);
934 } 939 }
935 940
936 for (i = 0; i < NUM_DEFAULT_KEYS; i++) { 941 for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
@@ -940,7 +945,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local,
940 ieee80211_key_replace(key->sdata, key->sta, 945 ieee80211_key_replace(key->sdata, key->sta,
941 key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, 946 key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
942 key, NULL); 947 key, NULL);
943 __ieee80211_key_destroy(key, true); 948 __ieee80211_key_destroy(key, key->sdata->vif.type ==
949 NL80211_IFTYPE_STATION);
944 } 950 }
945 951
946 mutex_unlock(&local->key_mtx); 952 mutex_unlock(&local->key_mtx);
diff --git a/net/mac80211/led.c b/net/mac80211/led.c
index ba0b507ea691..d6c66fc19716 100644
--- a/net/mac80211/led.c
+++ b/net/mac80211/led.c
@@ -52,13 +52,15 @@ void ieee80211_free_led_names(struct ieee80211_local *local)
52 kfree(local->radio_led.name); 52 kfree(local->radio_led.name);
53} 53}
54 54
55static void ieee80211_tx_led_activate(struct led_classdev *led_cdev) 55static int ieee80211_tx_led_activate(struct led_classdev *led_cdev)
56{ 56{
57 struct ieee80211_local *local = container_of(led_cdev->trigger, 57 struct ieee80211_local *local = container_of(led_cdev->trigger,
58 struct ieee80211_local, 58 struct ieee80211_local,
59 tx_led); 59 tx_led);
60 60
61 atomic_inc(&local->tx_led_active); 61 atomic_inc(&local->tx_led_active);
62
63 return 0;
62} 64}
63 65
64static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev) 66static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev)
@@ -70,13 +72,15 @@ static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev)
70 atomic_dec(&local->tx_led_active); 72 atomic_dec(&local->tx_led_active);
71} 73}
72 74
73static void ieee80211_rx_led_activate(struct led_classdev *led_cdev) 75static int ieee80211_rx_led_activate(struct led_classdev *led_cdev)
74{ 76{
75 struct ieee80211_local *local = container_of(led_cdev->trigger, 77 struct ieee80211_local *local = container_of(led_cdev->trigger,
76 struct ieee80211_local, 78 struct ieee80211_local,
77 rx_led); 79 rx_led);
78 80
79 atomic_inc(&local->rx_led_active); 81 atomic_inc(&local->rx_led_active);
82
83 return 0;
80} 84}
81 85
82static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev) 86static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev)
@@ -88,13 +92,15 @@ static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev)
88 atomic_dec(&local->rx_led_active); 92 atomic_dec(&local->rx_led_active);
89} 93}
90 94
91static void ieee80211_assoc_led_activate(struct led_classdev *led_cdev) 95static int ieee80211_assoc_led_activate(struct led_classdev *led_cdev)
92{ 96{
93 struct ieee80211_local *local = container_of(led_cdev->trigger, 97 struct ieee80211_local *local = container_of(led_cdev->trigger,
94 struct ieee80211_local, 98 struct ieee80211_local,
95 assoc_led); 99 assoc_led);
96 100
97 atomic_inc(&local->assoc_led_active); 101 atomic_inc(&local->assoc_led_active);
102
103 return 0;
98} 104}
99 105
100static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev) 106static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev)
@@ -106,13 +112,15 @@ static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev)
106 atomic_dec(&local->assoc_led_active); 112 atomic_dec(&local->assoc_led_active);
107} 113}
108 114
109static void ieee80211_radio_led_activate(struct led_classdev *led_cdev) 115static int ieee80211_radio_led_activate(struct led_classdev *led_cdev)
110{ 116{
111 struct ieee80211_local *local = container_of(led_cdev->trigger, 117 struct ieee80211_local *local = container_of(led_cdev->trigger,
112 struct ieee80211_local, 118 struct ieee80211_local,
113 radio_led); 119 radio_led);
114 120
115 atomic_inc(&local->radio_led_active); 121 atomic_inc(&local->radio_led_active);
122
123 return 0;
116} 124}
117 125
118static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev) 126static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev)
@@ -124,13 +132,15 @@ static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev)
124 atomic_dec(&local->radio_led_active); 132 atomic_dec(&local->radio_led_active);
125} 133}
126 134
127static void ieee80211_tpt_led_activate(struct led_classdev *led_cdev) 135static int ieee80211_tpt_led_activate(struct led_classdev *led_cdev)
128{ 136{
129 struct ieee80211_local *local = container_of(led_cdev->trigger, 137 struct ieee80211_local *local = container_of(led_cdev->trigger,
130 struct ieee80211_local, 138 struct ieee80211_local,
131 tpt_led); 139 tpt_led);
132 140
133 atomic_inc(&local->tpt_led_active); 141 atomic_inc(&local->tpt_led_active);
142
143 return 0;
134} 144}
135 145
136static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev) 146static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev)
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index fb73451ed85e..4fb2709cb527 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -3,6 +3,7 @@
3 * Copyright 2005-2006, Devicescape Software, Inc. 3 * Copyright 2005-2006, Devicescape Software, Inc.
4 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> 4 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
5 * Copyright 2013-2014 Intel Mobile Communications GmbH 5 * Copyright 2013-2014 Intel Mobile Communications GmbH
6 * Copyright (C) 2017 Intel Deutschland GmbH
6 * 7 *
7 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
@@ -557,10 +558,19 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
557 wiphy_ext_feature_set(wiphy, 558 wiphy_ext_feature_set(wiphy,
558 NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); 559 NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211);
559 560
560 if (!ops->hw_scan) 561 if (!ops->hw_scan) {
561 wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | 562 wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN |
562 NL80211_FEATURE_AP_SCAN; 563 NL80211_FEATURE_AP_SCAN;
563 564 /*
565 * if the driver behaves correctly using the probe request
566 * (template) from mac80211, then both of these should be
567 * supported even with hw scan - but let drivers opt in.
568 */
569 wiphy_ext_feature_set(wiphy,
570 NL80211_EXT_FEATURE_SCAN_RANDOM_SN);
571 wiphy_ext_feature_set(wiphy,
572 NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT);
573 }
564 574
565 if (!ops->set_key) 575 if (!ops->set_key)
566 wiphy->flags |= WIPHY_FLAG_IBSS_RSN; 576 wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
@@ -588,8 +598,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
588 local->hw.queues = 1; 598 local->hw.queues = 1;
589 local->hw.max_rates = 1; 599 local->hw.max_rates = 1;
590 local->hw.max_report_rates = 0; 600 local->hw.max_report_rates = 0;
591 local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; 601 local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT;
592 local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; 602 local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT;
593 local->hw.offchannel_tx_hw_queue = IEEE80211_INVAL_HW_QUEUE; 603 local->hw.offchannel_tx_hw_queue = IEEE80211_INVAL_HW_QUEUE;
594 local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; 604 local->hw.conf.long_frame_max_tx_count = wiphy->retry_long;
595 local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; 605 local->hw.conf.short_frame_max_tx_count = wiphy->retry_short;
@@ -816,7 +826,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
816 int result, i; 826 int result, i;
817 enum nl80211_band band; 827 enum nl80211_band band;
818 int channels, max_bitrates; 828 int channels, max_bitrates;
819 bool supp_ht, supp_vht; 829 bool supp_ht, supp_vht, supp_he;
820 netdev_features_t feature_whitelist; 830 netdev_features_t feature_whitelist;
821 struct cfg80211_chan_def dflt_chandef = {}; 831 struct cfg80211_chan_def dflt_chandef = {};
822 832
@@ -896,6 +906,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
896 max_bitrates = 0; 906 max_bitrates = 0;
897 supp_ht = false; 907 supp_ht = false;
898 supp_vht = false; 908 supp_vht = false;
909 supp_he = false;
899 for (band = 0; band < NUM_NL80211_BANDS; band++) { 910 for (band = 0; band < NUM_NL80211_BANDS; band++) {
900 struct ieee80211_supported_band *sband; 911 struct ieee80211_supported_band *sband;
901 912
@@ -922,6 +933,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
922 supp_ht = supp_ht || sband->ht_cap.ht_supported; 933 supp_ht = supp_ht || sband->ht_cap.ht_supported;
923 supp_vht = supp_vht || sband->vht_cap.vht_supported; 934 supp_vht = supp_vht || sband->vht_cap.vht_supported;
924 935
936 if (!supp_he)
937 supp_he = !!ieee80211_get_he_sta_cap(sband);
938
925 if (!sband->ht_cap.ht_supported) 939 if (!sband->ht_cap.ht_supported)
926 continue; 940 continue;
927 941
@@ -1011,6 +1025,18 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
1011 local->scan_ies_len += 1025 local->scan_ies_len +=
1012 2 + sizeof(struct ieee80211_vht_cap); 1026 2 + sizeof(struct ieee80211_vht_cap);
1013 1027
1028 /* HE cap element is variable in size - set len to allow max size */
1029 /*
1030 * TODO: 1 is added at the end of the calculation to accommodate for
1031 * the temporary placing of the HE capabilities IE under EXT.
1032 * Remove it once it is placed in the final place.
1033 */
1034 if (supp_he)
1035 local->scan_ies_len +=
1036 2 + sizeof(struct ieee80211_he_cap_elem) +
1037 sizeof(struct ieee80211_he_mcs_nss_supp) +
1038 IEEE80211_HE_PPE_THRES_MAX_LEN + 1;
1039
1014 if (!local->ops->hw_scan) { 1040 if (!local->ops->hw_scan) {
1015 /* For hw_scan, driver needs to set these up. */ 1041 /* For hw_scan, driver needs to set these up. */
1016 local->hw.wiphy->max_scan_ssids = 4; 1042 local->hw.wiphy->max_scan_ssids = 4;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index a59187c016e0..7fb9957359a3 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -149,6 +149,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
149 struct ieee80211_channel *channel, 149 struct ieee80211_channel *channel,
150 const struct ieee80211_ht_operation *ht_oper, 150 const struct ieee80211_ht_operation *ht_oper,
151 const struct ieee80211_vht_operation *vht_oper, 151 const struct ieee80211_vht_operation *vht_oper,
152 const struct ieee80211_he_operation *he_oper,
152 struct cfg80211_chan_def *chandef, bool tracking) 153 struct cfg80211_chan_def *chandef, bool tracking)
153{ 154{
154 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; 155 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -207,7 +208,27 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
207 } 208 }
208 209
209 vht_chandef = *chandef; 210 vht_chandef = *chandef;
210 if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { 211 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && he_oper &&
212 (le32_to_cpu(he_oper->he_oper_params) &
213 IEEE80211_HE_OPERATION_VHT_OPER_INFO)) {
214 struct ieee80211_vht_operation he_oper_vht_cap;
215
216 /*
217 * Set only first 3 bytes (other 2 aren't used in
218 * ieee80211_chandef_vht_oper() anyway)
219 */
220 memcpy(&he_oper_vht_cap, he_oper->optional, 3);
221 he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0);
222
223 if (!ieee80211_chandef_vht_oper(&he_oper_vht_cap,
224 &vht_chandef)) {
225 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
226 sdata_info(sdata,
227 "HE AP VHT information is invalid, disable HE\n");
228 ret = IEEE80211_STA_DISABLE_HE;
229 goto out;
230 }
231 } else if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) {
211 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) 232 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
212 sdata_info(sdata, 233 sdata_info(sdata,
213 "AP VHT information is invalid, disable VHT\n"); 234 "AP VHT information is invalid, disable VHT\n");
@@ -300,12 +321,14 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
300 const struct ieee80211_ht_cap *ht_cap, 321 const struct ieee80211_ht_cap *ht_cap,
301 const struct ieee80211_ht_operation *ht_oper, 322 const struct ieee80211_ht_operation *ht_oper,
302 const struct ieee80211_vht_operation *vht_oper, 323 const struct ieee80211_vht_operation *vht_oper,
324 const struct ieee80211_he_operation *he_oper,
303 const u8 *bssid, u32 *changed) 325 const u8 *bssid, u32 *changed)
304{ 326{
305 struct ieee80211_local *local = sdata->local; 327 struct ieee80211_local *local = sdata->local;
306 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; 328 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
307 struct ieee80211_supported_band *sband; 329 struct ieee80211_channel *chan = sdata->vif.bss_conf.chandef.chan;
308 struct ieee80211_channel *chan; 330 struct ieee80211_supported_band *sband =
331 local->hw.wiphy->bands[chan->band];
309 struct cfg80211_chan_def chandef; 332 struct cfg80211_chan_def chandef;
310 u16 ht_opmode; 333 u16 ht_opmode;
311 u32 flags; 334 u32 flags;
@@ -320,6 +343,11 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
320 if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT) 343 if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT)
321 vht_oper = NULL; 344 vht_oper = NULL;
322 345
346 /* don't check HE if we associated as non-HE station */
347 if (ifmgd->flags & IEEE80211_STA_DISABLE_HE ||
348 !ieee80211_get_he_sta_cap(sband))
349 he_oper = NULL;
350
323 if (WARN_ON_ONCE(!sta)) 351 if (WARN_ON_ONCE(!sta))
324 return -EINVAL; 352 return -EINVAL;
325 353
@@ -333,12 +361,9 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
333 sdata->vif.bss_conf.ht_operation_mode = ht_opmode; 361 sdata->vif.bss_conf.ht_operation_mode = ht_opmode;
334 } 362 }
335 363
336 chan = sdata->vif.bss_conf.chandef.chan; 364 /* calculate new channel (type) based on HT/VHT/HE operation IEs */
337 sband = local->hw.wiphy->bands[chan->band];
338
339 /* calculate new channel (type) based on HT/VHT operation IEs */
340 flags = ieee80211_determine_chantype(sdata, sband, chan, 365 flags = ieee80211_determine_chantype(sdata, sband, chan,
341 ht_oper, vht_oper, 366 ht_oper, vht_oper, he_oper,
342 &chandef, true); 367 &chandef, true);
343 368
344 /* 369 /*
@@ -582,6 +607,34 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
582 ieee80211_ie_build_vht_cap(pos, &vht_cap, cap); 607 ieee80211_ie_build_vht_cap(pos, &vht_cap, cap);
583} 608}
584 609
610/* This function determines HE capability flags for the association
611 * and builds the IE.
612 */
613static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata,
614 struct sk_buff *skb,
615 struct ieee80211_supported_band *sband)
616{
617 u8 *pos;
618 const struct ieee80211_sta_he_cap *he_cap = NULL;
619 u8 he_cap_size;
620
621 he_cap = ieee80211_get_he_sta_cap(sband);
622 if (!he_cap)
623 return;
624
625 /*
626 * TODO: the 1 added is because this temporarily is under the EXTENSION
627 * IE. Get rid of it when it moves.
628 */
629 he_cap_size =
630 2 + 1 + sizeof(he_cap->he_cap_elem) +
631 ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem) +
632 ieee80211_he_ppe_size(he_cap->ppe_thres[0],
633 he_cap->he_cap_elem.phy_cap_info);
634 pos = skb_put(skb, he_cap_size);
635 ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size);
636}
637
585static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 638static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
586{ 639{
587 struct ieee80211_local *local = sdata->local; 640 struct ieee80211_local *local = sdata->local;
@@ -643,6 +696,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
643 2 + 2 * sband->n_channels + /* supported channels */ 696 2 + 2 * sband->n_channels + /* supported channels */
644 2 + sizeof(struct ieee80211_ht_cap) + /* HT */ 697 2 + sizeof(struct ieee80211_ht_cap) + /* HT */
645 2 + sizeof(struct ieee80211_vht_cap) + /* VHT */ 698 2 + sizeof(struct ieee80211_vht_cap) + /* VHT */
699 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + /* HE */
700 sizeof(struct ieee80211_he_mcs_nss_supp) +
701 IEEE80211_HE_PPE_THRES_MAX_LEN +
646 assoc_data->ie_len + /* extra IEs */ 702 assoc_data->ie_len + /* extra IEs */
647 (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + 703 (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) +
648 9, /* WMM */ 704 9, /* WMM */
@@ -827,11 +883,41 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
827 offset = noffset; 883 offset = noffset;
828 } 884 }
829 885
886 /* if present, add any custom IEs that go before HE */
887 if (assoc_data->ie_len) {
888 static const u8 before_he[] = {
889 /*
890 * no need to list the ones split off before VHT
891 * or generated here
892 */
893 WLAN_EID_OPMODE_NOTIF,
894 WLAN_EID_EXTENSION, WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE,
895 /* 11ai elements */
896 WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_SESSION,
897 WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_PUBLIC_KEY,
898 WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_KEY_CONFIRM,
899 WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_HLP_CONTAINER,
900 WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN,
901 /* TODO: add 11ah/11aj/11ak elements */
902 };
903
904 /* RIC already taken above, so no need to handle here anymore */
905 noffset = ieee80211_ie_split(assoc_data->ie, assoc_data->ie_len,
906 before_he, ARRAY_SIZE(before_he),
907 offset);
908 pos = skb_put(skb, noffset - offset);
909 memcpy(pos, assoc_data->ie + offset, noffset - offset);
910 offset = noffset;
911 }
912
830 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) 913 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
831 ieee80211_add_vht_ie(sdata, skb, sband, 914 ieee80211_add_vht_ie(sdata, skb, sband,
832 &assoc_data->ap_vht_cap); 915 &assoc_data->ap_vht_cap);
833 916
834 /* if present, add any custom non-vendor IEs that go after HT */ 917 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
918 ieee80211_add_he_ie(sdata, skb, sband);
919
920 /* if present, add any custom non-vendor IEs that go after HE */
835 if (assoc_data->ie_len) { 921 if (assoc_data->ie_len) {
836 noffset = ieee80211_ie_split_vendor(assoc_data->ie, 922 noffset = ieee80211_ie_split_vendor(assoc_data->ie,
837 assoc_data->ie_len, 923 assoc_data->ie_len,
@@ -898,6 +984,11 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local,
898 struct ieee80211_hdr_3addr *nullfunc; 984 struct ieee80211_hdr_3addr *nullfunc;
899 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; 985 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
900 986
987 /* Don't send NDPs when STA is connected HE */
988 if (sdata->vif.type == NL80211_IFTYPE_STATION &&
989 !(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
990 return;
991
901 skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, 992 skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif,
902 !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); 993 !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP));
903 if (!skb) 994 if (!skb)
@@ -929,6 +1020,10 @@ static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local,
929 if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) 1020 if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
930 return; 1021 return;
931 1022
1023 /* Don't send NDPs when connected HE */
1024 if (!(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE))
1025 return;
1026
932 skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30); 1027 skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30);
933 if (!skb) 1028 if (!skb)
934 return; 1029 return;
@@ -1700,9 +1795,11 @@ static void ieee80211_sta_handle_tspec_ac_params_wk(struct work_struct *work)
1700} 1795}
1701 1796
1702/* MLME */ 1797/* MLME */
1703static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, 1798static bool
1704 struct ieee80211_sub_if_data *sdata, 1799ieee80211_sta_wmm_params(struct ieee80211_local *local,
1705 const u8 *wmm_param, size_t wmm_param_len) 1800 struct ieee80211_sub_if_data *sdata,
1801 const u8 *wmm_param, size_t wmm_param_len,
1802 const struct ieee80211_mu_edca_param_set *mu_edca)
1706{ 1803{
1707 struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS]; 1804 struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS];
1708 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; 1805 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -1749,6 +1846,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
1749 sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ 1846 sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */
1750 if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) 1847 if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK)
1751 uapsd = true; 1848 uapsd = true;
1849 params[ac].mu_edca = !!mu_edca;
1850 if (mu_edca)
1851 params[ac].mu_edca_param_rec = mu_edca->ac_bk;
1752 break; 1852 break;
1753 case 2: /* AC_VI */ 1853 case 2: /* AC_VI */
1754 ac = IEEE80211_AC_VI; 1854 ac = IEEE80211_AC_VI;
@@ -1756,6 +1856,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
1756 sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ 1856 sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */
1757 if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) 1857 if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI)
1758 uapsd = true; 1858 uapsd = true;
1859 params[ac].mu_edca = !!mu_edca;
1860 if (mu_edca)
1861 params[ac].mu_edca_param_rec = mu_edca->ac_vi;
1759 break; 1862 break;
1760 case 3: /* AC_VO */ 1863 case 3: /* AC_VO */
1761 ac = IEEE80211_AC_VO; 1864 ac = IEEE80211_AC_VO;
@@ -1763,6 +1866,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
1763 sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ 1866 sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */
1764 if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) 1867 if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO)
1765 uapsd = true; 1868 uapsd = true;
1869 params[ac].mu_edca = !!mu_edca;
1870 if (mu_edca)
1871 params[ac].mu_edca_param_rec = mu_edca->ac_vo;
1766 break; 1872 break;
1767 case 0: /* AC_BE */ 1873 case 0: /* AC_BE */
1768 default: 1874 default:
@@ -1771,6 +1877,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
1771 sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ 1877 sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */
1772 if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) 1878 if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE)
1773 uapsd = true; 1879 uapsd = true;
1880 params[ac].mu_edca = !!mu_edca;
1881 if (mu_edca)
1882 params[ac].mu_edca_param_rec = mu_edca->ac_be;
1774 break; 1883 break;
1775 } 1884 }
1776 1885
@@ -2219,6 +2328,20 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
2219 ieee80211_sta_reset_conn_monitor(sdata); 2328 ieee80211_sta_reset_conn_monitor(sdata);
2220} 2329}
2221 2330
2331static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata,
2332 const u8 *src, const u8 *dst,
2333 const u8 *ssid, size_t ssid_len,
2334 struct ieee80211_channel *channel)
2335{
2336 struct sk_buff *skb;
2337
2338 skb = ieee80211_build_probe_req(sdata, src, dst, (u32)-1, channel,
2339 ssid, ssid_len, NULL, 0,
2340 IEEE80211_PROBE_FLAG_DIRECTED);
2341 if (skb)
2342 ieee80211_tx_skb(sdata, skb);
2343}
2344
2222static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) 2345static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
2223{ 2346{
2224 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; 2347 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -2265,10 +2388,9 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
2265 else 2388 else
2266 ssid_len = ssid[1]; 2389 ssid_len = ssid[1];
2267 2390
2268 ieee80211_send_probe_req(sdata, sdata->vif.addr, dst, 2391 ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst,
2269 ssid + 2, ssid_len, NULL, 2392 ssid + 2, ssid_len,
2270 0, (u32) -1, true, 0, 2393 ifmgd->associated->channel);
2271 ifmgd->associated->channel, false);
2272 rcu_read_unlock(); 2394 rcu_read_unlock();
2273 } 2395 }
2274 2396
@@ -2370,7 +2492,7 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
2370 skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid, 2492 skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid,
2371 (u32) -1, cbss->channel, 2493 (u32) -1, cbss->channel,
2372 ssid + 2, ssid_len, 2494 ssid + 2, ssid_len,
2373 NULL, 0, true); 2495 NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED);
2374 rcu_read_unlock(); 2496 rcu_read_unlock();
2375 2497
2376 return skb; 2498 return skb;
@@ -3008,6 +3130,25 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
3008 goto out; 3130 goto out;
3009 } 3131 }
3010 3132
3133 /*
3134 * If AP doesn't support HT, or it doesn't have HE mandatory IEs, mark
3135 * HE as disabled. If on the 5GHz band, make sure it supports VHT.
3136 */
3137 if (ifmgd->flags & IEEE80211_STA_DISABLE_HT ||
3138 (sband->band == NL80211_BAND_5GHZ &&
3139 ifmgd->flags & IEEE80211_STA_DISABLE_VHT) ||
3140 (!elems.he_cap && !elems.he_operation))
3141 ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
3142
3143 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
3144 (!elems.he_cap || !elems.he_operation)) {
3145 mutex_unlock(&sdata->local->sta_mtx);
3146 sdata_info(sdata,
3147 "HE AP is missing HE capability/operation\n");
3148 ret = false;
3149 goto out;
3150 }
3151
3011 /* Set up internal HT/VHT capabilities */ 3152 /* Set up internal HT/VHT capabilities */
3012 if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) 3153 if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
3013 ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, 3154 ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
@@ -3017,6 +3158,48 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
3017 ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, 3158 ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
3018 elems.vht_cap_elem, sta); 3159 elems.vht_cap_elem, sta);
3019 3160
3161 if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
3162 elems.he_cap) {
3163 ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
3164 elems.he_cap,
3165 elems.he_cap_len,
3166 sta);
3167
3168 bss_conf->he_support = sta->sta.he_cap.has_he;
3169 } else {
3170 bss_conf->he_support = false;
3171 }
3172
3173 if (bss_conf->he_support) {
3174 u32 he_oper_params =
3175 le32_to_cpu(elems.he_operation->he_oper_params);
3176
3177 bss_conf->bss_color = he_oper_params &
3178 IEEE80211_HE_OPERATION_BSS_COLOR_MASK;
3179 bss_conf->htc_trig_based_pkt_ext =
3180 (he_oper_params &
3181 IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK) <<
3182 IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET;
3183 bss_conf->frame_time_rts_th =
3184 (he_oper_params &
3185 IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK) <<
3186 IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET;
3187
3188 bss_conf->multi_sta_back_32bit =
3189 sta->sta.he_cap.he_cap_elem.mac_cap_info[2] &
3190 IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP;
3191
3192 bss_conf->ack_enabled =
3193 sta->sta.he_cap.he_cap_elem.mac_cap_info[2] &
3194 IEEE80211_HE_MAC_CAP2_ACK_EN;
3195
3196 bss_conf->uora_exists = !!elems.uora_element;
3197 if (elems.uora_element)
3198 bss_conf->uora_ocw_range = elems.uora_element[0];
3199
3200 /* TODO: OPEN: what happens if BSS color disable is set? */
3201 }
3202
3020 /* 3203 /*
3021 * Some APs, e.g. Netgear WNDR3700, report invalid HT operation data 3204 * Some APs, e.g. Netgear WNDR3700, report invalid HT operation data
3022 * in their association response, so ignore that data for our own 3205 * in their association response, so ignore that data for our own
@@ -3076,7 +3259,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
3076 if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) { 3259 if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) {
3077 ieee80211_set_wmm_default(sdata, false, false); 3260 ieee80211_set_wmm_default(sdata, false, false);
3078 } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, 3261 } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
3079 elems.wmm_param_len)) { 3262 elems.wmm_param_len,
3263 elems.mu_edca_param_set)) {
3080 /* still enable QoS since we might have HT/VHT */ 3264 /* still enable QoS since we might have HT/VHT */
3081 ieee80211_set_wmm_default(sdata, false, true); 3265 ieee80211_set_wmm_default(sdata, false, true);
3082 /* set the disable-WMM flag in this case to disable 3266 /* set the disable-WMM flag in this case to disable
@@ -3590,7 +3774,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
3590 3774
3591 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) && 3775 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) &&
3592 ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, 3776 ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
3593 elems.wmm_param_len)) 3777 elems.wmm_param_len,
3778 elems.mu_edca_param_set))
3594 changed |= BSS_CHANGED_QOS; 3779 changed |= BSS_CHANGED_QOS;
3595 3780
3596 /* 3781 /*
@@ -3629,7 +3814,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
3629 3814
3630 if (ieee80211_config_bw(sdata, sta, 3815 if (ieee80211_config_bw(sdata, sta,
3631 elems.ht_cap_elem, elems.ht_operation, 3816 elems.ht_cap_elem, elems.ht_operation,
3632 elems.vht_operation, bssid, &changed)) { 3817 elems.vht_operation, elems.he_operation,
3818 bssid, &changed)) {
3633 mutex_unlock(&local->sta_mtx); 3819 mutex_unlock(&local->sta_mtx);
3634 sdata_info(sdata, 3820 sdata_info(sdata,
3635 "failed to follow AP %pM bandwidth change, disconnect\n", 3821 "failed to follow AP %pM bandwidth change, disconnect\n",
@@ -4266,6 +4452,68 @@ static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata,
4266 return chains; 4452 return chains;
4267} 4453}
4268 4454
4455static bool
4456ieee80211_verify_sta_he_mcs_support(struct ieee80211_supported_band *sband,
4457 const struct ieee80211_he_operation *he_op)
4458{
4459 const struct ieee80211_sta_he_cap *sta_he_cap =
4460 ieee80211_get_he_sta_cap(sband);
4461 u16 ap_min_req_set;
4462 int i;
4463
4464 if (!sta_he_cap || !he_op)
4465 return false;
4466
4467 ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set);
4468
4469 /* Need to go over for 80MHz, 160MHz and for 80+80 */
4470 for (i = 0; i < 3; i++) {
4471 const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp =
4472 &sta_he_cap->he_mcs_nss_supp;
4473 u16 sta_mcs_map_rx =
4474 le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i]);
4475 u16 sta_mcs_map_tx =
4476 le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i + 1]);
4477 u8 nss;
4478 bool verified = true;
4479
4480 /*
4481 * For each band there is a maximum of 8 spatial streams
4482 * possible. Each of the sta_mcs_map_* is a 16-bit struct built
4483 * of 2 bits per NSS (1-8), with the values defined in enum
4484 * ieee80211_he_mcs_support. Need to make sure STA TX and RX
4485 * capabilities aren't less than the AP's minimum requirements
4486 * for this HE BSS per SS.
4487 * It is enough to find one such band that meets the reqs.
4488 */
4489 for (nss = 8; nss > 0; nss--) {
4490 u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3;
4491 u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3;
4492 u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3;
4493
4494 if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED)
4495 continue;
4496
4497 /*
4498 * Make sure the HE AP doesn't require MCSs that aren't
4499 * supported by the client
4500 */
4501 if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
4502 sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
4503 (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) {
4504 verified = false;
4505 break;
4506 }
4507 }
4508
4509 if (verified)
4510 return true;
4511 }
4512
4513 /* If here, STA doesn't meet AP's HE min requirements */
4514 return false;
4515}
4516
4269static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, 4517static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
4270 struct cfg80211_bss *cbss) 4518 struct cfg80211_bss *cbss)
4271{ 4519{
@@ -4274,6 +4522,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
4274 const struct ieee80211_ht_cap *ht_cap = NULL; 4522 const struct ieee80211_ht_cap *ht_cap = NULL;
4275 const struct ieee80211_ht_operation *ht_oper = NULL; 4523 const struct ieee80211_ht_operation *ht_oper = NULL;
4276 const struct ieee80211_vht_operation *vht_oper = NULL; 4524 const struct ieee80211_vht_operation *vht_oper = NULL;
4525 const struct ieee80211_he_operation *he_oper = NULL;
4277 struct ieee80211_supported_band *sband; 4526 struct ieee80211_supported_band *sband;
4278 struct cfg80211_chan_def chandef; 4527 struct cfg80211_chan_def chandef;
4279 int ret; 4528 int ret;
@@ -4329,6 +4578,24 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
4329 } 4578 }
4330 } 4579 }
4331 4580
4581 if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
4582 ieee80211_get_he_sta_cap(sband)) {
4583 const struct cfg80211_bss_ies *ies;
4584 const u8 *he_oper_ie;
4585
4586 ies = rcu_dereference(cbss->ies);
4587 he_oper_ie = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION,
4588 ies->data, ies->len);
4589 if (he_oper_ie &&
4590 he_oper_ie[1] == ieee80211_he_oper_size(&he_oper_ie[3]))
4591 he_oper = (void *)(he_oper_ie + 3);
4592 else
4593 he_oper = NULL;
4594
4595 if (!ieee80211_verify_sta_he_mcs_support(sband, he_oper))
4596 ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
4597 }
4598
4332 /* Allow VHT if at least one channel on the sband supports 80 MHz */ 4599 /* Allow VHT if at least one channel on the sband supports 80 MHz */
4333 have_80mhz = false; 4600 have_80mhz = false;
4334 for (i = 0; i < sband->n_channels; i++) { 4601 for (i = 0; i < sband->n_channels; i++) {
@@ -4345,7 +4612,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
4345 4612
4346 ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, 4613 ifmgd->flags |= ieee80211_determine_chantype(sdata, sband,
4347 cbss->channel, 4614 cbss->channel,
4348 ht_oper, vht_oper, 4615 ht_oper, vht_oper, he_oper,
4349 &chandef, false); 4616 &chandef, false);
4350 4617
4351 sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss), 4618 sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss),
@@ -4751,8 +5018,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
4751 req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) { 5018 req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) {
4752 ifmgd->flags |= IEEE80211_STA_DISABLE_HT; 5019 ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
4753 ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; 5020 ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
5021 ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
4754 netdev_info(sdata->dev, 5022 netdev_info(sdata->dev,
4755 "disabling HT/VHT due to WEP/TKIP use\n"); 5023 "disabling HE/HT/VHT due to WEP/TKIP use\n");
4756 } 5024 }
4757 } 5025 }
4758 5026
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index f1d40b6645ff..8ef4153cd299 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -262,7 +262,7 @@ static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc,
262 if (roc->mgmt_tx_cookie) { 262 if (roc->mgmt_tx_cookie) {
263 if (!WARN_ON(!roc->frame)) { 263 if (!WARN_ON(!roc->frame)) {
264 ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7, 264 ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7,
265 roc->chan->band); 265 roc->chan->band, 0);
266 roc->frame = NULL; 266 roc->frame = NULL;
267 } 267 }
268 } else { 268 } else {
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 76048b53c5b2..07fb219327d6 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -751,4 +751,3 @@ rc80211_minstrel_exit(void)
751{ 751{
752 ieee80211_rate_control_unregister(&mac80211_minstrel); 752 ieee80211_rate_control_unregister(&mac80211_minstrel);
753} 753}
754
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 0a38cc1cbebc..64742f2765c4 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -175,6 +175,20 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
175 len += 12; 175 len += 12;
176 } 176 }
177 177
178 if (status->encoding == RX_ENC_HE &&
179 status->flag & RX_FLAG_RADIOTAP_HE) {
180 len = ALIGN(len, 2);
181 len += 12;
182 BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he) != 12);
183 }
184
185 if (status->encoding == RX_ENC_HE &&
186 status->flag & RX_FLAG_RADIOTAP_HE_MU) {
187 len = ALIGN(len, 2);
188 len += 12;
189 BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12);
190 }
191
178 if (status->chains) { 192 if (status->chains) {
179 /* antenna and antenna signal fields */ 193 /* antenna and antenna signal fields */
180 len += 2 * hweight8(status->chains); 194 len += 2 * hweight8(status->chains);
@@ -263,6 +277,19 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
263 int mpdulen, chain; 277 int mpdulen, chain;
264 unsigned long chains = status->chains; 278 unsigned long chains = status->chains;
265 struct ieee80211_vendor_radiotap rtap = {}; 279 struct ieee80211_vendor_radiotap rtap = {};
280 struct ieee80211_radiotap_he he = {};
281 struct ieee80211_radiotap_he_mu he_mu = {};
282
283 if (status->flag & RX_FLAG_RADIOTAP_HE) {
284 he = *(struct ieee80211_radiotap_he *)skb->data;
285 skb_pull(skb, sizeof(he));
286 WARN_ON_ONCE(status->encoding != RX_ENC_HE);
287 }
288
289 if (status->flag & RX_FLAG_RADIOTAP_HE_MU) {
290 he_mu = *(struct ieee80211_radiotap_he_mu *)skb->data;
291 skb_pull(skb, sizeof(he_mu));
292 }
266 293
267 if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) { 294 if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
268 rtap = *(struct ieee80211_vendor_radiotap *)skb->data; 295 rtap = *(struct ieee80211_vendor_radiotap *)skb->data;
@@ -520,6 +547,89 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
520 *pos++ = flags; 547 *pos++ = flags;
521 } 548 }
522 549
550 if (status->encoding == RX_ENC_HE &&
551 status->flag & RX_FLAG_RADIOTAP_HE) {
552#define HE_PREP(f, val) cpu_to_le16(FIELD_PREP(IEEE80211_RADIOTAP_HE_##f, val))
553
554 if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) {
555 he.data6 |= HE_PREP(DATA6_NSTS,
556 FIELD_GET(RX_ENC_FLAG_STBC_MASK,
557 status->enc_flags));
558 he.data3 |= HE_PREP(DATA3_STBC, 1);
559 } else {
560 he.data6 |= HE_PREP(DATA6_NSTS, status->nss);
561 }
562
563#define CHECK_GI(s) \
564 BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \
565 (int)NL80211_RATE_INFO_HE_GI_##s)
566
567 CHECK_GI(0_8);
568 CHECK_GI(1_6);
569 CHECK_GI(3_2);
570
571 he.data3 |= HE_PREP(DATA3_DATA_MCS, status->rate_idx);
572 he.data3 |= HE_PREP(DATA3_DATA_DCM, status->he_dcm);
573 he.data3 |= HE_PREP(DATA3_CODING,
574 !!(status->enc_flags & RX_ENC_FLAG_LDPC));
575
576 he.data5 |= HE_PREP(DATA5_GI, status->he_gi);
577
578 switch (status->bw) {
579 case RATE_INFO_BW_20:
580 he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
581 IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ);
582 break;
583 case RATE_INFO_BW_40:
584 he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
585 IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ);
586 break;
587 case RATE_INFO_BW_80:
588 he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
589 IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ);
590 break;
591 case RATE_INFO_BW_160:
592 he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
593 IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ);
594 break;
595 case RATE_INFO_BW_HE_RU:
596#define CHECK_RU_ALLOC(s) \
597 BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \
598 NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4)
599
600 CHECK_RU_ALLOC(26);
601 CHECK_RU_ALLOC(52);
602 CHECK_RU_ALLOC(106);
603 CHECK_RU_ALLOC(242);
604 CHECK_RU_ALLOC(484);
605 CHECK_RU_ALLOC(996);
606 CHECK_RU_ALLOC(2x996);
607
608 he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
609 status->he_ru + 4);
610 break;
611 default:
612 WARN_ONCE(1, "Invalid SU BW %d\n", status->bw);
613 }
614
615 /* ensure 2 byte alignment */
616 while ((pos - (u8 *)rthdr) & 1)
617 pos++;
618 rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE);
619 memcpy(pos, &he, sizeof(he));
620 pos += sizeof(he);
621 }
622
623 if (status->encoding == RX_ENC_HE &&
624 status->flag & RX_FLAG_RADIOTAP_HE_MU) {
625 /* ensure 2 byte alignment */
626 while ((pos - (u8 *)rthdr) & 1)
627 pos++;
628 rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE_MU);
629 memcpy(pos, &he_mu, sizeof(he_mu));
630 pos += sizeof(he_mu);
631 }
632
523 for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { 633 for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
524 *pos++ = status->chain_signal[chain]; 634 *pos++ = status->chain_signal[chain];
525 *pos++ = chain; 635 *pos++ = chain;
@@ -613,6 +723,12 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
613 rcu_dereference(local->monitor_sdata); 723 rcu_dereference(local->monitor_sdata);
614 bool only_monitor = false; 724 bool only_monitor = false;
615 725
726 if (status->flag & RX_FLAG_RADIOTAP_HE)
727 rtap_space += sizeof(struct ieee80211_radiotap_he);
728
729 if (status->flag & RX_FLAG_RADIOTAP_HE_MU)
730 rtap_space += sizeof(struct ieee80211_radiotap_he_mu);
731
616 if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { 732 if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
617 struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; 733 struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data;
618 734
@@ -2254,11 +2370,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb,
2254 sdata->control_port_over_nl80211)) { 2370 sdata->control_port_over_nl80211)) {
2255 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); 2371 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
2256 bool noencrypt = status->flag & RX_FLAG_DECRYPTED; 2372 bool noencrypt = status->flag & RX_FLAG_DECRYPTED;
2257 struct ethhdr *ehdr = eth_hdr(skb);
2258 2373
2259 cfg80211_rx_control_port(dev, skb->data, skb->len, 2374 cfg80211_rx_control_port(dev, skb, noencrypt);
2260 ehdr->h_source,
2261 be16_to_cpu(skb->protocol), noencrypt);
2262 dev_kfree_skb(skb); 2375 dev_kfree_skb(skb);
2263 } else { 2376 } else {
2264 /* deliver to local stack */ 2377 /* deliver to local stack */
@@ -3241,7 +3354,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
3241 } 3354 }
3242 3355
3243 __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7, 3356 __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7,
3244 status->band); 3357 status->band, 0);
3245 } 3358 }
3246 dev_kfree_skb(rx->skb); 3359 dev_kfree_skb(rx->skb);
3247 return RX_QUEUED; 3360 return RX_QUEUED;
@@ -3386,8 +3499,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
3386 status = IEEE80211_SKB_RXCB((rx->skb)); 3499 status = IEEE80211_SKB_RXCB((rx->skb));
3387 3500
3388 sband = rx->local->hw.wiphy->bands[status->band]; 3501 sband = rx->local->hw.wiphy->bands[status->band];
3389 if (!(status->encoding == RX_ENC_HT) && 3502 if (status->encoding == RX_ENC_LEGACY)
3390 !(status->encoding == RX_ENC_VHT))
3391 rate = &sband->bitrates[status->rate_idx]; 3503 rate = &sband->bitrates[status->rate_idx];
3392 3504
3393 ieee80211_rx_cooked_monitor(rx, rate); 3505 ieee80211_rx_cooked_monitor(rx, rate);
@@ -4386,6 +4498,14 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
4386 status->rate_idx, status->nss)) 4498 status->rate_idx, status->nss))
4387 goto drop; 4499 goto drop;
4388 break; 4500 break;
4501 case RX_ENC_HE:
4502 if (WARN_ONCE(status->rate_idx > 11 ||
4503 !status->nss ||
4504 status->nss > 8,
4505 "Rate marked as an HE rate but data is invalid: MCS: %d, NSS: %d\n",
4506 status->rate_idx, status->nss))
4507 goto drop;
4508 break;
4389 default: 4509 default:
4390 WARN_ON_ONCE(1); 4510 WARN_ON_ONCE(1);
4391 /* fall through */ 4511 /* fall through */
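The HE radiotap block above is assembled with HE_PREP(), which wraps FIELD_PREP() and cpu_to_le16() to shift a value into the masked position of a little-endian 16-bit field. A minimal user-space sketch of the same mask/shift packing; the mask values are made-up stand-ins, not the real IEEE80211_RADIOTAP_HE_* definitions, and the byte-order conversion is left out:

#include <stdint.h>
#include <stdio.h>

/* Illustrative masks only; the real IEEE80211_RADIOTAP_HE_* masks differ. */
#define DATA3_DATA_MCS 0x000f   /* bits 3:0 */
#define DATA3_DATA_DCM 0x0010   /* bit 4    */
#define DATA3_CODING   0x0020   /* bit 5    */

/* Same idea as FIELD_PREP(): multiply by the mask's lowest set bit to shift. */
static uint16_t field_prep(uint16_t mask, uint16_t val)
{
	return (uint16_t)((val * (mask & -mask)) & mask);
}

int main(void)
{
	uint16_t data3 = 0;

	data3 |= field_prep(DATA3_DATA_MCS, 7);  /* MCS 7 */
	data3 |= field_prep(DATA3_DATA_DCM, 1);  /* DCM in use */
	data3 |= field_prep(DATA3_CODING, 1);    /* LDPC coding */

	printf("data3 = 0x%04x\n", data3);       /* prints 0x0037 */
	return 0;
}

FIELD_PREP() additionally performs compile-time sanity checks on constant values; the sketch skips that.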
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 2e917a6d239d..5d2a11777718 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -20,6 +20,7 @@
20#include <net/sch_generic.h> 20#include <net/sch_generic.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/export.h> 22#include <linux/export.h>
23#include <linux/random.h>
23#include <net/mac80211.h> 24#include <net/mac80211.h>
24 25
25#include "ieee80211_i.h" 26#include "ieee80211_i.h"
@@ -293,6 +294,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
293 struct cfg80211_chan_def chandef; 294 struct cfg80211_chan_def chandef;
294 u8 bands_used = 0; 295 u8 bands_used = 0;
295 int i, ielen, n_chans; 296 int i, ielen, n_chans;
297 u32 flags = 0;
296 298
297 req = rcu_dereference_protected(local->scan_req, 299 req = rcu_dereference_protected(local->scan_req,
298 lockdep_is_held(&local->mtx)); 300 lockdep_is_held(&local->mtx));
@@ -331,12 +333,16 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
331 local->hw_scan_req->req.n_channels = n_chans; 333 local->hw_scan_req->req.n_channels = n_chans;
332 ieee80211_prepare_scan_chandef(&chandef, req->scan_width); 334 ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
333 335
336 if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT)
337 flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT;
338
334 ielen = ieee80211_build_preq_ies(local, 339 ielen = ieee80211_build_preq_ies(local,
335 (u8 *)local->hw_scan_req->req.ie, 340 (u8 *)local->hw_scan_req->req.ie,
336 local->hw_scan_ies_bufsize, 341 local->hw_scan_ies_bufsize,
337 &local->hw_scan_req->ies, 342 &local->hw_scan_req->ies,
338 req->ie, req->ie_len, 343 req->ie, req->ie_len,
339 bands_used, req->rates, &chandef); 344 bands_used, req->rates, &chandef,
345 flags);
340 local->hw_scan_req->req.ie_len = ielen; 346 local->hw_scan_req->req.ie_len = ielen;
341 local->hw_scan_req->req.no_cck = req->no_cck; 347 local->hw_scan_req->req.no_cck = req->no_cck;
342 ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); 348 ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr);
@@ -528,6 +534,35 @@ void ieee80211_run_deferred_scan(struct ieee80211_local *local)
528 round_jiffies_relative(0)); 534 round_jiffies_relative(0));
529} 535}
530 536
537static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata,
538 const u8 *src, const u8 *dst,
539 const u8 *ssid, size_t ssid_len,
540 const u8 *ie, size_t ie_len,
541 u32 ratemask, u32 flags, u32 tx_flags,
542 struct ieee80211_channel *channel)
543{
544 struct sk_buff *skb;
545 u32 txdata_flags = 0;
546
547 skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel,
548 ssid, ssid_len,
549 ie, ie_len, flags);
550
551 if (skb) {
552 if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) {
553 struct ieee80211_hdr *hdr = (void *)skb->data;
554 u16 sn = get_random_u32();
555
556 txdata_flags |= IEEE80211_TX_NO_SEQNO;
557 hdr->seq_ctrl =
558 cpu_to_le16(IEEE80211_SN_TO_SEQ(sn));
559 }
560 IEEE80211_SKB_CB(skb)->flags |= tx_flags;
561 ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band,
562 txdata_flags);
563 }
564}
565
531static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, 566static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
532 unsigned long *next_delay) 567 unsigned long *next_delay)
533{ 568{
@@ -535,7 +570,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
535 struct ieee80211_sub_if_data *sdata; 570 struct ieee80211_sub_if_data *sdata;
536 struct cfg80211_scan_request *scan_req; 571 struct cfg80211_scan_request *scan_req;
537 enum nl80211_band band = local->hw.conf.chandef.chan->band; 572 enum nl80211_band band = local->hw.conf.chandef.chan->band;
538 u32 tx_flags; 573 u32 flags = 0, tx_flags;
539 574
540 scan_req = rcu_dereference_protected(local->scan_req, 575 scan_req = rcu_dereference_protected(local->scan_req,
541 lockdep_is_held(&local->mtx)); 576 lockdep_is_held(&local->mtx));
@@ -543,17 +578,21 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
543 tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK; 578 tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK;
544 if (scan_req->no_cck) 579 if (scan_req->no_cck)
545 tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE; 580 tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE;
581 if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT)
582 flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT;
583 if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN)
584 flags |= IEEE80211_PROBE_FLAG_RANDOM_SN;
546 585
547 sdata = rcu_dereference_protected(local->scan_sdata, 586 sdata = rcu_dereference_protected(local->scan_sdata,
548 lockdep_is_held(&local->mtx)); 587 lockdep_is_held(&local->mtx));
549 588
550 for (i = 0; i < scan_req->n_ssids; i++) 589 for (i = 0; i < scan_req->n_ssids; i++)
551 ieee80211_send_probe_req( 590 ieee80211_send_scan_probe_req(
552 sdata, local->scan_addr, scan_req->bssid, 591 sdata, local->scan_addr, scan_req->bssid,
553 scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, 592 scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len,
554 scan_req->ie, scan_req->ie_len, 593 scan_req->ie, scan_req->ie_len,
555 scan_req->rates[band], false, 594 scan_req->rates[band], flags,
556 tx_flags, local->hw.conf.chandef.chan, true); 595 tx_flags, local->hw.conf.chandef.chan);
557 596
558 /* 597 /*
559 * After sending probe requests, wait for probe responses 598 * After sending probe requests, wait for probe responses
@@ -1141,6 +1180,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
1141 u32 rate_masks[NUM_NL80211_BANDS] = {}; 1180 u32 rate_masks[NUM_NL80211_BANDS] = {};
1142 u8 bands_used = 0; 1181 u8 bands_used = 0;
1143 u8 *ie; 1182 u8 *ie;
1183 u32 flags = 0;
1144 1184
1145 iebufsz = local->scan_ies_len + req->ie_len; 1185 iebufsz = local->scan_ies_len + req->ie_len;
1146 1186
@@ -1157,6 +1197,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
1157 } 1197 }
1158 } 1198 }
1159 1199
1200 if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT)
1201 flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT;
1202
1160 ie = kcalloc(iebufsz, num_bands, GFP_KERNEL); 1203 ie = kcalloc(iebufsz, num_bands, GFP_KERNEL);
1161 if (!ie) { 1204 if (!ie) {
1162 ret = -ENOMEM; 1205 ret = -ENOMEM;
@@ -1167,7 +1210,8 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
1167 1210
1168 ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, 1211 ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
1169 &sched_scan_ies, req->ie, 1212 &sched_scan_ies, req->ie,
1170 req->ie_len, bands_used, rate_masks, &chandef); 1213 req->ie_len, bands_used, rate_masks, &chandef,
1214 flags);
1171 1215
1172 ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); 1216 ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies);
1173 if (ret == 0) { 1217 if (ret == 0) {
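When NL80211_SCAN_FLAG_RANDOM_SN is set, ieee80211_send_scan_probe_req() above pre-fills the 802.11 sequence-control field with a random sequence number via IEEE80211_SN_TO_SEQ() (the sequence number occupies bits 15:4, the fragment number bits 3:0) and sets IEEE80211_TX_NO_SEQNO so the normal TX path does not overwrite it. A stand-alone sketch of the packing step, using rand() in place of get_random_u32():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SN_MASK  0x0fff   /* 802.11 sequence numbers are 12 bits */
#define SCTL_SEQ 0xfff0   /* seq_ctrl bits 15:4 carry the SN */

/* Equivalent of IEEE80211_SN_TO_SEQ(): put a 12-bit SN into bits 15:4. */
static uint16_t sn_to_seq_ctrl(uint16_t sn)
{
	return (uint16_t)((sn << 4) & SCTL_SEQ);
}

int main(void)
{
	srand((unsigned int)time(NULL));

	uint16_t sn = (uint16_t)rand() & SN_MASK;  /* random 12-bit SN */
	uint16_t seq_ctrl = sn_to_seq_ctrl(sn);

	printf("sn=%u seq_ctrl=0x%04x\n", sn, seq_ctrl);
	return 0;
}

In the driver path the result is additionally converted with cpu_to_le16() before being written into the frame header.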
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 6428f1ac37b6..f34202242d24 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1323,6 +1323,11 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid,
1323 struct ieee80211_tx_info *info; 1323 struct ieee80211_tx_info *info;
1324 struct ieee80211_chanctx_conf *chanctx_conf; 1324 struct ieee80211_chanctx_conf *chanctx_conf;
1325 1325
 1326 /* Don't send NDPs when the STA is connected in HE */
1327 if (sdata->vif.type == NL80211_IFTYPE_STATION &&
1328 !(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE))
1329 return;
1330
1326 if (qos) { 1331 if (qos) {
1327 fc = cpu_to_le16(IEEE80211_FTYPE_DATA | 1332 fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
1328 IEEE80211_STYPE_QOS_NULLFUNC | 1333 IEEE80211_STYPE_QOS_NULLFUNC |
@@ -1391,7 +1396,7 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid,
1391 } 1396 }
1392 1397
1393 info->band = chanctx_conf->def.chan->band; 1398 info->band = chanctx_conf->def.chan->band;
1394 ieee80211_xmit(sdata, sta, skb); 1399 ieee80211_xmit(sdata, sta, skb, 0);
1395 rcu_read_unlock(); 1400 rcu_read_unlock();
1396} 1401}
1397 1402
@@ -1968,7 +1973,7 @@ sta_get_last_rx_stats(struct sta_info *sta)
1968 return stats; 1973 return stats;
1969} 1974}
1970 1975
1971static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, 1976static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate,
1972 struct rate_info *rinfo) 1977 struct rate_info *rinfo)
1973{ 1978{
1974 rinfo->bw = STA_STATS_GET(BW, rate); 1979 rinfo->bw = STA_STATS_GET(BW, rate);
@@ -2005,6 +2010,14 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
2005 rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); 2010 rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
2006 break; 2011 break;
2007 } 2012 }
2013 case STA_STATS_RATE_TYPE_HE:
2014 rinfo->flags = RATE_INFO_FLAGS_HE_MCS;
2015 rinfo->mcs = STA_STATS_GET(HE_MCS, rate);
2016 rinfo->nss = STA_STATS_GET(HE_NSS, rate);
2017 rinfo->he_gi = STA_STATS_GET(HE_GI, rate);
2018 rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate);
2019 rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate);
2020 break;
2008 } 2021 }
2009} 2022}
2010 2023
@@ -2101,38 +2114,38 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
2101 2114
2102 drv_sta_statistics(local, sdata, &sta->sta, sinfo); 2115 drv_sta_statistics(local, sdata, &sta->sta, sinfo);
2103 2116
2104 sinfo->filled |= BIT(NL80211_STA_INFO_INACTIVE_TIME) | 2117 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) |
2105 BIT(NL80211_STA_INFO_STA_FLAGS) | 2118 BIT_ULL(NL80211_STA_INFO_STA_FLAGS) |
2106 BIT(NL80211_STA_INFO_BSS_PARAM) | 2119 BIT_ULL(NL80211_STA_INFO_BSS_PARAM) |
2107 BIT(NL80211_STA_INFO_CONNECTED_TIME) | 2120 BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) |
2108 BIT(NL80211_STA_INFO_RX_DROP_MISC); 2121 BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC);
2109 2122
2110 if (sdata->vif.type == NL80211_IFTYPE_STATION) { 2123 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
2111 sinfo->beacon_loss_count = sdata->u.mgd.beacon_loss_count; 2124 sinfo->beacon_loss_count = sdata->u.mgd.beacon_loss_count;
2112 sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_LOSS); 2125 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS);
2113 } 2126 }
2114 2127
2115 sinfo->connected_time = ktime_get_seconds() - sta->last_connected; 2128 sinfo->connected_time = ktime_get_seconds() - sta->last_connected;
2116 sinfo->inactive_time = 2129 sinfo->inactive_time =
2117 jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); 2130 jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta));
2118 2131
2119 if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) | 2132 if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) |
2120 BIT(NL80211_STA_INFO_TX_BYTES)))) { 2133 BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) {
2121 sinfo->tx_bytes = 0; 2134 sinfo->tx_bytes = 0;
2122 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) 2135 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
2123 sinfo->tx_bytes += sta->tx_stats.bytes[ac]; 2136 sinfo->tx_bytes += sta->tx_stats.bytes[ac];
2124 sinfo->filled |= BIT(NL80211_STA_INFO_TX_BYTES64); 2137 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64);
2125 } 2138 }
2126 2139
2127 if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_PACKETS))) { 2140 if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) {
2128 sinfo->tx_packets = 0; 2141 sinfo->tx_packets = 0;
2129 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) 2142 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
2130 sinfo->tx_packets += sta->tx_stats.packets[ac]; 2143 sinfo->tx_packets += sta->tx_stats.packets[ac];
2131 sinfo->filled |= BIT(NL80211_STA_INFO_TX_PACKETS); 2144 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS);
2132 } 2145 }
2133 2146
2134 if (!(sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES64) | 2147 if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) |
2135 BIT(NL80211_STA_INFO_RX_BYTES)))) { 2148 BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) {
2136 sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); 2149 sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats);
2137 2150
2138 if (sta->pcpu_rx_stats) { 2151 if (sta->pcpu_rx_stats) {
@@ -2144,10 +2157,10 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
2144 } 2157 }
2145 } 2158 }
2146 2159
2147 sinfo->filled |= BIT(NL80211_STA_INFO_RX_BYTES64); 2160 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64);
2148 } 2161 }
2149 2162
2150 if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_PACKETS))) { 2163 if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) {
2151 sinfo->rx_packets = sta->rx_stats.packets; 2164 sinfo->rx_packets = sta->rx_stats.packets;
2152 if (sta->pcpu_rx_stats) { 2165 if (sta->pcpu_rx_stats) {
2153 for_each_possible_cpu(cpu) { 2166 for_each_possible_cpu(cpu) {
@@ -2157,17 +2170,17 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
2157 sinfo->rx_packets += cpurxs->packets; 2170 sinfo->rx_packets += cpurxs->packets;
2158 } 2171 }
2159 } 2172 }
2160 sinfo->filled |= BIT(NL80211_STA_INFO_RX_PACKETS); 2173 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS);
2161 } 2174 }
2162 2175
2163 if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_RETRIES))) { 2176 if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) {
2164 sinfo->tx_retries = sta->status_stats.retry_count; 2177 sinfo->tx_retries = sta->status_stats.retry_count;
2165 sinfo->filled |= BIT(NL80211_STA_INFO_TX_RETRIES); 2178 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES);
2166 } 2179 }
2167 2180
2168 if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_FAILED))) { 2181 if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) {
2169 sinfo->tx_failed = sta->status_stats.retry_failed; 2182 sinfo->tx_failed = sta->status_stats.retry_failed;
2170 sinfo->filled |= BIT(NL80211_STA_INFO_TX_FAILED); 2183 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED);
2171 } 2184 }
2172 2185
2173 sinfo->rx_dropped_misc = sta->rx_stats.dropped; 2186 sinfo->rx_dropped_misc = sta->rx_stats.dropped;
@@ -2182,23 +2195,23 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
2182 2195
2183 if (sdata->vif.type == NL80211_IFTYPE_STATION && 2196 if (sdata->vif.type == NL80211_IFTYPE_STATION &&
2184 !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { 2197 !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) {
2185 sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_RX) | 2198 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) |
2186 BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG); 2199 BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG);
2187 sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); 2200 sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif);
2188 } 2201 }
2189 2202
2190 if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || 2203 if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) ||
2191 ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { 2204 ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) {
2192 if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) { 2205 if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) {
2193 sinfo->signal = (s8)last_rxstats->last_signal; 2206 sinfo->signal = (s8)last_rxstats->last_signal;
2194 sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL); 2207 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL);
2195 } 2208 }
2196 2209
2197 if (!sta->pcpu_rx_stats && 2210 if (!sta->pcpu_rx_stats &&
2198 !(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) { 2211 !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) {
2199 sinfo->signal_avg = 2212 sinfo->signal_avg =
2200 -ewma_signal_read(&sta->rx_stats_avg.signal); 2213 -ewma_signal_read(&sta->rx_stats_avg.signal);
2201 sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG); 2214 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG);
2202 } 2215 }
2203 } 2216 }
2204 2217
@@ -2207,11 +2220,11 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
2207 * pcpu statistics 2220 * pcpu statistics
2208 */ 2221 */
2209 if (last_rxstats->chains && 2222 if (last_rxstats->chains &&
2210 !(sinfo->filled & (BIT(NL80211_STA_INFO_CHAIN_SIGNAL) | 2223 !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) |
2211 BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { 2224 BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) {
2212 sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL); 2225 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL);
2213 if (!sta->pcpu_rx_stats) 2226 if (!sta->pcpu_rx_stats)
2214 sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); 2227 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG);
2215 2228
2216 sinfo->chains = last_rxstats->chains; 2229 sinfo->chains = last_rxstats->chains;
2217 2230
@@ -2223,15 +2236,15 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
2223 } 2236 }
2224 } 2237 }
2225 2238
2226 if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) { 2239 if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) {
2227 sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, 2240 sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate,
2228 &sinfo->txrate); 2241 &sinfo->txrate);
2229 sinfo->filled |= BIT(NL80211_STA_INFO_TX_BITRATE); 2242 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE);
2230 } 2243 }
2231 2244
2232 if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))) { 2245 if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))) {
2233 if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) 2246 if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0)
2234 sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE); 2247 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE);
2235 } 2248 }
2236 2249
2237 if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { 2250 if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) {
@@ -2244,18 +2257,18 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
2244 2257
2245 if (ieee80211_vif_is_mesh(&sdata->vif)) { 2258 if (ieee80211_vif_is_mesh(&sdata->vif)) {
2246#ifdef CONFIG_MAC80211_MESH 2259#ifdef CONFIG_MAC80211_MESH
2247 sinfo->filled |= BIT(NL80211_STA_INFO_LLID) | 2260 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) |
2248 BIT(NL80211_STA_INFO_PLID) | 2261 BIT_ULL(NL80211_STA_INFO_PLID) |
2249 BIT(NL80211_STA_INFO_PLINK_STATE) | 2262 BIT_ULL(NL80211_STA_INFO_PLINK_STATE) |
2250 BIT(NL80211_STA_INFO_LOCAL_PM) | 2263 BIT_ULL(NL80211_STA_INFO_LOCAL_PM) |
2251 BIT(NL80211_STA_INFO_PEER_PM) | 2264 BIT_ULL(NL80211_STA_INFO_PEER_PM) |
2252 BIT(NL80211_STA_INFO_NONPEER_PM); 2265 BIT_ULL(NL80211_STA_INFO_NONPEER_PM);
2253 2266
2254 sinfo->llid = sta->mesh->llid; 2267 sinfo->llid = sta->mesh->llid;
2255 sinfo->plid = sta->mesh->plid; 2268 sinfo->plid = sta->mesh->plid;
2256 sinfo->plink_state = sta->mesh->plink_state; 2269 sinfo->plink_state = sta->mesh->plink_state;
2257 if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { 2270 if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) {
2258 sinfo->filled |= BIT(NL80211_STA_INFO_T_OFFSET); 2271 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET);
2259 sinfo->t_offset = sta->mesh->t_offset; 2272 sinfo->t_offset = sta->mesh->t_offset;
2260 } 2273 }
2261 sinfo->local_pm = sta->mesh->local_pm; 2274 sinfo->local_pm = sta->mesh->local_pm;
@@ -2300,7 +2313,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
2300 thr = sta_get_expected_throughput(sta); 2313 thr = sta_get_expected_throughput(sta);
2301 2314
2302 if (thr != 0) { 2315 if (thr != 0) {
2303 sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT); 2316 sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT);
2304 sinfo->expected_throughput = thr; 2317 sinfo->expected_throughput = thr;
2305 } 2318 }
2306 2319
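The BIT() to BIT_ULL() conversion above is not cosmetic: sinfo->filled is a 64-bit bitmap and newer NL80211_STA_INFO_* values sit at bit 32 or above, so BIT(n), which expands to (1UL << n), overflows on 32-bit architectures where unsigned long is only 32 bits wide; BIT_ULL(n) always uses a 64-bit constant. A short sketch with a hypothetical flag index of 40:

#include <stdint.h>
#include <stdio.h>

#define BIT_32(n)  (1UL << (n))    /* 32 bits wide on a 32-bit target */
#define BIT_ULL(n) (1ULL << (n))   /* always 64 bits wide */

int main(void)
{
	int flag = 40;            /* hypothetical NL80211_STA_INFO_* index */
	uint64_t filled = 0;

	/* filled |= BIT_32(flag);    undefined behaviour where long is 32 bits */
	filled |= BIT_ULL(flag);  /* well defined: sets bit 40 */

	printf("filled = 0x%016llx\n", (unsigned long long)filled);
	return 0;
}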
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 81b35f623792..9a04327d71d1 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -170,7 +170,7 @@ struct tid_ampdu_tx {
170 u8 dialog_token; 170 u8 dialog_token;
171 u8 stop_initiator; 171 u8 stop_initiator;
172 bool tx_stop; 172 bool tx_stop;
173 u8 buf_size; 173 u16 buf_size;
174 174
175 u16 failed_bar_ssn; 175 u16 failed_bar_ssn;
176 bool bar_pending; 176 bool bar_pending;
@@ -405,7 +405,7 @@ struct ieee80211_sta_rx_stats {
405 int last_signal; 405 int last_signal;
406 u8 chains; 406 u8 chains;
407 s8 chain_signal_last[IEEE80211_MAX_CHAINS]; 407 s8 chain_signal_last[IEEE80211_MAX_CHAINS];
408 u16 last_rate; 408 u32 last_rate;
409 struct u64_stats_sync syncp; 409 struct u64_stats_sync syncp;
410 u64 bytes; 410 u64 bytes;
411 u64 msdu[IEEE80211_NUM_TIDS + 1]; 411 u64 msdu[IEEE80211_NUM_TIDS + 1];
@@ -764,6 +764,7 @@ enum sta_stats_type {
764 STA_STATS_RATE_TYPE_LEGACY, 764 STA_STATS_RATE_TYPE_LEGACY,
765 STA_STATS_RATE_TYPE_HT, 765 STA_STATS_RATE_TYPE_HT,
766 STA_STATS_RATE_TYPE_VHT, 766 STA_STATS_RATE_TYPE_VHT,
767 STA_STATS_RATE_TYPE_HE,
767}; 768};
768 769
769#define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) 770#define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0)
@@ -771,9 +772,14 @@ enum sta_stats_type {
771#define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4) 772#define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4)
772#define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0) 773#define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0)
773#define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) 774#define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4)
775#define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0)
776#define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4)
774#define STA_STATS_FIELD_BW GENMASK(11, 8) 777#define STA_STATS_FIELD_BW GENMASK(11, 8)
775#define STA_STATS_FIELD_SGI GENMASK(12, 12) 778#define STA_STATS_FIELD_SGI GENMASK(12, 12)
776#define STA_STATS_FIELD_TYPE GENMASK(15, 13) 779#define STA_STATS_FIELD_TYPE GENMASK(15, 13)
780#define STA_STATS_FIELD_HE_RU GENMASK(18, 16)
781#define STA_STATS_FIELD_HE_GI GENMASK(20, 19)
782#define STA_STATS_FIELD_HE_DCM GENMASK(21, 21)
777 783
778#define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) 784#define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v)
779#define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) 785#define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v)
@@ -782,7 +788,7 @@ enum sta_stats_type {
782 788
783static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) 789static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s)
784{ 790{
785 u16 r; 791 u32 r;
786 792
787 r = STA_STATS_FIELD(BW, s->bw); 793 r = STA_STATS_FIELD(BW, s->bw);
788 794
@@ -804,6 +810,14 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s)
804 r |= STA_STATS_FIELD(LEGACY_BAND, s->band); 810 r |= STA_STATS_FIELD(LEGACY_BAND, s->band);
805 r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx); 811 r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx);
806 break; 812 break;
813 case RX_ENC_HE:
814 r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HE);
815 r |= STA_STATS_FIELD(HE_NSS, s->nss);
816 r |= STA_STATS_FIELD(HE_MCS, s->rate_idx);
817 r |= STA_STATS_FIELD(HE_GI, s->he_gi);
818 r |= STA_STATS_FIELD(HE_RU, s->he_ru);
819 r |= STA_STATS_FIELD(HE_DCM, s->he_dcm);
820 break;
807 default: 821 default:
808 WARN_ON(1); 822 WARN_ON(1);
809 return STA_STATS_RATE_INVALID; 823 return STA_STATS_RATE_INVALID;
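sta_stats_encode_rate() above packs the last RX rate into a single integer; with the HE additions (HE_RU at bits 18:16, HE_GI at bits 20:19, HE_DCM at bit 21) the encoding no longer fits in 16 bits, which is why last_rate and the encode/decode helpers are widened from u16 to u32 in this series. A sketch of the encode/decode round trip using the field layout from the hunk; the type code below is an arbitrary illustrative value, not the real STA_STATS_RATE_TYPE_HE constant:

#include <stdint.h>
#include <stdio.h>

/* Field masks matching the STA_STATS_FIELD_* layout shown above. */
#define F_HE_MCS 0x0000000fu   /* bits  3:0  */
#define F_HE_NSS 0x000000f0u   /* bits  7:4  */
#define F_TYPE   0x0000e000u   /* bits 15:13 */
#define F_HE_GI  0x00180000u   /* bits 20:19 */
#define F_HE_DCM 0x00200000u   /* bit  21    */

static uint32_t prep(uint32_t mask, uint32_t val)
{
	return (val * (mask & -mask)) & mask;   /* FIELD_PREP-style */
}

static uint32_t get(uint32_t mask, uint32_t val)
{
	return (val & mask) / (mask & -mask);   /* FIELD_GET-style */
}

int main(void)
{
	/* MCS 11, 2 spatial streams, GI code 1, DCM on, type code 4 (illustrative). */
	uint32_t r = prep(F_HE_MCS, 11) | prep(F_HE_NSS, 2) |
		     prep(F_TYPE, 4) | prep(F_HE_GI, 1) | prep(F_HE_DCM, 1);

	printf("encoded rate 0x%08x (no longer fits in 16 bits)\n", r);
	printf("mcs=%u nss=%u gi=%u dcm=%u\n",
	       get(F_HE_MCS, r), get(F_HE_NSS, r),
	       get(F_HE_GI, r), get(F_HE_DCM, r));
	return 0;
}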
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 80a7edf8d314..0ab69a1964f8 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -92,7 +92,7 @@
92 STA_ENTRY \ 92 STA_ENTRY \
93 __field(u16, tid) \ 93 __field(u16, tid) \
94 __field(u16, ssn) \ 94 __field(u16, ssn) \
95 __field(u8, buf_size) \ 95 __field(u16, buf_size) \
96 __field(bool, amsdu) \ 96 __field(bool, amsdu) \
97 __field(u16, timeout) \ 97 __field(u16, timeout) \
98 __field(u16, action) 98 __field(u16, action)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 44b5dfe8727d..cd332e3e1134 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -825,6 +825,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
825 */ 825 */
826 if (!ieee80211_is_data_qos(hdr->frame_control) || 826 if (!ieee80211_is_data_qos(hdr->frame_control) ||
827 is_multicast_ether_addr(hdr->addr1)) { 827 is_multicast_ether_addr(hdr->addr1)) {
828 if (tx->flags & IEEE80211_TX_NO_SEQNO)
829 return TX_CONTINUE;
828 /* driver should assign sequence number */ 830 /* driver should assign sequence number */
829 info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; 831 info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
830 /* for pure STA mode without beacons, we can do it */ 832 /* for pure STA mode without beacons, we can do it */
@@ -1247,7 +1249,7 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
1247 (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) 1249 (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
1248 return NULL; 1250 return NULL;
1249 1251
1250 if (!ieee80211_is_data(hdr->frame_control)) 1252 if (!ieee80211_is_data_present(hdr->frame_control))
1251 return NULL; 1253 return NULL;
1252 1254
1253 if (sta) { 1255 if (sta) {
@@ -1854,7 +1856,7 @@ EXPORT_SYMBOL(ieee80211_tx_prepare_skb);
1854 */ 1856 */
1855static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, 1857static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
1856 struct sta_info *sta, struct sk_buff *skb, 1858 struct sta_info *sta, struct sk_buff *skb,
1857 bool txpending) 1859 bool txpending, u32 txdata_flags)
1858{ 1860{
1859 struct ieee80211_local *local = sdata->local; 1861 struct ieee80211_local *local = sdata->local;
1860 struct ieee80211_tx_data tx; 1862 struct ieee80211_tx_data tx;
@@ -1872,6 +1874,8 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
1872 led_len = skb->len; 1874 led_len = skb->len;
1873 res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb); 1875 res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb);
1874 1876
1877 tx.flags |= txdata_flags;
1878
1875 if (unlikely(res_prepare == TX_DROP)) { 1879 if (unlikely(res_prepare == TX_DROP)) {
1876 ieee80211_free_txskb(&local->hw, skb); 1880 ieee80211_free_txskb(&local->hw, skb);
1877 return true; 1881 return true;
@@ -1933,7 +1937,8 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata,
1933} 1937}
1934 1938
1935void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, 1939void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
1936 struct sta_info *sta, struct sk_buff *skb) 1940 struct sta_info *sta, struct sk_buff *skb,
1941 u32 txdata_flags)
1937{ 1942{
1938 struct ieee80211_local *local = sdata->local; 1943 struct ieee80211_local *local = sdata->local;
1939 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); 1944 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
@@ -1968,7 +1973,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
1968 } 1973 }
1969 1974
1970 ieee80211_set_qos_hdr(sdata, skb); 1975 ieee80211_set_qos_hdr(sdata, skb);
1971 ieee80211_tx(sdata, sta, skb, false); 1976 ieee80211_tx(sdata, sta, skb, false, txdata_flags);
1972} 1977}
1973 1978
1974static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, 1979static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local,
@@ -2289,7 +2294,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
2289 if (!ieee80211_parse_tx_radiotap(local, skb)) 2294 if (!ieee80211_parse_tx_radiotap(local, skb))
2290 goto fail_rcu; 2295 goto fail_rcu;
2291 2296
2292 ieee80211_xmit(sdata, NULL, skb); 2297 ieee80211_xmit(sdata, NULL, skb, 0);
2293 rcu_read_unlock(); 2298 rcu_read_unlock();
2294 2299
2295 return NETDEV_TX_OK; 2300 return NETDEV_TX_OK;
@@ -3648,7 +3653,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
3648 3653
3649 ieee80211_tx_stats(dev, skb->len); 3654 ieee80211_tx_stats(dev, skb->len);
3650 3655
3651 ieee80211_xmit(sdata, sta, skb); 3656 ieee80211_xmit(sdata, sta, skb, 0);
3652 } 3657 }
3653 goto out; 3658 goto out;
3654 out_free: 3659 out_free:
@@ -3867,7 +3872,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
3867 return true; 3872 return true;
3868 } 3873 }
3869 info->band = chanctx_conf->def.chan->band; 3874 info->band = chanctx_conf->def.chan->band;
3870 result = ieee80211_tx(sdata, NULL, skb, true); 3875 result = ieee80211_tx(sdata, NULL, skb, true, 0);
3871 } else { 3876 } else {
3872 struct sk_buff_head skbs; 3877 struct sk_buff_head skbs;
3873 3878
@@ -4783,7 +4788,7 @@ EXPORT_SYMBOL(ieee80211_unreserve_tid);
4783 4788
4784void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, 4789void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
4785 struct sk_buff *skb, int tid, 4790 struct sk_buff *skb, int tid,
4786 enum nl80211_band band) 4791 enum nl80211_band band, u32 txdata_flags)
4787{ 4792{
4788 int ac = ieee80211_ac_from_tid(tid); 4793 int ac = ieee80211_ac_from_tid(tid);
4789 4794
@@ -4800,7 +4805,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
4800 */ 4805 */
4801 local_bh_disable(); 4806 local_bh_disable();
4802 IEEE80211_SKB_CB(skb)->band = band; 4807 IEEE80211_SKB_CB(skb)->band = band;
4803 ieee80211_xmit(sdata, NULL, skb); 4808 ieee80211_xmit(sdata, NULL, skb, txdata_flags);
4804 local_bh_enable(); 4809 local_bh_enable();
4805} 4810}
4806 4811
@@ -4845,7 +4850,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
4845 skb_reset_network_header(skb); 4850 skb_reset_network_header(skb);
4846 skb_reset_mac_header(skb); 4851 skb_reset_mac_header(skb);
4847 4852
4853 local_bh_disable();
4848 __ieee80211_subif_start_xmit(skb, skb->dev, flags); 4854 __ieee80211_subif_start_xmit(skb, skb->dev, flags);
4855 local_bh_enable();
4849 4856
4850 return 0; 4857 return 0;
4851} 4858}
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 5e2e511c4a6f..88efda7c9f8a 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1095,6 +1095,21 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
1095 if (elen >= sizeof(*elems->max_idle_period_ie)) 1095 if (elen >= sizeof(*elems->max_idle_period_ie))
1096 elems->max_idle_period_ie = (void *)pos; 1096 elems->max_idle_period_ie = (void *)pos;
1097 break; 1097 break;
1098 case WLAN_EID_EXTENSION:
1099 if (pos[0] == WLAN_EID_EXT_HE_MU_EDCA &&
1100 elen >= (sizeof(*elems->mu_edca_param_set) + 1)) {
1101 elems->mu_edca_param_set = (void *)&pos[1];
1102 } else if (pos[0] == WLAN_EID_EXT_HE_CAPABILITY) {
1103 elems->he_cap = (void *)&pos[1];
1104 elems->he_cap_len = elen - 1;
1105 } else if (pos[0] == WLAN_EID_EXT_HE_OPERATION &&
1106 elen >= sizeof(*elems->he_operation) &&
1107 elen >= ieee80211_he_oper_size(&pos[1])) {
1108 elems->he_operation = (void *)&pos[1];
1109 } else if (pos[0] == WLAN_EID_EXT_UORA && elen >= 1) {
1110 elems->uora_element = (void *)&pos[1];
1111 }
1112 break;
1098 default: 1113 default:
1099 break; 1114 break;
1100 } 1115 }
@@ -1353,9 +1368,10 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
1353 enum nl80211_band band, 1368 enum nl80211_band band,
1354 u32 rate_mask, 1369 u32 rate_mask,
1355 struct cfg80211_chan_def *chandef, 1370 struct cfg80211_chan_def *chandef,
1356 size_t *offset) 1371 size_t *offset, u32 flags)
1357{ 1372{
1358 struct ieee80211_supported_band *sband; 1373 struct ieee80211_supported_band *sband;
1374 const struct ieee80211_sta_he_cap *he_cap;
1359 u8 *pos = buffer, *end = buffer + buffer_len; 1375 u8 *pos = buffer, *end = buffer + buffer_len;
1360 size_t noffset; 1376 size_t noffset;
1361 int supp_rates_len, i; 1377 int supp_rates_len, i;
@@ -1433,6 +1449,9 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
1433 chandef->chan->center_freq); 1449 chandef->chan->center_freq);
1434 } 1450 }
1435 1451
1452 if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT)
1453 goto done;
1454
1436 /* insert custom IEs that go before HT */ 1455 /* insert custom IEs that go before HT */
1437 if (ie && ie_len) { 1456 if (ie && ie_len) {
1438 static const u8 before_ht[] = { 1457 static const u8 before_ht[] = {
@@ -1460,11 +1479,6 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
1460 sband->ht_cap.cap); 1479 sband->ht_cap.cap);
1461 } 1480 }
1462 1481
1463 /*
1464 * If adding more here, adjust code in main.c
1465 * that calculates local->scan_ies_len.
1466 */
1467
1468 /* insert custom IEs that go before VHT */ 1482 /* insert custom IEs that go before VHT */
1469 if (ie && ie_len) { 1483 if (ie && ie_len) {
1470 static const u8 before_vht[] = { 1484 static const u8 before_vht[] = {
@@ -1507,9 +1521,43 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
1507 sband->vht_cap.cap); 1521 sband->vht_cap.cap);
1508 } 1522 }
1509 1523
1524 /* insert custom IEs that go before HE */
1525 if (ie && ie_len) {
1526 static const u8 before_he[] = {
1527 /*
1528 * no need to list the ones split off before VHT
1529 * or generated here
1530 */
1531 WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS,
1532 WLAN_EID_AP_CSN,
1533 /* TODO: add 11ah/11aj/11ak elements */
1534 };
1535 noffset = ieee80211_ie_split(ie, ie_len,
1536 before_he, ARRAY_SIZE(before_he),
1537 *offset);
1538 if (end - pos < noffset - *offset)
1539 goto out_err;
1540 memcpy(pos, ie + *offset, noffset - *offset);
1541 pos += noffset - *offset;
1542 *offset = noffset;
1543 }
1544
1545 he_cap = ieee80211_get_he_sta_cap(sband);
1546 if (he_cap) {
1547 pos = ieee80211_ie_build_he_cap(pos, he_cap, end);
1548 if (!pos)
1549 goto out_err;
1550 }
1551
1552 /*
1553 * If adding more here, adjust code in main.c
1554 * that calculates local->scan_ies_len.
1555 */
1556
1510 return pos - buffer; 1557 return pos - buffer;
1511 out_err: 1558 out_err:
1512 WARN_ONCE(1, "not enough space for preq IEs\n"); 1559 WARN_ONCE(1, "not enough space for preq IEs\n");
1560 done:
1513 return pos - buffer; 1561 return pos - buffer;
1514} 1562}
1515 1563
@@ -1518,7 +1566,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
1518 struct ieee80211_scan_ies *ie_desc, 1566 struct ieee80211_scan_ies *ie_desc,
1519 const u8 *ie, size_t ie_len, 1567 const u8 *ie, size_t ie_len,
1520 u8 bands_used, u32 *rate_masks, 1568 u8 bands_used, u32 *rate_masks,
1521 struct cfg80211_chan_def *chandef) 1569 struct cfg80211_chan_def *chandef,
1570 u32 flags)
1522{ 1571{
1523 size_t pos = 0, old_pos = 0, custom_ie_offset = 0; 1572 size_t pos = 0, old_pos = 0, custom_ie_offset = 0;
1524 int i; 1573 int i;
@@ -1533,7 +1582,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
1533 ie, ie_len, i, 1582 ie, ie_len, i,
1534 rate_masks[i], 1583 rate_masks[i],
1535 chandef, 1584 chandef,
1536 &custom_ie_offset); 1585 &custom_ie_offset,
1586 flags);
1537 ie_desc->ies[i] = buffer + old_pos; 1587 ie_desc->ies[i] = buffer + old_pos;
1538 ie_desc->len[i] = pos - old_pos; 1588 ie_desc->len[i] = pos - old_pos;
1539 old_pos = pos; 1589 old_pos = pos;
@@ -1561,7 +1611,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
1561 struct ieee80211_channel *chan, 1611 struct ieee80211_channel *chan,
1562 const u8 *ssid, size_t ssid_len, 1612 const u8 *ssid, size_t ssid_len,
1563 const u8 *ie, size_t ie_len, 1613 const u8 *ie, size_t ie_len,
1564 bool directed) 1614 u32 flags)
1565{ 1615{
1566 struct ieee80211_local *local = sdata->local; 1616 struct ieee80211_local *local = sdata->local;
1567 struct cfg80211_chan_def chandef; 1617 struct cfg80211_chan_def chandef;
@@ -1577,7 +1627,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
1577 * badly-behaved APs don't respond when this parameter is included. 1627 * badly-behaved APs don't respond when this parameter is included.
1578 */ 1628 */
1579 chandef.width = sdata->vif.bss_conf.chandef.width; 1629 chandef.width = sdata->vif.bss_conf.chandef.width;
1580 if (directed) 1630 if (flags & IEEE80211_PROBE_FLAG_DIRECTED)
1581 chandef.chan = NULL; 1631 chandef.chan = NULL;
1582 else 1632 else
1583 chandef.chan = chan; 1633 chandef.chan = chan;
@@ -1591,7 +1641,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
1591 ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb), 1641 ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb),
1592 skb_tailroom(skb), &dummy_ie_desc, 1642 skb_tailroom(skb), &dummy_ie_desc,
1593 ie, ie_len, BIT(chan->band), 1643 ie, ie_len, BIT(chan->band),
1594 rate_masks, &chandef); 1644 rate_masks, &chandef, flags);
1595 skb_put(skb, ies_len); 1645 skb_put(skb, ies_len);
1596 1646
1597 if (dst) { 1647 if (dst) {
@@ -1605,27 +1655,6 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
1605 return skb; 1655 return skb;
1606} 1656}
1607 1657
1608void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata,
1609 const u8 *src, const u8 *dst,
1610 const u8 *ssid, size_t ssid_len,
1611 const u8 *ie, size_t ie_len,
1612 u32 ratemask, bool directed, u32 tx_flags,
1613 struct ieee80211_channel *channel, bool scan)
1614{
1615 struct sk_buff *skb;
1616
1617 skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel,
1618 ssid, ssid_len,
1619 ie, ie_len, directed);
1620 if (skb) {
1621 IEEE80211_SKB_CB(skb)->flags |= tx_flags;
1622 if (scan)
1623 ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band);
1624 else
1625 ieee80211_tx_skb(sdata, skb);
1626 }
1627}
1628
1629u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, 1658u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
1630 struct ieee802_11_elems *elems, 1659 struct ieee802_11_elems *elems,
1631 enum nl80211_band band, u32 *basic_rates) 1660 enum nl80211_band band, u32 *basic_rates)
@@ -2111,7 +2140,8 @@ int ieee80211_reconfig(struct ieee80211_local *local)
2111 if (!sta->uploaded) 2140 if (!sta->uploaded)
2112 continue; 2141 continue;
2113 2142
2114 if (sta->sdata->vif.type != NL80211_IFTYPE_AP) 2143 if (sta->sdata->vif.type != NL80211_IFTYPE_AP &&
2144 sta->sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
2115 continue; 2145 continue;
2116 2146
2117 for (state = IEEE80211_STA_NOTEXIST; 2147 for (state = IEEE80211_STA_NOTEXIST;
@@ -2412,6 +2442,72 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
2412 return pos; 2442 return pos;
2413} 2443}
2414 2444
2445u8 *ieee80211_ie_build_he_cap(u8 *pos,
2446 const struct ieee80211_sta_he_cap *he_cap,
2447 u8 *end)
2448{
2449 u8 n;
2450 u8 ie_len;
2451 u8 *orig_pos = pos;
2452
2453 /* Make sure we have place for the IE */
2454 /*
2455 * TODO: the 1 added is because this temporarily is under the EXTENSION
2456 * IE. Get rid of it when it moves.
2457 */
2458 if (!he_cap)
2459 return orig_pos;
2460
2461 n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem);
2462 ie_len = 2 + 1 +
2463 sizeof(he_cap->he_cap_elem) + n +
2464 ieee80211_he_ppe_size(he_cap->ppe_thres[0],
2465 he_cap->he_cap_elem.phy_cap_info);
2466
2467 if ((end - pos) < ie_len)
2468 return orig_pos;
2469
2470 *pos++ = WLAN_EID_EXTENSION;
2471 pos++; /* We'll set the size later below */
2472 *pos++ = WLAN_EID_EXT_HE_CAPABILITY;
2473
2474 /* Fixed data */
2475 memcpy(pos, &he_cap->he_cap_elem, sizeof(he_cap->he_cap_elem));
2476 pos += sizeof(he_cap->he_cap_elem);
2477
2478 memcpy(pos, &he_cap->he_mcs_nss_supp, n);
2479 pos += n;
2480
2481 /* Check if PPE Threshold should be present */
2482 if ((he_cap->he_cap_elem.phy_cap_info[6] &
2483 IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0)
2484 goto end;
2485
2486 /*
2487 * Calculate how many PPET16/PPET8 pairs are to come. Algorithm:
2488 * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK)
2489 */
2490 n = hweight8(he_cap->ppe_thres[0] &
2491 IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK);
2492 n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >>
2493 IEEE80211_PPE_THRES_NSS_POS));
2494
2495 /*
2496 * Each pair is 6 bits, and we need to add the 7 "header" bits to the
2497 * total size.
2498 */
2499 n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7;
2500 n = DIV_ROUND_UP(n, 8);
2501
2502 /* Copy PPE Thresholds */
2503 memcpy(pos, &he_cap->ppe_thres, n);
2504 pos += n;
2505
2506end:
2507 orig_pos[1] = (pos - orig_pos) - 2;
2508 return pos;
2509}
2510
2415u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, 2511u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
2416 const struct cfg80211_chan_def *chandef, 2512 const struct cfg80211_chan_def *chandef,
2417 u16 prot_mode, bool rifs_mode) 2513 u16 prot_mode, bool rifs_mode)
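The PPE Thresholds sizing at the end of ieee80211_ie_build_he_cap() above follows the in-code comment: the number of PPET16/PPET8 pairs is (NSS_M1 + 1) times the number of set bits in the RU index bitmask, each pair takes 2 x 3 bits, and 7 header bits are added before rounding up to whole bytes. A small sketch of that arithmetic; the first-byte layout used here (NSS_M1 in bits 2:0, RU index bitmask in bits 6:3) is an assumption standing in for the IEEE80211_PPE_THRES_* masks:

#include <stdint.h>
#include <stdio.h>

#define NSS_MASK  0x07   /* assumed: NSS_M1 in bits 2:0 */
#define RU_MASK   0x78   /* assumed: RU index bitmask in bits 6:3 */
#define RU_SHIFT  3
#define PPET_SIZE 3      /* each PPET16 or PPET8 value is 3 bits */

static int hweight8(uint8_t v)
{
	int n = 0;

	while (v) {
		n += v & 1;
		v >>= 1;
	}
	return n;
}

/* Bytes occupied by the PPE Thresholds field for a given first byte. */
static int ppe_thres_size(uint8_t byte0)
{
	int pairs = hweight8((byte0 & RU_MASK) >> RU_SHIFT) *
		    ((byte0 & NSS_MASK) + 1);
	int bits = pairs * PPET_SIZE * 2 + 7;   /* + 7 header bits */

	return (bits + 7) / 8;                  /* DIV_ROUND_UP(bits, 8) */
}

int main(void)
{
	/* NSS_M1 = 1 (two streams), all four RU-size bits set. */
	uint8_t byte0 = (0xf << RU_SHIFT) | 1;

	printf("PPE Thresholds take %d bytes\n", ppe_thres_size(byte0));
	return 0;
}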
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index 7e253455f9dd..bcd1a5e6ebf4 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -63,8 +63,21 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb)
63 int ret; 63 int ret;
64 64
65 if (!(local->hw.flags & IEEE802154_HW_TX_OMIT_CKSUM)) { 65 if (!(local->hw.flags & IEEE802154_HW_TX_OMIT_CKSUM)) {
66 u16 crc = crc_ccitt(0, skb->data, skb->len); 66 struct sk_buff *nskb;
67 u16 crc;
68
69 if (unlikely(skb_tailroom(skb) < IEEE802154_FCS_LEN)) {
70 nskb = skb_copy_expand(skb, 0, IEEE802154_FCS_LEN,
71 GFP_ATOMIC);
72 if (likely(nskb)) {
73 consume_skb(skb);
74 skb = nskb;
75 } else {
76 goto err_tx;
77 }
78 }
67 79
80 crc = crc_ccitt(0, skb->data, skb->len);
68 put_unaligned_le16(crc, skb_put(skb, 2)); 81 put_unaligned_le16(crc, skb_put(skb, 2));
69 } 82 }
70 83
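The mac802154 change above first guarantees two bytes of tailroom (copying and expanding the skb when necessary) and only then appends the frame check sequence produced by crc_ccitt(0, skb->data, skb->len). Assuming crc_ccitt() is the reflected CRC-16 with polynomial 0x8408 (seeded here with 0), a stand-alone sketch of the FCS append step:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise reflected CRC-16, poly 0x8408; assumed equivalent to crc_ccitt(). */
static uint16_t crc_ccitt(uint16_t crc, const uint8_t *buf, size_t len)
{
	while (len--) {
		crc ^= *buf++;
		for (int i = 0; i < 8; i++)
			crc = (crc & 1) ? (crc >> 1) ^ 0x8408 : crc >> 1;
	}
	return crc;
}

int main(void)
{
	uint8_t frame[64] = { 0x41, 0x88, 0x01, 0xcd, 0xab };  /* made-up MHR bytes */
	size_t len = 5;

	uint16_t fcs = crc_ccitt(0, frame, len);

	/* Append the 2-byte FCS little-endian, like put_unaligned_le16(). */
	frame[len++] = fcs & 0xff;
	frame[len++] = fcs >> 8;

	printf("fcs=0x%04x, frame is now %zu bytes\n", fcs, len);
	return 0;
}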
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 6e558a419f60..94f53a9b7d1a 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -224,7 +224,7 @@ static int mpls_fill_encap_info(struct sk_buff *skb,
224 struct lwtunnel_state *lwtstate) 224 struct lwtunnel_state *lwtstate)
225{ 225{
226 struct mpls_iptunnel_encap *tun_encap_info; 226 struct mpls_iptunnel_encap *tun_encap_info;
227 227
228 tun_encap_info = mpls_lwtunnel_encap(lwtstate); 228 tun_encap_info = mpls_lwtunnel_encap(lwtstate);
229 229
230 if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, 230 if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels,
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 82e6edf9c5d9..45f33d6dedf7 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -100,7 +100,7 @@ static int ncsi_write_package_info(struct sk_buff *skb,
100 bool found; 100 bool found;
101 int rc; 101 int rc;
102 102
103 if (id > ndp->package_num) { 103 if (id > ndp->package_num - 1) {
104 netdev_info(ndp->ndev.dev, "NCSI: No package with id %u\n", id); 104 netdev_info(ndp->ndev.dev, "NCSI: No package with id %u\n", id);
105 return -ENODEV; 105 return -ENODEV;
106 } 106 }
@@ -240,7 +240,7 @@ static int ncsi_pkg_info_all_nl(struct sk_buff *skb,
240 return 0; /* done */ 240 return 0; /* done */
241 241
242 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 242 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
243 &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO); 243 &ncsi_genl_family, NLM_F_MULTI, NCSI_CMD_PKG_INFO);
244 if (!hdr) { 244 if (!hdr) {
245 rc = -EMSGSIZE; 245 rc = -EMSGSIZE;
246 goto err; 246 goto err;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index dbd7d1fad277..71709c104081 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -46,9 +46,19 @@ config NETFILTER_NETLINK_LOG
46 and is also scheduled to replace the old syslog-based ipt_LOG 46 and is also scheduled to replace the old syslog-based ipt_LOG
47 and ip6t_LOG modules. 47 and ip6t_LOG modules.
48 48
49config NETFILTER_NETLINK_OSF
50 tristate "Netfilter OSF over NFNETLINK interface"
51 depends on NETFILTER_ADVANCED
52 select NETFILTER_NETLINK
53 help
 54 If this option is enabled, the kernel will include support
 55 for passive OS fingerprinting via NFNETLINK.
56
49config NF_CONNTRACK 57config NF_CONNTRACK
50 tristate "Netfilter connection tracking support" 58 tristate "Netfilter connection tracking support"
51 default m if NETFILTER_ADVANCED=n 59 default m if NETFILTER_ADVANCED=n
60 select NF_DEFRAG_IPV4
61 select NF_DEFRAG_IPV6 if IPV6 != n
52 help 62 help
53 Connection tracking keeps a record of what packets have passed 63 Connection tracking keeps a record of what packets have passed
54 through your machine, in order to figure out how they are related 64 through your machine, in order to figure out how they are related
@@ -96,7 +106,6 @@ config NF_CONNTRACK_SECMARK
96config NF_CONNTRACK_ZONES 106config NF_CONNTRACK_ZONES
97 bool 'Connection tracking zones' 107 bool 'Connection tracking zones'
98 depends on NETFILTER_ADVANCED 108 depends on NETFILTER_ADVANCED
99 depends on NETFILTER_XT_TARGET_CT
100 help 109 help
101 This option enables support for connection tracking zones. 110 This option enables support for connection tracking zones.
102 Normally, each connection needs to have a unique system wide 111 Normally, each connection needs to have a unique system wide
@@ -148,10 +157,11 @@ config NF_CONNTRACK_TIMESTAMP
148 If unsure, say `N'. 157 If unsure, say `N'.
149 158
150config NF_CONNTRACK_LABELS 159config NF_CONNTRACK_LABELS
151 bool 160 bool "Connection tracking labels"
152 help 161 help
153 This option enables support for assigning user-defined flag bits 162 This option enables support for assigning user-defined flag bits
154 to connection tracking entries. It selected by the connlabel match. 163 to connection tracking entries. It can be used with xtables connlabel
164 match and the nftables ct expression.
155 165
156config NF_CT_PROTO_DCCP 166config NF_CT_PROTO_DCCP
157 bool 'DCCP protocol connection tracking support' 167 bool 'DCCP protocol connection tracking support'
@@ -355,6 +365,7 @@ config NF_CT_NETLINK_TIMEOUT
355 tristate 'Connection tracking timeout tuning via Netlink' 365 tristate 'Connection tracking timeout tuning via Netlink'
356 select NETFILTER_NETLINK 366 select NETFILTER_NETLINK
357 depends on NETFILTER_ADVANCED 367 depends on NETFILTER_ADVANCED
368 depends on NF_CONNTRACK_TIMEOUT
358 help 369 help
359 This option enables support for connection tracking timeout 370 This option enables support for connection tracking timeout
360 fine-grain tuning. This allows you to attach specific timeout 371 fine-grain tuning. This allows you to attach specific timeout
@@ -440,9 +451,6 @@ config NETFILTER_SYNPROXY
440 451
441endif # NF_CONNTRACK 452endif # NF_CONNTRACK
442 453
443config NF_OSF
444 tristate
445
446config NF_TABLES 454config NF_TABLES
447 select NETFILTER_NETLINK 455 select NETFILTER_NETLINK
448 tristate "Netfilter nf_tables support" 456 tristate "Netfilter nf_tables support"
@@ -460,6 +468,13 @@ config NF_TABLES
460 468
461if NF_TABLES 469if NF_TABLES
462 470
471config NF_TABLES_SET
472 tristate "Netfilter nf_tables set infrastructure"
473 help
474 This option enables the nf_tables set infrastructure that allows to
475 look up for elements in a set and to build one-way mappings between
476 matchings and actions.
477
463config NF_TABLES_INET 478config NF_TABLES_INET
464 depends on IPV6 479 depends on IPV6
465 select NF_TABLES_IPV4 480 select NF_TABLES_IPV4
@@ -493,24 +508,6 @@ config NFT_FLOW_OFFLOAD
493 This option adds the "flow_offload" expression that you can use to 508 This option adds the "flow_offload" expression that you can use to
494 choose what flows are placed into the hardware. 509 choose what flows are placed into the hardware.
495 510
496config NFT_SET_RBTREE
497 tristate "Netfilter nf_tables rbtree set module"
498 help
499 This option adds the "rbtree" set type (Red Black tree) that is used
500 to build interval-based sets.
501
502config NFT_SET_HASH
503 tristate "Netfilter nf_tables hash set module"
504 help
505 This option adds the "hash" set type that is used to build one-way
506 mappings between matchings and actions.
507
508config NFT_SET_BITMAP
509 tristate "Netfilter nf_tables bitmap set module"
510 help
511 This option adds the "bitmap" set type that is used to build sets
512 whose keys are smaller or equal to 16 bits.
513
514config NFT_COUNTER 511config NFT_COUNTER
515 tristate "Netfilter nf_tables counter module" 512 tristate "Netfilter nf_tables counter module"
516 help 513 help
@@ -562,6 +559,12 @@ config NFT_NAT
562 This option adds the "nat" expression that you can use to perform 559 This option adds the "nat" expression that you can use to perform
563 typical Network Address Translation (NAT) packet transformations. 560 typical Network Address Translation (NAT) packet transformations.
564 561
562config NFT_TUNNEL
563 tristate "Netfilter nf_tables tunnel module"
564 help
565 This option adds the "tunnel" expression that you can use to set
566 tunneling policies.
567
565config NFT_OBJREF 568config NFT_OBJREF
566 tristate "Netfilter nf_tables stateful object reference module" 569 tristate "Netfilter nf_tables stateful object reference module"
567 help 570 help
@@ -626,11 +629,28 @@ config NFT_SOCKET
626 tristate "Netfilter nf_tables socket match support" 629 tristate "Netfilter nf_tables socket match support"
627 depends on IPV6 || IPV6=n 630 depends on IPV6 || IPV6=n
628 select NF_SOCKET_IPV4 631 select NF_SOCKET_IPV4
629 select NF_SOCKET_IPV6 if IPV6 632 select NF_SOCKET_IPV6 if NF_TABLES_IPV6
630 help 633 help
631 This option allows matching for the presence or absence of a 634 This option allows matching for the presence or absence of a
632 corresponding socket and its attributes. 635 corresponding socket and its attributes.
633 636
637config NFT_OSF
638 tristate "Netfilter nf_tables passive OS fingerprint support"
639 depends on NETFILTER_ADVANCED
640 select NETFILTER_NETLINK_OSF
641 help
 642 This option allows matching packets from a specific OS.
643
644config NFT_TPROXY
645 tristate "Netfilter nf_tables tproxy support"
646 depends on IPV6 || IPV6=n
647 select NF_DEFRAG_IPV4
648 select NF_DEFRAG_IPV6 if NF_TABLES_IPV6
649 select NF_TPROXY_IPV4
650 select NF_TPROXY_IPV6 if NF_TABLES_IPV6
651 help
652 This makes transparent proxy support available in nftables.
653
634if NF_TABLES_NETDEV 654if NF_TABLES_NETDEV
635 655
636config NF_DUP_NETDEV 656config NF_DUP_NETDEV
@@ -892,7 +912,7 @@ config NETFILTER_XT_TARGET_LOG
892 tristate "LOG target support" 912 tristate "LOG target support"
893 select NF_LOG_COMMON 913 select NF_LOG_COMMON
894 select NF_LOG_IPV4 914 select NF_LOG_IPV4
895 select NF_LOG_IPV6 if IPV6 915 select NF_LOG_IPV6 if IP6_NF_IPTABLES
896 default m if NETFILTER_ADVANCED=n 916 default m if NETFILTER_ADVANCED=n
897 help 917 help
898 This option adds a `LOG' target, which allows you to create rules in 918 This option adds a `LOG' target, which allows you to create rules in
@@ -984,7 +1004,7 @@ config NETFILTER_XT_TARGET_TEE
984 depends on IPV6 || IPV6=n 1004 depends on IPV6 || IPV6=n
985 depends on !NF_CONNTRACK || NF_CONNTRACK 1005 depends on !NF_CONNTRACK || NF_CONNTRACK
986 select NF_DUP_IPV4 1006 select NF_DUP_IPV4
987 select NF_DUP_IPV6 if IPV6 1007 select NF_DUP_IPV6 if IP6_NF_IPTABLES
988 ---help--- 1008 ---help---
989 This option adds a "TEE" target with which a packet can be cloned and 1009 This option adds a "TEE" target with which a packet can be cloned and
990 this clone be rerouted to another nexthop. 1010 this clone be rerouted to another nexthop.
@@ -1377,8 +1397,8 @@ config NETFILTER_XT_MATCH_NFACCT
1377 1397
1378config NETFILTER_XT_MATCH_OSF 1398config NETFILTER_XT_MATCH_OSF
1379 tristate '"osf" Passive OS fingerprint match' 1399 tristate '"osf" Passive OS fingerprint match'
1380 depends on NETFILTER_ADVANCED && NETFILTER_NETLINK 1400 depends on NETFILTER_ADVANCED
1381 select NF_OSF 1401 select NETFILTER_NETLINK_OSF
1382 help 1402 help
1383 This option selects the Passive OS Fingerprinting match module 1403 This option selects the Passive OS Fingerprinting match module
1384 that allows to passively match the remote operating system by 1404 that allows to passively match the remote operating system by
@@ -1492,8 +1512,8 @@ config NETFILTER_XT_MATCH_SOCKET
1492 depends on NETFILTER_ADVANCED 1512 depends on NETFILTER_ADVANCED
1493 depends on IPV6 || IPV6=n 1513 depends on IPV6 || IPV6=n
1494 depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n 1514 depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
1495 depends on NF_SOCKET_IPV4 1515 select NF_SOCKET_IPV4
1496 depends on NF_SOCKET_IPV6 1516 select NF_SOCKET_IPV6 if IP6_NF_IPTABLES
1497 select NF_DEFRAG_IPV4 1517 select NF_DEFRAG_IPV4
1498 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n 1518 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
1499 help 1519 help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 44449389e527..16895e045b66 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,7 +1,12 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o 2netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o
3 3
4nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o 4nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o \
5 nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o \
6 nf_conntrack_proto_icmp.o \
7 nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
8
9nf_conntrack-$(subst m,y,$(CONFIG_IPV6)) += nf_conntrack_proto_icmpv6.o
5nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o 10nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
6nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o 11nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
7nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o 12nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
@@ -15,6 +20,7 @@ obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
15obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o 20obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
16obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o 21obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
17obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o 22obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
23obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o
18 24
19# connection tracking 25# connection tracking
20obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o 26obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
@@ -78,7 +84,11 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
78 nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ 84 nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
79 nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o 85 nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o
80 86
87nf_tables_set-objs := nf_tables_set_core.o \
88 nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o
89
81obj-$(CONFIG_NF_TABLES) += nf_tables.o 90obj-$(CONFIG_NF_TABLES) += nf_tables.o
91obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o
82obj-$(CONFIG_NFT_COMPAT) += nft_compat.o 92obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
83obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o 93obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
84obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o 94obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
@@ -91,9 +101,7 @@ obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
91obj-$(CONFIG_NFT_QUOTA) += nft_quota.o 101obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
92obj-$(CONFIG_NFT_REJECT) += nft_reject.o 102obj-$(CONFIG_NFT_REJECT) += nft_reject.o
93obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o 103obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
94obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o 104obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o
95obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o
96obj-$(CONFIG_NFT_SET_BITMAP) += nft_set_bitmap.o
97obj-$(CONFIG_NFT_COUNTER) += nft_counter.o 105obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
98obj-$(CONFIG_NFT_LOG) += nft_log.o 106obj-$(CONFIG_NFT_LOG) += nft_log.o
99obj-$(CONFIG_NFT_MASQ) += nft_masq.o 107obj-$(CONFIG_NFT_MASQ) += nft_masq.o
@@ -102,8 +110,9 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o
102obj-$(CONFIG_NFT_FIB) += nft_fib.o 110obj-$(CONFIG_NFT_FIB) += nft_fib.o
103obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o 111obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o
104obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o 112obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o
105obj-$(CONFIG_NF_OSF) += nf_osf.o
106obj-$(CONFIG_NFT_SOCKET) += nft_socket.o 113obj-$(CONFIG_NFT_SOCKET) += nft_socket.o
114obj-$(CONFIG_NFT_OSF) += nft_osf.o
115obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o
107 116
108# nf_tables netdev 117# nf_tables netdev
109obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o 118obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 168af54db975..dc240cb47ddf 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -603,6 +603,21 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)
603} 603}
604EXPORT_SYMBOL(nf_conntrack_destroy); 604EXPORT_SYMBOL(nf_conntrack_destroy);
605 605
606bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
607 const struct sk_buff *skb)
608{
609 struct nf_ct_hook *ct_hook;
610 bool ret = false;
611
612 rcu_read_lock();
613 ct_hook = rcu_dereference(nf_ct_hook);
614 if (ct_hook)
615 ret = ct_hook->get_tuple_skb(dst_tuple, skb);
616 rcu_read_unlock();
617 return ret;
618}
619EXPORT_SYMBOL(nf_ct_get_tuple_skb);
620
606/* Built-in default zone used e.g. by modules. */ 621/* Built-in default zone used e.g. by modules. */
607const struct nf_conntrack_zone nf_ct_zone_dflt = { 622const struct nf_conntrack_zone nf_ct_zone_dflt = {
608 .id = NF_CT_DEFAULT_ZONE_ID, 623 .id = NF_CT_DEFAULT_ZONE_ID,
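
The core.c hunk above adds nf_ct_get_tuple_skb(), which resolves an skb's conntrack tuple through the RCU-protected nf_ct_hook so callers do not need their own conntrack reference. A minimal sketch of a hypothetical caller follows; example_log_tuple() is not part of the patch and only illustrates the calling convention.

	/* Hypothetical caller, kernel context assumed. */
	static bool example_log_tuple(const struct sk_buff *skb)
	{
		struct nf_conntrack_tuple tuple;

		/* Returns false when no conntrack hook is registered. */
		if (!nf_ct_get_tuple_skb(&tuple, skb))
			return false;

		pr_debug("l4 proto %u\n", tuple.dst.protonum);
		return true;
	}
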
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 05dc1b77e466..cad48d07c818 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -296,10 +296,10 @@ config IP_VS_MH_TAB_INDEX
296 stored in a hash table. This table is assigned by a preference 296 stored in a hash table. This table is assigned by a preference
297 list of the positions to each destination until all slots in 297 list of the positions to each destination until all slots in
298 the table are filled. The index determines the prime for size of 298 the table are filled. The index determines the prime for size of
299 the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749, 299 the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
300 65521 or 131071. When using weights to allow destinations to 300 65521 or 131071. When using weights to allow destinations to
301 receive more connections, the table is assigned an amount 301 receive more connections, the table is assigned an amount
302 proportional to the weights specified. The table needs to be large 302 proportional to the weights specified. The table needs to be large
303 enough to effectively fit all the destinations multiplied by their 303 enough to effectively fit all the destinations multiplied by their
304 respective weights. 304 respective weights.
305 305
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 99e0aa350dc5..5b2b17867cb1 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -825,12 +825,23 @@ static void ip_vs_conn_expire(struct timer_list *t)
825 825
826 /* Unlink conn if not referenced anymore */ 826 /* Unlink conn if not referenced anymore */
827 if (likely(ip_vs_conn_unlink(cp))) { 827 if (likely(ip_vs_conn_unlink(cp))) {
828 struct ip_vs_conn *ct = cp->control;
829
828 /* delete the timer if it is activated by other users */ 830 /* delete the timer if it is activated by other users */
829 del_timer(&cp->timer); 831 del_timer(&cp->timer);
830 832
831 /* does anybody control me? */ 833 /* does anybody control me? */
832 if (cp->control) 834 if (ct) {
833 ip_vs_control_del(cp); 835 ip_vs_control_del(cp);
836 /* Drop CTL or non-assured TPL if not used anymore */
837 if (!cp->timeout && !atomic_read(&ct->n_control) &&
838 (!(ct->flags & IP_VS_CONN_F_TEMPLATE) ||
839 !(ct->state & IP_VS_CTPL_S_ASSURED))) {
840 IP_VS_DBG(4, "drop controlling connection\n");
841 ct->timeout = 0;
842 ip_vs_conn_expire_now(ct);
843 }
844 }
834 845
835 if ((cp->flags & IP_VS_CONN_F_NFCT) && 846 if ((cp->flags & IP_VS_CONN_F_NFCT) &&
836 !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { 847 !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
@@ -872,6 +883,10 @@ static void ip_vs_conn_expire(struct timer_list *t)
872 883
873/* Modify timer, so that it expires as soon as possible. 884/* Modify timer, so that it expires as soon as possible.
874 * Can be called without reference only if under RCU lock. 885 * Can be called without reference only if under RCU lock.
886 * We can have such chain of conns linked with ->control: DATA->CTL->TPL
887 * - DATA (eg. FTP) and TPL (persistence) can be present depending on setup
888 * - cp->timeout=0 indicates all conns from chain should be dropped but
889 * TPL is not dropped if in assured state
875 */ 890 */
876void ip_vs_conn_expire_now(struct ip_vs_conn *cp) 891void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
877{ 892{
@@ -1102,24 +1117,28 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
1102#ifdef CONFIG_IP_VS_IPV6 1117#ifdef CONFIG_IP_VS_IPV6
1103 if (cp->af == AF_INET6) 1118 if (cp->af == AF_INET6)
1104 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " 1119 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
1105 "%s %04X %-11s %7lu%s\n", 1120 "%s %04X %-11s %7u%s\n",
1106 ip_vs_proto_name(cp->protocol), 1121 ip_vs_proto_name(cp->protocol),
1107 &cp->caddr.in6, ntohs(cp->cport), 1122 &cp->caddr.in6, ntohs(cp->cport),
1108 &cp->vaddr.in6, ntohs(cp->vport), 1123 &cp->vaddr.in6, ntohs(cp->vport),
1109 dbuf, ntohs(cp->dport), 1124 dbuf, ntohs(cp->dport),
1110 ip_vs_state_name(cp->protocol, cp->state), 1125 ip_vs_state_name(cp),
1111 (cp->timer.expires-jiffies)/HZ, pe_data); 1126 jiffies_delta_to_msecs(cp->timer.expires -
1127 jiffies) / 1000,
1128 pe_data);
1112 else 1129 else
1113#endif 1130#endif
1114 seq_printf(seq, 1131 seq_printf(seq,
1115 "%-3s %08X %04X %08X %04X" 1132 "%-3s %08X %04X %08X %04X"
1116 " %s %04X %-11s %7lu%s\n", 1133 " %s %04X %-11s %7u%s\n",
1117 ip_vs_proto_name(cp->protocol), 1134 ip_vs_proto_name(cp->protocol),
1118 ntohl(cp->caddr.ip), ntohs(cp->cport), 1135 ntohl(cp->caddr.ip), ntohs(cp->cport),
1119 ntohl(cp->vaddr.ip), ntohs(cp->vport), 1136 ntohl(cp->vaddr.ip), ntohs(cp->vport),
1120 dbuf, ntohs(cp->dport), 1137 dbuf, ntohs(cp->dport),
1121 ip_vs_state_name(cp->protocol, cp->state), 1138 ip_vs_state_name(cp),
1122 (cp->timer.expires-jiffies)/HZ, pe_data); 1139 jiffies_delta_to_msecs(cp->timer.expires -
1140 jiffies) / 1000,
1141 pe_data);
1123 } 1142 }
1124 return 0; 1143 return 0;
1125} 1144}
@@ -1164,26 +1183,28 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
1164#ifdef CONFIG_IP_VS_IPV6 1183#ifdef CONFIG_IP_VS_IPV6
1165 if (cp->af == AF_INET6) 1184 if (cp->af == AF_INET6)
1166 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " 1185 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
1167 "%s %04X %-11s %-6s %7lu\n", 1186 "%s %04X %-11s %-6s %7u\n",
1168 ip_vs_proto_name(cp->protocol), 1187 ip_vs_proto_name(cp->protocol),
1169 &cp->caddr.in6, ntohs(cp->cport), 1188 &cp->caddr.in6, ntohs(cp->cport),
1170 &cp->vaddr.in6, ntohs(cp->vport), 1189 &cp->vaddr.in6, ntohs(cp->vport),
1171 dbuf, ntohs(cp->dport), 1190 dbuf, ntohs(cp->dport),
1172 ip_vs_state_name(cp->protocol, cp->state), 1191 ip_vs_state_name(cp),
1173 ip_vs_origin_name(cp->flags), 1192 ip_vs_origin_name(cp->flags),
1174 (cp->timer.expires-jiffies)/HZ); 1193 jiffies_delta_to_msecs(cp->timer.expires -
1194 jiffies) / 1000);
1175 else 1195 else
1176#endif 1196#endif
1177 seq_printf(seq, 1197 seq_printf(seq,
1178 "%-3s %08X %04X %08X %04X " 1198 "%-3s %08X %04X %08X %04X "
1179 "%s %04X %-11s %-6s %7lu\n", 1199 "%s %04X %-11s %-6s %7u\n",
1180 ip_vs_proto_name(cp->protocol), 1200 ip_vs_proto_name(cp->protocol),
1181 ntohl(cp->caddr.ip), ntohs(cp->cport), 1201 ntohl(cp->caddr.ip), ntohs(cp->cport),
1182 ntohl(cp->vaddr.ip), ntohs(cp->vport), 1202 ntohl(cp->vaddr.ip), ntohs(cp->vport),
1183 dbuf, ntohs(cp->dport), 1203 dbuf, ntohs(cp->dport),
1184 ip_vs_state_name(cp->protocol, cp->state), 1204 ip_vs_state_name(cp),
1185 ip_vs_origin_name(cp->flags), 1205 ip_vs_origin_name(cp->flags),
1186 (cp->timer.expires-jiffies)/HZ); 1206 jiffies_delta_to_msecs(cp->timer.expires -
1207 jiffies) / 1000);
1187 } 1208 }
1188 return 0; 1209 return 0;
1189} 1210}
@@ -1197,8 +1218,11 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
1197#endif 1218#endif
1198 1219
1199 1220
1200/* 1221/* Randomly drop connection entries before running out of memory
1201 * Randomly drop connection entries before running out of memory 1222 * Can be used for DATA and CTL conns. For TPL conns there are exceptions:
1223 * - traffic for services in OPS mode increases ct->in_pkts, so it is supported
1224 * - traffic for services not in OPS mode does not increase ct->in_pkts in
1225 * all cases, so it is not supported
1202 */ 1226 */
1203static inline int todrop_entry(struct ip_vs_conn *cp) 1227static inline int todrop_entry(struct ip_vs_conn *cp)
1204{ 1228{
@@ -1242,7 +1266,7 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
1242void ip_vs_random_dropentry(struct netns_ipvs *ipvs) 1266void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
1243{ 1267{
1244 int idx; 1268 int idx;
1245 struct ip_vs_conn *cp, *cp_c; 1269 struct ip_vs_conn *cp;
1246 1270
1247 rcu_read_lock(); 1271 rcu_read_lock();
1248 /* 1272 /*
@@ -1254,13 +1278,15 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
1254 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 1278 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
1255 if (cp->ipvs != ipvs) 1279 if (cp->ipvs != ipvs)
1256 continue; 1280 continue;
1281 if (atomic_read(&cp->n_control))
1282 continue;
1257 if (cp->flags & IP_VS_CONN_F_TEMPLATE) { 1283 if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
1258 if (atomic_read(&cp->n_control) || 1284 /* connection template of OPS */
1259 !ip_vs_conn_ops_mode(cp)) 1285 if (ip_vs_conn_ops_mode(cp))
1260 continue;
1261 else
1262 /* connection template of OPS */
1263 goto try_drop; 1286 goto try_drop;
1287 if (!(cp->state & IP_VS_CTPL_S_ASSURED))
1288 goto drop;
1289 continue;
1264 } 1290 }
1265 if (cp->protocol == IPPROTO_TCP) { 1291 if (cp->protocol == IPPROTO_TCP) {
1266 switch(cp->state) { 1292 switch(cp->state) {
@@ -1294,15 +1320,10 @@ try_drop:
1294 continue; 1320 continue;
1295 } 1321 }
1296 1322
1297 IP_VS_DBG(4, "del connection\n"); 1323drop:
1324 IP_VS_DBG(4, "drop connection\n");
1325 cp->timeout = 0;
1298 ip_vs_conn_expire_now(cp); 1326 ip_vs_conn_expire_now(cp);
1299 cp_c = cp->control;
1300 /* cp->control is valid only with reference to cp */
1301 if (cp_c && __ip_vs_conn_get(cp)) {
1302 IP_VS_DBG(4, "del conn template\n");
1303 ip_vs_conn_expire_now(cp_c);
1304 __ip_vs_conn_put(cp);
1305 }
1306 } 1327 }
1307 cond_resched_rcu(); 1328 cond_resched_rcu();
1308 } 1329 }
@@ -1325,15 +1346,19 @@ flush_again:
1325 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1346 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
1326 if (cp->ipvs != ipvs) 1347 if (cp->ipvs != ipvs)
1327 continue; 1348 continue;
1328 IP_VS_DBG(4, "del connection\n"); 1349 /* As timers are expired in LIFO order, restart
1329 ip_vs_conn_expire_now(cp); 1350 * the timer of controlling connection first, so
1351 * that it is expired after us.
1352 */
1330 cp_c = cp->control; 1353 cp_c = cp->control;
1331 /* cp->control is valid only with reference to cp */ 1354 /* cp->control is valid only with reference to cp */
1332 if (cp_c && __ip_vs_conn_get(cp)) { 1355 if (cp_c && __ip_vs_conn_get(cp)) {
1333 IP_VS_DBG(4, "del conn template\n"); 1356 IP_VS_DBG(4, "del controlling connection\n");
1334 ip_vs_conn_expire_now(cp_c); 1357 ip_vs_conn_expire_now(cp_c);
1335 __ip_vs_conn_put(cp); 1358 __ip_vs_conn_put(cp);
1336 } 1359 }
1360 IP_VS_DBG(4, "del connection\n");
1361 ip_vs_conn_expire_now(cp);
1337 } 1362 }
1338 cond_resched_rcu(); 1363 cond_resched_rcu();
1339 } 1364 }
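
The /proc seq_show changes above switch the remaining-time column from (expires - jiffies)/HZ to jiffies_delta_to_msecs(...)/1000, so a timer that has already fired prints 0 instead of a huge value from a negative delta. A small userspace model of that conversion is sketched below; it assumes HZ=1000 and is not kernel code.

	#include <stdio.h>

	/* Userspace model of jiffies_delta_to_msecs(); HZ assumed 1000. */
	static unsigned int delta_to_msecs(long delta, long hz)
	{
		if (delta < 0)		/* timer already expired */
			delta = 0;
		return (unsigned int)(delta * 1000 / hz);
	}

	int main(void)
	{
		printf("%7u\n", delta_to_msecs(5000, 1000) / 1000);	/* 5 seconds left */
		printf("%7u\n", delta_to_msecs(-3, 1000) / 1000);	/* expired -> 0   */
		return 0;
	}
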
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 0679dd101e72..7ca926a03b81 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1972,13 +1972,20 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
1972 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { 1972 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1973 /* the destination server is not available */ 1973 /* the destination server is not available */
1974 1974
1975 if (sysctl_expire_nodest_conn(ipvs)) { 1975 __u32 flags = cp->flags;
1976
1977 /* when timer already started, silently drop the packet.*/
1978 if (timer_pending(&cp->timer))
1979 __ip_vs_conn_put(cp);
1980 else
1981 ip_vs_conn_put(cp);
1982
1983 if (sysctl_expire_nodest_conn(ipvs) &&
1984 !(flags & IP_VS_CONN_F_ONE_PACKET)) {
1976 /* try to expire the connection immediately */ 1985 /* try to expire the connection immediately */
1977 ip_vs_conn_expire_now(cp); 1986 ip_vs_conn_expire_now(cp);
1978 } 1987 }
1979 /* don't restart its timer, and silently 1988
1980 drop the packet. */
1981 __ip_vs_conn_put(cp);
1982 return NF_DROP; 1989 return NF_DROP;
1983 } 1990 }
1984 1991
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index dd21782e2f12..62eefea48973 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -134,7 +134,7 @@ static void update_defense_level(struct netns_ipvs *ipvs)
134 } else { 134 } else {
135 atomic_set(&ipvs->dropentry, 0); 135 atomic_set(&ipvs->dropentry, 0);
136 ipvs->sysctl_drop_entry = 1; 136 ipvs->sysctl_drop_entry = 1;
137 }; 137 }
138 break; 138 break;
139 case 3: 139 case 3:
140 atomic_set(&ipvs->dropentry, 1); 140 atomic_set(&ipvs->dropentry, 1);
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
index 0f795b186eb3..94d9d349ebb0 100644
--- a/net/netfilter/ipvs/ip_vs_mh.c
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -5,10 +5,10 @@
5 * 5 *
6 */ 6 */
7 7
8/* The mh algorithm is to assign a preference list of all the lookup 8/* The mh algorithm is to assign a preference list of all the lookup
9 * table positions to each destination and populate the table with 9 * table positions to each destination and populate the table with
10 * the most-preferred position of destinations. Then it is to select 10 * the most-preferred position of destinations. Then it is to select
11 * destination with the hash key of source IP address through looking 11 * destination with the hash key of source IP address through looking
12 * up a the lookup table. 12 * up a the lookup table.
13 * 13 *
14 * The algorithm is detailed in: 14 * The algorithm is detailed in:
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index ca880a3ad033..54ee84adf0bd 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -42,6 +42,11 @@
42 42
43static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; 43static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
44 44
45/* States for conn templates: NONE or words separated with ",", max 15 chars */
46static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = {
47 [IP_VS_CTPL_S_NONE] = "NONE",
48 [IP_VS_CTPL_S_ASSURED] = "ASSURED",
49};
45 50
46/* 51/*
47 * register an ipvs protocol 52 * register an ipvs protocol
@@ -193,12 +198,20 @@ ip_vs_create_timeout_table(int *table, int size)
193} 198}
194 199
195 200
196const char * ip_vs_state_name(__u16 proto, int state) 201const char *ip_vs_state_name(const struct ip_vs_conn *cp)
197{ 202{
198 struct ip_vs_protocol *pp = ip_vs_proto_get(proto); 203 unsigned int state = cp->state;
204 struct ip_vs_protocol *pp;
205
206 if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
199 207
208 if (state >= IP_VS_CTPL_S_LAST)
209 return "ERR!";
210 return ip_vs_ctpl_state_name_table[state] ? : "?";
211 }
212 pp = ip_vs_proto_get(cp->protocol);
200 if (pp == NULL || pp->state_name == NULL) 213 if (pp == NULL || pp->state_name == NULL)
201 return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; 214 return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!";
202 return pp->state_name(state); 215 return pp->state_name(state);
203} 216}
204 217
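
ip_vs_state_name() now takes the connection itself: templates are named from a small bounded table (with "ERR!" for out-of-range states and "?" for unset slots), while ordinary connections still go through the protocol's state_name callback. A standalone model of the bounded lookup, not part of the patch, is sketched below.

	#include <stdio.h>

	enum { CTPL_S_NONE, CTPL_S_ASSURED, CTPL_S_LAST };

	static const char *ctpl_state_names[CTPL_S_LAST] = {
		[CTPL_S_NONE]    = "NONE",
		[CTPL_S_ASSURED] = "ASSURED",
	};

	static const char *ctpl_state_name(unsigned int state)
	{
		if (state >= CTPL_S_LAST)
			return "ERR!";		/* out of range */
		return ctpl_state_names[state] ? ctpl_state_names[state] : "?";
	}

	int main(void)
	{
		printf("%s %s %s\n", ctpl_state_name(0), ctpl_state_name(1),
		       ctpl_state_name(7));	/* NONE ASSURED ERR! */
		return 0;
	}
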
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 3250c4a1111e..b0cd7d08f2a7 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -461,6 +461,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
461 cp->flags &= ~IP_VS_CONN_F_INACTIVE; 461 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
462 } 462 }
463 } 463 }
464 if (next_state == IP_VS_SCTP_S_ESTABLISHED)
465 ip_vs_control_assure_ct(cp);
464 } 466 }
465 if (likely(pd)) 467 if (likely(pd))
466 cp->timeout = pd->timeout_table[cp->state = next_state]; 468 cp->timeout = pd->timeout_table[cp->state = next_state];
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 80d10ad12a15..1770fc6ce960 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -569,6 +569,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
569 cp->flags &= ~IP_VS_CONN_F_INACTIVE; 569 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
570 } 570 }
571 } 571 }
572 if (new_state == IP_VS_TCP_S_ESTABLISHED)
573 ip_vs_control_assure_ct(cp);
572 } 574 }
573 575
574 if (likely(pd)) 576 if (likely(pd))
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index e0ef11c3691e..0f53c49025f8 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -460,6 +460,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction,
460 } 460 }
461 461
462 cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; 462 cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
463 if (direction == IP_VS_DIR_OUTPUT)
464 ip_vs_control_assure_ct(cp);
463} 465}
464 466
465static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) 467static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
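
The SCTP, TCP and UDP hunks above call ip_vs_control_assure_ct() once a connection reaches an established-like state (or sees reply traffic for UDP), marking the controlling persistence template as assured so the new drop logic in ip_vs_conn.c spares it. Below is a toy userspace model of that rule, with names invented for illustration; it is not kernel code.

	#include <stdbool.h>
	#include <stdio.h>

	struct tpl {
		bool assured;
		int n_control;		/* data conns still linked to us */
	};

	/* Called from a protocol state machine on ESTABLISHED/reply. */
	static void assure_ct(struct tpl *t)
	{
		t->assured = true;
	}

	/* Mirrors the new rule: only unused, non-assured templates may go. */
	static bool may_drop(const struct tpl *t)
	{
		return t->n_control == 0 && !t->assured;
	}

	int main(void)
	{
		struct tpl t = { .assured = false, .n_control = 0 };

		printf("droppable before assure: %d\n", may_drop(&t));	/* 1 */
		assure_ct(&t);
		printf("droppable after assure:  %d\n", may_drop(&t));	/* 0 */
		return 0;
	}
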
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 001501e25625..d4020c5e831d 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1003,12 +1003,9 @@ static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer
1003 continue; 1003 continue;
1004 } 1004 }
1005 } else { 1005 } else {
1006 /* protocol in templates is not used for state/timeout */ 1006 if (state >= IP_VS_CTPL_S_LAST)
1007 if (state > 0) { 1007 IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n",
1008 IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", 1008 state);
1009 state);
1010 state = 0;
1011 }
1012 } 1009 }
1013 1010
1014 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, 1011 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
@@ -1166,12 +1163,9 @@ static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *m
1166 goto out; 1163 goto out;
1167 } 1164 }
1168 } else { 1165 } else {
1169 /* protocol in templates is not used for state/timeout */ 1166 if (state >= IP_VS_CTPL_S_LAST)
1170 if (state > 0) { 1167 IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n",
1171 IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", 1168 state);
1172 state);
1173 state = 0;
1174 }
1175 } 1169 }
1176 if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data, 1170 if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data,
1177 pe_data_len, pe_name, pe_name_len)) { 1171 pe_data_len, pe_name, pe_name_len)) {
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index d8383609fe28..02ca7df793f5 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -44,15 +44,19 @@
44 44
45/* we will save the tuples of all connections we care about */ 45/* we will save the tuples of all connections we care about */
46struct nf_conncount_tuple { 46struct nf_conncount_tuple {
47 struct hlist_node node; 47 struct list_head node;
48 struct nf_conntrack_tuple tuple; 48 struct nf_conntrack_tuple tuple;
49 struct nf_conntrack_zone zone; 49 struct nf_conntrack_zone zone;
50 int cpu;
51 u32 jiffies32;
52 struct rcu_head rcu_head;
50}; 53};
51 54
52struct nf_conncount_rb { 55struct nf_conncount_rb {
53 struct rb_node node; 56 struct rb_node node;
54 struct hlist_head hhead; /* connections/hosts in same subnet */ 57 struct nf_conncount_list list;
55 u32 key[MAX_KEYLEN]; 58 u32 key[MAX_KEYLEN];
59 struct rcu_head rcu_head;
56}; 60};
57 61
58static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; 62static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp;
@@ -60,6 +64,10 @@ static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_i
60struct nf_conncount_data { 64struct nf_conncount_data {
61 unsigned int keylen; 65 unsigned int keylen;
62 struct rb_root root[CONNCOUNT_SLOTS]; 66 struct rb_root root[CONNCOUNT_SLOTS];
67 struct net *net;
68 struct work_struct gc_work;
69 unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
70 unsigned int gc_tree;
63}; 71};
64 72
65static u_int32_t conncount_rnd __read_mostly; 73static u_int32_t conncount_rnd __read_mostly;
@@ -80,41 +88,129 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
80 return memcmp(a, b, klen * sizeof(u32)); 88 return memcmp(a, b, klen * sizeof(u32));
81} 89}
82 90
83bool nf_conncount_add(struct hlist_head *head, 91enum nf_conncount_list_add
84 const struct nf_conntrack_tuple *tuple, 92nf_conncount_add(struct nf_conncount_list *list,
85 const struct nf_conntrack_zone *zone) 93 const struct nf_conntrack_tuple *tuple,
94 const struct nf_conntrack_zone *zone)
86{ 95{
87 struct nf_conncount_tuple *conn; 96 struct nf_conncount_tuple *conn;
88 97
98 if (WARN_ON_ONCE(list->count > INT_MAX))
99 return NF_CONNCOUNT_ERR;
100
89 conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); 101 conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
90 if (conn == NULL) 102 if (conn == NULL)
91 return false; 103 return NF_CONNCOUNT_ERR;
104
92 conn->tuple = *tuple; 105 conn->tuple = *tuple;
93 conn->zone = *zone; 106 conn->zone = *zone;
94 hlist_add_head(&conn->node, head); 107 conn->cpu = raw_smp_processor_id();
95 return true; 108 conn->jiffies32 = (u32)jiffies;
109 spin_lock(&list->list_lock);
110 if (list->dead == true) {
111 kmem_cache_free(conncount_conn_cachep, conn);
112 spin_unlock(&list->list_lock);
113 return NF_CONNCOUNT_SKIP;
114 }
115 list_add_tail(&conn->node, &list->head);
116 list->count++;
117 spin_unlock(&list->list_lock);
118 return NF_CONNCOUNT_ADDED;
96} 119}
97EXPORT_SYMBOL_GPL(nf_conncount_add); 120EXPORT_SYMBOL_GPL(nf_conncount_add);
98 121
99unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, 122static void __conn_free(struct rcu_head *h)
100 const struct nf_conntrack_tuple *tuple,
101 const struct nf_conntrack_zone *zone,
102 bool *addit)
103{ 123{
104 const struct nf_conntrack_tuple_hash *found;
105 struct nf_conncount_tuple *conn; 124 struct nf_conncount_tuple *conn;
106 struct hlist_node *n; 125
126 conn = container_of(h, struct nf_conncount_tuple, rcu_head);
127 kmem_cache_free(conncount_conn_cachep, conn);
128}
129
130static bool conn_free(struct nf_conncount_list *list,
131 struct nf_conncount_tuple *conn)
132{
133 bool free_entry = false;
134
135 spin_lock(&list->list_lock);
136
137 if (list->count == 0) {
138 spin_unlock(&list->list_lock);
139 return free_entry;
140 }
141
142 list->count--;
143 list_del_rcu(&conn->node);
144 if (list->count == 0)
145 free_entry = true;
146
147 spin_unlock(&list->list_lock);
148 call_rcu(&conn->rcu_head, __conn_free);
149 return free_entry;
150}
151
152static const struct nf_conntrack_tuple_hash *
153find_or_evict(struct net *net, struct nf_conncount_list *list,
154 struct nf_conncount_tuple *conn, bool *free_entry)
155{
156 const struct nf_conntrack_tuple_hash *found;
157 unsigned long a, b;
158 int cpu = raw_smp_processor_id();
159 __s32 age;
160
161 found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);
162 if (found)
163 return found;
164 b = conn->jiffies32;
165 a = (u32)jiffies;
166
167 /* conn might have been added just before by another cpu and
168 * might still be unconfirmed. In this case, nf_conntrack_find()
169 * returns no result. Thus only evict if this cpu added the
170 * stale entry or if the entry is older than two jiffies.
171 */
172 age = a - b;
173 if (conn->cpu == cpu || age >= 2) {
174 *free_entry = conn_free(list, conn);
175 return ERR_PTR(-ENOENT);
176 }
177
178 return ERR_PTR(-EAGAIN);
179}
180
181void nf_conncount_lookup(struct net *net,
182 struct nf_conncount_list *list,
183 const struct nf_conntrack_tuple *tuple,
184 const struct nf_conntrack_zone *zone,
185 bool *addit)
186{
187 const struct nf_conntrack_tuple_hash *found;
188 struct nf_conncount_tuple *conn, *conn_n;
107 struct nf_conn *found_ct; 189 struct nf_conn *found_ct;
108 unsigned int length = 0; 190 unsigned int collect = 0;
191 bool free_entry = false;
109 192
193 /* best effort only */
110 *addit = tuple ? true : false; 194 *addit = tuple ? true : false;
111 195
112 /* check the saved connections */ 196 /* check the saved connections */
113 hlist_for_each_entry_safe(conn, n, head, node) { 197 list_for_each_entry_safe(conn, conn_n, &list->head, node) {
114 found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); 198 if (collect > CONNCOUNT_GC_MAX_NODES)
115 if (found == NULL) { 199 break;
116 hlist_del(&conn->node); 200
117 kmem_cache_free(conncount_conn_cachep, conn); 201 found = find_or_evict(net, list, conn, &free_entry);
202 if (IS_ERR(found)) {
203 /* Not found, but might be about to be confirmed */
204 if (PTR_ERR(found) == -EAGAIN) {
205 if (!tuple)
206 continue;
207
208 if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
209 nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
210 nf_ct_zone_id(zone, zone->dir))
211 *addit = false;
212 } else if (PTR_ERR(found) == -ENOENT)
213 collect++;
118 continue; 214 continue;
119 } 215 }
120 216
@@ -123,9 +219,10 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
123 if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) && 219 if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) &&
124 nf_ct_zone_equal(found_ct, zone, zone->dir)) { 220 nf_ct_zone_equal(found_ct, zone, zone->dir)) {
125 /* 221 /*
126 * Just to be sure we have it only once in the list.
127 * We should not see tuples twice unless someone hooks 222 * We should not see tuples twice unless someone hooks
128 * this into a table without "-p tcp --syn". 223 * this into a table without "-p tcp --syn".
224 *
225 * Attempt to avoid a re-add in this case.
129 */ 226 */
130 *addit = false; 227 *addit = false;
131 } else if (already_closed(found_ct)) { 228 } else if (already_closed(found_ct)) {
@@ -134,19 +231,75 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
134 * closed already -> ditch it 231 * closed already -> ditch it
135 */ 232 */
136 nf_ct_put(found_ct); 233 nf_ct_put(found_ct);
137 hlist_del(&conn->node); 234 conn_free(list, conn);
138 kmem_cache_free(conncount_conn_cachep, conn); 235 collect++;
139 continue; 236 continue;
140 } 237 }
141 238
142 nf_ct_put(found_ct); 239 nf_ct_put(found_ct);
143 length++;
144 } 240 }
145
146 return length;
147} 241}
148EXPORT_SYMBOL_GPL(nf_conncount_lookup); 242EXPORT_SYMBOL_GPL(nf_conncount_lookup);
149 243
244void nf_conncount_list_init(struct nf_conncount_list *list)
245{
246 spin_lock_init(&list->list_lock);
247 INIT_LIST_HEAD(&list->head);
248 list->count = 1;
249 list->dead = false;
250}
251EXPORT_SYMBOL_GPL(nf_conncount_list_init);
252
253/* Return true if the list is empty */
254bool nf_conncount_gc_list(struct net *net,
255 struct nf_conncount_list *list)
256{
257 const struct nf_conntrack_tuple_hash *found;
258 struct nf_conncount_tuple *conn, *conn_n;
259 struct nf_conn *found_ct;
260 unsigned int collected = 0;
261 bool free_entry = false;
262
263 list_for_each_entry_safe(conn, conn_n, &list->head, node) {
264 found = find_or_evict(net, list, conn, &free_entry);
265 if (IS_ERR(found)) {
266 if (PTR_ERR(found) == -ENOENT) {
267 if (free_entry)
268 return true;
269 collected++;
270 }
271 continue;
272 }
273
274 found_ct = nf_ct_tuplehash_to_ctrack(found);
275 if (already_closed(found_ct)) {
276 /*
277 * we do not care about connections which are
278 * closed already -> ditch it
279 */
280 nf_ct_put(found_ct);
281 if (conn_free(list, conn))
282 return true;
283 collected++;
284 continue;
285 }
286
287 nf_ct_put(found_ct);
288 if (collected > CONNCOUNT_GC_MAX_NODES)
289 return false;
290 }
291 return false;
292}
293EXPORT_SYMBOL_GPL(nf_conncount_gc_list);
294
295static void __tree_nodes_free(struct rcu_head *h)
296{
297 struct nf_conncount_rb *rbconn;
298
299 rbconn = container_of(h, struct nf_conncount_rb, rcu_head);
300 kmem_cache_free(conncount_rb_cachep, rbconn);
301}
302
150static void tree_nodes_free(struct rb_root *root, 303static void tree_nodes_free(struct rb_root *root,
151 struct nf_conncount_rb *gc_nodes[], 304 struct nf_conncount_rb *gc_nodes[],
152 unsigned int gc_count) 305 unsigned int gc_count)
@@ -155,32 +308,46 @@ static void tree_nodes_free(struct rb_root *root,
155 308
156 while (gc_count) { 309 while (gc_count) {
157 rbconn = gc_nodes[--gc_count]; 310 rbconn = gc_nodes[--gc_count];
158 rb_erase(&rbconn->node, root); 311 spin_lock(&rbconn->list.list_lock);
159 kmem_cache_free(conncount_rb_cachep, rbconn); 312 if (rbconn->list.count == 0 && rbconn->list.dead == false) {
313 rbconn->list.dead = true;
314 rb_erase(&rbconn->node, root);
315 call_rcu(&rbconn->rcu_head, __tree_nodes_free);
316 }
317 spin_unlock(&rbconn->list.list_lock);
160 } 318 }
161} 319}
162 320
321static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
322{
323 set_bit(tree, data->pending_trees);
324 schedule_work(&data->gc_work);
325}
326
163static unsigned int 327static unsigned int
164count_tree(struct net *net, struct rb_root *root, 328insert_tree(struct net *net,
165 const u32 *key, u8 keylen, 329 struct nf_conncount_data *data,
166 const struct nf_conntrack_tuple *tuple, 330 struct rb_root *root,
167 const struct nf_conntrack_zone *zone) 331 unsigned int hash,
332 const u32 *key,
333 u8 keylen,
334 const struct nf_conntrack_tuple *tuple,
335 const struct nf_conntrack_zone *zone)
168{ 336{
337 enum nf_conncount_list_add ret;
169 struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; 338 struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
170 struct rb_node **rbnode, *parent; 339 struct rb_node **rbnode, *parent;
171 struct nf_conncount_rb *rbconn; 340 struct nf_conncount_rb *rbconn;
172 struct nf_conncount_tuple *conn; 341 struct nf_conncount_tuple *conn;
173 unsigned int gc_count; 342 unsigned int count = 0, gc_count = 0;
174 bool no_gc = false; 343 bool node_found = false;
344
345 spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
175 346
176 restart:
177 gc_count = 0;
178 parent = NULL; 347 parent = NULL;
179 rbnode = &(root->rb_node); 348 rbnode = &(root->rb_node);
180 while (*rbnode) { 349 while (*rbnode) {
181 int diff; 350 int diff;
182 bool addit;
183
184 rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); 351 rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
185 352
186 parent = *rbnode; 353 parent = *rbnode;
@@ -190,33 +357,30 @@ count_tree(struct net *net, struct rb_root *root,
190 } else if (diff > 0) { 357 } else if (diff > 0) {
191 rbnode = &((*rbnode)->rb_right); 358 rbnode = &((*rbnode)->rb_right);
192 } else { 359 } else {
193 /* same source network -> be counted! */ 360 /* unlikely: other cpu added node already */
194 unsigned int count; 361 node_found = true;
195 362 ret = nf_conncount_add(&rbconn->list, tuple, zone);
196 count = nf_conncount_lookup(net, &rbconn->hhead, tuple, 363 if (ret == NF_CONNCOUNT_ERR) {
197 zone, &addit); 364 count = 0; /* hotdrop */
198 365 } else if (ret == NF_CONNCOUNT_ADDED) {
199 tree_nodes_free(root, gc_nodes, gc_count); 366 count = rbconn->list.count;
200 if (!addit) 367 } else {
201 return count; 368 /* NF_CONNCOUNT_SKIP, rbconn is already
202 369 * reclaimed by gc, insert a new tree node
203 if (!nf_conncount_add(&rbconn->hhead, tuple, zone)) 370 */
204 return 0; /* hotdrop */ 371 node_found = false;
205 372 }
206 return count + 1; 373 break;
207 } 374 }
208 375
209 if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) 376 if (gc_count >= ARRAY_SIZE(gc_nodes))
210 continue; 377 continue;
211 378
212 /* only used for GC on hhead, retval and 'addit' ignored */ 379 if (nf_conncount_gc_list(net, &rbconn->list))
213 nf_conncount_lookup(net, &rbconn->hhead, tuple, zone, &addit);
214 if (hlist_empty(&rbconn->hhead))
215 gc_nodes[gc_count++] = rbconn; 380 gc_nodes[gc_count++] = rbconn;
216 } 381 }
217 382
218 if (gc_count) { 383 if (gc_count) {
219 no_gc = true;
220 tree_nodes_free(root, gc_nodes, gc_count); 384 tree_nodes_free(root, gc_nodes, gc_count);
221 /* tree_node_free before new allocation permits 385 /* tree_node_free before new allocation permits
222 * allocator to re-use newly free'd object. 386 * allocator to re-use newly free'd object.
@@ -224,58 +388,146 @@ count_tree(struct net *net, struct rb_root *root,
224 * This is a rare event; in most cases we will find 388 * This is a rare event; in most cases we will find
225 * existing node to re-use. (or gc_count is 0). 389 * existing node to re-use. (or gc_count is 0).
226 */ 390 */
227 goto restart; 391
392 if (gc_count >= ARRAY_SIZE(gc_nodes))
393 schedule_gc_worker(data, hash);
228 } 394 }
229 395
230 if (!tuple) 396 if (node_found)
231 return 0; 397 goto out_unlock;
232 398
233 /* no match, need to insert new node */ 399 /* expected case: match, insert new node */
234 rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); 400 rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
235 if (rbconn == NULL) 401 if (rbconn == NULL)
236 return 0; 402 goto out_unlock;
237 403
238 conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); 404 conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
239 if (conn == NULL) { 405 if (conn == NULL) {
240 kmem_cache_free(conncount_rb_cachep, rbconn); 406 kmem_cache_free(conncount_rb_cachep, rbconn);
241 return 0; 407 goto out_unlock;
242 } 408 }
243 409
244 conn->tuple = *tuple; 410 conn->tuple = *tuple;
245 conn->zone = *zone; 411 conn->zone = *zone;
246 memcpy(rbconn->key, key, sizeof(u32) * keylen); 412 memcpy(rbconn->key, key, sizeof(u32) * keylen);
247 413
248 INIT_HLIST_HEAD(&rbconn->hhead); 414 nf_conncount_list_init(&rbconn->list);
249 hlist_add_head(&conn->node, &rbconn->hhead); 415 list_add(&conn->node, &rbconn->list.head);
416 count = 1;
250 417
251 rb_link_node(&rbconn->node, parent, rbnode); 418 rb_link_node(&rbconn->node, parent, rbnode);
252 rb_insert_color(&rbconn->node, root); 419 rb_insert_color(&rbconn->node, root);
253 return 1; 420out_unlock:
421 spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
422 return count;
254} 423}
255 424
256/* Count and return number of conntrack entries in 'net' with particular 'key'. 425static unsigned int
257 * If 'tuple' is not null, insert it into the accounting data structure. 426count_tree(struct net *net,
258 */ 427 struct nf_conncount_data *data,
259unsigned int nf_conncount_count(struct net *net, 428 const u32 *key,
260 struct nf_conncount_data *data, 429 const struct nf_conntrack_tuple *tuple,
261 const u32 *key, 430 const struct nf_conntrack_zone *zone)
262 const struct nf_conntrack_tuple *tuple,
263 const struct nf_conntrack_zone *zone)
264{ 431{
432 enum nf_conncount_list_add ret;
265 struct rb_root *root; 433 struct rb_root *root;
266 int count; 434 struct rb_node *parent;
267 u32 hash; 435 struct nf_conncount_rb *rbconn;
436 unsigned int hash;
437 u8 keylen = data->keylen;
268 438
269 hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; 439 hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
270 root = &data->root[hash]; 440 root = &data->root[hash];
271 441
272 spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); 442 parent = rcu_dereference_raw(root->rb_node);
443 while (parent) {
444 int diff;
445 bool addit;
273 446
274 count = count_tree(net, root, key, data->keylen, tuple, zone); 447 rbconn = rb_entry(parent, struct nf_conncount_rb, node);
275 448
276 spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); 449 diff = key_diff(key, rbconn->key, keylen);
450 if (diff < 0) {
451 parent = rcu_dereference_raw(parent->rb_left);
452 } else if (diff > 0) {
453 parent = rcu_dereference_raw(parent->rb_right);
454 } else {
455 /* same source network -> be counted! */
456 nf_conncount_lookup(net, &rbconn->list, tuple, zone,
457 &addit);
277 458
278 return count; 459 if (!addit)
460 return rbconn->list.count;
461
462 ret = nf_conncount_add(&rbconn->list, tuple, zone);
463 if (ret == NF_CONNCOUNT_ERR) {
464 return 0; /* hotdrop */
465 } else if (ret == NF_CONNCOUNT_ADDED) {
466 return rbconn->list.count;
467 } else {
468 /* NF_CONNCOUNT_SKIP, rbconn is already
469 * reclaimed by gc, insert a new tree node
470 */
471 break;
472 }
473 }
474 }
475
476 if (!tuple)
477 return 0;
478
479 return insert_tree(net, data, root, hash, key, keylen, tuple, zone);
480}
481
482static void tree_gc_worker(struct work_struct *work)
483{
484 struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
485 struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
486 struct rb_root *root;
487 struct rb_node *node;
488 unsigned int tree, next_tree, gc_count = 0;
489
490 tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS;
491 root = &data->root[tree];
492
493 rcu_read_lock();
494 for (node = rb_first(root); node != NULL; node = rb_next(node)) {
495 rbconn = rb_entry(node, struct nf_conncount_rb, node);
496 if (nf_conncount_gc_list(data->net, &rbconn->list))
497 gc_nodes[gc_count++] = rbconn;
498 }
499 rcu_read_unlock();
500
501 spin_lock_bh(&nf_conncount_locks[tree]);
502
503 if (gc_count) {
504 tree_nodes_free(root, gc_nodes, gc_count);
505 }
506
507 clear_bit(tree, data->pending_trees);
508
509 next_tree = (tree + 1) % CONNCOUNT_SLOTS;
510 next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS);
511
512 if (next_tree < CONNCOUNT_SLOTS) {
513 data->gc_tree = next_tree;
514 schedule_work(work);
515 }
516
517 spin_unlock_bh(&nf_conncount_locks[tree]);
518}
519
520/* Count and return number of conntrack entries in 'net' with particular 'key'.
521 * If 'tuple' is not null, insert it into the accounting data structure.
522 * Call with RCU read lock.
523 */
524unsigned int nf_conncount_count(struct net *net,
525 struct nf_conncount_data *data,
526 const u32 *key,
527 const struct nf_conntrack_tuple *tuple,
528 const struct nf_conntrack_zone *zone)
529{
530 return count_tree(net, data, key, tuple, zone);
279} 531}
280EXPORT_SYMBOL_GPL(nf_conncount_count); 532EXPORT_SYMBOL_GPL(nf_conncount_count);
281 533
@@ -306,17 +558,18 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
306 data->root[i] = RB_ROOT; 558 data->root[i] = RB_ROOT;
307 559
308 data->keylen = keylen / sizeof(u32); 560 data->keylen = keylen / sizeof(u32);
561 data->net = net;
562 INIT_WORK(&data->gc_work, tree_gc_worker);
309 563
310 return data; 564 return data;
311} 565}
312EXPORT_SYMBOL_GPL(nf_conncount_init); 566EXPORT_SYMBOL_GPL(nf_conncount_init);
313 567
314void nf_conncount_cache_free(struct hlist_head *hhead) 568void nf_conncount_cache_free(struct nf_conncount_list *list)
315{ 569{
316 struct nf_conncount_tuple *conn; 570 struct nf_conncount_tuple *conn, *conn_n;
317 struct hlist_node *n;
318 571
319 hlist_for_each_entry_safe(conn, n, hhead, node) 572 list_for_each_entry_safe(conn, conn_n, &list->head, node)
320 kmem_cache_free(conncount_conn_cachep, conn); 573 kmem_cache_free(conncount_conn_cachep, conn);
321} 574}
322EXPORT_SYMBOL_GPL(nf_conncount_cache_free); 575EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
@@ -331,7 +584,7 @@ static void destroy_tree(struct rb_root *r)
331 584
332 rb_erase(node, r); 585 rb_erase(node, r);
333 586
334 nf_conncount_cache_free(&rbconn->hhead); 587 nf_conncount_cache_free(&rbconn->list);
335 588
336 kmem_cache_free(conncount_rb_cachep, rbconn); 589 kmem_cache_free(conncount_rb_cachep, rbconn);
337 } 590 }
@@ -342,6 +595,7 @@ void nf_conncount_destroy(struct net *net, unsigned int family,
342{ 595{
343 unsigned int i; 596 unsigned int i;
344 597
598 cancel_work_sync(&data->gc_work);
345 nf_ct_netns_put(net, family); 599 nf_ct_netns_put(net, family);
346 600
347 for (i = 0; i < ARRAY_SIZE(data->root); ++i) 601 for (i = 0; i < ARRAY_SIZE(data->root); ++i)
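
The nf_conncount rewrite above replaces the hlist under one big per-slot spinlock with an RCU-searchable rbtree whose per-node lists carry their own lock, plus a deferred GC work item. find_or_evict() only removes a stale entry when this CPU added it or the entry is at least two jiffies old, so entries racing with not-yet-confirmed conntracks are left alone. A userspace model of that wrap-safe age test is sketched below; it is illustrative only.

	#include <stdint.h>
	#include <stdio.h>

	static int may_evict(uint32_t now, uint32_t added,
			     int entry_cpu, int this_cpu)
	{
		int32_t age = (int32_t)(now - added);	/* wrap-safe delta */

		return entry_cpu == this_cpu || age >= 2;
	}

	int main(void)
	{
		/* Entry just added by another CPU: keep it for now. */
		printf("%d\n", may_evict(100, 100, 1, 0));		/* 0 */
		/* Same entry two jiffies later: safe to evict. */
		printf("%d\n", may_evict(102, 100, 1, 0));		/* 1 */
		/* Wraparound near UINT32_MAX still yields a small age. */
		printf("%d\n", may_evict(1, 0xfffffffeu, 1, 0));	/* 1 */
		return 0;
	}
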
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index a1086bdec242..5423b197d98a 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -32,7 +32,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
32 __be32 mask = 0; 32 __be32 mask = 0;
33 33
34 /* we're only interested in locally generated packets */ 34 /* we're only interested in locally generated packets */
35 if (skb->sk == NULL) 35 if (skb->sk == NULL || !net_eq(nf_ct_net(ct), sock_net(skb->sk)))
36 goto out; 36 goto out;
37 if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) 37 if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
38 goto out; 38 goto out;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3465da2a98bd..a676d5f76bdc 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -37,7 +37,6 @@
37#include <linux/rculist_nulls.h> 37#include <linux/rculist_nulls.h>
38 38
39#include <net/netfilter/nf_conntrack.h> 39#include <net/netfilter/nf_conntrack.h>
40#include <net/netfilter/nf_conntrack_l3proto.h>
41#include <net/netfilter/nf_conntrack_l4proto.h> 40#include <net/netfilter/nf_conntrack_l4proto.h>
42#include <net/netfilter/nf_conntrack_expect.h> 41#include <net/netfilter/nf_conntrack_expect.h>
43#include <net/netfilter/nf_conntrack_helper.h> 42#include <net/netfilter/nf_conntrack_helper.h>
@@ -55,6 +54,7 @@
55#include <net/netfilter/nf_nat_core.h> 54#include <net/netfilter/nf_nat_core.h>
56#include <net/netfilter/nf_nat_helper.h> 55#include <net/netfilter/nf_nat_helper.h>
57#include <net/netns/hash.h> 56#include <net/netns/hash.h>
57#include <net/ip.h>
58 58
59#include "nf_internals.h" 59#include "nf_internals.h"
60 60
@@ -222,7 +222,7 @@ static u32 hash_conntrack(const struct net *net,
222 return scale_hash(hash_conntrack_raw(tuple, net)); 222 return scale_hash(hash_conntrack_raw(tuple, net));
223} 223}
224 224
225bool 225static bool
226nf_ct_get_tuple(const struct sk_buff *skb, 226nf_ct_get_tuple(const struct sk_buff *skb,
227 unsigned int nhoff, 227 unsigned int nhoff,
228 unsigned int dataoff, 228 unsigned int dataoff,
@@ -230,37 +230,151 @@ nf_ct_get_tuple(const struct sk_buff *skb,
230 u_int8_t protonum, 230 u_int8_t protonum,
231 struct net *net, 231 struct net *net,
232 struct nf_conntrack_tuple *tuple, 232 struct nf_conntrack_tuple *tuple,
233 const struct nf_conntrack_l3proto *l3proto,
234 const struct nf_conntrack_l4proto *l4proto) 233 const struct nf_conntrack_l4proto *l4proto)
235{ 234{
235 unsigned int size;
236 const __be32 *ap;
237 __be32 _addrs[8];
238 struct {
239 __be16 sport;
240 __be16 dport;
241 } _inet_hdr, *inet_hdr;
242
236 memset(tuple, 0, sizeof(*tuple)); 243 memset(tuple, 0, sizeof(*tuple));
237 244
238 tuple->src.l3num = l3num; 245 tuple->src.l3num = l3num;
239 if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) 246 switch (l3num) {
247 case NFPROTO_IPV4:
248 nhoff += offsetof(struct iphdr, saddr);
249 size = 2 * sizeof(__be32);
250 break;
251 case NFPROTO_IPV6:
252 nhoff += offsetof(struct ipv6hdr, saddr);
253 size = sizeof(_addrs);
254 break;
255 default:
256 return true;
257 }
258
259 ap = skb_header_pointer(skb, nhoff, size, _addrs);
260 if (!ap)
240 return false; 261 return false;
241 262
263 switch (l3num) {
264 case NFPROTO_IPV4:
265 tuple->src.u3.ip = ap[0];
266 tuple->dst.u3.ip = ap[1];
267 break;
268 case NFPROTO_IPV6:
269 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
270 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
271 break;
272 }
273
242 tuple->dst.protonum = protonum; 274 tuple->dst.protonum = protonum;
243 tuple->dst.dir = IP_CT_DIR_ORIGINAL; 275 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
244 276
245 return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); 277 if (unlikely(l4proto->pkt_to_tuple))
278 return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
279
280 /* Actually only need first 4 bytes to get ports. */
281 inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
282 if (!inet_hdr)
283 return false;
284
285 tuple->src.u.udp.port = inet_hdr->sport;
286 tuple->dst.u.udp.port = inet_hdr->dport;
287 return true;
288}
289
290static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
291 u_int8_t *protonum)
292{
293 int dataoff = -1;
294 const struct iphdr *iph;
295 struct iphdr _iph;
296
297 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
298 if (!iph)
299 return -1;
300
301 /* Conntrack defragments packets, we might still see fragments
302 * inside ICMP packets though.
303 */
304 if (iph->frag_off & htons(IP_OFFSET))
305 return -1;
306
307 dataoff = nhoff + (iph->ihl << 2);
308 *protonum = iph->protocol;
309
310 /* Check bogus IP headers */
311 if (dataoff > skb->len) {
312 pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
313 nhoff, iph->ihl << 2, skb->len);
314 return -1;
315 }
316 return dataoff;
317}
318
319#if IS_ENABLED(CONFIG_IPV6)
320static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
321 u8 *protonum)
322{
323 int protoff = -1;
324 unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
325 __be16 frag_off;
326 u8 nexthdr;
327
328 if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
329 &nexthdr, sizeof(nexthdr)) != 0) {
330 pr_debug("can't get nexthdr\n");
331 return -1;
332 }
333 protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
334 /*
335 * (protoff == skb->len) means the packet has not data, just
336 * IPv6 and possibly extensions headers, but it is tracked anyway
337 */
338 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
339 pr_debug("can't find proto in pkt\n");
340 return -1;
341 }
342
343 *protonum = nexthdr;
344 return protoff;
345}
346#endif
347
348static int get_l4proto(const struct sk_buff *skb,
349 unsigned int nhoff, u8 pf, u8 *l4num)
350{
351 switch (pf) {
352 case NFPROTO_IPV4:
353 return ipv4_get_l4proto(skb, nhoff, l4num);
354#if IS_ENABLED(CONFIG_IPV6)
355 case NFPROTO_IPV6:
356 return ipv6_get_l4proto(skb, nhoff, l4num);
357#endif
358 default:
359 *l4num = 0;
360 break;
361 }
362 return -1;
246} 363}
247EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
248 364
249bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, 365bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
250 u_int16_t l3num, 366 u_int16_t l3num,
251 struct net *net, struct nf_conntrack_tuple *tuple) 367 struct net *net, struct nf_conntrack_tuple *tuple)
252{ 368{
253 const struct nf_conntrack_l3proto *l3proto;
254 const struct nf_conntrack_l4proto *l4proto; 369 const struct nf_conntrack_l4proto *l4proto;
255 unsigned int protoff; 370 u8 protonum;
256 u_int8_t protonum; 371 int protoff;
257 int ret; 372 int ret;
258 373
259 rcu_read_lock(); 374 rcu_read_lock();
260 375
261 l3proto = __nf_ct_l3proto_find(l3num); 376 protoff = get_l4proto(skb, nhoff, l3num, &protonum);
262 ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum); 377 if (protoff <= 0) {
263 if (ret != NF_ACCEPT) {
264 rcu_read_unlock(); 378 rcu_read_unlock();
265 return false; 379 return false;
266 } 380 }
@@ -268,7 +382,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
268 l4proto = __nf_ct_l4proto_find(l3num, protonum); 382 l4proto = __nf_ct_l4proto_find(l3num, protonum);
269 383
270 ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, 384 ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
271 l3proto, l4proto); 385 l4proto);
272 386
273 rcu_read_unlock(); 387 rcu_read_unlock();
274 return ret; 388 return ret;
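
With the l3proto indirection removed, nf_ct_get_tuple() reads the IPv4/IPv6 addresses directly via skb_header_pointer(), and for port-based protocols it takes the source and destination ports from the first four bytes of the transport header. The userspace sketch below models that four-byte port extraction; it assumes a plain byte buffer in place of an skb and is not part of the patch.

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* TCP/UDP/SCTP/DCCP headers all start with sport, dport. */
	struct ports { uint16_t sport, dport; };

	static struct ports get_ports(const uint8_t *l4hdr)
	{
		struct ports p;

		memcpy(&p, l4hdr, sizeof(p));	/* like skb_header_pointer() */
		p.sport = ntohs(p.sport);
		p.dport = ntohs(p.dport);
		return p;
	}

	int main(void)
	{
		/* 0xc350 = 50000, 0x01bb = 443 */
		const uint8_t hdr[] = { 0xc3, 0x50, 0x01, 0xbb };
		struct ports p = get_ports(hdr);

		printf("sport %u dport %u\n", p.sport, p.dport);
		return 0;
	}
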
@@ -278,19 +392,35 @@ EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
278bool 392bool
279nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, 393nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
280 const struct nf_conntrack_tuple *orig, 394 const struct nf_conntrack_tuple *orig,
281 const struct nf_conntrack_l3proto *l3proto,
282 const struct nf_conntrack_l4proto *l4proto) 395 const struct nf_conntrack_l4proto *l4proto)
283{ 396{
284 memset(inverse, 0, sizeof(*inverse)); 397 memset(inverse, 0, sizeof(*inverse));
285 398
286 inverse->src.l3num = orig->src.l3num; 399 inverse->src.l3num = orig->src.l3num;
287 if (l3proto->invert_tuple(inverse, orig) == 0) 400
288 return false; 401 switch (orig->src.l3num) {
402 case NFPROTO_IPV4:
403 inverse->src.u3.ip = orig->dst.u3.ip;
404 inverse->dst.u3.ip = orig->src.u3.ip;
405 break;
406 case NFPROTO_IPV6:
407 inverse->src.u3.in6 = orig->dst.u3.in6;
408 inverse->dst.u3.in6 = orig->src.u3.in6;
409 break;
410 default:
411 break;
412 }
289 413
290 inverse->dst.dir = !orig->dst.dir; 414 inverse->dst.dir = !orig->dst.dir;
291 415
292 inverse->dst.protonum = orig->dst.protonum; 416 inverse->dst.protonum = orig->dst.protonum;
293 return l4proto->invert_tuple(inverse, orig); 417
418 if (unlikely(l4proto->invert_tuple))
419 return l4proto->invert_tuple(inverse, orig);
420
421 inverse->src.u.all = orig->dst.u.all;
422 inverse->dst.u.all = orig->src.u.all;
423 return true;
294} 424}
295EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); 425EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
296 426
@@ -502,6 +632,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
502 net_eq(net, nf_ct_net(ct)); 632 net_eq(net, nf_ct_net(ct));
503} 633}
504 634
635static inline bool
636nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
637{
638 return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
639 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
640 nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
641 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
642 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
643 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
644 net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
645}
646
505/* caller must hold rcu readlock and none of the nf_conntrack_locks */ 647/* caller must hold rcu readlock and none of the nf_conntrack_locks */
506static void nf_ct_gc_expired(struct nf_conn *ct) 648static void nf_ct_gc_expired(struct nf_conn *ct)
507{ 649{
@@ -695,19 +837,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
695 /* This is the conntrack entry already in hashes that won race. */ 837 /* This is the conntrack entry already in hashes that won race. */
696 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 838 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
697 const struct nf_conntrack_l4proto *l4proto; 839 const struct nf_conntrack_l4proto *l4proto;
840 enum ip_conntrack_info oldinfo;
841 struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
698 842
699 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 843 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
700 if (l4proto->allow_clash && 844 if (l4proto->allow_clash &&
701 ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
702 !nf_ct_is_dying(ct) && 845 !nf_ct_is_dying(ct) &&
703 atomic_inc_not_zero(&ct->ct_general.use)) { 846 atomic_inc_not_zero(&ct->ct_general.use)) {
704 enum ip_conntrack_info oldinfo; 847 if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
705 struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); 848 nf_ct_match(ct, loser_ct)) {
706 849 nf_ct_acct_merge(ct, ctinfo, loser_ct);
707 nf_ct_acct_merge(ct, ctinfo, loser_ct); 850 nf_conntrack_put(&loser_ct->ct_general);
708 nf_conntrack_put(&loser_ct->ct_general); 851 nf_ct_set(skb, ct, oldinfo);
709 nf_ct_set(skb, ct, oldinfo); 852 return NF_ACCEPT;
710 return NF_ACCEPT; 853 }
854 nf_ct_put(ct);
711 } 855 }
712 NF_CT_STAT_INC(net, drop); 856 NF_CT_STAT_INC(net, drop);
713 return NF_DROP; 857 return NF_DROP;
@@ -1195,7 +1339,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free);
1195static noinline struct nf_conntrack_tuple_hash * 1339static noinline struct nf_conntrack_tuple_hash *
1196init_conntrack(struct net *net, struct nf_conn *tmpl, 1340init_conntrack(struct net *net, struct nf_conn *tmpl,
1197 const struct nf_conntrack_tuple *tuple, 1341 const struct nf_conntrack_tuple *tuple,
1198 const struct nf_conntrack_l3proto *l3proto,
1199 const struct nf_conntrack_l4proto *l4proto, 1342 const struct nf_conntrack_l4proto *l4proto,
1200 struct sk_buff *skb, 1343 struct sk_buff *skb,
1201 unsigned int dataoff, u32 hash) 1344 unsigned int dataoff, u32 hash)
@@ -1208,9 +1351,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
1208 const struct nf_conntrack_zone *zone; 1351 const struct nf_conntrack_zone *zone;
1209 struct nf_conn_timeout *timeout_ext; 1352 struct nf_conn_timeout *timeout_ext;
1210 struct nf_conntrack_zone tmp; 1353 struct nf_conntrack_zone tmp;
1211 unsigned int *timeouts;
1212 1354
1213 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { 1355 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) {
1214 pr_debug("Can't invert tuple.\n"); 1356 pr_debug("Can't invert tuple.\n");
1215 return NULL; 1357 return NULL;
1216 } 1358 }
@@ -1227,15 +1369,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
1227 } 1369 }
1228 1370
1229 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; 1371 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
1230 if (timeout_ext) {
1231 timeouts = nf_ct_timeout_data(timeout_ext);
1232 if (unlikely(!timeouts))
1233 timeouts = l4proto->get_timeouts(net);
1234 } else {
1235 timeouts = l4proto->get_timeouts(net);
1236 }
1237 1372
1238 if (!l4proto->new(ct, skb, dataoff, timeouts)) { 1373 if (!l4proto->new(ct, skb, dataoff)) {
1239 nf_conntrack_free(ct); 1374 nf_conntrack_free(ct);
1240 pr_debug("can't track with proto module\n"); 1375 pr_debug("can't track with proto module\n");
1241 return NULL; 1376 return NULL;
@@ -1266,8 +1401,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
1266 /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ 1401 /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
1267 ct->master = exp->master; 1402 ct->master = exp->master;
1268 if (exp->helper) { 1403 if (exp->helper) {
1269 help = nf_ct_helper_ext_add(ct, exp->helper, 1404 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
1270 GFP_ATOMIC);
1271 if (help) 1405 if (help)
1272 rcu_assign_pointer(help->helper, exp->helper); 1406 rcu_assign_pointer(help->helper, exp->helper);
1273 } 1407 }
@@ -1307,7 +1441,6 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1307 unsigned int dataoff, 1441 unsigned int dataoff,
1308 u_int16_t l3num, 1442 u_int16_t l3num,
1309 u_int8_t protonum, 1443 u_int8_t protonum,
1310 const struct nf_conntrack_l3proto *l3proto,
1311 const struct nf_conntrack_l4proto *l4proto) 1444 const struct nf_conntrack_l4proto *l4proto)
1312{ 1445{
1313 const struct nf_conntrack_zone *zone; 1446 const struct nf_conntrack_zone *zone;
@@ -1319,8 +1452,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1319 u32 hash; 1452 u32 hash;
1320 1453
1321 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), 1454 if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1322 dataoff, l3num, protonum, net, &tuple, l3proto, 1455 dataoff, l3num, protonum, net, &tuple, l4proto)) {
1323 l4proto)) {
1324 pr_debug("Can't get tuple\n"); 1456 pr_debug("Can't get tuple\n");
1325 return 0; 1457 return 0;
1326 } 1458 }
@@ -1330,7 +1462,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1330 hash = hash_conntrack_raw(&tuple, net); 1462 hash = hash_conntrack_raw(&tuple, net);
1331 h = __nf_conntrack_find_get(net, zone, &tuple, hash); 1463 h = __nf_conntrack_find_get(net, zone, &tuple, hash);
1332 if (!h) { 1464 if (!h) {
1333 h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, 1465 h = init_conntrack(net, tmpl, &tuple, l4proto,
1334 skb, dataoff, hash); 1466 skb, dataoff, hash);
1335 if (!h) 1467 if (!h)
1336 return 0; 1468 return 0;
@@ -1363,14 +1495,11 @@ unsigned int
1363nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, 1495nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1364 struct sk_buff *skb) 1496 struct sk_buff *skb)
1365{ 1497{
1366 const struct nf_conntrack_l3proto *l3proto;
1367 const struct nf_conntrack_l4proto *l4proto; 1498 const struct nf_conntrack_l4proto *l4proto;
1368 struct nf_conn *ct, *tmpl; 1499 struct nf_conn *ct, *tmpl;
1369 enum ip_conntrack_info ctinfo; 1500 enum ip_conntrack_info ctinfo;
1370 unsigned int *timeouts;
1371 unsigned int dataoff;
1372 u_int8_t protonum; 1501 u_int8_t protonum;
1373 int ret; 1502 int dataoff, ret;
1374 1503
1375 tmpl = nf_ct_get(skb, &ctinfo); 1504 tmpl = nf_ct_get(skb, &ctinfo);
1376 if (tmpl || ctinfo == IP_CT_UNTRACKED) { 1505 if (tmpl || ctinfo == IP_CT_UNTRACKED) {
@@ -1384,14 +1513,12 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1384 } 1513 }
1385 1514
1386 /* rcu_read_lock()ed by nf_hook_thresh */ 1515 /* rcu_read_lock()ed by nf_hook_thresh */
1387 l3proto = __nf_ct_l3proto_find(pf); 1516 dataoff = get_l4proto(skb, skb_network_offset(skb), pf, &protonum);
1388 ret = l3proto->get_l4proto(skb, skb_network_offset(skb), 1517 if (dataoff <= 0) {
1389 &dataoff, &protonum);
1390 if (ret <= 0) {
1391 pr_debug("not prepared to track yet or error occurred\n"); 1518 pr_debug("not prepared to track yet or error occurred\n");
1392 NF_CT_STAT_INC_ATOMIC(net, error); 1519 NF_CT_STAT_INC_ATOMIC(net, error);
1393 NF_CT_STAT_INC_ATOMIC(net, invalid); 1520 NF_CT_STAT_INC_ATOMIC(net, invalid);
1394 ret = -ret; 1521 ret = NF_ACCEPT;
1395 goto out; 1522 goto out;
1396 } 1523 }
1397 1524
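nf_conntrack_in() now obtains the transport header offset from a single get_l4proto() helper that returns the offset directly, with values <= 0 meaning the packet cannot be tracked, instead of going through the per-family l3proto callback. A userspace-only sketch of the IPv4 half of such a helper, operating on a plain byte buffer rather than an skb:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Return the offset of the transport header within buf and report the
 * protocol number, or -1 when the header is too short or malformed.
 * IPv6 extension headers and fragment handling are left out. */
static int get_l4proto_v4(const uint8_t *buf, size_t len, uint8_t *protonum)
{
    if (len < 20)
        return -1;

    size_t ihl = (buf[0] & 0x0f) * 4;   /* header length in bytes */
    if (ihl < 20 || ihl > len)
        return -1;

    *protonum = buf[9];                 /* IPv4 protocol field */
    return (int)ihl;
}

int main(void)
{
    /* 20-byte IPv4 header carrying TCP (protocol 6). */
    uint8_t hdr[20] = { 0x45, 0, 0, 20, 0, 0, 0, 0, 64, 6 };
    uint8_t proto;
    int off = get_l4proto_v4(hdr, sizeof(hdr), &proto);

    printf("dataoff=%d proto=%d\n", off, proto);
    return 0;
}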
@@ -1413,8 +1540,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1413 goto out; 1540 goto out;
1414 } 1541 }
1415repeat: 1542repeat:
1416 ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, 1543 ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l4proto);
1417 l3proto, l4proto);
1418 if (ret < 0) { 1544 if (ret < 0) {
1419 /* Too stressed to deal. */ 1545 /* Too stressed to deal. */
1420 NF_CT_STAT_INC_ATOMIC(net, drop); 1546 NF_CT_STAT_INC_ATOMIC(net, drop);
@@ -1430,10 +1556,7 @@ repeat:
1430 goto out; 1556 goto out;
1431 } 1557 }
1432 1558
1433 /* Decide what timeout policy we want to apply to this flow. */ 1559 ret = l4proto->packet(ct, skb, dataoff, ctinfo);
1434 timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
1435
1436 ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts);
1437 if (ret <= 0) { 1560 if (ret <= 0) {
1438 /* Invalid: inverse of the return code tells 1561 /* Invalid: inverse of the return code tells
1439 * the netfilter core what to do */ 1562 * the netfilter core what to do */
@@ -1471,7 +1594,6 @@ bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1471 1594
1472 rcu_read_lock(); 1595 rcu_read_lock();
1473 ret = nf_ct_invert_tuple(inverse, orig, 1596 ret = nf_ct_invert_tuple(inverse, orig,
1474 __nf_ct_l3proto_find(orig->src.l3num),
1475 __nf_ct_l4proto_find(orig->src.l3num, 1597 __nf_ct_l4proto_find(orig->src.l3num,
1476 orig->dst.protonum)); 1598 orig->dst.protonum));
1477 rcu_read_unlock(); 1599 rcu_read_unlock();
@@ -1609,14 +1731,14 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
1609 1731
1610static int nf_conntrack_update(struct net *net, struct sk_buff *skb) 1732static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
1611{ 1733{
1612 const struct nf_conntrack_l3proto *l3proto;
1613 const struct nf_conntrack_l4proto *l4proto; 1734 const struct nf_conntrack_l4proto *l4proto;
1614 struct nf_conntrack_tuple_hash *h; 1735 struct nf_conntrack_tuple_hash *h;
1615 struct nf_conntrack_tuple tuple; 1736 struct nf_conntrack_tuple tuple;
1616 enum ip_conntrack_info ctinfo; 1737 enum ip_conntrack_info ctinfo;
1617 struct nf_nat_hook *nat_hook; 1738 struct nf_nat_hook *nat_hook;
1618 unsigned int dataoff, status; 1739 unsigned int status;
1619 struct nf_conn *ct; 1740 struct nf_conn *ct;
1741 int dataoff;
1620 u16 l3num; 1742 u16 l3num;
1621 u8 l4num; 1743 u8 l4num;
1622 1744
@@ -1625,16 +1747,15 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
1625 return 0; 1747 return 0;
1626 1748
1627 l3num = nf_ct_l3num(ct); 1749 l3num = nf_ct_l3num(ct);
1628 l3proto = nf_ct_l3proto_find_get(l3num);
1629 1750
1630 if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, 1751 dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
1631 &l4num) <= 0) 1752 if (dataoff <= 0)
1632 return -1; 1753 return -1;
1633 1754
1634 l4proto = nf_ct_l4proto_find_get(l3num, l4num); 1755 l4proto = nf_ct_l4proto_find_get(l3num, l4num);
1635 1756
1636 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 1757 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
1637 l4num, net, &tuple, l3proto, l4proto)) 1758 l4num, net, &tuple, l4proto))
1638 return -1; 1759 return -1;
1639 1760
1640 if (ct->status & IPS_SRC_NAT) { 1761 if (ct->status & IPS_SRC_NAT) {
@@ -1683,6 +1804,41 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
1683 return 0; 1804 return 0;
1684} 1805}
1685 1806
1807static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
1808 const struct sk_buff *skb)
1809{
1810 const struct nf_conntrack_tuple *src_tuple;
1811 const struct nf_conntrack_tuple_hash *hash;
1812 struct nf_conntrack_tuple srctuple;
1813 enum ip_conntrack_info ctinfo;
1814 struct nf_conn *ct;
1815
1816 ct = nf_ct_get(skb, &ctinfo);
1817 if (ct) {
1818 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
1819 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
1820 return true;
1821 }
1822
1823 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
1824 NFPROTO_IPV4, dev_net(skb->dev),
1825 &srctuple))
1826 return false;
1827
1828 hash = nf_conntrack_find_get(dev_net(skb->dev),
1829 &nf_ct_zone_dflt,
1830 &srctuple);
1831 if (!hash)
1832 return false;
1833
1834 ct = nf_ct_tuplehash_to_ctrack(hash);
1835 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
1836 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
1837 nf_ct_put(ct);
1838
1839 return true;
1840}
1841
1686/* Bring out ya dead! */ 1842/* Bring out ya dead! */
1687static struct nf_conn * 1843static struct nf_conn *
1688get_next_corpse(int (*iter)(struct nf_conn *i, void *data), 1844get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
@@ -1866,16 +2022,6 @@ static int kill_all(struct nf_conn *i, void *data)
1866 return net_eq(nf_ct_net(i), data); 2022 return net_eq(nf_ct_net(i), data);
1867} 2023}
1868 2024
1869void nf_ct_free_hashtable(void *hash, unsigned int size)
1870{
1871 if (is_vmalloc_addr(hash))
1872 vfree(hash);
1873 else
1874 free_pages((unsigned long)hash,
1875 get_order(sizeof(struct hlist_head) * size));
1876}
1877EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
1878
1879void nf_conntrack_cleanup_start(void) 2025void nf_conntrack_cleanup_start(void)
1880{ 2026{
1881 conntrack_gc_work.exiting = true; 2027 conntrack_gc_work.exiting = true;
@@ -1886,7 +2032,7 @@ void nf_conntrack_cleanup_end(void)
1886{ 2032{
1887 RCU_INIT_POINTER(nf_ct_hook, NULL); 2033 RCU_INIT_POINTER(nf_ct_hook, NULL);
1888 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2034 cancel_delayed_work_sync(&conntrack_gc_work.dwork);
1889 nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); 2035 kvfree(nf_conntrack_hash);
1890 2036
1891 nf_conntrack_proto_fini(); 2037 nf_conntrack_proto_fini();
1892 nf_conntrack_seqadj_fini(); 2038 nf_conntrack_seqadj_fini();
@@ -1952,7 +2098,6 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
1952{ 2098{
1953 struct hlist_nulls_head *hash; 2099 struct hlist_nulls_head *hash;
1954 unsigned int nr_slots, i; 2100 unsigned int nr_slots, i;
1955 size_t sz;
1956 2101
1957 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) 2102 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
1958 return NULL; 2103 return NULL;
@@ -1960,14 +2105,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
1960 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2105 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
1961 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2106 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
1962 2107
1963 if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head))) 2108 hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head),
1964 return NULL; 2109 GFP_KERNEL | __GFP_ZERO);
1965
1966 sz = nr_slots * sizeof(struct hlist_nulls_head);
1967 hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
1968 get_order(sz));
1969 if (!hash)
1970 hash = vzalloc(sz);
1971 2110
1972 if (hash && nulls) 2111 if (hash && nulls)
1973 for (i = 0; i < nr_slots; i++) 2112 for (i = 0; i < nr_slots; i++)
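The allocation above replaces the open-coded page allocation with vzalloc fallback by a single kvmalloc_array() call; the bucket count is still padded so the table fills whole pages. A small sketch of the sizing rule, with calloc() standing in for the zeroing allocator:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

/* Round v up to the next multiple of 'to' (assumes to != 0). */
static unsigned long roundup_to(unsigned long v, unsigned long to)
{
    return ((v + to - 1) / to) * to;
}

int main(void)
{
    unsigned long slot_size = sizeof(void *);   /* one hash bucket head */
    unsigned long requested = 1000;             /* buckets asked for */

    /* Same sizing rule as the hunk above: pad the bucket count so the
     * table occupies whole pages; calloc() also gives the __GFP_ZERO
     * behaviour of the real allocation. */
    unsigned long nr_slots = roundup_to(requested, PAGE_SIZE / slot_size);
    void **hash = calloc(nr_slots, slot_size);
    if (!hash)
        return 1;

    printf("requested %lu buckets, allocated %lu (%lu bytes)\n",
           requested, nr_slots, nr_slots * slot_size);
    free(hash);
    return 0;
}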
@@ -1994,7 +2133,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
1994 2133
1995 old_size = nf_conntrack_htable_size; 2134 old_size = nf_conntrack_htable_size;
1996 if (old_size == hashsize) { 2135 if (old_size == hashsize) {
1997 nf_ct_free_hashtable(hash, hashsize); 2136 kvfree(hash);
1998 return 0; 2137 return 0;
1999 } 2138 }
2000 2139
@@ -2030,7 +2169,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
2030 local_bh_enable(); 2169 local_bh_enable();
2031 2170
2032 synchronize_net(); 2171 synchronize_net();
2033 nf_ct_free_hashtable(old_hash, old_size); 2172 kvfree(old_hash);
2034 return 0; 2173 return 0;
2035} 2174}
2036 2175
@@ -2043,7 +2182,7 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
2043 return -EOPNOTSUPP; 2182 return -EOPNOTSUPP;
2044 2183
2045 /* On boot, we can set this without any fancy locking. */ 2184 /* On boot, we can set this without any fancy locking. */
2046 if (!nf_conntrack_htable_size) 2185 if (!nf_conntrack_hash)
2047 return param_set_uint(val, kp); 2186 return param_set_uint(val, kp);
2048 2187
2049 rc = kstrtouint(val, 0, &hashsize); 2188 rc = kstrtouint(val, 0, &hashsize);
@@ -2054,9 +2193,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
2054} 2193}
2055EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); 2194EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
2056 2195
2057module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
2058 &nf_conntrack_htable_size, 0600);
2059
2060static __always_inline unsigned int total_extension_size(void) 2196static __always_inline unsigned int total_extension_size(void)
2061{ 2197{
2062 /* remember to add new extensions below */ 2198 /* remember to add new extensions below */
@@ -2197,13 +2333,14 @@ err_acct:
2197err_expect: 2333err_expect:
2198 kmem_cache_destroy(nf_conntrack_cachep); 2334 kmem_cache_destroy(nf_conntrack_cachep);
2199err_cachep: 2335err_cachep:
2200 nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); 2336 kvfree(nf_conntrack_hash);
2201 return ret; 2337 return ret;
2202} 2338}
2203 2339
2204static struct nf_ct_hook nf_conntrack_hook = { 2340static struct nf_ct_hook nf_conntrack_hook = {
2205 .update = nf_conntrack_update, 2341 .update = nf_conntrack_update,
2206 .destroy = destroy_conntrack, 2342 .destroy = destroy_conntrack,
2343 .get_tuple_skb = nf_conntrack_get_tuple_skb,
2207}; 2344};
2208 2345
2209void nf_conntrack_init_end(void) 2346void nf_conntrack_init_end(void)
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 853b23206bb7..27b84231db10 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -610,7 +610,6 @@ static int exp_seq_show(struct seq_file *s, void *v)
610 expect->tuple.src.l3num, 610 expect->tuple.src.l3num,
611 expect->tuple.dst.protonum); 611 expect->tuple.dst.protonum);
612 print_tuple(s, &expect->tuple, 612 print_tuple(s, &expect->tuple,
613 __nf_ct_l3proto_find(expect->tuple.src.l3num),
614 __nf_ct_l4proto_find(expect->tuple.src.l3num, 613 __nf_ct_l4proto_find(expect->tuple.src.l3num,
615 expect->tuple.dst.protonum)); 614 expect->tuple.dst.protonum));
616 615
@@ -713,5 +712,5 @@ void nf_conntrack_expect_fini(void)
713{ 712{
714 rcu_barrier(); /* Wait for call_rcu() before destroy */ 713 rcu_barrier(); /* Wait for call_rcu() before destroy */
715 kmem_cache_destroy(nf_ct_expect_cachep); 714 kmem_cache_destroy(nf_ct_expect_cachep);
716 nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize); 715 kvfree(nf_ct_expect_hash);
717} 716}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 551a1eddf0fa..e24b762ffa1d 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -24,7 +24,6 @@
24#include <linux/rtnetlink.h> 24#include <linux/rtnetlink.h>
25 25
26#include <net/netfilter/nf_conntrack.h> 26#include <net/netfilter/nf_conntrack.h>
27#include <net/netfilter/nf_conntrack_l3proto.h>
28#include <net/netfilter/nf_conntrack_l4proto.h> 27#include <net/netfilter/nf_conntrack_l4proto.h>
29#include <net/netfilter/nf_conntrack_helper.h> 28#include <net/netfilter/nf_conntrack_helper.h>
30#include <net/netfilter/nf_conntrack_core.h> 29#include <net/netfilter/nf_conntrack_core.h>
@@ -193,8 +192,7 @@ void nf_conntrack_helper_put(struct nf_conntrack_helper *helper)
193EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); 192EXPORT_SYMBOL_GPL(nf_conntrack_helper_put);
194 193
195struct nf_conn_help * 194struct nf_conn_help *
196nf_ct_helper_ext_add(struct nf_conn *ct, 195nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
197 struct nf_conntrack_helper *helper, gfp_t gfp)
198{ 196{
199 struct nf_conn_help *help; 197 struct nf_conn_help *help;
200 198
@@ -263,7 +261,7 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
263 } 261 }
264 262
265 if (help == NULL) { 263 if (help == NULL) {
266 help = nf_ct_helper_ext_add(ct, helper, flags); 264 help = nf_ct_helper_ext_add(ct, flags);
267 if (help == NULL) 265 if (help == NULL)
268 return -ENOMEM; 266 return -ENOMEM;
269 } else { 267 } else {
@@ -465,6 +463,11 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
465 463
466 nf_ct_expect_iterate_destroy(expect_iter_me, NULL); 464 nf_ct_expect_iterate_destroy(expect_iter_me, NULL);
467 nf_ct_iterate_destroy(unhelp, me); 465 nf_ct_iterate_destroy(unhelp, me);
466
 467 /* Someone may still hold a reference to a helper looked up before the
 468  * unhelp iteration above, so wait for an RCU grace period.
 469  */
470 synchronize_rcu();
468} 471}
469EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); 472EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
470 473
@@ -559,12 +562,12 @@ int nf_conntrack_helper_init(void)
559 562
560 return 0; 563 return 0;
561out_extend: 564out_extend:
562 nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); 565 kvfree(nf_ct_helper_hash);
563 return ret; 566 return ret;
564} 567}
565 568
566void nf_conntrack_helper_fini(void) 569void nf_conntrack_helper_fini(void)
567{ 570{
568 nf_ct_extend_unregister(&helper_extend); 571 nf_ct_extend_unregister(&helper_extend);
569 nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); 572 kvfree(nf_ct_helper_hash);
570} 573}
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
deleted file mode 100644
index 397e6911214f..000000000000
--- a/net/netfilter/nf_conntrack_l3proto_generic.c
+++ /dev/null
@@ -1,66 +0,0 @@
1/*
2 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
3 *
4 * Based largely upon the original ip_conntrack code which
5 * had the following copyright information:
6 *
7 * (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 *
14 * Author:
15 * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
16 */
17
18#include <linux/types.h>
19#include <linux/ip.h>
20#include <linux/netfilter.h>
21#include <linux/module.h>
22#include <linux/skbuff.h>
23#include <linux/icmp.h>
24#include <linux/sysctl.h>
25#include <net/ip.h>
26
27#include <linux/netfilter_ipv4.h>
28#include <net/netfilter/nf_conntrack.h>
29#include <net/netfilter/nf_conntrack_l4proto.h>
30#include <net/netfilter/nf_conntrack_l3proto.h>
31#include <net/netfilter/nf_conntrack_core.h>
32#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
33
34static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
35 struct nf_conntrack_tuple *tuple)
36{
37 memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
38 memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
39
40 return true;
41}
42
43static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
44 const struct nf_conntrack_tuple *orig)
45{
46 memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
47 memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
48
49 return true;
50}
51
52static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
53 unsigned int *dataoff, u_int8_t *protonum)
54{
55 /* Never track !!! */
56 return -NF_ACCEPT;
57}
58
59
60struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
61 .l3proto = PF_UNSPEC,
62 .pkt_to_tuple = generic_pkt_to_tuple,
63 .invert_tuple = generic_invert_tuple,
64 .get_l4proto = generic_get_l4proto,
65};
66EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 20a2e37c76d1..036207ecaf16 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -38,7 +38,6 @@
38#include <net/netfilter/nf_conntrack_expect.h> 38#include <net/netfilter/nf_conntrack_expect.h>
39#include <net/netfilter/nf_conntrack_helper.h> 39#include <net/netfilter/nf_conntrack_helper.h>
40#include <net/netfilter/nf_conntrack_seqadj.h> 40#include <net/netfilter/nf_conntrack_seqadj.h>
41#include <net/netfilter/nf_conntrack_l3proto.h>
42#include <net/netfilter/nf_conntrack_l4proto.h> 41#include <net/netfilter/nf_conntrack_l4proto.h>
43#include <net/netfilter/nf_conntrack_tuple.h> 42#include <net/netfilter/nf_conntrack_tuple.h>
44#include <net/netfilter/nf_conntrack_acct.h> 43#include <net/netfilter/nf_conntrack_acct.h>
@@ -81,9 +80,26 @@ nla_put_failure:
81 return -1; 80 return -1;
82} 81}
83 82
83static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
84 const struct nf_conntrack_tuple *tuple)
85{
86 if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
87 nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
88 return -EMSGSIZE;
89 return 0;
90}
91
92static int ipv6_tuple_to_nlattr(struct sk_buff *skb,
93 const struct nf_conntrack_tuple *tuple)
94{
95 if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) ||
96 nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6))
97 return -EMSGSIZE;
98 return 0;
99}
100
84static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, 101static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
85 const struct nf_conntrack_tuple *tuple, 102 const struct nf_conntrack_tuple *tuple)
86 const struct nf_conntrack_l3proto *l3proto)
87{ 103{
88 int ret = 0; 104 int ret = 0;
89 struct nlattr *nest_parms; 105 struct nlattr *nest_parms;
@@ -92,8 +108,14 @@ static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
92 if (!nest_parms) 108 if (!nest_parms)
93 goto nla_put_failure; 109 goto nla_put_failure;
94 110
95 if (likely(l3proto->tuple_to_nlattr)) 111 switch (tuple->src.l3num) {
96 ret = l3proto->tuple_to_nlattr(skb, tuple); 112 case NFPROTO_IPV4:
113 ret = ipv4_tuple_to_nlattr(skb, tuple);
114 break;
115 case NFPROTO_IPV6:
116 ret = ipv6_tuple_to_nlattr(skb, tuple);
117 break;
118 }
97 119
98 nla_nest_end(skb, nest_parms); 120 nla_nest_end(skb, nest_parms);
99 121
@@ -106,13 +128,11 @@ nla_put_failure:
106static int ctnetlink_dump_tuples(struct sk_buff *skb, 128static int ctnetlink_dump_tuples(struct sk_buff *skb,
107 const struct nf_conntrack_tuple *tuple) 129 const struct nf_conntrack_tuple *tuple)
108{ 130{
109 const struct nf_conntrack_l3proto *l3proto;
110 const struct nf_conntrack_l4proto *l4proto; 131 const struct nf_conntrack_l4proto *l4proto;
111 int ret; 132 int ret;
112 133
113 rcu_read_lock(); 134 rcu_read_lock();
114 l3proto = __nf_ct_l3proto_find(tuple->src.l3num); 135 ret = ctnetlink_dump_tuples_ip(skb, tuple);
115 ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto);
116 136
117 if (ret >= 0) { 137 if (ret >= 0) {
118 l4proto = __nf_ct_l4proto_find(tuple->src.l3num, 138 l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
@@ -556,15 +576,20 @@ nla_put_failure:
556 return -1; 576 return -1;
557} 577}
558 578
579static const struct nla_policy cta_ip_nla_policy[CTA_IP_MAX + 1] = {
580 [CTA_IP_V4_SRC] = { .type = NLA_U32 },
581 [CTA_IP_V4_DST] = { .type = NLA_U32 },
582 [CTA_IP_V6_SRC] = { .len = sizeof(__be32) * 4 },
583 [CTA_IP_V6_DST] = { .len = sizeof(__be32) * 4 },
584};
585
559#if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS) 586#if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS)
560static size_t ctnetlink_proto_size(const struct nf_conn *ct) 587static size_t ctnetlink_proto_size(const struct nf_conn *ct)
561{ 588{
562 const struct nf_conntrack_l3proto *l3proto;
563 const struct nf_conntrack_l4proto *l4proto; 589 const struct nf_conntrack_l4proto *l4proto;
564 size_t len, len4 = 0; 590 size_t len, len4 = 0;
565 591
566 l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); 592 len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1);
567 len = l3proto->nla_size;
568 len *= 3u; /* ORIG, REPLY, MASTER */ 593 len *= 3u; /* ORIG, REPLY, MASTER */
569 594
570 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 595 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
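ctnetlink_proto_size() now derives the per-tuple address space from the fixed cta_ip_nla_policy instead of a per-l3proto nla_size field. Roughly, that length is the sum over the attributes of a netlink attribute header plus the 4-byte-aligned payload; a standalone sketch of the arithmetic, using the payload sizes from the policy added above:

#include <stdio.h>

#define NLA_HDRLEN   4
#define NLA_ALIGN(x) (((x) + 3U) & ~3U)

struct policy { unsigned int payload; };   /* worst-case payload per attribute */

/* Roughly what nla_policy_len() computes: header plus padded payload,
 * summed over every attribute a nest may carry. */
static unsigned int policy_len(const struct policy *p, int n)
{
    unsigned int len = 0;

    for (int i = 0; i < n; i++)
        len += NLA_HDRLEN + NLA_ALIGN(p[i].payload);
    return len;
}

int main(void)
{
    /* two 4-byte IPv4 addresses, two 16-byte IPv6 addresses */
    struct policy ip[] = { {4}, {4}, {16}, {16} };
    unsigned int per_tuple = policy_len(ip, 4);

    printf("per tuple: %u bytes, ORIG+REPLY+MASTER: %u bytes\n",
           per_tuple, per_tuple * 3);
    return 0;
}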
@@ -821,6 +846,21 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[])
821#endif 846#endif
822} 847}
823 848
849static int ctnetlink_start(struct netlink_callback *cb)
850{
851 const struct nlattr * const *cda = cb->data;
852 struct ctnetlink_filter *filter = NULL;
853
854 if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
855 filter = ctnetlink_alloc_filter(cda);
856 if (IS_ERR(filter))
857 return PTR_ERR(filter);
858 }
859
860 cb->data = filter;
861 return 0;
862}
863
824static int ctnetlink_filter_match(struct nf_conn *ct, void *data) 864static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
825{ 865{
826 struct ctnetlink_filter *filter = data; 866 struct ctnetlink_filter *filter = data;
@@ -936,29 +976,54 @@ out:
936 return skb->len; 976 return skb->len;
937} 977}
938 978
979static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
980 struct nf_conntrack_tuple *t)
981{
982 if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
983 return -EINVAL;
984
985 t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
986 t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
987
988 return 0;
989}
990
991static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
992 struct nf_conntrack_tuple *t)
993{
994 if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
995 return -EINVAL;
996
997 t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
998 t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
999
1000 return 0;
1001}
1002
939static int ctnetlink_parse_tuple_ip(struct nlattr *attr, 1003static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
940 struct nf_conntrack_tuple *tuple) 1004 struct nf_conntrack_tuple *tuple)
941{ 1005{
942 struct nlattr *tb[CTA_IP_MAX+1]; 1006 struct nlattr *tb[CTA_IP_MAX+1];
943 struct nf_conntrack_l3proto *l3proto;
944 int ret = 0; 1007 int ret = 0;
945 1008
946 ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL, NULL); 1009 ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL, NULL);
947 if (ret < 0) 1010 if (ret < 0)
948 return ret; 1011 return ret;
949 1012
950 rcu_read_lock(); 1013 ret = nla_validate_nested(attr, CTA_IP_MAX,
951 l3proto = __nf_ct_l3proto_find(tuple->src.l3num); 1014 cta_ip_nla_policy, NULL);
1015 if (ret)
1016 return ret;
952 1017
953 if (likely(l3proto->nlattr_to_tuple)) { 1018 switch (tuple->src.l3num) {
954 ret = nla_validate_nested(attr, CTA_IP_MAX, 1019 case NFPROTO_IPV4:
955 l3proto->nla_policy, NULL); 1020 ret = ipv4_nlattr_to_tuple(tb, tuple);
956 if (ret == 0) 1021 break;
957 ret = l3proto->nlattr_to_tuple(tb, tuple); 1022 case NFPROTO_IPV6:
1023 ret = ipv6_nlattr_to_tuple(tb, tuple);
1024 break;
958 } 1025 }
959 1026
960 rcu_read_unlock();
961
962 return ret; 1027 return ret;
963} 1028}
964 1029
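The parsing side mirrors the dump side: after validating the nest against the fixed policy, the address family selects the IPv4 or IPv6 fill routine directly rather than calling through the old l3proto->nlattr_to_tuple hook. A simplified standalone sketch of that dispatch, with raw buffers standing in for netlink attributes:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

enum { AF4 = 2, AF6 = 10 };             /* stand-ins for NFPROTO_IPV4/IPV6 */

struct tuple {
    int family;
    uint8_t src[16], dst[16];
};

/* Dispatch on the address family and copy the right amount of address. */
static int parse_tuple_ip(struct tuple *t, const void *src, const void *dst)
{
    size_t len;

    switch (t->family) {
    case AF4: len = 4;  break;
    case AF6: len = 16; break;
    default:  return -1;
    }
    memcpy(t->src, src, len);
    memcpy(t->dst, dst, len);
    return 0;
}

int main(void)
{
    uint8_t a[4] = {192, 0, 2, 1}, b[4] = {192, 0, 2, 2};
    struct tuple t = { .family = AF4 };

    if (parse_tuple_ip(&t, a, b) == 0)
        printf("parsed v4 tuple: %d.%d.%d.%d -> %d.%d.%d.%d\n",
               t.src[0], t.src[1], t.src[2], t.src[3],
               t.dst[0], t.dst[1], t.dst[2], t.dst[3]);
    return 0;
}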
@@ -1240,19 +1305,12 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
1240 1305
1241 if (nlh->nlmsg_flags & NLM_F_DUMP) { 1306 if (nlh->nlmsg_flags & NLM_F_DUMP) {
1242 struct netlink_dump_control c = { 1307 struct netlink_dump_control c = {
1308 .start = ctnetlink_start,
1243 .dump = ctnetlink_dump_table, 1309 .dump = ctnetlink_dump_table,
1244 .done = ctnetlink_done, 1310 .done = ctnetlink_done,
1311 .data = (void *)cda,
1245 }; 1312 };
1246 1313
1247 if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
1248 struct ctnetlink_filter *filter;
1249
1250 filter = ctnetlink_alloc_filter(cda);
1251 if (IS_ERR(filter))
1252 return PTR_ERR(filter);
1253
1254 c.data = filter;
1255 }
1256 return netlink_dump_start(ctnl, skb, nlh, &c); 1314 return netlink_dump_start(ctnl, skb, nlh, &c);
1257 } 1315 }
1258 1316
@@ -1897,7 +1955,7 @@ ctnetlink_create_conntrack(struct net *net,
1897 } else { 1955 } else {
1898 struct nf_conn_help *help; 1956 struct nf_conn_help *help;
1899 1957
1900 help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC); 1958 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
1901 if (help == NULL) { 1959 if (help == NULL) {
1902 err = -ENOMEM; 1960 err = -ENOMEM;
1903 goto err2; 1961 goto err2;
@@ -2581,7 +2639,6 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
2581 const struct nf_conntrack_tuple *tuple, 2639 const struct nf_conntrack_tuple *tuple,
2582 const struct nf_conntrack_tuple_mask *mask) 2640 const struct nf_conntrack_tuple_mask *mask)
2583{ 2641{
2584 const struct nf_conntrack_l3proto *l3proto;
2585 const struct nf_conntrack_l4proto *l4proto; 2642 const struct nf_conntrack_l4proto *l4proto;
2586 struct nf_conntrack_tuple m; 2643 struct nf_conntrack_tuple m;
2587 struct nlattr *nest_parms; 2644 struct nlattr *nest_parms;
@@ -2597,8 +2654,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
2597 goto nla_put_failure; 2654 goto nla_put_failure;
2598 2655
2599 rcu_read_lock(); 2656 rcu_read_lock();
2600 l3proto = __nf_ct_l3proto_find(tuple->src.l3num); 2657 ret = ctnetlink_dump_tuples_ip(skb, &m);
2601 ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto);
2602 if (ret >= 0) { 2658 if (ret >= 0) {
2603 l4proto = __nf_ct_l4proto_find(tuple->src.l3num, 2659 l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
2604 tuple->dst.protonum); 2660 tuple->dst.protonum);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index d88841fbc560..9f14b0df6960 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -1,14 +1,4 @@
1/* L3/L4 protocol support for nf_conntrack. */ 1// SPDX-License-Identifier: GPL-2.0
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
5 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
6 * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12 2
13#include <linux/types.h> 3#include <linux/types.h>
14#include <linux/netfilter.h> 4#include <linux/netfilter.h>
@@ -24,14 +14,36 @@
24#include <linux/netdevice.h> 14#include <linux/netdevice.h>
25 15
26#include <net/netfilter/nf_conntrack.h> 16#include <net/netfilter/nf_conntrack.h>
27#include <net/netfilter/nf_conntrack_l3proto.h>
28#include <net/netfilter/nf_conntrack_l4proto.h> 17#include <net/netfilter/nf_conntrack_l4proto.h>
29#include <net/netfilter/nf_conntrack_core.h> 18#include <net/netfilter/nf_conntrack_core.h>
30#include <net/netfilter/nf_log.h> 19#include <net/netfilter/nf_log.h>
31 20
21#include <linux/ip.h>
22#include <linux/icmp.h>
23#include <linux/sysctl.h>
24#include <net/route.h>
25#include <net/ip.h>
26
27#include <linux/netfilter_ipv4.h>
28#include <linux/netfilter_ipv6.h>
29#include <linux/netfilter_ipv6/ip6_tables.h>
30#include <net/netfilter/nf_conntrack_helper.h>
31#include <net/netfilter/nf_conntrack_zones.h>
32#include <net/netfilter/nf_conntrack_seqadj.h>
33#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
34#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
35#include <net/netfilter/nf_nat_helper.h>
36#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
37#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
38
39#include <linux/ipv6.h>
40#include <linux/in6.h>
41#include <net/ipv6.h>
42#include <net/inet_frag.h>
43
44extern unsigned int nf_conntrack_net_id;
45
32static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; 46static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly;
33struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly;
34EXPORT_SYMBOL_GPL(nf_ct_l3protos);
35 47
36static DEFINE_MUTEX(nf_ct_proto_mutex); 48static DEFINE_MUTEX(nf_ct_proto_mutex);
37 49
@@ -122,137 +134,6 @@ __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto)
122} 134}
123EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); 135EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find);
124 136
125/* this is guaranteed to always return a valid protocol helper, since
126 * it falls back to generic_protocol */
127const struct nf_conntrack_l3proto *
128nf_ct_l3proto_find_get(u_int16_t l3proto)
129{
130 struct nf_conntrack_l3proto *p;
131
132 rcu_read_lock();
133 p = __nf_ct_l3proto_find(l3proto);
134 if (!try_module_get(p->me))
135 p = &nf_conntrack_l3proto_generic;
136 rcu_read_unlock();
137
138 return p;
139}
140EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get);
141
142int
143nf_ct_l3proto_try_module_get(unsigned short l3proto)
144{
145 const struct nf_conntrack_l3proto *p;
146 int ret;
147
148retry: p = nf_ct_l3proto_find_get(l3proto);
149 if (p == &nf_conntrack_l3proto_generic) {
150 ret = request_module("nf_conntrack-%d", l3proto);
151 if (!ret)
152 goto retry;
153
154 return -EPROTOTYPE;
155 }
156
157 return 0;
158}
159EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get);
160
161void nf_ct_l3proto_module_put(unsigned short l3proto)
162{
163 struct nf_conntrack_l3proto *p;
164
165 /* rcu_read_lock not necessary since the caller holds a reference, but
166 * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find()
167 */
168 rcu_read_lock();
169 p = __nf_ct_l3proto_find(l3proto);
170 module_put(p->me);
171 rcu_read_unlock();
172}
173EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
174
175static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
176{
177 const struct nf_conntrack_l3proto *l3proto;
178 int ret;
179
180 might_sleep();
181
182 ret = nf_ct_l3proto_try_module_get(nfproto);
183 if (ret < 0)
184 return ret;
185
186 /* we already have a reference, can't fail */
187 rcu_read_lock();
188 l3proto = __nf_ct_l3proto_find(nfproto);
189 rcu_read_unlock();
190
191 if (!l3proto->net_ns_get)
192 return 0;
193
194 ret = l3proto->net_ns_get(net);
195 if (ret < 0)
196 nf_ct_l3proto_module_put(nfproto);
197
198 return ret;
199}
200
201int nf_ct_netns_get(struct net *net, u8 nfproto)
202{
203 int err;
204
205 if (nfproto == NFPROTO_INET) {
206 err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
207 if (err < 0)
208 goto err1;
209 err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
210 if (err < 0)
211 goto err2;
212 } else {
213 err = nf_ct_netns_do_get(net, nfproto);
214 if (err < 0)
215 goto err1;
216 }
217 return 0;
218
219err2:
220 nf_ct_netns_put(net, NFPROTO_IPV4);
221err1:
222 return err;
223}
224EXPORT_SYMBOL_GPL(nf_ct_netns_get);
225
226static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
227{
228 const struct nf_conntrack_l3proto *l3proto;
229
230 might_sleep();
231
232 /* same as nf_conntrack_netns_get(), reference assumed */
233 rcu_read_lock();
234 l3proto = __nf_ct_l3proto_find(nfproto);
235 rcu_read_unlock();
236
237 if (WARN_ON(!l3proto))
238 return;
239
240 if (l3proto->net_ns_put)
241 l3proto->net_ns_put(net);
242
243 nf_ct_l3proto_module_put(nfproto);
244}
245
246void nf_ct_netns_put(struct net *net, uint8_t nfproto)
247{
248 if (nfproto == NFPROTO_INET) {
249 nf_ct_netns_do_put(net, NFPROTO_IPV4);
250 nf_ct_netns_do_put(net, NFPROTO_IPV6);
251 } else
252 nf_ct_netns_do_put(net, nfproto);
253}
254EXPORT_SYMBOL_GPL(nf_ct_netns_put);
255
256const struct nf_conntrack_l4proto * 137const struct nf_conntrack_l4proto *
257nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) 138nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
258{ 139{
@@ -274,11 +155,6 @@ void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p)
274} 155}
275EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); 156EXPORT_SYMBOL_GPL(nf_ct_l4proto_put);
276 157
277static int kill_l3proto(struct nf_conn *i, void *data)
278{
279 return nf_ct_l3num(i) == ((const struct nf_conntrack_l3proto *)data)->l3proto;
280}
281
282static int kill_l4proto(struct nf_conn *i, void *data) 158static int kill_l4proto(struct nf_conn *i, void *data)
283{ 159{
284 const struct nf_conntrack_l4proto *l4proto; 160 const struct nf_conntrack_l4proto *l4proto;
@@ -287,52 +163,6 @@ static int kill_l4proto(struct nf_conn *i, void *data)
287 nf_ct_l3num(i) == l4proto->l3proto; 163 nf_ct_l3num(i) == l4proto->l3proto;
288} 164}
289 165
290int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto)
291{
292 int ret = 0;
293 struct nf_conntrack_l3proto *old;
294
295 if (proto->l3proto >= NFPROTO_NUMPROTO)
296 return -EBUSY;
297#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
298 if (proto->tuple_to_nlattr && proto->nla_size == 0)
299 return -EINVAL;
300#endif
301 mutex_lock(&nf_ct_proto_mutex);
302 old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
303 lockdep_is_held(&nf_ct_proto_mutex));
304 if (old != &nf_conntrack_l3proto_generic) {
305 ret = -EBUSY;
306 goto out_unlock;
307 }
308
309 rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto);
310
311out_unlock:
312 mutex_unlock(&nf_ct_proto_mutex);
313 return ret;
314
315}
316EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
317
318void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto)
319{
320 BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO);
321
322 mutex_lock(&nf_ct_proto_mutex);
323 BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
324 lockdep_is_held(&nf_ct_proto_mutex)
325 ) != proto);
326 rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
327 &nf_conntrack_l3proto_generic);
328 mutex_unlock(&nf_ct_proto_mutex);
329
330 synchronize_rcu();
331 /* Remove all contrack entries for this protocol */
332 nf_ct_iterate_destroy(kill_l3proto, (void*)proto);
333}
334EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
335
336static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, 166static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
337 const struct nf_conntrack_l4proto *l4proto) 167 const struct nf_conntrack_l4proto *l4proto)
338{ 168{
@@ -482,7 +312,9 @@ void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto)
482 __nf_ct_l4proto_unregister_one(l4proto); 312 __nf_ct_l4proto_unregister_one(l4proto);
483 mutex_unlock(&nf_ct_proto_mutex); 313 mutex_unlock(&nf_ct_proto_mutex);
484 314
485 synchronize_rcu(); 315 synchronize_net();
 316 /* Remove all conntrack entries for this protocol */
317 nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
486} 318}
487EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one); 319EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one);
488 320
@@ -499,8 +331,26 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net,
499} 331}
500EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one); 332EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one);
501 333
502int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], 334static void
503 unsigned int num_proto) 335nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[],
336 unsigned int num_proto)
337{
338 int i;
339
340 mutex_lock(&nf_ct_proto_mutex);
341 for (i = 0; i < num_proto; i++)
342 __nf_ct_l4proto_unregister_one(l4proto[i]);
343 mutex_unlock(&nf_ct_proto_mutex);
344
345 synchronize_net();
346
347 for (i = 0; i < num_proto; i++)
348 nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto[i]);
349}
350
351static int
352nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
353 unsigned int num_proto)
504{ 354{
505 int ret = -EINVAL, ver; 355 int ret = -EINVAL, ver;
506 unsigned int i; 356 unsigned int i;
@@ -518,7 +368,6 @@ int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
518 } 368 }
519 return ret; 369 return ret;
520} 370}
521EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
522 371
523int nf_ct_l4proto_pernet_register(struct net *net, 372int nf_ct_l4proto_pernet_register(struct net *net,
524 const struct nf_conntrack_l4proto *const l4proto[], 373 const struct nf_conntrack_l4proto *const l4proto[],
@@ -542,20 +391,6 @@ int nf_ct_l4proto_pernet_register(struct net *net,
542} 391}
543EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register); 392EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
544 393
545void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[],
546 unsigned int num_proto)
547{
548 mutex_lock(&nf_ct_proto_mutex);
549 while (num_proto-- != 0)
550 __nf_ct_l4proto_unregister_one(l4proto[num_proto]);
551 mutex_unlock(&nf_ct_proto_mutex);
552
553 synchronize_net();
554 /* Remove all contrack entries for this protocol */
555 nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
556}
557EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
558
559void nf_ct_l4proto_pernet_unregister(struct net *net, 394void nf_ct_l4proto_pernet_unregister(struct net *net,
560 const struct nf_conntrack_l4proto *const l4proto[], 395 const struct nf_conntrack_l4proto *const l4proto[],
561 unsigned int num_proto) 396 unsigned int num_proto)
@@ -565,6 +400,562 @@ void nf_ct_l4proto_pernet_unregister(struct net *net,
565} 400}
566EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister); 401EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
567 402
403static unsigned int ipv4_helper(void *priv,
404 struct sk_buff *skb,
405 const struct nf_hook_state *state)
406{
407 struct nf_conn *ct;
408 enum ip_conntrack_info ctinfo;
409 const struct nf_conn_help *help;
410 const struct nf_conntrack_helper *helper;
411
412 /* This is where we call the helper: as the packet goes out. */
413 ct = nf_ct_get(skb, &ctinfo);
414 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
415 return NF_ACCEPT;
416
417 help = nfct_help(ct);
418 if (!help)
419 return NF_ACCEPT;
420
421 /* rcu_read_lock()ed by nf_hook_thresh */
422 helper = rcu_dereference(help->helper);
423 if (!helper)
424 return NF_ACCEPT;
425
426 return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
427 ct, ctinfo);
428}
429
430static unsigned int ipv4_confirm(void *priv,
431 struct sk_buff *skb,
432 const struct nf_hook_state *state)
433{
434 struct nf_conn *ct;
435 enum ip_conntrack_info ctinfo;
436
437 ct = nf_ct_get(skb, &ctinfo);
438 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
439 goto out;
440
441 /* adjust seqs for loopback traffic only in outgoing direction */
442 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
443 !nf_is_loopback_packet(skb)) {
444 if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
445 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
446 return NF_DROP;
447 }
448 }
449out:
450 /* We've seen it coming out the other side: confirm it */
451 return nf_conntrack_confirm(skb);
452}
453
454static unsigned int ipv4_conntrack_in(void *priv,
455 struct sk_buff *skb,
456 const struct nf_hook_state *state)
457{
458 return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
459}
460
461static unsigned int ipv4_conntrack_local(void *priv,
462 struct sk_buff *skb,
463 const struct nf_hook_state *state)
464{
465 if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */
466 enum ip_conntrack_info ctinfo;
467 struct nf_conn *tmpl;
468
469 tmpl = nf_ct_get(skb, &ctinfo);
470 if (tmpl && nf_ct_is_template(tmpl)) {
471 /* when skipping ct, clear templates to avoid fooling
472 * later targets/matches
473 */
474 skb->_nfct = 0;
475 nf_ct_put(tmpl);
476 }
477 return NF_ACCEPT;
478 }
479
480 return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
481}
482
483/* Connection tracking may drop packets, but never alters them, so
484 * make it the first hook.
485 */
486static const struct nf_hook_ops ipv4_conntrack_ops[] = {
487 {
488 .hook = ipv4_conntrack_in,
489 .pf = NFPROTO_IPV4,
490 .hooknum = NF_INET_PRE_ROUTING,
491 .priority = NF_IP_PRI_CONNTRACK,
492 },
493 {
494 .hook = ipv4_conntrack_local,
495 .pf = NFPROTO_IPV4,
496 .hooknum = NF_INET_LOCAL_OUT,
497 .priority = NF_IP_PRI_CONNTRACK,
498 },
499 {
500 .hook = ipv4_helper,
501 .pf = NFPROTO_IPV4,
502 .hooknum = NF_INET_POST_ROUTING,
503 .priority = NF_IP_PRI_CONNTRACK_HELPER,
504 },
505 {
506 .hook = ipv4_confirm,
507 .pf = NFPROTO_IPV4,
508 .hooknum = NF_INET_POST_ROUTING,
509 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
510 },
511 {
512 .hook = ipv4_helper,
513 .pf = NFPROTO_IPV4,
514 .hooknum = NF_INET_LOCAL_IN,
515 .priority = NF_IP_PRI_CONNTRACK_HELPER,
516 },
517 {
518 .hook = ipv4_confirm,
519 .pf = NFPROTO_IPV4,
520 .hooknum = NF_INET_LOCAL_IN,
521 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
522 },
523};
524
525/* Fast function for those who don't want to parse /proc (and I don't
526 * blame them).
527 * Reversing the socket's dst/src point of view gives us the reply
528 * mapping.
529 */
530static int
531getorigdst(struct sock *sk, int optval, void __user *user, int *len)
532{
533 const struct inet_sock *inet = inet_sk(sk);
534 const struct nf_conntrack_tuple_hash *h;
535 struct nf_conntrack_tuple tuple;
536
537 memset(&tuple, 0, sizeof(tuple));
538
539 lock_sock(sk);
540 tuple.src.u3.ip = inet->inet_rcv_saddr;
541 tuple.src.u.tcp.port = inet->inet_sport;
542 tuple.dst.u3.ip = inet->inet_daddr;
543 tuple.dst.u.tcp.port = inet->inet_dport;
544 tuple.src.l3num = PF_INET;
545 tuple.dst.protonum = sk->sk_protocol;
546 release_sock(sk);
547
548 /* We only do TCP and SCTP at the moment: is there a better way? */
549 if (tuple.dst.protonum != IPPROTO_TCP &&
550 tuple.dst.protonum != IPPROTO_SCTP) {
551 pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
552 return -ENOPROTOOPT;
553 }
554
555 if ((unsigned int)*len < sizeof(struct sockaddr_in)) {
556 pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
557 *len, sizeof(struct sockaddr_in));
558 return -EINVAL;
559 }
560
561 h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
562 if (h) {
563 struct sockaddr_in sin;
564 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
565
566 sin.sin_family = AF_INET;
567 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
568 .tuple.dst.u.tcp.port;
569 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
570 .tuple.dst.u3.ip;
571 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
572
573 pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
574 &sin.sin_addr.s_addr, ntohs(sin.sin_port));
575 nf_ct_put(ct);
576 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
577 return -EFAULT;
578 else
579 return 0;
580 }
581 pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
582 &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
583 &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
584 return -ENOENT;
585}
586
587static struct nf_sockopt_ops so_getorigdst = {
588 .pf = PF_INET,
589 .get_optmin = SO_ORIGINAL_DST,
590 .get_optmax = SO_ORIGINAL_DST + 1,
591 .get = getorigdst,
592 .owner = THIS_MODULE,
593};
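getorigdst() above is the kernel side of the SO_ORIGINAL_DST socket option used by transparent proxies: on a TCP (or SCTP) connection whose destination was rewritten by NAT, it returns the pre-NAT destination taken from the original-direction tuple. A typical userspace caller looks roughly like this; client_fd is assumed to be a socket accepted from an iptables REDIRECT/DNAT rule.

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4.h>   /* SO_ORIGINAL_DST */

/* Query the conntrack entry behind an accepted, redirected connection
 * and print the address the client originally tried to reach. */
static int print_original_dst(int client_fd)
{
    struct sockaddr_in orig;
    socklen_t len = sizeof(orig);

    memset(&orig, 0, sizeof(orig));
    if (getsockopt(client_fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) < 0) {
        perror("SO_ORIGINAL_DST");
        return -1;
    }
    printf("original destination: %s:%u\n",
           inet_ntoa(orig.sin_addr), ntohs(orig.sin_port));
    return 0;
}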
594
595#if IS_ENABLED(CONFIG_IPV6)
596static int
597ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
598{
599 struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 };
600 const struct ipv6_pinfo *inet6 = inet6_sk(sk);
601 const struct inet_sock *inet = inet_sk(sk);
602 const struct nf_conntrack_tuple_hash *h;
603 struct sockaddr_in6 sin6;
604 struct nf_conn *ct;
605 __be32 flow_label;
606 int bound_dev_if;
607
608 lock_sock(sk);
609 tuple.src.u3.in6 = sk->sk_v6_rcv_saddr;
610 tuple.src.u.tcp.port = inet->inet_sport;
611 tuple.dst.u3.in6 = sk->sk_v6_daddr;
612 tuple.dst.u.tcp.port = inet->inet_dport;
613 tuple.dst.protonum = sk->sk_protocol;
614 bound_dev_if = sk->sk_bound_dev_if;
615 flow_label = inet6->flow_label;
616 release_sock(sk);
617
618 if (tuple.dst.protonum != IPPROTO_TCP &&
619 tuple.dst.protonum != IPPROTO_SCTP)
620 return -ENOPROTOOPT;
621
622 if (*len < 0 || (unsigned int)*len < sizeof(sin6))
623 return -EINVAL;
624
625 h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
626 if (!h) {
627 pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
628 &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
629 &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
630 return -ENOENT;
631 }
632
633 ct = nf_ct_tuplehash_to_ctrack(h);
634
635 sin6.sin6_family = AF_INET6;
636 sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
637 sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK;
638 memcpy(&sin6.sin6_addr,
639 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6,
640 sizeof(sin6.sin6_addr));
641
642 nf_ct_put(ct);
643 sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if);
644 return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0;
645}
646
647static struct nf_sockopt_ops so_getorigdst6 = {
648 .pf = NFPROTO_IPV6,
649 .get_optmin = IP6T_SO_ORIGINAL_DST,
650 .get_optmax = IP6T_SO_ORIGINAL_DST + 1,
651 .get = ipv6_getorigdst,
652 .owner = THIS_MODULE,
653};
654
655static unsigned int ipv6_confirm(void *priv,
656 struct sk_buff *skb,
657 const struct nf_hook_state *state)
658{
659 struct nf_conn *ct;
660 enum ip_conntrack_info ctinfo;
661 unsigned char pnum = ipv6_hdr(skb)->nexthdr;
662 int protoff;
663 __be16 frag_off;
664
665 ct = nf_ct_get(skb, &ctinfo);
666 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
667 goto out;
668
669 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
670 &frag_off);
671 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
672 pr_debug("proto header not found\n");
673 goto out;
674 }
675
676 /* adjust seqs for loopback traffic only in outgoing direction */
677 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
678 !nf_is_loopback_packet(skb)) {
679 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
680 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
681 return NF_DROP;
682 }
683 }
684out:
685 /* We've seen it coming out the other side: confirm it */
686 return nf_conntrack_confirm(skb);
687}
688
689static unsigned int ipv6_conntrack_in(void *priv,
690 struct sk_buff *skb,
691 const struct nf_hook_state *state)
692{
693 return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
694}
695
696static unsigned int ipv6_conntrack_local(void *priv,
697 struct sk_buff *skb,
698 const struct nf_hook_state *state)
699{
700 return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
701}
702
703static unsigned int ipv6_helper(void *priv,
704 struct sk_buff *skb,
705 const struct nf_hook_state *state)
706{
707 struct nf_conn *ct;
708 const struct nf_conn_help *help;
709 const struct nf_conntrack_helper *helper;
710 enum ip_conntrack_info ctinfo;
711 __be16 frag_off;
712 int protoff;
713 u8 nexthdr;
714
715 /* This is where we call the helper: as the packet goes out. */
716 ct = nf_ct_get(skb, &ctinfo);
717 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
718 return NF_ACCEPT;
719
720 help = nfct_help(ct);
721 if (!help)
722 return NF_ACCEPT;
723 /* rcu_read_lock()ed by nf_hook_thresh */
724 helper = rcu_dereference(help->helper);
725 if (!helper)
726 return NF_ACCEPT;
727
728 nexthdr = ipv6_hdr(skb)->nexthdr;
729 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
730 &frag_off);
731 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
732 pr_debug("proto header not found\n");
733 return NF_ACCEPT;
734 }
735
736 return helper->help(skb, protoff, ct, ctinfo);
737}
738
739static const struct nf_hook_ops ipv6_conntrack_ops[] = {
740 {
741 .hook = ipv6_conntrack_in,
742 .pf = NFPROTO_IPV6,
743 .hooknum = NF_INET_PRE_ROUTING,
744 .priority = NF_IP6_PRI_CONNTRACK,
745 },
746 {
747 .hook = ipv6_conntrack_local,
748 .pf = NFPROTO_IPV6,
749 .hooknum = NF_INET_LOCAL_OUT,
750 .priority = NF_IP6_PRI_CONNTRACK,
751 },
752 {
753 .hook = ipv6_helper,
754 .pf = NFPROTO_IPV6,
755 .hooknum = NF_INET_POST_ROUTING,
756 .priority = NF_IP6_PRI_CONNTRACK_HELPER,
757 },
758 {
759 .hook = ipv6_confirm,
760 .pf = NFPROTO_IPV6,
761 .hooknum = NF_INET_POST_ROUTING,
762 .priority = NF_IP6_PRI_LAST,
763 },
764 {
765 .hook = ipv6_helper,
766 .pf = NFPROTO_IPV6,
767 .hooknum = NF_INET_LOCAL_IN,
768 .priority = NF_IP6_PRI_CONNTRACK_HELPER,
769 },
770 {
771 .hook = ipv6_confirm,
772 .pf = NFPROTO_IPV6,
773 .hooknum = NF_INET_LOCAL_IN,
774 .priority = NF_IP6_PRI_LAST - 1,
775 },
776};
777#endif
778
779static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
780{
781 struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
782 int err = 0;
783
784 mutex_lock(&nf_ct_proto_mutex);
785
786 switch (nfproto) {
787 case NFPROTO_IPV4:
788 cnet->users4++;
789 if (cnet->users4 > 1)
790 goto out_unlock;
791 err = nf_defrag_ipv4_enable(net);
792 if (err) {
793 cnet->users4 = 0;
794 goto out_unlock;
795 }
796
797 err = nf_register_net_hooks(net, ipv4_conntrack_ops,
798 ARRAY_SIZE(ipv4_conntrack_ops));
799 if (err)
800 cnet->users4 = 0;
801 break;
802#if IS_ENABLED(CONFIG_IPV6)
803 case NFPROTO_IPV6:
804 cnet->users6++;
805 if (cnet->users6 > 1)
806 goto out_unlock;
807 err = nf_defrag_ipv6_enable(net);
808 if (err < 0) {
809 cnet->users6 = 0;
810 goto out_unlock;
811 }
812
813 err = nf_register_net_hooks(net, ipv6_conntrack_ops,
814 ARRAY_SIZE(ipv6_conntrack_ops));
815 if (err)
816 cnet->users6 = 0;
817 break;
818#endif
819 default:
820 err = -EPROTO;
821 break;
822 }
823 out_unlock:
824 mutex_unlock(&nf_ct_proto_mutex);
825 return err;
826}
827
828static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
829{
830 struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
831
832 mutex_lock(&nf_ct_proto_mutex);
833 switch (nfproto) {
834 case NFPROTO_IPV4:
835 if (cnet->users4 && (--cnet->users4 == 0))
836 nf_unregister_net_hooks(net, ipv4_conntrack_ops,
837 ARRAY_SIZE(ipv4_conntrack_ops));
838 break;
839#if IS_ENABLED(CONFIG_IPV6)
840 case NFPROTO_IPV6:
841 if (cnet->users6 && (--cnet->users6 == 0))
842 nf_unregister_net_hooks(net, ipv6_conntrack_ops,
843 ARRAY_SIZE(ipv6_conntrack_ops));
844 break;
845#endif
846 }
847
848 mutex_unlock(&nf_ct_proto_mutex);
849}
850
851int nf_ct_netns_get(struct net *net, u8 nfproto)
852{
853 int err;
854
855 if (nfproto == NFPROTO_INET) {
856 err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
857 if (err < 0)
858 goto err1;
859 err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
860 if (err < 0)
861 goto err2;
862 } else {
863 err = nf_ct_netns_do_get(net, nfproto);
864 if (err < 0)
865 goto err1;
866 }
867 return 0;
868
869err2:
870 nf_ct_netns_put(net, NFPROTO_IPV4);
871err1:
872 return err;
873}
874EXPORT_SYMBOL_GPL(nf_ct_netns_get);
875
876void nf_ct_netns_put(struct net *net, uint8_t nfproto)
877{
878 if (nfproto == NFPROTO_INET) {
879 nf_ct_netns_do_put(net, NFPROTO_IPV4);
880 nf_ct_netns_do_put(net, NFPROTO_IPV6);
881 } else {
882 nf_ct_netns_do_put(net, nfproto);
883 }
884}
885EXPORT_SYMBOL_GPL(nf_ct_netns_put);
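nf_ct_netns_get()/nf_ct_netns_put() now register and unregister the conntrack hooks themselves, using per-family use counters under nf_ct_proto_mutex: the first user of a family registers the hooks, the last one removes them. The same pattern, reduced to a standalone sketch with a single counter and dummy register/unregister calls:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int users;

/* First caller registers the hooks; later callers only bump the count. */
static int hooks_get(void)
{
    int err = 0;

    pthread_mutex_lock(&lock);
    if (++users == 1) {
        printf("registering hooks\n");   /* nf_register_net_hooks() */
        /* on failure, reset users to 0 and return the error */
    }
    pthread_mutex_unlock(&lock);
    return err;
}

/* Last caller removes the hooks again. */
static void hooks_put(void)
{
    pthread_mutex_lock(&lock);
    if (users && --users == 0)
        printf("unregistering hooks\n"); /* nf_unregister_net_hooks() */
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    hooks_get();
    hooks_get();
    hooks_put();
    hooks_put();
    return 0;
}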
886
887static const struct nf_conntrack_l4proto * const builtin_l4proto[] = {
888 &nf_conntrack_l4proto_tcp4,
889 &nf_conntrack_l4proto_udp4,
890 &nf_conntrack_l4proto_icmp,
891#ifdef CONFIG_NF_CT_PROTO_DCCP
892 &nf_conntrack_l4proto_dccp4,
893#endif
894#ifdef CONFIG_NF_CT_PROTO_SCTP
895 &nf_conntrack_l4proto_sctp4,
896#endif
897#ifdef CONFIG_NF_CT_PROTO_UDPLITE
898 &nf_conntrack_l4proto_udplite4,
899#endif
900#if IS_ENABLED(CONFIG_IPV6)
901 &nf_conntrack_l4proto_tcp6,
902 &nf_conntrack_l4proto_udp6,
903 &nf_conntrack_l4proto_icmpv6,
904#ifdef CONFIG_NF_CT_PROTO_DCCP
905 &nf_conntrack_l4proto_dccp6,
906#endif
907#ifdef CONFIG_NF_CT_PROTO_SCTP
908 &nf_conntrack_l4proto_sctp6,
909#endif
910#ifdef CONFIG_NF_CT_PROTO_UDPLITE
911 &nf_conntrack_l4proto_udplite6,
912#endif
913#endif /* CONFIG_IPV6 */
914};
915
916int nf_conntrack_proto_init(void)
917{
918 int ret = 0;
919
920 ret = nf_register_sockopt(&so_getorigdst);
921 if (ret < 0)
922 return ret;
923
924#if IS_ENABLED(CONFIG_IPV6)
925 ret = nf_register_sockopt(&so_getorigdst6);
926 if (ret < 0)
927 goto cleanup_sockopt;
928#endif
929 ret = nf_ct_l4proto_register(builtin_l4proto,
930 ARRAY_SIZE(builtin_l4proto));
931 if (ret < 0)
932 goto cleanup_sockopt2;
933
934 return ret;
935cleanup_sockopt2:
936 nf_unregister_sockopt(&so_getorigdst);
937#if IS_ENABLED(CONFIG_IPV6)
938cleanup_sockopt:
939 nf_unregister_sockopt(&so_getorigdst6);
940#endif
941 return ret;
942}
943
944void nf_conntrack_proto_fini(void)
945{
946 unsigned int i;
947
948 nf_unregister_sockopt(&so_getorigdst);
949#if IS_ENABLED(CONFIG_IPV6)
950 nf_unregister_sockopt(&so_getorigdst6);
951#endif
952 /* No need to call nf_ct_l4proto_unregister(), the register
953 * tables are free'd here anyway.
954 */
955 for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
956 kfree(nf_ct_protos[i]);
957}
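nf_conntrack_proto_init() above uses the usual goto-based unwinding: each step that succeeded has a matching cleanup label, and a later failure jumps to the label that undoes everything registered so far. The same shape as a standalone sketch with dummy steps:

#include <stdio.h>

static int step_a(void) { return 0; }
static int step_b(void) { return 0; }
static int step_c(void) { return -1; }   /* pretend the last step fails */
static void undo_a(void) { puts("undo a"); }
static void undo_b(void) { puts("undo b"); }

/* Run the steps in order; on failure, unwind only what was completed. */
static int init(void)
{
    int ret;

    ret = step_a();
    if (ret < 0)
        return ret;
    ret = step_b();
    if (ret < 0)
        goto cleanup_a;
    ret = step_c();
    if (ret < 0)
        goto cleanup_b;
    return 0;

cleanup_b:
    undo_b();
cleanup_a:
    undo_a();
    return ret;
}

int main(void) { return init() ? 1 : 0; }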
958
568int nf_conntrack_proto_pernet_init(struct net *net) 959int nf_conntrack_proto_pernet_init(struct net *net)
569{ 960{
570 int err; 961 int err;
@@ -581,6 +972,14 @@ int nf_conntrack_proto_pernet_init(struct net *net)
581 if (err < 0) 972 if (err < 0)
582 return err; 973 return err;
583 974
975 err = nf_ct_l4proto_pernet_register(net, builtin_l4proto,
976 ARRAY_SIZE(builtin_l4proto));
977 if (err < 0) {
978 nf_ct_l4proto_unregister_sysctl(net, pn,
979 &nf_conntrack_l4proto_generic);
980 return err;
981 }
982
584 pn->users++; 983 pn->users++;
585 return 0; 984 return 0;
586} 985}
@@ -590,25 +989,19 @@ void nf_conntrack_proto_pernet_fini(struct net *net)
590 struct nf_proto_net *pn = nf_ct_l4proto_net(net, 989 struct nf_proto_net *pn = nf_ct_l4proto_net(net,
591 &nf_conntrack_l4proto_generic); 990 &nf_conntrack_l4proto_generic);
592 991
992 nf_ct_l4proto_pernet_unregister(net, builtin_l4proto,
993 ARRAY_SIZE(builtin_l4proto));
593 pn->users--; 994 pn->users--;
594 nf_ct_l4proto_unregister_sysctl(net, 995 nf_ct_l4proto_unregister_sysctl(net,
595 pn, 996 pn,
596 &nf_conntrack_l4proto_generic); 997 &nf_conntrack_l4proto_generic);
597} 998}
598 999
599int nf_conntrack_proto_init(void)
600{
601 unsigned int i;
602 for (i = 0; i < NFPROTO_NUMPROTO; i++)
603 rcu_assign_pointer(nf_ct_l3protos[i],
604 &nf_conntrack_l3proto_generic);
605 return 0;
606}
607 1000
608void nf_conntrack_proto_fini(void) 1001module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
609{ 1002 &nf_conntrack_htable_size, 0600);
610 unsigned int i; 1003
611 /* free l3proto protocol tables */ 1004MODULE_ALIAS("ip_conntrack");
612 for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) 1005MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
613 kfree(nf_ct_protos[i]); 1006MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
614} 1007MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index abe647d5b8c6..8c58f96b59e7 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -23,6 +23,7 @@
23#include <net/netfilter/nf_conntrack.h> 23#include <net/netfilter/nf_conntrack.h>
24#include <net/netfilter/nf_conntrack_l4proto.h> 24#include <net/netfilter/nf_conntrack_l4proto.h>
25#include <net/netfilter/nf_conntrack_ecache.h> 25#include <net/netfilter/nf_conntrack_ecache.h>
26#include <net/netfilter/nf_conntrack_timeout.h>
26#include <net/netfilter/nf_log.h> 27#include <net/netfilter/nf_log.h>
27 28
28/* Timeouts are based on values from RFC4340: 29/* Timeouts are based on values from RFC4340:
@@ -243,14 +244,14 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
243 * We currently ignore Sync packets 244 * We currently ignore Sync packets
244 * 245 *
245 * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ 246 * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
246 sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
            247 sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
247 }, 248 },
248 [DCCP_PKT_SYNCACK] = { 249 [DCCP_PKT_SYNCACK] = {
249 /* 250 /*
250 * We currently ignore SyncAck packets 251 * We currently ignore SyncAck packets
251 * 252 *
252 * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ 253 * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
253 sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
            254 sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
254 }, 255 },
255 }, 256 },
256 [CT_DCCP_ROLE_SERVER] = { 257 [CT_DCCP_ROLE_SERVER] = {
@@ -371,14 +372,14 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
371 * We currently ignore Sync packets 372 * We currently ignore Sync packets
372 * 373 *
373 * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ 374 * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
374 sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
            375 sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
375 }, 376 },
376 [DCCP_PKT_SYNCACK] = { 377 [DCCP_PKT_SYNCACK] = {
377 /* 378 /*
378 * We currently ignore SyncAck packets 379 * We currently ignore SyncAck packets
379 * 380 *
380 * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ 381 * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
381 sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
            382 sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
382 }, 383 },
383 }, 384 },
384}; 385};
@@ -388,31 +389,8 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net)
388 return &net->ct.nf_ct_proto.dccp; 389 return &net->ct.nf_ct_proto.dccp;
389} 390}
390 391
391static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
392 struct net *net, struct nf_conntrack_tuple *tuple)
393{
394 struct dccp_hdr _hdr, *dh;
395
396 /* Actually only need first 4 bytes to get ports. */
397 dh = skb_header_pointer(skb, dataoff, 4, &_hdr);
398 if (dh == NULL)
399 return false;
400
401 tuple->src.u.dccp.port = dh->dccph_sport;
402 tuple->dst.u.dccp.port = dh->dccph_dport;
403 return true;
404}
405
406static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv,
407 const struct nf_conntrack_tuple *tuple)
408{
409 inv->src.u.dccp.port = tuple->dst.u.dccp.port;
410 inv->dst.u.dccp.port = tuple->src.u.dccp.port;
411 return true;
412}
413
414static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, 392static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
415 unsigned int dataoff, unsigned int *timeouts)
            393 unsigned int dataoff)
416{ 394{
417 struct net *net = nf_ct_net(ct); 395 struct net *net = nf_ct_net(ct);
418 struct nf_dccp_net *dn; 396 struct nf_dccp_net *dn;
@@ -460,19 +438,14 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh)
460 ntohl(dhack->dccph_ack_nr_low); 438 ntohl(dhack->dccph_ack_nr_low);
461} 439}
462 440
463static unsigned int *dccp_get_timeouts(struct net *net)
464{
465 return dccp_pernet(net)->dccp_timeout;
466}
467
468static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, 441static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
469 unsigned int dataoff, enum ip_conntrack_info ctinfo,
            442 unsigned int dataoff, enum ip_conntrack_info ctinfo)
470 unsigned int *timeouts)
471{ 443{
472 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 444 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
473 struct dccp_hdr _dh, *dh; 445 struct dccp_hdr _dh, *dh;
474 u_int8_t type, old_state, new_state; 446 u_int8_t type, old_state, new_state;
475 enum ct_dccp_roles role; 447 enum ct_dccp_roles role;
448 unsigned int *timeouts;
476 449
477 dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); 450 dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
478 BUG_ON(dh == NULL); 451 BUG_ON(dh == NULL);
@@ -546,6 +519,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
546 if (new_state != old_state) 519 if (new_state != old_state)
547 nf_conntrack_event_cache(IPCT_PROTOINFO, ct); 520 nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
548 521
522 timeouts = nf_ct_timeout_lookup(ct);
523 if (!timeouts)
524 timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout;
549 nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); 525 nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
550 526
551 return NF_ACCEPT; 527 return NF_ACCEPT;
@@ -864,11 +840,8 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net)
864const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { 840const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
865 .l3proto = AF_INET, 841 .l3proto = AF_INET,
866 .l4proto = IPPROTO_DCCP, 842 .l4proto = IPPROTO_DCCP,
867 .pkt_to_tuple = dccp_pkt_to_tuple,
868 .invert_tuple = dccp_invert_tuple,
869 .new = dccp_new, 843 .new = dccp_new,
870 .packet = dccp_packet, 844 .packet = dccp_packet,
871 .get_timeouts = dccp_get_timeouts,
872 .error = dccp_error, 845 .error = dccp_error,
873 .can_early_drop = dccp_can_early_drop, 846 .can_early_drop = dccp_can_early_drop,
874#ifdef CONFIG_NF_CONNTRACK_PROCFS 847#ifdef CONFIG_NF_CONNTRACK_PROCFS
@@ -900,11 +873,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
900const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { 873const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
901 .l3proto = AF_INET6, 874 .l3proto = AF_INET6,
902 .l4proto = IPPROTO_DCCP, 875 .l4proto = IPPROTO_DCCP,
903 .pkt_to_tuple = dccp_pkt_to_tuple,
904 .invert_tuple = dccp_invert_tuple,
905 .new = dccp_new, 876 .new = dccp_new,
906 .packet = dccp_packet, 877 .packet = dccp_packet,
907 .get_timeouts = dccp_get_timeouts,
908 .error = dccp_error, 878 .error = dccp_error,
909 .can_early_drop = dccp_can_early_drop, 879 .can_early_drop = dccp_can_early_drop,
910#ifdef CONFIG_NF_CONNTRACK_PROCFS 880#ifdef CONFIG_NF_CONNTRACK_PROCFS
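
The recurring change in this and the following protocol trackers is that ->packet() and ->new() no longer receive a timeouts argument; each handler now calls nf_ct_timeout_lookup(ct) for a per-connection policy and falls back to the per-netns defaults when none is attached. A small standalone sketch of that lookup-with-fallback idiom, with simplified stand-in types rather than the real nf_conn/nf_dccp_net structures:

#include <stddef.h>
#include <stdio.h>

enum { STATE_REQUEST, STATE_OPEN, STATE_MAX };

/* Per-namespace defaults, and an optional per-connection override. */
struct per_ns { unsigned int timeouts[STATE_MAX]; };
struct conn   { const unsigned int *timeout_override; /* may be NULL */ };

static const unsigned int *resolve_timeouts(const struct conn *ct,
					     const struct per_ns *ns)
{
	/* Prefer the policy attached to this connection, if any... */
	if (ct->timeout_override)
		return ct->timeout_override;
	/* ...otherwise fall back to the namespace-wide defaults. */
	return ns->timeouts;
}

int main(void)
{
	struct per_ns ns = { .timeouts = { 30, 300 } };
	unsigned int custom[STATE_MAX] = { 5, 60 };
	struct conn plain = { NULL }, tuned = { custom };

	printf("plain OPEN timeout: %u\n", resolve_timeouts(&plain, &ns)[STATE_OPEN]);
	printf("tuned OPEN timeout: %u\n", resolve_timeouts(&tuned, &ns)[STATE_OPEN]);
	return 0;
}
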
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 6c6896d21cd7..ac4a0b296dcd 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -11,6 +11,7 @@
11#include <linux/timer.h> 11#include <linux/timer.h>
12#include <linux/netfilter.h> 12#include <linux/netfilter.h>
13#include <net/netfilter/nf_conntrack_l4proto.h> 13#include <net/netfilter/nf_conntrack_l4proto.h>
14#include <net/netfilter/nf_conntrack_timeout.h>
14 15
15static const unsigned int nf_ct_generic_timeout = 600*HZ; 16static const unsigned int nf_ct_generic_timeout = 600*HZ;
16 17
@@ -41,34 +42,24 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb,
41 return true; 42 return true;
42} 43}
43 44
44static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
45 const struct nf_conntrack_tuple *orig)
46{
47 tuple->src.u.all = 0;
48 tuple->dst.u.all = 0;
49
50 return true;
51}
52
53static unsigned int *generic_get_timeouts(struct net *net)
54{
55 return &(generic_pernet(net)->timeout);
56}
57
58/* Returns verdict for packet, or -1 for invalid. */ 45/* Returns verdict for packet, or -1 for invalid. */
59static int generic_packet(struct nf_conn *ct, 46static int generic_packet(struct nf_conn *ct,
60 const struct sk_buff *skb, 47 const struct sk_buff *skb,
61 unsigned int dataoff, 48 unsigned int dataoff,
62 enum ip_conntrack_info ctinfo,
            49 enum ip_conntrack_info ctinfo)
63 unsigned int *timeout)
64{ 50{
51 const unsigned int *timeout = nf_ct_timeout_lookup(ct);
52
53 if (!timeout)
54 timeout = &generic_pernet(nf_ct_net(ct))->timeout;
55
65 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); 56 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
66 return NF_ACCEPT; 57 return NF_ACCEPT;
67} 58}
68 59
69/* Called when a new connection for this protocol found. */ 60/* Called when a new connection for this protocol found. */
70static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, 61static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
71 unsigned int dataoff, unsigned int *timeouts)
            62 unsigned int dataoff)
72{ 63{
73 bool ret; 64 bool ret;
74 65
@@ -87,8 +78,11 @@ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
87static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], 78static int generic_timeout_nlattr_to_obj(struct nlattr *tb[],
88 struct net *net, void *data) 79 struct net *net, void *data)
89{ 80{
90 unsigned int *timeout = data;
91 struct nf_generic_net *gn = generic_pernet(net); 81 struct nf_generic_net *gn = generic_pernet(net);
82 unsigned int *timeout = data;
83
84 if (!timeout)
85 timeout = &gn->timeout;
92 86
93 if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT]) 87 if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT])
94 *timeout = 88 *timeout =
@@ -168,9 +162,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
168 .l3proto = PF_UNSPEC, 162 .l3proto = PF_UNSPEC,
169 .l4proto = 255, 163 .l4proto = 255,
170 .pkt_to_tuple = generic_pkt_to_tuple, 164 .pkt_to_tuple = generic_pkt_to_tuple,
171 .invert_tuple = generic_invert_tuple,
172 .packet = generic_packet, 165 .packet = generic_packet,
173 .get_timeouts = generic_get_timeouts,
174 .new = generic_new, 166 .new = generic_new,
175#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) 167#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
176 .ctnl_timeout = { 168 .ctnl_timeout = {
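
The CTA_TIMEOUT_* attribute parsers are also adjusted so that a NULL data pointer means "write into the per-netns defaults" instead of into a caller-supplied timeout object. A compact sketch of that optional-output convention, with invented names for illustration:

#include <stddef.h>
#include <stdio.h>

struct ns_defaults { unsigned int timeout; };

/* 'attr' models an optional netlink attribute, 'out' an optional caller
 * buffer.  When no buffer is supplied, write straight into the per-namespace
 * default, mirroring the "if (!timeout) timeout = &gn->timeout;" hunks. */
static void set_timeout(const unsigned int *attr, unsigned int *out,
			struct ns_defaults *ns)
{
	if (!out)
		out = &ns->timeout;
	if (attr)
		*out = *attr;		/* value supplied by userspace */
	else
		*out = ns->timeout;	/* keep/apply the default */
}

int main(void)
{
	struct ns_defaults ns = { .timeout = 600 };
	unsigned int attr = 42, per_obj = 0;

	set_timeout(&attr, &per_obj, &ns);	/* fill a per-object policy */
	set_timeout(&attr, NULL, &ns);		/* no object: update the default */
	printf("object=%u default=%u\n", per_obj, ns.timeout);
	return 0;
}
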
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index d049ea5a3770..d1632252bf5b 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -39,6 +39,7 @@
39#include <net/netfilter/nf_conntrack_l4proto.h> 39#include <net/netfilter/nf_conntrack_l4proto.h>
40#include <net/netfilter/nf_conntrack_helper.h> 40#include <net/netfilter/nf_conntrack_helper.h>
41#include <net/netfilter/nf_conntrack_core.h> 41#include <net/netfilter/nf_conntrack_core.h>
42#include <net/netfilter/nf_conntrack_timeout.h>
42#include <linux/netfilter/nf_conntrack_proto_gre.h> 43#include <linux/netfilter/nf_conntrack_proto_gre.h>
43#include <linux/netfilter/nf_conntrack_pptp.h> 44#include <linux/netfilter/nf_conntrack_pptp.h>
44 45
@@ -179,15 +180,6 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy);
179 180
180/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ 181/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
181 182
182/* invert gre part of tuple */
183static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
184 const struct nf_conntrack_tuple *orig)
185{
186 tuple->dst.u.gre.key = orig->src.u.gre.key;
187 tuple->src.u.gre.key = orig->dst.u.gre.key;
188 return true;
189}
190
191/* gre hdr info to tuple */ 183/* gre hdr info to tuple */
192static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, 184static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
193 struct net *net, struct nf_conntrack_tuple *tuple) 185 struct net *net, struct nf_conntrack_tuple *tuple)
@@ -243,8 +235,7 @@ static unsigned int *gre_get_timeouts(struct net *net)
243static int gre_packet(struct nf_conn *ct, 235static int gre_packet(struct nf_conn *ct,
244 const struct sk_buff *skb, 236 const struct sk_buff *skb,
245 unsigned int dataoff, 237 unsigned int dataoff,
246 enum ip_conntrack_info ctinfo,
            238 enum ip_conntrack_info ctinfo)
247 unsigned int *timeouts)
248{ 239{
249 /* If we've seen traffic both ways, this is a GRE connection. 240 /* If we've seen traffic both ways, this is a GRE connection.
250 * Extend timeout. */ 241 * Extend timeout. */
@@ -263,8 +254,13 @@ static int gre_packet(struct nf_conn *ct,
263 254
264/* Called when a new connection for this protocol found. */ 255/* Called when a new connection for this protocol found. */
265static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb, 256static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb,
266 unsigned int dataoff, unsigned int *timeouts)
            257 unsigned int dataoff)
267{ 258{
259 unsigned int *timeouts = nf_ct_timeout_lookup(ct);
260
261 if (!timeouts)
262 timeouts = gre_get_timeouts(nf_ct_net(ct));
263
268 pr_debug(": "); 264 pr_debug(": ");
269 nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 265 nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
270 266
@@ -300,6 +296,8 @@ static int gre_timeout_nlattr_to_obj(struct nlattr *tb[],
300 unsigned int *timeouts = data; 296 unsigned int *timeouts = data;
301 struct netns_proto_gre *net_gre = gre_pernet(net); 297 struct netns_proto_gre *net_gre = gre_pernet(net);
302 298
299 if (!timeouts)
300 timeouts = gre_get_timeouts(net);
303 /* set default timeouts for GRE. */ 301 /* set default timeouts for GRE. */
304 timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED]; 302 timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED];
305 timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED]; 303 timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED];
@@ -356,11 +354,9 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
356 .l3proto = AF_INET, 354 .l3proto = AF_INET,
357 .l4proto = IPPROTO_GRE, 355 .l4proto = IPPROTO_GRE,
358 .pkt_to_tuple = gre_pkt_to_tuple, 356 .pkt_to_tuple = gre_pkt_to_tuple,
359 .invert_tuple = gre_invert_tuple,
360#ifdef CONFIG_NF_CONNTRACK_PROCFS 357#ifdef CONFIG_NF_CONNTRACK_PROCFS
361 .print_conntrack = gre_print_conntrack, 358 .print_conntrack = gre_print_conntrack,
362#endif 359#endif
363 .get_timeouts = gre_get_timeouts,
364 .packet = gre_packet, 360 .packet = gre_packet,
365 .new = gre_new, 361 .new = gre_new,
366 .destroy = gre_destroy, 362 .destroy = gre_destroy,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 5c15beafa711..036670b38282 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -19,6 +19,7 @@
19#include <net/netfilter/nf_conntrack_tuple.h> 19#include <net/netfilter/nf_conntrack_tuple.h>
20#include <net/netfilter/nf_conntrack_l4proto.h> 20#include <net/netfilter/nf_conntrack_l4proto.h>
21#include <net/netfilter/nf_conntrack_core.h> 21#include <net/netfilter/nf_conntrack_core.h>
22#include <net/netfilter/nf_conntrack_timeout.h>
22#include <net/netfilter/nf_conntrack_zones.h> 23#include <net/netfilter/nf_conntrack_zones.h>
23#include <net/netfilter/nf_log.h> 24#include <net/netfilter/nf_log.h>
24 25
@@ -80,12 +81,16 @@ static unsigned int *icmp_get_timeouts(struct net *net)
80static int icmp_packet(struct nf_conn *ct, 81static int icmp_packet(struct nf_conn *ct,
81 const struct sk_buff *skb, 82 const struct sk_buff *skb,
82 unsigned int dataoff, 83 unsigned int dataoff,
83 enum ip_conntrack_info ctinfo,
            84 enum ip_conntrack_info ctinfo)
84 unsigned int *timeout)
85{ 85{
86 /* Do not immediately delete the connection after the first 86 /* Do not immediately delete the connection after the first
87 successful reply to avoid excessive conntrackd traffic 87 successful reply to avoid excessive conntrackd traffic
88 and also to handle correctly ICMP echo reply duplicates. */ 88 and also to handle correctly ICMP echo reply duplicates. */
89 unsigned int *timeout = nf_ct_timeout_lookup(ct);
90
91 if (!timeout)
92 timeout = icmp_get_timeouts(nf_ct_net(ct));
93
89 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); 94 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
90 95
91 return NF_ACCEPT; 96 return NF_ACCEPT;
@@ -93,7 +98,7 @@ static int icmp_packet(struct nf_conn *ct,
93 98
94/* Called when a new connection for this protocol found. */ 99/* Called when a new connection for this protocol found. */
95static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, 100static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
96 unsigned int dataoff, unsigned int *timeouts)
            101 unsigned int dataoff)
97{ 102{
98 static const u_int8_t valid_new[] = { 103 static const u_int8_t valid_new[] = {
99 [ICMP_ECHO] = 1, 104 [ICMP_ECHO] = 1,
@@ -142,8 +147,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
142 147
143 /* Ordinarily, we'd expect the inverted tupleproto, but it's 148 /* Ordinarily, we'd expect the inverted tupleproto, but it's
144 been preserved inside the ICMP. */ 149 been preserved inside the ICMP. */
145 if (!nf_ct_invert_tuple(&innertuple, &origtuple,
146     &nf_conntrack_l3proto_ipv4, innerproto)) {
            150 if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
147 pr_debug("icmp_error_message: no match\n"); 151 pr_debug("icmp_error_message: no match\n");
148 return -NF_ACCEPT; 152 return -NF_ACCEPT;
149 } 153 }
@@ -281,9 +285,11 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
281 struct nf_icmp_net *in = icmp_pernet(net); 285 struct nf_icmp_net *in = icmp_pernet(net);
282 286
283 if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { 287 if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
288 if (!timeout)
289 timeout = &in->timeout;
284 *timeout = 290 *timeout =
285 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; 291 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
286 } else {
            292 } else if (timeout) {
287 /* Set default ICMP timeout. */ 293 /* Set default ICMP timeout. */
288 *timeout = in->timeout; 294 *timeout = in->timeout;
289 } 295 }
@@ -358,7 +364,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
358 .pkt_to_tuple = icmp_pkt_to_tuple, 364 .pkt_to_tuple = icmp_pkt_to_tuple,
359 .invert_tuple = icmp_invert_tuple, 365 .invert_tuple = icmp_invert_tuple,
360 .packet = icmp_packet, 366 .packet = icmp_packet,
361 .get_timeouts = icmp_get_timeouts,
362 .new = icmp_new, 367 .new = icmp_new,
363 .error = icmp_error, 368 .error = icmp_error,
364 .destroy = NULL, 369 .destroy = NULL,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 2548e2c8aedd..bed07b998a10 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -23,6 +23,7 @@
23#include <net/netfilter/nf_conntrack_tuple.h> 23#include <net/netfilter/nf_conntrack_tuple.h>
24#include <net/netfilter/nf_conntrack_l4proto.h> 24#include <net/netfilter/nf_conntrack_l4proto.h>
25#include <net/netfilter/nf_conntrack_core.h> 25#include <net/netfilter/nf_conntrack_core.h>
26#include <net/netfilter/nf_conntrack_timeout.h>
26#include <net/netfilter/nf_conntrack_zones.h> 27#include <net/netfilter/nf_conntrack_zones.h>
27#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> 28#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
28#include <net/netfilter/nf_log.h> 29#include <net/netfilter/nf_log.h>
@@ -93,9 +94,13 @@ static unsigned int *icmpv6_get_timeouts(struct net *net)
93static int icmpv6_packet(struct nf_conn *ct, 94static int icmpv6_packet(struct nf_conn *ct,
94 const struct sk_buff *skb, 95 const struct sk_buff *skb,
95 unsigned int dataoff, 96 unsigned int dataoff,
96 enum ip_conntrack_info ctinfo,
            97 enum ip_conntrack_info ctinfo)
97 unsigned int *timeout)
98{ 98{
99 unsigned int *timeout = nf_ct_timeout_lookup(ct);
100
101 if (!timeout)
102 timeout = icmpv6_get_timeouts(nf_ct_net(ct));
103
99 /* Do not immediately delete the connection after the first 104 /* Do not immediately delete the connection after the first
100 successful reply to avoid excessive conntrackd traffic 105 successful reply to avoid excessive conntrackd traffic
101 and also to handle correctly ICMP echo reply duplicates. */ 106 and also to handle correctly ICMP echo reply duplicates. */
@@ -106,7 +111,7 @@ static int icmpv6_packet(struct nf_conn *ct,
106 111
107/* Called when a new connection for this protocol found. */ 112/* Called when a new connection for this protocol found. */
108static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, 113static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
109 unsigned int dataoff, unsigned int *timeouts)
            114 unsigned int dataoff)
110{ 115{
111 static const u_int8_t valid_new[] = { 116 static const u_int8_t valid_new[] = {
112 [ICMPV6_ECHO_REQUEST - 128] = 1, 117 [ICMPV6_ECHO_REQUEST - 128] = 1,
@@ -152,8 +157,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
152 157
153 /* Ordinarily, we'd expect the inverted tupleproto, but it's 158 /* Ordinarily, we'd expect the inverted tupleproto, but it's
154 been preserved inside the ICMP. */ 159 been preserved inside the ICMP. */
155 if (!nf_ct_invert_tuple(&intuple, &origtuple,
156     &nf_conntrack_l3proto_ipv6, inproto)) {
            160 if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) {
157 pr_debug("icmpv6_error: Can't invert tuple\n"); 161 pr_debug("icmpv6_error: Can't invert tuple\n");
158 return -NF_ACCEPT; 162 return -NF_ACCEPT;
159 } 163 }
@@ -281,6 +285,8 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[],
281 unsigned int *timeout = data; 285 unsigned int *timeout = data;
282 struct nf_icmp_net *in = icmpv6_pernet(net); 286 struct nf_icmp_net *in = icmpv6_pernet(net);
283 287
288 if (!timeout)
289 timeout = icmpv6_get_timeouts(net);
284 if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { 290 if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) {
285 *timeout = 291 *timeout =
286 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; 292 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ;
@@ -359,7 +365,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
359 .pkt_to_tuple = icmpv6_pkt_to_tuple, 365 .pkt_to_tuple = icmpv6_pkt_to_tuple,
360 .invert_tuple = icmpv6_invert_tuple, 366 .invert_tuple = icmpv6_invert_tuple,
361 .packet = icmpv6_packet, 367 .packet = icmpv6_packet,
362 .get_timeouts = icmpv6_get_timeouts,
363 .new = icmpv6_new, 368 .new = icmpv6_new,
364 .error = icmpv6_error, 369 .error = icmpv6_error,
365#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 370#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index fb9a35d16069..8d1e085fc14a 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -28,6 +28,7 @@
28#include <net/netfilter/nf_conntrack.h> 28#include <net/netfilter/nf_conntrack.h>
29#include <net/netfilter/nf_conntrack_l4proto.h> 29#include <net/netfilter/nf_conntrack_l4proto.h>
30#include <net/netfilter/nf_conntrack_ecache.h> 30#include <net/netfilter/nf_conntrack_ecache.h>
31#include <net/netfilter/nf_conntrack_timeout.h>
31 32
32/* FIXME: Examine ipfilter's timeouts and conntrack transitions more 33/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
33 closely. They're more complex. --RR 34 closely. They're more complex. --RR
@@ -150,30 +151,6 @@ static inline struct nf_sctp_net *sctp_pernet(struct net *net)
150 return &net->ct.nf_ct_proto.sctp; 151 return &net->ct.nf_ct_proto.sctp;
151} 152}
152 153
153static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
154 struct net *net, struct nf_conntrack_tuple *tuple)
155{
156 const struct sctphdr *hp;
157 struct sctphdr _hdr;
158
159 /* Actually only need first 4 bytes to get ports. */
160 hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
161 if (hp == NULL)
162 return false;
163
164 tuple->src.u.sctp.port = hp->source;
165 tuple->dst.u.sctp.port = hp->dest;
166 return true;
167}
168
169static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
170 const struct nf_conntrack_tuple *orig)
171{
172 tuple->src.u.sctp.port = orig->dst.u.sctp.port;
173 tuple->dst.u.sctp.port = orig->src.u.sctp.port;
174 return true;
175}
176
177#ifdef CONFIG_NF_CONNTRACK_PROCFS 154#ifdef CONFIG_NF_CONNTRACK_PROCFS
178/* Print out the private part of the conntrack. */ 155/* Print out the private part of the conntrack. */
179static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) 156static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -296,17 +273,11 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
296 return sctp_conntracks[dir][i][cur_state]; 273 return sctp_conntracks[dir][i][cur_state];
297} 274}
298 275
299static unsigned int *sctp_get_timeouts(struct net *net)
300{
301 return sctp_pernet(net)->timeouts;
302}
303
304/* Returns verdict for packet, or -NF_ACCEPT for invalid. */ 276/* Returns verdict for packet, or -NF_ACCEPT for invalid. */
305static int sctp_packet(struct nf_conn *ct, 277static int sctp_packet(struct nf_conn *ct,
306 const struct sk_buff *skb, 278 const struct sk_buff *skb,
307 unsigned int dataoff, 279 unsigned int dataoff,
308 enum ip_conntrack_info ctinfo,
            280 enum ip_conntrack_info ctinfo)
309 unsigned int *timeouts)
310{ 281{
311 enum sctp_conntrack new_state, old_state; 282 enum sctp_conntrack new_state, old_state;
312 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 283 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -315,6 +286,7 @@ static int sctp_packet(struct nf_conn *ct,
315 const struct sctp_chunkhdr *sch; 286 const struct sctp_chunkhdr *sch;
316 struct sctp_chunkhdr _sch; 287 struct sctp_chunkhdr _sch;
317 u_int32_t offset, count; 288 u_int32_t offset, count;
289 unsigned int *timeouts;
318 unsigned long map[256 / sizeof(unsigned long)] = { 0 }; 290 unsigned long map[256 / sizeof(unsigned long)] = { 0 };
319 291
320 sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); 292 sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
@@ -403,6 +375,10 @@ static int sctp_packet(struct nf_conn *ct,
403 } 375 }
404 spin_unlock_bh(&ct->lock); 376 spin_unlock_bh(&ct->lock);
405 377
378 timeouts = nf_ct_timeout_lookup(ct);
379 if (!timeouts)
380 timeouts = sctp_pernet(nf_ct_net(ct))->timeouts;
381
406 nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); 382 nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
407 383
408 if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && 384 if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
@@ -423,7 +399,7 @@ out:
423 399
424/* Called when a new connection for this protocol found. */ 400/* Called when a new connection for this protocol found. */
425static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, 401static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
426 unsigned int dataoff, unsigned int *timeouts)
            402 unsigned int dataoff)
427{ 403{
428 enum sctp_conntrack new_state; 404 enum sctp_conntrack new_state;
429 const struct sctphdr *sh; 405 const struct sctphdr *sh;
@@ -780,13 +756,10 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net)
780const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { 756const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
781 .l3proto = PF_INET, 757 .l3proto = PF_INET,
782 .l4proto = IPPROTO_SCTP, 758 .l4proto = IPPROTO_SCTP,
783 .pkt_to_tuple = sctp_pkt_to_tuple,
784 .invert_tuple = sctp_invert_tuple,
785#ifdef CONFIG_NF_CONNTRACK_PROCFS 759#ifdef CONFIG_NF_CONNTRACK_PROCFS
786 .print_conntrack = sctp_print_conntrack, 760 .print_conntrack = sctp_print_conntrack,
787#endif 761#endif
788 .packet = sctp_packet, 762 .packet = sctp_packet,
789 .get_timeouts = sctp_get_timeouts,
790 .new = sctp_new, 763 .new = sctp_new,
791 .error = sctp_error, 764 .error = sctp_error,
792 .can_early_drop = sctp_can_early_drop, 765 .can_early_drop = sctp_can_early_drop,
@@ -817,13 +790,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4);
817const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { 790const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
818 .l3proto = PF_INET6, 791 .l3proto = PF_INET6,
819 .l4proto = IPPROTO_SCTP, 792 .l4proto = IPPROTO_SCTP,
820 .pkt_to_tuple = sctp_pkt_to_tuple,
821 .invert_tuple = sctp_invert_tuple,
822#ifdef CONFIG_NF_CONNTRACK_PROCFS 793#ifdef CONFIG_NF_CONNTRACK_PROCFS
823 .print_conntrack = sctp_print_conntrack, 794 .print_conntrack = sctp_print_conntrack,
824#endif 795#endif
825 .packet = sctp_packet, 796 .packet = sctp_packet,
826 .get_timeouts = sctp_get_timeouts,
827 .new = sctp_new, 797 .new = sctp_new,
828 .error = sctp_error, 798 .error = sctp_error,
829 .can_early_drop = sctp_can_early_drop, 799 .can_early_drop = sctp_can_early_drop,
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 8e67910185a0..d80d322b9d8b 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -29,6 +29,7 @@
29#include <net/netfilter/nf_conntrack_ecache.h> 29#include <net/netfilter/nf_conntrack_ecache.h>
30#include <net/netfilter/nf_conntrack_seqadj.h> 30#include <net/netfilter/nf_conntrack_seqadj.h>
31#include <net/netfilter/nf_conntrack_synproxy.h> 31#include <net/netfilter/nf_conntrack_synproxy.h>
32#include <net/netfilter/nf_conntrack_timeout.h>
32#include <net/netfilter/nf_log.h> 33#include <net/netfilter/nf_log.h>
33#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> 34#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
34#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> 35#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -276,31 +277,6 @@ static inline struct nf_tcp_net *tcp_pernet(struct net *net)
276 return &net->ct.nf_ct_proto.tcp; 277 return &net->ct.nf_ct_proto.tcp;
277} 278}
278 279
279static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
280 struct net *net, struct nf_conntrack_tuple *tuple)
281{
282 const struct tcphdr *hp;
283 struct tcphdr _hdr;
284
285 /* Actually only need first 4 bytes to get ports. */
286 hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
287 if (hp == NULL)
288 return false;
289
290 tuple->src.u.tcp.port = hp->source;
291 tuple->dst.u.tcp.port = hp->dest;
292
293 return true;
294}
295
296static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
297 const struct nf_conntrack_tuple *orig)
298{
299 tuple->src.u.tcp.port = orig->dst.u.tcp.port;
300 tuple->dst.u.tcp.port = orig->src.u.tcp.port;
301 return true;
302}
303
304#ifdef CONFIG_NF_CONNTRACK_PROCFS 280#ifdef CONFIG_NF_CONNTRACK_PROCFS
305/* Print out the private part of the conntrack. */ 281/* Print out the private part of the conntrack. */
306static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) 282static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -793,27 +769,21 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
793 return NF_ACCEPT; 769 return NF_ACCEPT;
794} 770}
795 771
796static unsigned int *tcp_get_timeouts(struct net *net)
797{
798 return tcp_pernet(net)->timeouts;
799}
800
801/* Returns verdict for packet, or -1 for invalid. */ 772/* Returns verdict for packet, or -1 for invalid. */
802static int tcp_packet(struct nf_conn *ct, 773static int tcp_packet(struct nf_conn *ct,
803 const struct sk_buff *skb, 774 const struct sk_buff *skb,
804 unsigned int dataoff, 775 unsigned int dataoff,
805 enum ip_conntrack_info ctinfo,
            776 enum ip_conntrack_info ctinfo)
806 unsigned int *timeouts)
807{ 777{
808 struct net *net = nf_ct_net(ct); 778 struct net *net = nf_ct_net(ct);
809 struct nf_tcp_net *tn = tcp_pernet(net); 779 struct nf_tcp_net *tn = tcp_pernet(net);
810 struct nf_conntrack_tuple *tuple; 780 struct nf_conntrack_tuple *tuple;
811 enum tcp_conntrack new_state, old_state; 781 enum tcp_conntrack new_state, old_state;
782 unsigned int index, *timeouts;
812 enum ip_conntrack_dir dir; 783 enum ip_conntrack_dir dir;
813 const struct tcphdr *th; 784 const struct tcphdr *th;
814 struct tcphdr _tcph; 785 struct tcphdr _tcph;
815 unsigned long timeout; 786 unsigned long timeout;
816 unsigned int index;
817 787
818 th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); 788 th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
819 BUG_ON(th == NULL); 789 BUG_ON(th == NULL);
@@ -1046,6 +1016,10 @@ static int tcp_packet(struct nf_conn *ct,
1046 && new_state == TCP_CONNTRACK_FIN_WAIT) 1016 && new_state == TCP_CONNTRACK_FIN_WAIT)
1047 ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; 1017 ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
1048 1018
1019 timeouts = nf_ct_timeout_lookup(ct);
1020 if (!timeouts)
1021 timeouts = tn->timeouts;
1022
1049 if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && 1023 if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
1050 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) 1024 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1051 timeout = timeouts[TCP_CONNTRACK_RETRANS]; 1025 timeout = timeouts[TCP_CONNTRACK_RETRANS];
@@ -1095,7 +1069,7 @@ static int tcp_packet(struct nf_conn *ct,
1095 1069
1096/* Called when a new connection for this protocol found. */ 1070/* Called when a new connection for this protocol found. */
1097static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, 1071static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
1098 unsigned int dataoff, unsigned int *timeouts)
            1072 unsigned int dataoff)
1099{ 1073{
1100 enum tcp_conntrack new_state; 1074 enum tcp_conntrack new_state;
1101 const struct tcphdr *th; 1075 const struct tcphdr *th;
@@ -1313,10 +1287,12 @@ static unsigned int tcp_nlattr_tuple_size(void)
1313static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], 1287static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
1314 struct net *net, void *data) 1288 struct net *net, void *data)
1315{ 1289{
1316 unsigned int *timeouts = data;
1317 struct nf_tcp_net *tn = tcp_pernet(net); 1290 struct nf_tcp_net *tn = tcp_pernet(net);
1291 unsigned int *timeouts = data;
1318 int i; 1292 int i;
1319 1293
1294 if (!timeouts)
1295 timeouts = tn->timeouts;
1320 /* set default TCP timeouts. */ 1296 /* set default TCP timeouts. */
1321 for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++) 1297 for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
1322 timeouts[i] = tn->timeouts[i]; 1298 timeouts[i] = tn->timeouts[i];
@@ -1559,13 +1535,10 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
1559{ 1535{
1560 .l3proto = PF_INET, 1536 .l3proto = PF_INET,
1561 .l4proto = IPPROTO_TCP, 1537 .l4proto = IPPROTO_TCP,
1562 .pkt_to_tuple = tcp_pkt_to_tuple,
1563 .invert_tuple = tcp_invert_tuple,
1564#ifdef CONFIG_NF_CONNTRACK_PROCFS 1538#ifdef CONFIG_NF_CONNTRACK_PROCFS
1565 .print_conntrack = tcp_print_conntrack, 1539 .print_conntrack = tcp_print_conntrack,
1566#endif 1540#endif
1567 .packet = tcp_packet, 1541 .packet = tcp_packet,
1568 .get_timeouts = tcp_get_timeouts,
1569 .new = tcp_new, 1542 .new = tcp_new,
1570 .error = tcp_error, 1543 .error = tcp_error,
1571 .can_early_drop = tcp_can_early_drop, 1544 .can_early_drop = tcp_can_early_drop,
@@ -1597,13 +1570,10 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
1597{ 1570{
1598 .l3proto = PF_INET6, 1571 .l3proto = PF_INET6,
1599 .l4proto = IPPROTO_TCP, 1572 .l4proto = IPPROTO_TCP,
1600 .pkt_to_tuple = tcp_pkt_to_tuple,
1601 .invert_tuple = tcp_invert_tuple,
1602#ifdef CONFIG_NF_CONNTRACK_PROCFS 1573#ifdef CONFIG_NF_CONNTRACK_PROCFS
1603 .print_conntrack = tcp_print_conntrack, 1574 .print_conntrack = tcp_print_conntrack,
1604#endif 1575#endif
1605 .packet = tcp_packet, 1576 .packet = tcp_packet,
1606 .get_timeouts = tcp_get_timeouts,
1607 .new = tcp_new, 1577 .new = tcp_new,
1608 .error = tcp_error, 1578 .error = tcp_error,
1609 .can_early_drop = tcp_can_early_drop, 1579 .can_early_drop = tcp_can_early_drop,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index fe7243970aa4..7a1b8988a931 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -22,6 +22,7 @@
22#include <linux/netfilter_ipv6.h> 22#include <linux/netfilter_ipv6.h>
23#include <net/netfilter/nf_conntrack_l4proto.h> 23#include <net/netfilter/nf_conntrack_l4proto.h>
24#include <net/netfilter/nf_conntrack_ecache.h> 24#include <net/netfilter/nf_conntrack_ecache.h>
25#include <net/netfilter/nf_conntrack_timeout.h>
25#include <net/netfilter/nf_log.h> 26#include <net/netfilter/nf_log.h>
26#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> 27#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
27#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> 28#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -36,33 +37,6 @@ static inline struct nf_udp_net *udp_pernet(struct net *net)
36 return &net->ct.nf_ct_proto.udp; 37 return &net->ct.nf_ct_proto.udp;
37} 38}
38 39
39static bool udp_pkt_to_tuple(const struct sk_buff *skb,
40 unsigned int dataoff,
41 struct net *net,
42 struct nf_conntrack_tuple *tuple)
43{
44 const struct udphdr *hp;
45 struct udphdr _hdr;
46
47 /* Actually only need first 4 bytes to get ports. */
48 hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
49 if (hp == NULL)
50 return false;
51
52 tuple->src.u.udp.port = hp->source;
53 tuple->dst.u.udp.port = hp->dest;
54
55 return true;
56}
57
58static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple,
59 const struct nf_conntrack_tuple *orig)
60{
61 tuple->src.u.udp.port = orig->dst.u.udp.port;
62 tuple->dst.u.udp.port = orig->src.u.udp.port;
63 return true;
64}
65
66static unsigned int *udp_get_timeouts(struct net *net) 40static unsigned int *udp_get_timeouts(struct net *net)
67{ 41{
68 return udp_pernet(net)->timeouts; 42 return udp_pernet(net)->timeouts;
@@ -72,9 +46,14 @@ static unsigned int *udp_get_timeouts(struct net *net)
72static int udp_packet(struct nf_conn *ct, 46static int udp_packet(struct nf_conn *ct,
73 const struct sk_buff *skb, 47 const struct sk_buff *skb,
74 unsigned int dataoff, 48 unsigned int dataoff,
75 enum ip_conntrack_info ctinfo,
            49 enum ip_conntrack_info ctinfo)
76 unsigned int *timeouts)
77{ 50{
51 unsigned int *timeouts;
52
53 timeouts = nf_ct_timeout_lookup(ct);
54 if (!timeouts)
55 timeouts = udp_get_timeouts(nf_ct_net(ct));
56
78 /* If we've seen traffic both ways, this is some kind of UDP 57 /* If we've seen traffic both ways, this is some kind of UDP
79 stream. Extend timeout. */ 58 stream. Extend timeout. */
80 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { 59 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
@@ -92,7 +71,7 @@ static int udp_packet(struct nf_conn *ct,
92 71
93/* Called when a new connection for this protocol found. */ 72/* Called when a new connection for this protocol found. */
94static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, 73static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
95 unsigned int dataoff, unsigned int *timeouts)
            74 unsigned int dataoff)
96{ 75{
97 return true; 76 return true;
98} 77}
@@ -203,6 +182,9 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[],
203 unsigned int *timeouts = data; 182 unsigned int *timeouts = data;
204 struct nf_udp_net *un = udp_pernet(net); 183 struct nf_udp_net *un = udp_pernet(net);
205 184
185 if (!timeouts)
186 timeouts = un->timeouts;
187
206 /* set default timeouts for UDP. */ 188 /* set default timeouts for UDP. */
207 timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED]; 189 timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED];
208 timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED]; 190 timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED];
@@ -301,10 +283,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
301 .l3proto = PF_INET, 283 .l3proto = PF_INET,
302 .l4proto = IPPROTO_UDP, 284 .l4proto = IPPROTO_UDP,
303 .allow_clash = true, 285 .allow_clash = true,
304 .pkt_to_tuple = udp_pkt_to_tuple,
305 .invert_tuple = udp_invert_tuple,
306 .packet = udp_packet, 286 .packet = udp_packet,
307 .get_timeouts = udp_get_timeouts,
308 .new = udp_new, 287 .new = udp_new,
309 .error = udp_error, 288 .error = udp_error,
310#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 289#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -333,10 +312,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
333 .l3proto = PF_INET, 312 .l3proto = PF_INET,
334 .l4proto = IPPROTO_UDPLITE, 313 .l4proto = IPPROTO_UDPLITE,
335 .allow_clash = true, 314 .allow_clash = true,
336 .pkt_to_tuple = udp_pkt_to_tuple,
337 .invert_tuple = udp_invert_tuple,
338 .packet = udp_packet, 315 .packet = udp_packet,
339 .get_timeouts = udp_get_timeouts,
340 .new = udp_new, 316 .new = udp_new,
341 .error = udplite_error, 317 .error = udplite_error,
342#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 318#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -365,10 +341,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
365 .l3proto = PF_INET6, 341 .l3proto = PF_INET6,
366 .l4proto = IPPROTO_UDP, 342 .l4proto = IPPROTO_UDP,
367 .allow_clash = true, 343 .allow_clash = true,
368 .pkt_to_tuple = udp_pkt_to_tuple,
369 .invert_tuple = udp_invert_tuple,
370 .packet = udp_packet, 344 .packet = udp_packet,
371 .get_timeouts = udp_get_timeouts,
372 .new = udp_new, 345 .new = udp_new,
373 .error = udp_error, 346 .error = udp_error,
374#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 347#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -397,10 +370,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
397 .l3proto = PF_INET6, 370 .l3proto = PF_INET6,
398 .l4proto = IPPROTO_UDPLITE, 371 .l4proto = IPPROTO_UDPLITE,
399 .allow_clash = true, 372 .allow_clash = true,
400 .pkt_to_tuple = udp_pkt_to_tuple,
401 .invert_tuple = udp_invert_tuple,
402 .packet = udp_packet, 373 .packet = udp_packet,
403 .get_timeouts = udp_get_timeouts,
404 .new = udp_new, 374 .new = udp_new,
405 .error = udplite_error, 375 .error = udplite_error,
406#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 376#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -423,3 +393,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
423}; 393};
424EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6); 394EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
425#endif 395#endif
396#include <net/netfilter/nf_conntrack_timeout.h>
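
The deleted udp_pkt_to_tuple()/udp_invert_tuple() (and their TCP, SCTP and DCCP counterparts earlier in this diff) all did the same work, because these protocols keep the source and destination ports in the first four header bytes; the core can therefore fill and invert the layer-4 tuple generically. A standalone sketch of that generic port handling, using made-up helper names:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* For TCP, UDP, UDP-Lite, SCTP and DCCP the header starts with
 * "source port, destination port", so one helper covers them all. */
struct l4_tuple { uint16_t sport, dport; };	/* kept in network byte order */

static int generic_pkt_to_tuple(const uint8_t *l4hdr, size_t len,
				struct l4_tuple *t)
{
	uint16_t ports[2];

	if (len < sizeof(ports))
		return -1;
	memcpy(ports, l4hdr, sizeof(ports));	/* no byte swapping, like the kernel */
	t->sport = ports[0];
	t->dport = ports[1];
	return 0;
}

static void generic_invert_tuple(struct l4_tuple *inv,
				 const struct l4_tuple *orig)
{
	inv->sport = orig->dport;
	inv->dport = orig->sport;
}

int main(void)
{
	uint8_t udp_hdr[8] = { 0x00, 0x35, 0xc0, 0x01 };	/* ports 53 -> 49153 */
	struct l4_tuple t, r;

	if (generic_pkt_to_tuple(udp_hdr, sizeof(udp_hdr), &t) == 0) {
		generic_invert_tuple(&r, &t);
		printf("orig %u->%u, reply %u->%u\n",
		       ntohs(t.sport), ntohs(t.dport),
		       ntohs(r.sport), ntohs(r.dport));
	}
	return 0;
}
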
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index b642c0b2495c..13279f683da9 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -1,12 +1,4 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
            1// SPDX-License-Identifier: GPL-2.0
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h> 2#include <linux/types.h>
11#include <linux/netfilter.h> 3#include <linux/netfilter.h>
12#include <linux/slab.h> 4#include <linux/slab.h>
@@ -24,7 +16,6 @@
24 16
25#include <net/netfilter/nf_conntrack.h> 17#include <net/netfilter/nf_conntrack.h>
26#include <net/netfilter/nf_conntrack_core.h> 18#include <net/netfilter/nf_conntrack_core.h>
27#include <net/netfilter/nf_conntrack_l3proto.h>
28#include <net/netfilter/nf_conntrack_l4proto.h> 19#include <net/netfilter/nf_conntrack_l4proto.h>
29#include <net/netfilter/nf_conntrack_expect.h> 20#include <net/netfilter/nf_conntrack_expect.h>
30#include <net/netfilter/nf_conntrack_helper.h> 21#include <net/netfilter/nf_conntrack_helper.h>
@@ -33,15 +24,14 @@
33#include <net/netfilter/nf_conntrack_timestamp.h> 24#include <net/netfilter/nf_conntrack_timestamp.h>
34#include <linux/rculist_nulls.h> 25#include <linux/rculist_nulls.h>
35 26
36MODULE_LICENSE("GPL");
            27unsigned int nf_conntrack_net_id __read_mostly;
37 28
38#ifdef CONFIG_NF_CONNTRACK_PROCFS 29#ifdef CONFIG_NF_CONNTRACK_PROCFS
39void 30void
40print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, 31print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
41 const struct nf_conntrack_l3proto *l3proto,
42 const struct nf_conntrack_l4proto *l4proto) 32 const struct nf_conntrack_l4proto *l4proto)
43{ 33{
44 switch (l3proto->l3proto) {
            34 switch (tuple->src.l3num) {
45 case NFPROTO_IPV4: 35 case NFPROTO_IPV4:
46 seq_printf(s, "src=%pI4 dst=%pI4 ", 36 seq_printf(s, "src=%pI4 dst=%pI4 ",
47 &tuple->src.u3.ip, &tuple->dst.u3.ip); 37 &tuple->src.u3.ip, &tuple->dst.u3.ip);
@@ -282,7 +272,6 @@ static int ct_seq_show(struct seq_file *s, void *v)
282{ 272{
283 struct nf_conntrack_tuple_hash *hash = v; 273 struct nf_conntrack_tuple_hash *hash = v;
284 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); 274 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
285 const struct nf_conntrack_l3proto *l3proto;
286 const struct nf_conntrack_l4proto *l4proto; 275 const struct nf_conntrack_l4proto *l4proto;
287 struct net *net = seq_file_net(s); 276 struct net *net = seq_file_net(s);
288 int ret = 0; 277 int ret = 0;
@@ -303,14 +292,12 @@ static int ct_seq_show(struct seq_file *s, void *v)
303 if (!net_eq(nf_ct_net(ct), net)) 292 if (!net_eq(nf_ct_net(ct), net))
304 goto release; 293 goto release;
305 294
306 l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
307 WARN_ON(!l3proto);
308 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 295 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
309 WARN_ON(!l4proto); 296 WARN_ON(!l4proto);
310 297
311 ret = -ENOSPC; 298 ret = -ENOSPC;
312 seq_printf(s, "%-8s %u %-8s %u ", 299 seq_printf(s, "%-8s %u %-8s %u ",
313 l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
            300 l3proto_name(nf_ct_l3num(ct)), nf_ct_l3num(ct),
314 l4proto_name(l4proto->l4proto), nf_ct_protonum(ct)); 301 l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
315 302
316 if (!test_bit(IPS_OFFLOAD_BIT, &ct->status)) 303 if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
@@ -320,7 +307,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
320 l4proto->print_conntrack(s, ct); 307 l4proto->print_conntrack(s, ct);
321 308
322 print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 309 print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
323 l3proto, l4proto);
            310 l4proto);
324 311
325 ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); 312 ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG);
326 313
@@ -333,8 +320,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
333 if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) 320 if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
334 seq_puts(s, "[UNREPLIED] "); 321 seq_puts(s, "[UNREPLIED] ");
335 322
336 print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
337     l3proto, l4proto);
            323 print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l4proto);
338 324
339 ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); 325 ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL);
340 326
@@ -680,6 +666,8 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
680static struct pernet_operations nf_conntrack_net_ops = { 666static struct pernet_operations nf_conntrack_net_ops = {
681 .init = nf_conntrack_pernet_init, 667 .init = nf_conntrack_pernet_init,
682 .exit_batch = nf_conntrack_pernet_exit, 668 .exit_batch = nf_conntrack_pernet_exit,
669 .id = &nf_conntrack_net_id,
670 .size = sizeof(struct nf_conntrack_net),
683}; 671};
684 672
685static int __init nf_conntrack_standalone_init(void) 673static int __init nf_conntrack_standalone_init(void)
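
With the l3proto object gone, print_tuple() above branches on the address family recorded in the tuple itself (tuple->src.l3num). A simplified userspace equivalent, using inet_ntop() instead of the kernel's seq_printf() format specifiers:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

/* Minimal tuple carrying its own address family, like src.l3num does. */
struct tuple {
	int family;			/* AF_INET or AF_INET6 */
	union { struct in_addr v4; struct in6_addr v6; } src, dst;
};

static void print_tuple(const struct tuple *t)
{
	char s[INET6_ADDRSTRLEN], d[INET6_ADDRSTRLEN];

	switch (t->family) {
	case AF_INET:
		inet_ntop(AF_INET, &t->src.v4, s, sizeof(s));
		inet_ntop(AF_INET, &t->dst.v4, d, sizeof(d));
		break;
	case AF_INET6:
		inet_ntop(AF_INET6, &t->src.v6, s, sizeof(s));
		inet_ntop(AF_INET6, &t->dst.v6, d, sizeof(d));
		break;
	default:
		return;			/* unknown family: print nothing */
	}
	printf("src=%s dst=%s\n", s, d);
}

int main(void)
{
	struct tuple t = { .family = AF_INET };

	inet_pton(AF_INET, "192.0.2.1", &t.src.v4);
	inet_pton(AF_INET, "198.51.100.7", &t.dst.v4);
	print_tuple(&t);
	return 0;
}
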
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
index 46aee65f339b..91fbd183da2d 100644
--- a/net/netfilter/nf_conntrack_timeout.c
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -24,13 +24,30 @@
24#include <net/netfilter/nf_conntrack_extend.h> 24#include <net/netfilter/nf_conntrack_extend.h>
25#include <net/netfilter/nf_conntrack_timeout.h> 25#include <net/netfilter/nf_conntrack_timeout.h>
26 26
27struct ctnl_timeout *
            27struct nf_ct_timeout *
28(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly; 28(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly;
29EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); 29EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
30 30
31void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly;
            31void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout) __read_mostly;
32EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook); 32EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook);
33 33
34static int untimeout(struct nf_conn *ct, void *timeout)
35{
36 struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
37
38 if (timeout_ext && (!timeout || timeout_ext->timeout == timeout))
39 RCU_INIT_POINTER(timeout_ext->timeout, NULL);
40
41 /* We are not intended to delete this conntrack. */
42 return 0;
43}
44
45void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout)
46{
47 nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0);
48}
49EXPORT_SYMBOL_GPL(nf_ct_untimeout);
50
34static const struct nf_ct_ext_type timeout_extend = { 51static const struct nf_ct_ext_type timeout_extend = {
35 .len = sizeof(struct nf_conn_timeout), 52 .len = sizeof(struct nf_conn_timeout),
36 .align = __alignof__(struct nf_conn_timeout), 53 .align = __alignof__(struct nf_conn_timeout),
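
The new nf_ct_untimeout() walks every conntrack entry and drops its reference to a timeout policy that is about to be removed (or to any policy when called with NULL), so the policy object can be freed safely. A minimal sketch of that detach-before-free walk, with illustrative types:

#include <stddef.h>
#include <stdio.h>

struct policy { const char *name; };
struct entry  { struct policy *timeout; };

/* Clear the reference when it points at the doomed policy, or always when
 * no specific policy is given; the entry itself is left alone. */
static void untimeout(struct entry *e, const struct policy *doomed)
{
	if (e->timeout && (!doomed || e->timeout == doomed))
		e->timeout = NULL;	/* entry falls back to the defaults */
}

int main(void)
{
	struct policy fast = { "fast" }, slow = { "slow" };
	struct entry table[] = { { &fast }, { &slow }, { NULL } };
	unsigned int i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		untimeout(&table[i], &fast);	/* 'fast' is being deleted */

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		printf("entry %u -> %s\n", i,
		       table[i].timeout ? table[i].timeout->name : "(default)");
	return 0;
}
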
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index eb0d1658ac05..d8125616edc7 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -107,11 +107,12 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
107 tcp->seen[1].td_maxwin = 0; 107 tcp->seen[1].td_maxwin = 0;
108} 108}
109 109
110#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ)
111#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ)
112
110static void flow_offload_fixup_ct_state(struct nf_conn *ct) 113static void flow_offload_fixup_ct_state(struct nf_conn *ct)
111{ 114{
112 const struct nf_conntrack_l4proto *l4proto; 115 const struct nf_conntrack_l4proto *l4proto;
113 struct net *net = nf_ct_net(ct);
114 unsigned int *timeouts;
115 unsigned int timeout; 116 unsigned int timeout;
116 int l4num; 117 int l4num;
117 118
@@ -123,14 +124,10 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct)
123 if (!l4proto) 124 if (!l4proto)
124 return; 125 return;
125 126
126 timeouts = l4proto->get_timeouts(net);
127 if (!timeouts)
128 return;
129
130 if (l4num == IPPROTO_TCP) 127 if (l4num == IPPROTO_TCP)
131 timeout = timeouts[TCP_CONNTRACK_ESTABLISHED];
            128 timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
132 else if (l4num == IPPROTO_UDP) 129 else if (l4num == IPPROTO_UDP)
133 timeout = timeouts[UDP_CT_REPLIED];
            130 timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
134 else 131 else
135 return; 132 return;
136 133
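
When a flow leaves the flowtable offload path, conntrack has to pick it up again; the hunk above gives it a fixed per-protocol grace period instead of reading the per-netns timeout tables. A tiny sketch of that selection, where the HZ value is only a placeholder:

#include <netinet/in.h>
#include <stdio.h>

#define HZ 100				/* illustrative tick rate only */
#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT	(120 * HZ)
#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT	(30 * HZ)

/* Only TCP and UDP flows are offloaded, so only they need a pickup value. */
static int pickup_timeout(int l4num, unsigned int *timeout)
{
	switch (l4num) {
	case IPPROTO_TCP: *timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT; return 0;
	case IPPROTO_UDP: *timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT; return 0;
	default:	  return -1;
	}
}

int main(void)
{
	unsigned int t;

	if (pickup_timeout(IPPROTO_TCP, &t) == 0)
		printf("tcp pickup: %u ticks\n", t);
	return 0;
}
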
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 426457047578..a61d6df6e5f6 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -424,6 +424,10 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write,
424 if (write) { 424 if (write) {
425 struct ctl_table tmp = *table; 425 struct ctl_table tmp = *table;
426 426
427 /* proc_dostring() can append to existing strings, so we need to
428 * initialize it as an empty string.
429 */
430 buf[0] = '\0';
427 tmp.data = buf; 431 tmp.data = buf;
428 r = proc_dostring(&tmp, write, buffer, lenp, ppos); 432 r = proc_dostring(&tmp, write, buffer, lenp, ppos);
429 if (r) 433 if (r)
@@ -442,14 +446,17 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write,
442 rcu_assign_pointer(net->nf.nf_loggers[tindex], logger); 446 rcu_assign_pointer(net->nf.nf_loggers[tindex], logger);
443 mutex_unlock(&nf_log_mutex); 447 mutex_unlock(&nf_log_mutex);
444 } else { 448 } else {
449 struct ctl_table tmp = *table;
450
451 tmp.data = buf;
445 mutex_lock(&nf_log_mutex); 452 mutex_lock(&nf_log_mutex);
446 logger = nft_log_dereference(net->nf.nf_loggers[tindex]); 453 logger = nft_log_dereference(net->nf.nf_loggers[tindex]);
447 if (!logger) 454 if (!logger)
448 table->data = "NONE";
            455 strlcpy(buf, "NONE", sizeof(buf));
449 else 456 else
450 table->data = logger->name;
            457 strlcpy(buf, logger->name, sizeof(buf));
451 r = proc_dostring(table, write, buffer, lenp, ppos);
452 mutex_unlock(&nf_log_mutex); 458 mutex_unlock(&nf_log_mutex);
459 r = proc_dostring(&tmp, write, buffer, lenp, ppos);
453 } 460 }
454 461
455 return r; 462 return r;
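
The nf_log_proc_dostring() changes address two issues visible above: the write path hands proc_dostring() a buffer that may be appended to, so it must start out as an empty string, and the read path copies the logger name into that stack buffer rather than exposing the shared pointer via table->data. A rough userspace sketch of both points, with a stand-in for proc_dostring() (the kernel uses strlcpy, the sketch uses snprintf/strncat):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

static const char *current_logger;	/* protected by a mutex in the kernel */

/* Stand-in for proc_dostring() on the write path: it appends to buf. */
static void dostring_append(char *buf, size_t len, const char *user_input)
{
	strncat(buf, user_input, len - strlen(buf) - 1);
}

int main(void)
{
	char buf[32], copy[32];

	/* Write path: without this, stale stack bytes would be prepended. */
	buf[0] = '\0';
	dostring_append(buf, sizeof(buf), "nfnetlink_log");
	printf("requested logger: %s\n", buf);

	/* Read path: copy under the lock, never hand out the shared string. */
	current_logger = buf;
	/* lock(); */
	snprintf(copy, sizeof(copy), "%s",
		 current_logger ? current_logger : "NONE");
	/* unlock(); the copy stays valid even if the logger changes now */
	printf("active logger: %s\n", copy);
	return 0;
}
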
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index dc61399e30be..a8c5c846aec1 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -132,9 +132,10 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
132} 132}
133EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); 133EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header);
134 134
135void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk)
            135void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m,
136 struct sock *sk)
136{ 137{
137 if (!sk || !sk_fullsock(sk))
            138 if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk)))
138 return; 139 return;
139 140
140 read_lock_bh(&sk->sk_callback_lock); 141 read_lock_bh(&sk->sk_callback_lock);
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 46f9df99d276..e2b196054dfc 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -28,7 +28,6 @@
28#include <net/netfilter/nf_nat_helper.h> 28#include <net/netfilter/nf_nat_helper.h>
29#include <net/netfilter/nf_conntrack_helper.h> 29#include <net/netfilter/nf_conntrack_helper.h>
30#include <net/netfilter/nf_conntrack_seqadj.h> 30#include <net/netfilter/nf_conntrack_seqadj.h>
31#include <net/netfilter/nf_conntrack_l3proto.h>
32#include <net/netfilter/nf_conntrack_zones.h> 31#include <net/netfilter/nf_conntrack_zones.h>
33#include <linux/netfilter/nf_nat.h> 32#include <linux/netfilter/nf_nat.h>
34 33
@@ -108,6 +107,7 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
108 struct flowi fl; 107 struct flowi fl;
109 unsigned int hh_len; 108 unsigned int hh_len;
110 struct dst_entry *dst; 109 struct dst_entry *dst;
110 struct sock *sk = skb->sk;
111 int err; 111 int err;
112 112
113 err = xfrm_decode_session(skb, &fl, family); 113 err = xfrm_decode_session(skb, &fl, family);
@@ -119,7 +119,10 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
119 dst = ((struct xfrm_dst *)dst)->route; 119 dst = ((struct xfrm_dst *)dst)->route;
120 dst_hold(dst); 120 dst_hold(dst);
121 121
122 dst = xfrm_lookup(net, dst, &fl, skb->sk, 0);
            122 if (sk && !net_eq(net, sock_net(sk)))
123 sk = NULL;
124
125 dst = xfrm_lookup(net, dst, &fl, sk, 0);
123 if (IS_ERR(dst)) 126 if (IS_ERR(dst))
124 return PTR_ERR(dst); 127 return PTR_ERR(dst);
125 128
@@ -739,12 +742,6 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);
739 742
740int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto) 743int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
741{ 744{
742 int err;
743
744 err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
745 if (err < 0)
746 return err;
747
748 mutex_lock(&nf_nat_proto_mutex); 745 mutex_lock(&nf_nat_proto_mutex);
749 RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP], 746 RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
750 &nf_nat_l4proto_tcp); 747 &nf_nat_l4proto_tcp);
@@ -777,7 +774,6 @@ void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
777 synchronize_rcu(); 774 synchronize_rcu();
778 775
779 nf_nat_l3proto_clean(l3proto->l3proto); 776 nf_nat_l3proto_clean(l3proto->l3proto);
780 nf_ct_l3proto_module_put(l3proto->l3proto);
781} 777}
782EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); 778EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
783 779
@@ -1060,7 +1056,7 @@ static int __init nf_nat_init(void)
1060 1056
1061 ret = nf_ct_extend_register(&nat_extend); 1057 ret = nf_ct_extend_register(&nat_extend);
1062 if (ret < 0) { 1058 if (ret < 0) {
1063 nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
            1059 kvfree(nf_nat_bysource);
1064 pr_err("Unable to register extension\n"); 1060 pr_err("Unable to register extension\n");
1065 return ret; 1061 return ret;
1066 } 1062 }
@@ -1098,7 +1094,7 @@ static void __exit nf_nat_cleanup(void)
1098 for (i = 0; i < NFPROTO_NUMPROTO; i++) 1094 for (i = 0; i < NFPROTO_NUMPROTO; i++)
1099 kfree(nf_nat_l4protos[i]); 1095 kfree(nf_nat_l4protos[i]);
1100 synchronize_net(); 1096 synchronize_net();
1101 nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
            1097 kvfree(nf_nat_bysource);
1102 unregister_pernet_subsys(&nat_net_ops); 1098 unregister_pernet_subsys(&nat_net_ops);
1103} 1099}
1104 1100
diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c
deleted file mode 100644
index 5ba5c7bef2f9..000000000000
--- a/net/netfilter/nf_osf.c
+++ /dev/null
@@ -1,218 +0,0 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2#include <linux/module.h>
3#include <linux/kernel.h>
4
5#include <linux/capability.h>
6#include <linux/if.h>
7#include <linux/inetdevice.h>
8#include <linux/ip.h>
9#include <linux/list.h>
10#include <linux/rculist.h>
11#include <linux/skbuff.h>
12#include <linux/slab.h>
13#include <linux/tcp.h>
14
15#include <net/ip.h>
16#include <net/tcp.h>
17
18#include <linux/netfilter/nfnetlink.h>
19#include <linux/netfilter/x_tables.h>
20#include <net/netfilter/nf_log.h>
21#include <linux/netfilter/nf_osf.h>
22
23static inline int nf_osf_ttl(const struct sk_buff *skb,
24 const struct nf_osf_info *info,
25 unsigned char f_ttl)
26{
27 const struct iphdr *ip = ip_hdr(skb);
28
29 if (info->flags & NF_OSF_TTL) {
30 if (info->ttl == NF_OSF_TTL_TRUE)
31 return ip->ttl == f_ttl;
32 if (info->ttl == NF_OSF_TTL_NOCHECK)
33 return 1;
34 else if (ip->ttl <= f_ttl)
35 return 1;
36 else {
37 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
38 int ret = 0;
39
40 for_ifa(in_dev) {
41 if (inet_ifa_match(ip->saddr, ifa)) {
42 ret = (ip->ttl == f_ttl);
43 break;
44 }
45 }
46 endfor_ifa(in_dev);
47
48 return ret;
49 }
50 }
51
52 return ip->ttl == f_ttl;
53}
54
55bool
56nf_osf_match(const struct sk_buff *skb, u_int8_t family,
57 int hooknum, struct net_device *in, struct net_device *out,
58 const struct nf_osf_info *info, struct net *net,
59 const struct list_head *nf_osf_fingers)
60{
61 const unsigned char *optp = NULL, *_optp = NULL;
62 unsigned int optsize = 0, check_WSS = 0;
63 int fmatch = FMATCH_WRONG, fcount = 0;
64 const struct iphdr *ip = ip_hdr(skb);
65 const struct nf_osf_user_finger *f;
66 unsigned char opts[MAX_IPOPTLEN];
67 const struct nf_osf_finger *kf;
68 u16 window, totlen, mss = 0;
69 const struct tcphdr *tcp;
70 struct tcphdr _tcph;
71 bool df;
72
73 tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
74 if (!tcp)
75 return false;
76
77 if (!tcp->syn)
78 return false;
79
80 totlen = ntohs(ip->tot_len);
81 df = ntohs(ip->frag_off) & IP_DF;
82 window = ntohs(tcp->window);
83
84 if (tcp->doff * 4 > sizeof(struct tcphdr)) {
85 optsize = tcp->doff * 4 - sizeof(struct tcphdr);
86
87 _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
88 sizeof(struct tcphdr), optsize, opts);
89 }
90
91 list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) {
92 int foptsize, optnum;
93
94 f = &kf->finger;
95
96 if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
97 continue;
98
99 optp = _optp;
100 fmatch = FMATCH_WRONG;
101
102 if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl))
103 continue;
104
105 /*
106 * Should not happen if userspace parser was written correctly.
107 */
108 if (f->wss.wc >= OSF_WSS_MAX)
109 continue;
110
111 /* Check options */
112
113 foptsize = 0;
114 for (optnum = 0; optnum < f->opt_num; ++optnum)
115 foptsize += f->opt[optnum].length;
116
117 if (foptsize > MAX_IPOPTLEN ||
118 optsize > MAX_IPOPTLEN ||
119 optsize != foptsize)
120 continue;
121
122 check_WSS = f->wss.wc;
123
124 for (optnum = 0; optnum < f->opt_num; ++optnum) {
125 if (f->opt[optnum].kind == (*optp)) {
126 __u32 len = f->opt[optnum].length;
127 const __u8 *optend = optp + len;
128
129 fmatch = FMATCH_OK;
130
131 switch (*optp) {
132 case OSFOPT_MSS:
133 mss = optp[3];
134 mss <<= 8;
135 mss |= optp[2];
136
137 mss = ntohs((__force __be16)mss);
138 break;
139 case OSFOPT_TS:
140 break;
141 }
142
143 optp = optend;
144 } else
145 fmatch = FMATCH_OPT_WRONG;
146
147 if (fmatch != FMATCH_OK)
148 break;
149 }
150
151 if (fmatch != FMATCH_OPT_WRONG) {
152 fmatch = FMATCH_WRONG;
153
154 switch (check_WSS) {
155 case OSF_WSS_PLAIN:
156 if (f->wss.val == 0 || window == f->wss.val)
157 fmatch = FMATCH_OK;
158 break;
159 case OSF_WSS_MSS:
160 /*
161 * Some smart modems decrease mangle MSS to
162 * SMART_MSS_2, so we check standard, decreased
163 * and the one provided in the fingerprint MSS
164 * values.
165 */
166#define SMART_MSS_1 1460
167#define SMART_MSS_2 1448
168 if (window == f->wss.val * mss ||
169 window == f->wss.val * SMART_MSS_1 ||
170 window == f->wss.val * SMART_MSS_2)
171 fmatch = FMATCH_OK;
172 break;
173 case OSF_WSS_MTU:
174 if (window == f->wss.val * (mss + 40) ||
175 window == f->wss.val * (SMART_MSS_1 + 40) ||
176 window == f->wss.val * (SMART_MSS_2 + 40))
177 fmatch = FMATCH_OK;
178 break;
179 case OSF_WSS_MODULO:
180 if ((window % f->wss.val) == 0)
181 fmatch = FMATCH_OK;
182 break;
183 }
184 }
185
186 if (fmatch != FMATCH_OK)
187 continue;
188
189 fcount++;
190
191 if (info->flags & NF_OSF_LOG)
192 nf_log_packet(net, family, hooknum, skb,
193 in, out, NULL,
194 "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
195 f->genre, f->version, f->subtype,
196 &ip->saddr, ntohs(tcp->source),
197 &ip->daddr, ntohs(tcp->dest),
198 f->ttl - ip->ttl);
199
200 if ((info->flags & NF_OSF_LOG) &&
201 info->loglevel == NF_OSF_LOGLEVEL_FIRST)
202 break;
203 }
204
205 if (!fcount && (info->flags & NF_OSF_LOG))
206 nf_log_packet(net, family, hooknum, skb, in, out, NULL,
207 "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
208 &ip->saddr, ntohs(tcp->source),
209 &ip->daddr, ntohs(tcp->dest));
210
211 if (fcount)
212 fmatch = FMATCH_OK;
213
214 return fmatch == FMATCH_OK;
215}
216EXPORT_SYMBOL_GPL(nf_osf_match);
217
218MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 896d4a36081d..1dca5683f59f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -14,6 +14,7 @@
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/netlink.h> 15#include <linux/netlink.h>
16#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
17#include <linux/rhashtable.h>
17#include <linux/netfilter.h> 18#include <linux/netfilter.h>
18#include <linux/netfilter/nfnetlink.h> 19#include <linux/netfilter/nfnetlink.h>
19#include <linux/netfilter/nf_tables.h> 20#include <linux/netfilter/nf_tables.h>
@@ -75,6 +76,7 @@ static void nft_ctx_init(struct nft_ctx *ctx,
75{ 76{
76 ctx->net = net; 77 ctx->net = net;
77 ctx->family = family; 78 ctx->family = family;
79 ctx->level = 0;
78 ctx->table = table; 80 ctx->table = table;
79 ctx->chain = chain; 81 ctx->chain = chain;
80 ctx->nla = nla; 82 ctx->nla = nla;
@@ -454,20 +456,59 @@ __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
454 return NULL; 456 return NULL;
455} 457}
456 458
459/*
460 * Loading a module requires dropping mutex that guards the
461 * transaction.
462 * We first need to abort any pending transactions as once
463 * mutex is unlocked a different client could start a new
464 * transaction. It must not see any 'future generation'
465 * changes * as these changes will never happen.
466 */
467#ifdef CONFIG_MODULES
468static int __nf_tables_abort(struct net *net);
469
470static void nft_request_module(struct net *net, const char *fmt, ...)
471{
472 char module_name[MODULE_NAME_LEN];
473 va_list args;
474 int ret;
475
476 __nf_tables_abort(net);
477
478 va_start(args, fmt);
479 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
480 va_end(args);
481 if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret))
482 return;
483
484 mutex_unlock(&net->nft.commit_mutex);
485 request_module("%s", module_name);
486 mutex_lock(&net->nft.commit_mutex);
487}
488#endif
489
490static void lockdep_nfnl_nft_mutex_not_held(void)
491{
492#ifdef CONFIG_PROVE_LOCKING
493 WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
494#endif
495}
496
457static const struct nft_chain_type * 497static const struct nft_chain_type *
458nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload) 498nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
499 u8 family, bool autoload)
459{ 500{
460 const struct nft_chain_type *type; 501 const struct nft_chain_type *type;
461 502
462 type = __nf_tables_chain_type_lookup(nla, family); 503 type = __nf_tables_chain_type_lookup(nla, family);
463 if (type != NULL) 504 if (type != NULL)
464 return type; 505 return type;
506
507 lockdep_nfnl_nft_mutex_not_held();
465#ifdef CONFIG_MODULES 508#ifdef CONFIG_MODULES
466 if (autoload) { 509 if (autoload) {
467 nfnl_unlock(NFNL_SUBSYS_NFTABLES); 510 nft_request_module(net, "nft-chain-%u-%.*s", family,
468 request_module("nft-chain-%u-%.*s", family, 511 nla_len(nla), (const char *)nla_data(nla));
469 nla_len(nla), (const char *)nla_data(nla));
470 nfnl_lock(NFNL_SUBSYS_NFTABLES);
471 type = __nf_tables_chain_type_lookup(nla, family); 512 type = __nf_tables_chain_type_lookup(nla, family);
472 if (type != NULL) 513 if (type != NULL)
473 return ERR_PTR(-EAGAIN); 514 return ERR_PTR(-EAGAIN);
@@ -771,6 +812,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
771 struct nft_ctx ctx; 812 struct nft_ctx ctx;
772 int err; 813 int err;
773 814
815 lockdep_assert_held(&net->nft.commit_mutex);
774 attr = nla[NFTA_TABLE_NAME]; 816 attr = nla[NFTA_TABLE_NAME];
775 table = nft_table_lookup(net, attr, family, genmask); 817 table = nft_table_lookup(net, attr, family, genmask);
776 if (IS_ERR(table)) { 818 if (IS_ERR(table)) {
@@ -1011,7 +1053,17 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
1011 return ERR_PTR(-ENOENT); 1053 return ERR_PTR(-ENOENT);
1012} 1054}
1013 1055
1014static struct nft_chain *nft_chain_lookup(struct nft_table *table, 1056static bool lockdep_commit_lock_is_held(struct net *net)
1057{
1058#ifdef CONFIG_PROVE_LOCKING
1059 return lockdep_is_held(&net->nft.commit_mutex);
1060#else
1061 return true;
1062#endif
1063}
1064
1065static struct nft_chain *nft_chain_lookup(struct net *net,
1066 struct nft_table *table,
1015 const struct nlattr *nla, u8 genmask) 1067 const struct nlattr *nla, u8 genmask)
1016{ 1068{
1017 char search[NFT_CHAIN_MAXNAMELEN + 1]; 1069 char search[NFT_CHAIN_MAXNAMELEN + 1];
@@ -1024,7 +1076,7 @@ static struct nft_chain *nft_chain_lookup(struct nft_table *table,
1024 nla_strlcpy(search, nla, sizeof(search)); 1076 nla_strlcpy(search, nla, sizeof(search));
1025 1077
1026 WARN_ON(!rcu_read_lock_held() && 1078 WARN_ON(!rcu_read_lock_held() &&
1027 !lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); 1079 !lockdep_commit_lock_is_held(net));
1028 1080
1029 chain = ERR_PTR(-ENOENT); 1081 chain = ERR_PTR(-ENOENT);
1030 rcu_read_lock(); 1082 rcu_read_lock();
@@ -1264,7 +1316,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
1264 return PTR_ERR(table); 1316 return PTR_ERR(table);
1265 } 1317 }
1266 1318
1267 chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); 1319 chain = nft_chain_lookup(net, table, nla[NFTA_CHAIN_NAME], genmask);
1268 if (IS_ERR(chain)) { 1320 if (IS_ERR(chain)) {
1269 NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); 1321 NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
1270 return PTR_ERR(chain); 1322 return PTR_ERR(chain);
@@ -1390,13 +1442,16 @@ struct nft_chain_hook {
1390static int nft_chain_parse_hook(struct net *net, 1442static int nft_chain_parse_hook(struct net *net,
1391 const struct nlattr * const nla[], 1443 const struct nlattr * const nla[],
1392 struct nft_chain_hook *hook, u8 family, 1444 struct nft_chain_hook *hook, u8 family,
1393 bool create) 1445 bool autoload)
1394{ 1446{
1395 struct nlattr *ha[NFTA_HOOK_MAX + 1]; 1447 struct nlattr *ha[NFTA_HOOK_MAX + 1];
1396 const struct nft_chain_type *type; 1448 const struct nft_chain_type *type;
1397 struct net_device *dev; 1449 struct net_device *dev;
1398 int err; 1450 int err;
1399 1451
1452 lockdep_assert_held(&net->nft.commit_mutex);
1453 lockdep_nfnl_nft_mutex_not_held();
1454
1400 err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], 1455 err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
1401 nft_hook_policy, NULL); 1456 nft_hook_policy, NULL);
1402 if (err < 0) 1457 if (err < 0)
@@ -1411,8 +1466,8 @@ static int nft_chain_parse_hook(struct net *net,
1411 1466
1412 type = chain_type[family][NFT_CHAIN_T_DEFAULT]; 1467 type = chain_type[family][NFT_CHAIN_T_DEFAULT];
1413 if (nla[NFTA_CHAIN_TYPE]) { 1468 if (nla[NFTA_CHAIN_TYPE]) {
1414 type = nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE], 1469 type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
1415 family, create); 1470 family, autoload);
1416 if (IS_ERR(type)) 1471 if (IS_ERR(type))
1417 return PTR_ERR(type); 1472 return PTR_ERR(type);
1418 } 1473 }
@@ -1479,7 +1534,7 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha
1479} 1534}
1480 1535
1481static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, 1536static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
1482 u8 policy, bool create) 1537 u8 policy)
1483{ 1538{
1484 const struct nlattr * const *nla = ctx->nla; 1539 const struct nlattr * const *nla = ctx->nla;
1485 struct nft_table *table = ctx->table; 1540 struct nft_table *table = ctx->table;
@@ -1497,7 +1552,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
1497 struct nft_chain_hook hook; 1552 struct nft_chain_hook hook;
1498 struct nf_hook_ops *ops; 1553 struct nf_hook_ops *ops;
1499 1554
1500 err = nft_chain_parse_hook(net, nla, &hook, family, create); 1555 err = nft_chain_parse_hook(net, nla, &hook, family, true);
1501 if (err < 0) 1556 if (err < 0)
1502 return err; 1557 return err;
1503 1558
@@ -1588,8 +1643,7 @@ err1:
1588 return err; 1643 return err;
1589} 1644}
1590 1645
1591static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, 1646static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy)
1592 bool create)
1593{ 1647{
1594 const struct nlattr * const *nla = ctx->nla; 1648 const struct nlattr * const *nla = ctx->nla;
1595 struct nft_table *table = ctx->table; 1649 struct nft_table *table = ctx->table;
@@ -1597,7 +1651,6 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
1597 struct nft_base_chain *basechain; 1651 struct nft_base_chain *basechain;
1598 struct nft_stats *stats = NULL; 1652 struct nft_stats *stats = NULL;
1599 struct nft_chain_hook hook; 1653 struct nft_chain_hook hook;
1600 const struct nlattr *name;
1601 struct nf_hook_ops *ops; 1654 struct nf_hook_ops *ops;
1602 struct nft_trans *trans; 1655 struct nft_trans *trans;
1603 int err; 1656 int err;
@@ -1607,7 +1660,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
1607 return -EBUSY; 1660 return -EBUSY;
1608 1661
1609 err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family, 1662 err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
1610 create); 1663 false);
1611 if (err < 0) 1664 if (err < 0)
1612 return err; 1665 return err;
1613 1666
@@ -1631,7 +1684,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
1631 nla[NFTA_CHAIN_NAME]) { 1684 nla[NFTA_CHAIN_NAME]) {
1632 struct nft_chain *chain2; 1685 struct nft_chain *chain2;
1633 1686
1634 chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); 1687 chain2 = nft_chain_lookup(ctx->net, table,
1688 nla[NFTA_CHAIN_NAME], genmask);
1635 if (!IS_ERR(chain2)) 1689 if (!IS_ERR(chain2))
1636 return -EEXIST; 1690 return -EEXIST;
1637 } 1691 }
@@ -1645,12 +1699,11 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
1645 return PTR_ERR(stats); 1699 return PTR_ERR(stats);
1646 } 1700 }
1647 1701
1702 err = -ENOMEM;
1648 trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN, 1703 trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN,
1649 sizeof(struct nft_trans_chain)); 1704 sizeof(struct nft_trans_chain));
1650 if (trans == NULL) { 1705 if (trans == NULL)
1651 free_percpu(stats); 1706 goto err;
1652 return -ENOMEM;
1653 }
1654 1707
1655 nft_trans_chain_stats(trans) = stats; 1708 nft_trans_chain_stats(trans) = stats;
1656 nft_trans_chain_update(trans) = true; 1709 nft_trans_chain_update(trans) = true;
@@ -1660,19 +1713,37 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
1660 else 1713 else
1661 nft_trans_chain_policy(trans) = -1; 1714 nft_trans_chain_policy(trans) = -1;
1662 1715
1663 name = nla[NFTA_CHAIN_NAME]; 1716 if (nla[NFTA_CHAIN_HANDLE] &&
1664 if (nla[NFTA_CHAIN_HANDLE] && name) { 1717 nla[NFTA_CHAIN_NAME]) {
1665 nft_trans_chain_name(trans) = 1718 struct nft_trans *tmp;
1666 nla_strdup(name, GFP_KERNEL); 1719 char *name;
1667 if (!nft_trans_chain_name(trans)) { 1720
1668 kfree(trans); 1721 err = -ENOMEM;
1669 free_percpu(stats); 1722 name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
1670 return -ENOMEM; 1723 if (!name)
1724 goto err;
1725
1726 err = -EEXIST;
1727 list_for_each_entry(tmp, &ctx->net->nft.commit_list, list) {
1728 if (tmp->msg_type == NFT_MSG_NEWCHAIN &&
1729 tmp->ctx.table == table &&
1730 nft_trans_chain_update(tmp) &&
1731 nft_trans_chain_name(tmp) &&
1732 strcmp(name, nft_trans_chain_name(tmp)) == 0) {
1733 kfree(name);
1734 goto err;
1735 }
1671 } 1736 }
1737
1738 nft_trans_chain_name(trans) = name;
1672 } 1739 }
1673 list_add_tail(&trans->list, &ctx->net->nft.commit_list); 1740 list_add_tail(&trans->list, &ctx->net->nft.commit_list);
1674 1741
1675 return 0; 1742 return 0;
1743err:
1744 free_percpu(stats);
1745 kfree(trans);
1746 return err;
1676} 1747}
1677 1748
1678static int nf_tables_newchain(struct net *net, struct sock *nlsk, 1749static int nf_tables_newchain(struct net *net, struct sock *nlsk,
@@ -1689,9 +1760,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
1689 u8 policy = NF_ACCEPT; 1760 u8 policy = NF_ACCEPT;
1690 struct nft_ctx ctx; 1761 struct nft_ctx ctx;
1691 u64 handle = 0; 1762 u64 handle = 0;
1692 bool create;
1693 1763
1694 create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; 1764 lockdep_assert_held(&net->nft.commit_mutex);
1695 1765
1696 table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); 1766 table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
1697 if (IS_ERR(table)) { 1767 if (IS_ERR(table)) {
@@ -1711,7 +1781,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
1711 } 1781 }
1712 attr = nla[NFTA_CHAIN_HANDLE]; 1782 attr = nla[NFTA_CHAIN_HANDLE];
1713 } else { 1783 } else {
1714 chain = nft_chain_lookup(table, attr, genmask); 1784 chain = nft_chain_lookup(net, table, attr, genmask);
1715 if (IS_ERR(chain)) { 1785 if (IS_ERR(chain)) {
1716 if (PTR_ERR(chain) != -ENOENT) { 1786 if (PTR_ERR(chain) != -ENOENT) {
1717 NL_SET_BAD_ATTR(extack, attr); 1787 NL_SET_BAD_ATTR(extack, attr);
@@ -1754,10 +1824,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
1754 if (nlh->nlmsg_flags & NLM_F_REPLACE) 1824 if (nlh->nlmsg_flags & NLM_F_REPLACE)
1755 return -EOPNOTSUPP; 1825 return -EOPNOTSUPP;
1756 1826
1757 return nf_tables_updchain(&ctx, genmask, policy, create); 1827 return nf_tables_updchain(&ctx, genmask, policy);
1758 } 1828 }
1759 1829
1760 return nf_tables_addchain(&ctx, family, genmask, policy, create); 1830 return nf_tables_addchain(&ctx, family, genmask, policy);
1761} 1831}
1762 1832
1763static int nf_tables_delchain(struct net *net, struct sock *nlsk, 1833static int nf_tables_delchain(struct net *net, struct sock *nlsk,
@@ -1789,7 +1859,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
1789 chain = nft_chain_lookup_byhandle(table, handle, genmask); 1859 chain = nft_chain_lookup_byhandle(table, handle, genmask);
1790 } else { 1860 } else {
1791 attr = nla[NFTA_CHAIN_NAME]; 1861 attr = nla[NFTA_CHAIN_NAME];
1792 chain = nft_chain_lookup(table, attr, genmask); 1862 chain = nft_chain_lookup(net, table, attr, genmask);
1793 } 1863 }
1794 if (IS_ERR(chain)) { 1864 if (IS_ERR(chain)) {
1795 NL_SET_BAD_ATTR(extack, attr); 1865 NL_SET_BAD_ATTR(extack, attr);
@@ -1874,7 +1944,8 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family,
1874 return NULL; 1944 return NULL;
1875} 1945}
1876 1946
1877static const struct nft_expr_type *nft_expr_type_get(u8 family, 1947static const struct nft_expr_type *nft_expr_type_get(struct net *net,
1948 u8 family,
1878 struct nlattr *nla) 1949 struct nlattr *nla)
1879{ 1950{
1880 const struct nft_expr_type *type; 1951 const struct nft_expr_type *type;
@@ -1886,19 +1957,16 @@ static const struct nft_expr_type *nft_expr_type_get(u8 family,
1886 if (type != NULL && try_module_get(type->owner)) 1957 if (type != NULL && try_module_get(type->owner))
1887 return type; 1958 return type;
1888 1959
1960 lockdep_nfnl_nft_mutex_not_held();
1889#ifdef CONFIG_MODULES 1961#ifdef CONFIG_MODULES
1890 if (type == NULL) { 1962 if (type == NULL) {
1891 nfnl_unlock(NFNL_SUBSYS_NFTABLES); 1963 nft_request_module(net, "nft-expr-%u-%.*s", family,
1892 request_module("nft-expr-%u-%.*s", family, 1964 nla_len(nla), (char *)nla_data(nla));
1893 nla_len(nla), (char *)nla_data(nla));
1894 nfnl_lock(NFNL_SUBSYS_NFTABLES);
1895 if (__nft_expr_type_get(family, nla)) 1965 if (__nft_expr_type_get(family, nla))
1896 return ERR_PTR(-EAGAIN); 1966 return ERR_PTR(-EAGAIN);
1897 1967
1898 nfnl_unlock(NFNL_SUBSYS_NFTABLES); 1968 nft_request_module(net, "nft-expr-%.*s",
1899 request_module("nft-expr-%.*s", 1969 nla_len(nla), (char *)nla_data(nla));
1900 nla_len(nla), (char *)nla_data(nla));
1901 nfnl_lock(NFNL_SUBSYS_NFTABLES);
1902 if (__nft_expr_type_get(family, nla)) 1970 if (__nft_expr_type_get(family, nla))
1903 return ERR_PTR(-EAGAIN); 1971 return ERR_PTR(-EAGAIN);
1904 } 1972 }
@@ -1967,7 +2035,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
1967 if (err < 0) 2035 if (err < 0)
1968 return err; 2036 return err;
1969 2037
1970 type = nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); 2038 type = nft_expr_type_get(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]);
1971 if (IS_ERR(type)) 2039 if (IS_ERR(type))
1972 return PTR_ERR(type); 2040 return PTR_ERR(type);
1973 2041
@@ -2254,6 +2322,39 @@ done:
2254 return skb->len; 2322 return skb->len;
2255} 2323}
2256 2324
2325static int nf_tables_dump_rules_start(struct netlink_callback *cb)
2326{
2327 const struct nlattr * const *nla = cb->data;
2328 struct nft_rule_dump_ctx *ctx = NULL;
2329
2330 if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
2331 ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
2332 if (!ctx)
2333 return -ENOMEM;
2334
2335 if (nla[NFTA_RULE_TABLE]) {
2336 ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
2337 GFP_ATOMIC);
2338 if (!ctx->table) {
2339 kfree(ctx);
2340 return -ENOMEM;
2341 }
2342 }
2343 if (nla[NFTA_RULE_CHAIN]) {
2344 ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
2345 GFP_ATOMIC);
2346 if (!ctx->chain) {
2347 kfree(ctx->table);
2348 kfree(ctx);
2349 return -ENOMEM;
2350 }
2351 }
2352 }
2353
2354 cb->data = ctx;
2355 return 0;
2356}
2357
2257static int nf_tables_dump_rules_done(struct netlink_callback *cb) 2358static int nf_tables_dump_rules_done(struct netlink_callback *cb)
2258{ 2359{
2259 struct nft_rule_dump_ctx *ctx = cb->data; 2360 struct nft_rule_dump_ctx *ctx = cb->data;
@@ -2283,38 +2384,13 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
2283 2384
2284 if (nlh->nlmsg_flags & NLM_F_DUMP) { 2385 if (nlh->nlmsg_flags & NLM_F_DUMP) {
2285 struct netlink_dump_control c = { 2386 struct netlink_dump_control c = {
2387 .start= nf_tables_dump_rules_start,
2286 .dump = nf_tables_dump_rules, 2388 .dump = nf_tables_dump_rules,
2287 .done = nf_tables_dump_rules_done, 2389 .done = nf_tables_dump_rules_done,
2288 .module = THIS_MODULE, 2390 .module = THIS_MODULE,
2391 .data = (void *)nla,
2289 }; 2392 };
2290 2393
2291 if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
2292 struct nft_rule_dump_ctx *ctx;
2293
2294 ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
2295 if (!ctx)
2296 return -ENOMEM;
2297
2298 if (nla[NFTA_RULE_TABLE]) {
2299 ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
2300 GFP_ATOMIC);
2301 if (!ctx->table) {
2302 kfree(ctx);
2303 return -ENOMEM;
2304 }
2305 }
2306 if (nla[NFTA_RULE_CHAIN]) {
2307 ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
2308 GFP_ATOMIC);
2309 if (!ctx->chain) {
2310 kfree(ctx->table);
2311 kfree(ctx);
2312 return -ENOMEM;
2313 }
2314 }
2315 c.data = ctx;
2316 }
2317
2318 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); 2394 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
2319 } 2395 }
2320 2396
@@ -2324,7 +2400,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
2324 return PTR_ERR(table); 2400 return PTR_ERR(table);
2325 } 2401 }
2326 2402
2327 chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); 2403 chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
2328 if (IS_ERR(chain)) { 2404 if (IS_ERR(chain)) {
2329 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); 2405 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
2330 return PTR_ERR(chain); 2406 return PTR_ERR(chain);
@@ -2358,6 +2434,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
2358{ 2434{
2359 struct nft_expr *expr; 2435 struct nft_expr *expr;
2360 2436
2437 lockdep_assert_held(&ctx->net->nft.commit_mutex);
2361 /* 2438 /*
2362 * Careful: some expressions might not be initialized in case this 2439 * Careful: some expressions might not be initialized in case this
2363 * is called on error from nf_tables_newrule(). 2440 * is called on error from nf_tables_newrule().
@@ -2384,6 +2461,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
2384 struct nft_rule *rule; 2461 struct nft_rule *rule;
2385 int err; 2462 int err;
2386 2463
2464 if (ctx->level == NFT_JUMP_STACK_SIZE)
2465 return -EMLINK;
2466
2387 list_for_each_entry(rule, &chain->rules, list) { 2467 list_for_each_entry(rule, &chain->rules, list) {
2388 if (!nft_is_active_next(ctx->net, rule)) 2468 if (!nft_is_active_next(ctx->net, rule))
2389 continue; 2469 continue;
@@ -2426,8 +2506,6 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
2426 2506
2427#define NFT_RULE_MAXEXPRS 128 2507#define NFT_RULE_MAXEXPRS 128
2428 2508
2429static struct nft_expr_info *info;
2430
2431static int nf_tables_newrule(struct net *net, struct sock *nlsk, 2509static int nf_tables_newrule(struct net *net, struct sock *nlsk,
2432 struct sk_buff *skb, const struct nlmsghdr *nlh, 2510 struct sk_buff *skb, const struct nlmsghdr *nlh,
2433 const struct nlattr * const nla[], 2511 const struct nlattr * const nla[],
@@ -2435,6 +2513,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
2435{ 2513{
2436 const struct nfgenmsg *nfmsg = nlmsg_data(nlh); 2514 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
2437 u8 genmask = nft_genmask_next(net); 2515 u8 genmask = nft_genmask_next(net);
2516 struct nft_expr_info *info = NULL;
2438 int family = nfmsg->nfgen_family; 2517 int family = nfmsg->nfgen_family;
2439 struct nft_table *table; 2518 struct nft_table *table;
2440 struct nft_chain *chain; 2519 struct nft_chain *chain;
@@ -2446,10 +2525,9 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
2446 struct nlattr *tmp; 2525 struct nlattr *tmp;
2447 unsigned int size, i, n, ulen = 0, usize = 0; 2526 unsigned int size, i, n, ulen = 0, usize = 0;
2448 int err, rem; 2527 int err, rem;
2449 bool create;
2450 u64 handle, pos_handle; 2528 u64 handle, pos_handle;
2451 2529
2452 create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; 2530 lockdep_assert_held(&net->nft.commit_mutex);
2453 2531
2454 table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); 2532 table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
2455 if (IS_ERR(table)) { 2533 if (IS_ERR(table)) {
@@ -2457,7 +2535,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
2457 return PTR_ERR(table); 2535 return PTR_ERR(table);
2458 } 2536 }
2459 2537
2460 chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); 2538 chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
2461 if (IS_ERR(chain)) { 2539 if (IS_ERR(chain)) {
2462 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); 2540 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
2463 return PTR_ERR(chain); 2541 return PTR_ERR(chain);
@@ -2480,7 +2558,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
2480 else 2558 else
2481 return -EOPNOTSUPP; 2559 return -EOPNOTSUPP;
2482 } else { 2560 } else {
2483 if (!create || nlh->nlmsg_flags & NLM_F_REPLACE) 2561 if (!(nlh->nlmsg_flags & NLM_F_CREATE) ||
2562 nlh->nlmsg_flags & NLM_F_REPLACE)
2484 return -EINVAL; 2563 return -EINVAL;
2485 handle = nf_tables_alloc_handle(table); 2564 handle = nf_tables_alloc_handle(table);
2486 2565
@@ -2505,6 +2584,12 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
2505 n = 0; 2584 n = 0;
2506 size = 0; 2585 size = 0;
2507 if (nla[NFTA_RULE_EXPRESSIONS]) { 2586 if (nla[NFTA_RULE_EXPRESSIONS]) {
2587 info = kvmalloc_array(NFT_RULE_MAXEXPRS,
2588 sizeof(struct nft_expr_info),
2589 GFP_KERNEL);
2590 if (!info)
2591 return -ENOMEM;
2592
2508 nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { 2593 nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
2509 err = -EINVAL; 2594 err = -EINVAL;
2510 if (nla_type(tmp) != NFTA_LIST_ELEM) 2595 if (nla_type(tmp) != NFTA_LIST_ELEM)
@@ -2597,6 +2682,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
2597 list_add_rcu(&rule->list, &chain->rules); 2682 list_add_rcu(&rule->list, &chain->rules);
2598 } 2683 }
2599 } 2684 }
2685 kvfree(info);
2600 chain->use++; 2686 chain->use++;
2601 2687
2602 if (net->nft.validate_state == NFT_VALIDATE_DO) 2688 if (net->nft.validate_state == NFT_VALIDATE_DO)
@@ -2610,6 +2696,7 @@ err1:
2610 if (info[i].ops != NULL) 2696 if (info[i].ops != NULL)
2611 module_put(info[i].ops->type->owner); 2697 module_put(info[i].ops->type->owner);
2612 } 2698 }
2699 kvfree(info);
2613 return err; 2700 return err;
2614} 2701}
2615 2702
@@ -2649,7 +2736,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
2649 } 2736 }
2650 2737
2651 if (nla[NFTA_RULE_CHAIN]) { 2738 if (nla[NFTA_RULE_CHAIN]) {
2652 chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); 2739 chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
2740 genmask);
2653 if (IS_ERR(chain)) { 2741 if (IS_ERR(chain)) {
2654 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); 2742 NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
2655 return PTR_ERR(chain); 2743 return PTR_ERR(chain);
@@ -2741,11 +2829,11 @@ nft_select_set_ops(const struct nft_ctx *ctx,
2741 const struct nft_set_type *type; 2829 const struct nft_set_type *type;
2742 u32 flags = 0; 2830 u32 flags = 0;
2743 2831
2832 lockdep_assert_held(&ctx->net->nft.commit_mutex);
2833 lockdep_nfnl_nft_mutex_not_held();
2744#ifdef CONFIG_MODULES 2834#ifdef CONFIG_MODULES
2745 if (list_empty(&nf_tables_set_types)) { 2835 if (list_empty(&nf_tables_set_types)) {
2746 nfnl_unlock(NFNL_SUBSYS_NFTABLES); 2836 nft_request_module(ctx->net, "nft-set");
2747 request_module("nft-set");
2748 nfnl_lock(NFNL_SUBSYS_NFTABLES);
2749 if (!list_empty(&nf_tables_set_types)) 2837 if (!list_empty(&nf_tables_set_types))
2750 return ERR_PTR(-EAGAIN); 2838 return ERR_PTR(-EAGAIN);
2751 } 2839 }
@@ -3161,6 +3249,18 @@ done:
3161 return skb->len; 3249 return skb->len;
3162} 3250}
3163 3251
3252static int nf_tables_dump_sets_start(struct netlink_callback *cb)
3253{
3254 struct nft_ctx *ctx_dump = NULL;
3255
3256 ctx_dump = kmemdup(cb->data, sizeof(*ctx_dump), GFP_ATOMIC);
3257 if (ctx_dump == NULL)
3258 return -ENOMEM;
3259
3260 cb->data = ctx_dump;
3261 return 0;
3262}
3263
3164static int nf_tables_dump_sets_done(struct netlink_callback *cb) 3264static int nf_tables_dump_sets_done(struct netlink_callback *cb)
3165{ 3265{
3166 kfree(cb->data); 3266 kfree(cb->data);
@@ -3188,18 +3288,12 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
3188 3288
3189 if (nlh->nlmsg_flags & NLM_F_DUMP) { 3289 if (nlh->nlmsg_flags & NLM_F_DUMP) {
3190 struct netlink_dump_control c = { 3290 struct netlink_dump_control c = {
3291 .start = nf_tables_dump_sets_start,
3191 .dump = nf_tables_dump_sets, 3292 .dump = nf_tables_dump_sets,
3192 .done = nf_tables_dump_sets_done, 3293 .done = nf_tables_dump_sets_done,
3294 .data = &ctx,
3193 .module = THIS_MODULE, 3295 .module = THIS_MODULE,
3194 }; 3296 };
3195 struct nft_ctx *ctx_dump;
3196
3197 ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC);
3198 if (ctx_dump == NULL)
3199 return -ENOMEM;
3200
3201 *ctx_dump = ctx;
3202 c.data = ctx_dump;
3203 3297
3204 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); 3298 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
3205 } 3299 }
@@ -3260,8 +3354,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
3260 struct nft_set *set; 3354 struct nft_set *set;
3261 struct nft_ctx ctx; 3355 struct nft_ctx ctx;
3262 char *name; 3356 char *name;
3263 unsigned int size; 3357 u64 size;
3264 bool create;
3265 u64 timeout; 3358 u64 timeout;
3266 u32 ktype, dtype, flags, policy, gc_int, objtype; 3359 u32 ktype, dtype, flags, policy, gc_int, objtype;
3267 struct nft_set_desc desc; 3360 struct nft_set_desc desc;
@@ -3362,8 +3455,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
3362 return err; 3455 return err;
3363 } 3456 }
3364 3457
3365 create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
3366
3367 table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask); 3458 table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
3368 if (IS_ERR(table)) { 3459 if (IS_ERR(table)) {
3369 NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); 3460 NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
@@ -3849,6 +3940,15 @@ nla_put_failure:
3849 return -ENOSPC; 3940 return -ENOSPC;
3850} 3941}
3851 3942
3943static int nf_tables_dump_set_start(struct netlink_callback *cb)
3944{
3945 struct nft_set_dump_ctx *dump_ctx = cb->data;
3946
3947 cb->data = kmemdup(dump_ctx, sizeof(*dump_ctx), GFP_ATOMIC);
3948
3949 return cb->data ? 0 : -ENOMEM;
3950}
3951
3852static int nf_tables_dump_set_done(struct netlink_callback *cb) 3952static int nf_tables_dump_set_done(struct netlink_callback *cb)
3853{ 3953{
3854 kfree(cb->data); 3954 kfree(cb->data);
@@ -3920,7 +4020,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3920 const struct nlattr *attr) 4020 const struct nlattr *attr)
3921{ 4021{
3922 struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; 4022 struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
3923 const struct nft_set_ext *ext;
3924 struct nft_data_desc desc; 4023 struct nft_data_desc desc;
3925 struct nft_set_elem elem; 4024 struct nft_set_elem elem;
3926 struct sk_buff *skb; 4025 struct sk_buff *skb;
@@ -3954,7 +4053,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3954 return PTR_ERR(priv); 4053 return PTR_ERR(priv);
3955 4054
3956 elem.priv = priv; 4055 elem.priv = priv;
3957 ext = nft_set_elem_ext(set, &elem);
3958 4056
3959 err = -ENOMEM; 4057 err = -ENOMEM;
3960 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); 4058 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
@@ -4002,20 +4100,17 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
4002 4100
4003 if (nlh->nlmsg_flags & NLM_F_DUMP) { 4101 if (nlh->nlmsg_flags & NLM_F_DUMP) {
4004 struct netlink_dump_control c = { 4102 struct netlink_dump_control c = {
4103 .start = nf_tables_dump_set_start,
4005 .dump = nf_tables_dump_set, 4104 .dump = nf_tables_dump_set,
4006 .done = nf_tables_dump_set_done, 4105 .done = nf_tables_dump_set_done,
4007 .module = THIS_MODULE, 4106 .module = THIS_MODULE,
4008 }; 4107 };
4009 struct nft_set_dump_ctx *dump_ctx; 4108 struct nft_set_dump_ctx dump_ctx = {
4010 4109 .set = set,
4011 dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC); 4110 .ctx = ctx,
4012 if (!dump_ctx) 4111 };
4013 return -ENOMEM;
4014
4015 dump_ctx->set = set;
4016 dump_ctx->ctx = ctx;
4017 4112
4018 c.data = dump_ctx; 4113 c.data = &dump_ctx;
4019 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); 4114 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
4020 } 4115 }
4021 4116
@@ -4778,7 +4873,8 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
4778 return NULL; 4873 return NULL;
4779} 4874}
4780 4875
4781static const struct nft_object_type *nft_obj_type_get(u32 objtype) 4876static const struct nft_object_type *
4877nft_obj_type_get(struct net *net, u32 objtype)
4782{ 4878{
4783 const struct nft_object_type *type; 4879 const struct nft_object_type *type;
4784 4880
@@ -4786,11 +4882,10 @@ static const struct nft_object_type *nft_obj_type_get(u32 objtype)
4786 if (type != NULL && try_module_get(type->owner)) 4882 if (type != NULL && try_module_get(type->owner))
4787 return type; 4883 return type;
4788 4884
4885 lockdep_nfnl_nft_mutex_not_held();
4789#ifdef CONFIG_MODULES 4886#ifdef CONFIG_MODULES
4790 if (type == NULL) { 4887 if (type == NULL) {
4791 nfnl_unlock(NFNL_SUBSYS_NFTABLES); 4888 nft_request_module(net, "nft-obj-%u", objtype);
4792 request_module("nft-obj-%u", objtype);
4793 nfnl_lock(NFNL_SUBSYS_NFTABLES);
4794 if (__nft_obj_type_get(objtype)) 4889 if (__nft_obj_type_get(objtype))
4795 return ERR_PTR(-EAGAIN); 4890 return ERR_PTR(-EAGAIN);
4796 } 4891 }
@@ -4842,7 +4937,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
4842 4937
4843 nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); 4938 nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
4844 4939
4845 type = nft_obj_type_get(objtype); 4940 type = nft_obj_type_get(net, objtype);
4846 if (IS_ERR(type)) 4941 if (IS_ERR(type))
4847 return PTR_ERR(type); 4942 return PTR_ERR(type);
4848 4943
@@ -4975,38 +5070,42 @@ done:
4975 return skb->len; 5070 return skb->len;
4976} 5071}
4977 5072
4978static int nf_tables_dump_obj_done(struct netlink_callback *cb) 5073static int nf_tables_dump_obj_start(struct netlink_callback *cb)
4979{ 5074{
4980 struct nft_obj_filter *filter = cb->data; 5075 const struct nlattr * const *nla = cb->data;
5076 struct nft_obj_filter *filter = NULL;
4981 5077
4982 if (filter) { 5078 if (nla[NFTA_OBJ_TABLE] || nla[NFTA_OBJ_TYPE]) {
4983 kfree(filter->table); 5079 filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
4984 kfree(filter); 5080 if (!filter)
5081 return -ENOMEM;
5082
5083 if (nla[NFTA_OBJ_TABLE]) {
5084 filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
5085 if (!filter->table) {
5086 kfree(filter);
5087 return -ENOMEM;
5088 }
5089 }
5090
5091 if (nla[NFTA_OBJ_TYPE])
5092 filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
4985 } 5093 }
4986 5094
5095 cb->data = filter;
4987 return 0; 5096 return 0;
4988} 5097}
4989 5098
4990static struct nft_obj_filter * 5099static int nf_tables_dump_obj_done(struct netlink_callback *cb)
4991nft_obj_filter_alloc(const struct nlattr * const nla[])
4992{ 5100{
4993 struct nft_obj_filter *filter; 5101 struct nft_obj_filter *filter = cb->data;
4994
4995 filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
4996 if (!filter)
4997 return ERR_PTR(-ENOMEM);
4998 5102
4999 if (nla[NFTA_OBJ_TABLE]) { 5103 if (filter) {
5000 filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC); 5104 kfree(filter->table);
5001 if (!filter->table) { 5105 kfree(filter);
5002 kfree(filter);
5003 return ERR_PTR(-ENOMEM);
5004 }
5005 } 5106 }
5006 if (nla[NFTA_OBJ_TYPE])
5007 filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
5008 5107
5009 return filter; 5108 return 0;
5010} 5109}
5011 5110
5012/* called with rcu_read_lock held */ 5111/* called with rcu_read_lock held */
@@ -5027,21 +5126,13 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
5027 5126
5028 if (nlh->nlmsg_flags & NLM_F_DUMP) { 5127 if (nlh->nlmsg_flags & NLM_F_DUMP) {
5029 struct netlink_dump_control c = { 5128 struct netlink_dump_control c = {
5129 .start = nf_tables_dump_obj_start,
5030 .dump = nf_tables_dump_obj, 5130 .dump = nf_tables_dump_obj,
5031 .done = nf_tables_dump_obj_done, 5131 .done = nf_tables_dump_obj_done,
5032 .module = THIS_MODULE, 5132 .module = THIS_MODULE,
5133 .data = (void *)nla,
5033 }; 5134 };
5034 5135
5035 if (nla[NFTA_OBJ_TABLE] ||
5036 nla[NFTA_OBJ_TYPE]) {
5037 struct nft_obj_filter *filter;
5038
5039 filter = nft_obj_filter_alloc(nla);
5040 if (IS_ERR(filter))
5041 return -ENOMEM;
5042
5043 c.data = filter;
5044 }
5045 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); 5136 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
5046 } 5137 }
5047 5138
@@ -5320,8 +5411,6 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
5320 flowtable->ops[i].priv = &flowtable->data; 5411 flowtable->ops[i].priv = &flowtable->data;
5321 flowtable->ops[i].hook = flowtable->data.type->hook; 5412 flowtable->ops[i].hook = flowtable->data.type->hook;
5322 flowtable->ops[i].dev = dev_array[i]; 5413 flowtable->ops[i].dev = dev_array[i];
5323 flowtable->dev_name[i] = kstrdup(dev_array[i]->name,
5324 GFP_KERNEL);
5325 } 5414 }
5326 5415
5327 return err; 5416 return err;
@@ -5338,7 +5427,8 @@ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
5338 return NULL; 5427 return NULL;
5339} 5428}
5340 5429
5341static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family) 5430static const struct nf_flowtable_type *
5431nft_flowtable_type_get(struct net *net, u8 family)
5342{ 5432{
5343 const struct nf_flowtable_type *type; 5433 const struct nf_flowtable_type *type;
5344 5434
@@ -5346,11 +5436,10 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
5346 if (type != NULL && try_module_get(type->owner)) 5436 if (type != NULL && try_module_get(type->owner))
5347 return type; 5437 return type;
5348 5438
5439 lockdep_nfnl_nft_mutex_not_held();
5349#ifdef CONFIG_MODULES 5440#ifdef CONFIG_MODULES
5350 if (type == NULL) { 5441 if (type == NULL) {
5351 nfnl_unlock(NFNL_SUBSYS_NFTABLES); 5442 nft_request_module(net, "nf-flowtable-%u", family);
5352 request_module("nf-flowtable-%u", family);
5353 nfnl_lock(NFNL_SUBSYS_NFTABLES);
5354 if (__nft_flowtable_type_get(family)) 5443 if (__nft_flowtable_type_get(family))
5355 return ERR_PTR(-EAGAIN); 5444 return ERR_PTR(-EAGAIN);
5356 } 5445 }
@@ -5430,7 +5519,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
5430 goto err1; 5519 goto err1;
5431 } 5520 }
5432 5521
5433 type = nft_flowtable_type_get(family); 5522 type = nft_flowtable_type_get(net, family);
5434 if (IS_ERR(type)) { 5523 if (IS_ERR(type)) {
5435 err = PTR_ERR(type); 5524 err = PTR_ERR(type);
5436 goto err2; 5525 goto err2;
@@ -5479,10 +5568,8 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
5479err6: 5568err6:
5480 i = flowtable->ops_len; 5569 i = flowtable->ops_len;
5481err5: 5570err5:
5482 for (k = i - 1; k >= 0; k--) { 5571 for (k = i - 1; k >= 0; k--)
5483 kfree(flowtable->dev_name[k]);
5484 nf_unregister_net_hook(net, &flowtable->ops[k]); 5572 nf_unregister_net_hook(net, &flowtable->ops[k]);
5485 }
5486 5573
5487 kfree(flowtable->ops); 5574 kfree(flowtable->ops);
5488err4: 5575err4:
@@ -5581,9 +5668,10 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
5581 goto nla_put_failure; 5668 goto nla_put_failure;
5582 5669
5583 for (i = 0; i < flowtable->ops_len; i++) { 5670 for (i = 0; i < flowtable->ops_len; i++) {
5584 if (flowtable->dev_name[i][0] && 5671 const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev);
5585 nla_put_string(skb, NFTA_DEVICE_NAME, 5672
5586 flowtable->dev_name[i])) 5673 if (dev &&
5674 nla_put_string(skb, NFTA_DEVICE_NAME, dev->name))
5587 goto nla_put_failure; 5675 goto nla_put_failure;
5588 } 5676 }
5589 nla_nest_end(skb, nest_devs); 5677 nla_nest_end(skb, nest_devs);
@@ -5650,37 +5738,39 @@ done:
5650 return skb->len; 5738 return skb->len;
5651} 5739}
5652 5740
5653static int nf_tables_dump_flowtable_done(struct netlink_callback *cb) 5741static int nf_tables_dump_flowtable_start(struct netlink_callback *cb)
5654{ 5742{
5655 struct nft_flowtable_filter *filter = cb->data; 5743 const struct nlattr * const *nla = cb->data;
5744 struct nft_flowtable_filter *filter = NULL;
5656 5745
5657 if (!filter) 5746 if (nla[NFTA_FLOWTABLE_TABLE]) {
5658 return 0; 5747 filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
5748 if (!filter)
5749 return -ENOMEM;
5659 5750
5660 kfree(filter->table); 5751 filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
5661 kfree(filter); 5752 GFP_ATOMIC);
5753 if (!filter->table) {
5754 kfree(filter);
5755 return -ENOMEM;
5756 }
5757 }
5662 5758
5759 cb->data = filter;
5663 return 0; 5760 return 0;
5664} 5761}
5665 5762
5666static struct nft_flowtable_filter * 5763static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
5667nft_flowtable_filter_alloc(const struct nlattr * const nla[])
5668{ 5764{
5669 struct nft_flowtable_filter *filter; 5765 struct nft_flowtable_filter *filter = cb->data;
5670 5766
5671 filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
5672 if (!filter) 5767 if (!filter)
5673 return ERR_PTR(-ENOMEM); 5768 return 0;
5674 5769
5675 if (nla[NFTA_FLOWTABLE_TABLE]) { 5770 kfree(filter->table);
5676 filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], 5771 kfree(filter);
5677 GFP_ATOMIC); 5772
5678 if (!filter->table) { 5773 return 0;
5679 kfree(filter);
5680 return ERR_PTR(-ENOMEM);
5681 }
5682 }
5683 return filter;
5684} 5774}
5685 5775
5686/* called with rcu_read_lock held */ 5776/* called with rcu_read_lock held */
@@ -5700,20 +5790,13 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
5700 5790
5701 if (nlh->nlmsg_flags & NLM_F_DUMP) { 5791 if (nlh->nlmsg_flags & NLM_F_DUMP) {
5702 struct netlink_dump_control c = { 5792 struct netlink_dump_control c = {
5793 .start = nf_tables_dump_flowtable_start,
5703 .dump = nf_tables_dump_flowtable, 5794 .dump = nf_tables_dump_flowtable,
5704 .done = nf_tables_dump_flowtable_done, 5795 .done = nf_tables_dump_flowtable_done,
5705 .module = THIS_MODULE, 5796 .module = THIS_MODULE,
5797 .data = (void *)nla,
5706 }; 5798 };
5707 5799
5708 if (nla[NFTA_FLOWTABLE_TABLE]) {
5709 struct nft_flowtable_filter *filter;
5710
5711 filter = nft_flowtable_filter_alloc(nla);
5712 if (IS_ERR(filter))
5713 return -ENOMEM;
5714
5715 c.data = filter;
5716 }
5717 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); 5800 return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
5718 } 5801 }
5719 5802
@@ -5783,6 +5866,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
5783 kfree(flowtable->name); 5866 kfree(flowtable->name);
5784 flowtable->data.type->free(&flowtable->data); 5867 flowtable->data.type->free(&flowtable->data);
5785 module_put(flowtable->data.type->owner); 5868 module_put(flowtable->data.type->owner);
5869 kfree(flowtable);
5786} 5870}
5787 5871
5788static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, 5872static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
@@ -5825,7 +5909,6 @@ static void nft_flowtable_event(unsigned long event, struct net_device *dev,
5825 continue; 5909 continue;
5826 5910
5827 nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]); 5911 nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]);
5828 flowtable->dev_name[i][0] = '\0';
5829 flowtable->ops[i].dev = NULL; 5912 flowtable->ops[i].dev = NULL;
5830 break; 5913 break;
5831 } 5914 }
@@ -5842,18 +5925,15 @@ static int nf_tables_flowtable_event(struct notifier_block *this,
5842 if (event != NETDEV_UNREGISTER) 5925 if (event != NETDEV_UNREGISTER)
5843 return 0; 5926 return 0;
5844 5927
5845 net = maybe_get_net(dev_net(dev)); 5928 net = dev_net(dev);
5846 if (!net) 5929 mutex_lock(&net->nft.commit_mutex);
5847 return 0;
5848
5849 nfnl_lock(NFNL_SUBSYS_NFTABLES);
5850 list_for_each_entry(table, &net->nft.tables, list) { 5930 list_for_each_entry(table, &net->nft.tables, list) {
5851 list_for_each_entry(flowtable, &table->flowtables, list) { 5931 list_for_each_entry(flowtable, &table->flowtables, list) {
5852 nft_flowtable_event(event, dev, flowtable); 5932 nft_flowtable_event(event, dev, flowtable);
5853 } 5933 }
5854 } 5934 }
5855 nfnl_unlock(NFNL_SUBSYS_NFTABLES); 5935 mutex_unlock(&net->nft.commit_mutex);
5856 put_net(net); 5936
5857 return NOTIFY_DONE; 5937 return NOTIFY_DONE;
5858} 5938}
5859 5939
@@ -6086,6 +6166,9 @@ static void nft_commit_release(struct nft_trans *trans)
6086 case NFT_MSG_DELTABLE: 6166 case NFT_MSG_DELTABLE:
6087 nf_tables_table_destroy(&trans->ctx); 6167 nf_tables_table_destroy(&trans->ctx);
6088 break; 6168 break;
6169 case NFT_MSG_NEWCHAIN:
6170 kfree(nft_trans_chain_name(trans));
6171 break;
6089 case NFT_MSG_DELCHAIN: 6172 case NFT_MSG_DELCHAIN:
6090 nf_tables_chain_destroy(&trans->ctx); 6173 nf_tables_chain_destroy(&trans->ctx);
6091 break; 6174 break;
@@ -6201,9 +6284,9 @@ static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *cha
6201 next_genbit = nft_gencursor_next(net); 6284 next_genbit = nft_gencursor_next(net);
6202 6285
6203 g0 = rcu_dereference_protected(chain->rules_gen_0, 6286 g0 = rcu_dereference_protected(chain->rules_gen_0,
6204 lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); 6287 lockdep_commit_lock_is_held(net));
6205 g1 = rcu_dereference_protected(chain->rules_gen_1, 6288 g1 = rcu_dereference_protected(chain->rules_gen_1,
6206 lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); 6289 lockdep_commit_lock_is_held(net));
6207 6290
6208 /* No changes to this chain? */ 6291 /* No changes to this chain? */
6209 if (chain->rules_next == NULL) { 6292 if (chain->rules_next == NULL) {
@@ -6315,13 +6398,15 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
6315 nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); 6398 nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
6316 break; 6399 break;
6317 case NFT_MSG_NEWCHAIN: 6400 case NFT_MSG_NEWCHAIN:
6318 if (nft_trans_chain_update(trans)) 6401 if (nft_trans_chain_update(trans)) {
6319 nft_chain_commit_update(trans); 6402 nft_chain_commit_update(trans);
6320 else 6403 nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
6404 /* trans destroyed after rcu grace period */
6405 } else {
6321 nft_clear(net, trans->ctx.chain); 6406 nft_clear(net, trans->ctx.chain);
6322 6407 nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
6323 nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); 6408 nft_trans_destroy(trans);
6324 nft_trans_destroy(trans); 6409 }
6325 break; 6410 break;
6326 case NFT_MSG_DELCHAIN: 6411 case NFT_MSG_DELCHAIN:
6327 nft_chain_del(trans->ctx.chain); 6412 nft_chain_del(trans->ctx.chain);
@@ -6411,6 +6496,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
6411 6496
6412 nf_tables_commit_release(net); 6497 nf_tables_commit_release(net);
6413 nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); 6498 nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
6499 mutex_unlock(&net->nft.commit_mutex);
6414 6500
6415 return 0; 6501 return 0;
6416} 6502}
@@ -6471,7 +6557,7 @@ static int __nf_tables_abort(struct net *net)
6471 case NFT_MSG_NEWCHAIN: 6557 case NFT_MSG_NEWCHAIN:
6472 if (nft_trans_chain_update(trans)) { 6558 if (nft_trans_chain_update(trans)) {
6473 free_percpu(nft_trans_chain_stats(trans)); 6559 free_percpu(nft_trans_chain_stats(trans));
6474 6560 kfree(nft_trans_chain_name(trans));
6475 nft_trans_destroy(trans); 6561 nft_trans_destroy(trans);
6476 } else { 6562 } else {
6477 trans->ctx.table->use--; 6563 trans->ctx.table->use--;
@@ -6562,12 +6648,25 @@ static void nf_tables_cleanup(struct net *net)
6562 6648
6563static int nf_tables_abort(struct net *net, struct sk_buff *skb) 6649static int nf_tables_abort(struct net *net, struct sk_buff *skb)
6564{ 6650{
6565 return __nf_tables_abort(net); 6651 int ret = __nf_tables_abort(net);
6652
6653 mutex_unlock(&net->nft.commit_mutex);
6654
6655 return ret;
6566} 6656}
6567 6657
6568static bool nf_tables_valid_genid(struct net *net, u32 genid) 6658static bool nf_tables_valid_genid(struct net *net, u32 genid)
6569{ 6659{
6570 return net->nft.base_seq == genid; 6660 bool genid_ok;
6661
6662 mutex_lock(&net->nft.commit_mutex);
6663
6664 genid_ok = genid == 0 || net->nft.base_seq == genid;
6665 if (!genid_ok)
6666 mutex_unlock(&net->nft.commit_mutex);
6667
6668 /* else, commit mutex has to be released by commit or abort function */
6669 return genid_ok;
6571} 6670}
6572 6671
6573static const struct nfnetlink_subsystem nf_tables_subsys = { 6672static const struct nfnetlink_subsystem nf_tables_subsys = {
@@ -6579,6 +6678,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
6579 .abort = nf_tables_abort, 6678 .abort = nf_tables_abort,
6580 .cleanup = nf_tables_cleanup, 6679 .cleanup = nf_tables_cleanup,
6581 .valid_genid = nf_tables_valid_genid, 6680 .valid_genid = nf_tables_valid_genid,
6681 .owner = THIS_MODULE,
6582}; 6682};
6583 6683
6584int nft_chain_validate_dependency(const struct nft_chain *chain, 6684int nft_chain_validate_dependency(const struct nft_chain *chain,
@@ -6837,13 +6937,6 @@ int nft_validate_register_store(const struct nft_ctx *ctx,
6837 err = nf_tables_check_loops(ctx, data->verdict.chain); 6937 err = nf_tables_check_loops(ctx, data->verdict.chain);
6838 if (err < 0) 6938 if (err < 0)
6839 return err; 6939 return err;
6840
6841 if (ctx->chain->level + 1 >
6842 data->verdict.chain->level) {
6843 if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE)
6844 return -EMLINK;
6845 data->verdict.chain->level = ctx->chain->level + 1;
6846 }
6847 } 6940 }
6848 6941
6849 return 0; 6942 return 0;
@@ -6905,8 +6998,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
6905 case NFT_GOTO: 6998 case NFT_GOTO:
6906 if (!tb[NFTA_VERDICT_CHAIN]) 6999 if (!tb[NFTA_VERDICT_CHAIN])
6907 return -EINVAL; 7000 return -EINVAL;
6908 chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN], 7001 chain = nft_chain_lookup(ctx->net, ctx->table,
6909 genmask); 7002 tb[NFTA_VERDICT_CHAIN], genmask);
6910 if (IS_ERR(chain)) 7003 if (IS_ERR(chain))
6911 return PTR_ERR(chain); 7004 return PTR_ERR(chain);
6912 if (nft_is_base_chain(chain)) 7005 if (nft_is_base_chain(chain))
@@ -7151,6 +7244,7 @@ static int __net_init nf_tables_init_net(struct net *net)
7151{ 7244{
7152 INIT_LIST_HEAD(&net->nft.tables); 7245 INIT_LIST_HEAD(&net->nft.tables);
7153 INIT_LIST_HEAD(&net->nft.commit_list); 7246 INIT_LIST_HEAD(&net->nft.commit_list);
7247 mutex_init(&net->nft.commit_mutex);
7154 net->nft.base_seq = 1; 7248 net->nft.base_seq = 1;
7155 net->nft.validate_state = NFT_VALIDATE_SKIP; 7249 net->nft.validate_state = NFT_VALIDATE_SKIP;
7156 7250
@@ -7159,11 +7253,11 @@ static int __net_init nf_tables_init_net(struct net *net)
7159 7253
7160static void __net_exit nf_tables_exit_net(struct net *net) 7254static void __net_exit nf_tables_exit_net(struct net *net)
7161{ 7255{
7162 nfnl_lock(NFNL_SUBSYS_NFTABLES); 7256 mutex_lock(&net->nft.commit_mutex);
7163 if (!list_empty(&net->nft.commit_list)) 7257 if (!list_empty(&net->nft.commit_list))
7164 __nf_tables_abort(net); 7258 __nf_tables_abort(net);
7165 __nft_release_tables(net); 7259 __nft_release_tables(net);
7166 nfnl_unlock(NFNL_SUBSYS_NFTABLES); 7260 mutex_unlock(&net->nft.commit_mutex);
7167 WARN_ON_ONCE(!list_empty(&net->nft.tables)); 7261 WARN_ON_ONCE(!list_empty(&net->nft.tables));
7168} 7262}
7169 7263
@@ -7176,31 +7270,36 @@ static int __init nf_tables_module_init(void)
7176{ 7270{
7177 int err; 7271 int err;
7178 7272
7179 nft_chain_filter_init(); 7273 err = register_pernet_subsys(&nf_tables_net_ops);
7274 if (err < 0)
7275 return err;
7180 7276
7181 info = kmalloc_array(NFT_RULE_MAXEXPRS, sizeof(struct nft_expr_info), 7277 err = nft_chain_filter_init();
7182 GFP_KERNEL); 7278 if (err < 0)
7183 if (info == NULL) {
7184 err = -ENOMEM;
7185 goto err1; 7279 goto err1;
7186 }
7187 7280
7188 err = nf_tables_core_module_init(); 7281 err = nf_tables_core_module_init();
7189 if (err < 0) 7282 if (err < 0)
7190 goto err2; 7283 goto err2;
7191 7284
7192 err = nfnetlink_subsys_register(&nf_tables_subsys); 7285 err = register_netdevice_notifier(&nf_tables_flowtable_notifier);
7193 if (err < 0) 7286 if (err < 0)
7194 goto err3; 7287 goto err3;
7195 7288
7196 register_netdevice_notifier(&nf_tables_flowtable_notifier); 7289 /* must be last */
7290 err = nfnetlink_subsys_register(&nf_tables_subsys);
7291 if (err < 0)
7292 goto err4;
7197 7293
7198 return register_pernet_subsys(&nf_tables_net_ops); 7294 return err;
7295err4:
7296 unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
7199err3: 7297err3:
7200 nf_tables_core_module_exit(); 7298 nf_tables_core_module_exit();
7201err2: 7299err2:
7202 kfree(info); 7300 nft_chain_filter_fini();
7203err1: 7301err1:
7302 unregister_pernet_subsys(&nf_tables_net_ops);
7204 return err; 7303 return err;
7205} 7304}
7206 7305
@@ -7212,7 +7311,6 @@ static void __exit nf_tables_module_exit(void)
7212 unregister_pernet_subsys(&nf_tables_net_ops); 7311 unregister_pernet_subsys(&nf_tables_net_ops);
7213 rcu_barrier(); 7312 rcu_barrier();
7214 nf_tables_core_module_exit(); 7313 nf_tables_core_module_exit();
7215 kfree(info);
7216} 7314}
7217 7315
7218module_init(nf_tables_module_init); 7316module_init(nf_tables_module_init);
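The nf_tables_api.c changes above replace the nfnl subsystem mutex with a per-netns nft.commit_mutex and route all module autoloading through nft_request_module(): pending transactions are aborted, the mutex is dropped around the blocking request_module() call and re-taken, and the caller returns -EAGAIN so the batch is replayed. A minimal pthread-based sketch of that unlock-around-a-blocking-call pattern, with placeholder functions rather than kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t commit_mutex = PTHREAD_MUTEX_INITIALIZER;

static void abort_pending_transactions(void)
{
	/* stands in for __nf_tables_abort(net): nothing staged may survive,
	 * since another client can grab the mutex while it is dropped */
}

static void blocking_module_request(const char *name)
{
	/* stands in for request_module("%s", name), which may sleep */
	printf("requesting %s\n", name);
}

/* Called with commit_mutex held; returns with it held again. */
static void request_module_locked(const char *name)
{
	abort_pending_transactions();
	pthread_mutex_unlock(&commit_mutex);
	blocking_module_request(name);
	pthread_mutex_lock(&commit_mutex);
}

int main(void)
{
	pthread_mutex_lock(&commit_mutex);
	request_module_locked("nft-chain-2-filter");
	pthread_mutex_unlock(&commit_mutex);
	return 0;
}

The same batch of hunks also moves dump-context allocation into netlink .start callbacks and makes nf_tables_valid_genid() take the mutex that the commit or abort path later releases.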
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 8de912ca53d3..ffd5c0f9412b 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -120,6 +120,20 @@ struct nft_jumpstack {
120 struct nft_rule *const *rules; 120 struct nft_rule *const *rules;
121}; 121};
122 122
123static void expr_call_ops_eval(const struct nft_expr *expr,
124 struct nft_regs *regs,
125 struct nft_pktinfo *pkt)
126{
127 unsigned long e = (unsigned long)expr->ops->eval;
128
129 if (e == (unsigned long)nft_meta_get_eval)
130 nft_meta_get_eval(expr, regs, pkt);
131 else if (e == (unsigned long)nft_lookup_eval)
132 nft_lookup_eval(expr, regs, pkt);
133 else
134 expr->ops->eval(expr, regs, pkt);
135}
136
123unsigned int 137unsigned int
124nft_do_chain(struct nft_pktinfo *pkt, void *priv) 138nft_do_chain(struct nft_pktinfo *pkt, void *priv)
125{ 139{
@@ -153,7 +167,7 @@ next_rule:
153 nft_cmp_fast_eval(expr, &regs); 167 nft_cmp_fast_eval(expr, &regs);
154 else if (expr->ops != &nft_payload_fast_ops || 168 else if (expr->ops != &nft_payload_fast_ops ||
155 !nft_payload_fast_eval(expr, &regs, pkt)) 169 !nft_payload_fast_eval(expr, &regs, pkt))
156 expr->ops->eval(expr, &regs, pkt); 170 expr_call_ops_eval(expr, &regs, pkt);
157 171
158 if (regs.verdict.code != NFT_CONTINUE) 172 if (regs.verdict.code != NFT_CONTINUE)
159 break; 173 break;
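
The new expr_call_ops_eval() wrapper in nf_tables_core.c compares the expression's eval pointer against the two hottest handlers (meta get and lookup) and calls them directly, presumably to avoid indirect-call overhead when retpolines are enabled; uncommon expressions still dispatch through the function pointer. A stripped-down sketch of the same devirtualization idea, with illustrative handler names:

/* Hedged sketch; meta_like_handler/lookup_like_handler are illustrative. */
typedef void (*handler_fn)(void *ctx);

static void meta_like_handler(void *ctx);	/* assumed hot case #1 */
static void lookup_like_handler(void *ctx);	/* assumed hot case #2 */

static void call_handler(handler_fn fn, void *ctx)
{
	if (fn == meta_like_handler)
		meta_like_handler(ctx);		/* direct call, no retpoline thunk */
	else if (fn == lookup_like_handler)
		lookup_like_handler(ctx);
	else
		fn(ctx);			/* everything else stays indirect */
}
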
diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c
new file mode 100644
index 000000000000..814789644bd3
--- /dev/null
+++ b/net/netfilter/nf_tables_set_core.c
@@ -0,0 +1,28 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#include <net/netfilter/nf_tables_core.h>
3
4static int __init nf_tables_set_module_init(void)
5{
6 nft_register_set(&nft_set_hash_fast_type);
7 nft_register_set(&nft_set_hash_type);
8 nft_register_set(&nft_set_rhash_type);
9 nft_register_set(&nft_set_bitmap_type);
10 nft_register_set(&nft_set_rbtree_type);
11
12 return 0;
13}
14
15static void __exit nf_tables_set_module_exit(void)
16{
17 nft_unregister_set(&nft_set_rbtree_type);
18 nft_unregister_set(&nft_set_bitmap_type);
19 nft_unregister_set(&nft_set_rhash_type);
20 nft_unregister_set(&nft_set_hash_type);
21 nft_unregister_set(&nft_set_hash_fast_type);
22}
23
24module_init(nf_tables_set_module_init);
25module_exit(nf_tables_set_module_exit);
26
27MODULE_LICENSE("GPL");
28MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index e1b6be29848d..916913454624 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -331,18 +331,27 @@ replay:
331 } 331 }
332 } 332 }
333 333
334 if (!ss->commit || !ss->abort) { 334 if (!ss->valid_genid || !ss->commit || !ss->abort) {
335 nfnl_unlock(subsys_id); 335 nfnl_unlock(subsys_id);
336 netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); 336 netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
337 return kfree_skb(skb); 337 return kfree_skb(skb);
338 } 338 }
339 339
340 if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) { 340 if (!try_module_get(ss->owner)) {
341 nfnl_unlock(subsys_id);
342 netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
343 return kfree_skb(skb);
344 }
345
346 if (!ss->valid_genid(net, genid)) {
347 module_put(ss->owner);
341 nfnl_unlock(subsys_id); 348 nfnl_unlock(subsys_id);
342 netlink_ack(oskb, nlh, -ERESTART, NULL); 349 netlink_ack(oskb, nlh, -ERESTART, NULL);
343 return kfree_skb(skb); 350 return kfree_skb(skb);
344 } 351 }
345 352
353 nfnl_unlock(subsys_id);
354
346 while (skb->len >= nlmsg_total_size(0)) { 355 while (skb->len >= nlmsg_total_size(0)) {
347 int msglen, type; 356 int msglen, type;
348 357
@@ -464,14 +473,10 @@ ack:
464 } 473 }
465done: 474done:
466 if (status & NFNL_BATCH_REPLAY) { 475 if (status & NFNL_BATCH_REPLAY) {
467 const struct nfnetlink_subsystem *ss2; 476 ss->abort(net, oskb);
468
469 ss2 = nfnl_dereference_protected(subsys_id);
470 if (ss2 == ss)
471 ss->abort(net, oskb);
472 nfnl_err_reset(&err_list); 477 nfnl_err_reset(&err_list);
473 nfnl_unlock(subsys_id);
474 kfree_skb(skb); 478 kfree_skb(skb);
479 module_put(ss->owner);
475 goto replay; 480 goto replay;
476 } else if (status == NFNL_BATCH_DONE) { 481 } else if (status == NFNL_BATCH_DONE) {
477 err = ss->commit(net, oskb); 482 err = ss->commit(net, oskb);
@@ -489,8 +494,8 @@ done:
489 ss->cleanup(net); 494 ss->cleanup(net);
490 495
491 nfnl_err_deliver(&err_list, oskb); 496 nfnl_err_deliver(&err_list, oskb);
492 nfnl_unlock(subsys_id);
493 kfree_skb(skb); 497 kfree_skb(skb);
498 module_put(ss->owner);
494} 499}
495 500
496static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = { 501static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
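
The nfnetlink.c batch path now pins the subsystem module with try_module_get() so the nfnl mutex can be dropped before the (possibly long, possibly replayed) batch is processed, and every exit path ends in module_put(). A hedged sketch of that ownership pattern, with struct subsys, lookup_subsystem() and process_batch() as illustrative stand-ins:

static DEFINE_MUTEX(subsys_lock);

struct subsys {
	struct module *owner;
	/* callbacks would live here */
};

static struct subsys *lookup_subsystem(int id);	/* illustrative */
static void process_batch(struct subsys *ss);	/* illustrative */

static int run_batch(int id)
{
	struct subsys *ss;

	mutex_lock(&subsys_lock);
	ss = lookup_subsystem(id);
	if (!ss || !try_module_get(ss->owner)) {
		mutex_unlock(&subsys_lock);
		return -EOPNOTSUPP;
	}
	mutex_unlock(&subsys_lock);	/* safe: the module cannot unload now */

	process_batch(ss);		/* may sleep or replay; no lock held */

	module_put(ss->owner);		/* every exit path must drop the ref */
	return 0;
}
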
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index a0e5adf0b3b6..8fa8bf7c48e6 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -238,29 +238,33 @@ static const struct nla_policy filter_policy[NFACCT_FILTER_MAX + 1] = {
238 [NFACCT_FILTER_VALUE] = { .type = NLA_U32 }, 238 [NFACCT_FILTER_VALUE] = { .type = NLA_U32 },
239}; 239};
240 240
241static struct nfacct_filter * 241static int nfnl_acct_start(struct netlink_callback *cb)
242nfacct_filter_alloc(const struct nlattr * const attr)
243{ 242{
244 struct nfacct_filter *filter; 243 const struct nlattr *const attr = cb->data;
245 struct nlattr *tb[NFACCT_FILTER_MAX + 1]; 244 struct nlattr *tb[NFACCT_FILTER_MAX + 1];
245 struct nfacct_filter *filter;
246 int err; 246 int err;
247 247
248 if (!attr)
249 return 0;
250
248 err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy, 251 err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy,
249 NULL); 252 NULL);
250 if (err < 0) 253 if (err < 0)
251 return ERR_PTR(err); 254 return err;
252 255
253 if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE]) 256 if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE])
254 return ERR_PTR(-EINVAL); 257 return -EINVAL;
255 258
256 filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL); 259 filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL);
257 if (!filter) 260 if (!filter)
258 return ERR_PTR(-ENOMEM); 261 return -ENOMEM;
259 262
260 filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK])); 263 filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK]));
261 filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE])); 264 filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE]));
265 cb->data = filter;
262 266
263 return filter; 267 return 0;
264} 268}
265 269
266static int nfnl_acct_get(struct net *net, struct sock *nfnl, 270static int nfnl_acct_get(struct net *net, struct sock *nfnl,
@@ -275,18 +279,11 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
275 if (nlh->nlmsg_flags & NLM_F_DUMP) { 279 if (nlh->nlmsg_flags & NLM_F_DUMP) {
276 struct netlink_dump_control c = { 280 struct netlink_dump_control c = {
277 .dump = nfnl_acct_dump, 281 .dump = nfnl_acct_dump,
282 .start = nfnl_acct_start,
278 .done = nfnl_acct_done, 283 .done = nfnl_acct_done,
284 .data = (void *)tb[NFACCT_FILTER],
279 }; 285 };
280 286
281 if (tb[NFACCT_FILTER]) {
282 struct nfacct_filter *filter;
283
284 filter = nfacct_filter_alloc(tb[NFACCT_FILTER]);
285 if (IS_ERR(filter))
286 return PTR_ERR(filter);
287
288 c.data = filter;
289 }
290 return netlink_dump_start(nfnl, skb, nlh, &c); 287 return netlink_dump_start(nfnl, skb, nlh, &c);
291 } 288 }
292 289
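
The nfnetlink_acct.c change moves filter allocation out of the request handler and into a netlink_dump_control .start callback: .data carries the raw attribute into the dump, .start parses it exactly once and swaps cb->data for the allocated filter, and .done (nfnl_acct_done here) frees it. A small sketch of the .start half, with my_filter and the parsing step left illustrative:

/* Illustrative sketch of a netlink dump .start callback. */
struct my_filter {
	u32 mask;
	u32 value;
};

static int my_start(struct netlink_callback *cb)
{
	const struct nlattr *attr = cb->data;	/* raw attr handed in via .data */
	struct my_filter *filter;

	if (!attr)
		return 0;			/* no filter: dump everything */

	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
	if (!filter)
		return -ENOMEM;

	/* parse the attribute into *filter here (omitted in this sketch) */

	cb->data = filter;	/* .dump reads it; .done is responsible for kfree() */
	return 0;
}
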
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 9ee5fa551fa6..d46a236cdf31 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -26,7 +26,6 @@
26#include <net/sock.h> 26#include <net/sock.h>
27#include <net/netfilter/nf_conntrack.h> 27#include <net/netfilter/nf_conntrack.h>
28#include <net/netfilter/nf_conntrack_core.h> 28#include <net/netfilter/nf_conntrack_core.h>
29#include <net/netfilter/nf_conntrack_l3proto.h>
30#include <net/netfilter/nf_conntrack_l4proto.h> 29#include <net/netfilter/nf_conntrack_l4proto.h>
31#include <net/netfilter/nf_conntrack_tuple.h> 30#include <net/netfilter/nf_conntrack_tuple.h>
32#include <net/netfilter/nf_conntrack_timeout.h> 31#include <net/netfilter/nf_conntrack_timeout.h>
@@ -47,7 +46,7 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
47}; 46};
48 47
49static int 48static int
50ctnl_timeout_parse_policy(void *timeouts, 49ctnl_timeout_parse_policy(void *timeout,
51 const struct nf_conntrack_l4proto *l4proto, 50 const struct nf_conntrack_l4proto *l4proto,
52 struct net *net, const struct nlattr *attr) 51 struct net *net, const struct nlattr *attr)
53{ 52{
@@ -68,7 +67,7 @@ ctnl_timeout_parse_policy(void *timeouts,
68 if (ret < 0) 67 if (ret < 0)
69 goto err; 68 goto err;
70 69
71 ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); 70 ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeout);
72 71
73err: 72err:
74 kfree(tb); 73 kfree(tb);
@@ -114,13 +113,13 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
114 /* You cannot replace one timeout policy by another of 113 /* You cannot replace one timeout policy by another of
115 * different kind, sorry. 114 * different kind, sorry.
116 */ 115 */
117 if (matching->l3num != l3num || 116 if (matching->timeout.l3num != l3num ||
118 matching->l4proto->l4proto != l4num) 117 matching->timeout.l4proto->l4proto != l4num)
119 return -EINVAL; 118 return -EINVAL;
120 119
121 return ctnl_timeout_parse_policy(&matching->data, 120 return ctnl_timeout_parse_policy(&matching->timeout.data,
122 matching->l4proto, net, 121 matching->timeout.l4proto,
123 cda[CTA_TIMEOUT_DATA]); 122 net, cda[CTA_TIMEOUT_DATA]);
124 } 123 }
125 124
126 return -EBUSY; 125 return -EBUSY;
@@ -141,14 +140,14 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
141 goto err_proto_put; 140 goto err_proto_put;
142 } 141 }
143 142
144 ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net, 143 ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, net,
145 cda[CTA_TIMEOUT_DATA]); 144 cda[CTA_TIMEOUT_DATA]);
146 if (ret < 0) 145 if (ret < 0)
147 goto err; 146 goto err;
148 147
149 strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); 148 strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME]));
150 timeout->l3num = l3num; 149 timeout->timeout.l3num = l3num;
151 timeout->l4proto = l4proto; 150 timeout->timeout.l4proto = l4proto;
152 refcount_set(&timeout->refcnt, 1); 151 refcount_set(&timeout->refcnt, 1);
153 list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list); 152 list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list);
154 153
@@ -167,7 +166,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
167 struct nlmsghdr *nlh; 166 struct nlmsghdr *nlh;
168 struct nfgenmsg *nfmsg; 167 struct nfgenmsg *nfmsg;
169 unsigned int flags = portid ? NLM_F_MULTI : 0; 168 unsigned int flags = portid ? NLM_F_MULTI : 0;
170 const struct nf_conntrack_l4proto *l4proto = timeout->l4proto; 169 const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto;
171 170
172 event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); 171 event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
173 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); 172 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
@@ -180,8 +179,9 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
180 nfmsg->res_id = 0; 179 nfmsg->res_id = 0;
181 180
182 if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || 181 if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) ||
183 nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)) || 182 nla_put_be16(skb, CTA_TIMEOUT_L3PROTO,
184 nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || 183 htons(timeout->timeout.l3num)) ||
184 nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto) ||
185 nla_put_be32(skb, CTA_TIMEOUT_USE, 185 nla_put_be32(skb, CTA_TIMEOUT_USE,
186 htonl(refcount_read(&timeout->refcnt)))) 186 htonl(refcount_read(&timeout->refcnt))))
187 goto nla_put_failure; 187 goto nla_put_failure;
@@ -195,7 +195,8 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
195 if (!nest_parms) 195 if (!nest_parms)
196 goto nla_put_failure; 196 goto nla_put_failure;
197 197
198 ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); 198 ret = l4proto->ctnl_timeout.obj_to_nlattr(skb,
199 &timeout->timeout.data);
199 if (ret < 0) 200 if (ret < 0)
200 goto nla_put_failure; 201 goto nla_put_failure;
201 202
@@ -298,22 +299,6 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl,
298 return ret; 299 return ret;
299} 300}
300 301
301static int untimeout(struct nf_conn *ct, void *timeout)
302{
303 struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
304
305 if (timeout_ext && (!timeout || timeout_ext->timeout == timeout))
306 RCU_INIT_POINTER(timeout_ext->timeout, NULL);
307
308 /* We are not intended to delete this conntrack. */
309 return 0;
310}
311
312static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
313{
314 nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0);
315}
316
317/* try to delete object, fail if it is still in use. */ 302/* try to delete object, fail if it is still in use. */
318static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) 303static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
319{ 304{
@@ -325,8 +310,8 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
325 if (refcount_dec_if_one(&timeout->refcnt)) { 310 if (refcount_dec_if_one(&timeout->refcnt)) {
326 /* We are protected by nfnl mutex. */ 311 /* We are protected by nfnl mutex. */
327 list_del_rcu(&timeout->head); 312 list_del_rcu(&timeout->head);
328 nf_ct_l4proto_put(timeout->l4proto); 313 nf_ct_l4proto_put(timeout->timeout.l4proto);
329 ctnl_untimeout(net, timeout); 314 nf_ct_untimeout(net, &timeout->timeout);
330 kfree_rcu(timeout, rcu_head); 315 kfree_rcu(timeout, rcu_head);
331 } else { 316 } else {
332 ret = -EBUSY; 317 ret = -EBUSY;
@@ -373,7 +358,6 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
373 struct netlink_ext_ack *extack) 358 struct netlink_ext_ack *extack)
374{ 359{
375 const struct nf_conntrack_l4proto *l4proto; 360 const struct nf_conntrack_l4proto *l4proto;
376 unsigned int *timeouts;
377 __u16 l3num; 361 __u16 l3num;
378 __u8 l4num; 362 __u8 l4num;
379 int ret; 363 int ret;
@@ -393,9 +377,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
393 goto err; 377 goto err;
394 } 378 }
395 379
396 timeouts = l4proto->get_timeouts(net); 380 ret = ctnl_timeout_parse_policy(NULL, l4proto, net,
397
398 ret = ctnl_timeout_parse_policy(timeouts, l4proto, net,
399 cda[CTA_TIMEOUT_DATA]); 381 cda[CTA_TIMEOUT_DATA]);
400 if (ret < 0) 382 if (ret < 0)
401 goto err; 383 goto err;
@@ -432,7 +414,6 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
432 414
433 if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { 415 if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
434 struct nlattr *nest_parms; 416 struct nlattr *nest_parms;
435 unsigned int *timeouts = l4proto->get_timeouts(net);
436 int ret; 417 int ret;
437 418
438 nest_parms = nla_nest_start(skb, 419 nest_parms = nla_nest_start(skb,
@@ -440,7 +421,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
440 if (!nest_parms) 421 if (!nest_parms)
441 goto nla_put_failure; 422 goto nla_put_failure;
442 423
443 ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); 424 ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL);
444 if (ret < 0) 425 if (ret < 0)
445 goto nla_put_failure; 426 goto nla_put_failure;
446 427
@@ -508,7 +489,6 @@ err:
508 return err; 489 return err;
509} 490}
510 491
511#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
512static struct ctnl_timeout * 492static struct ctnl_timeout *
513ctnl_timeout_find_get(struct net *net, const char *name) 493ctnl_timeout_find_get(struct net *net, const char *name)
514{ 494{
@@ -532,14 +512,16 @@ err:
532 return matching; 512 return matching;
533} 513}
534 514
535static void ctnl_timeout_put(struct ctnl_timeout *timeout) 515static void ctnl_timeout_put(struct nf_ct_timeout *t)
536{ 516{
517 struct ctnl_timeout *timeout =
518 container_of(t, struct ctnl_timeout, timeout);
519
537 if (refcount_dec_and_test(&timeout->refcnt)) 520 if (refcount_dec_and_test(&timeout->refcnt))
538 kfree_rcu(timeout, rcu_head); 521 kfree_rcu(timeout, rcu_head);
539 522
540 module_put(THIS_MODULE); 523 module_put(THIS_MODULE);
541} 524}
542#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
543 525
544static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { 526static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {
545 [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, 527 [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout,
@@ -580,11 +562,11 @@ static void __net_exit cttimeout_net_exit(struct net *net)
580 struct ctnl_timeout *cur, *tmp; 562 struct ctnl_timeout *cur, *tmp;
581 563
582 nf_ct_unconfirmed_destroy(net); 564 nf_ct_unconfirmed_destroy(net);
583 ctnl_untimeout(net, NULL); 565 nf_ct_untimeout(net, NULL);
584 566
585 list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { 567 list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) {
586 list_del_rcu(&cur->head); 568 list_del_rcu(&cur->head);
587 nf_ct_l4proto_put(cur->l4proto); 569 nf_ct_l4proto_put(cur->timeout.l4proto);
588 570
589 if (refcount_dec_and_test(&cur->refcnt)) 571 if (refcount_dec_and_test(&cur->refcnt))
590 kfree_rcu(cur, rcu_head); 572 kfree_rcu(cur, rcu_head);
@@ -610,10 +592,8 @@ static int __init cttimeout_init(void)
610 "nfnetlink.\n"); 592 "nfnetlink.\n");
611 goto err_out; 593 goto err_out;
612 } 594 }
613#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
614 RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get); 595 RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
615 RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put); 596 RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
616#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
617 return 0; 597 return 0;
618 598
619err_out: 599err_out:
@@ -626,11 +606,9 @@ static void __exit cttimeout_exit(void)
626 nfnetlink_subsys_unregister(&cttimeout_subsys); 606 nfnetlink_subsys_unregister(&cttimeout_subsys);
627 607
628 unregister_pernet_subsys(&cttimeout_ops); 608 unregister_pernet_subsys(&cttimeout_ops);
629#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
630 RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); 609 RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
631 RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); 610 RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
632 synchronize_rcu(); 611 synchronize_rcu();
633#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
634} 612}
635 613
636module_init(cttimeout_init); 614module_init(cttimeout_init);
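
Throughout the cttimeout changes, ctnl_timeout now embeds the generic struct nf_ct_timeout as a member named timeout, so code that only sees the generic object (for example ctnl_timeout_put()) recovers the wrapper with container_of(). A minimal sketch of that layout, with illustrative type names rather than the real structures:

struct generic_timeout {
	u16 l3num;
	/* protocol timeout data follows in the real structure */
};

struct ctnl_wrapper {
	struct list_head head;
	refcount_t refcnt;
	struct generic_timeout timeout;	/* shared part handed to generic code */
};

static void wrapper_put(struct generic_timeout *t)
{
	struct ctnl_wrapper *w = container_of(t, struct ctnl_wrapper, timeout);

	if (refcount_dec_and_test(&w->refcnt))
		kfree(w);
}
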
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
new file mode 100644
index 000000000000..00db27dfd2ff
--- /dev/null
+++ b/net/netfilter/nfnetlink_osf.c
@@ -0,0 +1,436 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2#include <linux/module.h>
3#include <linux/kernel.h>
4
5#include <linux/capability.h>
6#include <linux/if.h>
7#include <linux/inetdevice.h>
8#include <linux/ip.h>
9#include <linux/list.h>
10#include <linux/rculist.h>
11#include <linux/skbuff.h>
12#include <linux/slab.h>
13#include <linux/tcp.h>
14
15#include <net/ip.h>
16#include <net/tcp.h>
17
18#include <linux/netfilter/nfnetlink.h>
19#include <linux/netfilter/x_tables.h>
20#include <net/netfilter/nf_log.h>
21#include <linux/netfilter/nfnetlink_osf.h>
22
23/*
24 * Indexed by dont-fragment bit.
25 * It is the only constant value in the fingerprint.
26 */
27struct list_head nf_osf_fingers[2];
28EXPORT_SYMBOL_GPL(nf_osf_fingers);
29
30static inline int nf_osf_ttl(const struct sk_buff *skb,
31 int ttl_check, unsigned char f_ttl)
32{
33 const struct iphdr *ip = ip_hdr(skb);
34
35 if (ttl_check != -1) {
36 if (ttl_check == NF_OSF_TTL_TRUE)
37 return ip->ttl == f_ttl;
38 if (ttl_check == NF_OSF_TTL_NOCHECK)
39 return 1;
40 else if (ip->ttl <= f_ttl)
41 return 1;
42 else {
43 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
44 int ret = 0;
45
46 for_ifa(in_dev) {
47 if (inet_ifa_match(ip->saddr, ifa)) {
48 ret = (ip->ttl == f_ttl);
49 break;
50 }
51 }
52 endfor_ifa(in_dev);
53
54 return ret;
55 }
56 }
57
58 return ip->ttl == f_ttl;
59}
60
61struct nf_osf_hdr_ctx {
62 bool df;
63 u16 window;
64 u16 totlen;
65 const unsigned char *optp;
66 unsigned int optsize;
67};
68
69static bool nf_osf_match_one(const struct sk_buff *skb,
70 const struct nf_osf_user_finger *f,
71 int ttl_check,
72 struct nf_osf_hdr_ctx *ctx)
73{
74 unsigned int check_WSS = 0;
75 int fmatch = FMATCH_WRONG;
76 int foptsize, optnum;
77 u16 mss = 0;
78
79 if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl))
80 return false;
81
82 /*
83 * Should not happen if userspace parser was written correctly.
84 */
85 if (f->wss.wc >= OSF_WSS_MAX)
86 return false;
87
88 /* Check options */
89
90 foptsize = 0;
91 for (optnum = 0; optnum < f->opt_num; ++optnum)
92 foptsize += f->opt[optnum].length;
93
94 if (foptsize > MAX_IPOPTLEN ||
95 ctx->optsize > MAX_IPOPTLEN ||
96 ctx->optsize != foptsize)
97 return false;
98
99 check_WSS = f->wss.wc;
100
101 for (optnum = 0; optnum < f->opt_num; ++optnum) {
102 if (f->opt[optnum].kind == *ctx->optp) {
103 __u32 len = f->opt[optnum].length;
104 const __u8 *optend = ctx->optp + len;
105
106 fmatch = FMATCH_OK;
107
108 switch (*ctx->optp) {
109 case OSFOPT_MSS:
110 mss = ctx->optp[3];
111 mss <<= 8;
112 mss |= ctx->optp[2];
113
114 mss = ntohs((__force __be16)mss);
115 break;
116 case OSFOPT_TS:
117 break;
118 }
119
120 ctx->optp = optend;
121 } else
122 fmatch = FMATCH_OPT_WRONG;
123
124 if (fmatch != FMATCH_OK)
125 break;
126 }
127
128 if (fmatch != FMATCH_OPT_WRONG) {
129 fmatch = FMATCH_WRONG;
130
131 switch (check_WSS) {
132 case OSF_WSS_PLAIN:
133 if (f->wss.val == 0 || ctx->window == f->wss.val)
134 fmatch = FMATCH_OK;
135 break;
136 case OSF_WSS_MSS:
137 /*
 138 * Some smart modems decrease/mangle MSS to
139 * SMART_MSS_2, so we check standard, decreased
140 * and the one provided in the fingerprint MSS
141 * values.
142 */
143#define SMART_MSS_1 1460
144#define SMART_MSS_2 1448
145 if (ctx->window == f->wss.val * mss ||
146 ctx->window == f->wss.val * SMART_MSS_1 ||
147 ctx->window == f->wss.val * SMART_MSS_2)
148 fmatch = FMATCH_OK;
149 break;
150 case OSF_WSS_MTU:
151 if (ctx->window == f->wss.val * (mss + 40) ||
152 ctx->window == f->wss.val * (SMART_MSS_1 + 40) ||
153 ctx->window == f->wss.val * (SMART_MSS_2 + 40))
154 fmatch = FMATCH_OK;
155 break;
156 case OSF_WSS_MODULO:
157 if ((ctx->window % f->wss.val) == 0)
158 fmatch = FMATCH_OK;
159 break;
160 }
161 }
162
163 return fmatch == FMATCH_OK;
164}
165
166static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,
167 const struct sk_buff *skb,
168 const struct iphdr *ip,
169 unsigned char *opts)
170{
171 const struct tcphdr *tcp;
172 struct tcphdr _tcph;
173
174 tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
175 if (!tcp)
176 return NULL;
177
178 if (!tcp->syn)
179 return NULL;
180
181 ctx->totlen = ntohs(ip->tot_len);
182 ctx->df = ntohs(ip->frag_off) & IP_DF;
183 ctx->window = ntohs(tcp->window);
184
185 if (tcp->doff * 4 > sizeof(struct tcphdr)) {
186 ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr);
187
188 ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) +
189 sizeof(struct tcphdr), ctx->optsize, opts);
190 }
191
192 return tcp;
193}
194
195bool
196nf_osf_match(const struct sk_buff *skb, u_int8_t family,
197 int hooknum, struct net_device *in, struct net_device *out,
198 const struct nf_osf_info *info, struct net *net,
199 const struct list_head *nf_osf_fingers)
200{
201 const struct iphdr *ip = ip_hdr(skb);
202 const struct nf_osf_user_finger *f;
203 unsigned char opts[MAX_IPOPTLEN];
204 const struct nf_osf_finger *kf;
205 int fcount = 0, ttl_check;
206 int fmatch = FMATCH_WRONG;
207 struct nf_osf_hdr_ctx ctx;
208 const struct tcphdr *tcp;
209
210 memset(&ctx, 0, sizeof(ctx));
211
212 tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
213 if (!tcp)
214 return false;
215
216 ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1;
217
218 list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
219
220 f = &kf->finger;
221
222 if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
223 continue;
224
225 if (!nf_osf_match_one(skb, f, ttl_check, &ctx))
226 continue;
227
228 fmatch = FMATCH_OK;
229
230 fcount++;
231
232 if (info->flags & NF_OSF_LOG)
233 nf_log_packet(net, family, hooknum, skb,
234 in, out, NULL,
235 "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
236 f->genre, f->version, f->subtype,
237 &ip->saddr, ntohs(tcp->source),
238 &ip->daddr, ntohs(tcp->dest),
239 f->ttl - ip->ttl);
240
241 if ((info->flags & NF_OSF_LOG) &&
242 info->loglevel == NF_OSF_LOGLEVEL_FIRST)
243 break;
244 }
245
246 if (!fcount && (info->flags & NF_OSF_LOG))
247 nf_log_packet(net, family, hooknum, skb, in, out, NULL,
248 "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
249 &ip->saddr, ntohs(tcp->source),
250 &ip->daddr, ntohs(tcp->dest));
251
252 if (fcount)
253 fmatch = FMATCH_OK;
254
255 return fmatch == FMATCH_OK;
256}
257EXPORT_SYMBOL_GPL(nf_osf_match);
258
259const char *nf_osf_find(const struct sk_buff *skb,
260 const struct list_head *nf_osf_fingers)
261{
262 const struct iphdr *ip = ip_hdr(skb);
263 const struct nf_osf_user_finger *f;
264 unsigned char opts[MAX_IPOPTLEN];
265 const struct nf_osf_finger *kf;
266 struct nf_osf_hdr_ctx ctx;
267 const struct tcphdr *tcp;
268 const char *genre = NULL;
269
270 memset(&ctx, 0, sizeof(ctx));
271
272 tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
273 if (!tcp)
274 return NULL;
275
276 list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
277 f = &kf->finger;
278 if (!nf_osf_match_one(skb, f, -1, &ctx))
279 continue;
280
281 genre = f->genre;
282 break;
283 }
284
285 return genre;
286}
287EXPORT_SYMBOL_GPL(nf_osf_find);
288
289static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = {
290 [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) },
291};
292
293static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
294 struct sk_buff *skb, const struct nlmsghdr *nlh,
295 const struct nlattr * const osf_attrs[],
296 struct netlink_ext_ack *extack)
297{
298 struct nf_osf_user_finger *f;
299 struct nf_osf_finger *kf = NULL, *sf;
300 int err = 0;
301
302 if (!capable(CAP_NET_ADMIN))
303 return -EPERM;
304
305 if (!osf_attrs[OSF_ATTR_FINGER])
306 return -EINVAL;
307
308 if (!(nlh->nlmsg_flags & NLM_F_CREATE))
309 return -EINVAL;
310
311 f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
312
313 kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL);
314 if (!kf)
315 return -ENOMEM;
316
317 memcpy(&kf->finger, f, sizeof(struct nf_osf_user_finger));
318
319 list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) {
320 if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger)))
321 continue;
322
323 kfree(kf);
324 kf = NULL;
325
326 if (nlh->nlmsg_flags & NLM_F_EXCL)
327 err = -EEXIST;
328 break;
329 }
330
331 /*
332 * We are protected by nfnl mutex.
333 */
334 if (kf)
335 list_add_tail_rcu(&kf->finger_entry, &nf_osf_fingers[!!f->df]);
336
337 return err;
338}
339
340static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl,
341 struct sk_buff *skb,
342 const struct nlmsghdr *nlh,
343 const struct nlattr * const osf_attrs[],
344 struct netlink_ext_ack *extack)
345{
346 struct nf_osf_user_finger *f;
347 struct nf_osf_finger *sf;
348 int err = -ENOENT;
349
350 if (!capable(CAP_NET_ADMIN))
351 return -EPERM;
352
353 if (!osf_attrs[OSF_ATTR_FINGER])
354 return -EINVAL;
355
356 f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
357
358 list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) {
359 if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger)))
360 continue;
361
362 /*
363 * We are protected by nfnl mutex.
364 */
365 list_del_rcu(&sf->finger_entry);
366 kfree_rcu(sf, rcu_head);
367
368 err = 0;
369 break;
370 }
371
372 return err;
373}
374
375static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = {
376 [OSF_MSG_ADD] = {
377 .call = nfnl_osf_add_callback,
378 .attr_count = OSF_ATTR_MAX,
379 .policy = nfnl_osf_policy,
380 },
381 [OSF_MSG_REMOVE] = {
382 .call = nfnl_osf_remove_callback,
383 .attr_count = OSF_ATTR_MAX,
384 .policy = nfnl_osf_policy,
385 },
386};
387
388static const struct nfnetlink_subsystem nfnl_osf_subsys = {
389 .name = "osf",
390 .subsys_id = NFNL_SUBSYS_OSF,
391 .cb_count = OSF_MSG_MAX,
392 .cb = nfnl_osf_callbacks,
393};
394
395static int __init nfnl_osf_init(void)
396{
397 int err = -EINVAL;
398 int i;
399
400 for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i)
401 INIT_LIST_HEAD(&nf_osf_fingers[i]);
402
403 err = nfnetlink_subsys_register(&nfnl_osf_subsys);
404 if (err < 0) {
 405 pr_err("Failed to register OSF nfnetlink helper (%d)\n", err);
406 goto err_out_exit;
407 }
408 return 0;
409
410err_out_exit:
411 return err;
412}
413
414static void __exit nfnl_osf_fini(void)
415{
416 struct nf_osf_finger *f;
417 int i;
418
419 nfnetlink_subsys_unregister(&nfnl_osf_subsys);
420
421 rcu_read_lock();
422 for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) {
423 list_for_each_entry_rcu(f, &nf_osf_fingers[i], finger_entry) {
424 list_del_rcu(&f->finger_entry);
425 kfree_rcu(f, rcu_head);
426 }
427 }
428 rcu_read_unlock();
429
430 rcu_barrier();
431}
432
433module_init(nfnl_osf_init);
434module_exit(nfnl_osf_fini);
435
436MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 4ccd2988f9db..ea4ba551abb2 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1243,6 +1243,9 @@ static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl,
1243static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { 1243static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
1244 [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, 1244 [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) },
1245 [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, 1245 [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) },
1246 [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 },
1247 [NFQA_CFG_MASK] = { .type = NLA_U32 },
1248 [NFQA_CFG_FLAGS] = { .type = NLA_U32 },
1246}; 1249};
1247 1250
1248static const struct nf_queue_handler nfqh = { 1251static const struct nf_queue_handler nfqh = {
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index d21834bed805..3fd540b2c6ba 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -293,6 +293,13 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev,
293 if (strcmp(basechain->dev_name, dev->name) != 0) 293 if (strcmp(basechain->dev_name, dev->name) != 0)
294 return; 294 return;
295 295
 296 /* UNREGISTER events are also happening on netns exit.
297 *
 298 * Although nf_tables core releases all tables/chains, only
299 * this event handler provides guarantee that
300 * basechain.ops->dev is still accessible, so we cannot
301 * skip exiting net namespaces.
302 */
296 __nft_release_basechain(ctx); 303 __nft_release_basechain(ctx);
297 break; 304 break;
298 case NETDEV_CHANGENAME: 305 case NETDEV_CHANGENAME:
@@ -318,11 +325,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
318 event != NETDEV_CHANGENAME) 325 event != NETDEV_CHANGENAME)
319 return NOTIFY_DONE; 326 return NOTIFY_DONE;
320 327
321 ctx.net = maybe_get_net(ctx.net); 328 mutex_lock(&ctx.net->nft.commit_mutex);
322 if (!ctx.net)
323 return NOTIFY_DONE;
324
325 nfnl_lock(NFNL_SUBSYS_NFTABLES);
326 list_for_each_entry(table, &ctx.net->nft.tables, list) { 329 list_for_each_entry(table, &ctx.net->nft.tables, list) {
327 if (table->family != NFPROTO_NETDEV) 330 if (table->family != NFPROTO_NETDEV)
328 continue; 331 continue;
@@ -337,8 +340,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
337 nft_netdev_event(event, dev, &ctx); 340 nft_netdev_event(event, dev, &ctx);
338 } 341 }
339 } 342 }
340 nfnl_unlock(NFNL_SUBSYS_NFTABLES); 343 mutex_unlock(&ctx.net->nft.commit_mutex);
341 put_net(ctx.net);
342 344
343 return NOTIFY_DONE; 345 return NOTIFY_DONE;
344} 346}
@@ -392,7 +394,7 @@ int __init nft_chain_filter_init(void)
392 return 0; 394 return 0;
393} 395}
394 396
395void __exit nft_chain_filter_fini(void) 397void nft_chain_filter_fini(void)
396{ 398{
397 nft_chain_filter_bridge_fini(); 399 nft_chain_filter_bridge_fini();
398 nft_chain_filter_inet_fini(); 400 nft_chain_filter_inet_fini();
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 8d1ff654e5af..32535eea51b2 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -832,10 +832,18 @@ nft_target_select_ops(const struct nft_ctx *ctx,
832 rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV])); 832 rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV]));
833 family = ctx->family; 833 family = ctx->family;
834 834
835 if (strcmp(tg_name, XT_ERROR_TARGET) == 0 ||
836 strcmp(tg_name, XT_STANDARD_TARGET) == 0 ||
837 strcmp(tg_name, "standard") == 0)
838 return ERR_PTR(-EINVAL);
839
835 /* Re-use the existing target if it's already loaded. */ 840 /* Re-use the existing target if it's already loaded. */
836 list_for_each_entry(nft_target, &nft_target_list, head) { 841 list_for_each_entry(nft_target, &nft_target_list, head) {
837 struct xt_target *target = nft_target->ops.data; 842 struct xt_target *target = nft_target->ops.data;
838 843
844 if (!target->target)
845 continue;
846
839 if (nft_target_cmp(target, tg_name, rev, family)) 847 if (nft_target_cmp(target, tg_name, rev, family))
840 return &nft_target->ops; 848 return &nft_target->ops;
841 } 849 }
@@ -844,6 +852,11 @@ nft_target_select_ops(const struct nft_ctx *ctx,
844 if (IS_ERR(target)) 852 if (IS_ERR(target))
845 return ERR_PTR(-ENOENT); 853 return ERR_PTR(-ENOENT);
846 854
855 if (!target->target) {
856 err = -EINVAL;
857 goto err;
858 }
859
847 if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) { 860 if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) {
848 err = -EINVAL; 861 err = -EINVAL;
849 goto err; 862 goto err;
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index a832c59f0a9c..b90d96ba4a12 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -14,10 +14,9 @@
14#include <net/netfilter/nf_conntrack_zones.h> 14#include <net/netfilter/nf_conntrack_zones.h>
15 15
16struct nft_connlimit { 16struct nft_connlimit {
17 spinlock_t lock; 17 struct nf_conncount_list list;
18 struct hlist_head hhead; 18 u32 limit;
19 u32 limit; 19 bool invert;
20 bool invert;
21}; 20};
22 21
23static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, 22static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
@@ -45,21 +44,19 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
45 return; 44 return;
46 } 45 }
47 46
48 spin_lock_bh(&priv->lock); 47 nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone,
49 count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone, 48 &addit);
50 &addit); 49 count = priv->list.count;
51 50
52 if (!addit) 51 if (!addit)
53 goto out; 52 goto out;
54 53
55 if (!nf_conncount_add(&priv->hhead, tuple_ptr, zone)) { 54 if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) {
56 regs->verdict.code = NF_DROP; 55 regs->verdict.code = NF_DROP;
57 spin_unlock_bh(&priv->lock);
58 return; 56 return;
59 } 57 }
60 count++; 58 count++;
61out: 59out:
62 spin_unlock_bh(&priv->lock);
63 60
64 if ((count > priv->limit) ^ priv->invert) { 61 if ((count > priv->limit) ^ priv->invert) {
65 regs->verdict.code = NFT_BREAK; 62 regs->verdict.code = NFT_BREAK;
@@ -87,8 +84,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
87 invert = true; 84 invert = true;
88 } 85 }
89 86
90 spin_lock_init(&priv->lock); 87 nf_conncount_list_init(&priv->list);
91 INIT_HLIST_HEAD(&priv->hhead);
92 priv->limit = limit; 88 priv->limit = limit;
93 priv->invert = invert; 89 priv->invert = invert;
94 90
@@ -99,7 +95,7 @@ static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
99 struct nft_connlimit *priv) 95 struct nft_connlimit *priv)
100{ 96{
101 nf_ct_netns_put(ctx->net, ctx->family); 97 nf_ct_netns_put(ctx->net, ctx->family);
102 nf_conncount_cache_free(&priv->hhead); 98 nf_conncount_cache_free(&priv->list);
103} 99}
104 100
105static int nft_connlimit_do_dump(struct sk_buff *skb, 101static int nft_connlimit_do_dump(struct sk_buff *skb,
@@ -212,8 +208,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
212 struct nft_connlimit *priv_dst = nft_expr_priv(dst); 208 struct nft_connlimit *priv_dst = nft_expr_priv(dst);
213 struct nft_connlimit *priv_src = nft_expr_priv(src); 209 struct nft_connlimit *priv_src = nft_expr_priv(src);
214 210
215 spin_lock_init(&priv_dst->lock); 211 nf_conncount_list_init(&priv_dst->list);
216 INIT_HLIST_HEAD(&priv_dst->hhead);
217 priv_dst->limit = priv_src->limit; 212 priv_dst->limit = priv_src->limit;
218 priv_dst->invert = priv_src->invert; 213 priv_dst->invert = priv_src->invert;
219 214
@@ -225,21 +220,14 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
225{ 220{
226 struct nft_connlimit *priv = nft_expr_priv(expr); 221 struct nft_connlimit *priv = nft_expr_priv(expr);
227 222
228 nf_conncount_cache_free(&priv->hhead); 223 nf_conncount_cache_free(&priv->list);
229} 224}
230 225
231static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) 226static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
232{ 227{
233 struct nft_connlimit *priv = nft_expr_priv(expr); 228 struct nft_connlimit *priv = nft_expr_priv(expr);
234 bool addit, ret;
235 229
236 spin_lock_bh(&priv->lock); 230 return nf_conncount_gc_list(net, &priv->list);
237 nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit);
238
239 ret = hlist_empty(&priv->hhead);
240 spin_unlock_bh(&priv->lock);
241
242 return ret;
243} 231}
244 232
245static struct nft_expr_type nft_connlimit_type; 233static struct nft_expr_type nft_connlimit_type;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 1435ffc5f57e..26a8baebd072 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -22,6 +22,8 @@
22#include <net/netfilter/nf_conntrack_helper.h> 22#include <net/netfilter/nf_conntrack_helper.h>
23#include <net/netfilter/nf_conntrack_ecache.h> 23#include <net/netfilter/nf_conntrack_ecache.h>
24#include <net/netfilter/nf_conntrack_labels.h> 24#include <net/netfilter/nf_conntrack_labels.h>
25#include <net/netfilter/nf_conntrack_timeout.h>
26#include <net/netfilter/nf_conntrack_l4proto.h>
25 27
26struct nft_ct { 28struct nft_ct {
27 enum nft_ct_keys key:8; 29 enum nft_ct_keys key:8;
@@ -765,6 +767,195 @@ static struct nft_expr_type nft_notrack_type __read_mostly = {
765 .owner = THIS_MODULE, 767 .owner = THIS_MODULE,
766}; 768};
767 769
770#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
771static int
772nft_ct_timeout_parse_policy(void *timeouts,
773 const struct nf_conntrack_l4proto *l4proto,
774 struct net *net, const struct nlattr *attr)
775{
776 struct nlattr **tb;
777 int ret = 0;
778
779 if (!l4proto->ctnl_timeout.nlattr_to_obj)
780 return 0;
781
782 tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb),
783 GFP_KERNEL);
784
785 if (!tb)
786 return -ENOMEM;
787
788 ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
789 attr, l4proto->ctnl_timeout.nla_policy,
790 NULL);
791 if (ret < 0)
792 goto err;
793
794 ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
795
796err:
797 kfree(tb);
798 return ret;
799}
800
801struct nft_ct_timeout_obj {
802 struct nf_conn *tmpl;
803 u8 l4proto;
804};
805
806static void nft_ct_timeout_obj_eval(struct nft_object *obj,
807 struct nft_regs *regs,
808 const struct nft_pktinfo *pkt)
809{
810 const struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
811 struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb);
812 struct sk_buff *skb = pkt->skb;
813
814 if (ct ||
815 priv->l4proto != pkt->tprot)
816 return;
817
818 nf_ct_set(skb, priv->tmpl, IP_CT_NEW);
819}
820
821static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
822 const struct nlattr * const tb[],
823 struct nft_object *obj)
824{
825 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
826 struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
827 const struct nf_conntrack_l4proto *l4proto;
828 struct nf_conn_timeout *timeout_ext;
829 struct nf_ct_timeout *timeout;
830 int l3num = ctx->family;
831 struct nf_conn *tmpl;
832 __u8 l4num;
833 int ret;
834
835 if (!tb[NFTA_CT_TIMEOUT_L4PROTO] ||
836 !tb[NFTA_CT_TIMEOUT_DATA])
837 return -EINVAL;
838
839 if (tb[NFTA_CT_TIMEOUT_L3PROTO])
840 l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO]));
841
842 l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]);
843 priv->l4proto = l4num;
844
845 l4proto = nf_ct_l4proto_find_get(l3num, l4num);
846
847 if (l4proto->l4proto != l4num) {
848 ret = -EOPNOTSUPP;
849 goto err_proto_put;
850 }
851
852 timeout = kzalloc(sizeof(struct nf_ct_timeout) +
853 l4proto->ctnl_timeout.obj_size, GFP_KERNEL);
854 if (timeout == NULL) {
855 ret = -ENOMEM;
856 goto err_proto_put;
857 }
858
859 ret = nft_ct_timeout_parse_policy(&timeout->data, l4proto, ctx->net,
860 tb[NFTA_CT_TIMEOUT_DATA]);
861 if (ret < 0)
862 goto err_free_timeout;
863
864 timeout->l3num = l3num;
865 timeout->l4proto = l4proto;
866 tmpl = nf_ct_tmpl_alloc(ctx->net, zone, GFP_ATOMIC);
867 if (!tmpl) {
868 ret = -ENOMEM;
869 goto err_free_timeout;
870 }
871
872 timeout_ext = nf_ct_timeout_ext_add(tmpl, timeout, GFP_ATOMIC);
873 if (!timeout_ext) {
874 ret = -ENOMEM;
875 goto err_free_tmpl;
876 }
877
878 ret = nf_ct_netns_get(ctx->net, ctx->family);
879 if (ret < 0)
880 goto err_free_tmpl;
881
882 priv->tmpl = tmpl;
883
884 return 0;
885
886err_free_tmpl:
887 nf_ct_tmpl_free(tmpl);
888err_free_timeout:
889 kfree(timeout);
890err_proto_put:
891 nf_ct_l4proto_put(l4proto);
892 return ret;
893}
894
895static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx,
896 struct nft_object *obj)
897{
898 struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
899 struct nf_conn_timeout *t = nf_ct_timeout_find(priv->tmpl);
900 struct nf_ct_timeout *timeout;
901
902 timeout = rcu_dereference_raw(t->timeout);
903 nf_ct_untimeout(ctx->net, timeout);
904 nf_ct_l4proto_put(timeout->l4proto);
905 nf_ct_netns_put(ctx->net, ctx->family);
906 nf_ct_tmpl_free(priv->tmpl);
907}
908
909static int nft_ct_timeout_obj_dump(struct sk_buff *skb,
910 struct nft_object *obj, bool reset)
911{
912 const struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
913 const struct nf_conn_timeout *t = nf_ct_timeout_find(priv->tmpl);
914 const struct nf_ct_timeout *timeout = rcu_dereference_raw(t->timeout);
915 struct nlattr *nest_params;
916 int ret;
917
918 if (nla_put_u8(skb, NFTA_CT_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) ||
919 nla_put_be16(skb, NFTA_CT_TIMEOUT_L3PROTO, htons(timeout->l3num)))
920 return -1;
921
922 nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA | NLA_F_NESTED);
923 if (!nest_params)
924 return -1;
925
926 ret = timeout->l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data);
927 if (ret < 0)
928 return -1;
929 nla_nest_end(skb, nest_params);
930 return 0;
931}
932
933static const struct nla_policy nft_ct_timeout_policy[NFTA_CT_TIMEOUT_MAX + 1] = {
934 [NFTA_CT_TIMEOUT_L3PROTO] = {.type = NLA_U16 },
935 [NFTA_CT_TIMEOUT_L4PROTO] = {.type = NLA_U8 },
936 [NFTA_CT_TIMEOUT_DATA] = {.type = NLA_NESTED },
937};
938
939static struct nft_object_type nft_ct_timeout_obj_type;
940
941static const struct nft_object_ops nft_ct_timeout_obj_ops = {
942 .type = &nft_ct_timeout_obj_type,
943 .size = sizeof(struct nft_ct_timeout_obj),
944 .eval = nft_ct_timeout_obj_eval,
945 .init = nft_ct_timeout_obj_init,
946 .destroy = nft_ct_timeout_obj_destroy,
947 .dump = nft_ct_timeout_obj_dump,
948};
949
950static struct nft_object_type nft_ct_timeout_obj_type __read_mostly = {
951 .type = NFT_OBJECT_CT_TIMEOUT,
952 .ops = &nft_ct_timeout_obj_ops,
953 .maxattr = NFTA_CT_TIMEOUT_MAX,
954 .policy = nft_ct_timeout_policy,
955 .owner = THIS_MODULE,
956};
957#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
958
768static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, 959static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
769 const struct nlattr * const tb[], 960 const struct nlattr * const tb[],
770 struct nft_object *obj) 961 struct nft_object *obj)
@@ -773,6 +964,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
773 struct nf_conntrack_helper *help4, *help6; 964 struct nf_conntrack_helper *help4, *help6;
774 char name[NF_CT_HELPER_NAME_LEN]; 965 char name[NF_CT_HELPER_NAME_LEN];
775 int family = ctx->family; 966 int family = ctx->family;
967 int err;
776 968
777 if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO]) 969 if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO])
778 return -EINVAL; 970 return -EINVAL;
@@ -823,7 +1015,18 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
823 priv->helper4 = help4; 1015 priv->helper4 = help4;
824 priv->helper6 = help6; 1016 priv->helper6 = help6;
825 1017
1018 err = nf_ct_netns_get(ctx->net, ctx->family);
1019 if (err < 0)
1020 goto err_put_helper;
1021
826 return 0; 1022 return 0;
1023
1024err_put_helper:
1025 if (priv->helper4)
1026 nf_conntrack_helper_put(priv->helper4);
1027 if (priv->helper6)
1028 nf_conntrack_helper_put(priv->helper6);
1029 return err;
827} 1030}
828 1031
829static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx, 1032static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx,
@@ -835,6 +1038,8 @@ static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx,
835 nf_conntrack_helper_put(priv->helper4); 1038 nf_conntrack_helper_put(priv->helper4);
836 if (priv->helper6) 1039 if (priv->helper6)
837 nf_conntrack_helper_put(priv->helper6); 1040 nf_conntrack_helper_put(priv->helper6);
1041
1042 nf_ct_netns_put(ctx->net, ctx->family);
838} 1043}
839 1044
840static void nft_ct_helper_obj_eval(struct nft_object *obj, 1045static void nft_ct_helper_obj_eval(struct nft_object *obj,
@@ -870,7 +1075,7 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj,
870 if (test_bit(IPS_HELPER_BIT, &ct->status)) 1075 if (test_bit(IPS_HELPER_BIT, &ct->status))
871 return; 1076 return;
872 1077
873 help = nf_ct_helper_ext_add(ct, to_assign, GFP_ATOMIC); 1078 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
874 if (help) { 1079 if (help) {
875 rcu_assign_pointer(help->helper, to_assign); 1080 rcu_assign_pointer(help->helper, to_assign);
876 set_bit(IPS_HELPER_BIT, &ct->status); 1081 set_bit(IPS_HELPER_BIT, &ct->status);
@@ -949,9 +1154,17 @@ static int __init nft_ct_module_init(void)
949 err = nft_register_obj(&nft_ct_helper_obj_type); 1154 err = nft_register_obj(&nft_ct_helper_obj_type);
950 if (err < 0) 1155 if (err < 0)
951 goto err2; 1156 goto err2;
952 1157#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1158 err = nft_register_obj(&nft_ct_timeout_obj_type);
1159 if (err < 0)
1160 goto err3;
1161#endif
953 return 0; 1162 return 0;
954 1163
1164#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1165err3:
1166 nft_unregister_obj(&nft_ct_helper_obj_type);
1167#endif
955err2: 1168err2:
956 nft_unregister_expr(&nft_notrack_type); 1169 nft_unregister_expr(&nft_notrack_type);
957err1: 1170err1:
@@ -961,6 +1174,9 @@ err1:
961 1174
962static void __exit nft_ct_module_exit(void) 1175static void __exit nft_ct_module_exit(void)
963{ 1176{
1177#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1178 nft_unregister_obj(&nft_ct_timeout_obj_type);
1179#endif
964 nft_unregister_obj(&nft_ct_helper_obj_type); 1180 nft_unregister_obj(&nft_ct_helper_obj_type);
965 nft_unregister_expr(&nft_notrack_type); 1181 nft_unregister_expr(&nft_notrack_type);
966 nft_unregister_expr(&nft_ct_type); 1182 nft_unregister_expr(&nft_ct_type);
@@ -974,3 +1190,4 @@ MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
974MODULE_ALIAS_NFT_EXPR("ct"); 1190MODULE_ALIAS_NFT_EXPR("ct");
975MODULE_ALIAS_NFT_EXPR("notrack"); 1191MODULE_ALIAS_NFT_EXPR("notrack");
976MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); 1192MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER);
1193MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT);
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 27d7e4598ab6..6e91a37d57f2 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -118,6 +118,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
118 u64 timeout; 118 u64 timeout;
119 int err; 119 int err;
120 120
121 lockdep_assert_held(&ctx->net->nft.commit_mutex);
122
121 if (tb[NFTA_DYNSET_SET_NAME] == NULL || 123 if (tb[NFTA_DYNSET_SET_NAME] == NULL ||
122 tb[NFTA_DYNSET_OP] == NULL || 124 tb[NFTA_DYNSET_OP] == NULL ||
123 tb[NFTA_DYNSET_SREG_KEY] == NULL) 125 tb[NFTA_DYNSET_SREG_KEY] == NULL)
@@ -185,8 +187,6 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
185 if (tb[NFTA_DYNSET_EXPR] != NULL) { 187 if (tb[NFTA_DYNSET_EXPR] != NULL) {
186 if (!(set->flags & NFT_SET_EVAL)) 188 if (!(set->flags & NFT_SET_EVAL))
187 return -EINVAL; 189 return -EINVAL;
188 if (!nft_set_is_anonymous(set))
189 return -EOPNOTSUPP;
190 190
191 priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]); 191 priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]);
192 if (IS_ERR(priv->expr)) 192 if (IS_ERR(priv->expr))
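
The single line added to nft_dynset_init() documents and enforces its locking contract: with the new per-netns commit_mutex, lockdep_assert_held() makes a forgotten lock show up as a warning rather than a silent race. A tiny illustrative sketch of the same idiom (struct obj and update_counter() are made up for the example):

struct obj {
	struct mutex lock;
	unsigned int counter;
};

static void update_counter(struct obj *o)
{
	lockdep_assert_held(&o->lock);	/* WARNs under lockdep if the caller forgot the lock */
	o->counter++;
}
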
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 15adf8ca82c3..0777a93211e2 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -98,6 +98,7 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
98 const struct nft_data **d) 98 const struct nft_data **d)
99{ 99{
100 const struct nft_immediate_expr *priv = nft_expr_priv(expr); 100 const struct nft_immediate_expr *priv = nft_expr_priv(expr);
101 struct nft_ctx *pctx = (struct nft_ctx *)ctx;
101 const struct nft_data *data; 102 const struct nft_data *data;
102 int err; 103 int err;
103 104
@@ -109,9 +110,11 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
109 switch (data->verdict.code) { 110 switch (data->verdict.code) {
110 case NFT_JUMP: 111 case NFT_JUMP:
111 case NFT_GOTO: 112 case NFT_GOTO:
113 pctx->level++;
112 err = nft_chain_validate(ctx, data->verdict.chain); 114 err = nft_chain_validate(ctx, data->verdict.chain);
113 if (err < 0) 115 if (err < 0)
114 return err; 116 return err;
117 pctx->level--;
115 break; 118 break;
116 default: 119 default:
117 break; 120 break;
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 42e6fadf1417..ad13e8643599 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -26,9 +26,9 @@ struct nft_lookup {
26 struct nft_set_binding binding; 26 struct nft_set_binding binding;
27}; 27};
28 28
29static void nft_lookup_eval(const struct nft_expr *expr, 29void nft_lookup_eval(const struct nft_expr *expr,
30 struct nft_regs *regs, 30 struct nft_regs *regs,
31 const struct nft_pktinfo *pkt) 31 const struct nft_pktinfo *pkt)
32{ 32{
33 const struct nft_lookup *priv = nft_expr_priv(expr); 33 const struct nft_lookup *priv = nft_expr_priv(expr);
34 const struct nft_set *set = priv->set; 34 const struct nft_set *set = priv->set;
@@ -155,7 +155,9 @@ static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
155 struct nft_set_elem *elem) 155 struct nft_set_elem *elem)
156{ 156{
157 const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); 157 const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
158 struct nft_ctx *pctx = (struct nft_ctx *)ctx;
158 const struct nft_data *data; 159 const struct nft_data *data;
160 int err;
159 161
160 if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && 162 if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
161 *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) 163 *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
@@ -165,10 +167,17 @@ static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
165 switch (data->verdict.code) { 167 switch (data->verdict.code) {
166 case NFT_JUMP: 168 case NFT_JUMP:
167 case NFT_GOTO: 169 case NFT_GOTO:
168 return nft_chain_validate(ctx, data->verdict.chain); 170 pctx->level++;
171 err = nft_chain_validate(ctx, data->verdict.chain);
172 if (err < 0)
173 return err;
174 pctx->level--;
175 break;
169 default: 176 default:
170 return 0; 177 break;
171 } 178 }
179
180 return 0;
172} 181}
173 182
174static int nft_lookup_validate(const struct nft_ctx *ctx, 183static int nft_lookup_validate(const struct nft_ctx *ctx,
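
Both nft_immediate and nft_lookup validation now bump ctx->level around the recursive nft_chain_validate() call, so deeply nested jump/goto chains can be rejected instead of recursing without bound. A sketch of that depth accounting, with the limit and helper names illustrative (in the kernel the actual limit check lives inside nft_chain_validate()):

#define MAX_JUMP_DEPTH	16		/* illustrative bound */

struct walk_ctx {
	unsigned int level;
};

int validate_chain(struct walk_ctx *ctx, const void *chain);	/* assumed to exist */

static int validate_jump(struct walk_ctx *ctx, const void *target)
{
	int err;

	ctx->level++;			/* count this jump before recursing */
	if (ctx->level > MAX_JUMP_DEPTH)
		return -EMLINK;		/* ruleset nests too deeply */

	err = validate_chain(ctx, target);
	if (err < 0)
		return err;

	ctx->level--;			/* balanced only on success, as in the hunks above */
	return 0;
}
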
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 1105a23bda5e..297fe7d97c18 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -41,9 +41,9 @@ static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
41#include "../bridge/br_private.h" 41#include "../bridge/br_private.h"
42#endif 42#endif
43 43
44static void nft_meta_get_eval(const struct nft_expr *expr, 44void nft_meta_get_eval(const struct nft_expr *expr,
45 struct nft_regs *regs, 45 struct nft_regs *regs,
46 const struct nft_pktinfo *pkt) 46 const struct nft_pktinfo *pkt)
47{ 47{
48 const struct nft_meta *priv = nft_expr_priv(expr); 48 const struct nft_meta *priv = nft_expr_priv(expr);
49 const struct sk_buff *skb = pkt->skb; 49 const struct sk_buff *skb = pkt->skb;
@@ -107,7 +107,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr,
107 break; 107 break;
108 case NFT_META_SKUID: 108 case NFT_META_SKUID:
109 sk = skb_to_full_sk(skb); 109 sk = skb_to_full_sk(skb);
110 if (!sk || !sk_fullsock(sk)) 110 if (!sk || !sk_fullsock(sk) ||
111 !net_eq(nft_net(pkt), sock_net(sk)))
111 goto err; 112 goto err;
112 113
113 read_lock_bh(&sk->sk_callback_lock); 114 read_lock_bh(&sk->sk_callback_lock);
@@ -123,7 +124,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr,
123 break; 124 break;
124 case NFT_META_SKGID: 125 case NFT_META_SKGID:
125 sk = skb_to_full_sk(skb); 126 sk = skb_to_full_sk(skb);
126 if (!sk || !sk_fullsock(sk)) 127 if (!sk || !sk_fullsock(sk) ||
128 !net_eq(nft_net(pkt), sock_net(sk)))
127 goto err; 129 goto err;
128 130
129 read_lock_bh(&sk->sk_callback_lock); 131 read_lock_bh(&sk->sk_callback_lock);
@@ -214,7 +216,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr,
214#ifdef CONFIG_CGROUP_NET_CLASSID 216#ifdef CONFIG_CGROUP_NET_CLASSID
215 case NFT_META_CGROUP: 217 case NFT_META_CGROUP:
216 sk = skb_to_full_sk(skb); 218 sk = skb_to_full_sk(skb);
217 if (!sk || !sk_fullsock(sk)) 219 if (!sk || !sk_fullsock(sk) ||
220 !net_eq(nft_net(pkt), sock_net(sk)))
218 goto err; 221 goto err;
219 *dest = sock_cgroup_classid(&sk->sk_cgrp_data); 222 *dest = sock_cgroup_classid(&sk->sk_cgrp_data);
220 break; 223 break;
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 1f4d0854cf70..649d1700ec5b 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -237,10 +237,8 @@ static int nft_ng_random_map_init(const struct nft_ctx *ctx,
237 priv->map = nft_set_lookup_global(ctx->net, ctx->table, 237 priv->map = nft_set_lookup_global(ctx->net, ctx->table,
238 tb[NFTA_NG_SET_NAME], 238 tb[NFTA_NG_SET_NAME],
239 tb[NFTA_NG_SET_ID], genmask); 239 tb[NFTA_NG_SET_ID], genmask);
240 if (IS_ERR(priv->map))
241 return PTR_ERR(priv->map);
242 240
243 return 0; 241 return PTR_ERR_OR_ZERO(priv->map);
244} 242}
245 243
246static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) 244static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
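
The nft_numgen.c hunk is a pure idiom cleanup: PTR_ERR_OR_ZERO() collapses the usual IS_ERR()/PTR_ERR() tail into a single return. For comparison (struct priv and struct map are illustrative):

struct map;				/* opaque, illustrative */
struct priv { struct map *map; };

static int store_map_long(struct priv *p, struct map *m)
{
	p->map = m;
	if (IS_ERR(p->map))
		return PTR_ERR(p->map);
	return 0;
}

static int store_map_short(struct priv *p, struct map *m)
{
	p->map = m;
	return PTR_ERR_OR_ZERO(p->map);	/* same behaviour, one line */
}
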
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
new file mode 100644
index 000000000000..5af74b37f423
--- /dev/null
+++ b/net/netfilter/nft_osf.c
@@ -0,0 +1,104 @@
1#include <net/ip.h>
2#include <net/tcp.h>
3
4#include <net/netfilter/nf_tables.h>
5#include <linux/netfilter/nfnetlink_osf.h>
6
7struct nft_osf {
8 enum nft_registers dreg:8;
9};
10
11static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = {
12 [NFTA_OSF_DREG] = { .type = NLA_U32 },
13};
14
15static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
16 const struct nft_pktinfo *pkt)
17{
18 struct nft_osf *priv = nft_expr_priv(expr);
19 u32 *dest = &regs->data[priv->dreg];
20 struct sk_buff *skb = pkt->skb;
21 const struct tcphdr *tcp;
22 struct tcphdr _tcph;
23 const char *os_name;
24
25 tcp = skb_header_pointer(skb, ip_hdrlen(skb),
26 sizeof(struct tcphdr), &_tcph);
27 if (!tcp) {
28 regs->verdict.code = NFT_BREAK;
29 return;
30 }
31 if (!tcp->syn) {
32 regs->verdict.code = NFT_BREAK;
33 return;
34 }
35
36 os_name = nf_osf_find(skb, nf_osf_fingers);
37 if (!os_name)
38 strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
39 else
40 strncpy((char *)dest, os_name, NFT_OSF_MAXGENRELEN);
41}
42
43static int nft_osf_init(const struct nft_ctx *ctx,
44 const struct nft_expr *expr,
45 const struct nlattr * const tb[])
46{
47 struct nft_osf *priv = nft_expr_priv(expr);
48 int err;
49
50 priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
51 err = nft_validate_register_store(ctx, priv->dreg, NULL,
52 NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN);
53 if (err < 0)
54 return err;
55
56 return 0;
57}
58
59static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
60{
61 const struct nft_osf *priv = nft_expr_priv(expr);
62
63 if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg))
64 goto nla_put_failure;
65
66 return 0;
67
68nla_put_failure:
69 return -1;
70}
71
72static struct nft_expr_type nft_osf_type;
73static const struct nft_expr_ops nft_osf_op = {
74 .eval = nft_osf_eval,
75 .size = NFT_EXPR_SIZE(sizeof(struct nft_osf)),
76 .init = nft_osf_init,
77 .dump = nft_osf_dump,
78 .type = &nft_osf_type,
79};
80
81static struct nft_expr_type nft_osf_type __read_mostly = {
82 .ops = &nft_osf_op,
83 .name = "osf",
84 .owner = THIS_MODULE,
85 .policy = nft_osf_policy,
86 .maxattr = NFTA_OSF_MAX,
87};
88
89static int __init nft_osf_module_init(void)
90{
91 return nft_register_expr(&nft_osf_type);
92}
93
94static void __exit nft_osf_module_exit(void)
95{
96 return nft_unregister_expr(&nft_osf_type);
97}
98
99module_init(nft_osf_module_init);
100module_exit(nft_osf_module_exit);
101
102MODULE_LICENSE("GPL");
103MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
104MODULE_ALIAS_NFT_EXPR("osf");
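nft_osf.c is new: it looks up the passive OS fingerprint for TCP SYN packets via the shared nfnetlink_osf list (nf_osf_fingers, nf_osf_find) and writes the genre name into the destination register as fixed-width data of at most NFT_OSF_MAXGENRELEN bytes, the same length validated in nft_osf_init(). Because strncpy() pads short names but does not NUL-terminate one that fills the whole buffer, the register has to be treated as a fixed-width field rather than a C string. A runnable userspace illustration of that copy behaviour (the 16-byte length is an assumption for the demo; the real value comes from the nf_tables uapi header):

#include <stdio.h>
#include <string.h>

#define NFT_OSF_MAXGENRELEN 16	/* assumed for the demo */

int main(void)
{
	char dest[NFT_OSF_MAXGENRELEN + 1] = { 0 };	/* one spare byte for printing */

	strncpy(dest, "unknown", NFT_OSF_MAXGENRELEN);	/* short name: NUL padded */
	printf("short genre: \"%s\"\n", dest);

	strncpy(dest, "a-very-long-os-genre-name", NFT_OSF_MAXGENRELEN);
	dest[NFT_OSF_MAXGENRELEN] = '\0';		/* strncpy did not terminate it */
	printf("truncated genre: \"%s\"\n", dest);
	return 0;
}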
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index d6626e01c7ee..f866bd41e5d2 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -248,13 +248,13 @@ static inline u32 nft_bitmap_size(u32 klen)
248 return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1; 248 return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
249} 249}
250 250
251static inline u32 nft_bitmap_total_size(u32 klen) 251static inline u64 nft_bitmap_total_size(u32 klen)
252{ 252{
253 return sizeof(struct nft_bitmap) + nft_bitmap_size(klen); 253 return sizeof(struct nft_bitmap) + nft_bitmap_size(klen);
254} 254}
255 255
256static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[], 256static u64 nft_bitmap_privsize(const struct nlattr * const nla[],
257 const struct nft_set_desc *desc) 257 const struct nft_set_desc *desc)
258{ 258{
259 u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); 259 u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
260 260
@@ -296,7 +296,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
296 return true; 296 return true;
297} 297}
298 298
299static struct nft_set_type nft_bitmap_type __read_mostly = { 299struct nft_set_type nft_set_bitmap_type __read_mostly = {
300 .owner = THIS_MODULE, 300 .owner = THIS_MODULE,
301 .ops = { 301 .ops = {
302 .privsize = nft_bitmap_privsize, 302 .privsize = nft_bitmap_privsize,
@@ -314,20 +314,3 @@ static struct nft_set_type nft_bitmap_type __read_mostly = {
314 .get = nft_bitmap_get, 314 .get = nft_bitmap_get,
315 }, 315 },
316}; 316};
317
318static int __init nft_bitmap_module_init(void)
319{
320 return nft_register_set(&nft_bitmap_type);
321}
322
323static void __exit nft_bitmap_module_exit(void)
324{
325 nft_unregister_set(&nft_bitmap_type);
326}
327
328module_init(nft_bitmap_module_init);
329module_exit(nft_bitmap_module_exit);
330
331MODULE_LICENSE("GPL");
332MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
333MODULE_ALIAS_NFT_SET();
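The bitmap backend loses its module boilerplate here (registration moves to a common built-in place; see also the hash and rbtree hunks below) and its privsize/total-size helpers are widened to u64 to match the updated struct nft_set_type callback. The growth of nft_bitmap_size() also explains why this backend only makes sense for very small keys; a quick userspace evaluation of the same formula, widened to 64 bits so the larger key lengths do not overflow in the demo:

#include <stdio.h>
#include <stdint.h>

#define BITS_PER_BYTE 8

/* Same arithmetic as nft_bitmap_size() above: two bits of state per
 * possible key value, i.e. memory grows as 2^(klen * 8). */
static uint64_t bitmap_size(uint32_t klen)
{
	return ((2ULL << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
}

int main(void)
{
	for (uint32_t klen = 1; klen <= 4; klen++)
		printf("klen=%u -> %llu bytes\n", klen,
		       (unsigned long long)bitmap_size(klen));
	/* prints 64, 16384, ~4 MiB and ~1 GiB respectively */
	return 0;
}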
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 6f9a1365a09f..015124e649cb 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -341,8 +341,8 @@ schedule:
341 nft_set_gc_interval(set)); 341 nft_set_gc_interval(set));
342} 342}
343 343
344static unsigned int nft_rhash_privsize(const struct nlattr * const nla[], 344static u64 nft_rhash_privsize(const struct nlattr * const nla[],
345 const struct nft_set_desc *desc) 345 const struct nft_set_desc *desc)
346{ 346{
347 return sizeof(struct nft_rhash); 347 return sizeof(struct nft_rhash);
348} 348}
@@ -387,6 +387,7 @@ static void nft_rhash_destroy(const struct nft_set *set)
387 struct nft_rhash *priv = nft_set_priv(set); 387 struct nft_rhash *priv = nft_set_priv(set);
388 388
389 cancel_delayed_work_sync(&priv->gc_work); 389 cancel_delayed_work_sync(&priv->gc_work);
390 rcu_barrier();
390 rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, 391 rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy,
391 (void *)set); 392 (void *)set);
392} 393}
@@ -584,8 +585,8 @@ cont:
584 } 585 }
585} 586}
586 587
587static unsigned int nft_hash_privsize(const struct nlattr * const nla[], 588static u64 nft_hash_privsize(const struct nlattr * const nla[],
588 const struct nft_set_desc *desc) 589 const struct nft_set_desc *desc)
589{ 590{
590 return sizeof(struct nft_hash) + 591 return sizeof(struct nft_hash) +
591 nft_hash_buckets(desc->size) * sizeof(struct hlist_head); 592 nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
@@ -654,7 +655,7 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features
654 return true; 655 return true;
655} 656}
656 657
657static struct nft_set_type nft_rhash_type __read_mostly = { 658struct nft_set_type nft_set_rhash_type __read_mostly = {
658 .owner = THIS_MODULE, 659 .owner = THIS_MODULE,
659 .features = NFT_SET_MAP | NFT_SET_OBJECT | 660 .features = NFT_SET_MAP | NFT_SET_OBJECT |
660 NFT_SET_TIMEOUT | NFT_SET_EVAL, 661 NFT_SET_TIMEOUT | NFT_SET_EVAL,
@@ -677,7 +678,7 @@ static struct nft_set_type nft_rhash_type __read_mostly = {
677 }, 678 },
678}; 679};
679 680
680static struct nft_set_type nft_hash_type __read_mostly = { 681struct nft_set_type nft_set_hash_type __read_mostly = {
681 .owner = THIS_MODULE, 682 .owner = THIS_MODULE,
682 .features = NFT_SET_MAP | NFT_SET_OBJECT, 683 .features = NFT_SET_MAP | NFT_SET_OBJECT,
683 .ops = { 684 .ops = {
@@ -697,7 +698,7 @@ static struct nft_set_type nft_hash_type __read_mostly = {
697 }, 698 },
698}; 699};
699 700
700static struct nft_set_type nft_hash_fast_type __read_mostly = { 701struct nft_set_type nft_set_hash_fast_type __read_mostly = {
701 .owner = THIS_MODULE, 702 .owner = THIS_MODULE,
702 .features = NFT_SET_MAP | NFT_SET_OBJECT, 703 .features = NFT_SET_MAP | NFT_SET_OBJECT,
703 .ops = { 704 .ops = {
@@ -716,26 +717,3 @@ static struct nft_set_type nft_hash_fast_type __read_mostly = {
716 .get = nft_hash_get, 717 .get = nft_hash_get,
717 }, 718 },
718}; 719};
719
720static int __init nft_hash_module_init(void)
721{
722 if (nft_register_set(&nft_hash_fast_type) ||
723 nft_register_set(&nft_hash_type) ||
724 nft_register_set(&nft_rhash_type))
725 return 1;
726 return 0;
727}
728
729static void __exit nft_hash_module_exit(void)
730{
731 nft_unregister_set(&nft_rhash_type);
732 nft_unregister_set(&nft_hash_type);
733 nft_unregister_set(&nft_hash_fast_type);
734}
735
736module_init(nft_hash_module_init);
737module_exit(nft_hash_module_exit);
738
739MODULE_LICENSE("GPL");
740MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
741MODULE_ALIAS_NFT_SET();
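As with the bitmap backend, the hash hunks above drop the per-module init/exit code and export the types under new nft_set_*_type names; the rhash destroy path also gains an rcu_barrier() so that frees still pending in RCU callbacks complete before rhashtable_free_and_destroy() tears the table down. With every backend exported, a single built-in registration point can own them all; a purely illustrative sketch of what that consolidation can look like (only the exported type names and nft_register_set()/nft_unregister_set() come from this diff, the rest is assumption):

extern struct nft_set_type nft_set_hash_fast_type;
extern struct nft_set_type nft_set_hash_type;
extern struct nft_set_type nft_set_rhash_type;
extern struct nft_set_type nft_set_bitmap_type;
extern struct nft_set_type nft_set_rbtree_type;

static struct nft_set_type *nft_builtin_set_types[] = {
	&nft_set_hash_fast_type,
	&nft_set_hash_type,
	&nft_set_rhash_type,
	&nft_set_bitmap_type,
	&nft_set_rbtree_type,
};

static int __init example_set_types_init(void)
{
	int i, err;

	for (i = 0; i < ARRAY_SIZE(nft_builtin_set_types); i++) {
		err = nft_register_set(nft_builtin_set_types[i]);
		if (err < 0)
			goto unwind;
	}
	return 0;
unwind:
	while (--i >= 0)
		nft_unregister_set(nft_builtin_set_types[i]);
	return err;
}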
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 7f3a9a211034..55e2d9215c0d 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -381,7 +381,7 @@ static void nft_rbtree_gc(struct work_struct *work)
381 381
382 gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); 382 gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
383 if (!gcb) 383 if (!gcb)
384 goto out; 384 break;
385 385
386 atomic_dec(&set->nelems); 386 atomic_dec(&set->nelems);
387 nft_set_gc_batch_add(gcb, rbe); 387 nft_set_gc_batch_add(gcb, rbe);
@@ -390,10 +390,12 @@ static void nft_rbtree_gc(struct work_struct *work)
390 rbe = rb_entry(prev, struct nft_rbtree_elem, node); 390 rbe = rb_entry(prev, struct nft_rbtree_elem, node);
391 atomic_dec(&set->nelems); 391 atomic_dec(&set->nelems);
392 nft_set_gc_batch_add(gcb, rbe); 392 nft_set_gc_batch_add(gcb, rbe);
393 prev = NULL;
393 } 394 }
394 node = rb_next(node); 395 node = rb_next(node);
396 if (!node)
397 break;
395 } 398 }
396out:
397 if (gcb) { 399 if (gcb) {
398 for (i = 0; i < gcb->head.cnt; i++) { 400 for (i = 0; i < gcb->head.cnt; i++) {
399 rbe = gcb->elems[i]; 401 rbe = gcb->elems[i];
@@ -409,8 +411,8 @@ out:
409 nft_set_gc_interval(set)); 411 nft_set_gc_interval(set));
410} 412}
411 413
412static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[], 414static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
413 const struct nft_set_desc *desc) 415 const struct nft_set_desc *desc)
414{ 416{
415 return sizeof(struct nft_rbtree); 417 return sizeof(struct nft_rbtree);
416} 418}
@@ -440,6 +442,7 @@ static void nft_rbtree_destroy(const struct nft_set *set)
440 struct rb_node *node; 442 struct rb_node *node;
441 443
442 cancel_delayed_work_sync(&priv->gc_work); 444 cancel_delayed_work_sync(&priv->gc_work);
445 rcu_barrier();
443 while ((node = priv->root.rb_node) != NULL) { 446 while ((node = priv->root.rb_node) != NULL) {
444 rb_erase(node, &priv->root); 447 rb_erase(node, &priv->root);
445 rbe = rb_entry(node, struct nft_rbtree_elem, node); 448 rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -462,7 +465,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
462 return true; 465 return true;
463} 466}
464 467
465static struct nft_set_type nft_rbtree_type __read_mostly = { 468struct nft_set_type nft_set_rbtree_type __read_mostly = {
466 .owner = THIS_MODULE, 469 .owner = THIS_MODULE,
467 .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, 470 .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
468 .ops = { 471 .ops = {
@@ -481,20 +484,3 @@ static struct nft_set_type nft_rbtree_type __read_mostly = {
481 .get = nft_rbtree_get, 484 .get = nft_rbtree_get,
482 }, 485 },
483}; 486};
484
485static int __init nft_rbtree_module_init(void)
486{
487 return nft_register_set(&nft_rbtree_type);
488}
489
490static void __exit nft_rbtree_module_exit(void)
491{
492 nft_unregister_set(&nft_rbtree_type);
493}
494
495module_init(nft_rbtree_module_init);
496module_exit(nft_rbtree_module_exit);
497
498MODULE_LICENSE("GPL");
499MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
500MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 74e1b3bd6954..d7f3776dfd71 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -23,12 +23,15 @@ static void nft_socket_eval(const struct nft_expr *expr,
23 struct sock *sk = skb->sk; 23 struct sock *sk = skb->sk;
24 u32 *dest = &regs->data[priv->dreg]; 24 u32 *dest = &regs->data[priv->dreg];
25 25
26 if (sk && !net_eq(nft_net(pkt), sock_net(sk)))
27 sk = NULL;
28
26 if (!sk) 29 if (!sk)
27 switch(nft_pf(pkt)) { 30 switch(nft_pf(pkt)) {
28 case NFPROTO_IPV4: 31 case NFPROTO_IPV4:
29 sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt)); 32 sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
30 break; 33 break;
31#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6) 34#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
32 case NFPROTO_IPV6: 35 case NFPROTO_IPV6:
33 sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt)); 36 sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
34 break; 37 break;
@@ -39,8 +42,8 @@ static void nft_socket_eval(const struct nft_expr *expr,
39 return; 42 return;
40 } 43 }
41 44
42 if(!sk) { 45 if (!sk) {
43 nft_reg_store8(dest, 0); 46 regs->verdict.code = NFT_BREAK;
44 return; 47 return;
45 } 48 }
46 49
@@ -51,6 +54,14 @@ static void nft_socket_eval(const struct nft_expr *expr,
51 case NFT_SOCKET_TRANSPARENT: 54 case NFT_SOCKET_TRANSPARENT:
52 nft_reg_store8(dest, inet_sk_transparent(sk)); 55 nft_reg_store8(dest, inet_sk_transparent(sk));
53 break; 56 break;
57 case NFT_SOCKET_MARK:
58 if (sk_fullsock(sk)) {
59 *dest = sk->sk_mark;
60 } else {
61 regs->verdict.code = NFT_BREAK;
62 return;
63 }
64 break;
54 default: 65 default:
55 WARN_ON(1); 66 WARN_ON(1);
56 regs->verdict.code = NFT_BREAK; 67 regs->verdict.code = NFT_BREAK;
@@ -74,7 +85,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
74 85
75 switch(ctx->family) { 86 switch(ctx->family) {
76 case NFPROTO_IPV4: 87 case NFPROTO_IPV4:
77#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6) 88#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
78 case NFPROTO_IPV6: 89 case NFPROTO_IPV6:
79#endif 90#endif
80 case NFPROTO_INET: 91 case NFPROTO_INET:
@@ -88,6 +99,9 @@ static int nft_socket_init(const struct nft_ctx *ctx,
88 case NFT_SOCKET_TRANSPARENT: 99 case NFT_SOCKET_TRANSPARENT:
89 len = sizeof(u8); 100 len = sizeof(u8);
90 break; 101 break;
102 case NFT_SOCKET_MARK:
103 len = sizeof(u32);
104 break;
91 default: 105 default:
92 return -EOPNOTSUPP; 106 return -EOPNOTSUPP;
93 } 107 }
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
new file mode 100644
index 000000000000..f92a82c73880
--- /dev/null
+++ b/net/netfilter/nft_tproxy.c
@@ -0,0 +1,318 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#include <linux/module.h>
3#include <linux/netfilter/nf_tables.h>
4#include <net/netfilter/nf_tables.h>
5#include <net/netfilter/nf_tables_core.h>
6#include <net/netfilter/nf_tproxy.h>
7#include <net/inet_sock.h>
8#include <net/tcp.h>
9#include <linux/if_ether.h>
10#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
11#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
12#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
13#endif
14
15struct nft_tproxy {
16 enum nft_registers sreg_addr:8;
17 enum nft_registers sreg_port:8;
18 u8 family;
19};
20
21static void nft_tproxy_eval_v4(const struct nft_expr *expr,
22 struct nft_regs *regs,
23 const struct nft_pktinfo *pkt)
24{
25 const struct nft_tproxy *priv = nft_expr_priv(expr);
26 struct sk_buff *skb = pkt->skb;
27 const struct iphdr *iph = ip_hdr(skb);
28 struct udphdr _hdr, *hp;
29 __be32 taddr = 0;
30 __be16 tport = 0;
31 struct sock *sk;
32
33 hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
34 if (!hp) {
35 regs->verdict.code = NFT_BREAK;
36 return;
37 }
38
39 /* check if there's an ongoing connection on the packet addresses, this
40 * happens if the redirect already happened and the current packet
41 * belongs to an already established connection
42 */
43 sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol,
44 iph->saddr, iph->daddr,
45 hp->source, hp->dest,
46 skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
47
48 if (priv->sreg_addr)
49 taddr = regs->data[priv->sreg_addr];
50 taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
51
52 if (priv->sreg_port)
53 tport = regs->data[priv->sreg_port];
54 if (!tport)
55 tport = hp->dest;
56
57 /* UDP has no TCP_TIME_WAIT state, so we never enter here */
58 if (sk && sk->sk_state == TCP_TIME_WAIT) {
59 /* reopening a TIME_WAIT connection needs special handling */
60 sk = nf_tproxy_handle_time_wait4(nft_net(pkt), skb, taddr, tport, sk);
61 } else if (!sk) {
62 /* no, there's no established connection, check if
63 * there's a listener on the redirected addr/port
64 */
65 sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol,
66 iph->saddr, taddr,
67 hp->source, tport,
68 skb->dev, NF_TPROXY_LOOKUP_LISTENER);
69 }
70
71 if (sk && nf_tproxy_sk_is_transparent(sk))
72 nf_tproxy_assign_sock(skb, sk);
73 else
74 regs->verdict.code = NFT_BREAK;
75}
76
77#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
78static void nft_tproxy_eval_v6(const struct nft_expr *expr,
79 struct nft_regs *regs,
80 const struct nft_pktinfo *pkt)
81{
82 const struct nft_tproxy *priv = nft_expr_priv(expr);
83 struct sk_buff *skb = pkt->skb;
84 const struct ipv6hdr *iph = ipv6_hdr(skb);
85 struct in6_addr taddr;
86 int thoff = pkt->xt.thoff;
87 struct udphdr _hdr, *hp;
88 __be16 tport = 0;
89 struct sock *sk;
90 int l4proto;
91
92 memset(&taddr, 0, sizeof(taddr));
93
94 if (!pkt->tprot_set) {
95 regs->verdict.code = NFT_BREAK;
96 return;
97 }
98 l4proto = pkt->tprot;
99
100 hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
101 if (hp == NULL) {
102 regs->verdict.code = NFT_BREAK;
103 return;
104 }
105
106 /* check if there's an ongoing connection on the packet addresses, this
107 * happens if the redirect already happened and the current packet
108 * belongs to an already established connection
109 */
110 sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, l4proto,
111 &iph->saddr, &iph->daddr,
112 hp->source, hp->dest,
113 nft_in(pkt), NF_TPROXY_LOOKUP_ESTABLISHED);
114
115 if (priv->sreg_addr)
116 memcpy(&taddr, &regs->data[priv->sreg_addr], sizeof(taddr));
117 taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr);
118
119 if (priv->sreg_port)
120 tport = regs->data[priv->sreg_port];
121 if (!tport)
122 tport = hp->dest;
123
124 /* UDP has no TCP_TIME_WAIT state, so we never enter here */
125 if (sk && sk->sk_state == TCP_TIME_WAIT) {
126 /* reopening a TIME_WAIT connection needs special handling */
127 sk = nf_tproxy_handle_time_wait6(skb, l4proto, thoff,
128 nft_net(pkt),
129 &taddr,
130 tport,
131 sk);
132 } else if (!sk) {
133 /* no there's no established connection, check if
134 * there's a listener on the redirected addr/port
135 */
136 sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff,
137 l4proto, &iph->saddr, &taddr,
138 hp->source, tport,
139 nft_in(pkt), NF_TPROXY_LOOKUP_LISTENER);
140 }
141
142 /* NOTE: assign_sock consumes our sk reference */
143 if (sk && nf_tproxy_sk_is_transparent(sk))
144 nf_tproxy_assign_sock(skb, sk);
145 else
146 regs->verdict.code = NFT_BREAK;
147}
148#endif
149
150static void nft_tproxy_eval(const struct nft_expr *expr,
151 struct nft_regs *regs,
152 const struct nft_pktinfo *pkt)
153{
154 const struct nft_tproxy *priv = nft_expr_priv(expr);
155
156 switch (nft_pf(pkt)) {
157 case NFPROTO_IPV4:
158 switch (priv->family) {
159 case NFPROTO_IPV4:
160 case NFPROTO_UNSPEC:
161 nft_tproxy_eval_v4(expr, regs, pkt);
162 return;
163 }
164 break;
165#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
166 case NFPROTO_IPV6:
167 switch (priv->family) {
168 case NFPROTO_IPV6:
169 case NFPROTO_UNSPEC:
170 nft_tproxy_eval_v6(expr, regs, pkt);
171 return;
172 }
173#endif
174 }
175 regs->verdict.code = NFT_BREAK;
176}
177
178static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = {
179 [NFTA_TPROXY_FAMILY] = { .type = NLA_U32 },
180 [NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 },
181 [NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 },
182};
183
184static int nft_tproxy_init(const struct nft_ctx *ctx,
185 const struct nft_expr *expr,
186 const struct nlattr * const tb[])
187{
188 struct nft_tproxy *priv = nft_expr_priv(expr);
189 unsigned int alen = 0;
190 int err;
191
192 if (!tb[NFTA_TPROXY_FAMILY] ||
193 (!tb[NFTA_TPROXY_REG_ADDR] && !tb[NFTA_TPROXY_REG_PORT]))
194 return -EINVAL;
195
196 priv->family = ntohl(nla_get_be32(tb[NFTA_TPROXY_FAMILY]));
197
198 switch (ctx->family) {
199 case NFPROTO_IPV4:
200 if (priv->family != NFPROTO_IPV4)
201 return -EINVAL;
202 break;
203#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
204 case NFPROTO_IPV6:
205 if (priv->family != NFPROTO_IPV6)
206 return -EINVAL;
207 break;
208#endif
209 case NFPROTO_INET:
210 break;
211 default:
212 return -EOPNOTSUPP;
213 }
214
215 /* Address is specified but the rule family is not set accordingly */
216 if (priv->family == NFPROTO_UNSPEC && tb[NFTA_TPROXY_REG_ADDR])
217 return -EINVAL;
218
219 switch (priv->family) {
220 case NFPROTO_IPV4:
221 alen = FIELD_SIZEOF(union nf_inet_addr, in);
222 err = nf_defrag_ipv4_enable(ctx->net);
223 if (err)
224 return err;
225 break;
226#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
227 case NFPROTO_IPV6:
228 alen = FIELD_SIZEOF(union nf_inet_addr, in6);
229 err = nf_defrag_ipv6_enable(ctx->net);
230 if (err)
231 return err;
232 break;
233#endif
234 case NFPROTO_UNSPEC:
235 /* No address is specified here */
236 err = nf_defrag_ipv4_enable(ctx->net);
237 if (err)
238 return err;
239#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
240 err = nf_defrag_ipv6_enable(ctx->net);
241 if (err)
242 return err;
243#endif
244 break;
245 default:
246 return -EOPNOTSUPP;
247 }
248
249 if (tb[NFTA_TPROXY_REG_ADDR]) {
250 priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]);
251 err = nft_validate_register_load(priv->sreg_addr, alen);
252 if (err < 0)
253 return err;
254 }
255
256 if (tb[NFTA_TPROXY_REG_PORT]) {
257 priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]);
258 err = nft_validate_register_load(priv->sreg_port, sizeof(u16));
259 if (err < 0)
260 return err;
261 }
262
263 return 0;
264}
265
266static int nft_tproxy_dump(struct sk_buff *skb,
267 const struct nft_expr *expr)
268{
269 const struct nft_tproxy *priv = nft_expr_priv(expr);
270
271 if (nla_put_be32(skb, NFTA_TPROXY_FAMILY, htonl(priv->family)))
272 return -1;
273
274 if (priv->sreg_addr &&
275 nft_dump_register(skb, NFTA_TPROXY_REG_ADDR, priv->sreg_addr))
276 return -1;
277
278 if (priv->sreg_port &&
279 nft_dump_register(skb, NFTA_TPROXY_REG_PORT, priv->sreg_port))
280 return -1;
281
282 return 0;
283}
284
285static struct nft_expr_type nft_tproxy_type;
286static const struct nft_expr_ops nft_tproxy_ops = {
287 .type = &nft_tproxy_type,
288 .size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)),
289 .eval = nft_tproxy_eval,
290 .init = nft_tproxy_init,
291 .dump = nft_tproxy_dump,
292};
293
294static struct nft_expr_type nft_tproxy_type __read_mostly = {
295 .name = "tproxy",
296 .ops = &nft_tproxy_ops,
297 .policy = nft_tproxy_policy,
298 .maxattr = NFTA_TPROXY_MAX,
299 .owner = THIS_MODULE,
300};
301
302static int __init nft_tproxy_module_init(void)
303{
304 return nft_register_expr(&nft_tproxy_type);
305}
306
307static void __exit nft_tproxy_module_exit(void)
308{
309 nft_unregister_expr(&nft_tproxy_type);
310}
311
312module_init(nft_tproxy_module_init);
313module_exit(nft_tproxy_module_exit);
314
315MODULE_LICENSE("GPL");
316MODULE_AUTHOR("Máté Eckl");
317MODULE_DESCRIPTION("nf_tables tproxy support module");
318MODULE_ALIAS_NFT_EXPR("tproxy");
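nft_tproxy.c is also new; it is the nf_tables counterpart of the xt_TPROXY target further down and uses the same shared nf_tproxy helpers for the established/listener socket lookups and for nf_tproxy_assign_sock(). The listener branch (NF_TPROXY_LOOKUP_LISTENER) only pays off when some local program has bound a socket with IP_TRANSPARENT so it can accept packets for foreign addresses. A minimal userspace sketch of such a listener (the port is an arbitrary example; running it needs CAP_NET_ADMIN plus the usual policy-routing setup):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef IP_TRANSPARENT
#define IP_TRANSPARENT 19	/* from linux/in.h, for older libc headers */
#endif

int main(void)
{
	int one = 1;
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 ||
	    setsockopt(fd, IPPROTO_IP, IP_TRANSPARENT, &one, sizeof(one)) < 0) {
		perror("socket/IP_TRANSPARENT (needs CAP_NET_ADMIN)");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(12345);	/* example redirect port */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 16) < 0) {
		perror("bind/listen");
		return 1;
	}
	printf("transparent listener on port 12345\n");
	pause();
	return 0;
}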
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
new file mode 100644
index 000000000000..3a15f219e4e7
--- /dev/null
+++ b/net/netfilter/nft_tunnel.c
@@ -0,0 +1,566 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/seqlock.h>
6#include <linux/netlink.h>
7#include <linux/netfilter.h>
8#include <linux/netfilter/nf_tables.h>
9#include <net/netfilter/nf_tables.h>
10#include <net/dst_metadata.h>
11#include <net/ip_tunnels.h>
12#include <net/vxlan.h>
13#include <net/erspan.h>
14
15struct nft_tunnel {
16 enum nft_tunnel_keys key:8;
17 enum nft_registers dreg:8;
18};
19
20static void nft_tunnel_get_eval(const struct nft_expr *expr,
21 struct nft_regs *regs,
22 const struct nft_pktinfo *pkt)
23{
24 const struct nft_tunnel *priv = nft_expr_priv(expr);
25 u32 *dest = &regs->data[priv->dreg];
26 struct ip_tunnel_info *tun_info;
27
28 tun_info = skb_tunnel_info(pkt->skb);
29
30 switch (priv->key) {
31 case NFT_TUNNEL_PATH:
32 nft_reg_store8(dest, !!tun_info);
33 break;
34 case NFT_TUNNEL_ID:
35 if (!tun_info) {
36 regs->verdict.code = NFT_BREAK;
37 return;
38 }
39 *dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id));
40 break;
41 default:
42 WARN_ON(1);
43 regs->verdict.code = NFT_BREAK;
44 }
45}
46
47static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = {
48 [NFTA_TUNNEL_KEY] = { .type = NLA_U32 },
49 [NFTA_TUNNEL_DREG] = { .type = NLA_U32 },
50};
51
52static int nft_tunnel_get_init(const struct nft_ctx *ctx,
53 const struct nft_expr *expr,
54 const struct nlattr * const tb[])
55{
56 struct nft_tunnel *priv = nft_expr_priv(expr);
57 u32 len;
58
59 if (!tb[NFTA_TUNNEL_KEY] &&
60 !tb[NFTA_TUNNEL_DREG])
61 return -EINVAL;
62
63 priv->key = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY]));
64 switch (priv->key) {
65 case NFT_TUNNEL_PATH:
66 len = sizeof(u8);
67 break;
68 case NFT_TUNNEL_ID:
69 len = sizeof(u32);
70 break;
71 default:
72 return -EOPNOTSUPP;
73 }
74
75 priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]);
76
77 return nft_validate_register_store(ctx, priv->dreg, NULL,
78 NFT_DATA_VALUE, len);
79}
80
81static int nft_tunnel_get_dump(struct sk_buff *skb,
82 const struct nft_expr *expr)
83{
84 const struct nft_tunnel *priv = nft_expr_priv(expr);
85
86 if (nla_put_be32(skb, NFTA_TUNNEL_KEY, htonl(priv->key)))
87 goto nla_put_failure;
88 if (nft_dump_register(skb, NFTA_TUNNEL_DREG, priv->dreg))
89 goto nla_put_failure;
90 return 0;
91
92nla_put_failure:
93 return -1;
94}
95
96static struct nft_expr_type nft_tunnel_type;
97static const struct nft_expr_ops nft_tunnel_get_ops = {
98 .type = &nft_tunnel_type,
99 .size = NFT_EXPR_SIZE(sizeof(struct nft_tunnel)),
100 .eval = nft_tunnel_get_eval,
101 .init = nft_tunnel_get_init,
102 .dump = nft_tunnel_get_dump,
103};
104
105static struct nft_expr_type nft_tunnel_type __read_mostly = {
106 .name = "tunnel",
107 .ops = &nft_tunnel_get_ops,
108 .policy = nft_tunnel_policy,
109 .maxattr = NFTA_TUNNEL_MAX,
110 .owner = THIS_MODULE,
111};
112
113struct nft_tunnel_opts {
114 union {
115 struct vxlan_metadata vxlan;
116 struct erspan_metadata erspan;
117 } u;
118 u32 len;
119 __be16 flags;
120};
121
122struct nft_tunnel_obj {
123 struct metadata_dst *md;
124 struct nft_tunnel_opts opts;
125};
126
127static const struct nla_policy nft_tunnel_ip_policy[NFTA_TUNNEL_KEY_IP_MAX + 1] = {
128 [NFTA_TUNNEL_KEY_IP_SRC] = { .type = NLA_U32 },
129 [NFTA_TUNNEL_KEY_IP_DST] = { .type = NLA_U32 },
130};
131
132static int nft_tunnel_obj_ip_init(const struct nft_ctx *ctx,
133 const struct nlattr *attr,
134 struct ip_tunnel_info *info)
135{
136 struct nlattr *tb[NFTA_TUNNEL_KEY_IP_MAX + 1];
137 int err;
138
139 err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP_MAX, attr,
140 nft_tunnel_ip_policy, NULL);
141 if (err < 0)
142 return err;
143
144 if (!tb[NFTA_TUNNEL_KEY_IP_DST])
145 return -EINVAL;
146
147 if (tb[NFTA_TUNNEL_KEY_IP_SRC])
148 info->key.u.ipv4.src = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_SRC]);
149 if (tb[NFTA_TUNNEL_KEY_IP_DST])
150 info->key.u.ipv4.dst = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_DST]);
151
152 return 0;
153}
154
155static const struct nla_policy nft_tunnel_ip6_policy[NFTA_TUNNEL_KEY_IP6_MAX + 1] = {
156 [NFTA_TUNNEL_KEY_IP6_SRC] = { .len = sizeof(struct in6_addr), },
157 [NFTA_TUNNEL_KEY_IP6_DST] = { .len = sizeof(struct in6_addr), },
158 [NFTA_TUNNEL_KEY_IP6_FLOWLABEL] = { .type = NLA_U32, }
159};
160
161static int nft_tunnel_obj_ip6_init(const struct nft_ctx *ctx,
162 const struct nlattr *attr,
163 struct ip_tunnel_info *info)
164{
165 struct nlattr *tb[NFTA_TUNNEL_KEY_IP6_MAX + 1];
166 int err;
167
168 err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP6_MAX, attr,
169 nft_tunnel_ip6_policy, NULL);
170 if (err < 0)
171 return err;
172
173 if (!tb[NFTA_TUNNEL_KEY_IP6_DST])
174 return -EINVAL;
175
176 if (tb[NFTA_TUNNEL_KEY_IP6_SRC]) {
177 memcpy(&info->key.u.ipv6.src,
178 nla_data(tb[NFTA_TUNNEL_KEY_IP6_SRC]),
179 sizeof(struct in6_addr));
180 }
181 if (tb[NFTA_TUNNEL_KEY_IP6_DST]) {
182 memcpy(&info->key.u.ipv6.dst,
183 nla_data(tb[NFTA_TUNNEL_KEY_IP6_DST]),
184 sizeof(struct in6_addr));
185 }
186 if (tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL])
187 info->key.label = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL]);
188
189 info->mode |= IP_TUNNEL_INFO_IPV6;
190
191 return 0;
192}
193
194static const struct nla_policy nft_tunnel_opts_vxlan_policy[NFTA_TUNNEL_KEY_VXLAN_MAX + 1] = {
195 [NFTA_TUNNEL_KEY_VXLAN_GBP] = { .type = NLA_U32 },
196};
197
198static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr,
199 struct nft_tunnel_opts *opts)
200{
201 struct nlattr *tb[NFTA_TUNNEL_KEY_VXLAN_MAX + 1];
202 int err;
203
204 err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_VXLAN_MAX, attr,
205 nft_tunnel_opts_vxlan_policy, NULL);
206 if (err < 0)
207 return err;
208
209 if (!tb[NFTA_TUNNEL_KEY_VXLAN_GBP])
210 return -EINVAL;
211
212 opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP]));
213
214 opts->len = sizeof(struct vxlan_metadata);
215 opts->flags = TUNNEL_VXLAN_OPT;
216
217 return 0;
218}
219
220static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = {
221 [NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 },
222 [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 },
223 [NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 },
224};
225
226static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
227 struct nft_tunnel_opts *opts)
228{
229 struct nlattr *tb[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1];
230 uint8_t hwid, dir;
231 int err, version;
232
233 err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_ERSPAN_MAX, attr,
234 nft_tunnel_opts_erspan_policy, NULL);
235 if (err < 0)
236 return err;
237
238 version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION]));
239 switch (version) {
240 case ERSPAN_VERSION:
241 if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX])
242 return -EINVAL;
243
244 opts->u.erspan.u.index =
245 nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX]);
246 break;
247 case ERSPAN_VERSION2:
248 if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] ||
249 !tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID])
250 return -EINVAL;
251
252 hwid = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID]);
253 dir = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR]);
254
255 set_hwid(&opts->u.erspan.u.md2, hwid);
256 opts->u.erspan.u.md2.dir = dir;
257 break;
258 default:
259 return -EOPNOTSUPP;
260 }
261 opts->u.erspan.version = version;
262
263 opts->len = sizeof(struct erspan_metadata);
264 opts->flags = TUNNEL_ERSPAN_OPT;
265
266 return 0;
267}
268
269static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = {
270 [NFTA_TUNNEL_KEY_OPTS_VXLAN] = { .type = NLA_NESTED, },
271 [NFTA_TUNNEL_KEY_OPTS_ERSPAN] = { .type = NLA_NESTED, },
272};
273
274static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
275 const struct nlattr *attr,
276 struct ip_tunnel_info *info,
277 struct nft_tunnel_opts *opts)
278{
279 struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1];
280 int err;
281
282 err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr,
283 nft_tunnel_opts_policy, NULL);
284 if (err < 0)
285 return err;
286
287 if (tb[NFTA_TUNNEL_KEY_OPTS_VXLAN]) {
288 err = nft_tunnel_obj_vxlan_init(tb[NFTA_TUNNEL_KEY_OPTS_VXLAN],
289 opts);
290 } else if (tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN]) {
291 err = nft_tunnel_obj_erspan_init(tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN],
292 opts);
293 } else {
294 return -EOPNOTSUPP;
295 }
296
297 return err;
298}
299
300static const struct nla_policy nft_tunnel_key_policy[NFTA_TUNNEL_KEY_MAX + 1] = {
301 [NFTA_TUNNEL_KEY_IP] = { .type = NLA_NESTED, },
302 [NFTA_TUNNEL_KEY_IP6] = { .type = NLA_NESTED, },
303 [NFTA_TUNNEL_KEY_ID] = { .type = NLA_U32, },
304 [NFTA_TUNNEL_KEY_FLAGS] = { .type = NLA_U32, },
305 [NFTA_TUNNEL_KEY_TOS] = { .type = NLA_U8, },
306 [NFTA_TUNNEL_KEY_TTL] = { .type = NLA_U8, },
307 [NFTA_TUNNEL_KEY_OPTS] = { .type = NLA_NESTED, },
308};
309
310static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
311 const struct nlattr * const tb[],
312 struct nft_object *obj)
313{
314 struct nft_tunnel_obj *priv = nft_obj_data(obj);
315 struct ip_tunnel_info info;
316 struct metadata_dst *md;
317 int err;
318
319 if (!tb[NFTA_TUNNEL_KEY_ID])
320 return -EINVAL;
321
322 memset(&info, 0, sizeof(info));
323 info.mode = IP_TUNNEL_INFO_TX;
324 info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID]));
325 info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
326
327 if (tb[NFTA_TUNNEL_KEY_IP]) {
328 err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info);
329 if (err < 0)
330 return err;
331 } else if (tb[NFTA_TUNNEL_KEY_IP6]) {
332 err = nft_tunnel_obj_ip6_init(ctx, tb[NFTA_TUNNEL_KEY_IP6], &info);
333 if (err < 0)
334 return err;
335 } else {
336 return -EINVAL;
337 }
338
339 if (tb[NFTA_TUNNEL_KEY_SPORT]) {
340 info.key.tp_src = nla_get_be16(tb[NFTA_TUNNEL_KEY_SPORT]);
341 }
342 if (tb[NFTA_TUNNEL_KEY_DPORT]) {
343 info.key.tp_dst = nla_get_be16(tb[NFTA_TUNNEL_KEY_DPORT]);
344 }
345
346 if (tb[NFTA_TUNNEL_KEY_FLAGS]) {
347 u32 tun_flags;
348
349 tun_flags = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_FLAGS]));
350 if (tun_flags & ~NFT_TUNNEL_F_MASK)
351 return -EOPNOTSUPP;
352
353 if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX)
354 info.key.tun_flags &= ~TUNNEL_CSUM;
355 if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT)
356 info.key.tun_flags |= TUNNEL_DONT_FRAGMENT;
357 if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER)
358 info.key.tun_flags |= TUNNEL_SEQ;
359 }
360 if (tb[NFTA_TUNNEL_KEY_TOS])
361 info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]);
362 if (tb[NFTA_TUNNEL_KEY_TTL])
363 info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]);
364 else
365 info.key.ttl = U8_MAX;
366
367 if (tb[NFTA_TUNNEL_KEY_OPTS]) {
368 err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS],
369 &info, &priv->opts);
370 if (err < 0)
371 return err;
372 }
373
374 md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL);
375 if (!md)
376 return -ENOMEM;
377
378 memcpy(&md->u.tun_info, &info, sizeof(info));
379 ip_tunnel_info_opts_set(&md->u.tun_info, &priv->opts.u, priv->opts.len,
380 priv->opts.flags);
381 priv->md = md;
382
383 return 0;
384}
385
386static inline void nft_tunnel_obj_eval(struct nft_object *obj,
387 struct nft_regs *regs,
388 const struct nft_pktinfo *pkt)
389{
390 struct nft_tunnel_obj *priv = nft_obj_data(obj);
391 struct sk_buff *skb = pkt->skb;
392
393 skb_dst_drop(skb);
394 dst_hold((struct dst_entry *) priv->md);
395 skb_dst_set(skb, (struct dst_entry *) priv->md);
396}
397
398static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info)
399{
400 struct nlattr *nest;
401
402 if (info->mode & IP_TUNNEL_INFO_IPV6) {
403 nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP6);
404 if (!nest)
405 return -1;
406
407 if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 ||
408 nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 ||
409 nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label))
410 return -1;
411
412 nla_nest_end(skb, nest);
413 } else {
414 nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP);
415 if (!nest)
416 return -1;
417
418 if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 ||
419 nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0)
420 return -1;
421
422 nla_nest_end(skb, nest);
423 }
424
425 return 0;
426}
427
428static int nft_tunnel_opts_dump(struct sk_buff *skb,
429 struct nft_tunnel_obj *priv)
430{
431 struct nft_tunnel_opts *opts = &priv->opts;
432 struct nlattr *nest;
433
434 nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_OPTS);
435 if (!nest)
436 return -1;
437
438 if (opts->flags & TUNNEL_VXLAN_OPT) {
439 if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP,
440 htonl(opts->u.vxlan.gbp)))
441 return -1;
442 } else if (opts->flags & TUNNEL_ERSPAN_OPT) {
443 switch (opts->u.erspan.version) {
444 case ERSPAN_VERSION:
445 if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX,
446 opts->u.erspan.u.index))
447 return -1;
448 break;
449 case ERSPAN_VERSION2:
450 if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID,
451 get_hwid(&opts->u.erspan.u.md2)) ||
452 nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR,
453 opts->u.erspan.u.md2.dir))
454 return -1;
455 break;
456 }
457 }
458 nla_nest_end(skb, nest);
459
460 return 0;
461}
462
463static int nft_tunnel_ports_dump(struct sk_buff *skb,
464 struct ip_tunnel_info *info)
465{
466 if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 ||
467 nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0)
468 return -1;
469
470 return 0;
471}
472
473static int nft_tunnel_flags_dump(struct sk_buff *skb,
474 struct ip_tunnel_info *info)
475{
476 u32 flags = 0;
477
478 if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
479 flags |= NFT_TUNNEL_F_DONT_FRAGMENT;
480 if (!(info->key.tun_flags & TUNNEL_CSUM))
481 flags |= NFT_TUNNEL_F_ZERO_CSUM_TX;
482 if (info->key.tun_flags & TUNNEL_SEQ)
483 flags |= NFT_TUNNEL_F_SEQ_NUMBER;
484
485 if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0)
486 return -1;
487
488 return 0;
489}
490
491static int nft_tunnel_obj_dump(struct sk_buff *skb,
492 struct nft_object *obj, bool reset)
493{
494 struct nft_tunnel_obj *priv = nft_obj_data(obj);
495 struct ip_tunnel_info *info = &priv->md->u.tun_info;
496
497 if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ID,
498 tunnel_id_to_key32(info->key.tun_id)) ||
499 nft_tunnel_ip_dump(skb, info) < 0 ||
500 nft_tunnel_ports_dump(skb, info) < 0 ||
501 nft_tunnel_flags_dump(skb, info) < 0 ||
502 nla_put_u8(skb, NFTA_TUNNEL_KEY_TOS, info->key.tos) ||
503 nla_put_u8(skb, NFTA_TUNNEL_KEY_TTL, info->key.ttl) ||
504 nft_tunnel_opts_dump(skb, priv) < 0)
505 goto nla_put_failure;
506
507 return 0;
508
509nla_put_failure:
510 return -1;
511}
512
513static void nft_tunnel_obj_destroy(const struct nft_ctx *ctx,
514 struct nft_object *obj)
515{
516 struct nft_tunnel_obj *priv = nft_obj_data(obj);
517
518 metadata_dst_free(priv->md);
519}
520
521static struct nft_object_type nft_tunnel_obj_type;
522static const struct nft_object_ops nft_tunnel_obj_ops = {
523 .type = &nft_tunnel_obj_type,
524 .size = sizeof(struct nft_tunnel_obj),
525 .eval = nft_tunnel_obj_eval,
526 .init = nft_tunnel_obj_init,
527 .destroy = nft_tunnel_obj_destroy,
528 .dump = nft_tunnel_obj_dump,
529};
530
531static struct nft_object_type nft_tunnel_obj_type __read_mostly = {
532 .type = NFT_OBJECT_TUNNEL,
533 .ops = &nft_tunnel_obj_ops,
534 .maxattr = NFTA_TUNNEL_KEY_MAX,
535 .policy = nft_tunnel_key_policy,
536 .owner = THIS_MODULE,
537};
538
539static int __init nft_tunnel_module_init(void)
540{
541 int err;
542
543 err = nft_register_expr(&nft_tunnel_type);
544 if (err < 0)
545 return err;
546
547 err = nft_register_obj(&nft_tunnel_obj_type);
548 if (err < 0)
549 nft_unregister_expr(&nft_tunnel_type);
550
551 return err;
552}
553
554static void __exit nft_tunnel_module_exit(void)
555{
556 nft_unregister_obj(&nft_tunnel_obj_type);
557 nft_unregister_expr(&nft_tunnel_type);
558}
559
560module_init(nft_tunnel_module_init);
561module_exit(nft_tunnel_module_exit);
562
563MODULE_LICENSE("GPL");
564MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
565MODULE_ALIAS_NFT_EXPR("tunnel");
566MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL);
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 0b660c568156..e8da9a9bba73 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -1,14 +1,128 @@
1// SPDX-License-Identifier: GPL-2.0
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2#include <linux/netfilter.h> 3#include <linux/netfilter.h>
3#include <linux/netfilter_ipv4.h> 4#include <linux/netfilter_ipv4.h>
4#include <linux/netfilter_ipv6.h> 5#include <linux/netfilter_ipv6.h>
5#include <net/netfilter/nf_queue.h> 6#include <net/netfilter/nf_queue.h>
7#include <net/ip6_checksum.h>
8
9#ifdef CONFIG_INET
10__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
11 unsigned int dataoff, u8 protocol)
12{
13 const struct iphdr *iph = ip_hdr(skb);
14 __sum16 csum = 0;
15
16 switch (skb->ip_summed) {
17 case CHECKSUM_COMPLETE:
18 if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
19 break;
20 if ((protocol == 0 && !csum_fold(skb->csum)) ||
21 !csum_tcpudp_magic(iph->saddr, iph->daddr,
22 skb->len - dataoff, protocol,
23 skb->csum)) {
24 skb->ip_summed = CHECKSUM_UNNECESSARY;
25 break;
26 }
27 /* fall through */
28 case CHECKSUM_NONE:
29 if (protocol == 0)
30 skb->csum = 0;
31 else
32 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
33 skb->len - dataoff,
34 protocol, 0);
35 csum = __skb_checksum_complete(skb);
36 }
37 return csum;
38}
39EXPORT_SYMBOL(nf_ip_checksum);
40#endif
41
42static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
43 unsigned int dataoff, unsigned int len,
44 u8 protocol)
45{
46 const struct iphdr *iph = ip_hdr(skb);
47 __sum16 csum = 0;
48
49 switch (skb->ip_summed) {
50 case CHECKSUM_COMPLETE:
51 if (len == skb->len - dataoff)
52 return nf_ip_checksum(skb, hook, dataoff, protocol);
53 /* fall through */
54 case CHECKSUM_NONE:
55 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
56 skb->len - dataoff, 0);
57 skb->ip_summed = CHECKSUM_NONE;
58 return __skb_checksum_complete_head(skb, dataoff + len);
59 }
60 return csum;
61}
62
63__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
64 unsigned int dataoff, u8 protocol)
65{
66 const struct ipv6hdr *ip6h = ipv6_hdr(skb);
67 __sum16 csum = 0;
68
69 switch (skb->ip_summed) {
70 case CHECKSUM_COMPLETE:
71 if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
72 break;
73 if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
74 skb->len - dataoff, protocol,
75 csum_sub(skb->csum,
76 skb_checksum(skb, 0,
77 dataoff, 0)))) {
78 skb->ip_summed = CHECKSUM_UNNECESSARY;
79 break;
80 }
81 /* fall through */
82 case CHECKSUM_NONE:
83 skb->csum = ~csum_unfold(
84 csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
85 skb->len - dataoff,
86 protocol,
87 csum_sub(0,
88 skb_checksum(skb, 0,
89 dataoff, 0))));
90 csum = __skb_checksum_complete(skb);
91 }
92 return csum;
93}
94EXPORT_SYMBOL(nf_ip6_checksum);
95
96static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
97 unsigned int dataoff, unsigned int len,
98 u8 protocol)
99{
100 const struct ipv6hdr *ip6h = ipv6_hdr(skb);
101 __wsum hsum;
102 __sum16 csum = 0;
103
104 switch (skb->ip_summed) {
105 case CHECKSUM_COMPLETE:
106 if (len == skb->len - dataoff)
107 return nf_ip6_checksum(skb, hook, dataoff, protocol);
108 /* fall through */
109 case CHECKSUM_NONE:
110 hsum = skb_checksum(skb, 0, dataoff, 0);
111 skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
112 &ip6h->daddr,
113 skb->len - dataoff,
114 protocol,
115 csum_sub(0, hsum)));
116 skb->ip_summed = CHECKSUM_NONE;
117 return __skb_checksum_complete_head(skb, dataoff + len);
118 }
119 return csum;
120};
6 121
7__sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, 122__sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
8 unsigned int dataoff, u_int8_t protocol, 123 unsigned int dataoff, u8 protocol,
9 unsigned short family) 124 unsigned short family)
10{ 125{
11 const struct nf_ipv6_ops *v6ops;
12 __sum16 csum = 0; 126 __sum16 csum = 0;
13 127
14 switch (family) { 128 switch (family) {
@@ -16,9 +130,7 @@ __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
16 csum = nf_ip_checksum(skb, hook, dataoff, protocol); 130 csum = nf_ip_checksum(skb, hook, dataoff, protocol);
17 break; 131 break;
18 case AF_INET6: 132 case AF_INET6:
19 v6ops = rcu_dereference(nf_ipv6_ops); 133 csum = nf_ip6_checksum(skb, hook, dataoff, protocol);
20 if (v6ops)
21 csum = v6ops->checksum(skb, hook, dataoff, protocol);
22 break; 134 break;
23 } 135 }
24 136
@@ -28,9 +140,8 @@ EXPORT_SYMBOL_GPL(nf_checksum);
28 140
29__sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, 141__sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
30 unsigned int dataoff, unsigned int len, 142 unsigned int dataoff, unsigned int len,
31 u_int8_t protocol, unsigned short family) 143 u8 protocol, unsigned short family)
32{ 144{
33 const struct nf_ipv6_ops *v6ops;
34 __sum16 csum = 0; 145 __sum16 csum = 0;
35 146
36 switch (family) { 147 switch (family) {
@@ -39,10 +150,8 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
39 protocol); 150 protocol);
40 break; 151 break;
41 case AF_INET6: 152 case AF_INET6:
42 v6ops = rcu_dereference(nf_ipv6_ops); 153 csum = nf_ip6_checksum_partial(skb, hook, dataoff, len,
43 if (v6ops) 154 protocol);
44 csum = v6ops->checksum_partial(skb, hook, dataoff, len,
45 protocol);
46 break; 155 break;
47 } 156 }
48 157
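net/netfilter/utils.c now carries the IPv4 and IPv6 checksum helpers directly, so the AF_INET6 cases of nf_checksum() and nf_checksum_partial() no longer go through the nf_ipv6_ops indirection. All of these helpers boil down to the 16-bit one's-complement Internet checksum over payload plus pseudo-header. A self-contained userspace reference for that sum (RFC 1071), run over a sample IPv4 header whose checksum field is zeroed:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Sum 16-bit big-endian words, fold the carries back in, complement. */
static uint16_t inet_csum(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)data[i] << 8 | data[i + 1];
	if (len & 1)				/* odd trailing byte is zero-padded */
		sum += (uint32_t)data[len - 1] << 8;
	while (sum >> 16)			/* end-around carry */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* 20-byte IPv4 header with the checksum bytes set to zero */
	uint8_t hdr[] = { 0x45, 0x00, 0x00, 0x3c, 0x1c, 0x46, 0x40, 0x00,
			  0x40, 0x06, 0x00, 0x00, 0xac, 0x10, 0x0a, 0x63,
			  0xac, 0x10, 0x0a, 0x0c };

	printf("checksum: 0x%04x\n", inet_csum(hdr, sizeof(hdr)));	/* 0xb1e6 */
	return 0;
}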
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index d0d8397c9588..aecadd471e1d 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1178,12 +1178,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
1178 if (sz < sizeof(*info) || sz >= XT_MAX_TABLE_SIZE) 1178 if (sz < sizeof(*info) || sz >= XT_MAX_TABLE_SIZE)
1179 return NULL; 1179 return NULL;
1180 1180
1181 /* __GFP_NORETRY is not fully supported by kvmalloc but it should 1181 info = kvmalloc(sz, GFP_KERNEL_ACCOUNT);
1182 * work reasonably well if sz is too large and bail out rather
1183 * than shoot all processes down before realizing there is nothing
1184 * more to reclaim.
1185 */
1186 info = kvmalloc(sz, GFP_KERNEL | __GFP_NORETRY);
1187 if (!info) 1182 if (!info)
1188 return NULL; 1183 return NULL;
1189 1184
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index f368ee6741db..af883f1b64f9 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -72,7 +72,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
72 struct audit_buffer *ab; 72 struct audit_buffer *ab;
73 int fam = -1; 73 int fam = -1;
74 74
75 if (audit_enabled == 0) 75 if (audit_enabled == AUDIT_OFF)
76 goto errout; 76 goto errout;
77 ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); 77 ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
78 if (ab == NULL) 78 if (ab == NULL)
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 03b9a50ec93b..89457efd2e00 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -93,7 +93,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
93 return -ENOENT; 93 return -ENOENT;
94 } 94 }
95 95
96 help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL); 96 help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
97 if (help == NULL) { 97 if (help == NULL) {
98 nf_conntrack_helper_put(helper); 98 nf_conntrack_helper_put(helper);
99 return -ENOMEM; 99 return -ENOMEM;
@@ -104,7 +104,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
104} 104}
105 105
106#ifdef CONFIG_NF_CONNTRACK_TIMEOUT 106#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
107static void __xt_ct_tg_timeout_put(struct ctnl_timeout *timeout) 107static void __xt_ct_tg_timeout_put(struct nf_ct_timeout *timeout)
108{ 108{
109 typeof(nf_ct_timeout_put_hook) timeout_put; 109 typeof(nf_ct_timeout_put_hook) timeout_put;
110 110
@@ -121,7 +121,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
121#ifdef CONFIG_NF_CONNTRACK_TIMEOUT 121#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
122 typeof(nf_ct_timeout_find_get_hook) timeout_find_get; 122 typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
123 const struct nf_conntrack_l4proto *l4proto; 123 const struct nf_conntrack_l4proto *l4proto;
124 struct ctnl_timeout *timeout; 124 struct nf_ct_timeout *timeout;
125 struct nf_conn_timeout *timeout_ext; 125 struct nf_conn_timeout *timeout_ext;
126 const char *errmsg = NULL; 126 const char *errmsg = NULL;
127 int ret = 0; 127 int ret = 0;
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 475957cfcf50..0d0d68c989df 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -38,7 +38,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
38 return XT_CONTINUE; 38 return XT_CONTINUE;
39} 39}
40 40
41#if IS_ENABLED(CONFIG_IPV6) 41#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
42static unsigned int 42static unsigned int
43tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) 43tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
44{ 44{
@@ -141,7 +141,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
141 .destroy = tee_tg_destroy, 141 .destroy = tee_tg_destroy,
142 .me = THIS_MODULE, 142 .me = THIS_MODULE,
143 }, 143 },
144#if IS_ENABLED(CONFIG_IPV6) 144#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
145 { 145 {
146 .name = "TEE", 146 .name = "TEE",
147 .revision = 1, 147 .revision = 1,
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 58fce4e749a9..ad7420cdc439 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -36,15 +36,6 @@
36#include <net/netfilter/nf_tproxy.h> 36#include <net/netfilter/nf_tproxy.h>
37#include <linux/netfilter/xt_TPROXY.h> 37#include <linux/netfilter/xt_TPROXY.h>
38 38
39/* assign a socket to the skb -- consumes sk */
40static void
41nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
42{
43 skb_orphan(skb);
44 skb->sk = sk;
45 skb->destructor = sock_edemux;
46}
47
48static unsigned int 39static unsigned int
49tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, 40tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
50 u_int32_t mark_mask, u_int32_t mark_value) 41 u_int32_t mark_mask, u_int32_t mark_value)
@@ -61,7 +52,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
61 * addresses, this happens if the redirect already happened 52 * addresses, this happens if the redirect already happened
62 * and the current packet belongs to an already established 53 * and the current packet belongs to an already established
63 * connection */ 54 * connection */
64 sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, 55 sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
65 iph->saddr, iph->daddr, 56 iph->saddr, iph->daddr,
66 hp->source, hp->dest, 57 hp->source, hp->dest,
67 skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED); 58 skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
@@ -77,7 +68,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
77 else if (!sk) 68 else if (!sk)
78 /* no, there's no established connection, check if 69 /* no, there's no established connection, check if
79 * there's a listener on the redirected addr/port */ 70 * there's a listener on the redirected addr/port */
80 sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, 71 sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
81 iph->saddr, laddr, 72 iph->saddr, laddr,
82 hp->source, lport, 73 hp->source, lport,
83 skb->dev, NF_TPROXY_LOOKUP_LISTENER); 74 skb->dev, NF_TPROXY_LOOKUP_LISTENER);
@@ -150,7 +141,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
150 * addresses, this happens if the redirect already happened 141 * addresses, this happens if the redirect already happened
151 * and the current packet belongs to an already established 142 * and the current packet belongs to an already established
152 * connection */ 143 * connection */
153 sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto, 144 sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto,
154 &iph->saddr, &iph->daddr, 145 &iph->saddr, &iph->daddr,
155 hp->source, hp->dest, 146 hp->source, hp->dest,
156 xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED); 147 xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED);
@@ -171,7 +162,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
171 else if (!sk) 162 else if (!sk)
172 /* no there's no established connection, check if 163 /* no there's no established connection, check if
173 * there's a listener on the redirected addr/port */ 164 * there's a listener on the redirected addr/port */
174 sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, 165 sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff,
175 tproto, &iph->saddr, laddr, 166 tproto, &iph->saddr, laddr,
176 hp->source, lport, 167 hp->source, lport,
177 xt_in(par), NF_TPROXY_LOOKUP_LISTENER); 168 xt_in(par), NF_TPROXY_LOOKUP_LISTENER);
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 7df2dece57d3..5d92e1781980 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -72,8 +72,9 @@ static bool
72cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) 72cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
73{ 73{
74 const struct xt_cgroup_info_v0 *info = par->matchinfo; 74 const struct xt_cgroup_info_v0 *info = par->matchinfo;
75 struct sock *sk = skb->sk;
75 76
76 if (skb->sk == NULL || !sk_fullsock(skb->sk)) 77 if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk)))
77 return false; 78 return false;
78 79
79 return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ 80 return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^
@@ -85,8 +86,9 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
85 const struct xt_cgroup_info_v1 *info = par->matchinfo; 86 const struct xt_cgroup_info_v1 *info = par->matchinfo;
86 struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; 87 struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data;
87 struct cgroup *ancestor = info->priv; 88 struct cgroup *ancestor = info->priv;
89 struct sock *sk = skb->sk;
88 90
89 if (!skb->sk || !sk_fullsock(skb->sk)) 91 if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk)))
90 return false; 92 return false;
91 93
92 if (ancestor) 94 if (ancestor)
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 6275106ccf50..bc6c8ab0fa62 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -93,10 +93,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
93 93
94 /* init private data */ 94 /* init private data */
95 info->data = nf_conncount_init(par->net, par->family, keylen); 95 info->data = nf_conncount_init(par->net, par->family, keylen);
96 if (IS_ERR(info->data))
97 return PTR_ERR(info->data);
98 96
99 return 0; 97 return PTR_ERR_OR_ZERO(info->data);
100} 98}
101 99
102static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) 100static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 9cfef73b4107..bf7bba80e24c 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -37,118 +37,6 @@
37#include <net/netfilter/nf_log.h> 37#include <net/netfilter/nf_log.h>
38#include <linux/netfilter/xt_osf.h> 38#include <linux/netfilter/xt_osf.h>
39 39
40/*
41 * Indexed by dont-fragment bit.
42 * It is the only constant value in the fingerprint.
43 */
44static struct list_head xt_osf_fingers[2];
45
46static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = {
47 [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) },
48};
49
50static int xt_osf_add_callback(struct net *net, struct sock *ctnl,
51 struct sk_buff *skb, const struct nlmsghdr *nlh,
52 const struct nlattr * const osf_attrs[],
53 struct netlink_ext_ack *extack)
54{
55 struct xt_osf_user_finger *f;
56 struct xt_osf_finger *kf = NULL, *sf;
57 int err = 0;
58
59 if (!capable(CAP_NET_ADMIN))
60 return -EPERM;
61
62 if (!osf_attrs[OSF_ATTR_FINGER])
63 return -EINVAL;
64
65 if (!(nlh->nlmsg_flags & NLM_F_CREATE))
66 return -EINVAL;
67
68 f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
69
70 kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL);
71 if (!kf)
72 return -ENOMEM;
73
74 memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger));
75
76 list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
77 if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
78 continue;
79
80 kfree(kf);
81 kf = NULL;
82
83 if (nlh->nlmsg_flags & NLM_F_EXCL)
84 err = -EEXIST;
85 break;
86 }
87
88 /*
89 * We are protected by nfnl mutex.
90 */
91 if (kf)
92 list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]);
93
94 return err;
95}
96
97static int xt_osf_remove_callback(struct net *net, struct sock *ctnl,
98 struct sk_buff *skb,
99 const struct nlmsghdr *nlh,
100 const struct nlattr * const osf_attrs[],
101 struct netlink_ext_ack *extack)
102{
103 struct xt_osf_user_finger *f;
104 struct xt_osf_finger *sf;
105 int err = -ENOENT;
106
107 if (!capable(CAP_NET_ADMIN))
108 return -EPERM;
109
110 if (!osf_attrs[OSF_ATTR_FINGER])
111 return -EINVAL;
112
113 f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
114
115 list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
116 if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
117 continue;
118
119 /*
120 * We are protected by nfnl mutex.
121 */
122 list_del_rcu(&sf->finger_entry);
123 kfree_rcu(sf, rcu_head);
124
125 err = 0;
126 break;
127 }
128
129 return err;
130}
131
132static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = {
133 [OSF_MSG_ADD] = {
134 .call = xt_osf_add_callback,
135 .attr_count = OSF_ATTR_MAX,
136 .policy = xt_osf_policy,
137 },
138 [OSF_MSG_REMOVE] = {
139 .call = xt_osf_remove_callback,
140 .attr_count = OSF_ATTR_MAX,
141 .policy = xt_osf_policy,
142 },
143};
144
145static const struct nfnetlink_subsystem xt_osf_nfnetlink = {
146 .name = "osf",
147 .subsys_id = NFNL_SUBSYS_OSF,
148 .cb_count = OSF_MSG_MAX,
149 .cb = xt_osf_nfnetlink_callbacks,
150};
151
152static bool 40static bool
153xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) 41xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
154{ 42{
@@ -159,7 +47,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
159 return false; 47 return false;
160 48
161 return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p), 49 return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
162 xt_out(p), info, net, xt_osf_fingers); 50 xt_out(p), info, net, nf_osf_fingers);
163} 51}
164 52
165static struct xt_match xt_osf_match = { 53static struct xt_match xt_osf_match = {
@@ -177,52 +65,21 @@ static struct xt_match xt_osf_match = {
177 65
178static int __init xt_osf_init(void) 66static int __init xt_osf_init(void)
179{ 67{
180 int err = -EINVAL; 68 int err;
181 int i;
182
183 for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i)
184 INIT_LIST_HEAD(&xt_osf_fingers[i]);
185
186 err = nfnetlink_subsys_register(&xt_osf_nfnetlink);
187 if (err < 0) {
188 pr_err("Failed to register OSF nfnetlink helper (%d)\n", err);
189 goto err_out_exit;
190 }
191 69
192 err = xt_register_match(&xt_osf_match); 70 err = xt_register_match(&xt_osf_match);
193 if (err) { 71 if (err) {
194 pr_err("Failed to register OS fingerprint " 72 pr_err("Failed to register OS fingerprint "
195 "matching module (%d)\n", err); 73 "matching module (%d)\n", err);
196 goto err_out_remove; 74 return err;
197 } 75 }
198 76
199 return 0; 77 return 0;
200
201err_out_remove:
202 nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
203err_out_exit:
204 return err;
205} 78}
206 79
207static void __exit xt_osf_fini(void) 80static void __exit xt_osf_fini(void)
208{ 81{
209 struct xt_osf_finger *f;
210 int i;
211
212 nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
213 xt_unregister_match(&xt_osf_match); 82 xt_unregister_match(&xt_osf_match);
214
215 rcu_read_lock();
216 for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) {
217
218 list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) {
219 list_del_rcu(&f->finger_entry);
220 kfree_rcu(f, rcu_head);
221 }
222 }
223 rcu_read_unlock();
224
225 rcu_barrier();
226} 83}
227 84
228module_init(xt_osf_init); 85module_init(xt_osf_init);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 3d705c688a27..46686fb73784 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -67,7 +67,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
67 struct sock *sk = skb_to_full_sk(skb); 67 struct sock *sk = skb_to_full_sk(skb);
68 struct net *net = xt_net(par); 68 struct net *net = xt_net(par);
69 69
70 if (sk == NULL || sk->sk_socket == NULL) 70 if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk)))
71 return (info->match ^ info->invert) == 0; 71 return (info->match ^ info->invert) == 0;
72 else if (info->match & info->invert & XT_OWNER_SOCKET) 72 else if (info->match & info->invert & XT_OWNER_SOCKET)
73 /* 73 /*
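The xt_owner change above, together with the xt_recent and xt_socket hunks that follow, adds the same guard in each match: a socket attached to the skb is only meaningful if it lives in the network namespace the hook runs in; otherwise the match falls back to its "no socket" behaviour. A minimal sketch of that idiom, assuming kernel context (net_eq() and sock_net() as used in the hunks; the helper name is illustrative, not from the patch):

    #include <net/sock.h>

    static bool skb_sk_usable(const struct net *net, const struct sock *sk)
    {
    	/* A socket from a foreign netns is treated as if none existed. */
    	return sk && sk->sk_socket && net_eq(net, sock_net(sk));
    }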
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 07085c22b19c..f44de4bc2100 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -265,7 +265,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
265 } 265 }
266 266
267 /* use TTL as seen before forwarding */ 267 /* use TTL as seen before forwarding */
268 if (xt_out(par) != NULL && skb->sk == NULL) 268 if (xt_out(par) != NULL &&
269 (!skb->sk || !net_eq(net, sock_net(skb->sk))))
269 ttl++; 270 ttl++;
270 271
271 spin_lock_bh(&recent_lock); 272 spin_lock_bh(&recent_lock);
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 5c0779c4fa3c..0472f3472842 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
56 struct sk_buff *pskb = (struct sk_buff *)skb; 56 struct sk_buff *pskb = (struct sk_buff *)skb;
57 struct sock *sk = skb->sk; 57 struct sock *sk = skb->sk;
58 58
59 if (!net_eq(xt_net(par), sock_net(sk)))
60 sk = NULL;
61
59 if (!sk) 62 if (!sk)
60 sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par)); 63 sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par));
64
61 if (sk) { 65 if (sk) {
62 bool wildcard; 66 bool wildcard;
63 bool transparent = true; 67 bool transparent = true;
@@ -113,8 +117,12 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
113 struct sk_buff *pskb = (struct sk_buff *)skb; 117 struct sk_buff *pskb = (struct sk_buff *)skb;
114 struct sock *sk = skb->sk; 118 struct sock *sk = skb->sk;
115 119
120 if (!net_eq(xt_net(par), sock_net(sk)))
121 sk = NULL;
122
116 if (!sk) 123 if (!sk)
117 sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par)); 124 sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par));
125
118 if (sk) { 126 if (sk) {
119 bool wildcard; 127 bool wildcard;
120 bool transparent = true; 128 bool transparent = true;
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index 2f328af91a52..4676f5bb16ae 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -101,7 +101,7 @@ struct audit_buffer *netlbl_audit_start_common(int type,
101 char *secctx; 101 char *secctx;
102 u32 secctx_len; 102 u32 secctx_len;
103 103
104 if (audit_enabled == 0) 104 if (audit_enabled == AUDIT_OFF)
105 return NULL; 105 return NULL;
106 106
107 audit_buf = audit_log_start(audit_context(), GFP_ATOMIC, type); 107 audit_buf = audit_log_start(audit_context(), GFP_ATOMIC, type);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 1189b84413d5..930d17fa906c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -63,6 +63,7 @@
63#include <linux/hash.h> 63#include <linux/hash.h>
64#include <linux/genetlink.h> 64#include <linux/genetlink.h>
65#include <linux/net_namespace.h> 65#include <linux/net_namespace.h>
66#include <linux/nospec.h>
66 67
67#include <net/net_namespace.h> 68#include <net/net_namespace.h>
68#include <net/netns/generic.h> 69#include <net/netns/generic.h>
@@ -679,6 +680,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
679 680
680 if (protocol < 0 || protocol >= MAX_LINKS) 681 if (protocol < 0 || protocol >= MAX_LINKS)
681 return -EPROTONOSUPPORT; 682 return -EPROTONOSUPPORT;
683 protocol = array_index_nospec(protocol, MAX_LINKS);
682 684
683 netlink_lock_table(); 685 netlink_lock_table();
684#ifdef CONFIG_MODULES 686#ifdef CONFIG_MODULES
@@ -1009,6 +1011,11 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
1009 return err; 1011 return err;
1010 } 1012 }
1011 1013
1014 if (nlk->ngroups == 0)
1015 groups = 0;
1016 else if (nlk->ngroups < 8*sizeof(groups))
1017 groups &= (1UL << nlk->ngroups) - 1;
1018
1012 bound = nlk->bound; 1019 bound = nlk->bound;
1013 if (bound) { 1020 if (bound) {
1014 /* Ensure nlk->portid is up-to-date. */ 1021 /* Ensure nlk->portid is up-to-date. */
@@ -2300,7 +2307,6 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
2300 2307
2301 cb = &nlk->cb; 2308 cb = &nlk->cb;
2302 memset(cb, 0, sizeof(*cb)); 2309 memset(cb, 0, sizeof(*cb));
2303 cb->start = control->start;
2304 cb->dump = control->dump; 2310 cb->dump = control->dump;
2305 cb->done = control->done; 2311 cb->done = control->done;
2306 cb->nlh = nlh; 2312 cb->nlh = nlh;
@@ -2309,8 +2315,8 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
2309 cb->min_dump_alloc = control->min_dump_alloc; 2315 cb->min_dump_alloc = control->min_dump_alloc;
2310 cb->skb = skb; 2316 cb->skb = skb;
2311 2317
2312 if (cb->start) { 2318 if (control->start) {
2313 ret = cb->start(cb); 2319 ret = control->start(cb);
2314 if (ret) 2320 if (ret)
2315 goto error_put; 2321 goto error_put;
2316 } 2322 }
@@ -2658,7 +2664,7 @@ static const struct proto_ops netlink_ops = {
2658 .socketpair = sock_no_socketpair, 2664 .socketpair = sock_no_socketpair,
2659 .accept = sock_no_accept, 2665 .accept = sock_no_accept,
2660 .getname = netlink_getname, 2666 .getname = netlink_getname,
2661 .poll_mask = datagram_poll_mask, 2667 .poll = datagram_poll,
2662 .ioctl = netlink_ioctl, 2668 .ioctl = netlink_ioctl,
2663 .listen = sock_no_listen, 2669 .listen = sock_no_listen,
2664 .shutdown = sock_no_shutdown, 2670 .shutdown = sock_no_shutdown,
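Two hardening changes sit in the netlink_create()/netlink_bind() hunks above: the protocol index is clamped with array_index_nospec() after the bounds check, so a mispredicted branch cannot index the protocol table out of range, and the requested multicast group bitmap is masked down to the number of groups the protocol actually registered. The masking arithmetic in a standalone form (plain C for illustration, not the kernel function):

    #include <stdio.h>

    static unsigned long clamp_groups(unsigned long groups, unsigned int ngroups)
    {
    	if (ngroups == 0)
    		return 0;
    	if (ngroups < 8 * sizeof(groups))
    		groups &= (1UL << ngroups) - 1;
    	return groups;
    }

    int main(void)
    {
    	/* With only 3 registered groups, bits above bit 2 are discarded. */
    	printf("%#lx\n", clamp_groups(0xffUL, 3));	/* prints 0x7 */
    	return 0;
    }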
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 93fbcafbf388..03f37c4e64fe 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1355,7 +1355,7 @@ static const struct proto_ops nr_proto_ops = {
1355 .socketpair = sock_no_socketpair, 1355 .socketpair = sock_no_socketpair,
1356 .accept = nr_accept, 1356 .accept = nr_accept,
1357 .getname = nr_getname, 1357 .getname = nr_getname,
1358 .poll_mask = datagram_poll_mask, 1358 .poll = datagram_poll,
1359 .ioctl = nr_ioctl, 1359 .ioctl = nr_ioctl,
1360 .listen = nr_listen, 1360 .listen = nr_listen,
1361 .shutdown = sock_no_shutdown, 1361 .shutdown = sock_no_shutdown,
diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c
index 2ceefa183cee..6a196e438b6c 100644
--- a/net/nfc/llcp_commands.c
+++ b/net/nfc/llcp_commands.c
@@ -752,11 +752,14 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap,
752 pr_debug("Fragment %zd bytes remaining %zd", 752 pr_debug("Fragment %zd bytes remaining %zd",
753 frag_len, remaining_len); 753 frag_len, remaining_len);
754 754
755 pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, MSG_DONTWAIT, 755 pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, 0,
756 frag_len + LLCP_HEADER_SIZE, &err); 756 frag_len + LLCP_HEADER_SIZE, &err);
757 if (pdu == NULL) { 757 if (pdu == NULL) {
758 pr_err("Could not allocate PDU\n"); 758 pr_err("Could not allocate PDU (error=%d)\n", err);
759 continue; 759 len -= remaining_len;
760 if (len == 0)
761 len = err;
762 break;
760 } 763 }
761 764
762 pdu = llcp_add_header(pdu, dsap, ssap, LLCP_PDU_UI); 765 pdu = llcp_add_header(pdu, dsap, ssap, LLCP_PDU_UI);
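The nfc_llcp_send_ui_frame() hunk changes the fragmentation loop from "skip the failed fragment and keep going" to "stop and report what was actually queued": on an allocation failure the function now returns the number of bytes already sent, or the error code if nothing went out at all. A small standalone sketch of that return convention (send_one_fragment() is a stand-in, not an NFC function):

    #include <stddef.h>
    #include <sys/types.h>

    int send_one_fragment(const char *frag, size_t len);	/* stand-in */

    /* Returns bytes queued, or a negative errno if nothing could be queued. */
    ssize_t send_fragmented(const char *buf, size_t len, size_t mtu)
    {
    	size_t remaining = len;

    	while (remaining > 0) {
    		size_t frag = remaining < mtu ? remaining : mtu;
    		int ret = send_one_fragment(buf + (len - remaining), frag);

    		if (ret < 0) {
    			len -= remaining;	/* bytes already queued */
    			return len ? (ssize_t)len : ret;
    		}
    		remaining -= frag;
    	}
    	return len;
    }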
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index ab5bb14b49af..dd4adf8b1167 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -548,13 +548,16 @@ static inline __poll_t llcp_accept_poll(struct sock *parent)
548 return 0; 548 return 0;
549} 549}
550 550
551static __poll_t llcp_sock_poll_mask(struct socket *sock, __poll_t events) 551static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
552 poll_table *wait)
552{ 553{
553 struct sock *sk = sock->sk; 554 struct sock *sk = sock->sk;
554 __poll_t mask = 0; 555 __poll_t mask = 0;
555 556
556 pr_debug("%p\n", sk); 557 pr_debug("%p\n", sk);
557 558
559 sock_poll_wait(file, wait);
560
558 if (sk->sk_state == LLCP_LISTEN) 561 if (sk->sk_state == LLCP_LISTEN)
559 return llcp_accept_poll(sk); 562 return llcp_accept_poll(sk);
560 563
@@ -896,7 +899,7 @@ static const struct proto_ops llcp_sock_ops = {
896 .socketpair = sock_no_socketpair, 899 .socketpair = sock_no_socketpair,
897 .accept = llcp_sock_accept, 900 .accept = llcp_sock_accept,
898 .getname = llcp_sock_getname, 901 .getname = llcp_sock_getname,
899 .poll_mask = llcp_sock_poll_mask, 902 .poll = llcp_sock_poll,
900 .ioctl = sock_no_ioctl, 903 .ioctl = sock_no_ioctl,
901 .listen = llcp_sock_listen, 904 .listen = llcp_sock_listen,
902 .shutdown = sock_no_shutdown, 905 .shutdown = sock_no_shutdown,
@@ -916,7 +919,7 @@ static const struct proto_ops llcp_rawsock_ops = {
916 .socketpair = sock_no_socketpair, 919 .socketpair = sock_no_socketpair,
917 .accept = sock_no_accept, 920 .accept = sock_no_accept,
918 .getname = llcp_sock_getname, 921 .getname = llcp_sock_getname,
919 .poll_mask = llcp_sock_poll_mask, 922 .poll = llcp_sock_poll,
920 .ioctl = sock_no_ioctl, 923 .ioctl = sock_no_ioctl,
921 .listen = sock_no_listen, 924 .listen = sock_no_listen,
922 .shutdown = sock_no_shutdown, 925 .shutdown = sock_no_shutdown,
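The llcp_sock_poll() conversion above shows the full shape of a change repeated throughout this merge (netlink, netrom, nfc, packet, phonet, qrtr): the short-lived ->poll_mask socket operation is gone and protocols return to a classic ->poll handler, which must register the caller on the socket's wait queue, via sock_poll_wait() or poll_wait(), before computing the event mask. A skeletal handler of that shape, assuming kernel context (the state checks are illustrative, not the LLCP logic; sock_poll_wait() takes the same two arguments as in the hunk above):

    static __poll_t example_poll(struct file *file, struct socket *sock,
    			     poll_table *wait)
    {
    	struct sock *sk = sock->sk;
    	__poll_t mask = 0;

    	sock_poll_wait(file, wait);	/* register before testing state */

    	if (!skb_queue_empty(&sk->sk_receive_queue))
    		mask |= EPOLLIN | EPOLLRDNORM;
    	if (sk->sk_err)
    		mask |= EPOLLERR;
    	return mask;
    }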
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 60c322531c49..e2188deb08dc 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -284,7 +284,7 @@ static const struct proto_ops rawsock_ops = {
284 .socketpair = sock_no_socketpair, 284 .socketpair = sock_no_socketpair,
285 .accept = sock_no_accept, 285 .accept = sock_no_accept,
286 .getname = sock_no_getname, 286 .getname = sock_no_getname,
287 .poll_mask = datagram_poll_mask, 287 .poll = datagram_poll,
288 .ioctl = sock_no_ioctl, 288 .ioctl = sock_no_ioctl,
289 .listen = sock_no_listen, 289 .listen = sock_no_listen,
290 .shutdown = sock_no_shutdown, 290 .shutdown = sock_no_shutdown,
@@ -304,7 +304,7 @@ static const struct proto_ops rawsock_raw_ops = {
304 .socketpair = sock_no_socketpair, 304 .socketpair = sock_no_socketpair,
305 .accept = sock_no_accept, 305 .accept = sock_no_accept,
306 .getname = sock_no_getname, 306 .getname = sock_no_getname,
307 .poll_mask = datagram_poll_mask, 307 .poll = datagram_poll,
308 .ioctl = sock_no_ioctl, 308 .ioctl = sock_no_ioctl,
309 .listen = sock_no_listen, 309 .listen = sock_no_listen,
310 .shutdown = sock_no_shutdown, 310 .shutdown = sock_no_shutdown,
diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c
index 9696ef96b719..1a30e165eeb4 100644
--- a/net/nsh/nsh.c
+++ b/net/nsh/nsh.c
@@ -104,7 +104,7 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb,
104 __skb_pull(skb, nsh_len); 104 __skb_pull(skb, nsh_len);
105 105
106 skb_reset_mac_header(skb); 106 skb_reset_mac_header(skb);
107 skb_reset_mac_len(skb); 107 skb->mac_len = proto == htons(ETH_P_TEB) ? ETH_HLEN : 0;
108 skb->protocol = proto; 108 skb->protocol = proto;
109 109
110 features &= NETIF_F_SG; 110 features &= NETIF_F_SG;
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 30a5df27116e..85ae53d8fd09 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1057,6 +1057,28 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
1057 clone_flow_key); 1057 clone_flow_key);
1058} 1058}
1059 1059
1060/* When 'last' is true, clone() should always consume the 'skb'.
1061 * Otherwise, clone() should keep 'skb' intact regardless what
1062 * actions are executed within clone().
1063 */
1064static int clone(struct datapath *dp, struct sk_buff *skb,
1065 struct sw_flow_key *key, const struct nlattr *attr,
1066 bool last)
1067{
1068 struct nlattr *actions;
1069 struct nlattr *clone_arg;
1070 int rem = nla_len(attr);
1071 bool dont_clone_flow_key;
1072
1073 /* The first action is always 'OVS_CLONE_ATTR_ARG'. */
1074 clone_arg = nla_data(attr);
1075 dont_clone_flow_key = nla_get_u32(clone_arg);
1076 actions = nla_next(clone_arg, &rem);
1077
1078 return clone_execute(dp, skb, key, 0, actions, rem, last,
1079 !dont_clone_flow_key);
1080}
1081
1060static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, 1082static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
1061 const struct nlattr *attr) 1083 const struct nlattr *attr)
1062{ 1084{
@@ -1336,6 +1358,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
1336 consume_skb(skb); 1358 consume_skb(skb);
1337 return 0; 1359 return 0;
1338 } 1360 }
1361 break;
1362
1363 case OVS_ACTION_ATTR_CLONE: {
1364 bool last = nla_is_last(a, rem);
1365
1366 err = clone(dp, skb, key, a, last);
1367 if (last)
1368 return err;
1369
1370 break;
1371 }
1339 } 1372 }
1340 1373
1341 if (unlikely(err)) { 1374 if (unlikely(err)) {
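The new clone() helper relies on a fixed nested-attribute layout that validate_and_copy_clone() (in the flow_netlink.c hunk below) produces: the first nested attribute carries the "don't clone the flow key" flag (OVS_CLONE_ATTR_EXEC on the kernel side), and everything after it is the sub-action list. A compact sketch of that parse step, assuming the kernel netlink helpers nla_data()/nla_len()/nla_get_u32()/nla_next() (the function name is illustrative):

    #include <net/netlink.h>

    /* Returns the leading flag and leaves *actions/*rem describing the
     * remaining sub-action list.
     */
    static u32 split_arg_and_actions(const struct nlattr *attr,
    				 const struct nlattr **actions, int *rem)
    {
    	const struct nlattr *arg = nla_data(attr);	/* first nested attr */

    	*rem = nla_len(attr);
    	*actions = nla_next(arg, rem);			/* rest of the payload */
    	return nla_get_u32(arg);
    }

When the clone action is the last one in the list, do_execute_actions() lets clone_execute() consume the skb directly instead of copying it, which is what the 'last' flag threaded through the hunks above tracks.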
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 284aca2a252d..86a75105af1a 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -26,6 +26,7 @@
26#include <net/netfilter/nf_conntrack_seqadj.h> 26#include <net/netfilter/nf_conntrack_seqadj.h>
27#include <net/netfilter/nf_conntrack_zones.h> 27#include <net/netfilter/nf_conntrack_zones.h>
28#include <net/netfilter/ipv6/nf_defrag_ipv6.h> 28#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
29#include <net/ipv6_frag.h>
29 30
30#ifdef CONFIG_NF_NAT_NEEDED 31#ifdef CONFIG_NF_NAT_NEEDED
31#include <linux/netfilter/nf_nat.h> 32#include <linux/netfilter/nf_nat.h>
@@ -607,23 +608,12 @@ static struct nf_conn *
607ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, 608ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
608 u8 l3num, struct sk_buff *skb, bool natted) 609 u8 l3num, struct sk_buff *skb, bool natted)
609{ 610{
610 const struct nf_conntrack_l3proto *l3proto;
611 const struct nf_conntrack_l4proto *l4proto;
612 struct nf_conntrack_tuple tuple; 611 struct nf_conntrack_tuple tuple;
613 struct nf_conntrack_tuple_hash *h; 612 struct nf_conntrack_tuple_hash *h;
614 struct nf_conn *ct; 613 struct nf_conn *ct;
615 unsigned int dataoff;
616 u8 protonum;
617 614
618 l3proto = __nf_ct_l3proto_find(l3num); 615 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num,
619 if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, 616 net, &tuple)) {
620 &protonum) <= 0) {
621 pr_debug("ovs_ct_find_existing: Can't get protonum\n");
622 return NULL;
623 }
624 l4proto = __nf_ct_l4proto_find(l3num, protonum);
625 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
626 protonum, net, &tuple, l3proto, l4proto)) {
627 pr_debug("ovs_ct_find_existing: Can't get tuple\n"); 617 pr_debug("ovs_ct_find_existing: Can't get tuple\n");
628 return NULL; 618 return NULL;
629 } 619 }
@@ -632,7 +622,7 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
632 if (natted) { 622 if (natted) {
633 struct nf_conntrack_tuple inverse; 623 struct nf_conntrack_tuple inverse;
634 624
635 if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) { 625 if (!nf_ct_invert_tuplepr(&inverse, &tuple)) {
636 pr_debug("ovs_ct_find_existing: Inversion failed!\n"); 626 pr_debug("ovs_ct_find_existing: Inversion failed!\n");
637 return NULL; 627 return NULL;
638 } 628 }
@@ -1314,7 +1304,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
1314 return -EINVAL; 1304 return -EINVAL;
1315 } 1305 }
1316 1306
1317 help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); 1307 help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL);
1318 if (!help) { 1308 if (!help) {
1319 nf_conntrack_helper_put(helper); 1309 nf_conntrack_helper_put(helper);
1320 return -ENOMEM; 1310 return -ENOMEM;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 492ab0c36f7c..a70097ecf33c 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2460,6 +2460,40 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
2460 return 0; 2460 return 0;
2461} 2461}
2462 2462
2463static int validate_and_copy_clone(struct net *net,
2464 const struct nlattr *attr,
2465 const struct sw_flow_key *key,
2466 struct sw_flow_actions **sfa,
2467 __be16 eth_type, __be16 vlan_tci,
2468 bool log, bool last)
2469{
2470 int start, err;
2471 u32 exec;
2472
2473 if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN)
2474 return -EINVAL;
2475
2476 start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log);
2477 if (start < 0)
2478 return start;
2479
2480 exec = last || !actions_may_change_flow(attr);
2481
2482 err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec,
2483 sizeof(exec), log);
2484 if (err)
2485 return err;
2486
2487 err = __ovs_nla_copy_actions(net, attr, key, sfa,
2488 eth_type, vlan_tci, log);
2489 if (err)
2490 return err;
2491
2492 add_nested_action_end(*sfa, start);
2493
2494 return 0;
2495}
2496
2463void ovs_match_init(struct sw_flow_match *match, 2497void ovs_match_init(struct sw_flow_match *match,
2464 struct sw_flow_key *key, 2498 struct sw_flow_key *key,
2465 bool reset_key, 2499 bool reset_key,
@@ -2516,7 +2550,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
2516 struct ovs_tunnel_info *ovs_tun; 2550 struct ovs_tunnel_info *ovs_tun;
2517 struct nlattr *a; 2551 struct nlattr *a;
2518 int err = 0, start, opts_type; 2552 int err = 0, start, opts_type;
2553 __be16 dst_opt_type;
2519 2554
2555 dst_opt_type = 0;
2520 ovs_match_init(&match, &key, true, NULL); 2556 ovs_match_init(&match, &key, true, NULL);
2521 opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); 2557 opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log);
2522 if (opts_type < 0) 2558 if (opts_type < 0)
@@ -2528,10 +2564,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
2528 err = validate_geneve_opts(&key); 2564 err = validate_geneve_opts(&key);
2529 if (err < 0) 2565 if (err < 0)
2530 return err; 2566 return err;
2567 dst_opt_type = TUNNEL_GENEVE_OPT;
2531 break; 2568 break;
2532 case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: 2569 case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
2570 dst_opt_type = TUNNEL_VXLAN_OPT;
2533 break; 2571 break;
2534 case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: 2572 case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
2573 dst_opt_type = TUNNEL_ERSPAN_OPT;
2535 break; 2574 break;
2536 } 2575 }
2537 } 2576 }
@@ -2574,7 +2613,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
2574 */ 2613 */
2575 ip_tunnel_info_opts_set(tun_info, 2614 ip_tunnel_info_opts_set(tun_info,
2576 TUN_METADATA_OPTS(&key, key.tun_opts_len), 2615 TUN_METADATA_OPTS(&key, key.tun_opts_len),
2577 key.tun_opts_len); 2616 key.tun_opts_len, dst_opt_type);
2578 add_nested_action_end(*sfa, start); 2617 add_nested_action_end(*sfa, start);
2579 2618
2580 return err; 2619 return err;
@@ -2844,6 +2883,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
2844 [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, 2883 [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
2845 [OVS_ACTION_ATTR_POP_NSH] = 0, 2884 [OVS_ACTION_ATTR_POP_NSH] = 0,
2846 [OVS_ACTION_ATTR_METER] = sizeof(u32), 2885 [OVS_ACTION_ATTR_METER] = sizeof(u32),
2886 [OVS_ACTION_ATTR_CLONE] = (u32)-1,
2847 }; 2887 };
2848 const struct ovs_action_push_vlan *vlan; 2888 const struct ovs_action_push_vlan *vlan;
2849 int type = nla_type(a); 2889 int type = nla_type(a);
@@ -3033,6 +3073,18 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
3033 /* Non-existent meters are simply ignored. */ 3073 /* Non-existent meters are simply ignored. */
3034 break; 3074 break;
3035 3075
3076 case OVS_ACTION_ATTR_CLONE: {
3077 bool last = nla_is_last(a, rem);
3078
3079 err = validate_and_copy_clone(net, a, key, sfa,
3080 eth_type, vlan_tci,
3081 log, last);
3082 if (err)
3083 return err;
3084 skip_copy = true;
3085 break;
3086 }
3087
3036 default: 3088 default:
3037 OVS_NLERR(log, "Unknown Action type %d", type); 3089 OVS_NLERR(log, "Unknown Action type %d", type);
3038 return -EINVAL; 3090 return -EINVAL;
@@ -3111,6 +3163,26 @@ out:
3111 return err; 3163 return err;
3112} 3164}
3113 3165
3166static int clone_action_to_attr(const struct nlattr *attr,
3167 struct sk_buff *skb)
3168{
3169 struct nlattr *start;
3170 int err = 0, rem = nla_len(attr);
3171
3172 start = nla_nest_start(skb, OVS_ACTION_ATTR_CLONE);
3173 if (!start)
3174 return -EMSGSIZE;
3175
3176 err = ovs_nla_put_actions(nla_data(attr), rem, skb);
3177
3178 if (err)
3179 nla_nest_cancel(skb, start);
3180 else
3181 nla_nest_end(skb, start);
3182
3183 return err;
3184}
3185
3114static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) 3186static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
3115{ 3187{
3116 const struct nlattr *ovs_key = nla_data(a); 3188 const struct nlattr *ovs_key = nla_data(a);
@@ -3199,6 +3271,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
3199 return err; 3271 return err;
3200 break; 3272 break;
3201 3273
3274 case OVS_ACTION_ATTR_CLONE:
3275 err = clone_action_to_attr(a, skb);
3276 if (err)
3277 return err;
3278 break;
3279
3202 default: 3280 default:
3203 if (nla_put(skb, type, nla_len(a), nla_data(a))) 3281 if (nla_put(skb, type, nla_len(a), nla_data(a)))
3204 return -EMSGSIZE; 3282 return -EMSGSIZE;
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
index b891a91577f8..c038e021a591 100644
--- a/net/openvswitch/meter.c
+++ b/net/openvswitch/meter.c
@@ -211,6 +211,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a)
211 if (!meter) 211 if (!meter)
212 return ERR_PTR(-ENOMEM); 212 return ERR_PTR(-ENOMEM);
213 213
214 meter->id = nla_get_u32(a[OVS_METER_ATTR_ID]);
214 meter->used = div_u64(ktime_get_ns(), 1000 * 1000); 215 meter->used = div_u64(ktime_get_ns(), 1000 * 1000);
215 meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0; 216 meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0;
216 meter->keep_stats = !a[OVS_METER_ATTR_CLEAR]; 217 meter->keep_stats = !a[OVS_METER_ATTR_CLEAR];
@@ -280,6 +281,10 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info)
280 u32 meter_id; 281 u32 meter_id;
281 bool failed; 282 bool failed;
282 283
284 if (!a[OVS_METER_ATTR_ID]) {
285 return -ENODEV;
286 }
287
283 meter = dp_meter_create(a); 288 meter = dp_meter_create(a);
284 if (IS_ERR_OR_NULL(meter)) 289 if (IS_ERR_OR_NULL(meter))
285 return PTR_ERR(meter); 290 return PTR_ERR(meter);
@@ -298,11 +303,6 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info)
298 goto exit_unlock; 303 goto exit_unlock;
299 } 304 }
300 305
301 if (!a[OVS_METER_ATTR_ID]) {
302 err = -ENODEV;
303 goto exit_unlock;
304 }
305
306 meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); 306 meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]);
307 307
308 /* Cannot fail after this. */ 308 /* Cannot fail after this. */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 50809748c127..5610061e7f2e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -275,9 +275,10 @@ static bool packet_use_direct_xmit(const struct packet_sock *po)
275 return po->xmit == packet_direct_xmit; 275 return po->xmit == packet_direct_xmit;
276} 276}
277 277
278static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) 278static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
279 struct net_device *sb_dev)
279{ 280{
280 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; 281 return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
281} 282}
282 283
283static u16 packet_pick_tx_queue(struct sk_buff *skb) 284static u16 packet_pick_tx_queue(struct sk_buff *skb)
@@ -291,7 +292,7 @@ static u16 packet_pick_tx_queue(struct sk_buff *skb)
291 __packet_pick_tx_queue); 292 __packet_pick_tx_queue);
292 queue_index = netdev_cap_txqueue(dev, queue_index); 293 queue_index = netdev_cap_txqueue(dev, queue_index);
293 } else { 294 } else {
294 queue_index = __packet_pick_tx_queue(dev, skb); 295 queue_index = __packet_pick_tx_queue(dev, skb, NULL);
295 } 296 }
296 297
297 return queue_index; 298 return queue_index;
@@ -1581,7 +1582,7 @@ static int fanout_set_data(struct packet_sock *po, char __user *data,
1581 return fanout_set_data_ebpf(po, data, len); 1582 return fanout_set_data_ebpf(po, data, len);
1582 default: 1583 default:
1583 return -EINVAL; 1584 return -EINVAL;
1584 }; 1585 }
1585} 1586}
1586 1587
1587static void fanout_release_data(struct packet_fanout *f) 1588static void fanout_release_data(struct packet_fanout *f)
@@ -1590,7 +1591,7 @@ static void fanout_release_data(struct packet_fanout *f)
1590 case PACKET_FANOUT_CBPF: 1591 case PACKET_FANOUT_CBPF:
1591 case PACKET_FANOUT_EBPF: 1592 case PACKET_FANOUT_EBPF:
1592 __fanout_set_data_bpf(f, NULL); 1593 __fanout_set_data_bpf(f, NULL);
1593 }; 1594 }
1594} 1595}
1595 1596
1596static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id) 1597static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
@@ -1951,7 +1952,7 @@ retry:
1951 goto out_unlock; 1952 goto out_unlock;
1952 } 1953 }
1953 1954
1954 sockc.tsflags = sk->sk_tsflags; 1955 sockcm_init(&sockc, sk);
1955 if (msg->msg_controllen) { 1956 if (msg->msg_controllen) {
1956 err = sock_cmsg_send(sk, msg, &sockc); 1957 err = sock_cmsg_send(sk, msg, &sockc);
1957 if (unlikely(err)) 1958 if (unlikely(err))
@@ -1962,6 +1963,7 @@ retry:
1962 skb->dev = dev; 1963 skb->dev = dev;
1963 skb->priority = sk->sk_priority; 1964 skb->priority = sk->sk_priority;
1964 skb->mark = sk->sk_mark; 1965 skb->mark = sk->sk_mark;
1966 skb->tstamp = sockc.transmit_time;
1965 1967
1966 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); 1968 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1967 1969
@@ -2262,6 +2264,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2262 if (po->stats.stats1.tp_drops) 2264 if (po->stats.stats1.tp_drops)
2263 status |= TP_STATUS_LOSING; 2265 status |= TP_STATUS_LOSING;
2264 } 2266 }
2267
2268 if (do_vnet &&
2269 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2270 sizeof(struct virtio_net_hdr),
2271 vio_le(), true, 0))
2272 goto drop_n_account;
2273
2265 po->stats.stats1.tp_packets++; 2274 po->stats.stats1.tp_packets++;
2266 if (copy_skb) { 2275 if (copy_skb) {
2267 status |= TP_STATUS_COPY; 2276 status |= TP_STATUS_COPY;
@@ -2269,15 +2278,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2269 } 2278 }
2270 spin_unlock(&sk->sk_receive_queue.lock); 2279 spin_unlock(&sk->sk_receive_queue.lock);
2271 2280
2272 if (do_vnet) {
2273 if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
2274 sizeof(struct virtio_net_hdr),
2275 vio_le(), true, 0)) {
2276 spin_lock(&sk->sk_receive_queue.lock);
2277 goto drop_n_account;
2278 }
2279 }
2280
2281 skb_copy_bits(skb, 0, h.raw + macoff, snaplen); 2281 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2282 2282
2283 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) 2283 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
@@ -2459,6 +2459,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2459 skb->dev = dev; 2459 skb->dev = dev;
2460 skb->priority = po->sk.sk_priority; 2460 skb->priority = po->sk.sk_priority;
2461 skb->mark = po->sk.sk_mark; 2461 skb->mark = po->sk.sk_mark;
2462 skb->tstamp = sockc->transmit_time;
2462 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); 2463 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
2463 skb_shinfo(skb)->destructor_arg = ph.raw; 2464 skb_shinfo(skb)->destructor_arg = ph.raw;
2464 2465
@@ -2635,7 +2636,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2635 if (unlikely(!(dev->flags & IFF_UP))) 2636 if (unlikely(!(dev->flags & IFF_UP)))
2636 goto out_put; 2637 goto out_put;
2637 2638
2638 sockc.tsflags = po->sk.sk_tsflags; 2639 sockcm_init(&sockc, &po->sk);
2639 if (msg->msg_controllen) { 2640 if (msg->msg_controllen) {
2640 err = sock_cmsg_send(&po->sk, msg, &sockc); 2641 err = sock_cmsg_send(&po->sk, msg, &sockc);
2641 if (unlikely(err)) 2642 if (unlikely(err))
@@ -2831,7 +2832,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2831 if (unlikely(!(dev->flags & IFF_UP))) 2832 if (unlikely(!(dev->flags & IFF_UP)))
2832 goto out_unlock; 2833 goto out_unlock;
2833 2834
2834 sockc.tsflags = sk->sk_tsflags; 2835 sockcm_init(&sockc, sk);
2835 sockc.mark = sk->sk_mark; 2836 sockc.mark = sk->sk_mark;
2836 if (msg->msg_controllen) { 2837 if (msg->msg_controllen) {
2837 err = sock_cmsg_send(sk, msg, &sockc); 2838 err = sock_cmsg_send(sk, msg, &sockc);
@@ -2880,6 +2881,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2880 goto out_free; 2881 goto out_free;
2881 } else if (reserve) { 2882 } else if (reserve) {
2882 skb_reserve(skb, -reserve); 2883 skb_reserve(skb, -reserve);
2884 if (len < reserve)
2885 skb_reset_network_header(skb);
2883 } 2886 }
2884 2887
2885 /* Returns -EFAULT on error */ 2888 /* Returns -EFAULT on error */
@@ -2905,6 +2908,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2905 skb->dev = dev; 2908 skb->dev = dev;
2906 skb->priority = sk->sk_priority; 2909 skb->priority = sk->sk_priority;
2907 skb->mark = sockc.mark; 2910 skb->mark = sockc.mark;
2911 skb->tstamp = sockc.transmit_time;
2908 2912
2909 if (has_vnet_hdr) { 2913 if (has_vnet_hdr) {
2910 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); 2914 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
@@ -4078,11 +4082,12 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
4078 return 0; 4082 return 0;
4079} 4083}
4080 4084
4081static __poll_t packet_poll_mask(struct socket *sock, __poll_t events) 4085static __poll_t packet_poll(struct file *file, struct socket *sock,
4086 poll_table *wait)
4082{ 4087{
4083 struct sock *sk = sock->sk; 4088 struct sock *sk = sock->sk;
4084 struct packet_sock *po = pkt_sk(sk); 4089 struct packet_sock *po = pkt_sk(sk);
4085 __poll_t mask = datagram_poll_mask(sock, events); 4090 __poll_t mask = datagram_poll(file, sock, wait);
4086 4091
4087 spin_lock_bh(&sk->sk_receive_queue.lock); 4092 spin_lock_bh(&sk->sk_receive_queue.lock);
4088 if (po->rx_ring.pg_vec) { 4093 if (po->rx_ring.pg_vec) {
@@ -4132,52 +4137,36 @@ static const struct vm_operations_struct packet_mmap_ops = {
4132 .close = packet_mm_close, 4137 .close = packet_mm_close,
4133}; 4138};
4134 4139
4135static void free_pg_vec(struct pgv *pg_vec, unsigned int order, 4140static void free_pg_vec(struct pgv *pg_vec, unsigned int len)
4136 unsigned int len)
4137{ 4141{
4138 int i; 4142 int i;
4139 4143
4140 for (i = 0; i < len; i++) { 4144 for (i = 0; i < len; i++) {
4141 if (likely(pg_vec[i].buffer)) { 4145 if (likely(pg_vec[i].buffer)) {
4142 if (is_vmalloc_addr(pg_vec[i].buffer)) 4146 kvfree(pg_vec[i].buffer);
4143 vfree(pg_vec[i].buffer);
4144 else
4145 free_pages((unsigned long)pg_vec[i].buffer,
4146 order);
4147 pg_vec[i].buffer = NULL; 4147 pg_vec[i].buffer = NULL;
4148 } 4148 }
4149 } 4149 }
4150 kfree(pg_vec); 4150 kfree(pg_vec);
4151} 4151}
4152 4152
4153static char *alloc_one_pg_vec_page(unsigned long order) 4153static char *alloc_one_pg_vec_page(unsigned long size)
4154{ 4154{
4155 char *buffer; 4155 char *buffer;
4156 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4157 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4158
4159 buffer = (char *) __get_free_pages(gfp_flags, order);
4160 if (buffer)
4161 return buffer;
4162 4156
4163 /* __get_free_pages failed, fall back to vmalloc */ 4157 buffer = kvzalloc(size, GFP_KERNEL);
4164 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4165 if (buffer) 4158 if (buffer)
4166 return buffer; 4159 return buffer;
4167 4160
4168 /* vmalloc failed, lets dig into swap here */ 4161 buffer = kvzalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
4169 gfp_flags &= ~__GFP_NORETRY;
4170 buffer = (char *) __get_free_pages(gfp_flags, order);
4171 if (buffer)
4172 return buffer;
4173 4162
4174 /* complete and utter failure */ 4163 return buffer;
4175 return NULL;
4176} 4164}
4177 4165
4178static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) 4166static struct pgv *alloc_pg_vec(struct tpacket_req *req)
4179{ 4167{
4180 unsigned int block_nr = req->tp_block_nr; 4168 unsigned int block_nr = req->tp_block_nr;
4169 unsigned long size = req->tp_block_size;
4181 struct pgv *pg_vec; 4170 struct pgv *pg_vec;
4182 int i; 4171 int i;
4183 4172
@@ -4186,7 +4175,7 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4186 goto out; 4175 goto out;
4187 4176
4188 for (i = 0; i < block_nr; i++) { 4177 for (i = 0; i < block_nr; i++) {
4189 pg_vec[i].buffer = alloc_one_pg_vec_page(order); 4178 pg_vec[i].buffer = alloc_one_pg_vec_page(size);
4190 if (unlikely(!pg_vec[i].buffer)) 4179 if (unlikely(!pg_vec[i].buffer))
4191 goto out_free_pgvec; 4180 goto out_free_pgvec;
4192 } 4181 }
@@ -4195,7 +4184,7 @@ out:
4195 return pg_vec; 4184 return pg_vec;
4196 4185
4197out_free_pgvec: 4186out_free_pgvec:
4198 free_pg_vec(pg_vec, order, block_nr); 4187 free_pg_vec(pg_vec, block_nr);
4199 pg_vec = NULL; 4188 pg_vec = NULL;
4200 goto out; 4189 goto out;
4201} 4190}
@@ -4205,9 +4194,9 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4205{ 4194{
4206 struct pgv *pg_vec = NULL; 4195 struct pgv *pg_vec = NULL;
4207 struct packet_sock *po = pkt_sk(sk); 4196 struct packet_sock *po = pkt_sk(sk);
4208 int was_running, order = 0;
4209 struct packet_ring_buffer *rb; 4197 struct packet_ring_buffer *rb;
4210 struct sk_buff_head *rb_queue; 4198 struct sk_buff_head *rb_queue;
4199 int was_running;
4211 __be16 num; 4200 __be16 num;
4212 int err = -EINVAL; 4201 int err = -EINVAL;
4213 /* Added to avoid minimal code churn */ 4202 /* Added to avoid minimal code churn */
@@ -4225,6 +4214,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4225 } 4214 }
4226 4215
4227 if (req->tp_block_nr) { 4216 if (req->tp_block_nr) {
4217 unsigned int min_frame_size;
4218
4228 /* Sanity tests and some calculations */ 4219 /* Sanity tests and some calculations */
4229 err = -EBUSY; 4220 err = -EBUSY;
4230 if (unlikely(rb->pg_vec)) 4221 if (unlikely(rb->pg_vec))
@@ -4247,12 +4238,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4247 goto out; 4238 goto out;
4248 if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) 4239 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4249 goto out; 4240 goto out;
4241 min_frame_size = po->tp_hdrlen + po->tp_reserve;
4250 if (po->tp_version >= TPACKET_V3 && 4242 if (po->tp_version >= TPACKET_V3 &&
4251 req->tp_block_size <= 4243 req->tp_block_size <
4252 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + sizeof(struct tpacket3_hdr)) 4244 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4253 goto out; 4245 goto out;
4254 if (unlikely(req->tp_frame_size < po->tp_hdrlen + 4246 if (unlikely(req->tp_frame_size < min_frame_size))
4255 po->tp_reserve))
4256 goto out; 4247 goto out;
4257 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) 4248 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4258 goto out; 4249 goto out;
@@ -4267,8 +4258,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4267 goto out; 4258 goto out;
4268 4259
4269 err = -ENOMEM; 4260 err = -ENOMEM;
4270 order = get_order(req->tp_block_size); 4261 pg_vec = alloc_pg_vec(req);
4271 pg_vec = alloc_pg_vec(req, order);
4272 if (unlikely(!pg_vec)) 4262 if (unlikely(!pg_vec))
4273 goto out; 4263 goto out;
4274 switch (po->tp_version) { 4264 switch (po->tp_version) {
@@ -4322,7 +4312,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4322 rb->frame_size = req->tp_frame_size; 4312 rb->frame_size = req->tp_frame_size;
4323 spin_unlock_bh(&rb_queue->lock); 4313 spin_unlock_bh(&rb_queue->lock);
4324 4314
4325 swap(rb->pg_vec_order, order);
4326 swap(rb->pg_vec_len, req->tp_block_nr); 4315 swap(rb->pg_vec_len, req->tp_block_nr);
4327 4316
4328 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; 4317 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
@@ -4348,7 +4337,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4348 } 4337 }
4349 4338
4350 if (pg_vec) 4339 if (pg_vec)
4351 free_pg_vec(pg_vec, order, req->tp_block_nr); 4340 free_pg_vec(pg_vec, req->tp_block_nr);
4352out: 4341out:
4353 return err; 4342 return err;
4354} 4343}
@@ -4424,7 +4413,7 @@ static const struct proto_ops packet_ops_spkt = {
4424 .socketpair = sock_no_socketpair, 4413 .socketpair = sock_no_socketpair,
4425 .accept = sock_no_accept, 4414 .accept = sock_no_accept,
4426 .getname = packet_getname_spkt, 4415 .getname = packet_getname_spkt,
4427 .poll_mask = datagram_poll_mask, 4416 .poll = datagram_poll,
4428 .ioctl = packet_ioctl, 4417 .ioctl = packet_ioctl,
4429 .listen = sock_no_listen, 4418 .listen = sock_no_listen,
4430 .shutdown = sock_no_shutdown, 4419 .shutdown = sock_no_shutdown,
@@ -4445,7 +4434,7 @@ static const struct proto_ops packet_ops = {
4445 .socketpair = sock_no_socketpair, 4434 .socketpair = sock_no_socketpair,
4446 .accept = sock_no_accept, 4435 .accept = sock_no_accept,
4447 .getname = packet_getname, 4436 .getname = packet_getname,
4448 .poll_mask = packet_poll_mask, 4437 .poll = packet_poll,
4449 .ioctl = packet_ioctl, 4438 .ioctl = packet_ioctl,
4450 .listen = sock_no_listen, 4439 .listen = sock_no_listen,
4451 .shutdown = sock_no_shutdown, 4440 .shutdown = sock_no_shutdown,
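Most of the ring-buffer churn in af_packet.c above comes from switching the block allocator to the kvmalloc family: kvzalloc() tries a physically contiguous allocation first and transparently falls back to vmalloc, and kvfree() releases either kind, which is why the per-ring page order (pg_vec_order) and the is_vmalloc_addr() branching in free_pg_vec() can simply disappear. The resulting allocation idiom, as a sketch mirroring the flags used in the hunk:

    #include <linux/mm.h>

    static void *alloc_ring_block(size_t size)
    {
    	void *buf = kvzalloc(size, GFP_KERNEL);

    	if (!buf)	/* retry harder, but fail cleanly rather than OOM-kill */
    		buf = kvzalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
    	return buf;
    }

    static void free_ring_block(void *buf)
    {
    	kvfree(buf);	/* handles both kmalloc- and vmalloc-backed buffers */
    }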
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 3bb7c5fb3bff..8f50036f62f0 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -64,7 +64,6 @@ struct packet_ring_buffer {
64 unsigned int frame_size; 64 unsigned int frame_size;
65 unsigned int frame_max; 65 unsigned int frame_max;
66 66
67 unsigned int pg_vec_order;
68 unsigned int pg_vec_pages; 67 unsigned int pg_vec_pages;
69 unsigned int pg_vec_len; 68 unsigned int pg_vec_len;
70 69
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index c295c4e20f01..30187990257f 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -340,12 +340,15 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
340 return sizeof(struct sockaddr_pn); 340 return sizeof(struct sockaddr_pn);
341} 341}
342 342
343static __poll_t pn_socket_poll_mask(struct socket *sock, __poll_t events) 343static __poll_t pn_socket_poll(struct file *file, struct socket *sock,
344 poll_table *wait)
344{ 345{
345 struct sock *sk = sock->sk; 346 struct sock *sk = sock->sk;
346 struct pep_sock *pn = pep_sk(sk); 347 struct pep_sock *pn = pep_sk(sk);
347 __poll_t mask = 0; 348 __poll_t mask = 0;
348 349
350 poll_wait(file, sk_sleep(sk), wait);
351
349 if (sk->sk_state == TCP_CLOSE) 352 if (sk->sk_state == TCP_CLOSE)
350 return EPOLLERR; 353 return EPOLLERR;
351 if (!skb_queue_empty(&sk->sk_receive_queue)) 354 if (!skb_queue_empty(&sk->sk_receive_queue))
@@ -445,7 +448,7 @@ const struct proto_ops phonet_dgram_ops = {
445 .socketpair = sock_no_socketpair, 448 .socketpair = sock_no_socketpair,
446 .accept = sock_no_accept, 449 .accept = sock_no_accept,
447 .getname = pn_socket_getname, 450 .getname = pn_socket_getname,
448 .poll_mask = datagram_poll_mask, 451 .poll = datagram_poll,
449 .ioctl = pn_socket_ioctl, 452 .ioctl = pn_socket_ioctl,
450 .listen = sock_no_listen, 453 .listen = sock_no_listen,
451 .shutdown = sock_no_shutdown, 454 .shutdown = sock_no_shutdown,
@@ -470,7 +473,7 @@ const struct proto_ops phonet_stream_ops = {
470 .socketpair = sock_no_socketpair, 473 .socketpair = sock_no_socketpair,
471 .accept = pn_socket_accept, 474 .accept = pn_socket_accept,
472 .getname = pn_socket_getname, 475 .getname = pn_socket_getname,
473 .poll_mask = pn_socket_poll_mask, 476 .poll = pn_socket_poll,
474 .ioctl = pn_socket_ioctl, 477 .ioctl = pn_socket_ioctl,
475 .listen = pn_socket_listen, 478 .listen = pn_socket_listen,
476 .shutdown = sock_no_shutdown, 479 .shutdown = sock_no_shutdown,
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 1b5025ea5b04..86e1e37eb4e8 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -191,8 +191,13 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb,
191 hdr->type = cpu_to_le32(type); 191 hdr->type = cpu_to_le32(type);
192 hdr->src_node_id = cpu_to_le32(from->sq_node); 192 hdr->src_node_id = cpu_to_le32(from->sq_node);
193 hdr->src_port_id = cpu_to_le32(from->sq_port); 193 hdr->src_port_id = cpu_to_le32(from->sq_port);
194 hdr->dst_node_id = cpu_to_le32(to->sq_node); 194 if (to->sq_port == QRTR_PORT_CTRL) {
195 hdr->dst_port_id = cpu_to_le32(to->sq_port); 195 hdr->dst_node_id = cpu_to_le32(node->nid);
196 hdr->dst_port_id = cpu_to_le32(QRTR_NODE_BCAST);
197 } else {
198 hdr->dst_node_id = cpu_to_le32(to->sq_node);
199 hdr->dst_port_id = cpu_to_le32(to->sq_port);
200 }
196 201
197 hdr->size = cpu_to_le32(len); 202 hdr->size = cpu_to_le32(len);
198 hdr->confirm_rx = 0; 203 hdr->confirm_rx = 0;
@@ -764,6 +769,10 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
764 node = NULL; 769 node = NULL;
765 if (addr->sq_node == QRTR_NODE_BCAST) { 770 if (addr->sq_node == QRTR_NODE_BCAST) {
766 enqueue_fn = qrtr_bcast_enqueue; 771 enqueue_fn = qrtr_bcast_enqueue;
772 if (addr->sq_port != QRTR_PORT_CTRL) {
773 release_sock(sk);
774 return -ENOTCONN;
775 }
767 } else if (addr->sq_node == ipc->us.sq_node) { 776 } else if (addr->sq_node == ipc->us.sq_node) {
768 enqueue_fn = qrtr_local_enqueue; 777 enqueue_fn = qrtr_local_enqueue;
769 } else { 778 } else {
@@ -1023,7 +1032,7 @@ static const struct proto_ops qrtr_proto_ops = {
1023 .recvmsg = qrtr_recvmsg, 1032 .recvmsg = qrtr_recvmsg,
1024 .getname = qrtr_getname, 1033 .getname = qrtr_getname,
1025 .ioctl = qrtr_ioctl, 1034 .ioctl = qrtr_ioctl,
1026 .poll_mask = datagram_poll_mask, 1035 .poll = datagram_poll,
1027 .shutdown = sock_no_shutdown, 1036 .shutdown = sock_no_shutdown,
1028 .setsockopt = sock_no_setsockopt, 1037 .setsockopt = sock_no_setsockopt,
1029 .getsockopt = sock_no_getsockopt, 1038 .getsockopt = sock_no_getsockopt,
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index bffde4b46c5d..01b3bd6a3708 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -16,6 +16,7 @@ config RDS_RDMA
16config RDS_TCP 16config RDS_TCP
17 tristate "RDS over TCP" 17 tristate "RDS over TCP"
18 depends on RDS 18 depends on RDS
19 depends on IPV6 || !IPV6
19 ---help--- 20 ---help---
20 Allow RDS to use TCP as a transport. 21 Allow RDS to use TCP as a transport.
21 This transport does not support RDMA operations. 22 This transport does not support RDMA operations.
@@ -24,4 +25,3 @@ config RDS_DEBUG
24 bool "RDS debugging messages" 25 bool "RDS debugging messages"
25 depends on RDS 26 depends on RDS
26 default n 27 default n
27
diff --git a/net/rds/Makefile b/net/rds/Makefile
index b5d568bd479c..e647f9de104a 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -15,4 +15,3 @@ rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
15 tcp_send.o tcp_stats.o 15 tcp_send.o tcp_stats.o
16 16
17ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG 17ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG
18
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index ab751a150f70..65387e1e6964 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/gfp.h> 36#include <linux/gfp.h>
37#include <linux/in.h> 37#include <linux/in.h>
38#include <linux/ipv6.h>
38#include <linux/poll.h> 39#include <linux/poll.h>
39#include <net/sock.h> 40#include <net/sock.h>
40 41
@@ -113,26 +114,82 @@ void rds_wake_sk_sleep(struct rds_sock *rs)
113static int rds_getname(struct socket *sock, struct sockaddr *uaddr, 114static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
114 int peer) 115 int peer)
115{ 116{
116 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
117 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 117 struct rds_sock *rs = rds_sk_to_rs(sock->sk);
118 118 struct sockaddr_in6 *sin6;
119 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 119 struct sockaddr_in *sin;
120 int uaddr_len;
120 121
121 /* racey, don't care */ 122 /* racey, don't care */
122 if (peer) { 123 if (peer) {
123 if (!rs->rs_conn_addr) 124 if (ipv6_addr_any(&rs->rs_conn_addr))
124 return -ENOTCONN; 125 return -ENOTCONN;
125 126
126 sin->sin_port = rs->rs_conn_port; 127 if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
127 sin->sin_addr.s_addr = rs->rs_conn_addr; 128 sin = (struct sockaddr_in *)uaddr;
129 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
130 sin->sin_family = AF_INET;
131 sin->sin_port = rs->rs_conn_port;
132 sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
133 uaddr_len = sizeof(*sin);
134 } else {
135 sin6 = (struct sockaddr_in6 *)uaddr;
136 sin6->sin6_family = AF_INET6;
137 sin6->sin6_port = rs->rs_conn_port;
138 sin6->sin6_addr = rs->rs_conn_addr;
139 sin6->sin6_flowinfo = 0;
140 /* scope_id is the same as in the bound address. */
141 sin6->sin6_scope_id = rs->rs_bound_scope_id;
142 uaddr_len = sizeof(*sin6);
143 }
128 } else { 144 } else {
129 sin->sin_port = rs->rs_bound_port; 145 /* If socket is not yet bound and the socket is connected,
130 sin->sin_addr.s_addr = rs->rs_bound_addr; 146 * set the return address family to be the same as the
147 * connected address, but with 0 address value. If it is not
148 * connected, set the family to be AF_UNSPEC (value 0) and
149 * the address size to be that of an IPv4 address.
150 */
151 if (ipv6_addr_any(&rs->rs_bound_addr)) {
152 if (ipv6_addr_any(&rs->rs_conn_addr)) {
153 sin = (struct sockaddr_in *)uaddr;
154 memset(sin, 0, sizeof(*sin));
155 sin->sin_family = AF_UNSPEC;
156 return sizeof(*sin);
157 }
158
159#if IS_ENABLED(CONFIG_IPV6)
160 if (!(ipv6_addr_type(&rs->rs_conn_addr) &
161 IPV6_ADDR_MAPPED)) {
162 sin6 = (struct sockaddr_in6 *)uaddr;
163 memset(sin6, 0, sizeof(*sin6));
164 sin6->sin6_family = AF_INET6;
165 return sizeof(*sin6);
166 }
167#endif
168
169 sin = (struct sockaddr_in *)uaddr;
170 memset(sin, 0, sizeof(*sin));
171 sin->sin_family = AF_INET;
172 return sizeof(*sin);
173 }
174 if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
175 sin = (struct sockaddr_in *)uaddr;
176 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
177 sin->sin_family = AF_INET;
178 sin->sin_port = rs->rs_bound_port;
179 sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
180 uaddr_len = sizeof(*sin);
181 } else {
182 sin6 = (struct sockaddr_in6 *)uaddr;
183 sin6->sin6_family = AF_INET6;
184 sin6->sin6_port = rs->rs_bound_port;
185 sin6->sin6_addr = rs->rs_bound_addr;
186 sin6->sin6_flowinfo = 0;
187 sin6->sin6_scope_id = rs->rs_bound_scope_id;
188 uaddr_len = sizeof(*sin6);
189 }
131 } 190 }
132 191
133 sin->sin_family = AF_INET; 192 return uaddr_len;
134
135 return sizeof(*sin);
136} 193}
137 194
138/* 195/*
@@ -203,11 +260,12 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
203static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, 260static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
204 int len) 261 int len)
205{ 262{
263 struct sockaddr_in6 sin6;
206 struct sockaddr_in sin; 264 struct sockaddr_in sin;
207 int ret = 0; 265 int ret = 0;
208 266
209 /* racing with another thread binding seems ok here */ 267 /* racing with another thread binding seems ok here */
210 if (rs->rs_bound_addr == 0) { 268 if (ipv6_addr_any(&rs->rs_bound_addr)) {
211 ret = -ENOTCONN; /* XXX not a great errno */ 269 ret = -ENOTCONN; /* XXX not a great errno */
212 goto out; 270 goto out;
213 } 271 }
@@ -215,14 +273,23 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
215 if (len < sizeof(struct sockaddr_in)) { 273 if (len < sizeof(struct sockaddr_in)) {
216 ret = -EINVAL; 274 ret = -EINVAL;
217 goto out; 275 goto out;
276 } else if (len < sizeof(struct sockaddr_in6)) {
277 /* Assume IPv4 */
278 if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
279 ret = -EFAULT;
280 goto out;
281 }
282 ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
283 sin6.sin6_port = sin.sin_port;
284 } else {
285 if (copy_from_user(&sin6, optval,
286 sizeof(struct sockaddr_in6))) {
287 ret = -EFAULT;
288 goto out;
289 }
218 } 290 }
219 291
220 if (copy_from_user(&sin, optval, sizeof(sin))) { 292 rds_send_drop_to(rs, &sin6);
221 ret = -EFAULT;
222 goto out;
223 }
224
225 rds_send_drop_to(rs, &sin);
226out: 293out:
227 return ret; 294 return ret;
228} 295}
@@ -435,31 +502,91 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
435 int addr_len, int flags) 502 int addr_len, int flags)
436{ 503{
437 struct sock *sk = sock->sk; 504 struct sock *sk = sock->sk;
438 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 505 struct sockaddr_in *sin;
439 struct rds_sock *rs = rds_sk_to_rs(sk); 506 struct rds_sock *rs = rds_sk_to_rs(sk);
440 int ret = 0; 507 int ret = 0;
441 508
442 lock_sock(sk); 509 lock_sock(sk);
443 510
444 if (addr_len != sizeof(struct sockaddr_in)) { 511 switch (uaddr->sa_family) {
445 ret = -EINVAL; 512 case AF_INET:
446 goto out; 513 sin = (struct sockaddr_in *)uaddr;
447 } 514 if (addr_len < sizeof(struct sockaddr_in)) {
515 ret = -EINVAL;
516 break;
517 }
518 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
519 ret = -EDESTADDRREQ;
520 break;
521 }
522 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) ||
523 sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
524 ret = -EINVAL;
525 break;
526 }
527 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
528 rs->rs_conn_port = sin->sin_port;
529 break;
448 530
449 if (sin->sin_family != AF_INET) { 531#if IS_ENABLED(CONFIG_IPV6)
450 ret = -EAFNOSUPPORT; 532 case AF_INET6: {
451 goto out; 533 struct sockaddr_in6 *sin6;
452 } 534 int addr_type;
453 535
454 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 536 sin6 = (struct sockaddr_in6 *)uaddr;
455 ret = -EDESTADDRREQ; 537 if (addr_len < sizeof(struct sockaddr_in6)) {
456 goto out; 538 ret = -EINVAL;
539 break;
540 }
541 addr_type = ipv6_addr_type(&sin6->sin6_addr);
542 if (!(addr_type & IPV6_ADDR_UNICAST)) {
543 __be32 addr4;
544
545 if (!(addr_type & IPV6_ADDR_MAPPED)) {
546 ret = -EPROTOTYPE;
547 break;
548 }
549
550 /* It is a mapped address. Need to do some sanity
551 * checks.
552 */
553 addr4 = sin6->sin6_addr.s6_addr32[3];
554 if (addr4 == htonl(INADDR_ANY) ||
555 addr4 == htonl(INADDR_BROADCAST) ||
556 IN_MULTICAST(ntohl(addr4))) {
557 ret = -EPROTOTYPE;
558 break;
559 }
560 }
561
562 if (addr_type & IPV6_ADDR_LINKLOCAL) {
563 /* If socket is already bound to a link local address,
564 * the peer address must be on the same link.
565 */
566 if (sin6->sin6_scope_id == 0 ||
567 (!ipv6_addr_any(&rs->rs_bound_addr) &&
568 rs->rs_bound_scope_id &&
569 sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
570 ret = -EINVAL;
571 break;
572 }
573 /* Remember the connected address scope ID. It will
574 * be checked against the binding local address when
575 * the socket is bound.
576 */
577 rs->rs_bound_scope_id = sin6->sin6_scope_id;
578 }
579 rs->rs_conn_addr = sin6->sin6_addr;
580 rs->rs_conn_port = sin6->sin6_port;
581 break;
457 } 582 }
583#endif
458 584
459 rs->rs_conn_addr = sin->sin_addr.s_addr; 585 default:
460 rs->rs_conn_port = sin->sin_port; 586 ret = -EAFNOSUPPORT;
587 break;
588 }
461 589
462out:
463 release_sock(sk); 590 release_sock(sk);
464 return ret; 591 return ret;
465} 592}
@@ -578,8 +705,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
578 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { 705 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
579 total++; 706 total++;
580 if (total <= len) 707 if (total <= len)
581 rds_inc_info_copy(inc, iter, inc->i_saddr, 708 rds_inc_info_copy(inc, iter,
582 rs->rs_bound_addr, 1); 709 inc->i_saddr.s6_addr32[3],
710 rs->rs_bound_addr_v4,
711 1);
583 } 712 }
584 713
585 read_unlock(&rs->rs_recv_lock); 714 read_unlock(&rs->rs_recv_lock);
@@ -608,8 +737,8 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
608 list_for_each_entry(rs, &rds_sock_list, rs_item) { 737 list_for_each_entry(rs, &rds_sock_list, rs_item) {
609 sinfo.sndbuf = rds_sk_sndbuf(rs); 738 sinfo.sndbuf = rds_sk_sndbuf(rs);
610 sinfo.rcvbuf = rds_sk_rcvbuf(rs); 739 sinfo.rcvbuf = rds_sk_rcvbuf(rs);
611 sinfo.bound_addr = rs->rs_bound_addr; 740 sinfo.bound_addr = rs->rs_bound_addr_v4;
612 sinfo.connected_addr = rs->rs_conn_addr; 741 sinfo.connected_addr = rs->rs_conn_addr_v4;
613 sinfo.bound_port = rs->rs_bound_port; 742 sinfo.bound_port = rs->rs_bound_port;
614 sinfo.connected_port = rs->rs_conn_port; 743 sinfo.connected_port = rs->rs_conn_port;
615 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); 744 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
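The af_rds.c changes convert every bound and connected address from a __be32 to a struct in6_addr; an IPv4 peer is stored as an IPv4-mapped IPv6 address (::ffff:a.b.c.d), which is what the ipv6_addr_set_v4mapped()/ipv6_addr_v4mapped() calls above produce and test, while rs_bound_addr_v4/rs_conn_addr_v4 read back the low 32 bits for the IPv4-only info paths. The mapping itself, shown as a small userspace program:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
    	struct in6_addr mapped;
    	struct in_addr v4;
    	char buf[INET6_ADDRSTRLEN];

    	inet_pton(AF_INET, "192.0.2.1", &v4);

    	/* ::ffff:192.0.2.1 — bytes 10..11 are 0xff, 12..15 hold the IPv4 addr */
    	memset(&mapped, 0, sizeof(mapped));
    	mapped.s6_addr[10] = 0xff;
    	mapped.s6_addr[11] = 0xff;
    	memcpy(&mapped.s6_addr[12], &v4, sizeof(v4));

    	inet_ntop(AF_INET6, &mapped, buf, sizeof(buf));
    	printf("%s\n", buf);	/* prints ::ffff:192.0.2.1 */
    	return 0;
    }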
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5aa3a64aa4f0..3ab55784b637 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <net/sock.h> 34#include <net/sock.h>
35#include <linux/in.h> 35#include <linux/in.h>
36#include <linux/ipv6.h>
36#include <linux/if_arp.h> 37#include <linux/if_arp.h>
37#include <linux/jhash.h> 38#include <linux/jhash.h>
38#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
@@ -42,42 +43,58 @@ static struct rhashtable bind_hash_table;
42 43
43static const struct rhashtable_params ht_parms = { 44static const struct rhashtable_params ht_parms = {
44 .nelem_hint = 768, 45 .nelem_hint = 768,
45 .key_len = sizeof(u64), 46 .key_len = RDS_BOUND_KEY_LEN,
46 .key_offset = offsetof(struct rds_sock, rs_bound_key), 47 .key_offset = offsetof(struct rds_sock, rs_bound_key),
47 .head_offset = offsetof(struct rds_sock, rs_bound_node), 48 .head_offset = offsetof(struct rds_sock, rs_bound_node),
48 .max_size = 16384, 49 .max_size = 16384,
49 .min_size = 1024, 50 .min_size = 1024,
50}; 51};
51 52
53/* Create a key for the bind hash table manipulation. Port is in network byte
54 * order.
55 */
56static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr,
57 __be16 port, __u32 scope_id)
58{
59 memcpy(key, addr, sizeof(*addr));
60 key += sizeof(*addr);
61 memcpy(key, &port, sizeof(port));
62 key += sizeof(port);
63 memcpy(key, &scope_id, sizeof(scope_id));
64}
65
52/* 66/*
53 * Return the rds_sock bound at the given local address. 67 * Return the rds_sock bound at the given local address.
54 * 68 *
55 * The rx path can race with rds_release. We notice if rds_release() has 69 * The rx path can race with rds_release. We notice if rds_release() has
56 * marked this socket and don't return a rs ref to the rx path. 70 * marked this socket and don't return a rs ref to the rx path.
57 */ 71 */
58struct rds_sock *rds_find_bound(__be32 addr, __be16 port) 72struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
73 __u32 scope_id)
59{ 74{
60 u64 key = ((u64)addr << 32) | port; 75 u8 key[RDS_BOUND_KEY_LEN];
61 struct rds_sock *rs; 76 struct rds_sock *rs;
62 77
63 rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms); 78 __rds_create_bind_key(key, addr, port, scope_id);
79 rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms);
64 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) 80 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
65 rds_sock_addref(rs); 81 rds_sock_addref(rs);
66 else 82 else
67 rs = NULL; 83 rs = NULL;
68 84
69 rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, 85 rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
70 ntohs(port)); 86 ntohs(port));
71 87
72 return rs; 88 return rs;
73} 89}
74 90
75/* returns -ve errno or +ve port */ 91/* returns -ve errno or +ve port */
76static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) 92static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
93 __be16 *port, __u32 scope_id)
77{ 94{
78 int ret = -EADDRINUSE; 95 int ret = -EADDRINUSE;
79 u16 rover, last; 96 u16 rover, last;
80 u64 key; 97 u8 key[RDS_BOUND_KEY_LEN];
81 98
82 if (*port != 0) { 99 if (*port != 0) {
83 rover = be16_to_cpu(*port); 100 rover = be16_to_cpu(*port);
@@ -95,12 +112,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
95 112
96 if (rover == RDS_FLAG_PROBE_PORT) 113 if (rover == RDS_FLAG_PROBE_PORT)
97 continue; 114 continue;
98 key = ((u64)addr << 32) | cpu_to_be16(rover); 115 __rds_create_bind_key(key, addr, cpu_to_be16(rover),
99 if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms)) 116 scope_id);
117 if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms))
100 continue; 118 continue;
101 119
102 rs->rs_bound_key = key; 120 memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key));
103 rs->rs_bound_addr = addr; 121 rs->rs_bound_addr = *addr;
104 net_get_random_once(&rs->rs_hash_initval, 122 net_get_random_once(&rs->rs_hash_initval,
105 sizeof(rs->rs_hash_initval)); 123 sizeof(rs->rs_hash_initval));
106 rs->rs_bound_port = cpu_to_be16(rover); 124 rs->rs_bound_port = cpu_to_be16(rover);
@@ -109,12 +127,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
109 if (!rhashtable_insert_fast(&bind_hash_table, 127 if (!rhashtable_insert_fast(&bind_hash_table,
110 &rs->rs_bound_node, ht_parms)) { 128 &rs->rs_bound_node, ht_parms)) {
111 *port = rs->rs_bound_port; 129 *port = rs->rs_bound_port;
130 rs->rs_bound_scope_id = scope_id;
112 ret = 0; 131 ret = 0;
113 rdsdebug("rs %p binding to %pI4:%d\n", 132 rdsdebug("rs %p binding to %pI6c:%d\n",
114 rs, &addr, (int)ntohs(*port)); 133 rs, addr, (int)ntohs(*port));
115 break; 134 break;
116 } else { 135 } else {
117 rs->rs_bound_addr = 0; 136 rs->rs_bound_addr = in6addr_any;
118 rds_sock_put(rs); 137 rds_sock_put(rs);
119 ret = -ENOMEM; 138 ret = -ENOMEM;
120 break; 139 break;
@@ -127,44 +146,103 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
127void rds_remove_bound(struct rds_sock *rs) 146void rds_remove_bound(struct rds_sock *rs)
128{ 147{
129 148
130 if (!rs->rs_bound_addr) 149 if (ipv6_addr_any(&rs->rs_bound_addr))
131 return; 150 return;
132 151
133 rdsdebug("rs %p unbinding from %pI4:%d\n", 152 rdsdebug("rs %p unbinding from %pI6c:%d\n",
134 rs, &rs->rs_bound_addr, 153 rs, &rs->rs_bound_addr,
135 ntohs(rs->rs_bound_port)); 154 ntohs(rs->rs_bound_port));
136 155
137 rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms); 156 rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
138 rds_sock_put(rs); 157 rds_sock_put(rs);
139 rs->rs_bound_addr = 0; 158 rs->rs_bound_addr = in6addr_any;
140} 159}
141 160
142int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 161int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
143{ 162{
144 struct sock *sk = sock->sk; 163 struct sock *sk = sock->sk;
145 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
146 struct rds_sock *rs = rds_sk_to_rs(sk); 164 struct rds_sock *rs = rds_sk_to_rs(sk);
165 struct in6_addr v6addr, *binding_addr;
147 struct rds_transport *trans; 166 struct rds_transport *trans;
167 __u32 scope_id = 0;
148 int ret = 0; 168 int ret = 0;
169 __be16 port;
170
171 /* We allow an RDS socket to be bound to either IPv4 or IPv6
172 * address.
173 */
174 if (uaddr->sa_family == AF_INET) {
175 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
176
177 if (addr_len < sizeof(struct sockaddr_in) ||
178 sin->sin_addr.s_addr == htonl(INADDR_ANY) ||
179 sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
180 IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
181 return -EINVAL;
182 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr);
183 binding_addr = &v6addr;
184 port = sin->sin_port;
185#if IS_ENABLED(CONFIG_IPV6)
186 } else if (uaddr->sa_family == AF_INET6) {
187 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr;
188 int addr_type;
189
190 if (addr_len < sizeof(struct sockaddr_in6))
191 return -EINVAL;
192 addr_type = ipv6_addr_type(&sin6->sin6_addr);
193 if (!(addr_type & IPV6_ADDR_UNICAST)) {
194 __be32 addr4;
149 195
196 if (!(addr_type & IPV6_ADDR_MAPPED))
197 return -EINVAL;
198
199 /* It is a mapped address. Need to do some sanity
200 * checks.
201 */
202 addr4 = sin6->sin6_addr.s6_addr32[3];
203 if (addr4 == htonl(INADDR_ANY) ||
204 addr4 == htonl(INADDR_BROADCAST) ||
205 IN_MULTICAST(ntohl(addr4)))
206 return -EINVAL;
207 }
208 /* The scope ID must be specified for link local address. */
209 if (addr_type & IPV6_ADDR_LINKLOCAL) {
210 if (sin6->sin6_scope_id == 0)
211 return -EINVAL;
212 scope_id = sin6->sin6_scope_id;
213 }
214 binding_addr = &sin6->sin6_addr;
215 port = sin6->sin6_port;
216#endif
217 } else {
218 return -EINVAL;
219 }
150 lock_sock(sk); 220 lock_sock(sk);
151 221
152 if (addr_len != sizeof(struct sockaddr_in) || 222 /* RDS socket does not allow re-binding. */
153 sin->sin_family != AF_INET || 223 if (!ipv6_addr_any(&rs->rs_bound_addr)) {
154 rs->rs_bound_addr || 224 ret = -EINVAL;
155 sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 225 goto out;
226 }
227 /* Socket is connected. The binding address should have the same
228 * scope ID as the connected address, except the case when one is
229 * non-link local address (scope_id is 0).
230 */
231 if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id &&
232 rs->rs_bound_scope_id &&
233 scope_id != rs->rs_bound_scope_id) {
156 ret = -EINVAL; 234 ret = -EINVAL;
157 goto out; 235 goto out;
158 } 236 }
159 237
160 ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); 238 ret = rds_add_bound(rs, binding_addr, &port, scope_id);
161 if (ret) 239 if (ret)
162 goto out; 240 goto out;
163 241
164 if (rs->rs_transport) { /* previously bound */ 242 if (rs->rs_transport) { /* previously bound */
165 trans = rs->rs_transport; 243 trans = rs->rs_transport;
166 if (trans->laddr_check(sock_net(sock->sk), 244 if (trans->laddr_check(sock_net(sock->sk),
167 sin->sin_addr.s_addr) != 0) { 245 binding_addr, scope_id) != 0) {
168 ret = -ENOPROTOOPT; 246 ret = -ENOPROTOOPT;
169 rds_remove_bound(rs); 247 rds_remove_bound(rs);
170 } else { 248 } else {
@@ -172,13 +250,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
172 } 250 }
173 goto out; 251 goto out;
174 } 252 }
175 trans = rds_trans_get_preferred(sock_net(sock->sk), 253 trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr,
176 sin->sin_addr.s_addr); 254 scope_id);
177 if (!trans) { 255 if (!trans) {
178 ret = -EADDRNOTAVAIL; 256 ret = -EADDRNOTAVAIL;
179 rds_remove_bound(rs); 257 rds_remove_bound(rs);
180 pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n", 258 pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n",
181 __func__, &sin->sin_addr.s_addr); 259 __func__, binding_addr);
182 goto out; 260 goto out;
183 } 261 }
184 262
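
The bind hash key grows from a packed u64 of <IPv4 address, port> to a flat byte array of <in6_addr, port, scope_id>. RDS_BOUND_KEY_LEN comes from net/rds/rds.h, which is not shown in this diff; the sketch below assumes it is simply the sum of the three component sizes and mirrors __rds_create_bind_key() in userspace. Every lookup and insert must build the key byte-for-byte the same way, or the rhashtable comparison will never match.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define BOUND_KEY_LEN \
	(sizeof(struct in6_addr) + sizeof(uint16_t) + sizeof(uint32_t))

/* Address first, then the network-order port, then the scope ID, laid out
 * back to back so the whole array can be hashed and compared as raw bytes.
 */
static void create_bind_key(uint8_t *key, const struct in6_addr *addr,
			    uint16_t port_be, uint32_t scope_id)
{
	memcpy(key, addr, sizeof(*addr));
	key += sizeof(*addr);
	memcpy(key, &port_be, sizeof(port_be));
	key += sizeof(port_be);
	memcpy(key, &scope_id, sizeof(scope_id));
}

int main(void)
{
	uint8_t key[BOUND_KEY_LEN];
	struct in6_addr addr;

	inet_pton(AF_INET6, "::1", &addr);
	create_bind_key(key, &addr, htons(1234), 0);
	printf("bind key length: %zu bytes\n", sizeof(key));
	return 0;
}
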
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 63da9d2f142d..ccdff09a79c8 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2007 Oracle. All rights reserved. 2 * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -101,7 +101,7 @@ static DEFINE_RWLOCK(rds_cong_monitor_lock);
101static DEFINE_SPINLOCK(rds_cong_lock); 101static DEFINE_SPINLOCK(rds_cong_lock);
102static struct rb_root rds_cong_tree = RB_ROOT; 102static struct rb_root rds_cong_tree = RB_ROOT;
103 103
104static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, 104static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr,
105 struct rds_cong_map *insert) 105 struct rds_cong_map *insert)
106{ 106{
107 struct rb_node **p = &rds_cong_tree.rb_node; 107 struct rb_node **p = &rds_cong_tree.rb_node;
@@ -109,12 +109,15 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
109 struct rds_cong_map *map; 109 struct rds_cong_map *map;
110 110
111 while (*p) { 111 while (*p) {
112 int diff;
113
112 parent = *p; 114 parent = *p;
113 map = rb_entry(parent, struct rds_cong_map, m_rb_node); 115 map = rb_entry(parent, struct rds_cong_map, m_rb_node);
114 116
115 if (addr < map->m_addr) 117 diff = rds_addr_cmp(addr, &map->m_addr);
118 if (diff < 0)
116 p = &(*p)->rb_left; 119 p = &(*p)->rb_left;
117 else if (addr > map->m_addr) 120 else if (diff > 0)
118 p = &(*p)->rb_right; 121 p = &(*p)->rb_right;
119 else 122 else
120 return map; 123 return map;
@@ -132,7 +135,7 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
132 * these bitmaps in the process getting pointers to them. The bitmaps are only 135 * these bitmaps in the process getting pointers to them. The bitmaps are only
133 * ever freed as the module is removed after all connections have been freed. 136 * ever freed as the module is removed after all connections have been freed.
134 */ 137 */
135static struct rds_cong_map *rds_cong_from_addr(__be32 addr) 138static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr)
136{ 139{
137 struct rds_cong_map *map; 140 struct rds_cong_map *map;
138 struct rds_cong_map *ret = NULL; 141 struct rds_cong_map *ret = NULL;
@@ -144,7 +147,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
144 if (!map) 147 if (!map)
145 return NULL; 148 return NULL;
146 149
147 map->m_addr = addr; 150 map->m_addr = *addr;
148 init_waitqueue_head(&map->m_waitq); 151 init_waitqueue_head(&map->m_waitq);
149 INIT_LIST_HEAD(&map->m_conn_list); 152 INIT_LIST_HEAD(&map->m_conn_list);
150 153
@@ -171,7 +174,7 @@ out:
171 kfree(map); 174 kfree(map);
172 } 175 }
173 176
174 rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); 177 rdsdebug("map %p for addr %pI6c\n", ret, addr);
175 178
176 return ret; 179 return ret;
177} 180}
@@ -202,8 +205,8 @@ void rds_cong_remove_conn(struct rds_connection *conn)
202 205
203int rds_cong_get_maps(struct rds_connection *conn) 206int rds_cong_get_maps(struct rds_connection *conn)
204{ 207{
205 conn->c_lcong = rds_cong_from_addr(conn->c_laddr); 208 conn->c_lcong = rds_cong_from_addr(&conn->c_laddr);
206 conn->c_fcong = rds_cong_from_addr(conn->c_faddr); 209 conn->c_fcong = rds_cong_from_addr(&conn->c_faddr);
207 210
208 if (!(conn->c_lcong && conn->c_fcong)) 211 if (!(conn->c_lcong && conn->c_fcong))
209 return -ENOMEM; 212 return -ENOMEM;
@@ -353,7 +356,7 @@ void rds_cong_remove_socket(struct rds_sock *rs)
353 356
354 /* update congestion map for now-closed port */ 357 /* update congestion map for now-closed port */
355 spin_lock_irqsave(&rds_cong_lock, flags); 358 spin_lock_irqsave(&rds_cong_lock, flags);
356 map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); 359 map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL);
357 spin_unlock_irqrestore(&rds_cong_lock, flags); 360 spin_unlock_irqrestore(&rds_cong_lock, flags);
358 361
359 if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { 362 if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
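
rds_cong_tree_walk() can no longer compare addresses with < and >, so it calls rds_addr_cmp(), declared in net/rds/rds.h and not shown in this diff. Any consistent total order on in6_addr keeps the rb-tree correct; the byte-wise comparison below is only an illustration of the idea, not the kernel's implementation.

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

/* Returns <0, 0 or >0, which is all the rb-tree descent above needs. */
static int addr_cmp(const struct in6_addr *a, const struct in6_addr *b)
{
	return memcmp(a, b, sizeof(*a));
}

int main(void)
{
	struct in6_addr a, b;

	inet_pton(AF_INET6, "::1", &a);
	inet_pton(AF_INET6, "::2", &b);
	printf("cmp(::1, ::2) = %d\n", addr_cmp(&a, &b));	/* negative */
	return 0;
}
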
diff --git a/net/rds/connection.c b/net/rds/connection.c
index abef75da89a7..3bd2f4a5a30d 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -34,7 +34,9 @@
34#include <linux/list.h> 34#include <linux/list.h>
35#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/export.h> 36#include <linux/export.h>
37#include <net/inet_hashtables.h> 37#include <net/ipv6.h>
38#include <net/inet6_hashtables.h>
39#include <net/addrconf.h>
38 40
39#include "rds.h" 41#include "rds.h"
40#include "loop.h" 42#include "loop.h"
@@ -49,18 +51,25 @@ static unsigned long rds_conn_count;
49static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; 51static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
50static struct kmem_cache *rds_conn_slab; 52static struct kmem_cache *rds_conn_slab;
51 53
52static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) 54static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
55 const struct in6_addr *faddr)
53{ 56{
57 static u32 rds6_hash_secret __read_mostly;
54 static u32 rds_hash_secret __read_mostly; 58 static u32 rds_hash_secret __read_mostly;
55 59
56 unsigned long hash; 60 u32 lhash, fhash, hash;
57 61
58 net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); 62 net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
63 net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));
64
65 lhash = (__force u32)laddr->s6_addr32[3];
66#if IS_ENABLED(CONFIG_IPV6)
67 fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
68#else
69 fhash = (__force u32)faddr->s6_addr32[3];
70#endif
71 hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
59 72
60 /* Pass NULL, don't need struct net for hash */
61 hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
62 be32_to_cpu(faddr), 0,
63 rds_hash_secret);
64 return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; 73 return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
65} 74}
66 75
@@ -72,20 +81,25 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
72/* rcu read lock must be held or the connection spinlock */ 81/* rcu read lock must be held or the connection spinlock */
73static struct rds_connection *rds_conn_lookup(struct net *net, 82static struct rds_connection *rds_conn_lookup(struct net *net,
74 struct hlist_head *head, 83 struct hlist_head *head,
75 __be32 laddr, __be32 faddr, 84 const struct in6_addr *laddr,
76 struct rds_transport *trans) 85 const struct in6_addr *faddr,
86 struct rds_transport *trans,
87 int dev_if)
77{ 88{
78 struct rds_connection *conn, *ret = NULL; 89 struct rds_connection *conn, *ret = NULL;
79 90
80 hlist_for_each_entry_rcu(conn, head, c_hash_node) { 91 hlist_for_each_entry_rcu(conn, head, c_hash_node) {
81 if (conn->c_faddr == faddr && conn->c_laddr == laddr && 92 if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
82 conn->c_trans == trans && net == rds_conn_net(conn)) { 93 ipv6_addr_equal(&conn->c_laddr, laddr) &&
94 conn->c_trans == trans &&
95 net == rds_conn_net(conn) &&
96 conn->c_dev_if == dev_if) {
83 ret = conn; 97 ret = conn;
84 break; 98 break;
85 } 99 }
86 } 100 }
87 rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret, 101 rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret,
88 &laddr, &faddr); 102 laddr, faddr);
89 return ret; 103 return ret;
90} 104}
91 105
@@ -99,8 +113,8 @@ static void rds_conn_path_reset(struct rds_conn_path *cp)
99{ 113{
100 struct rds_connection *conn = cp->cp_conn; 114 struct rds_connection *conn = cp->cp_conn;
101 115
102 rdsdebug("connection %pI4 to %pI4 reset\n", 116 rdsdebug("connection %pI6c to %pI6c reset\n",
103 &conn->c_laddr, &conn->c_faddr); 117 &conn->c_laddr, &conn->c_faddr);
104 118
105 rds_stats_inc(s_conn_reset); 119 rds_stats_inc(s_conn_reset);
106 rds_send_path_reset(cp); 120 rds_send_path_reset(cp);
@@ -142,9 +156,12 @@ static void __rds_conn_path_init(struct rds_connection *conn,
142 * are torn down as the module is removed, if ever. 156 * are torn down as the module is removed, if ever.
143 */ 157 */
144static struct rds_connection *__rds_conn_create(struct net *net, 158static struct rds_connection *__rds_conn_create(struct net *net,
145 __be32 laddr, __be32 faddr, 159 const struct in6_addr *laddr,
146 struct rds_transport *trans, gfp_t gfp, 160 const struct in6_addr *faddr,
147 int is_outgoing) 161 struct rds_transport *trans,
162 gfp_t gfp,
163 int is_outgoing,
164 int dev_if)
148{ 165{
149 struct rds_connection *conn, *parent = NULL; 166 struct rds_connection *conn, *parent = NULL;
150 struct hlist_head *head = rds_conn_bucket(laddr, faddr); 167 struct hlist_head *head = rds_conn_bucket(laddr, faddr);
@@ -154,9 +171,12 @@ static struct rds_connection *__rds_conn_create(struct net *net,
154 int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); 171 int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
155 172
156 rcu_read_lock(); 173 rcu_read_lock();
157 conn = rds_conn_lookup(net, head, laddr, faddr, trans); 174 conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
158 if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && 175 if (conn &&
159 laddr == faddr && !is_outgoing) { 176 conn->c_loopback &&
177 conn->c_trans != &rds_loop_transport &&
178 ipv6_addr_equal(laddr, faddr) &&
179 !is_outgoing) {
160 /* This is a looped back IB connection, and we're 180 /* This is a looped back IB connection, and we're
161 * called by the code handling the incoming connect. 181 * called by the code handling the incoming connect.
162 * We need a second connection object into which we 182 * We need a second connection object into which we
@@ -181,8 +201,22 @@ static struct rds_connection *__rds_conn_create(struct net *net,
181 } 201 }
182 202
183 INIT_HLIST_NODE(&conn->c_hash_node); 203 INIT_HLIST_NODE(&conn->c_hash_node);
184 conn->c_laddr = laddr; 204 conn->c_laddr = *laddr;
185 conn->c_faddr = faddr; 205 conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
206 conn->c_faddr = *faddr;
207 conn->c_dev_if = dev_if;
208
209#if IS_ENABLED(CONFIG_IPV6)
210 /* If the local address is link local, set c_bound_if to be the
211 * index used for this connection. Otherwise, set it to 0 as
212 * the socket is not bound to an interface. c_bound_if is used
213 * to look up a socket when a packet is received
214 */
215 if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL)
216 conn->c_bound_if = dev_if;
217 else
218#endif
219 conn->c_bound_if = 0;
186 220
187 rds_conn_net_set(conn, net); 221 rds_conn_net_set(conn, net);
188 222
@@ -199,7 +233,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
199 * can bind to the destination address then we'd rather the messages 233 * can bind to the destination address then we'd rather the messages
200 * flow through loopback rather than either transport. 234 * flow through loopback rather than either transport.
201 */ 235 */
202 loop_trans = rds_trans_get_preferred(net, faddr); 236 loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
203 if (loop_trans) { 237 if (loop_trans) {
204 rds_trans_put(loop_trans); 238 rds_trans_put(loop_trans);
205 conn->c_loopback = 1; 239 conn->c_loopback = 1;
@@ -233,10 +267,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
233 goto out; 267 goto out;
234 } 268 }
235 269
236 rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", 270 rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
237 conn, &laddr, &faddr, 271 conn, laddr, faddr,
238 strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : 272 strnlen(trans->t_name, sizeof(trans->t_name)) ?
239 "[unknown]", is_outgoing ? "(outgoing)" : ""); 273 trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");
240 274
241 /* 275 /*
242 * Since we ran without holding the conn lock, someone could 276 * Since we ran without holding the conn lock, someone could
@@ -262,7 +296,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
262 /* Creating normal conn */ 296 /* Creating normal conn */
263 struct rds_connection *found; 297 struct rds_connection *found;
264 298
265 found = rds_conn_lookup(net, head, laddr, faddr, trans); 299 found = rds_conn_lookup(net, head, laddr, faddr, trans,
300 dev_if);
266 if (found) { 301 if (found) {
267 struct rds_conn_path *cp; 302 struct rds_conn_path *cp;
268 int i; 303 int i;
@@ -295,18 +330,22 @@ out:
295} 330}
296 331
297struct rds_connection *rds_conn_create(struct net *net, 332struct rds_connection *rds_conn_create(struct net *net,
298 __be32 laddr, __be32 faddr, 333 const struct in6_addr *laddr,
299 struct rds_transport *trans, gfp_t gfp) 334 const struct in6_addr *faddr,
335 struct rds_transport *trans, gfp_t gfp,
336 int dev_if)
300{ 337{
301 return __rds_conn_create(net, laddr, faddr, trans, gfp, 0); 338 return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
302} 339}
303EXPORT_SYMBOL_GPL(rds_conn_create); 340EXPORT_SYMBOL_GPL(rds_conn_create);
304 341
305struct rds_connection *rds_conn_create_outgoing(struct net *net, 342struct rds_connection *rds_conn_create_outgoing(struct net *net,
306 __be32 laddr, __be32 faddr, 343 const struct in6_addr *laddr,
307 struct rds_transport *trans, gfp_t gfp) 344 const struct in6_addr *faddr,
345 struct rds_transport *trans,
346 gfp_t gfp, int dev_if)
308{ 347{
309 return __rds_conn_create(net, laddr, faddr, trans, gfp, 1); 348 return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
310} 349}
311EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); 350EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
312 351
@@ -464,10 +503,23 @@ void rds_conn_destroy(struct rds_connection *conn)
464} 503}
465EXPORT_SYMBOL_GPL(rds_conn_destroy); 504EXPORT_SYMBOL_GPL(rds_conn_destroy);
466 505
467static void rds_conn_message_info(struct socket *sock, unsigned int len, 506static void __rds_inc_msg_cp(struct rds_incoming *inc,
468 struct rds_info_iterator *iter, 507 struct rds_info_iterator *iter,
469 struct rds_info_lengths *lens, 508 void *saddr, void *daddr, int flip, bool isv6)
470 int want_send) 509{
510#if IS_ENABLED(CONFIG_IPV6)
511 if (isv6)
512 rds6_inc_info_copy(inc, iter, saddr, daddr, flip);
513 else
514#endif
515 rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
516 *(__be32 *)daddr, flip);
517}
518
519static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
520 struct rds_info_iterator *iter,
521 struct rds_info_lengths *lens,
522 int want_send, bool isv6)
471{ 523{
472 struct hlist_head *head; 524 struct hlist_head *head;
473 struct list_head *list; 525 struct list_head *list;
@@ -478,7 +530,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
478 size_t i; 530 size_t i;
479 int j; 531 int j;
480 532
481 len /= sizeof(struct rds_info_message); 533 if (isv6)
534 len /= sizeof(struct rds6_info_message);
535 else
536 len /= sizeof(struct rds_info_message);
482 537
483 rcu_read_lock(); 538 rcu_read_lock();
484 539
@@ -488,6 +543,9 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
488 struct rds_conn_path *cp; 543 struct rds_conn_path *cp;
489 int npaths; 544 int npaths;
490 545
546 if (!isv6 && conn->c_isv6)
547 continue;
548
491 npaths = (conn->c_trans->t_mp_capable ? 549 npaths = (conn->c_trans->t_mp_capable ?
492 RDS_MPATH_WORKERS : 1); 550 RDS_MPATH_WORKERS : 1);
493 551
@@ -504,11 +562,11 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
504 list_for_each_entry(rm, list, m_conn_item) { 562 list_for_each_entry(rm, list, m_conn_item) {
505 total++; 563 total++;
506 if (total <= len) 564 if (total <= len)
507 rds_inc_info_copy(&rm->m_inc, 565 __rds_inc_msg_cp(&rm->m_inc,
508 iter, 566 iter,
509 conn->c_laddr, 567 &conn->c_laddr,
510 conn->c_faddr, 568 &conn->c_faddr,
511 0); 569 0, isv6);
512 } 570 }
513 571
514 spin_unlock_irqrestore(&cp->cp_lock, flags); 572 spin_unlock_irqrestore(&cp->cp_lock, flags);
@@ -518,9 +576,30 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
518 rcu_read_unlock(); 576 rcu_read_unlock();
519 577
520 lens->nr = total; 578 lens->nr = total;
521 lens->each = sizeof(struct rds_info_message); 579 if (isv6)
580 lens->each = sizeof(struct rds6_info_message);
581 else
582 lens->each = sizeof(struct rds_info_message);
522} 583}
523 584
585static void rds_conn_message_info(struct socket *sock, unsigned int len,
586 struct rds_info_iterator *iter,
587 struct rds_info_lengths *lens,
588 int want_send)
589{
590 rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
591}
592
593#if IS_ENABLED(CONFIG_IPV6)
594static void rds6_conn_message_info(struct socket *sock, unsigned int len,
595 struct rds_info_iterator *iter,
596 struct rds_info_lengths *lens,
597 int want_send)
598{
599 rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true);
600}
601#endif
602
524static void rds_conn_message_info_send(struct socket *sock, unsigned int len, 603static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
525 struct rds_info_iterator *iter, 604 struct rds_info_iterator *iter,
526 struct rds_info_lengths *lens) 605 struct rds_info_lengths *lens)
@@ -528,6 +607,15 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
528 rds_conn_message_info(sock, len, iter, lens, 1); 607 rds_conn_message_info(sock, len, iter, lens, 1);
529} 608}
530 609
610#if IS_ENABLED(CONFIG_IPV6)
611static void rds6_conn_message_info_send(struct socket *sock, unsigned int len,
612 struct rds_info_iterator *iter,
613 struct rds_info_lengths *lens)
614{
615 rds6_conn_message_info(sock, len, iter, lens, 1);
616}
617#endif
618
531static void rds_conn_message_info_retrans(struct socket *sock, 619static void rds_conn_message_info_retrans(struct socket *sock,
532 unsigned int len, 620 unsigned int len,
533 struct rds_info_iterator *iter, 621 struct rds_info_iterator *iter,
@@ -536,6 +624,16 @@ static void rds_conn_message_info_retrans(struct socket *sock,
536 rds_conn_message_info(sock, len, iter, lens, 0); 624 rds_conn_message_info(sock, len, iter, lens, 0);
537} 625}
538 626
627#if IS_ENABLED(CONFIG_IPV6)
628static void rds6_conn_message_info_retrans(struct socket *sock,
629 unsigned int len,
630 struct rds_info_iterator *iter,
631 struct rds_info_lengths *lens)
632{
633 rds6_conn_message_info(sock, len, iter, lens, 0);
634}
635#endif
636
539void rds_for_each_conn_info(struct socket *sock, unsigned int len, 637void rds_for_each_conn_info(struct socket *sock, unsigned int len,
540 struct rds_info_iterator *iter, 638 struct rds_info_iterator *iter,
541 struct rds_info_lengths *lens, 639 struct rds_info_lengths *lens,
@@ -584,7 +682,6 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
584 struct hlist_head *head; 682 struct hlist_head *head;
585 struct rds_connection *conn; 683 struct rds_connection *conn;
586 size_t i; 684 size_t i;
587 int j;
588 685
589 rcu_read_lock(); 686 rcu_read_lock();
590 687
@@ -595,17 +692,20 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
595 i++, head++) { 692 i++, head++) {
596 hlist_for_each_entry_rcu(conn, head, c_hash_node) { 693 hlist_for_each_entry_rcu(conn, head, c_hash_node) {
597 struct rds_conn_path *cp; 694 struct rds_conn_path *cp;
598 int npaths;
599 695
600 npaths = (conn->c_trans->t_mp_capable ? 696 /* XXX We only copy the information from the first
601 RDS_MPATH_WORKERS : 1); 697 * path for now. The problem is that if there are
602 for (j = 0; j < npaths; j++) { 698 * more than one underlying paths, we cannot report
603 cp = &conn->c_path[j]; 699 * information of all of them using the existing
700 * API. For example, there is only one next_tx_seq,
701 * which path's next_tx_seq should we report? It is
702 * a bug in the design of MPRDS.
703 */
704 cp = conn->c_path;
604 705
605 /* XXX no cp_lock usage.. */ 706 /* XXX no cp_lock usage.. */
606 if (!visitor(cp, buffer)) 707 if (!visitor(cp, buffer))
607 continue; 708 continue;
608 }
609 709
610 /* We copy as much as we can fit in the buffer, 710 /* We copy as much as we can fit in the buffer,
611 * but we count all items so that the caller 711 * but we count all items so that the caller
@@ -624,12 +724,16 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
624static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) 724static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
625{ 725{
626 struct rds_info_connection *cinfo = buffer; 726 struct rds_info_connection *cinfo = buffer;
727 struct rds_connection *conn = cp->cp_conn;
728
729 if (conn->c_isv6)
730 return 0;
627 731
628 cinfo->next_tx_seq = cp->cp_next_tx_seq; 732 cinfo->next_tx_seq = cp->cp_next_tx_seq;
629 cinfo->next_rx_seq = cp->cp_next_rx_seq; 733 cinfo->next_rx_seq = cp->cp_next_rx_seq;
630 cinfo->laddr = cp->cp_conn->c_laddr; 734 cinfo->laddr = conn->c_laddr.s6_addr32[3];
631 cinfo->faddr = cp->cp_conn->c_faddr; 735 cinfo->faddr = conn->c_faddr.s6_addr32[3];
632 strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name, 736 strncpy(cinfo->transport, conn->c_trans->t_name,
633 sizeof(cinfo->transport)); 737 sizeof(cinfo->transport));
634 cinfo->flags = 0; 738 cinfo->flags = 0;
635 739
@@ -645,6 +749,36 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
645 return 1; 749 return 1;
646} 750}
647 751
752#if IS_ENABLED(CONFIG_IPV6)
753static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
754{
755 struct rds6_info_connection *cinfo6 = buffer;
756 struct rds_connection *conn = cp->cp_conn;
757
758 cinfo6->next_tx_seq = cp->cp_next_tx_seq;
759 cinfo6->next_rx_seq = cp->cp_next_rx_seq;
760 cinfo6->laddr = conn->c_laddr;
761 cinfo6->faddr = conn->c_faddr;
762 strncpy(cinfo6->transport, conn->c_trans->t_name,
763 sizeof(cinfo6->transport));
764 cinfo6->flags = 0;
765
766 rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
767 SENDING);
768 /* XXX Future: return the state rather than these funky bits */
769 rds_conn_info_set(cinfo6->flags,
770 atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
771 CONNECTING);
772 rds_conn_info_set(cinfo6->flags,
773 atomic_read(&cp->cp_state) == RDS_CONN_UP,
774 CONNECTED);
775 /* Just return 1 as there is no error case. This is a helper function
776 * for rds_walk_conn_path_info() and it wants a return value.
777 */
778 return 1;
779}
780#endif
781
648static void rds_conn_info(struct socket *sock, unsigned int len, 782static void rds_conn_info(struct socket *sock, unsigned int len,
649 struct rds_info_iterator *iter, 783 struct rds_info_iterator *iter,
650 struct rds_info_lengths *lens) 784 struct rds_info_lengths *lens)
@@ -657,25 +791,54 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
657 sizeof(struct rds_info_connection)); 791 sizeof(struct rds_info_connection));
658} 792}
659 793
794#if IS_ENABLED(CONFIG_IPV6)
795static void rds6_conn_info(struct socket *sock, unsigned int len,
796 struct rds_info_iterator *iter,
797 struct rds_info_lengths *lens)
798{
799 u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8];
800
801 rds_walk_conn_path_info(sock, len, iter, lens,
802 rds6_conn_info_visitor,
803 buffer,
804 sizeof(struct rds6_info_connection));
805}
806#endif
807
660int rds_conn_init(void) 808int rds_conn_init(void)
661{ 809{
810 int ret;
811
812 ret = rds_loop_net_init(); /* register pernet callback */
813 if (ret)
814 return ret;
815
662 rds_conn_slab = kmem_cache_create("rds_connection", 816 rds_conn_slab = kmem_cache_create("rds_connection",
663 sizeof(struct rds_connection), 817 sizeof(struct rds_connection),
664 0, 0, NULL); 818 0, 0, NULL);
665 if (!rds_conn_slab) 819 if (!rds_conn_slab) {
820 rds_loop_net_exit();
666 return -ENOMEM; 821 return -ENOMEM;
822 }
667 823
668 rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); 824 rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
669 rds_info_register_func(RDS_INFO_SEND_MESSAGES, 825 rds_info_register_func(RDS_INFO_SEND_MESSAGES,
670 rds_conn_message_info_send); 826 rds_conn_message_info_send);
671 rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, 827 rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
672 rds_conn_message_info_retrans); 828 rds_conn_message_info_retrans);
673 829#if IS_ENABLED(CONFIG_IPV6)
830 rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
831 rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
832 rds6_conn_message_info_send);
833 rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
834 rds6_conn_message_info_retrans);
835#endif
674 return 0; 836 return 0;
675} 837}
676 838
677void rds_conn_exit(void) 839void rds_conn_exit(void)
678{ 840{
841 rds_loop_net_exit(); /* unregister pernet callback */
679 rds_loop_exit(); 842 rds_loop_exit();
680 843
681 WARN_ON(!hlist_empty(rds_conn_hash)); 844 WARN_ON(!hlist_empty(rds_conn_hash));
@@ -687,6 +850,13 @@ void rds_conn_exit(void)
687 rds_conn_message_info_send); 850 rds_conn_message_info_send);
688 rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, 851 rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
689 rds_conn_message_info_retrans); 852 rds_conn_message_info_retrans);
853#if IS_ENABLED(CONFIG_IPV6)
854 rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
855 rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
856 rds6_conn_message_info_send);
857 rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
858 rds6_conn_message_info_retrans);
859#endif
690} 860}
691 861
692/* 862/*
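
rds_conn_lookup() now matches on the interface index as well, because two link-local peers reached over different interfaces can present identical <laddr, faddr> pairs. A standalone sketch of that widened key, not taken from the patch:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct conn_key {
	struct in6_addr laddr;
	struct in6_addr faddr;
	int dev_if;		/* 0 unless the addresses are link-local */
};

static bool conn_key_equal(const struct conn_key *a, const struct conn_key *b)
{
	return !memcmp(&a->laddr, &b->laddr, sizeof(a->laddr)) &&
	       !memcmp(&a->faddr, &b->faddr, sizeof(a->faddr)) &&
	       a->dev_if == b->dev_if;
}

int main(void)
{
	struct conn_key a = { .dev_if = 2 }, b = { .dev_if = 3 };

	inet_pton(AF_INET6, "fe80::1", &a.laddr);
	inet_pton(AF_INET6, "fe80::2", &a.faddr);
	b.laddr = a.laddr;
	b.faddr = a.faddr;
	/* Same addresses, different interfaces: distinct connections. */
	printf("equal: %d\n", conn_key_equal(&a, &b));
	return 0;
}
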
diff --git a/net/rds/ib.c b/net/rds/ib.c
index b6ad38e48f62..c1d97640c0be 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -39,6 +39,7 @@
39#include <linux/delay.h> 39#include <linux/delay.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/module.h> 41#include <linux/module.h>
42#include <net/addrconf.h>
42 43
43#include "rds_single_path.h" 44#include "rds_single_path.h"
44#include "rds.h" 45#include "rds.h"
@@ -143,7 +144,7 @@ static void rds_ib_add_one(struct ib_device *device)
143 INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); 144 INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
144 145
145 rds_ibdev->max_wrs = device->attrs.max_qp_wr; 146 rds_ibdev->max_wrs = device->attrs.max_qp_wr;
146 rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); 147 rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE);
147 148
148 has_fr = (device->attrs.device_cap_flags & 149 has_fr = (device->attrs.device_cap_flags &
149 IB_DEVICE_MEM_MGT_EXTENSIONS); 150 IB_DEVICE_MEM_MGT_EXTENSIONS);
@@ -295,9 +296,11 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
295 /* We will only ever look at IB transports */ 296 /* We will only ever look at IB transports */
296 if (conn->c_trans != &rds_ib_transport) 297 if (conn->c_trans != &rds_ib_transport)
297 return 0; 298 return 0;
299 if (conn->c_isv6)
300 return 0;
298 301
299 iinfo->src_addr = conn->c_laddr; 302 iinfo->src_addr = conn->c_laddr.s6_addr32[3];
300 iinfo->dst_addr = conn->c_faddr; 303 iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
301 304
302 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); 305 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
303 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); 306 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
@@ -318,6 +321,45 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
318 return 1; 321 return 1;
319} 322}
320 323
324#if IS_ENABLED(CONFIG_IPV6)
325/* IPv6 version of rds_ib_conn_info_visitor(). */
326static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
327 void *buffer)
328{
329 struct rds6_info_rdma_connection *iinfo6 = buffer;
330 struct rds_ib_connection *ic;
331
332 /* We will only ever look at IB transports */
333 if (conn->c_trans != &rds_ib_transport)
334 return 0;
335
336 iinfo6->src_addr = conn->c_laddr;
337 iinfo6->dst_addr = conn->c_faddr;
338
339 memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid));
340 memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid));
341
342 if (rds_conn_state(conn) == RDS_CONN_UP) {
343 struct rds_ib_device *rds_ibdev;
344 struct rdma_dev_addr *dev_addr;
345
346 ic = conn->c_transport_data;
347 dev_addr = &ic->i_cm_id->route.addr.dev_addr;
348 rdma_addr_get_sgid(dev_addr,
349 (union ib_gid *)&iinfo6->src_gid);
350 rdma_addr_get_dgid(dev_addr,
351 (union ib_gid *)&iinfo6->dst_gid);
352
353 rds_ibdev = ic->rds_ibdev;
354 iinfo6->max_send_wr = ic->i_send_ring.w_nr;
355 iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
356 iinfo6->max_send_sge = rds_ibdev->max_sge;
357 rds6_ib_get_mr_info(rds_ibdev, iinfo6);
358 }
359 return 1;
360}
361#endif
362
321static void rds_ib_ic_info(struct socket *sock, unsigned int len, 363static void rds_ib_ic_info(struct socket *sock, unsigned int len,
322 struct rds_info_iterator *iter, 364 struct rds_info_iterator *iter,
323 struct rds_info_lengths *lens) 365 struct rds_info_lengths *lens)
@@ -330,6 +372,20 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
330 sizeof(struct rds_info_rdma_connection)); 372 sizeof(struct rds_info_rdma_connection));
331} 373}
332 374
375#if IS_ENABLED(CONFIG_IPV6)
376/* IPv6 version of rds_ib_ic_info(). */
377static void rds6_ib_ic_info(struct socket *sock, unsigned int len,
378 struct rds_info_iterator *iter,
379 struct rds_info_lengths *lens)
380{
381 u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8];
382
383 rds_for_each_conn_info(sock, len, iter, lens,
384 rds6_ib_conn_info_visitor,
385 buffer,
386 sizeof(struct rds6_info_rdma_connection));
387}
388#endif
333 389
334/* 390/*
335 * Early RDS/IB was built to only bind to an address if there is an IPoIB 391 * Early RDS/IB was built to only bind to an address if there is an IPoIB
@@ -341,12 +397,19 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
341 * allowed to influence which paths have priority. We could call userspace 397 * allowed to influence which paths have priority. We could call userspace
342 * asserting this policy "routing". 398 * asserting this policy "routing".
343 */ 399 */
344static int rds_ib_laddr_check(struct net *net, __be32 addr) 400static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
401 __u32 scope_id)
345{ 402{
346 int ret; 403 int ret;
347 struct rdma_cm_id *cm_id; 404 struct rdma_cm_id *cm_id;
405#if IS_ENABLED(CONFIG_IPV6)
406 struct sockaddr_in6 sin6;
407#endif
348 struct sockaddr_in sin; 408 struct sockaddr_in sin;
409 struct sockaddr *sa;
410 bool isv4;
349 411
412 isv4 = ipv6_addr_v4mapped(addr);
350 /* Create a CMA ID and try to bind it. This catches both 413 /* Create a CMA ID and try to bind it. This catches both
351 * IB and iWARP capable NICs. 414 * IB and iWARP capable NICs.
352 */ 415 */
@@ -355,22 +418,66 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
355 if (IS_ERR(cm_id)) 418 if (IS_ERR(cm_id))
356 return PTR_ERR(cm_id); 419 return PTR_ERR(cm_id);
357 420
358 memset(&sin, 0, sizeof(sin)); 421 if (isv4) {
359 sin.sin_family = AF_INET; 422 memset(&sin, 0, sizeof(sin));
360 sin.sin_addr.s_addr = addr; 423 sin.sin_family = AF_INET;
424 sin.sin_addr.s_addr = addr->s6_addr32[3];
425 sa = (struct sockaddr *)&sin;
426 } else {
427#if IS_ENABLED(CONFIG_IPV6)
428 memset(&sin6, 0, sizeof(sin6));
429 sin6.sin6_family = AF_INET6;
430 sin6.sin6_addr = *addr;
431 sin6.sin6_scope_id = scope_id;
432 sa = (struct sockaddr *)&sin6;
433
434 /* XXX Do a special IPv6 link local address check here. The
435 * reason is that rdma_bind_addr() always succeeds with IPv6
436 * link local address regardless it is indeed configured in a
437 * system.
438 */
439 if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) {
440 struct net_device *dev;
441
442 if (scope_id == 0) {
443 ret = -EADDRNOTAVAIL;
444 goto out;
445 }
446
447 /* Use init_net for now as RDS is not network
448 * name space aware.
449 */
450 dev = dev_get_by_index(&init_net, scope_id);
451 if (!dev) {
452 ret = -EADDRNOTAVAIL;
453 goto out;
454 }
455 if (!ipv6_chk_addr(&init_net, addr, dev, 1)) {
456 dev_put(dev);
457 ret = -EADDRNOTAVAIL;
458 goto out;
459 }
460 dev_put(dev);
461 }
462#else
463 ret = -EADDRNOTAVAIL;
464 goto out;
465#endif
466 }
361 467
362 /* rdma_bind_addr will only succeed for IB & iWARP devices */ 468 /* rdma_bind_addr will only succeed for IB & iWARP devices */
363 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 469 ret = rdma_bind_addr(cm_id, sa);
364 /* due to this, we will claim to support iWARP devices unless we 470 /* due to this, we will claim to support iWARP devices unless we
365 check node_type. */ 471 check node_type. */
366 if (ret || !cm_id->device || 472 if (ret || !cm_id->device ||
367 cm_id->device->node_type != RDMA_NODE_IB_CA) 473 cm_id->device->node_type != RDMA_NODE_IB_CA)
368 ret = -EADDRNOTAVAIL; 474 ret = -EADDRNOTAVAIL;
369 475
370 rdsdebug("addr %pI4 ret %d node type %d\n", 476 rdsdebug("addr %pI6c%%%u ret %d node type %d\n",
371 &addr, ret, 477 addr, scope_id, ret,
372 cm_id->device ? cm_id->device->node_type : -1); 478 cm_id->device ? cm_id->device->node_type : -1);
373 479
480out:
374 rdma_destroy_id(cm_id); 481 rdma_destroy_id(cm_id);
375 482
376 return ret; 483 return ret;
@@ -401,6 +508,9 @@ void rds_ib_exit(void)
401 rds_ib_set_unloading(); 508 rds_ib_set_unloading();
402 synchronize_rcu(); 509 synchronize_rcu();
403 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 510 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
511#if IS_ENABLED(CONFIG_IPV6)
512 rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
513#endif
404 rds_ib_unregister_client(); 514 rds_ib_unregister_client();
405 rds_ib_destroy_nodev_conns(); 515 rds_ib_destroy_nodev_conns();
406 rds_ib_sysctl_exit(); 516 rds_ib_sysctl_exit();
@@ -462,6 +572,9 @@ int rds_ib_init(void)
462 rds_trans_register(&rds_ib_transport); 572 rds_trans_register(&rds_ib_transport);
463 573
464 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 574 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
575#if IS_ENABLED(CONFIG_IPV6)
576 rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
577#endif
465 578
466 goto out; 579 goto out;
467 580
@@ -476,4 +589,3 @@ out:
476} 589}
477 590
478MODULE_LICENSE("GPL"); 591MODULE_LICENSE("GPL");
479
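
rds_ib_laddr_check() now builds either a sockaddr_in (for IPv4-mapped addresses) or a sockaddr_in6 carrying the scope ID before handing it to rdma_bind_addr(), with an extra ipv6_chk_addr() pass for link-local addresses. A userspace approximation of the sockaddr selection, not part of the patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>

/* Unwrap an IPv4-mapped address into a sockaddr_in; bind anything else as a
 * sockaddr_in6 carrying the scope ID (mandatory for link-local addresses).
 */
static socklen_t fill_bind_sockaddr(struct sockaddr_storage *ss,
				    const struct in6_addr *addr,
				    uint32_t scope_id)
{
	if (IN6_IS_ADDR_V4MAPPED(addr)) {
		struct sockaddr_in *sin = (struct sockaddr_in *)ss;

		memset(sin, 0, sizeof(*sin));
		sin->sin_family = AF_INET;
		memcpy(&sin->sin_addr, &addr->s6_addr[12], 4);
		return sizeof(*sin);
	} else {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;

		memset(sin6, 0, sizeof(*sin6));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = *addr;
		sin6->sin6_scope_id = scope_id;
		return sizeof(*sin6);
	}
}

int main(void)
{
	struct sockaddr_storage ss;
	struct in6_addr addr;

	inet_pton(AF_INET6, "::ffff:192.0.2.1", &addr);
	printf("v4-mapped sockaddr is %u bytes\n",
	       (unsigned int)fill_bind_sockaddr(&ss, &addr, 0));
	return 0;
}
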
diff --git a/net/rds/ib.h b/net/rds/ib.h
index a6f4d7d68e95..73427ff439f9 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -57,16 +57,44 @@ struct rds_ib_refill_cache {
57 struct list_head *ready; 57 struct list_head *ready;
58}; 58};
59 59
60/* This is the common structure for the IB private data exchange in setting up
61 * an RDS connection. The exchange is different for IPv4 and IPv6 connections.
62 * The reason is that the address size is different and the addresses
63 * exchanged are in the beginning of the structure. Hence it is not possible
64 * for interoperability if same structure is used.
65 */
66struct rds_ib_conn_priv_cmn {
67 u8 ricpc_protocol_major;
68 u8 ricpc_protocol_minor;
69 __be16 ricpc_protocol_minor_mask; /* bitmask */
70 __be32 ricpc_reserved1;
71 __be64 ricpc_ack_seq;
72 __be32 ricpc_credit; /* non-zero enables flow ctl */
73};
74
60struct rds_ib_connect_private { 75struct rds_ib_connect_private {
61 /* Add new fields at the end, and don't permute existing fields. */ 76 /* Add new fields at the end, and don't permute existing fields. */
62 __be32 dp_saddr; 77 __be32 dp_saddr;
63 __be32 dp_daddr; 78 __be32 dp_daddr;
64 u8 dp_protocol_major; 79 struct rds_ib_conn_priv_cmn dp_cmn;
65 u8 dp_protocol_minor; 80};
66 __be16 dp_protocol_minor_mask; /* bitmask */ 81
67 __be32 dp_reserved1; 82struct rds6_ib_connect_private {
68 __be64 dp_ack_seq; 83 /* Add new fields at the end, and don't permute existing fields. */
69 __be32 dp_credit; /* non-zero enables flow ctl */ 84 struct in6_addr dp_saddr;
85 struct in6_addr dp_daddr;
86 struct rds_ib_conn_priv_cmn dp_cmn;
87};
88
89#define dp_protocol_major dp_cmn.ricpc_protocol_major
90#define dp_protocol_minor dp_cmn.ricpc_protocol_minor
91#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask
92#define dp_ack_seq dp_cmn.ricpc_ack_seq
93#define dp_credit dp_cmn.ricpc_credit
94
95union rds_ib_conn_priv {
96 struct rds_ib_connect_private ricp_v4;
97 struct rds6_ib_connect_private ricp_v6;
70}; 98};
71 99
72struct rds_ib_send_work { 100struct rds_ib_send_work {
@@ -351,8 +379,8 @@ void rds_ib_listen_stop(void);
351__printf(2, 3) 379__printf(2, 3)
352void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); 380void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
353int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 381int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
354 struct rdma_cm_event *event); 382 struct rdma_cm_event *event, bool isv6);
355int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); 383int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
356void rds_ib_cm_connect_complete(struct rds_connection *conn, 384void rds_ib_cm_connect_complete(struct rds_connection *conn,
357 struct rdma_cm_event *event); 385 struct rdma_cm_event *event);
358 386
@@ -361,7 +389,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
361 __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) 389 __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
362 390
363/* ib_rdma.c */ 391/* ib_rdma.c */
364int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); 392int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
393 struct in6_addr *ipaddr);
365void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 394void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
366void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 395void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
367void rds_ib_destroy_nodev_conns(void); 396void rds_ib_destroy_nodev_conns(void);
@@ -371,7 +400,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
371int rds_ib_recv_init(void); 400int rds_ib_recv_init(void);
372void rds_ib_recv_exit(void); 401void rds_ib_recv_exit(void);
373int rds_ib_recv_path(struct rds_conn_path *conn); 402int rds_ib_recv_path(struct rds_conn_path *conn);
374int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); 403int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp);
375void rds_ib_recv_free_caches(struct rds_ib_connection *ic); 404void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
376void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); 405void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp);
377void rds_ib_inc_free(struct rds_incoming *inc); 406void rds_ib_inc_free(struct rds_incoming *inc);
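
The v4 and v6 private-data layouts above share the trailing rds_ib_conn_priv_cmn and differ only in the leading address fields, so a peer's variant can be told apart by the received private_data_len (as ib_cm.c does below). A hypothetical userspace rendering of the same idea; the field names are illustrative and the exact on-wire sizes depend on the kernel's structure padding.

#include <stdint.h>
#include <stdio.h>
#include <netinet/in.h>

struct priv_cmn {
	uint8_t  protocol_major;
	uint8_t  protocol_minor;
	uint16_t protocol_minor_mask;	/* big endian on the wire */
	uint32_t reserved1;
	uint64_t ack_seq;		/* big endian on the wire */
	uint32_t credit;		/* big endian on the wire */
};

struct priv_v4 { uint32_t saddr, daddr; struct priv_cmn cmn; };
struct priv_v6 { struct in6_addr saddr, daddr; struct priv_cmn cmn; };

/* One buffer can receive either variant; the length tells them apart. */
union conn_priv { struct priv_v4 v4; struct priv_v6 v6; };

int main(void)
{
	printf("v4 private data: %zu bytes, v6 private data: %zu bytes\n",
	       sizeof(struct priv_v4), sizeof(struct priv_v6));
	return 0;
}
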
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f1684ae6abfd..bfbb31f0c7fd 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
35#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/vmalloc.h> 36#include <linux/vmalloc.h>
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <net/addrconf.h>
38 39
39#include "rds_single_path.h" 40#include "rds_single_path.h"
40#include "rds.h" 41#include "rds.h"
@@ -95,25 +96,45 @@ rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
95 */ 96 */
96void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) 97void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
97{ 98{
98 const struct rds_ib_connect_private *dp = NULL;
99 struct rds_ib_connection *ic = conn->c_transport_data; 99 struct rds_ib_connection *ic = conn->c_transport_data;
100 const union rds_ib_conn_priv *dp = NULL;
100 struct ib_qp_attr qp_attr; 101 struct ib_qp_attr qp_attr;
102 __be64 ack_seq = 0;
103 __be32 credit = 0;
104 u8 major = 0;
105 u8 minor = 0;
101 int err; 106 int err;
102 107
103 if (event->param.conn.private_data_len >= sizeof(*dp)) { 108 dp = event->param.conn.private_data;
104 dp = event->param.conn.private_data; 109 if (conn->c_isv6) {
105 110 if (event->param.conn.private_data_len >=
106 /* make sure it isn't empty data */ 111 sizeof(struct rds6_ib_connect_private)) {
107 if (dp->dp_protocol_major) { 112 major = dp->ricp_v6.dp_protocol_major;
108 rds_ib_set_protocol(conn, 113 minor = dp->ricp_v6.dp_protocol_minor;
109 RDS_PROTOCOL(dp->dp_protocol_major, 114 credit = dp->ricp_v6.dp_credit;
110 dp->dp_protocol_minor)); 115 /* dp structure start is not guaranteed to be 8 bytes
111 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); 116 * aligned. Since dp_ack_seq is 64-bit extended load
117 * operations can be used so go through get_unaligned
118 * to avoid unaligned errors.
119 */
120 ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
112 } 121 }
122 } else if (event->param.conn.private_data_len >=
123 sizeof(struct rds_ib_connect_private)) {
124 major = dp->ricp_v4.dp_protocol_major;
125 minor = dp->ricp_v4.dp_protocol_minor;
126 credit = dp->ricp_v4.dp_credit;
127 ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
128 }
129
130 /* make sure it isn't empty data */
131 if (major) {
132 rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
133 rds_ib_set_flow_control(conn, be32_to_cpu(credit));
113 } 134 }
114 135
115 if (conn->c_version < RDS_PROTOCOL(3, 1)) { 136 if (conn->c_version < RDS_PROTOCOL(3, 1)) {
116 pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n", 137 pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
117 &conn->c_laddr, &conn->c_faddr, 138 &conn->c_laddr, &conn->c_faddr,
118 RDS_PROTOCOL_MAJOR(conn->c_version), 139 RDS_PROTOCOL_MAJOR(conn->c_version),
119 RDS_PROTOCOL_MINOR(conn->c_version)); 140 RDS_PROTOCOL_MINOR(conn->c_version));
@@ -121,7 +142,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
121 rds_conn_destroy(conn); 142 rds_conn_destroy(conn);
122 return; 143 return;
123 } else { 144 } else {
124 pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n", 145 pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n",
125 ic->i_active_side ? "Active" : "Passive", 146 ic->i_active_side ? "Active" : "Passive",
126 &conn->c_laddr, &conn->c_faddr, 147 &conn->c_laddr, &conn->c_faddr,
127 RDS_PROTOCOL_MAJOR(conn->c_version), 148 RDS_PROTOCOL_MAJOR(conn->c_version),
@@ -150,7 +171,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
150 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); 171 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
151 172
152 /* update ib_device with this local ipaddr */ 173 /* update ib_device with this local ipaddr */
153 err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); 174 err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
154 if (err) 175 if (err)
155 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", 176 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
156 err); 177 err);
@@ -158,14 +179,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
158 /* If the peer gave us the last packet it saw, process this as if 179 /* If the peer gave us the last packet it saw, process this as if
159 * we had received a regular ACK. */ 180 * we had received a regular ACK. */
160 if (dp) { 181 if (dp) {
161 /* dp structure start is not guaranteed to be 8 bytes aligned. 182 if (ack_seq)
162 * Since dp_ack_seq is 64-bit extended load operations can be 183 rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
163 * used so go through get_unaligned to avoid unaligned errors.
164 */
165 __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
166
167 if (dp_ack_seq)
168 rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
169 NULL); 184 NULL);
170 } 185 }
171 186
@@ -173,11 +188,12 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
173} 188}
174 189
175static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, 190static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
176 struct rdma_conn_param *conn_param, 191 struct rdma_conn_param *conn_param,
177 struct rds_ib_connect_private *dp, 192 union rds_ib_conn_priv *dp,
178 u32 protocol_version, 193 u32 protocol_version,
179 u32 max_responder_resources, 194 u32 max_responder_resources,
180 u32 max_initiator_depth) 195 u32 max_initiator_depth,
196 bool isv6)
181{ 197{
182 struct rds_ib_connection *ic = conn->c_transport_data; 198 struct rds_ib_connection *ic = conn->c_transport_data;
183 struct rds_ib_device *rds_ibdev = ic->rds_ibdev; 199 struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
@@ -193,24 +209,49 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
193 209
194 if (dp) { 210 if (dp) {
195 memset(dp, 0, sizeof(*dp)); 211 memset(dp, 0, sizeof(*dp));
196 dp->dp_saddr = conn->c_laddr; 212 if (isv6) {
197 dp->dp_daddr = conn->c_faddr; 213 dp->ricp_v6.dp_saddr = conn->c_laddr;
198 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); 214 dp->ricp_v6.dp_daddr = conn->c_faddr;
199 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); 215 dp->ricp_v6.dp_protocol_major =
200 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); 216 RDS_PROTOCOL_MAJOR(protocol_version);
201 dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic)); 217 dp->ricp_v6.dp_protocol_minor =
218 RDS_PROTOCOL_MINOR(protocol_version);
219 dp->ricp_v6.dp_protocol_minor_mask =
220 cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
221 dp->ricp_v6.dp_ack_seq =
222 cpu_to_be64(rds_ib_piggyb_ack(ic));
223
224 conn_param->private_data = &dp->ricp_v6;
225 conn_param->private_data_len = sizeof(dp->ricp_v6);
226 } else {
227 dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3];
228 dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3];
229 dp->ricp_v4.dp_protocol_major =
230 RDS_PROTOCOL_MAJOR(protocol_version);
231 dp->ricp_v4.dp_protocol_minor =
232 RDS_PROTOCOL_MINOR(protocol_version);
233 dp->ricp_v4.dp_protocol_minor_mask =
234 cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
235 dp->ricp_v4.dp_ack_seq =
236 cpu_to_be64(rds_ib_piggyb_ack(ic));
237
238 conn_param->private_data = &dp->ricp_v4;
239 conn_param->private_data_len = sizeof(dp->ricp_v4);
240 }
202 241
203 /* Advertise flow control */ 242 /* Advertise flow control */
204 if (ic->i_flowctl) { 243 if (ic->i_flowctl) {
205 unsigned int credits; 244 unsigned int credits;
206 245
207 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); 246 credits = IB_GET_POST_CREDITS
208 dp->dp_credit = cpu_to_be32(credits); 247 (atomic_read(&ic->i_credits));
209 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); 248 if (isv6)
249 dp->ricp_v6.dp_credit = cpu_to_be32(credits);
250 else
251 dp->ricp_v4.dp_credit = cpu_to_be32(credits);
252 atomic_sub(IB_SET_POST_CREDITS(credits),
253 &ic->i_credits);
210 } 254 }
211
212 conn_param->private_data = dp;
213 conn_param->private_data_len = sizeof(*dp);
214 } 255 }
215} 256}
216 257
@@ -349,7 +390,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
349 break; 390 break;
350 default: 391 default:
351 rdsdebug("Fatal QP Event %u (%s) " 392 rdsdebug("Fatal QP Event %u (%s) "
352 "- connection %pI4->%pI4, reconnecting\n", 393 "- connection %pI6c->%pI6c, reconnecting\n",
353 event->event, ib_event_msg(event->event), 394 event->event, ib_event_msg(event->event),
354 &conn->c_laddr, &conn->c_faddr); 395 &conn->c_laddr, &conn->c_faddr);
355 rds_conn_drop(conn); 396 rds_conn_drop(conn);
@@ -580,11 +621,13 @@ out:
580 return ret; 621 return ret;
581} 622}
582 623
583static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) 624static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
584{ 625{
585 const struct rds_ib_connect_private *dp = event->param.conn.private_data; 626 const union rds_ib_conn_priv *dp = event->param.conn.private_data;
586 u16 common; 627 u8 data_len, major, minor;
587 u32 version = 0; 628 u32 version = 0;
629 __be16 mask;
630 u16 common;
588 631
589 /* 632 /*
590 * rdma_cm private data is odd - when there is any private data in the 633 * rdma_cm private data is odd - when there is any private data in the
@@ -603,51 +646,140 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
603 return 0; 646 return 0;
604 } 647 }
605 648
649 if (isv6) {
650 data_len = sizeof(struct rds6_ib_connect_private);
651 major = dp->ricp_v6.dp_protocol_major;
652 minor = dp->ricp_v6.dp_protocol_minor;
653 mask = dp->ricp_v6.dp_protocol_minor_mask;
654 } else {
655 data_len = sizeof(struct rds_ib_connect_private);
656 major = dp->ricp_v4.dp_protocol_major;
657 minor = dp->ricp_v4.dp_protocol_minor;
658 mask = dp->ricp_v4.dp_protocol_minor_mask;
659 }
660
606 /* Even if len is crap *now* I still want to check it. -ASG */ 661 /* Even if len is crap *now* I still want to check it. -ASG */
607 if (event->param.conn.private_data_len < sizeof (*dp) || 662 if (event->param.conn.private_data_len < data_len || major == 0)
608 dp->dp_protocol_major == 0)
609 return RDS_PROTOCOL_3_0; 663 return RDS_PROTOCOL_3_0;
610 664
611 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; 665 common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
612 if (dp->dp_protocol_major == 3 && common) { 666 if (major == 3 && common) {
613 version = RDS_PROTOCOL_3_0; 667 version = RDS_PROTOCOL_3_0;
614 while ((common >>= 1) != 0) 668 while ((common >>= 1) != 0)
615 version++; 669 version++;
616 } else 670 } else {
617 printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", 671 if (isv6)
618 &dp->dp_saddr, 672 printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
619 dp->dp_protocol_major, 673 &dp->ricp_v6.dp_saddr, major, minor);
620 dp->dp_protocol_minor); 674 else
675 printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
676 &dp->ricp_v4.dp_saddr, major, minor);
677 }
621 return version; 678 return version;
622} 679}
623 680
681#if IS_ENABLED(CONFIG_IPV6)
682/* Given an IPv6 address, find the net_device which hosts that address and
683 * return its index. This is used by the rds_ib_cm_handle_connect() code to
684 * find the interface index of where an incoming request comes from when
685 * the request is using a link local address.
686 *
687 * Note one problem in this search. It is possible that two interfaces have
688 * the same link local address. Unfortunately, this cannot be solved unless
689 * the underlying layer gives us the interface which an incoming RDMA connect
690 * request comes from.
691 */
692static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
693{
694 struct net_device *dev;
695 int idx = 0;
696
697 rcu_read_lock();
698 for_each_netdev_rcu(net, dev) {
699 if (ipv6_chk_addr(net, addr, dev, 1)) {
700 idx = dev->ifindex;
701 break;
702 }
703 }
704 rcu_read_unlock();
705
706 return idx;
707}
708#endif
709
624int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 710int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
625 struct rdma_cm_event *event) 711 struct rdma_cm_event *event, bool isv6)
626{ 712{
627 __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; 713 __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
628 __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; 714 __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
629 const struct rds_ib_connect_private *dp = event->param.conn.private_data; 715 const struct rds_ib_conn_priv_cmn *dp_cmn;
630 struct rds_ib_connect_private dp_rep;
631 struct rds_connection *conn = NULL; 716 struct rds_connection *conn = NULL;
632 struct rds_ib_connection *ic = NULL; 717 struct rds_ib_connection *ic = NULL;
633 struct rdma_conn_param conn_param; 718 struct rdma_conn_param conn_param;
719 const union rds_ib_conn_priv *dp;
720 union rds_ib_conn_priv dp_rep;
721 struct in6_addr s_mapped_addr;
722 struct in6_addr d_mapped_addr;
723 const struct in6_addr *saddr6;
724 const struct in6_addr *daddr6;
725 int destroy = 1;
726 u32 ifindex = 0;
634 u32 version; 727 u32 version;
635 int err = 1, destroy = 1; 728 int err = 1;
636 729
637 /* Check whether the remote protocol version matches ours. */ 730 /* Check whether the remote protocol version matches ours. */
638 version = rds_ib_protocol_compatible(event); 731 version = rds_ib_protocol_compatible(event, isv6);
639 if (!version) 732 if (!version)
640 goto out; 733 goto out;
641 734
642 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " 735 dp = event->param.conn.private_data;
643 "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, 736 if (isv6) {
737#if IS_ENABLED(CONFIG_IPV6)
738 dp_cmn = &dp->ricp_v6.dp_cmn;
739 saddr6 = &dp->ricp_v6.dp_saddr;
740 daddr6 = &dp->ricp_v6.dp_daddr;
741 /* If either address is link local, need to find the
742 * interface index in order to create a proper RDS
743 * connection.
744 */
745 if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
746 /* Using init_net for now .. */
747 ifindex = __rds_find_ifindex(&init_net, daddr6);
748 /* No index found... Need to bail out. */
749 if (ifindex == 0) {
750 err = -EOPNOTSUPP;
751 goto out;
752 }
753 } else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) {
754 /* Use our address to find the correct index. */
755 ifindex = __rds_find_ifindex(&init_net, daddr6);
756 /* No index found... Need to bail out. */
757 if (ifindex == 0) {
758 err = -EOPNOTSUPP;
759 goto out;
760 }
761 }
762#else
763 err = -EOPNOTSUPP;
764 goto out;
765#endif
766 } else {
767 dp_cmn = &dp->ricp_v4.dp_cmn;
768 ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
769 ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
770 saddr6 = &s_mapped_addr;
771 daddr6 = &d_mapped_addr;
772 }
773
774 rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid "
775 "0x%llx\n", saddr6, daddr6,
644 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), 776 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
645 (unsigned long long)be64_to_cpu(lguid), 777 (unsigned long long)be64_to_cpu(lguid),
646 (unsigned long long)be64_to_cpu(fguid)); 778 (unsigned long long)be64_to_cpu(fguid));
647 779
648 /* RDS/IB is not currently netns aware, thus init_net */ 780 /* RDS/IB is not currently netns aware, thus init_net */
649 conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, 781 conn = rds_conn_create(&init_net, daddr6, saddr6,
650 &rds_ib_transport, GFP_KERNEL); 782 &rds_ib_transport, GFP_KERNEL, ifindex);
651 if (IS_ERR(conn)) { 783 if (IS_ERR(conn)) {
652 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); 784 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
653 conn = NULL; 785 conn = NULL;
@@ -678,12 +810,13 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
678 ic = conn->c_transport_data; 810 ic = conn->c_transport_data;
679 811
680 rds_ib_set_protocol(conn, version); 812 rds_ib_set_protocol(conn, version);
681 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); 813 rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));
682 814
683 /* If the peer gave us the last packet it saw, process this as if 815 /* If the peer gave us the last packet it saw, process this as if
684 * we had received a regular ACK. */ 816 * we had received a regular ACK. */
685 if (dp->dp_ack_seq) 817 if (dp_cmn->ricpc_ack_seq)
686 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); 818 rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
819 NULL);
687 820
688 BUG_ON(cm_id->context); 821 BUG_ON(cm_id->context);
689 BUG_ON(ic->i_cm_id); 822 BUG_ON(ic->i_cm_id);
@@ -702,8 +835,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
702 } 835 }
703 836
704 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, 837 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
705 event->param.conn.responder_resources, 838 event->param.conn.responder_resources,
706 event->param.conn.initiator_depth); 839 event->param.conn.initiator_depth, isv6);
707 840
708 /* rdma_accept() calls rdma_reject() internally if it fails */ 841 /* rdma_accept() calls rdma_reject() internally if it fails */
709 if (rdma_accept(cm_id, &conn_param)) 842 if (rdma_accept(cm_id, &conn_param))
@@ -718,12 +851,12 @@ out:
718} 851}
719 852
720 853
721int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) 854int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
722{ 855{
723 struct rds_connection *conn = cm_id->context; 856 struct rds_connection *conn = cm_id->context;
724 struct rds_ib_connection *ic = conn->c_transport_data; 857 struct rds_ib_connection *ic = conn->c_transport_data;
725 struct rdma_conn_param conn_param; 858 struct rdma_conn_param conn_param;
726 struct rds_ib_connect_private dp; 859 union rds_ib_conn_priv dp;
727 int ret; 860 int ret;
728 861
729 /* If the peer doesn't do protocol negotiation, we must 862 /* If the peer doesn't do protocol negotiation, we must
@@ -738,7 +871,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
738 } 871 }
739 872
740 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, 873 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
741 UINT_MAX, UINT_MAX); 874 UINT_MAX, UINT_MAX, isv6);
742 ret = rdma_connect(cm_id, &conn_param); 875 ret = rdma_connect(cm_id, &conn_param);
743 if (ret) 876 if (ret)
744 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); 877 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -758,13 +891,22 @@ out:
758int rds_ib_conn_path_connect(struct rds_conn_path *cp) 891int rds_ib_conn_path_connect(struct rds_conn_path *cp)
759{ 892{
760 struct rds_connection *conn = cp->cp_conn; 893 struct rds_connection *conn = cp->cp_conn;
761 struct rds_ib_connection *ic = conn->c_transport_data; 894 struct sockaddr_storage src, dest;
762 struct sockaddr_in src, dest; 895 rdma_cm_event_handler handler;
896 struct rds_ib_connection *ic;
763 int ret; 897 int ret;
764 898
899 ic = conn->c_transport_data;
900
765 /* XXX I wonder what affect the port space has */ 901 /* XXX I wonder what affect the port space has */
766 /* delegate cm event handler to rdma_transport */ 902 /* delegate cm event handler to rdma_transport */
767 ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn, 903#if IS_ENABLED(CONFIG_IPV6)
904 if (conn->c_isv6)
905 handler = rds6_rdma_cm_event_handler;
906 else
907#endif
908 handler = rds_rdma_cm_event_handler;
909 ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
768 RDMA_PS_TCP, IB_QPT_RC); 910 RDMA_PS_TCP, IB_QPT_RC);
769 if (IS_ERR(ic->i_cm_id)) { 911 if (IS_ERR(ic->i_cm_id)) {
770 ret = PTR_ERR(ic->i_cm_id); 912 ret = PTR_ERR(ic->i_cm_id);
@@ -775,13 +917,33 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
775 917
776 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); 918 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
777 919
778 src.sin_family = AF_INET; 920 if (ipv6_addr_v4mapped(&conn->c_faddr)) {
779 src.sin_addr.s_addr = (__force u32)conn->c_laddr; 921 struct sockaddr_in *sin;
780 src.sin_port = (__force u16)htons(0); 922
923 sin = (struct sockaddr_in *)&src;
924 sin->sin_family = AF_INET;
925 sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
926 sin->sin_port = 0;
781 927
782 dest.sin_family = AF_INET; 928 sin = (struct sockaddr_in *)&dest;
783 dest.sin_addr.s_addr = (__force u32)conn->c_faddr; 929 sin->sin_family = AF_INET;
784 dest.sin_port = (__force u16)htons(RDS_PORT); 930 sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
931 sin->sin_port = htons(RDS_PORT);
932 } else {
933 struct sockaddr_in6 *sin6;
934
935 sin6 = (struct sockaddr_in6 *)&src;
936 sin6->sin6_family = AF_INET6;
937 sin6->sin6_addr = conn->c_laddr;
938 sin6->sin6_port = 0;
939 sin6->sin6_scope_id = conn->c_dev_if;
940
941 sin6 = (struct sockaddr_in6 *)&dest;
942 sin6->sin6_family = AF_INET6;
943 sin6->sin6_addr = conn->c_faddr;
944 sin6->sin6_port = htons(RDS_CM_PORT);
945 sin6->sin6_scope_id = conn->c_dev_if;
946 }
785 947
786 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, 948 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
787 (struct sockaddr *)&dest, 949 (struct sockaddr *)&dest,
@@ -949,7 +1111,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
949 if (!ic) 1111 if (!ic)
950 return -ENOMEM; 1112 return -ENOMEM;
951 1113
952 ret = rds_ib_recv_alloc_caches(ic); 1114 ret = rds_ib_recv_alloc_caches(ic, gfp);
953 if (ret) { 1115 if (ret) {
954 kfree(ic); 1116 kfree(ic);
955 return ret; 1117 return ret;
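The ib_cm.c changes above converge on one representation: conn->c_laddr and conn->c_faddr are now struct in6_addr, and an IPv4 peer travels as an IPv4-mapped address whose low word (s6_addr32[3]) is the legacy __be32. A minimal sketch of that round trip, assuming only the helpers from include/net/ipv6.h; the rds_sketch_* names are illustrative, not part of the patch:

#include <net/ipv6.h>

/* Store an IPv4 peer in the in6_addr fields struct rds_connection now uses,
 * then recover the 32-bit address when building the IPv4 on-wire format. */
static void rds_sketch_map_v4(__be32 v4addr, struct in6_addr *slot)
{
	ipv6_addr_set_v4mapped(v4addr, slot);	/* stored as ::ffff:a.b.c.d */
}

static __be32 rds_sketch_unmap_v4(const struct in6_addr *slot)
{
	/* Valid only when ipv6_addr_v4mapped(slot) is true, as in the
	 * !isv6 branch of rds_ib_cm_fill_conn_param() above. */
	return slot->s6_addr32[3];
}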
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 48332a6ed738..6431a023ac89 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -61,6 +61,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
61 pool->fmr_attr.max_pages); 61 pool->fmr_attr.max_pages);
62 if (IS_ERR(frmr->mr)) { 62 if (IS_ERR(frmr->mr)) {
63 pr_warn("RDS/IB: %s failed to allocate MR", __func__); 63 pr_warn("RDS/IB: %s failed to allocate MR", __func__);
64 err = PTR_ERR(frmr->mr);
64 goto out_no_cigar; 65 goto out_no_cigar;
65 } 66 }
66 67
@@ -102,7 +103,6 @@ static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop)
102static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) 103static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
103{ 104{
104 struct rds_ib_frmr *frmr = &ibmr->u.frmr; 105 struct rds_ib_frmr *frmr = &ibmr->u.frmr;
105 struct ib_send_wr *failed_wr;
106 struct ib_reg_wr reg_wr; 106 struct ib_reg_wr reg_wr;
107 int ret, off = 0; 107 int ret, off = 0;
108 108
@@ -135,9 +135,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
135 IB_ACCESS_REMOTE_WRITE; 135 IB_ACCESS_REMOTE_WRITE;
136 reg_wr.wr.send_flags = IB_SEND_SIGNALED; 136 reg_wr.wr.send_flags = IB_SEND_SIGNALED;
137 137
138 failed_wr = &reg_wr.wr; 138 ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, NULL);
139 ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, &failed_wr);
140 WARN_ON(failed_wr != &reg_wr.wr);
141 if (unlikely(ret)) { 139 if (unlikely(ret)) {
142 /* Failure here can be because of -ENOMEM as well */ 140 /* Failure here can be because of -ENOMEM as well */
143 frmr->fr_state = FRMR_IS_STALE; 141 frmr->fr_state = FRMR_IS_STALE;
@@ -230,7 +228,7 @@ out_unmap:
230 228
231static int rds_ib_post_inv(struct rds_ib_mr *ibmr) 229static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
232{ 230{
233 struct ib_send_wr *s_wr, *failed_wr; 231 struct ib_send_wr *s_wr;
234 struct rds_ib_frmr *frmr = &ibmr->u.frmr; 232 struct rds_ib_frmr *frmr = &ibmr->u.frmr;
235 struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id; 233 struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id;
236 int ret = -EINVAL; 234 int ret = -EINVAL;
@@ -255,9 +253,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
255 s_wr->ex.invalidate_rkey = frmr->mr->rkey; 253 s_wr->ex.invalidate_rkey = frmr->mr->rkey;
256 s_wr->send_flags = IB_SEND_SIGNALED; 254 s_wr->send_flags = IB_SEND_SIGNALED;
257 255
258 failed_wr = s_wr; 256 ret = ib_post_send(i_cm_id->qp, s_wr, NULL);
259 ret = ib_post_send(i_cm_id->qp, s_wr, &failed_wr);
260 WARN_ON(failed_wr != s_wr);
261 if (unlikely(ret)) { 257 if (unlikely(ret)) {
262 frmr->fr_state = FRMR_IS_STALE; 258 frmr->fr_state = FRMR_IS_STALE;
263 frmr->fr_inv = false; 259 frmr->fr_inv = false;
@@ -344,6 +340,11 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
344 struct rds_ib_frmr *frmr; 340 struct rds_ib_frmr *frmr;
345 int ret; 341 int ret;
346 342
343 if (!ic) {
344 /* TODO: Add FRWR support for RDS_GET_MR using proxy qp*/
345 return ERR_PTR(-EOPNOTSUPP);
346 }
347
347 do { 348 do {
348 if (ibmr) 349 if (ibmr)
349 rds_ib_free_frmr(ibmr, true); 350 rds_ib_free_frmr(ibmr, true);
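The ib_frmr.c hunks drop the failed_wr locals because, as of this kernel's verbs API, ib_post_send() takes const work requests and accepts NULL for the bad-WR out-pointer. A minimal sketch of the calling convention the patch converts to; qp and reg_wr stand in for the objects set up in rds_ib_post_reg_frmr(), and the rds_sketch_* name is illustrative:

#include <rdma/ib_verbs.h>

/* Post the registration WR without tracking which WR failed: pass NULL
 * instead of &failed_wr and rely on the return code alone. */
static int rds_sketch_post_reg(struct ib_qp *qp, struct ib_reg_wr *reg_wr)
{
	return ib_post_send(qp, &reg_wr->wr, NULL);
}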
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 0ea4ab017a8c..5da12c248431 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -113,9 +113,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
113 int npages); 113 int npages);
114void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, 114void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
115 struct rds_info_rdma_connection *iinfo); 115 struct rds_info_rdma_connection *iinfo);
116void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
117 struct rds6_info_rdma_connection *iinfo6);
116void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); 118void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
117void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, 119void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
118 struct rds_sock *rs, u32 *key_ret); 120 struct rds_sock *rs, u32 *key_ret,
121 struct rds_connection *conn);
119void rds_ib_sync_mr(void *trans_private, int dir); 122void rds_ib_sync_mr(void *trans_private, int dir);
120void rds_ib_free_mr(void *trans_private, int invalidate); 123void rds_ib_free_mr(void *trans_private, int invalidate);
121void rds_ib_flush_mrs(void); 124void rds_ib_flush_mrs(void);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index e678699268a2..63c8d107adcf 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -100,18 +100,19 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
100 kfree_rcu(to_free, rcu); 100 kfree_rcu(to_free, rcu);
101} 101}
102 102
103int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) 103int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
104 struct in6_addr *ipaddr)
104{ 105{
105 struct rds_ib_device *rds_ibdev_old; 106 struct rds_ib_device *rds_ibdev_old;
106 107
107 rds_ibdev_old = rds_ib_get_device(ipaddr); 108 rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]);
108 if (!rds_ibdev_old) 109 if (!rds_ibdev_old)
109 return rds_ib_add_ipaddr(rds_ibdev, ipaddr); 110 return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
110 111
111 if (rds_ibdev_old != rds_ibdev) { 112 if (rds_ibdev_old != rds_ibdev) {
112 rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); 113 rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]);
113 rds_ib_dev_put(rds_ibdev_old); 114 rds_ib_dev_put(rds_ibdev_old);
114 return rds_ib_add_ipaddr(rds_ibdev, ipaddr); 115 return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
115 } 116 }
116 rds_ib_dev_put(rds_ibdev_old); 117 rds_ib_dev_put(rds_ibdev_old);
117 118
@@ -179,6 +180,17 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
179 iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; 180 iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
180} 181}
181 182
183#if IS_ENABLED(CONFIG_IPV6)
184void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
185 struct rds6_info_rdma_connection *iinfo6)
186{
187 struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
188
189 iinfo6->rdma_mr_max = pool_1m->max_items;
190 iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages;
191}
192#endif
193
182struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) 194struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
183{ 195{
184 struct rds_ib_mr *ibmr = NULL; 196 struct rds_ib_mr *ibmr = NULL;
@@ -537,19 +549,23 @@ void rds_ib_flush_mrs(void)
537} 549}
538 550
539void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, 551void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
540 struct rds_sock *rs, u32 *key_ret) 552 struct rds_sock *rs, u32 *key_ret,
553 struct rds_connection *conn)
541{ 554{
542 struct rds_ib_device *rds_ibdev; 555 struct rds_ib_device *rds_ibdev;
543 struct rds_ib_mr *ibmr = NULL; 556 struct rds_ib_mr *ibmr = NULL;
544 struct rds_ib_connection *ic = rs->rs_conn->c_transport_data; 557 struct rds_ib_connection *ic = NULL;
545 int ret; 558 int ret;
546 559
547 rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); 560 rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]);
548 if (!rds_ibdev) { 561 if (!rds_ibdev) {
549 ret = -ENODEV; 562 ret = -ENODEV;
550 goto out; 563 goto out;
551 } 564 }
552 565
566 if (conn)
567 ic = conn->c_transport_data;
568
553 if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { 569 if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) {
554 ret = -ENODEV; 570 ret = -ENODEV;
555 goto out; 571 goto out;
@@ -559,17 +575,18 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
559 ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); 575 ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
560 else 576 else
561 ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret); 577 ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret);
562 if (ibmr) 578 if (IS_ERR(ibmr)) {
563 rds_ibdev = NULL; 579 ret = PTR_ERR(ibmr);
564
565 out:
566 if (!ibmr)
567 pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); 580 pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
581 } else {
582 return ibmr;
583 }
568 584
585 out:
569 if (rds_ibdev) 586 if (rds_ibdev)
570 rds_ib_dev_put(rds_ibdev); 587 rds_ib_dev_put(rds_ibdev);
571 588
572 return ibmr; 589 return ERR_PTR(ret);
573} 590}
574 591
575void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) 592void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
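rds_ib_get_mr() now reports failure with the standard ERR_PTR() convention instead of returning NULL, so callers such as __rds_rdma_map() (in rdma.c, further down) can propagate the real errno. A minimal sketch of producing and consuming such a return value; the rds_sketch_* names are illustrative:

#include <linux/err.h>
#include <linux/errno.h>

/* Failure is encoded in the pointer itself rather than signalled by NULL. */
static void *rds_sketch_get_mr(void *mr, int err)
{
	if (err)
		return ERR_PTR(err);	/* e.g. ERR_PTR(-ENODEV) */
	return mr;
}

static int rds_sketch_caller(void *mr_candidate)
{
	void *mr = rds_sketch_get_mr(mr_candidate, mr_candidate ? 0 : -ENODEV);

	if (IS_ERR(mr))
		return PTR_ERR(mr);	/* the real errno, not a bare NULL */
	return 0;
}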
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index b4e421aa9727..2f16146e4ec9 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -98,12 +98,12 @@ static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
98 } 98 }
99} 99}
100 100
101static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) 101static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache, gfp_t gfp)
102{ 102{
103 struct rds_ib_cache_head *head; 103 struct rds_ib_cache_head *head;
104 int cpu; 104 int cpu;
105 105
106 cache->percpu = alloc_percpu(struct rds_ib_cache_head); 106 cache->percpu = alloc_percpu_gfp(struct rds_ib_cache_head, gfp);
107 if (!cache->percpu) 107 if (!cache->percpu)
108 return -ENOMEM; 108 return -ENOMEM;
109 109
@@ -118,13 +118,13 @@ static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
118 return 0; 118 return 0;
119} 119}
120 120
121int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic) 121int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp)
122{ 122{
123 int ret; 123 int ret;
124 124
125 ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs); 125 ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs, gfp);
126 if (!ret) { 126 if (!ret) {
127 ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags); 127 ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags, gfp);
128 if (ret) 128 if (ret)
129 free_percpu(ic->i_cache_incs.percpu); 129 free_percpu(ic->i_cache_incs.percpu);
130 } 130 }
@@ -266,7 +266,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
266 rds_ib_stats_inc(s_ib_rx_total_incs); 266 rds_ib_stats_inc(s_ib_rx_total_incs);
267 } 267 }
268 INIT_LIST_HEAD(&ibinc->ii_frags); 268 INIT_LIST_HEAD(&ibinc->ii_frags);
269 rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); 269 rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr);
270 270
271 return ibinc; 271 return ibinc;
272} 272}
@@ -376,14 +376,11 @@ static void release_refill(struct rds_connection *conn)
376 * This tries to allocate and post unused work requests after making sure that 376 * This tries to allocate and post unused work requests after making sure that
377 * they have all the allocations they need to queue received fragments into 377 * they have all the allocations they need to queue received fragments into
378 * sockets. 378 * sockets.
379 *
380 * -1 is returned if posting fails due to temporary resource exhaustion.
381 */ 379 */
382void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) 380void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
383{ 381{
384 struct rds_ib_connection *ic = conn->c_transport_data; 382 struct rds_ib_connection *ic = conn->c_transport_data;
385 struct rds_ib_recv_work *recv; 383 struct rds_ib_recv_work *recv;
386 struct ib_recv_wr *failed_wr;
387 unsigned int posted = 0; 384 unsigned int posted = 0;
388 int ret = 0; 385 int ret = 0;
389 bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM); 386 bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
@@ -417,10 +414,10 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
417 &recv->r_frag->f_sg)); 414 &recv->r_frag->f_sg));
418 415
419 /* XXX when can this fail? */ 416 /* XXX when can this fail? */
420 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); 417 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, NULL);
421 if (ret) { 418 if (ret) {
422 rds_ib_conn_error(conn, "recv post on " 419 rds_ib_conn_error(conn, "recv post on "
423 "%pI4 returned %d, disconnecting and " 420 "%pI6c returned %d, disconnecting and "
424 "reconnecting\n", &conn->c_faddr, 421 "reconnecting\n", &conn->c_faddr,
425 ret); 422 ret);
426 break; 423 break;
@@ -650,7 +647,6 @@ static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
650static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits) 647static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
651{ 648{
652 struct rds_header *hdr = ic->i_ack; 649 struct rds_header *hdr = ic->i_ack;
653 struct ib_send_wr *failed_wr;
654 u64 seq; 650 u64 seq;
655 int ret; 651 int ret;
656 652
@@ -663,7 +659,7 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi
663 rds_message_make_checksum(hdr); 659 rds_message_make_checksum(hdr);
664 ic->i_ack_queued = jiffies; 660 ic->i_ack_queued = jiffies;
665 661
666 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); 662 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, NULL);
667 if (unlikely(ret)) { 663 if (unlikely(ret)) {
668 /* Failed to send. Release the WR, and 664 /* Failed to send. Release the WR, and
669 * force another ACK. 665 * force another ACK.
@@ -850,7 +846,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
850 846
851 if (data_len < sizeof(struct rds_header)) { 847 if (data_len < sizeof(struct rds_header)) {
852 rds_ib_conn_error(conn, "incoming message " 848 rds_ib_conn_error(conn, "incoming message "
853 "from %pI4 didn't include a " 849 "from %pI6c didn't include a "
854 "header, disconnecting and " 850 "header, disconnecting and "
855 "reconnecting\n", 851 "reconnecting\n",
856 &conn->c_faddr); 852 &conn->c_faddr);
@@ -863,7 +859,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
863 /* Validate the checksum. */ 859 /* Validate the checksum. */
864 if (!rds_message_verify_checksum(ihdr)) { 860 if (!rds_message_verify_checksum(ihdr)) {
865 rds_ib_conn_error(conn, "incoming message " 861 rds_ib_conn_error(conn, "incoming message "
866 "from %pI4 has corrupted header - " 862 "from %pI6c has corrupted header - "
867 "forcing a reconnect\n", 863 "forcing a reconnect\n",
868 &conn->c_faddr); 864 &conn->c_faddr);
869 rds_stats_inc(s_recv_drop_bad_checksum); 865 rds_stats_inc(s_recv_drop_bad_checksum);
@@ -943,10 +939,10 @@ static void rds_ib_process_recv(struct rds_connection *conn,
943 ic->i_recv_data_rem = 0; 939 ic->i_recv_data_rem = 0;
944 ic->i_ibinc = NULL; 940 ic->i_ibinc = NULL;
945 941
946 if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 942 if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) {
947 rds_ib_cong_recv(conn, ibinc); 943 rds_ib_cong_recv(conn, ibinc);
948 else { 944 } else {
949 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, 945 rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr,
950 &ibinc->ii_inc, GFP_ATOMIC); 946 &ibinc->ii_inc, GFP_ATOMIC);
951 state->ack_next = be64_to_cpu(hdr->h_sequence); 947 state->ack_next = be64_to_cpu(hdr->h_sequence);
952 state->ack_next_valid = 1; 948 state->ack_next_valid = 1;
@@ -990,7 +986,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
990 } else { 986 } else {
991 /* We expect errors as the qp is drained during shutdown */ 987 /* We expect errors as the qp is drained during shutdown */
992 if (rds_conn_up(conn) || rds_conn_connecting(conn)) 988 if (rds_conn_up(conn) || rds_conn_connecting(conn))
993 rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", 989 rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
994 &conn->c_laddr, &conn->c_faddr, 990 &conn->c_laddr, &conn->c_faddr,
995 wc->status, 991 wc->status,
996 ib_wc_status_msg(wc->status)); 992 ib_wc_status_msg(wc->status));
@@ -1025,7 +1021,6 @@ int rds_ib_recv_path(struct rds_conn_path *cp)
1025{ 1021{
1026 struct rds_connection *conn = cp->cp_conn; 1022 struct rds_connection *conn = cp->cp_conn;
1027 struct rds_ib_connection *ic = conn->c_transport_data; 1023 struct rds_ib_connection *ic = conn->c_transport_data;
1028 int ret = 0;
1029 1024
1030 rdsdebug("conn %p\n", conn); 1025 rdsdebug("conn %p\n", conn);
1031 if (rds_conn_up(conn)) { 1026 if (rds_conn_up(conn)) {
@@ -1034,7 +1029,7 @@ int rds_ib_recv_path(struct rds_conn_path *cp)
1034 rds_ib_stats_inc(s_ib_rx_refill_from_thread); 1029 rds_ib_stats_inc(s_ib_rx_refill_from_thread);
1035 } 1030 }
1036 1031
1037 return ret; 1032 return 0;
1038} 1033}
1039 1034
1040int rds_ib_recv_init(void) 1035int rds_ib_recv_init(void)
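rds_ib_recv_alloc_cache() now takes the caller's gfp_t and uses alloc_percpu_gfp(), so rds_ib_conn_alloc() controls how the per-CPU refill caches may be allocated. A minimal sketch of that allocation pattern, with an illustrative struct standing in for rds_ib_cache_head:

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/percpu.h>

struct rds_sketch_head {
	struct list_head first;
	unsigned long count;
};

/* Same shape as rds_ib_recv_alloc_cache() above: the gfp flags come from the
 * caller instead of the implicit GFP_KERNEL of plain alloc_percpu(). */
static struct rds_sketch_head __percpu *rds_sketch_alloc_cache(gfp_t gfp)
{
	return alloc_percpu_gfp(struct rds_sketch_head, gfp);
}

static void rds_sketch_free_cache(struct rds_sketch_head __percpu *head)
{
	free_percpu(head);
}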
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 8557a1cae041..2dcb555e6350 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -305,7 +305,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
305 305
306 /* We expect errors as the qp is drained during shutdown */ 306 /* We expect errors as the qp is drained during shutdown */
307 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { 307 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
308 rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", 308 rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
309 &conn->c_laddr, &conn->c_faddr, wc->status, 309 &conn->c_laddr, &conn->c_faddr, wc->status,
310 ib_wc_status_msg(wc->status)); 310 ib_wc_status_msg(wc->status));
311 } 311 }
@@ -492,7 +492,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
492 struct rds_ib_send_work *send = NULL; 492 struct rds_ib_send_work *send = NULL;
493 struct rds_ib_send_work *first; 493 struct rds_ib_send_work *first;
494 struct rds_ib_send_work *prev; 494 struct rds_ib_send_work *prev;
495 struct ib_send_wr *failed_wr; 495 const struct ib_send_wr *failed_wr;
496 struct scatterlist *scat; 496 struct scatterlist *scat;
497 u32 pos; 497 u32 pos;
498 u32 i; 498 u32 i;
@@ -730,7 +730,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
730 first, &first->s_wr, ret, failed_wr); 730 first, &first->s_wr, ret, failed_wr);
731 BUG_ON(failed_wr != &first->s_wr); 731 BUG_ON(failed_wr != &first->s_wr);
732 if (ret) { 732 if (ret) {
733 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " 733 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c "
734 "returned %d\n", &conn->c_faddr, ret); 734 "returned %d\n", &conn->c_faddr, ret);
735 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 735 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
736 rds_ib_sub_signaled(ic, nr_sig); 736 rds_ib_sub_signaled(ic, nr_sig);
@@ -758,15 +758,12 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
758{ 758{
759 struct rds_ib_connection *ic = conn->c_transport_data; 759 struct rds_ib_connection *ic = conn->c_transport_data;
760 struct rds_ib_send_work *send = NULL; 760 struct rds_ib_send_work *send = NULL;
761 struct ib_send_wr *failed_wr; 761 const struct ib_send_wr *failed_wr;
762 struct rds_ib_device *rds_ibdev;
763 u32 pos; 762 u32 pos;
764 u32 work_alloc; 763 u32 work_alloc;
765 int ret; 764 int ret;
766 int nr_sig = 0; 765 int nr_sig = 0;
767 766
768 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
769
770 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); 767 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
771 if (work_alloc != 1) { 768 if (work_alloc != 1) {
772 rds_ib_stats_inc(s_ib_tx_ring_full); 769 rds_ib_stats_inc(s_ib_tx_ring_full);
@@ -827,7 +824,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
827 send, &send->s_atomic_wr, ret, failed_wr); 824 send, &send->s_atomic_wr, ret, failed_wr);
828 BUG_ON(failed_wr != &send->s_atomic_wr.wr); 825 BUG_ON(failed_wr != &send->s_atomic_wr.wr);
829 if (ret) { 826 if (ret) {
830 printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " 827 printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c "
831 "returned %d\n", &conn->c_faddr, ret); 828 "returned %d\n", &conn->c_faddr, ret);
832 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 829 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
833 rds_ib_sub_signaled(ic, nr_sig); 830 rds_ib_sub_signaled(ic, nr_sig);
@@ -849,7 +846,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
849 struct rds_ib_send_work *send = NULL; 846 struct rds_ib_send_work *send = NULL;
850 struct rds_ib_send_work *first; 847 struct rds_ib_send_work *first;
851 struct rds_ib_send_work *prev; 848 struct rds_ib_send_work *prev;
852 struct ib_send_wr *failed_wr; 849 const struct ib_send_wr *failed_wr;
853 struct scatterlist *scat; 850 struct scatterlist *scat;
854 unsigned long len; 851 unsigned long len;
855 u64 remote_addr = op->op_remote_addr; 852 u64 remote_addr = op->op_remote_addr;
@@ -967,7 +964,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
967 first, &first->s_rdma_wr.wr, ret, failed_wr); 964 first, &first->s_rdma_wr.wr, ret, failed_wr);
968 BUG_ON(failed_wr != &first->s_rdma_wr.wr); 965 BUG_ON(failed_wr != &first->s_rdma_wr.wr);
969 if (ret) { 966 if (ret) {
970 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " 967 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c "
971 "returned %d\n", &conn->c_faddr, ret); 968 "returned %d\n", &conn->c_faddr, ret);
972 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 969 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
973 rds_ib_sub_signaled(ic, nr_sig); 970 rds_ib_sub_signaled(ic, nr_sig);
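The messages in ib_send.c and ib_recv.c above switch the printk specifier from %pI4 to %pI6c, which takes a pointer to a struct in6_addr and prints it in compressed form; a v4-mapped peer comes out as ::ffff:a.b.c.d. A minimal sketch with an illustrative function and hypothetical message text:

#include <linux/in6.h>
#include <linux/printk.h>

static void rds_sketch_log_conn(const struct in6_addr *laddr,
				const struct in6_addr *faddr)
{
	/* %pI6c dereferences its argument, so pass the in6_addr's address. */
	pr_info("RDS/IB: connection <%pI6c,%pI6c>\n", laddr, faddr);
}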
diff --git a/net/rds/loop.c b/net/rds/loop.c
index dac6218a460e..1d73ad79c847 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,9 @@
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/in.h> 35#include <linux/in.h>
36#include <net/net_namespace.h>
37#include <net/netns/generic.h>
38#include <linux/ipv6.h>
36 39
37#include "rds_single_path.h" 40#include "rds_single_path.h"
38#include "rds.h" 41#include "rds.h"
@@ -40,6 +43,17 @@
40 43
41static DEFINE_SPINLOCK(loop_conns_lock); 44static DEFINE_SPINLOCK(loop_conns_lock);
42static LIST_HEAD(loop_conns); 45static LIST_HEAD(loop_conns);
46static atomic_t rds_loop_unloading = ATOMIC_INIT(0);
47
48static void rds_loop_set_unloading(void)
49{
50 atomic_set(&rds_loop_unloading, 1);
51}
52
53static bool rds_loop_is_unloading(struct rds_connection *conn)
54{
55 return atomic_read(&rds_loop_unloading) != 0;
56}
43 57
44/* 58/*
45 * This 'loopback' transport is a special case for flows that originate 59 * This 'loopback' transport is a special case for flows that originate
@@ -75,11 +89,11 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
75 89
76 BUG_ON(hdr_off || sg || off); 90 BUG_ON(hdr_off || sg || off);
77 91
78 rds_inc_init(&rm->m_inc, conn, conn->c_laddr); 92 rds_inc_init(&rm->m_inc, conn, &conn->c_laddr);
79 /* For the embedded inc. Matching put is in loop_inc_free() */ 93 /* For the embedded inc. Matching put is in loop_inc_free() */
80 rds_message_addref(rm); 94 rds_message_addref(rm);
81 95
82 rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, 96 rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc,
83 GFP_KERNEL); 97 GFP_KERNEL);
84 98
85 rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), 99 rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
@@ -165,6 +179,8 @@ void rds_loop_exit(void)
165 struct rds_loop_connection *lc, *_lc; 179 struct rds_loop_connection *lc, *_lc;
166 LIST_HEAD(tmp_list); 180 LIST_HEAD(tmp_list);
167 181
182 rds_loop_set_unloading();
183 synchronize_rcu();
168 /* avoid calling conn_destroy with irqs off */ 184 /* avoid calling conn_destroy with irqs off */
169 spin_lock_irq(&loop_conns_lock); 185 spin_lock_irq(&loop_conns_lock);
170 list_splice(&loop_conns, &tmp_list); 186 list_splice(&loop_conns, &tmp_list);
@@ -177,6 +193,46 @@ void rds_loop_exit(void)
177 } 193 }
178} 194}
179 195
196static void rds_loop_kill_conns(struct net *net)
197{
198 struct rds_loop_connection *lc, *_lc;
199 LIST_HEAD(tmp_list);
200
201 spin_lock_irq(&loop_conns_lock);
202 list_for_each_entry_safe(lc, _lc, &loop_conns, loop_node) {
203 struct net *c_net = read_pnet(&lc->conn->c_net);
204
205 if (net != c_net)
206 continue;
207 list_move_tail(&lc->loop_node, &tmp_list);
208 }
209 spin_unlock_irq(&loop_conns_lock);
210
211 list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) {
212 WARN_ON(lc->conn->c_passive);
213 rds_conn_destroy(lc->conn);
214 }
215}
216
217static void __net_exit rds_loop_exit_net(struct net *net)
218{
219 rds_loop_kill_conns(net);
220}
221
222static struct pernet_operations rds_loop_net_ops = {
223 .exit = rds_loop_exit_net,
224};
225
226int rds_loop_net_init(void)
227{
228 return register_pernet_device(&rds_loop_net_ops);
229}
230
231void rds_loop_net_exit(void)
232{
233 unregister_pernet_device(&rds_loop_net_ops);
234}
235
180/* 236/*
181 * This is missing .xmit_* because loop doesn't go through generic 237 * This is missing .xmit_* because loop doesn't go through generic
182 * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and 238 * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and
@@ -194,4 +250,5 @@ struct rds_transport rds_loop_transport = {
194 .inc_free = rds_loop_inc_free, 250 .inc_free = rds_loop_inc_free,
195 .t_name = "loopback", 251 .t_name = "loopback",
196 .t_type = RDS_TRANS_LOOP, 252 .t_type = RDS_TRANS_LOOP,
253 .t_unloading = rds_loop_is_unloading,
197}; 254};
diff --git a/net/rds/loop.h b/net/rds/loop.h
index 469fa4b2da4f..bbc8cdd030df 100644
--- a/net/rds/loop.h
+++ b/net/rds/loop.h
@@ -5,6 +5,8 @@
5/* loop.c */ 5/* loop.c */
6extern struct rds_transport rds_loop_transport; 6extern struct rds_transport rds_loop_transport;
7 7
8int rds_loop_net_init(void);
9void rds_loop_net_exit(void);
8void rds_loop_exit(void); 10void rds_loop_exit(void);
9 11
10#endif 12#endif
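The loop.c changes above make loopback connections netns-aware on teardown by registering pernet_operations whose .exit hook destroys the dying namespace's connections. A minimal sketch of that registration pattern, using standard net namespace infrastructure; the rds_sketch_* names are illustrative:

#include <linux/init.h>
#include <net/net_namespace.h>

static void __net_exit rds_sketch_exit_net(struct net *net)
{
	/* per-namespace cleanup, e.g. what rds_loop_kill_conns(net) does above */
}

static struct pernet_operations rds_sketch_net_ops = {
	.exit = rds_sketch_exit_net,
};

static int __init rds_sketch_net_init(void)
{
	return register_pernet_device(&rds_sketch_net_ops);
}

static void __exit rds_sketch_net_exit(void)
{
	unregister_pernet_device(&rds_sketch_net_ops);
}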
diff --git a/net/rds/message.c b/net/rds/message.c
index a35f76971984..4b00b1152a5f 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -514,4 +514,3 @@ void rds_message_unmapped(struct rds_message *rm)
514 wake_up_interruptible(&rm->m_flush_wait); 514 wake_up_interruptible(&rm->m_flush_wait);
515} 515}
516EXPORT_SYMBOL_GPL(rds_message_unmapped); 516EXPORT_SYMBOL_GPL(rds_message_unmapped);
517
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 634cfcb7bba6..98237feb607a 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2007 Oracle. All rights reserved. 2 * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -170,7 +170,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
170} 170}
171 171
172static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, 172static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
173 u64 *cookie_ret, struct rds_mr **mr_ret) 173 u64 *cookie_ret, struct rds_mr **mr_ret,
174 struct rds_conn_path *cp)
174{ 175{
175 struct rds_mr *mr = NULL, *found; 176 struct rds_mr *mr = NULL, *found;
176 unsigned int nr_pages; 177 unsigned int nr_pages;
@@ -183,7 +184,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
183 long i; 184 long i;
184 int ret; 185 int ret;
185 186
186 if (rs->rs_bound_addr == 0 || !rs->rs_transport) { 187 if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) {
187 ret = -ENOTCONN; /* XXX not a great errno */ 188 ret = -ENOTCONN; /* XXX not a great errno */
188 goto out; 189 goto out;
189 } 190 }
@@ -269,7 +270,8 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
269 * Note that dma_map() implies that pending writes are 270 * Note that dma_map() implies that pending writes are
270 * flushed to RAM, so no dma_sync is needed here. */ 271 * flushed to RAM, so no dma_sync is needed here. */
271 trans_private = rs->rs_transport->get_mr(sg, nents, rs, 272 trans_private = rs->rs_transport->get_mr(sg, nents, rs,
272 &mr->r_key); 273 &mr->r_key,
274 cp ? cp->cp_conn : NULL);
273 275
274 if (IS_ERR(trans_private)) { 276 if (IS_ERR(trans_private)) {
275 for (i = 0 ; i < nents; i++) 277 for (i = 0 ; i < nents; i++)
@@ -330,7 +332,7 @@ int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
330 sizeof(struct rds_get_mr_args))) 332 sizeof(struct rds_get_mr_args)))
331 return -EFAULT; 333 return -EFAULT;
332 334
333 return __rds_rdma_map(rs, &args, NULL, NULL); 335 return __rds_rdma_map(rs, &args, NULL, NULL, NULL);
334} 336}
335 337
336int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) 338int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
@@ -354,7 +356,7 @@ int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
354 new_args.cookie_addr = args.cookie_addr; 356 new_args.cookie_addr = args.cookie_addr;
355 new_args.flags = args.flags; 357 new_args.flags = args.flags;
356 358
357 return __rds_rdma_map(rs, &new_args, NULL, NULL); 359 return __rds_rdma_map(rs, &new_args, NULL, NULL, NULL);
358} 360}
359 361
360/* 362/*
@@ -574,7 +576,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
574 576
575 args = CMSG_DATA(cmsg); 577 args = CMSG_DATA(cmsg);
576 578
577 if (rs->rs_bound_addr == 0) { 579 if (ipv6_addr_any(&rs->rs_bound_addr)) {
578 ret = -ENOTCONN; /* XXX not a great errno */ 580 ret = -ENOTCONN; /* XXX not a great errno */
579 goto out_ret; 581 goto out_ret;
580 } 582 }
@@ -782,7 +784,8 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
782 rm->m_rdma_cookie != 0) 784 rm->m_rdma_cookie != 0)
783 return -EINVAL; 785 return -EINVAL;
784 786
785 return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); 787 return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie,
788 &rm->rdma.op_rdma_mr, rm->m_conn_path);
786} 789}
787 790
788/* 791/*
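In rdma.c above, the "socket not bound yet" test changes from comparing a __be32 against 0 to ipv6_addr_any() on the in6_addr-sized rs_bound_addr. A minimal sketch of that check, with an illustrative helper name:

#include <net/ipv6.h>

static bool rds_sketch_unbound(const struct in6_addr *bound_addr)
{
	return ipv6_addr_any(bound_addr);	/* all-zero :: means not bound */
}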
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index fc59821f0a27..6b0f57c83a2a 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2009 Oracle. All rights reserved. 2 * Copyright (c) 2009, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -37,10 +37,15 @@
37#include "rdma_transport.h" 37#include "rdma_transport.h"
38#include "ib.h" 38#include "ib.h"
39 39
40/* Global IPv4 and IPv6 RDS RDMA listener cm_id */
40static struct rdma_cm_id *rds_rdma_listen_id; 41static struct rdma_cm_id *rds_rdma_listen_id;
42#if IS_ENABLED(CONFIG_IPV6)
43static struct rdma_cm_id *rds6_rdma_listen_id;
44#endif
41 45
42int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 46static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
43 struct rdma_cm_event *event) 47 struct rdma_cm_event *event,
48 bool isv6)
44{ 49{
45 /* this can be null in the listening path */ 50 /* this can be null in the listening path */
46 struct rds_connection *conn = cm_id->context; 51 struct rds_connection *conn = cm_id->context;
@@ -72,7 +77,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
72 77
73 switch (event->event) { 78 switch (event->event) {
74 case RDMA_CM_EVENT_CONNECT_REQUEST: 79 case RDMA_CM_EVENT_CONNECT_REQUEST:
75 ret = trans->cm_handle_connect(cm_id, event); 80 ret = trans->cm_handle_connect(cm_id, event, isv6);
76 break; 81 break;
77 82
78 case RDMA_CM_EVENT_ADDR_RESOLVED: 83 case RDMA_CM_EVENT_ADDR_RESOLVED:
@@ -90,7 +95,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
90 95
91 ibic = conn->c_transport_data; 96 ibic = conn->c_transport_data;
92 if (ibic && ibic->i_cm_id == cm_id) 97 if (ibic && ibic->i_cm_id == cm_id)
93 ret = trans->cm_initiate_connect(cm_id); 98 ret = trans->cm_initiate_connect(cm_id, isv6);
94 else 99 else
95 rds_conn_drop(conn); 100 rds_conn_drop(conn);
96 } 101 }
@@ -116,14 +121,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
116 121
117 case RDMA_CM_EVENT_DISCONNECTED: 122 case RDMA_CM_EVENT_DISCONNECTED:
118 rdsdebug("DISCONNECT event - dropping connection " 123 rdsdebug("DISCONNECT event - dropping connection "
119 "%pI4->%pI4\n", &conn->c_laddr, 124 "%pI6c->%pI6c\n", &conn->c_laddr,
120 &conn->c_faddr); 125 &conn->c_faddr);
121 rds_conn_drop(conn); 126 rds_conn_drop(conn);
122 break; 127 break;
123 128
124 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 129 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
125 if (conn) { 130 if (conn) {
126 pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n", 131 pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI6c->%pI6c\n",
127 &conn->c_laddr, &conn->c_faddr); 132 &conn->c_laddr, &conn->c_faddr);
128 rds_conn_drop(conn); 133 rds_conn_drop(conn);
129 } 134 }
@@ -146,13 +151,28 @@ out:
146 return ret; 151 return ret;
147} 152}
148 153
149static int rds_rdma_listen_init(void) 154int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
155 struct rdma_cm_event *event)
156{
157 return rds_rdma_cm_event_handler_cmn(cm_id, event, false);
158}
159
160#if IS_ENABLED(CONFIG_IPV6)
161int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
162 struct rdma_cm_event *event)
163{
164 return rds_rdma_cm_event_handler_cmn(cm_id, event, true);
165}
166#endif
167
168static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
169 struct sockaddr *sa,
170 struct rdma_cm_id **ret_cm_id)
150{ 171{
151 struct sockaddr_in sin;
152 struct rdma_cm_id *cm_id; 172 struct rdma_cm_id *cm_id;
153 int ret; 173 int ret;
154 174
155 cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL, 175 cm_id = rdma_create_id(&init_net, handler, NULL,
156 RDMA_PS_TCP, IB_QPT_RC); 176 RDMA_PS_TCP, IB_QPT_RC);
157 if (IS_ERR(cm_id)) { 177 if (IS_ERR(cm_id)) {
158 ret = PTR_ERR(cm_id); 178 ret = PTR_ERR(cm_id);
@@ -161,15 +181,11 @@ static int rds_rdma_listen_init(void)
161 return ret; 181 return ret;
162 } 182 }
163 183
164 sin.sin_family = AF_INET;
165 sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
166 sin.sin_port = (__force u16)htons(RDS_PORT);
167
168 /* 184 /*
169 * XXX I bet this binds the cm_id to a device. If we want to support 185 * XXX I bet this binds the cm_id to a device. If we want to support
170 * fail-over we'll have to take this into consideration. 186 * fail-over we'll have to take this into consideration.
171 */ 187 */
172 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 188 ret = rdma_bind_addr(cm_id, sa);
173 if (ret) { 189 if (ret) {
174 printk(KERN_ERR "RDS/RDMA: failed to setup listener, " 190 printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
175 "rdma_bind_addr() returned %d\n", ret); 191 "rdma_bind_addr() returned %d\n", ret);
@@ -185,7 +201,7 @@ static int rds_rdma_listen_init(void)
185 201
186 rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); 202 rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
187 203
188 rds_rdma_listen_id = cm_id; 204 *ret_cm_id = cm_id;
189 cm_id = NULL; 205 cm_id = NULL;
190out: 206out:
191 if (cm_id) 207 if (cm_id)
@@ -193,6 +209,45 @@ out:
193 return ret; 209 return ret;
194} 210}
195 211
212/* Initialize the RDS RDMA listeners. We create two listeners for
 213 * compatibility reasons. The one on RDS_PORT is used for IPv4
214 * requests only. The one on RDS_CM_PORT is used for IPv6 requests
215 * only. So only IPv6 enabled RDS module will communicate using this
216 * port.
217 */
218static int rds_rdma_listen_init(void)
219{
220 int ret;
221#if IS_ENABLED(CONFIG_IPV6)
222 struct sockaddr_in6 sin6;
223#endif
224 struct sockaddr_in sin;
225
226 sin.sin_family = PF_INET;
227 sin.sin_addr.s_addr = htonl(INADDR_ANY);
228 sin.sin_port = htons(RDS_PORT);
229 ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler,
230 (struct sockaddr *)&sin,
231 &rds_rdma_listen_id);
232 if (ret != 0)
233 return ret;
234
235#if IS_ENABLED(CONFIG_IPV6)
236 sin6.sin6_family = PF_INET6;
237 sin6.sin6_addr = in6addr_any;
238 sin6.sin6_port = htons(RDS_CM_PORT);
239 sin6.sin6_scope_id = 0;
240 sin6.sin6_flowinfo = 0;
241 ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler,
242 (struct sockaddr *)&sin6,
243 &rds6_rdma_listen_id);
244 /* Keep going even when IPv6 is not enabled in the system. */
245 if (ret != 0)
246 rdsdebug("Cannot set up IPv6 RDMA listener\n");
247#endif
248 return 0;
249}
250
196static void rds_rdma_listen_stop(void) 251static void rds_rdma_listen_stop(void)
197{ 252{
198 if (rds_rdma_listen_id) { 253 if (rds_rdma_listen_id) {
@@ -200,6 +255,13 @@ static void rds_rdma_listen_stop(void)
200 rdma_destroy_id(rds_rdma_listen_id); 255 rdma_destroy_id(rds_rdma_listen_id);
201 rds_rdma_listen_id = NULL; 256 rds_rdma_listen_id = NULL;
202 } 257 }
258#if IS_ENABLED(CONFIG_IPV6)
259 if (rds6_rdma_listen_id) {
260 rdsdebug("cm %p\n", rds6_rdma_listen_id);
261 rdma_destroy_id(rds6_rdma_listen_id);
262 rds6_rdma_listen_id = NULL;
263 }
264#endif
203} 265}
204 266
205static int rds_rdma_init(void) 267static int rds_rdma_init(void)
@@ -229,4 +291,3 @@ module_exit(rds_rdma_exit);
229MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); 291MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
230MODULE_DESCRIPTION("RDS: IB transport"); 292MODULE_DESCRIPTION("RDS: IB transport");
231MODULE_LICENSE("Dual BSD/GPL"); 293MODULE_LICENSE("Dual BSD/GPL");
232
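rds_rdma_listen_init() above now sets up two wildcard listeners through the common helper: an IPv4 one on RDS_PORT and, when IPv6 is enabled, an IPv6 one on RDS_CM_PORT. A minimal sketch of the two sockaddrs that end up in rdma_bind_addr(); the port values are those defined by the patch, and the helper name is illustrative:

#include <linux/in.h>
#include <linux/in6.h>
#include <linux/socket.h>
#include <linux/string.h>

static void rds_sketch_listen_addrs(struct sockaddr_in *v4,
				    struct sockaddr_in6 *v6)
{
	memset(v4, 0, sizeof(*v4));
	v4->sin_family = AF_INET;
	v4->sin_addr.s_addr = htonl(INADDR_ANY);
	v4->sin_port = htons(18634);		/* RDS_PORT */

	memset(v6, 0, sizeof(*v6));
	v6->sin6_family = AF_INET6;
	v6->sin6_addr = in6addr_any;
	v6->sin6_port = htons(16385);		/* RDS_CM_PORT */
}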
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index d309c4430124..200d3134aaae 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -6,11 +6,16 @@
6#include <rdma/rdma_cm.h> 6#include <rdma/rdma_cm.h>
7#include "rds.h" 7#include "rds.h"
8 8
9/* RDMA_CM also uses 16385 as the listener port. */
10#define RDS_CM_PORT 16385
11
9#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 12#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
10 13
11int rds_rdma_conn_connect(struct rds_connection *conn); 14int rds_rdma_conn_connect(struct rds_connection *conn);
12int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 15int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
13 struct rdma_cm_event *event); 16 struct rdma_cm_event *event);
17int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
18 struct rdma_cm_event *event);
14 19
15/* from ib.c */ 20/* from ib.c */
16extern struct rds_transport rds_ib_transport; 21extern struct rds_transport rds_ib_transport;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index f2272fb8cd45..c4dcf654d8fe 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -10,6 +10,7 @@
10#include <linux/rds.h> 10#include <linux/rds.h>
11#include <linux/rhashtable.h> 11#include <linux/rhashtable.h>
12#include <linux/refcount.h> 12#include <linux/refcount.h>
13#include <linux/in6.h>
13 14
14#include "info.h" 15#include "info.h"
15 16
@@ -23,11 +24,13 @@
23#define RDS_PROTOCOL_MINOR(v) ((v) & 255) 24#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
24#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) 25#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
25 26
26/* 27/* The following ports, 16385, 18634, 18635, are registered with IANA as
27 * XXX randomly chosen, but at least seems to be unused: 28 * the ports to be used for RDS over TCP and UDP. Currently, only RDS over
28 * # 18464-18768 Unassigned 29 * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value
29 * We should do better. We want a reserved port to discourage unpriv'ed 30 * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After
30 * userspace from listening. 31 * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept
32 * to ensure compatibility with older RDS modules. Those ports are defined
33 * in each transport's header file.
31 */ 34 */
32#define RDS_PORT 18634 35#define RDS_PORT 18634
33 36
@@ -61,7 +64,7 @@ void rdsdebug(char *fmt, ...)
61 64
62struct rds_cong_map { 65struct rds_cong_map {
63 struct rb_node m_rb_node; 66 struct rb_node m_rb_node;
64 __be32 m_addr; 67 struct in6_addr m_addr;
65 wait_queue_head_t m_waitq; 68 wait_queue_head_t m_waitq;
66 struct list_head m_conn_list; 69 struct list_head m_conn_list;
67 unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; 70 unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
@@ -136,11 +139,14 @@ struct rds_conn_path {
136/* One rds_connection per RDS address pair */ 139/* One rds_connection per RDS address pair */
137struct rds_connection { 140struct rds_connection {
138 struct hlist_node c_hash_node; 141 struct hlist_node c_hash_node;
139 __be32 c_laddr; 142 struct in6_addr c_laddr;
140 __be32 c_faddr; 143 struct in6_addr c_faddr;
144 int c_dev_if; /* ifindex used for this conn */
145 int c_bound_if; /* ifindex of c_laddr */
141 unsigned int c_loopback:1, 146 unsigned int c_loopback:1,
147 c_isv6:1,
142 c_ping_triggered:1, 148 c_ping_triggered:1,
143 c_pad_to_32:30; 149 c_pad_to_32:29;
144 int c_npaths; 150 int c_npaths;
145 struct rds_connection *c_passive; 151 struct rds_connection *c_passive;
146 struct rds_transport *c_trans; 152 struct rds_transport *c_trans;
@@ -269,7 +275,7 @@ struct rds_incoming {
269 struct rds_conn_path *i_conn_path; 275 struct rds_conn_path *i_conn_path;
270 struct rds_header i_hdr; 276 struct rds_header i_hdr;
271 unsigned long i_rx_jiffies; 277 unsigned long i_rx_jiffies;
272 __be32 i_saddr; 278 struct in6_addr i_saddr;
273 279
274 rds_rdma_cookie_t i_rdma_cookie; 280 rds_rdma_cookie_t i_rdma_cookie;
275 struct timeval i_rx_tstamp; 281 struct timeval i_rx_tstamp;
@@ -386,7 +392,7 @@ struct rds_message {
386 struct list_head m_conn_item; 392 struct list_head m_conn_item;
387 struct rds_incoming m_inc; 393 struct rds_incoming m_inc;
388 u64 m_ack_seq; 394 u64 m_ack_seq;
389 __be32 m_daddr; 395 struct in6_addr m_daddr;
390 unsigned long m_flags; 396 unsigned long m_flags;
391 397
392 /* Never access m_rs without holding m_rs_lock. 398 /* Never access m_rs without holding m_rs_lock.
@@ -464,6 +470,8 @@ struct rds_message {
464 struct scatterlist *op_sg; 470 struct scatterlist *op_sg;
465 } data; 471 } data;
466 }; 472 };
473
474 struct rds_conn_path *m_conn_path;
467}; 475};
468 476
469/* 477/*
@@ -519,7 +527,8 @@ struct rds_transport {
519 t_mp_capable:1; 527 t_mp_capable:1;
520 unsigned int t_type; 528 unsigned int t_type;
521 529
522 int (*laddr_check)(struct net *net, __be32 addr); 530 int (*laddr_check)(struct net *net, const struct in6_addr *addr,
531 __u32 scope_id);
523 int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); 532 int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
524 void (*conn_free)(void *data); 533 void (*conn_free)(void *data);
525 int (*conn_path_connect)(struct rds_conn_path *cp); 534 int (*conn_path_connect)(struct rds_conn_path *cp);
@@ -535,8 +544,8 @@ struct rds_transport {
535 void (*inc_free)(struct rds_incoming *inc); 544 void (*inc_free)(struct rds_incoming *inc);
536 545
537 int (*cm_handle_connect)(struct rdma_cm_id *cm_id, 546 int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
538 struct rdma_cm_event *event); 547 struct rdma_cm_event *event, bool isv6);
539 int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); 548 int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6);
540 void (*cm_connect_complete)(struct rds_connection *conn, 549 void (*cm_connect_complete)(struct rds_connection *conn,
541 struct rdma_cm_event *event); 550 struct rdma_cm_event *event);
542 551
@@ -544,13 +553,20 @@ struct rds_transport {
544 unsigned int avail); 553 unsigned int avail);
545 void (*exit)(void); 554 void (*exit)(void);
546 void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, 555 void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
547 struct rds_sock *rs, u32 *key_ret); 556 struct rds_sock *rs, u32 *key_ret,
557 struct rds_connection *conn);
548 void (*sync_mr)(void *trans_private, int direction); 558 void (*sync_mr)(void *trans_private, int direction);
549 void (*free_mr)(void *trans_private, int invalidate); 559 void (*free_mr)(void *trans_private, int invalidate);
550 void (*flush_mrs)(void); 560 void (*flush_mrs)(void);
551 bool (*t_unloading)(struct rds_connection *conn); 561 bool (*t_unloading)(struct rds_connection *conn);
552}; 562};
553 563
564/* Bind hash table key length. It is the sum of the size of a struct
565 * in6_addr, a scope_id and a port.
566 */
567#define RDS_BOUND_KEY_LEN \
568 (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16))
569
554struct rds_sock { 570struct rds_sock {
555 struct sock rs_sk; 571 struct sock rs_sk;
556 572
@@ -562,10 +578,14 @@ struct rds_sock {
562 * support. 578 * support.
563 */ 579 */
564 struct rhash_head rs_bound_node; 580 struct rhash_head rs_bound_node;
565 u64 rs_bound_key; 581 u8 rs_bound_key[RDS_BOUND_KEY_LEN];
566 __be32 rs_bound_addr; 582 struct sockaddr_in6 rs_bound_sin6;
567 __be32 rs_conn_addr; 583#define rs_bound_addr rs_bound_sin6.sin6_addr
568 __be16 rs_bound_port; 584#define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3]
585#define rs_bound_port rs_bound_sin6.sin6_port
586#define rs_bound_scope_id rs_bound_sin6.sin6_scope_id
587 struct in6_addr rs_conn_addr;
588#define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3]
569 __be16 rs_conn_port; 589 __be16 rs_conn_port;
570 struct rds_transport *rs_transport; 590 struct rds_transport *rs_transport;
571 591
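
The bind hash key changes here from a packed u64 to a byte array: RDS_BOUND_KEY_LEN covers the 16-byte in6_addr, the 4-byte scope_id and the 2-byte port. A minimal userspace sketch of such a 22-byte key follows; the helper name, the userspace types and the field order are assumptions (the real packing lives in net/rds/bind.c, which is not part of this hunk).

#include <stdint.h>
#include <string.h>
#include <netinet/in.h>

#define RDS_BOUND_KEY_LEN \
	(sizeof(struct in6_addr) + sizeof(uint32_t) + sizeof(uint16_t))

/* Hypothetical helper, illustration only: the kernel's packing (and its
 * field order) is defined in net/rds/bind.c and may differ.
 */
void make_bound_key(uint8_t key[RDS_BOUND_KEY_LEN],
		    const struct in6_addr *addr,
		    uint32_t scope_id, uint16_t port_be)
{
	memcpy(key, addr, sizeof(*addr));               /* 16 bytes: address  */
	memcpy(key + 16, &scope_id, sizeof(scope_id));  /*  4 bytes: scope_id */
	memcpy(key + 20, &port_be, sizeof(port_be));    /*  2 bytes: port     */
}
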
@@ -701,7 +721,8 @@ extern wait_queue_head_t rds_poll_waitq;
701/* bind.c */ 721/* bind.c */
702int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); 722int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
703void rds_remove_bound(struct rds_sock *rs); 723void rds_remove_bound(struct rds_sock *rs);
704struct rds_sock *rds_find_bound(__be32 addr, __be16 port); 724struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
725 __u32 scope_id);
705int rds_bind_lock_init(void); 726int rds_bind_lock_init(void);
706void rds_bind_lock_destroy(void); 727void rds_bind_lock_destroy(void);
707 728
@@ -720,16 +741,20 @@ void rds_cong_remove_socket(struct rds_sock *);
720void rds_cong_exit(void); 741void rds_cong_exit(void);
721struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); 742struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
722 743
723/* conn.c */ 744/* connection.c */
724extern u32 rds_gen_num; 745extern u32 rds_gen_num;
725int rds_conn_init(void); 746int rds_conn_init(void);
726void rds_conn_exit(void); 747void rds_conn_exit(void);
727struct rds_connection *rds_conn_create(struct net *net, 748struct rds_connection *rds_conn_create(struct net *net,
728 __be32 laddr, __be32 faddr, 749 const struct in6_addr *laddr,
729 struct rds_transport *trans, gfp_t gfp); 750 const struct in6_addr *faddr,
751 struct rds_transport *trans, gfp_t gfp,
752 int dev_if);
730struct rds_connection *rds_conn_create_outgoing(struct net *net, 753struct rds_connection *rds_conn_create_outgoing(struct net *net,
731 __be32 laddr, __be32 faddr, 754 const struct in6_addr *laddr,
732 struct rds_transport *trans, gfp_t gfp); 755 const struct in6_addr *faddr,
756 struct rds_transport *trans,
757 gfp_t gfp, int dev_if);
733void rds_conn_shutdown(struct rds_conn_path *cpath); 758void rds_conn_shutdown(struct rds_conn_path *cpath);
734void rds_conn_destroy(struct rds_connection *conn); 759void rds_conn_destroy(struct rds_connection *conn);
735void rds_conn_drop(struct rds_connection *conn); 760void rds_conn_drop(struct rds_connection *conn);
@@ -840,11 +865,12 @@ void rds_page_exit(void);
840 865
841/* recv.c */ 866/* recv.c */
842void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 867void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
843 __be32 saddr); 868 struct in6_addr *saddr);
844void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, 869void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
845 __be32 saddr); 870 struct in6_addr *saddr);
846void rds_inc_put(struct rds_incoming *inc); 871void rds_inc_put(struct rds_incoming *inc);
847void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, 872void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
873 struct in6_addr *daddr,
848 struct rds_incoming *inc, gfp_t gfp); 874 struct rds_incoming *inc, gfp_t gfp);
849int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 875int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
850 int msg_flags); 876 int msg_flags);
@@ -853,13 +879,17 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
853void rds_inc_info_copy(struct rds_incoming *inc, 879void rds_inc_info_copy(struct rds_incoming *inc,
854 struct rds_info_iterator *iter, 880 struct rds_info_iterator *iter,
855 __be32 saddr, __be32 daddr, int flip); 881 __be32 saddr, __be32 daddr, int flip);
882void rds6_inc_info_copy(struct rds_incoming *inc,
883 struct rds_info_iterator *iter,
884 struct in6_addr *saddr, struct in6_addr *daddr,
885 int flip);
856 886
857/* send.c */ 887/* send.c */
858int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); 888int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
859void rds_send_path_reset(struct rds_conn_path *conn); 889void rds_send_path_reset(struct rds_conn_path *conn);
860int rds_send_xmit(struct rds_conn_path *cp); 890int rds_send_xmit(struct rds_conn_path *cp);
861struct sockaddr_in; 891struct sockaddr_in;
862void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); 892void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest);
863typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); 893typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
864void rds_send_drop_acked(struct rds_connection *conn, u64 ack, 894void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
865 is_acked_func is_acked); 895 is_acked_func is_acked);
@@ -946,11 +976,14 @@ void rds_send_worker(struct work_struct *);
946void rds_recv_worker(struct work_struct *); 976void rds_recv_worker(struct work_struct *);
947void rds_connect_path_complete(struct rds_conn_path *conn, int curr); 977void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
948void rds_connect_complete(struct rds_connection *conn); 978void rds_connect_complete(struct rds_connection *conn);
979int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2);
949 980
950/* transport.c */ 981/* transport.c */
951void rds_trans_register(struct rds_transport *trans); 982void rds_trans_register(struct rds_transport *trans);
952void rds_trans_unregister(struct rds_transport *trans); 983void rds_trans_unregister(struct rds_transport *trans);
953struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); 984struct rds_transport *rds_trans_get_preferred(struct net *net,
985 const struct in6_addr *addr,
986 __u32 scope_id);
954void rds_trans_put(struct rds_transport *trans); 987void rds_trans_put(struct rds_transport *trans);
955unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, 988unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
956 unsigned int avail); 989 unsigned int avail);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 192ac6f78ded..504cd6bcc54c 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -41,14 +41,14 @@
41#include "rds.h" 41#include "rds.h"
42 42
43void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 43void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
44 __be32 saddr) 44 struct in6_addr *saddr)
45{ 45{
46 int i; 46 int i;
47 47
48 refcount_set(&inc->i_refcount, 1); 48 refcount_set(&inc->i_refcount, 1);
49 INIT_LIST_HEAD(&inc->i_item); 49 INIT_LIST_HEAD(&inc->i_item);
50 inc->i_conn = conn; 50 inc->i_conn = conn;
51 inc->i_saddr = saddr; 51 inc->i_saddr = *saddr;
52 inc->i_rdma_cookie = 0; 52 inc->i_rdma_cookie = 0;
53 inc->i_rx_tstamp.tv_sec = 0; 53 inc->i_rx_tstamp.tv_sec = 0;
54 inc->i_rx_tstamp.tv_usec = 0; 54 inc->i_rx_tstamp.tv_usec = 0;
@@ -59,13 +59,13 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
59EXPORT_SYMBOL_GPL(rds_inc_init); 59EXPORT_SYMBOL_GPL(rds_inc_init);
60 60
61void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, 61void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
62 __be32 saddr) 62 struct in6_addr *saddr)
63{ 63{
64 refcount_set(&inc->i_refcount, 1); 64 refcount_set(&inc->i_refcount, 1);
65 INIT_LIST_HEAD(&inc->i_item); 65 INIT_LIST_HEAD(&inc->i_item);
66 inc->i_conn = cp->cp_conn; 66 inc->i_conn = cp->cp_conn;
67 inc->i_conn_path = cp; 67 inc->i_conn_path = cp;
68 inc->i_saddr = saddr; 68 inc->i_saddr = *saddr;
69 inc->i_rdma_cookie = 0; 69 inc->i_rdma_cookie = 0;
70 inc->i_rx_tstamp.tv_sec = 0; 70 inc->i_rx_tstamp.tv_sec = 0;
71 inc->i_rx_tstamp.tv_usec = 0; 71 inc->i_rx_tstamp.tv_usec = 0;
@@ -110,7 +110,7 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
110 110
111 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); 111 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
112 112
113 rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " 113 rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d "
114 "now_cong %d delta %d\n", 114 "now_cong %d delta %d\n",
115 rs, &rs->rs_bound_addr, 115 rs, &rs->rs_bound_addr,
116 ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, 116 ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
@@ -260,7 +260,7 @@ static void rds_start_mprds(struct rds_connection *conn)
260 struct rds_conn_path *cp; 260 struct rds_conn_path *cp;
261 261
262 if (conn->c_npaths > 1 && 262 if (conn->c_npaths > 1 &&
263 IS_CANONICAL(conn->c_laddr, conn->c_faddr)) { 263 rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) {
264 for (i = 0; i < conn->c_npaths; i++) { 264 for (i = 0; i < conn->c_npaths; i++) {
265 cp = &conn->c_path[i]; 265 cp = &conn->c_path[i];
266 rds_conn_path_connect_if_down(cp); 266 rds_conn_path_connect_if_down(cp);
@@ -284,7 +284,8 @@ static void rds_start_mprds(struct rds_connection *conn)
284 * conn. This lets loopback, who only has one conn for both directions, 284 * conn. This lets loopback, who only has one conn for both directions,
285 * tell us which roles the addrs in the conn are playing for this message. 285 * tell us which roles the addrs in the conn are playing for this message.
286 */ 286 */
287void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, 287void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
288 struct in6_addr *daddr,
288 struct rds_incoming *inc, gfp_t gfp) 289 struct rds_incoming *inc, gfp_t gfp)
289{ 290{
290 struct rds_sock *rs = NULL; 291 struct rds_sock *rs = NULL;
@@ -339,7 +340,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
339 340
340 if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { 341 if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
341 if (inc->i_hdr.h_sport == 0) { 342 if (inc->i_hdr.h_sport == 0) {
342 rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr); 343 rdsdebug("ignore ping with 0 sport from %pI6c\n",
344 saddr);
343 goto out; 345 goto out;
344 } 346 }
345 rds_stats_inc(s_recv_ping); 347 rds_stats_inc(s_recv_ping);
@@ -362,7 +364,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
362 goto out; 364 goto out;
363 } 365 }
364 366
365 rs = rds_find_bound(daddr, inc->i_hdr.h_dport); 367 rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if);
366 if (!rs) { 368 if (!rs) {
367 rds_stats_inc(s_recv_drop_no_sock); 369 rds_stats_inc(s_recv_drop_no_sock);
368 goto out; 370 goto out;
@@ -625,6 +627,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
625 struct rds_sock *rs = rds_sk_to_rs(sk); 627 struct rds_sock *rs = rds_sk_to_rs(sk);
626 long timeo; 628 long timeo;
627 int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; 629 int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
630 DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
628 DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); 631 DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
629 struct rds_incoming *inc = NULL; 632 struct rds_incoming *inc = NULL;
630 633
@@ -673,7 +676,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
673 break; 676 break;
674 } 677 }
675 678
676 rdsdebug("copying inc %p from %pI4:%u to user\n", inc, 679 rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
677 &inc->i_conn->c_faddr, 680 &inc->i_conn->c_faddr,
678 ntohs(inc->i_hdr.h_sport)); 681 ntohs(inc->i_hdr.h_sport));
679 ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); 682 ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
@@ -707,12 +710,26 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
707 710
708 rds_stats_inc(s_recv_delivered); 711 rds_stats_inc(s_recv_delivered);
709 712
710 if (sin) { 713 if (msg->msg_name) {
711 sin->sin_family = AF_INET; 714 if (ipv6_addr_v4mapped(&inc->i_saddr)) {
712 sin->sin_port = inc->i_hdr.h_sport; 715 sin = (struct sockaddr_in *)msg->msg_name;
713 sin->sin_addr.s_addr = inc->i_saddr; 716
714 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 717 sin->sin_family = AF_INET;
715 msg->msg_namelen = sizeof(*sin); 718 sin->sin_port = inc->i_hdr.h_sport;
719 sin->sin_addr.s_addr =
720 inc->i_saddr.s6_addr32[3];
721 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
722 msg->msg_namelen = sizeof(*sin);
723 } else {
724 sin6 = (struct sockaddr_in6 *)msg->msg_name;
725
726 sin6->sin6_family = AF_INET6;
727 sin6->sin6_port = inc->i_hdr.h_sport;
728 sin6->sin6_addr = inc->i_saddr;
729 sin6->sin6_flowinfo = 0;
730 sin6->sin6_scope_id = rs->rs_bound_scope_id;
731 msg->msg_namelen = sizeof(*sin6);
732 }
716 } 733 }
717 break; 734 break;
718 } 735 }
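
With this change, recvmsg() fills msg_name with either a sockaddr_in (peer address is IPv4-mapped) or a sockaddr_in6 (native IPv6, scope_id taken from the bound socket). A userspace receiver should therefore pass a buffer large enough for sockaddr_in6 and switch on the returned family. Illustrative sketch only, assuming an AF_RDS socket that has already been bound:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

ssize_t rds_recv_one(int fd, void *buf, size_t len)
{
	struct sockaddr_storage peer;   /* large enough for sockaddr_in6 */
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg;
	char ip[INET6_ADDRSTRLEN];
	ssize_t n;

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = &peer;
	msg.msg_namelen = sizeof(peer);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	n = recvmsg(fd, &msg, 0);
	if (n < 0)
		return n;

	if (peer.ss_family == AF_INET) {
		struct sockaddr_in *sin = (struct sockaddr_in *)&peer;

		inet_ntop(AF_INET, &sin->sin_addr, ip, sizeof(ip));
		printf("from %s:%u\n", ip, ntohs(sin->sin_port));
	} else if (peer.ss_family == AF_INET6) {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peer;

		inet_ntop(AF_INET6, &sin6->sin6_addr, ip, sizeof(ip));
		printf("from [%s]:%u\n", ip, ntohs(sin6->sin6_port));
	}
	return n;
}
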
@@ -775,3 +792,30 @@ void rds_inc_info_copy(struct rds_incoming *inc,
775 792
776 rds_info_copy(iter, &minfo, sizeof(minfo)); 793 rds_info_copy(iter, &minfo, sizeof(minfo));
777} 794}
795
796#if IS_ENABLED(CONFIG_IPV6)
797void rds6_inc_info_copy(struct rds_incoming *inc,
798 struct rds_info_iterator *iter,
799 struct in6_addr *saddr, struct in6_addr *daddr,
800 int flip)
801{
802 struct rds6_info_message minfo6;
803
804 minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
805 minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
806
807 if (flip) {
808 minfo6.laddr = *daddr;
809 minfo6.faddr = *saddr;
810 minfo6.lport = inc->i_hdr.h_dport;
811 minfo6.fport = inc->i_hdr.h_sport;
812 } else {
813 minfo6.laddr = *saddr;
814 minfo6.faddr = *daddr;
815 minfo6.lport = inc->i_hdr.h_sport;
816 minfo6.fport = inc->i_hdr.h_dport;
817 }
818
819 rds_info_copy(iter, &minfo6, sizeof(minfo6));
820}
821#endif
diff --git a/net/rds/send.c b/net/rds/send.c
index 94c7f74909be..57b3d5a8b2db 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -709,7 +709,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
709} 709}
710EXPORT_SYMBOL_GPL(rds_send_drop_acked); 710EXPORT_SYMBOL_GPL(rds_send_drop_acked);
711 711
712void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) 712void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest)
713{ 713{
714 struct rds_message *rm, *tmp; 714 struct rds_message *rm, *tmp;
715 struct rds_connection *conn; 715 struct rds_connection *conn;
@@ -721,8 +721,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
721 spin_lock_irqsave(&rs->rs_lock, flags); 721 spin_lock_irqsave(&rs->rs_lock, flags);
722 722
723 list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { 723 list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
724 if (dest && (dest->sin_addr.s_addr != rm->m_daddr || 724 if (dest &&
725 dest->sin_port != rm->m_inc.i_hdr.h_dport)) 725 (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) ||
726 dest->sin6_port != rm->m_inc.i_hdr.h_dport))
726 continue; 727 continue;
727 728
728 list_move(&rm->m_sock_item, &list); 729 list_move(&rm->m_sock_item, &list);
@@ -1059,8 +1060,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1059{ 1060{
1060 struct sock *sk = sock->sk; 1061 struct sock *sk = sock->sk;
1061 struct rds_sock *rs = rds_sk_to_rs(sk); 1062 struct rds_sock *rs = rds_sk_to_rs(sk);
1063 DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
1062 DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); 1064 DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
1063 __be32 daddr;
1064 __be16 dport; 1065 __be16 dport;
1065 struct rds_message *rm = NULL; 1066 struct rds_message *rm = NULL;
1066 struct rds_connection *conn; 1067 struct rds_connection *conn;
@@ -1069,10 +1070,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1069 int nonblock = msg->msg_flags & MSG_DONTWAIT; 1070 int nonblock = msg->msg_flags & MSG_DONTWAIT;
1070 long timeo = sock_sndtimeo(sk, nonblock); 1071 long timeo = sock_sndtimeo(sk, nonblock);
1071 struct rds_conn_path *cpath; 1072 struct rds_conn_path *cpath;
1073 struct in6_addr daddr;
1074 __u32 scope_id = 0;
1072 size_t total_payload_len = payload_len, rdma_payload_len = 0; 1075 size_t total_payload_len = payload_len, rdma_payload_len = 0;
1073 bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && 1076 bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
1074 sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); 1077 sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
1075 int num_sgs = ceil(payload_len, PAGE_SIZE); 1078 int num_sgs = ceil(payload_len, PAGE_SIZE);
1079 int namelen;
1076 1080
1077 /* Mirror Linux UDP mirror of BSD error message compatibility */ 1081 /* Mirror Linux UDP mirror of BSD error message compatibility */
1078 /* XXX: Perhaps MSG_MORE someday */ 1082 /* XXX: Perhaps MSG_MORE someday */
@@ -1081,27 +1085,108 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1081 goto out; 1085 goto out;
1082 } 1086 }
1083 1087
1084 if (msg->msg_namelen) { 1088 namelen = msg->msg_namelen;
1085 /* XXX fail non-unicast destination IPs? */ 1089 if (namelen != 0) {
1086 if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) { 1090 if (namelen < sizeof(*usin)) {
1091 ret = -EINVAL;
1092 goto out;
1093 }
1094 switch (usin->sin_family) {
1095 case AF_INET:
1096 if (usin->sin_addr.s_addr == htonl(INADDR_ANY) ||
1097 usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
1098 IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) {
1099 ret = -EINVAL;
1100 goto out;
1101 }
1102 ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr);
1103 dport = usin->sin_port;
1104 break;
1105
1106#if IS_ENABLED(CONFIG_IPV6)
1107 case AF_INET6: {
1108 int addr_type;
1109
1110 if (namelen < sizeof(*sin6)) {
1111 ret = -EINVAL;
1112 goto out;
1113 }
1114 addr_type = ipv6_addr_type(&sin6->sin6_addr);
1115 if (!(addr_type & IPV6_ADDR_UNICAST)) {
1116 __be32 addr4;
1117
1118 if (!(addr_type & IPV6_ADDR_MAPPED)) {
1119 ret = -EINVAL;
1120 goto out;
1121 }
1122
1123 /* It is a mapped address. Need to do some
1124 * sanity checks.
1125 */
1126 addr4 = sin6->sin6_addr.s6_addr32[3];
1127 if (addr4 == htonl(INADDR_ANY) ||
1128 addr4 == htonl(INADDR_BROADCAST) ||
1129 IN_MULTICAST(ntohl(addr4))) {
1130 ret = -EINVAL;
1131 goto out;
1132 }
1133 }
1134 if (addr_type & IPV6_ADDR_LINKLOCAL) {
1135 if (sin6->sin6_scope_id == 0) {
1136 ret = -EINVAL;
1137 goto out;
1138 }
1139 scope_id = sin6->sin6_scope_id;
1140 }
1141
1142 daddr = sin6->sin6_addr;
1143 dport = sin6->sin6_port;
1144 break;
1145 }
1146#endif
1147
1148 default:
1087 ret = -EINVAL; 1149 ret = -EINVAL;
1088 goto out; 1150 goto out;
1089 } 1151 }
1090 daddr = usin->sin_addr.s_addr;
1091 dport = usin->sin_port;
1092 } else { 1152 } else {
1093 /* We only care about consistency with ->connect() */ 1153 /* We only care about consistency with ->connect() */
1094 lock_sock(sk); 1154 lock_sock(sk);
1095 daddr = rs->rs_conn_addr; 1155 daddr = rs->rs_conn_addr;
1096 dport = rs->rs_conn_port; 1156 dport = rs->rs_conn_port;
1157 scope_id = rs->rs_bound_scope_id;
1097 release_sock(sk); 1158 release_sock(sk);
1098 } 1159 }
1099 1160
1100 lock_sock(sk); 1161 lock_sock(sk);
1101 if (daddr == 0 || rs->rs_bound_addr == 0) { 1162 if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) {
1102 release_sock(sk); 1163 release_sock(sk);
1103 ret = -ENOTCONN; /* XXX not a great errno */ 1164 ret = -ENOTCONN;
1104 goto out; 1165 goto out;
1166 } else if (namelen != 0) {
1167 /* Cannot send to an IPv4 address using an IPv6 source
1168 * address and cannot send to an IPv6 address using an
1169 * IPv4 source address.
1170 */
1171 if (ipv6_addr_v4mapped(&daddr) ^
1172 ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
1173 release_sock(sk);
1174 ret = -EOPNOTSUPP;
1175 goto out;
1176 }
1177 /* If the socket is already bound to a link local address,
1178 * it can only send to peers on the same link. But allow
 1179 * communicating between link local and non-link local addresses.
1180 */
1181 if (scope_id != rs->rs_bound_scope_id) {
1182 if (!scope_id) {
1183 scope_id = rs->rs_bound_scope_id;
1184 } else if (rs->rs_bound_scope_id) {
1185 release_sock(sk);
1186 ret = -EINVAL;
1187 goto out;
1188 }
1189 }
1105 } 1190 }
1106 release_sock(sk); 1191 release_sock(sk);
1107 1192
@@ -1155,13 +1240,14 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1155 1240
1156 /* rds_conn_create has a spinlock that runs with IRQ off. 1241 /* rds_conn_create has a spinlock that runs with IRQ off.
1157 * Caching the conn in the socket helps a lot. */ 1242 * Caching the conn in the socket helps a lot. */
1158 if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) 1243 if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr))
1159 conn = rs->rs_conn; 1244 conn = rs->rs_conn;
1160 else { 1245 else {
1161 conn = rds_conn_create_outgoing(sock_net(sock->sk), 1246 conn = rds_conn_create_outgoing(sock_net(sock->sk),
1162 rs->rs_bound_addr, daddr, 1247 &rs->rs_bound_addr, &daddr,
1163 rs->rs_transport, 1248 rs->rs_transport,
1164 sock->sk->sk_allocation); 1249 sock->sk->sk_allocation,
1250 scope_id);
1165 if (IS_ERR(conn)) { 1251 if (IS_ERR(conn)) {
1166 ret = PTR_ERR(conn); 1252 ret = PTR_ERR(conn);
1167 goto out; 1253 goto out;
@@ -1169,6 +1255,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1169 rs->rs_conn = conn; 1255 rs->rs_conn = conn;
1170 } 1256 }
1171 1257
1258 if (conn->c_trans->t_mp_capable)
1259 cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)];
1260 else
1261 cpath = &conn->c_path[0];
1262
1263 rm->m_conn_path = cpath;
1264
1172 /* Parse any control messages the user may have included. */ 1265 /* Parse any control messages the user may have included. */
1173 ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); 1266 ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
1174 if (ret) { 1267 if (ret) {
@@ -1192,11 +1285,6 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1192 goto out; 1285 goto out;
1193 } 1286 }
1194 1287
1195 if (conn->c_trans->t_mp_capable)
1196 cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)];
1197 else
1198 cpath = &conn->c_path[0];
1199
1200 if (rds_destroy_pending(conn)) { 1288 if (rds_destroy_pending(conn)) {
1201 ret = -EAGAIN; 1289 ret = -EAGAIN;
1202 goto out; 1290 goto out;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 351a28474667..b9bbcf3d6c63 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -37,6 +37,7 @@
37#include <net/tcp.h> 37#include <net/tcp.h>
38#include <net/net_namespace.h> 38#include <net/net_namespace.h>
39#include <net/netns/generic.h> 39#include <net/netns/generic.h>
40#include <net/addrconf.h>
40 41
41#include "rds.h" 42#include "rds.h"
42#include "tcp.h" 43#include "tcp.h"
@@ -44,7 +45,14 @@
44/* only for info exporting */ 45/* only for info exporting */
45static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); 46static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
46static LIST_HEAD(rds_tcp_tc_list); 47static LIST_HEAD(rds_tcp_tc_list);
48
49/* rds_tcp_tc_count counts only IPv4 connections.
50 * rds6_tcp_tc_count counts both IPv4 and IPv6 connections.
51 */
47static unsigned int rds_tcp_tc_count; 52static unsigned int rds_tcp_tc_count;
53#if IS_ENABLED(CONFIG_IPV6)
54static unsigned int rds6_tcp_tc_count;
55#endif
48 56
49/* Track rds_tcp_connection structs so they can be cleaned up */ 57/* Track rds_tcp_connection structs so they can be cleaned up */
50static DEFINE_SPINLOCK(rds_tcp_conn_lock); 58static DEFINE_SPINLOCK(rds_tcp_conn_lock);
@@ -111,7 +119,11 @@ void rds_tcp_restore_callbacks(struct socket *sock,
111 /* done under the callback_lock to serialize with write_space */ 119 /* done under the callback_lock to serialize with write_space */
112 spin_lock(&rds_tcp_tc_list_lock); 120 spin_lock(&rds_tcp_tc_list_lock);
113 list_del_init(&tc->t_list_item); 121 list_del_init(&tc->t_list_item);
114 rds_tcp_tc_count--; 122#if IS_ENABLED(CONFIG_IPV6)
123 rds6_tcp_tc_count--;
124#endif
125 if (!tc->t_cpath->cp_conn->c_isv6)
126 rds_tcp_tc_count--;
115 spin_unlock(&rds_tcp_tc_list_lock); 127 spin_unlock(&rds_tcp_tc_list_lock);
116 128
117 tc->t_sock = NULL; 129 tc->t_sock = NULL;
@@ -198,7 +210,11 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
198 /* done under the callback_lock to serialize with write_space */ 210 /* done under the callback_lock to serialize with write_space */
199 spin_lock(&rds_tcp_tc_list_lock); 211 spin_lock(&rds_tcp_tc_list_lock);
200 list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); 212 list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
201 rds_tcp_tc_count++; 213#if IS_ENABLED(CONFIG_IPV6)
214 rds6_tcp_tc_count++;
215#endif
216 if (!tc->t_cpath->cp_conn->c_isv6)
217 rds_tcp_tc_count++;
202 spin_unlock(&rds_tcp_tc_list_lock); 218 spin_unlock(&rds_tcp_tc_list_lock);
203 219
204 /* accepted sockets need our listen data ready undone */ 220 /* accepted sockets need our listen data ready undone */
@@ -219,6 +235,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
219 write_unlock_bh(&sock->sk->sk_callback_lock); 235 write_unlock_bh(&sock->sk->sk_callback_lock);
220} 236}
221 237
238/* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4
239 * connections for backward compatibility.
240 */
222static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, 241static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
223 struct rds_info_iterator *iter, 242 struct rds_info_iterator *iter,
224 struct rds_info_lengths *lens) 243 struct rds_info_lengths *lens)
@@ -226,8 +245,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
226 struct rds_info_tcp_socket tsinfo; 245 struct rds_info_tcp_socket tsinfo;
227 struct rds_tcp_connection *tc; 246 struct rds_tcp_connection *tc;
228 unsigned long flags; 247 unsigned long flags;
229 struct sockaddr_in sin;
230 struct socket *sock;
231 248
232 spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); 249 spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
233 250
@@ -235,16 +252,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
235 goto out; 252 goto out;
236 253
237 list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { 254 list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
255 struct inet_sock *inet = inet_sk(tc->t_sock->sk);
238 256
239 sock = tc->t_sock; 257 if (tc->t_cpath->cp_conn->c_isv6)
240 if (sock) { 258 continue;
241 sock->ops->getname(sock, (struct sockaddr *)&sin, 0); 259
242 tsinfo.local_addr = sin.sin_addr.s_addr; 260 tsinfo.local_addr = inet->inet_saddr;
243 tsinfo.local_port = sin.sin_port; 261 tsinfo.local_port = inet->inet_sport;
244 sock->ops->getname(sock, (struct sockaddr *)&sin, 1); 262 tsinfo.peer_addr = inet->inet_daddr;
245 tsinfo.peer_addr = sin.sin_addr.s_addr; 263 tsinfo.peer_port = inet->inet_dport;
246 tsinfo.peer_port = sin.sin_port;
247 }
248 264
249 tsinfo.hdr_rem = tc->t_tinc_hdr_rem; 265 tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
250 tsinfo.data_rem = tc->t_tinc_data_rem; 266 tsinfo.data_rem = tc->t_tinc_data_rem;
@@ -262,10 +278,82 @@ out:
262 spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); 278 spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
263} 279}
264 280
265static int rds_tcp_laddr_check(struct net *net, __be32 addr) 281#if IS_ENABLED(CONFIG_IPV6)
282/* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and
283 * IPv6 connections. IPv4 connection address is returned in an IPv4 mapped
284 * address.
285 */
286static void rds6_tcp_tc_info(struct socket *sock, unsigned int len,
287 struct rds_info_iterator *iter,
288 struct rds_info_lengths *lens)
289{
290 struct rds6_info_tcp_socket tsinfo6;
291 struct rds_tcp_connection *tc;
292 unsigned long flags;
293
294 spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
295
296 if (len / sizeof(tsinfo6) < rds6_tcp_tc_count)
297 goto out;
298
299 list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
300 struct sock *sk = tc->t_sock->sk;
301 struct inet_sock *inet = inet_sk(sk);
302
303 tsinfo6.local_addr = sk->sk_v6_rcv_saddr;
304 tsinfo6.local_port = inet->inet_sport;
305 tsinfo6.peer_addr = sk->sk_v6_daddr;
306 tsinfo6.peer_port = inet->inet_dport;
307
308 tsinfo6.hdr_rem = tc->t_tinc_hdr_rem;
309 tsinfo6.data_rem = tc->t_tinc_data_rem;
310 tsinfo6.last_sent_nxt = tc->t_last_sent_nxt;
311 tsinfo6.last_expected_una = tc->t_last_expected_una;
312 tsinfo6.last_seen_una = tc->t_last_seen_una;
313
314 rds_info_copy(iter, &tsinfo6, sizeof(tsinfo6));
315 }
316
317out:
318 lens->nr = rds6_tcp_tc_count;
319 lens->each = sizeof(tsinfo6);
320
321 spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
322}
323#endif
324
325static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
326 __u32 scope_id)
266{ 327{
267 if (inet_addr_type(net, addr) == RTN_LOCAL) 328 struct net_device *dev = NULL;
329#if IS_ENABLED(CONFIG_IPV6)
330 int ret;
331#endif
332
333 if (ipv6_addr_v4mapped(addr)) {
334 if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL)
335 return 0;
336 return -EADDRNOTAVAIL;
337 }
338
339 /* If the scope_id is specified, check only those addresses
340 * hosted on the specified interface.
341 */
342 if (scope_id != 0) {
343 rcu_read_lock();
344 dev = dev_get_by_index_rcu(net, scope_id);
345 /* scope_id is not valid... */
346 if (!dev) {
347 rcu_read_unlock();
348 return -EADDRNOTAVAIL;
349 }
350 rcu_read_unlock();
351 }
352#if IS_ENABLED(CONFIG_IPV6)
353 ret = ipv6_chk_addr(net, addr, dev, 0);
354 if (ret)
268 return 0; 355 return 0;
356#endif
269 return -EADDRNOTAVAIL; 357 return -EADDRNOTAVAIL;
270} 358}
271 359
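
Throughout the patch an IPv4 address travels inside an in6_addr as an IPv4-mapped address (::ffff:a.b.c.d); rds_tcp_laddr_check() above peels the last 32 bits back out of s6_addr32[3] before calling inet_addr_type(). The byte layout of that convention, shown as a small standalone userspace program (illustration only, not kernel code):

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	struct in6_addr mapped;
	struct in_addr v4;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.1", &v4);

	/* Build the mapped form: 80 zero bits, 16 one bits, then the v4 addr. */
	memset(&mapped, 0, sizeof(mapped));
	mapped.s6_addr[10] = 0xff;
	mapped.s6_addr[11] = 0xff;
	memcpy(&mapped.s6_addr[12], &v4, 4);

	printf("mapped: %s, is v4-mapped: %d\n",
	       inet_ntop(AF_INET6, &mapped, buf, sizeof(buf)),
	       !!IN6_IS_ADDR_V4MAPPED(&mapped));

	/* Peel the IPv4 address back out, as laddr_check does with s6_addr32[3]. */
	memcpy(&v4, &mapped.s6_addr[12], 4);
	printf("back to v4: %s\n", inet_ntop(AF_INET, &v4, buf, sizeof(buf)));
	return 0;
}
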
@@ -468,13 +556,27 @@ static __net_init int rds_tcp_init_net(struct net *net)
468 err = -ENOMEM; 556 err = -ENOMEM;
469 goto fail; 557 goto fail;
470 } 558 }
471 rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); 559
560#if IS_ENABLED(CONFIG_IPV6)
561 rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true);
562#else
563 rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
564#endif
472 if (!rtn->rds_tcp_listen_sock) { 565 if (!rtn->rds_tcp_listen_sock) {
473 pr_warn("could not set up listen sock\n"); 566 pr_warn("could not set up IPv6 listen sock\n");
474 unregister_net_sysctl_table(rtn->rds_tcp_sysctl); 567
475 rtn->rds_tcp_sysctl = NULL; 568#if IS_ENABLED(CONFIG_IPV6)
476 err = -EAFNOSUPPORT; 569 /* Try IPv4 as some systems disable IPv6 */
477 goto fail; 570 rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
571 if (!rtn->rds_tcp_listen_sock) {
572#endif
573 unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
574 rtn->rds_tcp_sysctl = NULL;
575 err = -EAFNOSUPPORT;
576 goto fail;
577#if IS_ENABLED(CONFIG_IPV6)
578 }
579#endif
478 } 580 }
479 INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); 581 INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
480 return 0; 582 return 0;
@@ -588,6 +690,9 @@ static void rds_tcp_exit(void)
588 rds_tcp_set_unloading(); 690 rds_tcp_set_unloading();
589 synchronize_rcu(); 691 synchronize_rcu();
590 rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); 692 rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
693#if IS_ENABLED(CONFIG_IPV6)
694 rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
695#endif
591 unregister_pernet_device(&rds_tcp_net_ops); 696 unregister_pernet_device(&rds_tcp_net_ops);
592 rds_tcp_destroy_conns(); 697 rds_tcp_destroy_conns();
593 rds_trans_unregister(&rds_tcp_transport); 698 rds_trans_unregister(&rds_tcp_transport);
@@ -619,6 +724,9 @@ static int rds_tcp_init(void)
619 rds_trans_register(&rds_tcp_transport); 724 rds_trans_register(&rds_tcp_transport);
620 725
621 rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); 726 rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
727#if IS_ENABLED(CONFIG_IPV6)
728 rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
729#endif
622 730
623 goto out; 731 goto out;
624out_recv: 732out_recv:
@@ -633,4 +741,3 @@ module_init(rds_tcp_init);
633MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); 741MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
634MODULE_DESCRIPTION("RDS: TCP transport"); 742MODULE_DESCRIPTION("RDS: TCP transport");
635MODULE_LICENSE("Dual BSD/GPL"); 743MODULE_LICENSE("Dual BSD/GPL");
636
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index c6fa080e9b6d..3c69361d21c7 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -67,7 +67,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn);
67void rds_tcp_state_change(struct sock *sk); 67void rds_tcp_state_change(struct sock *sk);
68 68
69/* tcp_listen.c */ 69/* tcp_listen.c */
70struct socket *rds_tcp_listen_init(struct net *); 70struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
71void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); 71void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
72void rds_tcp_listen_data_ready(struct sock *sk); 72void rds_tcp_listen_data_ready(struct sock *sk);
73int rds_tcp_accept_one(struct socket *sock); 73int rds_tcp_accept_one(struct socket *sock);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index d999e7075645..008f50fb25dd 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -66,7 +66,8 @@ void rds_tcp_state_change(struct sock *sk)
66 * RDS connection as RDS_CONN_UP until the reconnect, 66 * RDS connection as RDS_CONN_UP until the reconnect,
67 * to avoid RDS datagram loss. 67 * to avoid RDS datagram loss.
68 */ 68 */
69 if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) && 69 if (rds_addr_cmp(&cp->cp_conn->c_laddr,
70 &cp->cp_conn->c_faddr) >= 0 &&
70 rds_conn_path_transition(cp, RDS_CONN_CONNECTING, 71 rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
71 RDS_CONN_ERROR)) { 72 RDS_CONN_ERROR)) {
72 rds_conn_path_drop(cp, false); 73 rds_conn_path_drop(cp, false);
@@ -88,7 +89,11 @@ out:
88int rds_tcp_conn_path_connect(struct rds_conn_path *cp) 89int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
89{ 90{
90 struct socket *sock = NULL; 91 struct socket *sock = NULL;
91 struct sockaddr_in src, dest; 92 struct sockaddr_in6 sin6;
93 struct sockaddr_in sin;
94 struct sockaddr *addr;
95 int addrlen;
96 bool isv6;
92 int ret; 97 int ret;
93 struct rds_connection *conn = cp->cp_conn; 98 struct rds_connection *conn = cp->cp_conn;
94 struct rds_tcp_connection *tc = cp->cp_transport_data; 99 struct rds_tcp_connection *tc = cp->cp_transport_data;
@@ -105,37 +110,68 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
105 mutex_unlock(&tc->t_conn_path_lock); 110 mutex_unlock(&tc->t_conn_path_lock);
106 return 0; 111 return 0;
107 } 112 }
108 ret = sock_create_kern(rds_conn_net(conn), PF_INET, 113 if (ipv6_addr_v4mapped(&conn->c_laddr)) {
109 SOCK_STREAM, IPPROTO_TCP, &sock); 114 ret = sock_create_kern(rds_conn_net(conn), PF_INET,
115 SOCK_STREAM, IPPROTO_TCP, &sock);
116 isv6 = false;
117 } else {
118 ret = sock_create_kern(rds_conn_net(conn), PF_INET6,
119 SOCK_STREAM, IPPROTO_TCP, &sock);
120 isv6 = true;
121 }
122
110 if (ret < 0) 123 if (ret < 0)
111 goto out; 124 goto out;
112 125
113 rds_tcp_tune(sock); 126 rds_tcp_tune(sock);
114 127
115 src.sin_family = AF_INET; 128 if (isv6) {
116 src.sin_addr.s_addr = (__force u32)conn->c_laddr; 129 sin6.sin6_family = AF_INET6;
117 src.sin_port = (__force u16)htons(0); 130 sin6.sin6_addr = conn->c_laddr;
131 sin6.sin6_port = 0;
132 sin6.sin6_flowinfo = 0;
133 sin6.sin6_scope_id = conn->c_dev_if;
134 addr = (struct sockaddr *)&sin6;
135 addrlen = sizeof(sin6);
136 } else {
137 sin.sin_family = AF_INET;
138 sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
139 sin.sin_port = 0;
140 addr = (struct sockaddr *)&sin;
141 addrlen = sizeof(sin);
142 }
118 143
119 ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); 144 ret = sock->ops->bind(sock, addr, addrlen);
120 if (ret) { 145 if (ret) {
121 rdsdebug("bind failed with %d at address %pI4\n", 146 rdsdebug("bind failed with %d at address %pI6c\n",
122 ret, &conn->c_laddr); 147 ret, &conn->c_laddr);
123 goto out; 148 goto out;
124 } 149 }
125 150
126 dest.sin_family = AF_INET; 151 if (isv6) {
127 dest.sin_addr.s_addr = (__force u32)conn->c_faddr; 152 sin6.sin6_family = AF_INET6;
128 dest.sin_port = (__force u16)htons(RDS_TCP_PORT); 153 sin6.sin6_addr = conn->c_faddr;
154 sin6.sin6_port = htons(RDS_TCP_PORT);
155 sin6.sin6_flowinfo = 0;
156 sin6.sin6_scope_id = conn->c_dev_if;
157 addr = (struct sockaddr *)&sin6;
158 addrlen = sizeof(sin6);
159 } else {
160 sin.sin_family = AF_INET;
161 sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
162 sin.sin_port = htons(RDS_TCP_PORT);
163 addr = (struct sockaddr *)&sin;
164 addrlen = sizeof(sin);
165 }
129 166
130 /* 167 /*
131 * once we call connect() we can start getting callbacks and they 168 * once we call connect() we can start getting callbacks and they
132 * own the socket 169 * own the socket
133 */ 170 */
134 rds_tcp_set_callbacks(sock, cp); 171 rds_tcp_set_callbacks(sock, cp);
135 ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), 172 ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK);
136 O_NONBLOCK);
137 173
138 rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); 174 rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret);
139 if (ret == -EINPROGRESS) 175 if (ret == -EINPROGRESS)
140 ret = 0; 176 ret = 0;
141 if (ret == 0) { 177 if (ret == 0) {
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 22571189f21e..c12203f646da 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006, 2018 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -83,13 +83,12 @@ static
83struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) 83struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
84{ 84{
85 int i; 85 int i;
86 bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr);
87 int npaths = max_t(int, 1, conn->c_npaths); 86 int npaths = max_t(int, 1, conn->c_npaths);
88 87
89 /* for mprds, all paths MUST be initiated by the peer 88 /* for mprds, all paths MUST be initiated by the peer
90 * with the smaller address. 89 * with the smaller address.
91 */ 90 */
92 if (!peer_is_smaller) { 91 if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) {
93 /* Make sure we initiate at least one path if this 92 /* Make sure we initiate at least one path if this
94 * has not already been done; rds_start_mprds() will 93 * has not already been done; rds_start_mprds() will
95 * take care of additional paths, if necessary. 94 * take care of additional paths, if necessary.
@@ -132,6 +131,11 @@ int rds_tcp_accept_one(struct socket *sock)
132 struct rds_tcp_connection *rs_tcp = NULL; 131 struct rds_tcp_connection *rs_tcp = NULL;
133 int conn_state; 132 int conn_state;
134 struct rds_conn_path *cp; 133 struct rds_conn_path *cp;
134 struct in6_addr *my_addr, *peer_addr;
135#if !IS_ENABLED(CONFIG_IPV6)
136 struct in6_addr saddr, daddr;
137#endif
138 int dev_if = 0;
135 139
136 if (!sock) /* module unload or netns delete in progress */ 140 if (!sock) /* module unload or netns delete in progress */
137 return -ENETUNREACH; 141 return -ENETUNREACH;
@@ -164,13 +168,40 @@ int rds_tcp_accept_one(struct socket *sock)
164 168
165 inet = inet_sk(new_sock->sk); 169 inet = inet_sk(new_sock->sk);
166 170
167 rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", 171#if IS_ENABLED(CONFIG_IPV6)
168 &inet->inet_saddr, ntohs(inet->inet_sport), 172 my_addr = &new_sock->sk->sk_v6_rcv_saddr;
169 &inet->inet_daddr, ntohs(inet->inet_dport)); 173 peer_addr = &new_sock->sk->sk_v6_daddr;
174#else
175 ipv6_addr_set_v4mapped(inet->inet_saddr, &saddr);
176 ipv6_addr_set_v4mapped(inet->inet_daddr, &daddr);
177 my_addr = &saddr;
178 peer_addr = &daddr;
179#endif
180 rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
181 sock->sk->sk_family,
182 my_addr, ntohs(inet->inet_sport),
183 peer_addr, ntohs(inet->inet_dport));
184
185#if IS_ENABLED(CONFIG_IPV6)
186 /* sk_bound_dev_if is not set if the peer address is not link local
187 * address. In this case, it happens that mcast_oif is set. So
188 * just use it.
189 */
190 if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) &&
191 !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) {
192 struct ipv6_pinfo *inet6;
193
194 inet6 = inet6_sk(new_sock->sk);
195 dev_if = inet6->mcast_oif;
196 } else {
197 dev_if = new_sock->sk->sk_bound_dev_if;
198 }
199#endif
170 200
171 conn = rds_conn_create(sock_net(sock->sk), 201 conn = rds_conn_create(sock_net(sock->sk),
172 inet->inet_saddr, inet->inet_daddr, 202 my_addr, peer_addr,
173 &rds_tcp_transport, GFP_KERNEL); 203 &rds_tcp_transport, GFP_KERNEL, dev_if);
204
174 if (IS_ERR(conn)) { 205 if (IS_ERR(conn)) {
175 ret = PTR_ERR(conn); 206 ret = PTR_ERR(conn);
176 goto out; 207 goto out;
@@ -254,15 +285,22 @@ out:
254 ready(sk); 285 ready(sk);
255} 286}
256 287
257struct socket *rds_tcp_listen_init(struct net *net) 288struct socket *rds_tcp_listen_init(struct net *net, bool isv6)
258{ 289{
259 struct sockaddr_in sin;
260 struct socket *sock = NULL; 290 struct socket *sock = NULL;
291 struct sockaddr_storage ss;
292 struct sockaddr_in6 *sin6;
293 struct sockaddr_in *sin;
294 int addr_len;
261 int ret; 295 int ret;
262 296
263 ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 297 ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM,
264 if (ret < 0) 298 IPPROTO_TCP, &sock);
299 if (ret < 0) {
300 rdsdebug("could not create %s listener socket: %d\n",
301 isv6 ? "IPv6" : "IPv4", ret);
265 goto out; 302 goto out;
303 }
266 304
267 sock->sk->sk_reuse = SK_CAN_REUSE; 305 sock->sk->sk_reuse = SK_CAN_REUSE;
268 rds_tcp_nonagle(sock); 306 rds_tcp_nonagle(sock);
@@ -272,13 +310,28 @@ struct socket *rds_tcp_listen_init(struct net *net)
272 sock->sk->sk_data_ready = rds_tcp_listen_data_ready; 310 sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
273 write_unlock_bh(&sock->sk->sk_callback_lock); 311 write_unlock_bh(&sock->sk->sk_callback_lock);
274 312
275 sin.sin_family = PF_INET; 313 if (isv6) {
276 sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); 314 sin6 = (struct sockaddr_in6 *)&ss;
277 sin.sin_port = (__force u16)htons(RDS_TCP_PORT); 315 sin6->sin6_family = PF_INET6;
316 sin6->sin6_addr = in6addr_any;
317 sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
318 sin6->sin6_scope_id = 0;
319 sin6->sin6_flowinfo = 0;
320 addr_len = sizeof(*sin6);
321 } else {
322 sin = (struct sockaddr_in *)&ss;
323 sin->sin_family = PF_INET;
324 sin->sin_addr.s_addr = INADDR_ANY;
325 sin->sin_port = (__force u16)htons(RDS_TCP_PORT);
326 addr_len = sizeof(*sin);
327 }
278 328
279 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 329 ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len);
280 if (ret < 0) 330 if (ret < 0) {
331 rdsdebug("could not bind %s listener socket: %d\n",
332 isv6 ? "IPv6" : "IPv4", ret);
281 goto out; 333 goto out;
334 }
282 335
283 ret = sock->ops->listen(sock, 64); 336 ret = sock->ops->listen(sock, 64);
284 if (ret < 0) 337 if (ret < 0)
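
The listener is now created as a PF_INET6 socket bound to in6addr_any when IPv6 is available (with the IPv4 fallback shown in the rds_tcp_init_net() hunk earlier), so one socket can accept IPv6 peers and, as v4-mapped addresses, IPv4 peers. A userspace sketch of that dual-stack idea, assuming port 16385 (RDS_TCP_PORT); it makes the dual-stack behaviour explicit via IPV6_V6ONLY=0, whereas the in-kernel listener as shown does not set that option itself:

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

int dual_stack_listen(unsigned short port)
{
	struct sockaddr_in6 sin6;
	int fd, off = 0;

	fd = socket(AF_INET6, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;

	/* Accept IPv4 peers as v4-mapped addresses on the same socket. */
	setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &off, sizeof(off));

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_addr = in6addr_any;
	sin6.sin6_port = htons(port);

	if (bind(fd, (struct sockaddr *)&sin6, sizeof(sin6)) < 0 ||
	    listen(fd, 64) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
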
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index b9fbd2ee74ef..42c5ff1eda95 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -179,7 +179,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
179 tc->t_tinc = tinc; 179 tc->t_tinc = tinc;
180 rdsdebug("alloced tinc %p\n", tinc); 180 rdsdebug("alloced tinc %p\n", tinc);
181 rds_inc_path_init(&tinc->ti_inc, cp, 181 rds_inc_path_init(&tinc->ti_inc, cp,
182 cp->cp_conn->c_faddr); 182 &cp->cp_conn->c_faddr);
183 tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = 183 tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
184 local_clock(); 184 local_clock();
185 185
@@ -239,8 +239,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
239 if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 239 if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
240 rds_tcp_cong_recv(conn, tinc); 240 rds_tcp_cong_recv(conn, tinc);
241 else 241 else
242 rds_recv_incoming(conn, conn->c_faddr, 242 rds_recv_incoming(conn, &conn->c_faddr,
243 conn->c_laddr, &tinc->ti_inc, 243 &conn->c_laddr,
244 &tinc->ti_inc,
244 arg->gfp); 245 arg->gfp);
245 246
246 tc->t_tinc_hdr_rem = sizeof(struct rds_header); 247 tc->t_tinc_hdr_rem = sizeof(struct rds_header);
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 7df869d37afd..78a2554a4497 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -153,7 +153,7 @@ out:
153 * an incoming RST. 153 * an incoming RST.
154 */ 154 */
155 if (rds_conn_path_up(cp)) { 155 if (rds_conn_path_up(cp)) {
156 pr_warn("RDS/tcp: send to %pI4 on cp [%d]" 156 pr_warn("RDS/tcp: send to %pI6c on cp [%d]"
157 "returned %d, " 157 "returned %d, "
158 "disconnecting and reconnecting\n", 158 "disconnecting and reconnecting\n",
159 &conn->c_faddr, cp->cp_index, ret); 159 &conn->c_faddr, cp->cp_index, ret);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index c52861d77a59..e64f9e4c3cda 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -82,8 +82,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
82 return; 82 return;
83 } 83 }
84 84
85 rdsdebug("conn %p for %pI4 to %pI4 complete\n", 85 rdsdebug("conn %p for %pI6c to %pI6c complete\n",
86 cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); 86 cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
87 87
88 cp->cp_reconnect_jiffies = 0; 88 cp->cp_reconnect_jiffies = 0;
89 set_bit(0, &cp->cp_conn->c_map_queued); 89 set_bit(0, &cp->cp_conn->c_map_queued);
@@ -125,13 +125,13 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
125 unsigned long rand; 125 unsigned long rand;
126 struct rds_connection *conn = cp->cp_conn; 126 struct rds_connection *conn = cp->cp_conn;
127 127
128 rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", 128 rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n",
129 conn, &conn->c_laddr, &conn->c_faddr, 129 conn, &conn->c_laddr, &conn->c_faddr,
130 cp->cp_reconnect_jiffies); 130 cp->cp_reconnect_jiffies);
131 131
132 /* let peer with smaller addr initiate reconnect, to avoid duels */ 132 /* let peer with smaller addr initiate reconnect, to avoid duels */
133 if (conn->c_trans->t_type == RDS_TRANS_TCP && 133 if (conn->c_trans->t_type == RDS_TRANS_TCP &&
134 !IS_CANONICAL(conn->c_laddr, conn->c_faddr)) 134 rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0)
135 return; 135 return;
136 136
137 set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); 137 set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
@@ -145,7 +145,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
145 } 145 }
146 146
147 get_random_bytes(&rand, sizeof(rand)); 147 get_random_bytes(&rand, sizeof(rand));
148 rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", 148 rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n",
149 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, 149 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
150 conn, &conn->c_laddr, &conn->c_faddr); 150 conn, &conn->c_laddr, &conn->c_faddr);
151 rcu_read_lock(); 151 rcu_read_lock();
@@ -167,14 +167,14 @@ void rds_connect_worker(struct work_struct *work)
167 int ret; 167 int ret;
168 168
169 if (cp->cp_index > 0 && 169 if (cp->cp_index > 0 &&
170 !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr)) 170 rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0)
171 return; 171 return;
172 clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); 172 clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
173 ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); 173 ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
174 if (ret) { 174 if (ret) {
175 ret = conn->c_trans->conn_path_connect(cp); 175 ret = conn->c_trans->conn_path_connect(cp);
176 rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", 176 rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n",
177 conn, &conn->c_laddr, &conn->c_faddr, ret); 177 conn, &conn->c_laddr, &conn->c_faddr, ret);
178 178
179 if (ret) { 179 if (ret) {
180 if (rds_conn_path_transition(cp, 180 if (rds_conn_path_transition(cp,
@@ -259,3 +259,50 @@ int rds_threads_init(void)
259 259
260 return 0; 260 return 0;
261} 261}
262
263/* Compare two IPv6 addresses. Return 0 if the two addresses are equal.
264 * Return 1 if the first is greater. Return -1 if the second is greater.
265 */
266int rds_addr_cmp(const struct in6_addr *addr1,
267 const struct in6_addr *addr2)
268{
269#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
270 const __be64 *a1, *a2;
271 u64 x, y;
272
273 a1 = (__be64 *)addr1;
274 a2 = (__be64 *)addr2;
275
276 if (*a1 != *a2) {
277 if (be64_to_cpu(*a1) < be64_to_cpu(*a2))
278 return -1;
279 else
280 return 1;
281 } else {
282 x = be64_to_cpu(*++a1);
283 y = be64_to_cpu(*++a2);
284 if (x < y)
285 return -1;
286 else if (x > y)
287 return 1;
288 else
289 return 0;
290 }
291#else
292 u32 a, b;
293 int i;
294
295 for (i = 0; i < 4; i++) {
296 if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) {
297 a = ntohl(addr1->s6_addr32[i]);
298 b = ntohl(addr2->s6_addr32[i]);
299 if (a < b)
300 return -1;
301 else if (a > b)
302 return 1;
303 }
304 }
305 return 0;
306#endif
307}
308EXPORT_SYMBOL_GPL(rds_addr_cmp);
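
rds_addr_cmp() gives in6_addr a total order and replaces the old IS_CANONICAL() check wherever MPRDS needs to decide which side initiates paths or reconnects (the peer with the smaller address). The ordering is most-significant-byte first, so a plain byte-wise comparison should agree with it; a tiny userspace check of that expectation, illustration only:

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Byte-wise compare; expected to match the ordering of rds_addr_cmp(). */
int addr_cmp(const struct in6_addr *a, const struct in6_addr *b)
{
	return memcmp(a, b, sizeof(*a));
}

int main(void)
{
	struct in6_addr a, b;

	inet_pton(AF_INET6, "::ffff:192.0.2.1", &a);   /* v4-mapped 192.0.2.1 */
	inet_pton(AF_INET6, "::ffff:192.0.2.9", &b);   /* v4-mapped 192.0.2.9 */

	assert(addr_cmp(&a, &b) < 0);   /* .1 sorts before .9 */
	printf("the lower address initiates MPRDS paths and reconnects\n");
	return 0;
}
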
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 0b188dd0a344..46f709a4b577 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <linux/module.h> 34#include <linux/module.h>
35#include <linux/in.h> 35#include <linux/in.h>
36#include <linux/ipv6.h>
36 37
37#include "rds.h" 38#include "rds.h"
38#include "loop.h" 39#include "loop.h"
@@ -75,20 +76,26 @@ void rds_trans_put(struct rds_transport *trans)
75 module_put(trans->t_owner); 76 module_put(trans->t_owner);
76} 77}
77 78
78struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr) 79struct rds_transport *rds_trans_get_preferred(struct net *net,
80 const struct in6_addr *addr,
81 __u32 scope_id)
79{ 82{
80 struct rds_transport *ret = NULL; 83 struct rds_transport *ret = NULL;
81 struct rds_transport *trans; 84 struct rds_transport *trans;
82 unsigned int i; 85 unsigned int i;
83 86
84 if (IN_LOOPBACK(ntohl(addr))) 87 if (ipv6_addr_v4mapped(addr)) {
88 if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET)
89 return &rds_loop_transport;
90 } else if (ipv6_addr_loopback(addr)) {
85 return &rds_loop_transport; 91 return &rds_loop_transport;
92 }
86 93
87 down_read(&rds_trans_sem); 94 down_read(&rds_trans_sem);
88 for (i = 0; i < RDS_TRANS_COUNT; i++) { 95 for (i = 0; i < RDS_TRANS_COUNT; i++) {
89 trans = transports[i]; 96 trans = transports[i];
90 97
91 if (trans && (trans->laddr_check(net, addr) == 0) && 98 if (trans && (trans->laddr_check(net, addr, scope_id) == 0) &&
92 (!trans->t_owner || try_module_get(trans->t_owner))) { 99 (!trans->t_owner || try_module_get(trans->t_owner))) {
93 ret = trans; 100 ret = trans;
94 break; 101 break;
@@ -152,4 +159,3 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
152 159
153 return total; 160 return total;
154} 161}
155
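With laddr now an in6_addr, the rewritten rds_trans_get_preferred() has to recognise loopback in both address families: a v4-mapped address whose embedded IPv4 network is 127.0.0.0/8, or the native IPv6 loopback ::1. The standalone userspace sketch below illustrates that test with standard socket headers; the function name is hypothetical.

/* Sketch of the loopback check performed before picking a transport:
 * v4-mapped 127.x.x.x or the IPv6 loopback address ::1 both count.
 */
#include <stdio.h>
#include <stdbool.h>
#include <arpa/inet.h>

static bool addr_is_loopback(const struct in6_addr *addr)
{
	if (IN6_IS_ADDR_V4MAPPED(addr))
		return addr->s6_addr[12] == 127;	/* first octet of the embedded IPv4 address */
	return IN6_IS_ADDR_LOOPBACK(addr);		/* ::1 */
}

int main(void)
{
	const char *samples[] = { "::ffff:127.0.0.1", "::1", "2001:db8::1" };
	struct in6_addr a;

	for (int i = 0; i < 3; i++) {
		inet_pton(AF_INET6, samples[i], &a);
		printf("%-18s loopback=%d\n", samples[i], addr_is_loopback(&a));
	}
	return 0;
}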
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index a7a4e6ff9be2..1355f5ca8d22 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -141,13 +141,15 @@ static void rfkill_led_trigger_event(struct rfkill *rfkill)
141 led_trigger_event(trigger, LED_FULL); 141 led_trigger_event(trigger, LED_FULL);
142} 142}
143 143
144static void rfkill_led_trigger_activate(struct led_classdev *led) 144static int rfkill_led_trigger_activate(struct led_classdev *led)
145{ 145{
146 struct rfkill *rfkill; 146 struct rfkill *rfkill;
147 147
148 rfkill = container_of(led->trigger, struct rfkill, led_trigger); 148 rfkill = container_of(led->trigger, struct rfkill, led_trigger);
149 149
150 rfkill_led_trigger_event(rfkill); 150 rfkill_led_trigger_event(rfkill);
151
152 return 0;
151} 153}
152 154
153const char *rfkill_get_led_trigger_name(struct rfkill *rfkill) 155const char *rfkill_get_led_trigger_name(struct rfkill *rfkill)
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index ebe42e7eb456..d00a0ef39a56 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1470,7 +1470,7 @@ static const struct proto_ops rose_proto_ops = {
1470 .socketpair = sock_no_socketpair, 1470 .socketpair = sock_no_socketpair,
1471 .accept = rose_accept, 1471 .accept = rose_accept,
1472 .getname = rose_getname, 1472 .getname = rose_getname,
1473 .poll_mask = datagram_poll_mask, 1473 .poll = datagram_poll,
1474 .ioctl = rose_ioctl, 1474 .ioctl = rose_ioctl,
1475 .listen = rose_listen, 1475 .listen = rose_listen,
1476 .shutdown = sock_no_shutdown, 1476 .shutdown = sock_no_shutdown,
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 3b1ac93efee2..ac44d8afffb1 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -734,11 +734,15 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname,
734/* 734/*
735 * permit an RxRPC socket to be polled 735 * permit an RxRPC socket to be polled
736 */ 736 */
737static __poll_t rxrpc_poll_mask(struct socket *sock, __poll_t events) 737static __poll_t rxrpc_poll(struct file *file, struct socket *sock,
738 poll_table *wait)
738{ 739{
739 struct sock *sk = sock->sk; 740 struct sock *sk = sock->sk;
740 struct rxrpc_sock *rx = rxrpc_sk(sk); 741 struct rxrpc_sock *rx = rxrpc_sk(sk);
741 __poll_t mask = 0; 742 __poll_t mask;
743
744 sock_poll_wait(file, wait);
745 mask = 0;
742 746
743 /* the socket is readable if there are any messages waiting on the Rx 747 /* the socket is readable if there are any messages waiting on the Rx
744 * queue */ 748 * queue */
@@ -945,7 +949,7 @@ static const struct proto_ops rxrpc_rpc_ops = {
945 .socketpair = sock_no_socketpair, 949 .socketpair = sock_no_socketpair,
946 .accept = sock_no_accept, 950 .accept = sock_no_accept,
947 .getname = sock_no_getname, 951 .getname = sock_no_getname,
948 .poll_mask = rxrpc_poll_mask, 952 .poll = rxrpc_poll,
949 .ioctl = sock_no_ioctl, 953 .ioctl = sock_no_ioctl,
950 .listen = rxrpc_listen, 954 .listen = rxrpc_listen,
951 .shutdown = rxrpc_shutdown, 955 .shutdown = rxrpc_shutdown,
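This hunk converts rxrpc back from the short-lived ->poll_mask hook to a conventional ->poll handler that registers with sock_poll_wait() before building the event mask. The userspace sketch below only shows how such a readiness mask is consumed through poll(2); it is an illustration of the mask semantics, not of the kernel hook itself.

/* Demonstration of readiness masks: after queueing one datagram, poll(2)
 * reports the receiving end readable (POLLIN) and, since its send buffer
 * is empty, writable (POLLOUT) as well.
 */
#include <stdio.h>
#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int sv[2];
	struct pollfd pfd;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0)
		return 1;

	write(sv[1], "ping", 4);		/* queue one message for sv[0] */

	pfd.fd = sv[0];
	pfd.events = POLLIN | POLLOUT;
	poll(&pfd, 1, 0);			/* non-blocking readiness check */

	printf("readable=%d writable=%d\n",
	       !!(pfd.revents & POLLIN), !!(pfd.revents & POLLOUT));
	close(sv[0]);
	close(sv[1]);
	return 0;
}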
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 5fb7d3254d9e..c97558710421 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -104,9 +104,9 @@ struct rxrpc_net {
104 104
105#define RXRPC_KEEPALIVE_TIME 20 /* NAT keepalive time in seconds */ 105#define RXRPC_KEEPALIVE_TIME 20 /* NAT keepalive time in seconds */
106 u8 peer_keepalive_cursor; 106 u8 peer_keepalive_cursor;
107 ktime_t peer_keepalive_base; 107 time64_t peer_keepalive_base;
108 struct hlist_head peer_keepalive[RXRPC_KEEPALIVE_TIME + 1]; 108 struct list_head peer_keepalive[32];
109 struct hlist_head peer_keepalive_new; 109 struct list_head peer_keepalive_new;
110 struct timer_list peer_keepalive_timer; 110 struct timer_list peer_keepalive_timer;
111 struct work_struct peer_keepalive_work; 111 struct work_struct peer_keepalive_work;
112}; 112};
@@ -295,7 +295,7 @@ struct rxrpc_peer {
295 struct hlist_head error_targets; /* targets for net error distribution */ 295 struct hlist_head error_targets; /* targets for net error distribution */
296 struct work_struct error_distributor; 296 struct work_struct error_distributor;
297 struct rb_root service_conns; /* Service connections */ 297 struct rb_root service_conns; /* Service connections */
298 struct hlist_node keepalive_link; /* Link in net->peer_keepalive[] */ 298 struct list_head keepalive_link; /* Link in net->peer_keepalive[] */
299 time64_t last_tx_at; /* Last time packet sent here */ 299 time64_t last_tx_at; /* Last time packet sent here */
300 seqlock_t service_conn_lock; 300 seqlock_t service_conn_lock;
301 spinlock_t lock; /* access lock */ 301 spinlock_t lock; /* access lock */
@@ -420,6 +420,7 @@ struct rxrpc_connection {
420 struct rxrpc_channel { 420 struct rxrpc_channel {
421 unsigned long final_ack_at; /* Time at which to issue final ACK */ 421 unsigned long final_ack_at; /* Time at which to issue final ACK */
422 struct rxrpc_call __rcu *call; /* Active call */ 422 struct rxrpc_call __rcu *call; /* Active call */
423 unsigned int call_debug_id; /* call->debug_id */
423 u32 call_id; /* ID of current call */ 424 u32 call_id; /* ID of current call */
424 u32 call_counter; /* Call ID counter */ 425 u32 call_counter; /* Call ID counter */
425 u32 last_call; /* ID of last call */ 426 u32 last_call; /* ID of last call */
@@ -478,6 +479,7 @@ enum rxrpc_call_flag {
478 RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ 479 RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
479 RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ 480 RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */
480 RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ 481 RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */
482 RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */
481}; 483};
482 484
483/* 485/*
@@ -588,7 +590,7 @@ struct rxrpc_call {
588 */ 590 */
589#define RXRPC_RXTX_BUFF_SIZE 64 591#define RXRPC_RXTX_BUFF_SIZE 64
590#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1) 592#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1)
591#define RXRPC_INIT_RX_WINDOW_SIZE 32 593#define RXRPC_INIT_RX_WINDOW_SIZE 63
592 struct sk_buff **rxtx_buffer; 594 struct sk_buff **rxtx_buffer;
593 u8 *rxtx_annotations; 595 u8 *rxtx_annotations;
594#define RXRPC_TX_ANNO_ACK 0 596#define RXRPC_TX_ANNO_ACK 0
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index a9a9be5519b9..9d1e298b784c 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -116,9 +116,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
116 while (*pp) { 116 while (*pp) {
117 parent = *pp; 117 parent = *pp;
118 xcall = rb_entry(parent, struct rxrpc_call, sock_node); 118 xcall = rb_entry(parent, struct rxrpc_call, sock_node);
119 if (user_call_ID < call->user_call_ID) 119 if (user_call_ID < xcall->user_call_ID)
120 pp = &(*pp)->rb_left; 120 pp = &(*pp)->rb_left;
121 else if (user_call_ID > call->user_call_ID) 121 else if (user_call_ID > xcall->user_call_ID)
122 pp = &(*pp)->rb_right; 122 pp = &(*pp)->rb_right;
123 else 123 else
124 goto id_in_use; 124 goto id_in_use;
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 20210418904b..8e7434e92097 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -162,7 +162,6 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call)
162 */ 162 */
163static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) 163static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
164{ 164{
165 struct rxrpc_skb_priv *sp;
166 struct sk_buff *skb; 165 struct sk_buff *skb;
167 unsigned long resend_at; 166 unsigned long resend_at;
168 rxrpc_seq_t cursor, seq, top; 167 rxrpc_seq_t cursor, seq, top;
@@ -207,7 +206,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
207 206
208 skb = call->rxtx_buffer[ix]; 207 skb = call->rxtx_buffer[ix];
209 rxrpc_see_skb(skb, rxrpc_skb_tx_seen); 208 rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
210 sp = rxrpc_skb(skb);
211 209
212 if (anno_type == RXRPC_TX_ANNO_UNACK) { 210 if (anno_type == RXRPC_TX_ANNO_UNACK) {
213 if (ktime_after(skb->tstamp, max_age)) { 211 if (ktime_after(skb->tstamp, max_age)) {
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index f6734d8cb01a..9486293fef5c 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -415,7 +415,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx,
415bool rxrpc_queue_call(struct rxrpc_call *call) 415bool rxrpc_queue_call(struct rxrpc_call *call)
416{ 416{
417 const void *here = __builtin_return_address(0); 417 const void *here = __builtin_return_address(0);
418 int n = __atomic_add_unless(&call->usage, 1, 0); 418 int n = atomic_fetch_add_unless(&call->usage, 1, 0);
419 if (n == 0) 419 if (n == 0)
420 return false; 420 return false;
421 if (rxrpc_queue_work(&call->processor)) 421 if (rxrpc_queue_work(&call->processor))
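atomic_fetch_add_unless(&usage, 1, 0) takes a reference only while the count is still non-zero, so an object that has already dropped its last reference is never resurrected. A hedged C11 sketch of the same compare-and-swap loop, returning the old count (0 meaning the object is dying and must not be used):

/* Userspace model of atomic_fetch_add_unless(v, a, u): add a to *v unless
 * *v equals u, and return the value *v held beforehand.
 */
#include <stdio.h>
#include <stdatomic.h>

static int fetch_add_unless(atomic_int *v, int a, int u)
{
	int c = atomic_load(v);

	while (c != u) {
		if (atomic_compare_exchange_weak(v, &c, c + a))
			break;			/* c holds the old value on success */
	}
	return c;
}

int main(void)
{
	atomic_int usage = 2;
	int old;

	old = fetch_add_unless(&usage, 1, 0);
	printf("old=%d new=%d\n", old, atomic_load(&usage));	/* old=2 new=3: reference taken */

	atomic_store(&usage, 0);				/* object already released */
	old = fetch_add_unless(&usage, 1, 0);
	printf("old=%d new=%d\n", old, atomic_load(&usage));	/* old=0 new=0: refusal, no resurrection */
	return 0;
}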
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 5736f643c516..f8f37188a932 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -590,6 +590,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
590 */ 590 */
591 smp_wmb(); 591 smp_wmb();
592 chan->call_id = call_id; 592 chan->call_id = call_id;
593 chan->call_debug_id = call->debug_id;
593 rcu_assign_pointer(chan->call, call); 594 rcu_assign_pointer(chan->call, call);
594 wake_up(&call->waitq); 595 wake_up(&call->waitq);
595} 596}
@@ -1051,7 +1052,6 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work)
1051 container_of(work, struct rxrpc_net, client_conn_reaper); 1052 container_of(work, struct rxrpc_net, client_conn_reaper);
1052 unsigned long expiry, conn_expires_at, now; 1053 unsigned long expiry, conn_expires_at, now;
1053 unsigned int nr_conns; 1054 unsigned int nr_conns;
1054 bool did_discard = false;
1055 1055
1056 _enter(""); 1056 _enter("");
1057 1057
@@ -1113,7 +1113,6 @@ next:
1113 * If someone re-sets the flag and re-gets the ref, that's fine. 1113 * If someone re-sets the flag and re-gets the ref, that's fine.
1114 */ 1114 */
1115 rxrpc_put_connection(conn); 1115 rxrpc_put_connection(conn);
1116 did_discard = true;
1117 nr_conns--; 1116 nr_conns--;
1118 goto next; 1117 goto next;
1119 1118
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 8229a52c2acd..6df56ce68861 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -129,17 +129,22 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
129 _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort); 129 _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort);
130 break; 130 break;
131 case RXRPC_PACKET_TYPE_ACK: 131 case RXRPC_PACKET_TYPE_ACK:
132 trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0, 132 trace_rxrpc_tx_ack(chan->call_debug_id, serial,
133 RXRPC_ACK_DUPLICATE, 0); 133 ntohl(pkt.ack.firstPacket),
134 ntohl(pkt.ack.serial),
135 pkt.ack.reason, 0);
134 _proto("Tx ACK %%%u [re]", serial); 136 _proto("Tx ACK %%%u [re]", serial);
135 break; 137 break;
136 } 138 }
137 139
138 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); 140 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len);
139 conn->params.peer->last_tx_at = ktime_get_real(); 141 conn->params.peer->last_tx_at = ktime_get_seconds();
140 if (ret < 0) 142 if (ret < 0)
141 trace_rxrpc_tx_fail(conn->debug_id, serial, ret, 143 trace_rxrpc_tx_fail(chan->call_debug_id, serial, ret,
142 rxrpc_tx_fail_call_final_resend); 144 rxrpc_tx_point_call_final_resend);
145 else
146 trace_rxrpc_tx_packet(chan->call_debug_id, &pkt.whdr,
147 rxrpc_tx_point_call_final_resend);
143 148
144 _leave(""); 149 _leave("");
145} 150}
@@ -240,12 +245,14 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
240 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); 245 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
241 if (ret < 0) { 246 if (ret < 0) {
242 trace_rxrpc_tx_fail(conn->debug_id, serial, ret, 247 trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
243 rxrpc_tx_fail_conn_abort); 248 rxrpc_tx_point_conn_abort);
244 _debug("sendmsg failed: %d", ret); 249 _debug("sendmsg failed: %d", ret);
245 return -EAGAIN; 250 return -EAGAIN;
246 } 251 }
247 252
248 conn->params.peer->last_tx_at = ktime_get_real(); 253 trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort);
254
255 conn->params.peer->last_tx_at = ktime_get_seconds();
249 256
250 _leave(" = 0"); 257 _leave(" = 0");
251 return 0; 258 return 0;
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 4c77a78a252a..77440a356b14 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -266,7 +266,7 @@ void rxrpc_kill_connection(struct rxrpc_connection *conn)
266bool rxrpc_queue_conn(struct rxrpc_connection *conn) 266bool rxrpc_queue_conn(struct rxrpc_connection *conn)
267{ 267{
268 const void *here = __builtin_return_address(0); 268 const void *here = __builtin_return_address(0);
269 int n = __atomic_add_unless(&conn->usage, 1, 0); 269 int n = atomic_fetch_add_unless(&conn->usage, 1, 0);
270 if (n == 0) 270 if (n == 0)
271 return false; 271 return false;
272 if (rxrpc_queue_work(&conn->processor)) 272 if (rxrpc_queue_work(&conn->processor))
@@ -309,7 +309,7 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
309 const void *here = __builtin_return_address(0); 309 const void *here = __builtin_return_address(0);
310 310
311 if (conn) { 311 if (conn) {
312 int n = __atomic_add_unless(&conn->usage, 1, 0); 312 int n = atomic_fetch_add_unless(&conn->usage, 1, 0);
313 if (n > 0) 313 if (n > 0)
314 trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); 314 trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here);
315 else 315 else
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 608d078a4981..cfdc199c6351 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -496,7 +496,7 @@ next_subpacket:
496 return rxrpc_proto_abort("LSA", call, seq); 496 return rxrpc_proto_abort("LSA", call, seq);
497 } 497 }
498 498
499 trace_rxrpc_rx_data(call, seq, serial, flags, annotation); 499 trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation);
500 if (before_eq(seq, hard_ack)) { 500 if (before_eq(seq, hard_ack)) {
501 ack = RXRPC_ACK_DUPLICATE; 501 ack = RXRPC_ACK_DUPLICATE;
502 ack_serial = serial; 502 ack_serial = serial;
@@ -592,9 +592,15 @@ ack:
592 rxrpc_propose_ACK(call, ack, skew, ack_serial, 592 rxrpc_propose_ACK(call, ack, skew, ack_serial,
593 immediate_ack, true, 593 immediate_ack, true,
594 rxrpc_propose_ack_input_data); 594 rxrpc_propose_ack_input_data);
595 else
596 rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, skew, serial,
597 false, true,
598 rxrpc_propose_ack_input_data);
595 599
596 if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) 600 if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) {
601 trace_rxrpc_notify_socket(call->debug_id, serial);
597 rxrpc_notify_socket(call); 602 rxrpc_notify_socket(call);
603 }
598 _leave(" [queued]"); 604 _leave(" [queued]");
599} 605}
600 606
@@ -1262,6 +1268,11 @@ void rxrpc_data_ready(struct sock *udp_sk)
1262 /* But otherwise we need to retransmit the final packet 1268 /* But otherwise we need to retransmit the final packet
1263 * from data cached in the connection record. 1269 * from data cached in the connection record.
1264 */ 1270 */
1271 if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA)
1272 trace_rxrpc_rx_data(chan->call_debug_id,
1273 sp->hdr.seq,
1274 sp->hdr.serial,
1275 sp->hdr.flags, 0);
1265 rxrpc_post_packet_to_conn(conn, skb); 1276 rxrpc_post_packet_to_conn(conn, skb);
1266 goto out_unlock; 1277 goto out_unlock;
1267 } 1278 }
diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c
index 8325f1b86840..13bd8a4dfac7 100644
--- a/net/rxrpc/local_event.c
+++ b/net/rxrpc/local_event.c
@@ -72,7 +72,10 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
72 ret = kernel_sendmsg(local->socket, &msg, iov, 2, len); 72 ret = kernel_sendmsg(local->socket, &msg, iov, 2, len);
73 if (ret < 0) 73 if (ret < 0)
74 trace_rxrpc_tx_fail(local->debug_id, 0, ret, 74 trace_rxrpc_tx_fail(local->debug_id, 0, ret,
75 rxrpc_tx_fail_version_reply); 75 rxrpc_tx_point_version_reply);
76 else
77 trace_rxrpc_tx_packet(local->debug_id, &whdr,
78 rxrpc_tx_point_version_reply);
76 79
77 _leave(""); 80 _leave("");
78} 81}
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index b493e6b62740..777c3ed4cfc0 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -305,7 +305,7 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local)
305 const void *here = __builtin_return_address(0); 305 const void *here = __builtin_return_address(0);
306 306
307 if (local) { 307 if (local) {
308 int n = __atomic_add_unless(&local->usage, 1, 0); 308 int n = atomic_fetch_add_unless(&local->usage, 1, 0);
309 if (n > 0) 309 if (n > 0)
310 trace_rxrpc_local(local, rxrpc_local_got, n + 1, here); 310 trace_rxrpc_local(local, rxrpc_local_got, n + 1, here);
311 else 311 else
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index 5d6a773db973..417d80867c4f 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -85,12 +85,12 @@ static __net_init int rxrpc_init_net(struct net *net)
85 hash_init(rxnet->peer_hash); 85 hash_init(rxnet->peer_hash);
86 spin_lock_init(&rxnet->peer_hash_lock); 86 spin_lock_init(&rxnet->peer_hash_lock);
87 for (i = 0; i < ARRAY_SIZE(rxnet->peer_keepalive); i++) 87 for (i = 0; i < ARRAY_SIZE(rxnet->peer_keepalive); i++)
88 INIT_HLIST_HEAD(&rxnet->peer_keepalive[i]); 88 INIT_LIST_HEAD(&rxnet->peer_keepalive[i]);
89 INIT_HLIST_HEAD(&rxnet->peer_keepalive_new); 89 INIT_LIST_HEAD(&rxnet->peer_keepalive_new);
90 timer_setup(&rxnet->peer_keepalive_timer, 90 timer_setup(&rxnet->peer_keepalive_timer,
91 rxrpc_peer_keepalive_timeout, 0); 91 rxrpc_peer_keepalive_timeout, 0);
92 INIT_WORK(&rxnet->peer_keepalive_work, rxrpc_peer_keepalive_worker); 92 INIT_WORK(&rxnet->peer_keepalive_work, rxrpc_peer_keepalive_worker);
93 rxnet->peer_keepalive_base = ktime_add(ktime_get_real(), NSEC_PER_SEC); 93 rxnet->peer_keepalive_base = ktime_get_seconds();
94 94
95 ret = -ENOMEM; 95 ret = -ENOMEM;
96 rxnet->proc_net = proc_net_mkdir(net, "rxrpc", net->proc_net); 96 rxnet->proc_net = proc_net_mkdir(net, "rxrpc", net->proc_net);
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index f03de1c59ba3..ccf5de160444 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -183,7 +183,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
183 183
184 serial = atomic_inc_return(&conn->serial); 184 serial = atomic_inc_return(&conn->serial);
185 pkt->whdr.serial = htonl(serial); 185 pkt->whdr.serial = htonl(serial);
186 trace_rxrpc_tx_ack(call, serial, 186 trace_rxrpc_tx_ack(call->debug_id, serial,
187 ntohl(pkt->ack.firstPacket), 187 ntohl(pkt->ack.firstPacket),
188 ntohl(pkt->ack.serial), 188 ntohl(pkt->ack.serial),
189 pkt->ack.reason, pkt->ack.nAcks); 189 pkt->ack.reason, pkt->ack.nAcks);
@@ -209,10 +209,13 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
209 now = ktime_get_real(); 209 now = ktime_get_real();
210 if (ping) 210 if (ping)
211 call->ping_time = now; 211 call->ping_time = now;
212 conn->params.peer->last_tx_at = ktime_get_real(); 212 conn->params.peer->last_tx_at = ktime_get_seconds();
213 if (ret < 0) 213 if (ret < 0)
214 trace_rxrpc_tx_fail(call->debug_id, serial, ret, 214 trace_rxrpc_tx_fail(call->debug_id, serial, ret,
215 rxrpc_tx_fail_call_ack); 215 rxrpc_tx_point_call_ack);
216 else
217 trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr,
218 rxrpc_tx_point_call_ack);
216 219
217 if (call->state < RXRPC_CALL_COMPLETE) { 220 if (call->state < RXRPC_CALL_COMPLETE) {
218 if (ret < 0) { 221 if (ret < 0) {
@@ -296,10 +299,13 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
296 299
297 ret = kernel_sendmsg(conn->params.local->socket, 300 ret = kernel_sendmsg(conn->params.local->socket,
298 &msg, iov, 1, sizeof(pkt)); 301 &msg, iov, 1, sizeof(pkt));
299 conn->params.peer->last_tx_at = ktime_get_real(); 302 conn->params.peer->last_tx_at = ktime_get_seconds();
300 if (ret < 0) 303 if (ret < 0)
301 trace_rxrpc_tx_fail(call->debug_id, serial, ret, 304 trace_rxrpc_tx_fail(call->debug_id, serial, ret,
302 rxrpc_tx_fail_call_abort); 305 rxrpc_tx_point_call_abort);
306 else
307 trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr,
308 rxrpc_tx_point_call_abort);
303 309
304 310
305 rxrpc_put_connection(conn); 311 rxrpc_put_connection(conn);
@@ -391,12 +397,15 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
391 * message and update the peer record 397 * message and update the peer record
392 */ 398 */
393 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); 399 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
394 conn->params.peer->last_tx_at = ktime_get_real(); 400 conn->params.peer->last_tx_at = ktime_get_seconds();
395 401
396 up_read(&conn->params.local->defrag_sem); 402 up_read(&conn->params.local->defrag_sem);
397 if (ret < 0) 403 if (ret < 0)
398 trace_rxrpc_tx_fail(call->debug_id, serial, ret, 404 trace_rxrpc_tx_fail(call->debug_id, serial, ret,
399 rxrpc_tx_fail_call_data_nofrag); 405 rxrpc_tx_point_call_data_nofrag);
406 else
407 trace_rxrpc_tx_packet(call->debug_id, &whdr,
408 rxrpc_tx_point_call_data_nofrag);
400 if (ret == -EMSGSIZE) 409 if (ret == -EMSGSIZE)
401 goto send_fragmentable; 410 goto send_fragmentable;
402 411
@@ -457,7 +466,7 @@ send_fragmentable:
457 if (ret == 0) { 466 if (ret == 0) {
458 ret = kernel_sendmsg(conn->params.local->socket, &msg, 467 ret = kernel_sendmsg(conn->params.local->socket, &msg,
459 iov, 2, len); 468 iov, 2, len);
460 conn->params.peer->last_tx_at = ktime_get_real(); 469 conn->params.peer->last_tx_at = ktime_get_seconds();
461 470
462 opt = IP_PMTUDISC_DO; 471 opt = IP_PMTUDISC_DO;
463 kernel_setsockopt(conn->params.local->socket, SOL_IP, 472 kernel_setsockopt(conn->params.local->socket, SOL_IP,
@@ -475,7 +484,7 @@ send_fragmentable:
475 if (ret == 0) { 484 if (ret == 0) {
476 ret = kernel_sendmsg(conn->params.local->socket, &msg, 485 ret = kernel_sendmsg(conn->params.local->socket, &msg,
477 iov, 2, len); 486 iov, 2, len);
478 conn->params.peer->last_tx_at = ktime_get_real(); 487 conn->params.peer->last_tx_at = ktime_get_seconds();
479 488
480 opt = IPV6_PMTUDISC_DO; 489 opt = IPV6_PMTUDISC_DO;
481 kernel_setsockopt(conn->params.local->socket, 490 kernel_setsockopt(conn->params.local->socket,
@@ -488,7 +497,10 @@ send_fragmentable:
488 497
489 if (ret < 0) 498 if (ret < 0)
490 trace_rxrpc_tx_fail(call->debug_id, serial, ret, 499 trace_rxrpc_tx_fail(call->debug_id, serial, ret,
491 rxrpc_tx_fail_call_data_frag); 500 rxrpc_tx_point_call_data_frag);
501 else
502 trace_rxrpc_tx_packet(call->debug_id, &whdr,
503 rxrpc_tx_point_call_data_frag);
492 504
493 up_write(&conn->params.local->defrag_sem); 505 up_write(&conn->params.local->defrag_sem);
494 goto done; 506 goto done;
@@ -545,7 +557,10 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
545 ret = kernel_sendmsg(local->socket, &msg, iov, 2, size); 557 ret = kernel_sendmsg(local->socket, &msg, iov, 2, size);
546 if (ret < 0) 558 if (ret < 0)
547 trace_rxrpc_tx_fail(local->debug_id, 0, ret, 559 trace_rxrpc_tx_fail(local->debug_id, 0, ret,
548 rxrpc_tx_fail_reject); 560 rxrpc_tx_point_reject);
561 else
562 trace_rxrpc_tx_packet(local->debug_id, &whdr,
563 rxrpc_tx_point_reject);
549 } 564 }
550 565
551 rxrpc_free_skb(skb, rxrpc_skb_rx_freed); 566 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
@@ -597,8 +612,11 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
597 ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len); 612 ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len);
598 if (ret < 0) 613 if (ret < 0)
599 trace_rxrpc_tx_fail(peer->debug_id, 0, ret, 614 trace_rxrpc_tx_fail(peer->debug_id, 0, ret,
600 rxrpc_tx_fail_version_keepalive); 615 rxrpc_tx_point_version_keepalive);
616 else
617 trace_rxrpc_tx_packet(peer->debug_id, &whdr,
618 rxrpc_tx_point_version_keepalive);
601 619
602 peer->last_tx_at = ktime_get_real(); 620 peer->last_tx_at = ktime_get_seconds();
603 _leave(""); 621 _leave("");
604} 622}
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 0ed8b651cec2..4f9da2f51c69 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -350,97 +350,117 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
350} 350}
351 351
352/* 352/*
353 * Perform keep-alive pings with VERSION packets to keep any NAT alive. 353 * Perform keep-alive pings.
354 */ 354 */
355void rxrpc_peer_keepalive_worker(struct work_struct *work) 355static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet,
356 struct list_head *collector,
357 time64_t base,
358 u8 cursor)
356{ 359{
357 struct rxrpc_net *rxnet =
358 container_of(work, struct rxrpc_net, peer_keepalive_work);
359 struct rxrpc_peer *peer; 360 struct rxrpc_peer *peer;
360 unsigned long delay; 361 const u8 mask = ARRAY_SIZE(rxnet->peer_keepalive) - 1;
361 ktime_t base, now = ktime_get_real(); 362 time64_t keepalive_at;
362 s64 diff; 363 int slot;
363 u8 cursor, slot;
364 364
365 base = rxnet->peer_keepalive_base; 365 spin_lock_bh(&rxnet->peer_hash_lock);
366 cursor = rxnet->peer_keepalive_cursor;
367 366
368 _enter("%u,%lld", cursor, ktime_sub(now, base)); 367 while (!list_empty(collector)) {
368 peer = list_entry(collector->next,
369 struct rxrpc_peer, keepalive_link);
369 370
370next_bucket: 371 list_del_init(&peer->keepalive_link);
371 diff = ktime_to_ns(ktime_sub(now, base)); 372 if (!rxrpc_get_peer_maybe(peer))
372 if (diff < 0) 373 continue;
373 goto resched;
374 374
375 _debug("at %u", cursor);
376 spin_lock_bh(&rxnet->peer_hash_lock);
377next_peer:
378 if (!rxnet->live) {
379 spin_unlock_bh(&rxnet->peer_hash_lock); 375 spin_unlock_bh(&rxnet->peer_hash_lock);
380 goto out;
381 }
382 376
383 /* Everything in the bucket at the cursor is processed this second; the 377 keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME;
384 * bucket at cursor + 1 goes now + 1s and so on... 378 slot = keepalive_at - base;
385 */ 379 _debug("%02x peer %u t=%d {%pISp}",
386 if (hlist_empty(&rxnet->peer_keepalive[cursor])) { 380 cursor, peer->debug_id, slot, &peer->srx.transport);
387 if (hlist_empty(&rxnet->peer_keepalive_new)) { 381
388 spin_unlock_bh(&rxnet->peer_hash_lock); 382 if (keepalive_at <= base ||
389 goto emptied_bucket; 383 keepalive_at > base + RXRPC_KEEPALIVE_TIME) {
384 rxrpc_send_keepalive(peer);
385 slot = RXRPC_KEEPALIVE_TIME;
390 } 386 }
391 387
392 hlist_move_list(&rxnet->peer_keepalive_new, 388 /* A transmission to this peer occurred since last we examined
393 &rxnet->peer_keepalive[cursor]); 389 * it so put it into the appropriate future bucket.
390 */
391 slot += cursor;
392 slot &= mask;
393 spin_lock_bh(&rxnet->peer_hash_lock);
394 list_add_tail(&peer->keepalive_link,
395 &rxnet->peer_keepalive[slot & mask]);
396 rxrpc_put_peer(peer);
394 } 397 }
395 398
396 peer = hlist_entry(rxnet->peer_keepalive[cursor].first,
397 struct rxrpc_peer, keepalive_link);
398 hlist_del_init(&peer->keepalive_link);
399 if (!rxrpc_get_peer_maybe(peer))
400 goto next_peer;
401
402 spin_unlock_bh(&rxnet->peer_hash_lock); 399 spin_unlock_bh(&rxnet->peer_hash_lock);
400}
403 401
404 _debug("peer %u {%pISp}", peer->debug_id, &peer->srx.transport); 402/*
403 * Perform keep-alive pings with VERSION packets to keep any NAT alive.
404 */
405void rxrpc_peer_keepalive_worker(struct work_struct *work)
406{
407 struct rxrpc_net *rxnet =
408 container_of(work, struct rxrpc_net, peer_keepalive_work);
409 const u8 mask = ARRAY_SIZE(rxnet->peer_keepalive) - 1;
410 time64_t base, now, delay;
411 u8 cursor, stop;
412 LIST_HEAD(collector);
405 413
406recalc: 414 now = ktime_get_seconds();
407 diff = ktime_divns(ktime_sub(peer->last_tx_at, base), NSEC_PER_SEC); 415 base = rxnet->peer_keepalive_base;
408 if (diff < -30 || diff > 30) 416 cursor = rxnet->peer_keepalive_cursor;
409 goto send; /* LSW of 64-bit time probably wrapped on 32-bit */ 417 _enter("%lld,%u", base - now, cursor);
410 diff += RXRPC_KEEPALIVE_TIME - 1;
411 if (diff < 0)
412 goto send;
413 418
414 slot = (diff > RXRPC_KEEPALIVE_TIME - 1) ? RXRPC_KEEPALIVE_TIME - 1 : diff; 419 if (!rxnet->live)
415 if (slot == 0) 420 return;
416 goto send;
417 421
418 /* A transmission to this peer occurred since last we examined it so 422 /* Remove to a temporary list all the peers that are currently lodged
419 * put it into the appropriate future bucket. 423 * in expired buckets plus all new peers.
424 *
425 * Everything in the bucket at the cursor is processed this
426 * second; the bucket at cursor + 1 goes at now + 1s and so
427 * on...
420 */ 428 */
421 slot = (slot + cursor) % ARRAY_SIZE(rxnet->peer_keepalive);
422 spin_lock_bh(&rxnet->peer_hash_lock); 429 spin_lock_bh(&rxnet->peer_hash_lock);
423 hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive[slot]); 430 list_splice_init(&rxnet->peer_keepalive_new, &collector);
424 rxrpc_put_peer(peer); 431
425 goto next_peer; 432 stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive);
426 433 while (base <= now && (s8)(cursor - stop) < 0) {
427send: 434 list_splice_tail_init(&rxnet->peer_keepalive[cursor & mask],
428 rxrpc_send_keepalive(peer); 435 &collector);
429 now = ktime_get_real(); 436 base++;
430 goto recalc; 437 cursor++;
438 }
431 439
432emptied_bucket: 440 base = now;
433 cursor++; 441 spin_unlock_bh(&rxnet->peer_hash_lock);
434 if (cursor >= ARRAY_SIZE(rxnet->peer_keepalive))
435 cursor = 0;
436 base = ktime_add_ns(base, NSEC_PER_SEC);
437 goto next_bucket;
438 442
439resched:
440 rxnet->peer_keepalive_base = base; 443 rxnet->peer_keepalive_base = base;
441 rxnet->peer_keepalive_cursor = cursor; 444 rxnet->peer_keepalive_cursor = cursor;
442 delay = nsecs_to_jiffies(-diff) + 1; 445 rxrpc_peer_keepalive_dispatch(rxnet, &collector, base, cursor);
443 timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay); 446 ASSERT(list_empty(&collector));
444out: 447
448 /* Schedule the timer for the next occupied timeslot. */
449 cursor = rxnet->peer_keepalive_cursor;
450 stop = cursor + RXRPC_KEEPALIVE_TIME - 1;
451 for (; (s8)(cursor - stop) < 0; cursor++) {
452 if (!list_empty(&rxnet->peer_keepalive[cursor & mask]))
453 break;
454 base++;
455 }
456
457 now = ktime_get_seconds();
458 delay = base - now;
459 if (delay < 1)
460 delay = 1;
461 delay *= HZ;
462 if (rxnet->live)
463 timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay);
464
445 _leave(""); 465 _leave("");
446} 466}
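The reworked keepalive code files each peer into one of 32 per-second buckets: the bucket at the cursor is processed this second, cursor + 1 the next second, and so on, with the slot derived from when the peer's keepalive next falls due. The sketch below reproduces that slot arithmetic with the same 32-bucket ring and 20-second interval; everything else is illustrative.

/* Bucket-wheel arithmetic: a peer is re-filed (cursor + seconds-until-due)
 * positions ahead, wrapped with a power-of-two mask. A peer whose keepalive
 * is already due (the kernel sends the ping at this point) is re-armed a
 * full interval ahead.
 */
#include <stdio.h>
#include <stdint.h>

#define NR_BUCKETS	32			/* must be a power of two */
#define MASK		(NR_BUCKETS - 1)
#define KEEPALIVE_TIME	20			/* seconds */

static unsigned int keepalive_slot(uint8_t cursor, int64_t base,
				   int64_t last_tx_at)
{
	int64_t due = last_tx_at + KEEPALIVE_TIME;	/* when a ping falls due */
	int64_t slot = due - base;			/* seconds from "now" */

	if (slot <= 0 || slot > KEEPALIVE_TIME)
		slot = KEEPALIVE_TIME;			/* due now: send and re-arm a full interval */
	return (cursor + slot) & MASK;
}

int main(void)
{
	int64_t base = 1000;				/* the "current" second */

	printf("idle peer  -> bucket %u\n", keepalive_slot(5, base, base - 40));
	printf("fresh peer -> bucket %u\n", keepalive_slot(5, base, base - 3));
	return 0;
}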
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 1b7e8107b3ae..1dc7648e3eff 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -322,7 +322,7 @@ struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local,
322 if (!peer) { 322 if (!peer) {
323 peer = prealloc; 323 peer = prealloc;
324 hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); 324 hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key);
325 hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive_new); 325 list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new);
326 } 326 }
327 327
328 spin_unlock(&rxnet->peer_hash_lock); 328 spin_unlock(&rxnet->peer_hash_lock);
@@ -367,8 +367,8 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
367 if (!peer) { 367 if (!peer) {
368 hash_add_rcu(rxnet->peer_hash, 368 hash_add_rcu(rxnet->peer_hash,
369 &candidate->hash_link, hash_key); 369 &candidate->hash_link, hash_key);
370 hlist_add_head(&candidate->keepalive_link, 370 list_add_tail(&candidate->keepalive_link,
371 &rxnet->peer_keepalive_new); 371 &rxnet->peer_keepalive_new);
372 } 372 }
373 373
374 spin_unlock_bh(&rxnet->peer_hash_lock); 374 spin_unlock_bh(&rxnet->peer_hash_lock);
@@ -406,7 +406,7 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer)
406 const void *here = __builtin_return_address(0); 406 const void *here = __builtin_return_address(0);
407 407
408 if (peer) { 408 if (peer) {
409 int n = __atomic_add_unless(&peer->usage, 1, 0); 409 int n = atomic_fetch_add_unless(&peer->usage, 1, 0);
410 if (n > 0) 410 if (n > 0)
411 trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here); 411 trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here);
412 else 412 else
@@ -441,7 +441,7 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer)
441 441
442 spin_lock_bh(&rxnet->peer_hash_lock); 442 spin_lock_bh(&rxnet->peer_hash_lock);
443 hash_del_rcu(&peer->hash_link); 443 hash_del_rcu(&peer->hash_link);
444 hlist_del_init(&peer->keepalive_link); 444 list_del_init(&peer->keepalive_link);
445 spin_unlock_bh(&rxnet->peer_hash_lock); 445 spin_unlock_bh(&rxnet->peer_hash_lock);
446 446
447 kfree_rcu(peer, rcu); 447 kfree_rcu(peer, rcu);
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index d9fca8c4bcdc..9805e3b85c36 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -63,6 +63,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
63 struct rxrpc_peer *peer; 63 struct rxrpc_peer *peer;
64 struct rxrpc_call *call; 64 struct rxrpc_call *call;
65 struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); 65 struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
66 unsigned long timeout = 0;
66 rxrpc_seq_t tx_hard_ack, rx_hard_ack; 67 rxrpc_seq_t tx_hard_ack, rx_hard_ack;
67 char lbuff[50], rbuff[50]; 68 char lbuff[50], rbuff[50];
68 69
@@ -71,7 +72,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
71 "Proto Local " 72 "Proto Local "
72 " Remote " 73 " Remote "
73 " SvID ConnID CallID End Use State Abort " 74 " SvID ConnID CallID End Use State Abort "
74 " UserID\n"); 75 " UserID TxSeq TW RxSeq RW RxSerial RxTimo\n");
75 return 0; 76 return 0;
76 } 77 }
77 78
@@ -94,11 +95,16 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
94 else 95 else
95 strcpy(rbuff, "no_connection"); 96 strcpy(rbuff, "no_connection");
96 97
98 if (call->state != RXRPC_CALL_SERVER_PREALLOC) {
99 timeout = READ_ONCE(call->expect_rx_by);
100 timeout -= jiffies;
101 }
102
97 tx_hard_ack = READ_ONCE(call->tx_hard_ack); 103 tx_hard_ack = READ_ONCE(call->tx_hard_ack);
98 rx_hard_ack = READ_ONCE(call->rx_hard_ack); 104 rx_hard_ack = READ_ONCE(call->rx_hard_ack);
99 seq_printf(seq, 105 seq_printf(seq,
100 "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" 106 "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
101 " %-8.8s %08x %lx %08x %02x %08x %02x\n", 107 " %-8.8s %08x %lx %08x %02x %08x %02x %08x %06lx\n",
102 lbuff, 108 lbuff,
103 rbuff, 109 rbuff,
104 call->service_id, 110 call->service_id,
@@ -110,7 +116,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
110 call->abort_code, 116 call->abort_code,
111 call->user_call_ID, 117 call->user_call_ID,
112 tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack, 118 tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack,
113 rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack); 119 rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack,
120 call->rx_serial,
121 timeout);
114 122
115 return 0; 123 return 0;
116} 124}
@@ -179,7 +187,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
179print: 187print:
180 seq_printf(seq, 188 seq_printf(seq,
181 "UDP %-47.47s %-47.47s %4x %08x %s %3u" 189 "UDP %-47.47s %-47.47s %4x %08x %s %3u"
182 " %s %08x %08x %08x\n", 190 " %s %08x %08x %08x %08x %08x %08x %08x\n",
183 lbuff, 191 lbuff,
184 rbuff, 192 rbuff,
185 conn->service_id, 193 conn->service_id,
@@ -189,7 +197,11 @@ print:
189 rxrpc_conn_states[conn->state], 197 rxrpc_conn_states[conn->state],
190 key_serial(conn->params.key), 198 key_serial(conn->params.key),
191 atomic_read(&conn->serial), 199 atomic_read(&conn->serial),
192 conn->hi_serial); 200 conn->hi_serial,
201 conn->channels[0].call_id,
202 conn->channels[1].call_id,
203 conn->channels[2].call_id,
204 conn->channels[3].call_id);
193 205
194 return 0; 206 return 0;
195} 207}
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 7bff716e911e..816b19a78809 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -144,13 +144,11 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
144 trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); 144 trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top);
145 ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); 145 ASSERTCMP(call->rx_hard_ack, ==, call->rx_top);
146 146
147#if 0 // TODO: May want to transmit final ACK under some circumstances anyway
148 if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { 147 if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
149 rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false, 148 rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, false, true,
150 rxrpc_propose_ack_terminal_ack); 149 rxrpc_propose_ack_terminal_ack);
151 rxrpc_send_ack_packet(call, false, NULL); 150 //rxrpc_send_ack_packet(call, false, NULL);
152 } 151 }
153#endif
154 152
155 write_lock_bh(&call->state_lock); 153 write_lock_bh(&call->state_lock);
156 154
@@ -315,6 +313,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
315 unsigned int rx_pkt_offset, rx_pkt_len; 313 unsigned int rx_pkt_offset, rx_pkt_len;
316 int ix, copy, ret = -EAGAIN, ret2; 314 int ix, copy, ret = -EAGAIN, ret2;
317 315
316 if (test_and_clear_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags) &&
317 call->ackr_reason)
318 rxrpc_send_ack_packet(call, false, NULL);
319
318 rx_pkt_offset = call->rx_pkt_offset; 320 rx_pkt_offset = call->rx_pkt_offset;
319 rx_pkt_len = call->rx_pkt_len; 321 rx_pkt_len = call->rx_pkt_len;
320 322
@@ -414,6 +416,8 @@ out:
414done: 416done:
415 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, 417 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq,
416 rx_pkt_offset, rx_pkt_len, ret); 418 rx_pkt_offset, rx_pkt_len, ret);
419 if (ret == -EAGAIN)
420 set_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags);
417 return ret; 421 return ret;
418} 422}
419 423
@@ -607,9 +611,7 @@ wait_error:
607 * rxrpc_kernel_recv_data - Allow a kernel service to receive data/info 611 * rxrpc_kernel_recv_data - Allow a kernel service to receive data/info
608 * @sock: The socket that the call exists on 612 * @sock: The socket that the call exists on
609 * @call: The call to send data through 613 * @call: The call to send data through
610 * @buf: The buffer to receive into 614 * @iter: The buffer to receive into
611 * @size: The size of the buffer, including data already read
612 * @_offset: The running offset into the buffer.
613 * @want_more: True if more data is expected to be read 615 * @want_more: True if more data is expected to be read
614 * @_abort: Where the abort code is stored if -ECONNABORTED is returned 616 * @_abort: Where the abort code is stored if -ECONNABORTED is returned
615 * @_service: Where to store the actual service ID (may be upgraded) 617 * @_service: Where to store the actual service ID (may be upgraded)
@@ -622,39 +624,30 @@ wait_error:
622 * Note that we may return -EAGAIN to drain empty packets at the end of the 624 * Note that we may return -EAGAIN to drain empty packets at the end of the
623 * data, even if we've already copied over the requested data. 625 * data, even if we've already copied over the requested data.
624 * 626 *
625 * This function adds the amount it transfers to *_offset, so this should be
626 * precleared as appropriate. Note that the amount remaining in the buffer is
627 * taken to be size - *_offset.
628 *
629 * *_abort should also be initialised to 0. 627 * *_abort should also be initialised to 0.
630 */ 628 */
631int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, 629int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
632 void *buf, size_t size, size_t *_offset, 630 struct iov_iter *iter,
633 bool want_more, u32 *_abort, u16 *_service) 631 bool want_more, u32 *_abort, u16 *_service)
634{ 632{
635 struct iov_iter iter; 633 size_t offset = 0;
636 struct kvec iov;
637 int ret; 634 int ret;
638 635
639 _enter("{%d,%s},%zu/%zu,%d", 636 _enter("{%d,%s},%zu,%d",
640 call->debug_id, rxrpc_call_states[call->state], 637 call->debug_id, rxrpc_call_states[call->state],
641 *_offset, size, want_more); 638 iov_iter_count(iter), want_more);
642 639
643 ASSERTCMP(*_offset, <=, size);
644 ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING); 640 ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING);
645 641
646 iov.iov_base = buf + *_offset;
647 iov.iov_len = size - *_offset;
648 iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset);
649
650 mutex_lock(&call->user_mutex); 642 mutex_lock(&call->user_mutex);
651 643
652 switch (READ_ONCE(call->state)) { 644 switch (READ_ONCE(call->state)) {
653 case RXRPC_CALL_CLIENT_RECV_REPLY: 645 case RXRPC_CALL_CLIENT_RECV_REPLY:
654 case RXRPC_CALL_SERVER_RECV_REQUEST: 646 case RXRPC_CALL_SERVER_RECV_REQUEST:
655 case RXRPC_CALL_SERVER_ACK_REQUEST: 647 case RXRPC_CALL_SERVER_ACK_REQUEST:
656 ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0, 648 ret = rxrpc_recvmsg_data(sock, call, NULL, iter,
657 _offset); 649 iov_iter_count(iter), 0,
650 &offset);
658 if (ret < 0) 651 if (ret < 0)
659 goto out; 652 goto out;
660 653
@@ -663,7 +656,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
663 * full buffer or have been given -EAGAIN. 656 * full buffer or have been given -EAGAIN.
664 */ 657 */
665 if (ret == 1) { 658 if (ret == 1) {
666 if (*_offset < size) 659 if (iov_iter_count(iter) > 0)
667 goto short_data; 660 goto short_data;
668 if (!want_more) 661 if (!want_more)
669 goto read_phase_complete; 662 goto read_phase_complete;
@@ -686,10 +679,21 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
686read_phase_complete: 679read_phase_complete:
687 ret = 1; 680 ret = 1;
688out: 681out:
682 switch (call->ackr_reason) {
683 case RXRPC_ACK_IDLE:
684 break;
685 case RXRPC_ACK_DELAY:
686 if (ret != -EAGAIN)
687 break;
688 /* Fall through */
689 default:
690 rxrpc_send_ack_packet(call, false, NULL);
691 }
692
689 if (_service) 693 if (_service)
690 *_service = call->service_id; 694 *_service = call->service_id;
691 mutex_unlock(&call->user_mutex); 695 mutex_unlock(&call->user_mutex);
692 _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); 696 _leave(" = %d [%zu,%d]", ret, iov_iter_count(iter), *_abort);
693 return ret; 697 return ret;
694 698
695short_data: 699short_data:
@@ -705,7 +709,7 @@ call_complete:
705 ret = call->error; 709 ret = call->error;
706 if (call->completion == RXRPC_CALL_SUCCEEDED) { 710 if (call->completion == RXRPC_CALL_SUCCEEDED) {
707 ret = 1; 711 ret = 1;
708 if (size > 0) 712 if (iov_iter_count(iter) > 0)
709 ret = -ECONNRESET; 713 ret = -ECONNRESET;
710 } 714 }
711 goto out; 715 goto out;
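rxrpc_kernel_recv_data() now takes an iov_iter instead of a buffer, size and running offset, so the space still available is simply iov_iter_count(iter). The userspace analogue below shows the same idea with readv(2): the scatter/gather list carries its own remaining count and no separate offset has to be maintained by the caller.

/* Scatter/gather receive: readv() fills the first iovec, spills into the
 * second, and what is "left" is just total minus what was consumed.
 */
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char a[8], b[8];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};
	size_t total = sizeof(a) + sizeof(b);
	int pfd[2];
	ssize_t n;

	if (pipe(pfd) < 0)
		return 1;
	write(pfd[1], "hello world", 11);	/* less than the 16 bytes offered */

	n = readv(pfd[0], iov, 2);		/* fills a[] then part of b[] */
	printf("copied %zd, remaining %zu\n", n, total - (size_t)n);
	close(pfd[0]);
	close(pfd[1]);
	return 0;
}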
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 278ac0807a60..cea16838d588 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -146,10 +146,10 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
146static int rxkad_secure_packet_auth(const struct rxrpc_call *call, 146static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
147 struct sk_buff *skb, 147 struct sk_buff *skb,
148 u32 data_size, 148 u32 data_size,
149 void *sechdr) 149 void *sechdr,
150 struct skcipher_request *req)
150{ 151{
151 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 152 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
152 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
153 struct rxkad_level1_hdr hdr; 153 struct rxkad_level1_hdr hdr;
154 struct rxrpc_crypt iv; 154 struct rxrpc_crypt iv;
155 struct scatterlist sg; 155 struct scatterlist sg;
@@ -183,12 +183,12 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
183static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, 183static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
184 struct sk_buff *skb, 184 struct sk_buff *skb,
185 u32 data_size, 185 u32 data_size,
186 void *sechdr) 186 void *sechdr,
187 struct skcipher_request *req)
187{ 188{
188 const struct rxrpc_key_token *token; 189 const struct rxrpc_key_token *token;
189 struct rxkad_level2_hdr rxkhdr; 190 struct rxkad_level2_hdr rxkhdr;
190 struct rxrpc_skb_priv *sp; 191 struct rxrpc_skb_priv *sp;
191 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
192 struct rxrpc_crypt iv; 192 struct rxrpc_crypt iv;
193 struct scatterlist sg[16]; 193 struct scatterlist sg[16];
194 struct sk_buff *trailer; 194 struct sk_buff *trailer;
@@ -296,11 +296,12 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
296 ret = 0; 296 ret = 0;
297 break; 297 break;
298 case RXRPC_SECURITY_AUTH: 298 case RXRPC_SECURITY_AUTH:
299 ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr); 299 ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr,
300 req);
300 break; 301 break;
301 case RXRPC_SECURITY_ENCRYPT: 302 case RXRPC_SECURITY_ENCRYPT:
302 ret = rxkad_secure_packet_encrypt(call, skb, data_size, 303 ret = rxkad_secure_packet_encrypt(call, skb, data_size,
303 sechdr); 304 sechdr, req);
304 break; 305 break;
305 default: 306 default:
306 ret = -EPERM; 307 ret = -EPERM;
@@ -316,10 +317,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
316 */ 317 */
317static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, 318static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
318 unsigned int offset, unsigned int len, 319 unsigned int offset, unsigned int len,
319 rxrpc_seq_t seq) 320 rxrpc_seq_t seq,
321 struct skcipher_request *req)
320{ 322{
321 struct rxkad_level1_hdr sechdr; 323 struct rxkad_level1_hdr sechdr;
322 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
323 struct rxrpc_crypt iv; 324 struct rxrpc_crypt iv;
324 struct scatterlist sg[16]; 325 struct scatterlist sg[16];
325 struct sk_buff *trailer; 326 struct sk_buff *trailer;
@@ -402,11 +403,11 @@ nomem:
402 */ 403 */
403static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, 404static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
404 unsigned int offset, unsigned int len, 405 unsigned int offset, unsigned int len,
405 rxrpc_seq_t seq) 406 rxrpc_seq_t seq,
407 struct skcipher_request *req)
406{ 408{
407 const struct rxrpc_key_token *token; 409 const struct rxrpc_key_token *token;
408 struct rxkad_level2_hdr sechdr; 410 struct rxkad_level2_hdr sechdr;
409 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
410 struct rxrpc_crypt iv; 411 struct rxrpc_crypt iv;
411 struct scatterlist _sg[4], *sg; 412 struct scatterlist _sg[4], *sg;
412 struct sk_buff *trailer; 413 struct sk_buff *trailer;
@@ -549,9 +550,9 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
549 case RXRPC_SECURITY_PLAIN: 550 case RXRPC_SECURITY_PLAIN:
550 return 0; 551 return 0;
551 case RXRPC_SECURITY_AUTH: 552 case RXRPC_SECURITY_AUTH:
552 return rxkad_verify_packet_1(call, skb, offset, len, seq); 553 return rxkad_verify_packet_1(call, skb, offset, len, seq, req);
553 case RXRPC_SECURITY_ENCRYPT: 554 case RXRPC_SECURITY_ENCRYPT:
554 return rxkad_verify_packet_2(call, skb, offset, len, seq); 555 return rxkad_verify_packet_2(call, skb, offset, len, seq, req);
555 default: 556 default:
556 return -ENOANO; 557 return -ENOANO;
557 } 558 }
@@ -665,11 +666,13 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
665 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); 666 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
666 if (ret < 0) { 667 if (ret < 0) {
667 trace_rxrpc_tx_fail(conn->debug_id, serial, ret, 668 trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
668 rxrpc_tx_fail_conn_challenge); 669 rxrpc_tx_point_rxkad_challenge);
669 return -EAGAIN; 670 return -EAGAIN;
670 } 671 }
671 672
672 conn->params.peer->last_tx_at = ktime_get_real(); 673 conn->params.peer->last_tx_at = ktime_get_seconds();
674 trace_rxrpc_tx_packet(conn->debug_id, &whdr,
675 rxrpc_tx_point_rxkad_challenge);
673 _leave(" = 0"); 676 _leave(" = 0");
674 return 0; 677 return 0;
675} 678}
@@ -721,11 +724,11 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
721 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len); 724 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len);
722 if (ret < 0) { 725 if (ret < 0) {
723 trace_rxrpc_tx_fail(conn->debug_id, serial, ret, 726 trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
724 rxrpc_tx_fail_conn_response); 727 rxrpc_tx_point_rxkad_response);
725 return -EAGAIN; 728 return -EAGAIN;
726 } 729 }
727 730
728 conn->params.peer->last_tx_at = ktime_get_real(); 731 conn->params.peer->last_tx_at = ktime_get_seconds();
729 _leave(" = 0"); 732 _leave(" = 0");
730 return 0; 733 return 0;
731} 734}
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 4a7af7aff37d..d75bd15151e6 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -15,7 +15,6 @@
15#include "ar-internal.h" 15#include "ar-internal.h"
16 16
17static struct ctl_table_header *rxrpc_sysctl_reg_table; 17static struct ctl_table_header *rxrpc_sysctl_reg_table;
18static const unsigned int zero = 0;
19static const unsigned int one = 1; 18static const unsigned int one = 1;
20static const unsigned int four = 4; 19static const unsigned int four = 4;
21static const unsigned int thirtytwo = 32; 20static const unsigned int thirtytwo = 32;
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169fb5325..e95741388311 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -1,6 +1,6 @@
1# 1#
2# Traffic control configuration. 2# Traffic control configuration.
3# 3#
4 4
5menuconfig NET_SCHED 5menuconfig NET_SCHED
6 bool "QoS and/or fair queueing" 6 bool "QoS and/or fair queueing"
@@ -183,6 +183,17 @@ config NET_SCH_CBS
183 To compile this code as a module, choose M here: the 183 To compile this code as a module, choose M here: the
184 module will be called sch_cbs. 184 module will be called sch_cbs.
185 185
186config NET_SCH_ETF
187 tristate "Earliest TxTime First (ETF)"
188 help
189 Say Y here if you want to use the Earliest TxTime First (ETF) packet
190 scheduling algorithm.
191
192 See the top of <file:net/sched/sch_etf.c> for more details.
193
194 To compile this code as a module, choose M here: the
195 module will be called sch_etf.
196
186config NET_SCH_GRED 197config NET_SCH_GRED
187 tristate "Generic Random Early Detection (GRED)" 198 tristate "Generic Random Early Detection (GRED)"
188 ---help--- 199 ---help---
@@ -240,6 +251,19 @@ config NET_SCH_MQPRIO
240 251
241 If unsure, say N. 252 If unsure, say N.
242 253
254config NET_SCH_SKBPRIO
255 tristate "SKB priority queue scheduler (SKBPRIO)"
256 help
257 Say Y here if you want to use the SKB priority queue
258 scheduler. This schedules packets according to skb->priority,
259 which is useful for request packets in DoS mitigation systems such
260 as Gatekeeper.
261
262 To compile this driver as a module, choose M here: the module will
263 be called sch_skbprio.
264
265 If unsure, say N.
266
243config NET_SCH_CHOKE 267config NET_SCH_CHOKE
244 tristate "CHOose and Keep responsive flow scheduler (CHOKE)" 268 tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
245 help 269 help
@@ -284,6 +308,17 @@ config NET_SCH_FQ_CODEL
284 308
285 If unsure, say N. 309 If unsure, say N.
286 310
311config NET_SCH_CAKE
312 tristate "Common Applications Kept Enhanced (CAKE)"
313 help
314 Say Y here if you want to use the Common Applications Kept Enhanced
315 (CAKE) queue management algorithm.
316
317 To compile this driver as a module, choose M here: the module
318 will be called sch_cake.
319
320 If unsure, say N.
321
287config NET_SCH_FQ 322config NET_SCH_FQ
288 tristate "Fair Queue" 323 tristate "Fair Queue"
289 help 324 help
@@ -684,7 +719,7 @@ config NET_CLS_ACT
684 719
685config NET_ACT_POLICE 720config NET_ACT_POLICE
686 tristate "Traffic Policing" 721 tristate "Traffic Policing"
687 depends on NET_CLS_ACT 722 depends on NET_CLS_ACT
688 ---help--- 723 ---help---
689 Say Y here if you want to do traffic policing, i.e. strict 724 Say Y here if you want to do traffic policing, i.e. strict
690 bandwidth limiting. This action replaces the existing policing 725 bandwidth limiting. This action replaces the existing policing
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d3804878..f0403f49edcb 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
33obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o 33obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
34obj-$(CONFIG_NET_SCH_RED) += sch_red.o 34obj-$(CONFIG_NET_SCH_RED) += sch_red.o
35obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o 35obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
36obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o 36obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
37obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o 37obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
38obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o 38obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
39obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o 39obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
@@ -46,14 +46,17 @@ obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
46obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o 46obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
47obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o 47obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
48obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o 48obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
49obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o
49obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o 50obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
50obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o 51obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
51obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o 52obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
52obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o 53obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o
54obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o
53obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o 55obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
54obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o 56obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
55obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o 57obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
56obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o 58obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
59obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
57 60
58obj-$(CONFIG_NET_CLS_U32) += cls_u32.o 61obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
59obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o 62obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 3f4cf930f809..db83dac1e7f4 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -36,7 +36,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp)
36 36
37 if (!tp) 37 if (!tp)
38 return -EINVAL; 38 return -EINVAL;
39 a->goto_chain = tcf_chain_get(tp->chain->block, chain_index, true); 39 a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index);
40 if (!a->goto_chain) 40 if (!a->goto_chain)
41 return -ENOMEM; 41 return -ENOMEM;
42 return 0; 42 return 0;
@@ -44,7 +44,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp)
44 44
45static void tcf_action_goto_chain_fini(struct tc_action *a) 45static void tcf_action_goto_chain_fini(struct tc_action *a)
46{ 46{
47 tcf_chain_put(a->goto_chain); 47 tcf_chain_put_by_act(a->goto_chain);
48} 48}
49 49
50static void tcf_action_goto_chain_exec(const struct tc_action *a, 50static void tcf_action_goto_chain_exec(const struct tc_action *a,
@@ -55,6 +55,24 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a,
55 res->goto_tp = rcu_dereference_bh(chain->filter_chain); 55 res->goto_tp = rcu_dereference_bh(chain->filter_chain);
56} 56}
57 57
58static void tcf_free_cookie_rcu(struct rcu_head *p)
59{
60 struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu);
61
62 kfree(cookie->data);
63 kfree(cookie);
64}
65
66static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
67 struct tc_cookie *new_cookie)
68{
69 struct tc_cookie *old;
70
71 old = xchg((__force struct tc_cookie **)old_cookie, new_cookie);
72 if (old)
73 call_rcu(&old->rcu, tcf_free_cookie_rcu);
74}
75
58/* XXX: For standalone actions, we don't need a RCU grace period either, because 76/* XXX: For standalone actions, we don't need a RCU grace period either, because
59 * actions are always connected to filters and filters are already destroyed in 77 * actions are always connected to filters and filters are already destroyed in
60 * RCU callbacks, so after a RCU grace period actions are already disconnected 78 * RCU callbacks, so after a RCU grace period actions are already disconnected
@@ -65,44 +83,64 @@ static void free_tcf(struct tc_action *p)
65 free_percpu(p->cpu_bstats); 83 free_percpu(p->cpu_bstats);
66 free_percpu(p->cpu_qstats); 84 free_percpu(p->cpu_qstats);
67 85
68 if (p->act_cookie) { 86 tcf_set_action_cookie(&p->act_cookie, NULL);
69 kfree(p->act_cookie->data);
70 kfree(p->act_cookie);
71 }
72 if (p->goto_chain) 87 if (p->goto_chain)
73 tcf_action_goto_chain_fini(p); 88 tcf_action_goto_chain_fini(p);
74 89
75 kfree(p); 90 kfree(p);
76} 91}
77 92
78static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) 93static void tcf_action_cleanup(struct tc_action *p)
79{ 94{
80 spin_lock(&idrinfo->lock); 95 if (p->ops->cleanup)
81 idr_remove(&idrinfo->action_idr, p->tcfa_index); 96 p->ops->cleanup(p);
82 spin_unlock(&idrinfo->lock); 97
83 gen_kill_estimator(&p->tcfa_rate_est); 98 gen_kill_estimator(&p->tcfa_rate_est);
84 free_tcf(p); 99 free_tcf(p);
85} 100}
86 101
102static int __tcf_action_put(struct tc_action *p, bool bind)
103{
104 struct tcf_idrinfo *idrinfo = p->idrinfo;
105
106 if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) {
107 if (bind)
108 atomic_dec(&p->tcfa_bindcnt);
109 idr_remove(&idrinfo->action_idr, p->tcfa_index);
110 spin_unlock(&idrinfo->lock);
111
112 tcf_action_cleanup(p);
113 return 1;
114 }
115
116 if (bind)
117 atomic_dec(&p->tcfa_bindcnt);
118
119 return 0;
120}
121
87int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) 122int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
88{ 123{
89 int ret = 0; 124 int ret = 0;
90 125
91 ASSERT_RTNL(); 126 /* Release with strict==1 and bind==0 is only called through act API
92 127 * interface (classifiers always bind). Only case when action with
128 * positive reference count and zero bind count can exist is when it was
129 * also created with act API (unbinding last classifier will destroy the
130 * action if it was created by classifier). So only case when bind count
131 * can be changed after initial check is when unbound action is
132 * destroyed by act API while classifier binds to action with same id
133 * concurrently. This result either creation of new action(same behavior
134 * as before), or reusing existing action if concurrent process
135 * increments reference count before action is deleted. Both scenarios
136 * are acceptable.
137 */
93 if (p) { 138 if (p) {
94 if (bind) 139 if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0)
95 p->tcfa_bindcnt--;
96 else if (strict && p->tcfa_bindcnt > 0)
97 return -EPERM; 140 return -EPERM;
98 141
99 p->tcfa_refcnt--; 142 if (__tcf_action_put(p, bind))
100 if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
101 if (p->ops->cleanup)
102 p->ops->cleanup(p);
103 tcf_idr_remove(p->idrinfo, p);
104 ret = ACT_P_DELETED; 143 ret = ACT_P_DELETED;
105 }
106 } 144 }
107 145
108 return ret; 146 return ret;
@@ -111,10 +149,15 @@ EXPORT_SYMBOL(__tcf_idr_release);
111 149
112static size_t tcf_action_shared_attrs_size(const struct tc_action *act) 150static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
113{ 151{
152 struct tc_cookie *act_cookie;
114 u32 cookie_len = 0; 153 u32 cookie_len = 0;
115 154
116 if (act->act_cookie) 155 rcu_read_lock();
117 cookie_len = nla_total_size(act->act_cookie->len); 156 act_cookie = rcu_dereference(act->act_cookie);
157
158 if (act_cookie)
159 cookie_len = nla_total_size(act_cookie->len);
160 rcu_read_unlock();
118 161
119 return nla_total_size(0) /* action number nested */ 162 return nla_total_size(0) /* action number nested */
120 + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ 163 + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
@@ -257,46 +300,59 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
257} 300}
258EXPORT_SYMBOL(tcf_generic_walker); 301EXPORT_SYMBOL(tcf_generic_walker);
259 302
260static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo) 303int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
261{ 304{
262 struct tc_action *p = NULL; 305 struct tcf_idrinfo *idrinfo = tn->idrinfo;
306 struct tc_action *p;
263 307
264 spin_lock(&idrinfo->lock); 308 spin_lock(&idrinfo->lock);
265 p = idr_find(&idrinfo->action_idr, index); 309 p = idr_find(&idrinfo->action_idr, index);
310 if (IS_ERR(p))
311 p = NULL;
312 else if (p)
313 refcount_inc(&p->tcfa_refcnt);
266 spin_unlock(&idrinfo->lock); 314 spin_unlock(&idrinfo->lock);
267 315
268 return p;
269}
270
271int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
272{
273 struct tcf_idrinfo *idrinfo = tn->idrinfo;
274 struct tc_action *p = tcf_idr_lookup(index, idrinfo);
275
276 if (p) { 316 if (p) {
277 *a = p; 317 *a = p;
278 return 1; 318 return true;
279 } 319 }
280 return 0; 320 return false;
281} 321}
282EXPORT_SYMBOL(tcf_idr_search); 322EXPORT_SYMBOL(tcf_idr_search);
283 323
284bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, 324static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
285 int bind)
286{ 325{
287 struct tcf_idrinfo *idrinfo = tn->idrinfo; 326 struct tc_action *p;
288 struct tc_action *p = tcf_idr_lookup(index, idrinfo); 327 int ret = 0;
289 328
290 if (index && p) { 329 spin_lock(&idrinfo->lock);
291 if (bind) 330 p = idr_find(&idrinfo->action_idr, index);
292 p->tcfa_bindcnt++; 331 if (!p) {
293 p->tcfa_refcnt++; 332 spin_unlock(&idrinfo->lock);
294 *a = p; 333 return -ENOENT;
295 return true;
296 } 334 }
297 return false; 335
336 if (!atomic_read(&p->tcfa_bindcnt)) {
337 if (refcount_dec_and_test(&p->tcfa_refcnt)) {
338 struct module *owner = p->ops->owner;
339
340 WARN_ON(p != idr_remove(&idrinfo->action_idr,
341 p->tcfa_index));
342 spin_unlock(&idrinfo->lock);
343
344 tcf_action_cleanup(p);
345 module_put(owner);
346 return 0;
347 }
348 ret = 0;
349 } else {
350 ret = -EPERM;
351 }
352
353 spin_unlock(&idrinfo->lock);
354 return ret;
298} 355}
299EXPORT_SYMBOL(tcf_idr_check);
300 356
301int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, 357int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
302 struct tc_action **a, const struct tc_action_ops *ops, 358 struct tc_action **a, const struct tc_action_ops *ops,
@@ -304,14 +360,13 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
304{ 360{
305 struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); 361 struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
306 struct tcf_idrinfo *idrinfo = tn->idrinfo; 362 struct tcf_idrinfo *idrinfo = tn->idrinfo;
307 struct idr *idr = &idrinfo->action_idr;
308 int err = -ENOMEM; 363 int err = -ENOMEM;
309 364
310 if (unlikely(!p)) 365 if (unlikely(!p))
311 return -ENOMEM; 366 return -ENOMEM;
312 p->tcfa_refcnt = 1; 367 refcount_set(&p->tcfa_refcnt, 1);
313 if (bind) 368 if (bind)
314 p->tcfa_bindcnt = 1; 369 atomic_set(&p->tcfa_bindcnt, 1);
315 370
316 if (cpustats) { 371 if (cpustats) {
317 p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); 372 p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
@@ -322,20 +377,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
322 goto err2; 377 goto err2;
323 } 378 }
324 spin_lock_init(&p->tcfa_lock); 379 spin_lock_init(&p->tcfa_lock);
325 idr_preload(GFP_KERNEL);
326 spin_lock(&idrinfo->lock);
327 /* user doesn't specify an index */
328 if (!index) {
329 index = 1;
330 err = idr_alloc_u32(idr, NULL, &index, UINT_MAX, GFP_ATOMIC);
331 } else {
332 err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC);
333 }
334 spin_unlock(&idrinfo->lock);
335 idr_preload_end();
336 if (err)
337 goto err3;
338
339 p->tcfa_index = index; 380 p->tcfa_index = index;
340 p->tcfa_tm.install = jiffies; 381 p->tcfa_tm.install = jiffies;
341 p->tcfa_tm.lastuse = jiffies; 382 p->tcfa_tm.lastuse = jiffies;
@@ -345,16 +386,13 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
345 &p->tcfa_rate_est, 386 &p->tcfa_rate_est,
346 &p->tcfa_lock, NULL, est); 387 &p->tcfa_lock, NULL, est);
347 if (err) 388 if (err)
348 goto err4; 389 goto err3;
349 } 390 }
350 391
351 p->idrinfo = idrinfo; 392 p->idrinfo = idrinfo;
352 p->ops = ops; 393 p->ops = ops;
353 INIT_LIST_HEAD(&p->list);
354 *a = p; 394 *a = p;
355 return 0; 395 return 0;
356err4:
357 idr_remove(idr, index);
358err3: 396err3:
359 free_percpu(p->cpu_qstats); 397 free_percpu(p->cpu_qstats);
360err2: 398err2:
@@ -370,11 +408,78 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
370 struct tcf_idrinfo *idrinfo = tn->idrinfo; 408 struct tcf_idrinfo *idrinfo = tn->idrinfo;
371 409
372 spin_lock(&idrinfo->lock); 410 spin_lock(&idrinfo->lock);
373 idr_replace(&idrinfo->action_idr, a, a->tcfa_index); 411 /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
412 WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index)));
374 spin_unlock(&idrinfo->lock); 413 spin_unlock(&idrinfo->lock);
375} 414}
376EXPORT_SYMBOL(tcf_idr_insert); 415EXPORT_SYMBOL(tcf_idr_insert);
377 416
417/* Cleanup idr index that was allocated but not initialized. */
418
419void tcf_idr_cleanup(struct tc_action_net *tn, u32 index)
420{
421 struct tcf_idrinfo *idrinfo = tn->idrinfo;
422
423 spin_lock(&idrinfo->lock);
424 /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
425 WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index)));
426 spin_unlock(&idrinfo->lock);
427}
428EXPORT_SYMBOL(tcf_idr_cleanup);
429
430/* Check if action with specified index exists. If actions is found, increments
431 * its reference and bind counters, and return 1. Otherwise insert temporary
432 * error pointer (to prevent concurrent users from inserting actions with same
433 * index) and return 0.
434 */
435
436int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
437 struct tc_action **a, int bind)
438{
439 struct tcf_idrinfo *idrinfo = tn->idrinfo;
440 struct tc_action *p;
441 int ret;
442
443again:
444 spin_lock(&idrinfo->lock);
445 if (*index) {
446 p = idr_find(&idrinfo->action_idr, *index);
447 if (IS_ERR(p)) {
448 /* This means that another process allocated
449 * index but did not assign the pointer yet.
450 */
451 spin_unlock(&idrinfo->lock);
452 goto again;
453 }
454
455 if (p) {
456 refcount_inc(&p->tcfa_refcnt);
457 if (bind)
458 atomic_inc(&p->tcfa_bindcnt);
459 *a = p;
460 ret = 1;
461 } else {
462 *a = NULL;
463 ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
464 *index, GFP_ATOMIC);
465 if (!ret)
466 idr_replace(&idrinfo->action_idr,
467 ERR_PTR(-EBUSY), *index);
468 }
469 } else {
470 *index = 1;
471 *a = NULL;
472 ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
473 UINT_MAX, GFP_ATOMIC);
474 if (!ret)
475 idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY),
476 *index);
477 }
478 spin_unlock(&idrinfo->lock);
479 return ret;
480}
481EXPORT_SYMBOL(tcf_idr_check_alloc);
482
378void tcf_idrinfo_destroy(const struct tc_action_ops *ops, 483void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
379 struct tcf_idrinfo *idrinfo) 484 struct tcf_idrinfo *idrinfo)
380{ 485{
@@ -538,13 +643,15 @@ repeat:
538} 643}
539EXPORT_SYMBOL(tcf_action_exec); 644EXPORT_SYMBOL(tcf_action_exec);
540 645
541int tcf_action_destroy(struct list_head *actions, int bind) 646int tcf_action_destroy(struct tc_action *actions[], int bind)
542{ 647{
543 const struct tc_action_ops *ops; 648 const struct tc_action_ops *ops;
544 struct tc_action *a, *tmp; 649 struct tc_action *a;
545 int ret = 0; 650 int ret = 0, i;
546 651
547 list_for_each_entry_safe(a, tmp, actions, list) { 652 for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
653 a = actions[i];
654 actions[i] = NULL;
548 ops = a->ops; 655 ops = a->ops;
549 ret = __tcf_idr_release(a, bind, true); 656 ret = __tcf_idr_release(a, bind, true);
550 if (ret == ACT_P_DELETED) 657 if (ret == ACT_P_DELETED)
@@ -555,6 +662,28 @@ int tcf_action_destroy(struct list_head *actions, int bind)
555 return ret; 662 return ret;
556} 663}
557 664
665static int tcf_action_put(struct tc_action *p)
666{
667 return __tcf_action_put(p, false);
668}
669
670/* Put all actions in this array, skip those NULL's. */
671static void tcf_action_put_many(struct tc_action *actions[])
672{
673 int i;
674
675 for (i = 0; i < TCA_ACT_MAX_PRIO; i++) {
676 struct tc_action *a = actions[i];
677 const struct tc_action_ops *ops;
678
679 if (!a)
680 continue;
681 ops = a->ops;
682 if (tcf_action_put(a))
683 module_put(ops->owner);
684 }
685}
686
558int 687int
559tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) 688tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
560{ 689{
@@ -567,16 +696,22 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
567 int err = -EINVAL; 696 int err = -EINVAL;
568 unsigned char *b = skb_tail_pointer(skb); 697 unsigned char *b = skb_tail_pointer(skb);
569 struct nlattr *nest; 698 struct nlattr *nest;
699 struct tc_cookie *cookie;
570 700
571 if (nla_put_string(skb, TCA_KIND, a->ops->kind)) 701 if (nla_put_string(skb, TCA_KIND, a->ops->kind))
572 goto nla_put_failure; 702 goto nla_put_failure;
573 if (tcf_action_copy_stats(skb, a, 0)) 703 if (tcf_action_copy_stats(skb, a, 0))
574 goto nla_put_failure; 704 goto nla_put_failure;
575 if (a->act_cookie) { 705
576 if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len, 706 rcu_read_lock();
577 a->act_cookie->data)) 707 cookie = rcu_dereference(a->act_cookie);
708 if (cookie) {
709 if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
710 rcu_read_unlock();
578 goto nla_put_failure; 711 goto nla_put_failure;
712 }
579 } 713 }
714 rcu_read_unlock();
580 715
581 nest = nla_nest_start(skb, TCA_OPTIONS); 716 nest = nla_nest_start(skb, TCA_OPTIONS);
582 if (nest == NULL) 717 if (nest == NULL)
@@ -593,14 +728,15 @@ nla_put_failure:
593} 728}
594EXPORT_SYMBOL(tcf_action_dump_1); 729EXPORT_SYMBOL(tcf_action_dump_1);
595 730
596int tcf_action_dump(struct sk_buff *skb, struct list_head *actions, 731int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
597 int bind, int ref) 732 int bind, int ref)
598{ 733{
599 struct tc_action *a; 734 struct tc_action *a;
600 int err = -EINVAL; 735 int err = -EINVAL, i;
601 struct nlattr *nest; 736 struct nlattr *nest;
602 737
603 list_for_each_entry(a, actions, list) { 738 for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
739 a = actions[i];
604 nest = nla_nest_start(skb, a->order); 740 nest = nla_nest_start(skb, a->order);
605 if (nest == NULL) 741 if (nest == NULL)
606 goto nla_put_failure; 742 goto nla_put_failure;
@@ -635,9 +771,19 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
635 return c; 771 return c;
636} 772}
637 773
774static bool tcf_action_valid(int action)
775{
776 int opcode = TC_ACT_EXT_OPCODE(action);
777
778 if (!opcode)
779 return action <= TC_ACT_VALUE_MAX;
780 return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC;
781}
782
638struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, 783struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
639 struct nlattr *nla, struct nlattr *est, 784 struct nlattr *nla, struct nlattr *est,
640 char *name, int ovr, int bind, 785 char *name, int ovr, int bind,
786 bool rtnl_held,
641 struct netlink_ext_ack *extack) 787 struct netlink_ext_ack *extack)
642{ 788{
643 struct tc_action *a; 789 struct tc_action *a;
@@ -688,9 +834,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
688 a_o = tc_lookup_action_n(act_name); 834 a_o = tc_lookup_action_n(act_name);
689 if (a_o == NULL) { 835 if (a_o == NULL) {
690#ifdef CONFIG_MODULES 836#ifdef CONFIG_MODULES
691 rtnl_unlock(); 837 if (rtnl_held)
838 rtnl_unlock();
692 request_module("act_%s", act_name); 839 request_module("act_%s", act_name);
693 rtnl_lock(); 840 if (rtnl_held)
841 rtnl_lock();
694 842
695 a_o = tc_lookup_action_n(act_name); 843 a_o = tc_lookup_action_n(act_name);
696 844
@@ -713,19 +861,15 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
713 /* backward compatibility for policer */ 861 /* backward compatibility for policer */
714 if (name == NULL) 862 if (name == NULL)
715 err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, 863 err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
716 extack); 864 rtnl_held, extack);
717 else 865 else
718 err = a_o->init(net, nla, est, &a, ovr, bind, extack); 866 err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
867 extack);
719 if (err < 0) 868 if (err < 0)
720 goto err_mod; 869 goto err_mod;
721 870
722 if (name == NULL && tb[TCA_ACT_COOKIE]) { 871 if (!name && tb[TCA_ACT_COOKIE])
723 if (a->act_cookie) { 872 tcf_set_action_cookie(&a->act_cookie, cookie);
724 kfree(a->act_cookie->data);
725 kfree(a->act_cookie);
726 }
727 a->act_cookie = cookie;
728 }
729 873
730 /* module count goes up only when brand new policy is created 874 /* module count goes up only when brand new policy is created
731 * if it exists and is only bound to in a_o->init() then 875 * if it exists and is only bound to in a_o->init() then
@@ -737,15 +881,19 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
737 if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { 881 if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
738 err = tcf_action_goto_chain_init(a, tp); 882 err = tcf_action_goto_chain_init(a, tp);
739 if (err) { 883 if (err) {
740 LIST_HEAD(actions); 884 struct tc_action *actions[] = { a, NULL };
741 885
742 list_add_tail(&a->list, &actions); 886 tcf_action_destroy(actions, bind);
743 tcf_action_destroy(&actions, bind);
744 NL_SET_ERR_MSG(extack, "Failed to init TC action chain"); 887 NL_SET_ERR_MSG(extack, "Failed to init TC action chain");
745 return ERR_PTR(err); 888 return ERR_PTR(err);
746 } 889 }
747 } 890 }
748 891
892 if (!tcf_action_valid(a->tcfa_action)) {
893 NL_SET_ERR_MSG(extack, "invalid action value, using TC_ACT_UNSPEC instead");
894 a->tcfa_action = TC_ACT_UNSPEC;
895 }
896
749 return a; 897 return a;
750 898
751err_mod: 899err_mod:
@@ -758,21 +906,12 @@ err_out:
758 return ERR_PTR(err); 906 return ERR_PTR(err);
759} 907}
760 908
761static void cleanup_a(struct list_head *actions, int ovr) 909/* Returns numbers of initialized actions or negative error. */
762{
763 struct tc_action *a;
764
765 if (!ovr)
766 return;
767
768 list_for_each_entry(a, actions, list)
769 a->tcfa_refcnt--;
770}
771 910
772int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, 911int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
773 struct nlattr *est, char *name, int ovr, int bind, 912 struct nlattr *est, char *name, int ovr, int bind,
774 struct list_head *actions, size_t *attr_size, 913 struct tc_action *actions[], size_t *attr_size,
775 struct netlink_ext_ack *extack) 914 bool rtnl_held, struct netlink_ext_ack *extack)
776{ 915{
777 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; 916 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
778 struct tc_action *act; 917 struct tc_action *act;
@@ -786,25 +925,19 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
786 925
787 for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { 926 for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
788 act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, 927 act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind,
789 extack); 928 rtnl_held, extack);
790 if (IS_ERR(act)) { 929 if (IS_ERR(act)) {
791 err = PTR_ERR(act); 930 err = PTR_ERR(act);
792 goto err; 931 goto err;
793 } 932 }
794 act->order = i; 933 act->order = i;
795 sz += tcf_action_fill_size(act); 934 sz += tcf_action_fill_size(act);
796 if (ovr) 935 /* Start from index 0 */
797 act->tcfa_refcnt++; 936 actions[i - 1] = act;
798 list_add_tail(&act->list, actions);
799 } 937 }
800 938
801 *attr_size = tcf_action_full_attrs_size(sz); 939 *attr_size = tcf_action_full_attrs_size(sz);
802 940 return i - 1;
803 /* Remove the temp refcnt which was necessary to protect against
804 * destroying an existing action which was being replaced
805 */
806 cleanup_a(actions, ovr);
807 return 0;
808 941
809err: 942err:
810 tcf_action_destroy(actions, bind); 943 tcf_action_destroy(actions, bind);
@@ -855,7 +988,7 @@ errout:
855 return -1; 988 return -1;
856} 989}
857 990
858static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, 991static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[],
859 u32 portid, u32 seq, u16 flags, int event, int bind, 992 u32 portid, u32 seq, u16 flags, int event, int bind,
860 int ref) 993 int ref)
861{ 994{
@@ -891,7 +1024,7 @@ out_nlmsg_trim:
891 1024
892static int 1025static int
893tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, 1026tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
894 struct list_head *actions, int event, 1027 struct tc_action *actions[], int event,
895 struct netlink_ext_ack *extack) 1028 struct netlink_ext_ack *extack)
896{ 1029{
897 struct sk_buff *skb; 1030 struct sk_buff *skb;
@@ -900,7 +1033,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
900 if (!skb) 1033 if (!skb)
901 return -ENOBUFS; 1034 return -ENOBUFS;
902 if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 1035 if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
903 0, 0) <= 0) { 1036 0, 1) <= 0) {
904 NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); 1037 NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
905 kfree_skb(skb); 1038 kfree_skb(skb);
906 return -EINVAL; 1039 return -EINVAL;
@@ -1027,8 +1160,37 @@ err_out:
1027 return err; 1160 return err;
1028} 1161}
1029 1162
1163static int tcf_action_delete(struct net *net, struct tc_action *actions[])
1164{
1165 int i;
1166
1167 for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
1168 struct tc_action *a = actions[i];
1169 const struct tc_action_ops *ops = a->ops;
1170 /* Actions can be deleted concurrently so we must save their
1171 * type and id to search again after reference is released.
1172 */
1173 struct tcf_idrinfo *idrinfo = a->idrinfo;
1174 u32 act_index = a->tcfa_index;
1175
1176 if (tcf_action_put(a)) {
1177 /* last reference, action was deleted concurrently */
1178 module_put(ops->owner);
1179 } else {
1180 int ret;
1181
1182 /* now do the delete */
1183 ret = tcf_idr_delete_index(idrinfo, act_index);
1184 if (ret < 0)
1185 return ret;
1186 }
1187 actions[i] = NULL;
1188 }
1189 return 0;
1190}
1191
1030static int 1192static int
1031tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, 1193tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
1032 u32 portid, size_t attr_size, struct netlink_ext_ack *extack) 1194 u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
1033{ 1195{
1034 int ret; 1196 int ret;
@@ -1040,14 +1202,14 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
1040 return -ENOBUFS; 1202 return -ENOBUFS;
1041 1203
1042 if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, 1204 if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
1043 0, 1) <= 0) { 1205 0, 2) <= 0) {
1044 NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); 1206 NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes");
1045 kfree_skb(skb); 1207 kfree_skb(skb);
1046 return -EINVAL; 1208 return -EINVAL;
1047 } 1209 }
1048 1210
1049 /* now do the delete */ 1211 /* now do the delete */
1050 ret = tcf_action_destroy(actions, 0); 1212 ret = tcf_action_delete(net, actions);
1051 if (ret < 0) { 1213 if (ret < 0) {
1052 NL_SET_ERR_MSG(extack, "Failed to delete TC action"); 1214 NL_SET_ERR_MSG(extack, "Failed to delete TC action");
1053 kfree_skb(skb); 1215 kfree_skb(skb);
@@ -1069,7 +1231,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
1069 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; 1231 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
1070 struct tc_action *act; 1232 struct tc_action *act;
1071 size_t attr_size = 0; 1233 size_t attr_size = 0;
1072 LIST_HEAD(actions); 1234 struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
1073 1235
1074 ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); 1236 ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
1075 if (ret < 0) 1237 if (ret < 0)
@@ -1091,27 +1253,26 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
1091 } 1253 }
1092 act->order = i; 1254 act->order = i;
1093 attr_size += tcf_action_fill_size(act); 1255 attr_size += tcf_action_fill_size(act);
1094 list_add_tail(&act->list, &actions); 1256 actions[i - 1] = act;
1095 } 1257 }
1096 1258
1097 attr_size = tcf_action_full_attrs_size(attr_size); 1259 attr_size = tcf_action_full_attrs_size(attr_size);
1098 1260
1099 if (event == RTM_GETACTION) 1261 if (event == RTM_GETACTION)
1100 ret = tcf_get_notify(net, portid, n, &actions, event, extack); 1262 ret = tcf_get_notify(net, portid, n, actions, event, extack);
1101 else { /* delete */ 1263 else { /* delete */
1102 ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack); 1264 ret = tcf_del_notify(net, n, actions, portid, attr_size, extack);
1103 if (ret) 1265 if (ret)
1104 goto err; 1266 goto err;
1105 return ret; 1267 return 0;
1106 } 1268 }
1107err: 1269err:
1108 if (event != RTM_GETACTION) 1270 tcf_action_put_many(actions);
1109 tcf_action_destroy(&actions, 0);
1110 return ret; 1271 return ret;
1111} 1272}
1112 1273
1113static int 1274static int
1114tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, 1275tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
1115 u32 portid, size_t attr_size, struct netlink_ext_ack *extack) 1276 u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
1116{ 1277{
1117 struct sk_buff *skb; 1278 struct sk_buff *skb;
@@ -1142,14 +1303,17 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
1142{ 1303{
1143 size_t attr_size = 0; 1304 size_t attr_size = 0;
1144 int ret = 0; 1305 int ret = 0;
1145 LIST_HEAD(actions); 1306 struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
1146 1307
1147 ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, 1308 ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions,
1148 &attr_size, extack); 1309 &attr_size, true, extack);
1149 if (ret) 1310 if (ret < 0)
1150 return ret; 1311 return ret;
1312 ret = tcf_add_notify(net, n, actions, portid, attr_size, extack);
1313 if (ovr)
1314 tcf_action_put_many(actions);
1151 1315
1152 return tcf_add_notify(net, n, &actions, portid, attr_size, extack); 1316 return ret;
1153} 1317}
1154 1318
1155static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; 1319static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
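The act_api.c changes above move action lifetime from RTNL-protected integer counters to refcount_t/atomic_t, with __tcf_action_put() using refcount_dec_and_lock() so that dropping the final reference both brings the count to zero and unpublishes the action from the IDR under idrinfo->lock. The userspace sketch below models only that last-reference-unpublishes pattern; it is not kernel code, all toy_* names are invented, and a plain array stands in for the IDR.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_action {
	atomic_int refcnt;
	int index;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct toy_action *table[16];	/* stands in for the IDR */

/* Drop one reference; returns true if this call destroyed the action. */
static bool toy_action_put(struct toy_action *a)
{
	int old = atomic_load(&a->refcnt);

	/* Fast path: clearly not the last reference, just decrement. */
	while (old > 1)
		if (atomic_compare_exchange_weak(&a->refcnt, &old, old - 1))
			return false;

	/* Possibly the last reference: take the table lock before the final
	 * decrement, mirroring refcount_dec_and_lock(), so a lookup under the
	 * same lock either sees a live entry or no entry at all.
	 */
	pthread_mutex_lock(&table_lock);
	if (atomic_fetch_sub(&a->refcnt, 1) == 1) {
		table[a->index] = NULL;		/* unpublish before freeing */
		pthread_mutex_unlock(&table_lock);
		free(a);
		return true;
	}
	pthread_mutex_unlock(&table_lock);
	return false;
}

int main(void)
{
	struct toy_action *a = calloc(1, sizeof(*a));

	if (!a)
		return 1;
	atomic_init(&a->refcnt, 2);	/* e.g. the table's ref plus the caller's */
	a->index = 3;
	table[3] = a;

	printf("first put destroyed action: %d\n", toy_action_put(a));
	printf("second put destroyed action: %d\n", toy_action_put(a));
	return 0;
}

The first put only decrements (the table still holds a reference); the second put reaches zero while holding the table lock, removes the entry and frees it, which is the ordering the new __tcf_action_put()/tcf_idr_delete_index() pair depends on.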
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 18089c02e557..0c68bc9cf0b4 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -34,8 +34,8 @@ struct tcf_bpf_cfg {
34static unsigned int bpf_net_id; 34static unsigned int bpf_net_id;
35static struct tc_action_ops act_bpf_ops; 35static struct tc_action_ops act_bpf_ops;
36 36
37static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, 37static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
38 struct tcf_result *res) 38 struct tcf_result *res)
39{ 39{
40 bool at_ingress = skb_at_tc_ingress(skb); 40 bool at_ingress = skb_at_tc_ingress(skb);
41 struct tcf_bpf *prog = to_bpf(act); 41 struct tcf_bpf *prog = to_bpf(act);
@@ -141,13 +141,14 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
141 struct tcf_bpf *prog = to_bpf(act); 141 struct tcf_bpf *prog = to_bpf(act);
142 struct tc_act_bpf opt = { 142 struct tc_act_bpf opt = {
143 .index = prog->tcf_index, 143 .index = prog->tcf_index,
144 .refcnt = prog->tcf_refcnt - ref, 144 .refcnt = refcount_read(&prog->tcf_refcnt) - ref,
145 .bindcnt = prog->tcf_bindcnt - bind, 145 .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind,
146 .action = prog->tcf_action,
147 }; 146 };
148 struct tcf_t tm; 147 struct tcf_t tm;
149 int ret; 148 int ret;
150 149
150 spin_lock_bh(&prog->tcf_lock);
151 opt.action = prog->tcf_action;
151 if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt)) 152 if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt))
152 goto nla_put_failure; 153 goto nla_put_failure;
153 154
@@ -163,9 +164,11 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
163 TCA_ACT_BPF_PAD)) 164 TCA_ACT_BPF_PAD))
164 goto nla_put_failure; 165 goto nla_put_failure;
165 166
167 spin_unlock_bh(&prog->tcf_lock);
166 return skb->len; 168 return skb->len;
167 169
168nla_put_failure: 170nla_put_failure:
171 spin_unlock_bh(&prog->tcf_lock);
169 nlmsg_trim(skb, tp); 172 nlmsg_trim(skb, tp);
170 return -1; 173 return -1;
171} 174}
@@ -196,12 +199,10 @@ static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
196 if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS])) 199 if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS]))
197 return -EINVAL; 200 return -EINVAL;
198 201
199 bpf_ops = kzalloc(bpf_size, GFP_KERNEL); 202 bpf_ops = kmemdup(nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size, GFP_KERNEL);
200 if (bpf_ops == NULL) 203 if (bpf_ops == NULL)
201 return -ENOMEM; 204 return -ENOMEM;
202 205
203 memcpy(bpf_ops, nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size);
204
205 fprog_tmp.len = bpf_num_ops; 206 fprog_tmp.len = bpf_num_ops;
206 fprog_tmp.filter = bpf_ops; 207 fprog_tmp.filter = bpf_ops;
207 208
@@ -266,7 +267,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
266{ 267{
267 cfg->is_ebpf = tcf_bpf_is_ebpf(prog); 268 cfg->is_ebpf = tcf_bpf_is_ebpf(prog);
268 /* updates to prog->filter are prevented, since it's called either 269 /* updates to prog->filter are prevented, since it's called either
269 * with rtnl lock or during final cleanup in rcu callback 270 * with tcf lock or during final cleanup in rcu callback
270 */ 271 */
271 cfg->filter = rcu_dereference_protected(prog->filter, 1); 272 cfg->filter = rcu_dereference_protected(prog->filter, 1);
272 273
@@ -276,7 +277,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
276 277
277static int tcf_bpf_init(struct net *net, struct nlattr *nla, 278static int tcf_bpf_init(struct net *net, struct nlattr *nla,
278 struct nlattr *est, struct tc_action **act, 279 struct nlattr *est, struct tc_action **act,
279 int replace, int bind, struct netlink_ext_ack *extack) 280 int replace, int bind, bool rtnl_held,
281 struct netlink_ext_ack *extack)
280{ 282{
281 struct tc_action_net *tn = net_generic(net, bpf_net_id); 283 struct tc_action_net *tn = net_generic(net, bpf_net_id);
282 struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; 284 struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
@@ -298,21 +300,27 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
298 300
299 parm = nla_data(tb[TCA_ACT_BPF_PARMS]); 301 parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
300 302
301 if (!tcf_idr_check(tn, parm->index, act, bind)) { 303 ret = tcf_idr_check_alloc(tn, &parm->index, act, bind);
304 if (!ret) {
302 ret = tcf_idr_create(tn, parm->index, est, act, 305 ret = tcf_idr_create(tn, parm->index, est, act,
303 &act_bpf_ops, bind, true); 306 &act_bpf_ops, bind, true);
304 if (ret < 0) 307 if (ret < 0) {
308 tcf_idr_cleanup(tn, parm->index);
305 return ret; 309 return ret;
310 }
306 311
307 res = ACT_P_CREATED; 312 res = ACT_P_CREATED;
308 } else { 313 } else if (ret > 0) {
309 /* Don't override defaults. */ 314 /* Don't override defaults. */
310 if (bind) 315 if (bind)
311 return 0; 316 return 0;
312 317
313 tcf_idr_release(*act, bind); 318 if (!replace) {
314 if (!replace) 319 tcf_idr_release(*act, bind);
315 return -EEXIST; 320 return -EEXIST;
321 }
322 } else {
323 return ret;
316 } 324 }
317 325
318 is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; 326 is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
@@ -331,8 +339,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
331 goto out; 339 goto out;
332 340
333 prog = to_bpf(*act); 341 prog = to_bpf(*act);
334 ASSERT_RTNL();
335 342
343 spin_lock_bh(&prog->tcf_lock);
336 if (res != ACT_P_CREATED) 344 if (res != ACT_P_CREATED)
337 tcf_bpf_prog_fill_cfg(prog, &old); 345 tcf_bpf_prog_fill_cfg(prog, &old);
338 346
@@ -344,6 +352,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
344 352
345 prog->tcf_action = parm->action; 353 prog->tcf_action = parm->action;
346 rcu_assign_pointer(prog->filter, cfg.filter); 354 rcu_assign_pointer(prog->filter, cfg.filter);
355 spin_unlock_bh(&prog->tcf_lock);
347 356
348 if (res == ACT_P_CREATED) { 357 if (res == ACT_P_CREATED) {
349 tcf_idr_insert(tn, *act); 358 tcf_idr_insert(tn, *act);
@@ -355,8 +364,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
355 364
356 return res; 365 return res;
357out: 366out:
358 if (res == ACT_P_CREATED) 367 tcf_idr_release(*act, bind);
359 tcf_idr_release(*act, bind);
360 368
361 return ret; 369 return ret;
362} 370}
@@ -391,7 +399,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = {
391 .kind = "bpf", 399 .kind = "bpf",
392 .type = TCA_ACT_BPF, 400 .type = TCA_ACT_BPF,
393 .owner = THIS_MODULE, 401 .owner = THIS_MODULE,
394 .act = tcf_bpf, 402 .act = tcf_bpf_act,
395 .dump = tcf_bpf_dump, 403 .dump = tcf_bpf_dump,
396 .cleanup = tcf_bpf_cleanup, 404 .cleanup = tcf_bpf_cleanup,
397 .init = tcf_bpf_init, 405 .init = tcf_bpf_init,
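tcf_bpf_init() above is one of the first users of the new tcf_idr_check_alloc() convention: a negative return is an error, 0 means the index was reserved for a new action (by publishing an ERR_PTR(-EBUSY) placeholder), and a positive return means an existing action was found and its reference taken. The userspace sketch below, with invented toy_* names, models only the reserve-with-a-busy-placeholder idea; unlike the kernel helper, which retries internally, it reports a pending reservation back to the caller.

#include <pthread.h>
#include <stdio.h>

#define TOY_BUSY ((void *)-1L)	/* stands in for ERR_PTR(-EBUSY) */
#define TOY_SLOTS 8

static pthread_mutex_t toy_lock = PTHREAD_MUTEX_INITIALIZER;
static void *toy_slots[TOY_SLOTS];

/* Returns 1 if an existing object was found (*obj set), 0 if the index was
 * reserved for this caller, -1 if a reservation by someone else is pending.
 */
static int toy_check_alloc(unsigned int index, void **obj)
{
	int ret;

	pthread_mutex_lock(&toy_lock);
	if (toy_slots[index] == TOY_BUSY) {
		ret = -1;			/* another creator is mid-way */
	} else if (toy_slots[index]) {
		*obj = toy_slots[index];	/* existing entry: reuse it */
		ret = 1;
	} else {
		toy_slots[index] = TOY_BUSY;	/* reserve for this caller */
		ret = 0;
	}
	pthread_mutex_unlock(&toy_lock);
	return ret;
}

static void toy_publish(unsigned int index, void *obj)
{
	pthread_mutex_lock(&toy_lock);
	toy_slots[index] = obj;			/* replaces the sentinel */
	pthread_mutex_unlock(&toy_lock);
}

int main(void)
{
	static int real_action = 42;
	void *found = NULL;

	printf("first caller:  %d\n", toy_check_alloc(3, &found));	/* 0: reserved */
	printf("second caller: %d\n", toy_check_alloc(3, &found));	/* -1: pending */
	toy_publish(3, &real_action);
	printf("third caller:  %d\n", toy_check_alloc(3, &found));	/* 1: found */
	return 0;
}

The publish step corresponds to tcf_idr_insert() replacing the placeholder, and abandoning a reservation corresponds to tcf_idr_cleanup() removing it, which is why the error paths added to the various *_init() functions now call tcf_idr_cleanup() when tcf_idr_create() fails.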
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index e4b880fa51fe..6f0f273f1139 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -31,8 +31,8 @@
31static unsigned int connmark_net_id; 31static unsigned int connmark_net_id;
32static struct tc_action_ops act_connmark_ops; 32static struct tc_action_ops act_connmark_ops;
33 33
34static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, 34static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a,
35 struct tcf_result *res) 35 struct tcf_result *res)
36{ 36{
37 const struct nf_conntrack_tuple_hash *thash; 37 const struct nf_conntrack_tuple_hash *thash;
38 struct nf_conntrack_tuple tuple; 38 struct nf_conntrack_tuple tuple;
@@ -96,7 +96,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
96 96
97static int tcf_connmark_init(struct net *net, struct nlattr *nla, 97static int tcf_connmark_init(struct net *net, struct nlattr *nla,
98 struct nlattr *est, struct tc_action **a, 98 struct nlattr *est, struct tc_action **a,
99 int ovr, int bind, 99 int ovr, int bind, bool rtnl_held,
100 struct netlink_ext_ack *extack) 100 struct netlink_ext_ack *extack)
101{ 101{
102 struct tc_action_net *tn = net_generic(net, connmark_net_id); 102 struct tc_action_net *tn = net_generic(net, connmark_net_id);
@@ -118,11 +118,14 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
118 118
119 parm = nla_data(tb[TCA_CONNMARK_PARMS]); 119 parm = nla_data(tb[TCA_CONNMARK_PARMS]);
120 120
121 if (!tcf_idr_check(tn, parm->index, a, bind)) { 121 ret = tcf_idr_check_alloc(tn, &parm->index, a, bind);
122 if (!ret) {
122 ret = tcf_idr_create(tn, parm->index, est, a, 123 ret = tcf_idr_create(tn, parm->index, est, a,
123 &act_connmark_ops, bind, false); 124 &act_connmark_ops, bind, false);
124 if (ret) 125 if (ret) {
126 tcf_idr_cleanup(tn, parm->index);
125 return ret; 127 return ret;
128 }
126 129
127 ci = to_connmark(*a); 130 ci = to_connmark(*a);
128 ci->tcf_action = parm->action; 131 ci->tcf_action = parm->action;
@@ -131,16 +134,18 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
131 134
132 tcf_idr_insert(tn, *a); 135 tcf_idr_insert(tn, *a);
133 ret = ACT_P_CREATED; 136 ret = ACT_P_CREATED;
134 } else { 137 } else if (ret > 0) {
135 ci = to_connmark(*a); 138 ci = to_connmark(*a);
136 if (bind) 139 if (bind)
137 return 0; 140 return 0;
138 tcf_idr_release(*a, bind); 141 if (!ovr) {
139 if (!ovr) 142 tcf_idr_release(*a, bind);
140 return -EEXIST; 143 return -EEXIST;
144 }
141 /* replacing action and zone */ 145 /* replacing action and zone */
142 ci->tcf_action = parm->action; 146 ci->tcf_action = parm->action;
143 ci->zone = parm->zone; 147 ci->zone = parm->zone;
148 ret = 0;
144 } 149 }
145 150
146 return ret; 151 return ret;
@@ -154,8 +159,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
154 159
155 struct tc_connmark opt = { 160 struct tc_connmark opt = {
156 .index = ci->tcf_index, 161 .index = ci->tcf_index,
157 .refcnt = ci->tcf_refcnt - ref, 162 .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
158 .bindcnt = ci->tcf_bindcnt - bind, 163 .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
159 .action = ci->tcf_action, 164 .action = ci->tcf_action,
160 .zone = ci->zone, 165 .zone = ci->zone,
161 }; 166 };
@@ -197,7 +202,7 @@ static struct tc_action_ops act_connmark_ops = {
197 .kind = "connmark", 202 .kind = "connmark",
198 .type = TCA_ACT_CONNMARK, 203 .type = TCA_ACT_CONNMARK,
199 .owner = THIS_MODULE, 204 .owner = THIS_MODULE,
200 .act = tcf_connmark, 205 .act = tcf_connmark_act,
201 .dump = tcf_connmark_dump, 206 .dump = tcf_connmark_dump,
202 .init = tcf_connmark_init, 207 .init = tcf_connmark_init,
203 .walk = tcf_connmark_walker, 208 .walk = tcf_connmark_walker,
@@ -239,4 +244,3 @@ module_exit(connmark_cleanup_module);
239MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>"); 244MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>");
240MODULE_DESCRIPTION("Connection tracking mark restoring"); 245MODULE_DESCRIPTION("Connection tracking mark restoring");
241MODULE_LICENSE("GPL"); 246MODULE_LICENSE("GPL");
242
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 526a8e491626..b8a67ae3105a 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -46,10 +46,11 @@ static struct tc_action_ops act_csum_ops;
46 46
47static int tcf_csum_init(struct net *net, struct nlattr *nla, 47static int tcf_csum_init(struct net *net, struct nlattr *nla,
48 struct nlattr *est, struct tc_action **a, int ovr, 48 struct nlattr *est, struct tc_action **a, int ovr,
49 int bind, struct netlink_ext_ack *extack) 49 int bind, bool rtnl_held,
50 struct netlink_ext_ack *extack)
50{ 51{
51 struct tc_action_net *tn = net_generic(net, csum_net_id); 52 struct tc_action_net *tn = net_generic(net, csum_net_id);
52 struct tcf_csum_params *params_old, *params_new; 53 struct tcf_csum_params *params_new;
53 struct nlattr *tb[TCA_CSUM_MAX + 1]; 54 struct nlattr *tb[TCA_CSUM_MAX + 1];
54 struct tc_csum *parm; 55 struct tc_csum *parm;
55 struct tcf_csum *p; 56 struct tcf_csum *p;
@@ -66,36 +67,43 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
66 return -EINVAL; 67 return -EINVAL;
67 parm = nla_data(tb[TCA_CSUM_PARMS]); 68 parm = nla_data(tb[TCA_CSUM_PARMS]);
68 69
69 if (!tcf_idr_check(tn, parm->index, a, bind)) { 70 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
71 if (!err) {
70 ret = tcf_idr_create(tn, parm->index, est, a, 72 ret = tcf_idr_create(tn, parm->index, est, a,
71 &act_csum_ops, bind, true); 73 &act_csum_ops, bind, true);
72 if (ret) 74 if (ret) {
75 tcf_idr_cleanup(tn, parm->index);
73 return ret; 76 return ret;
77 }
74 ret = ACT_P_CREATED; 78 ret = ACT_P_CREATED;
75 } else { 79 } else if (err > 0) {
76 if (bind)/* dont override defaults */ 80 if (bind)/* dont override defaults */
77 return 0; 81 return 0;
78 tcf_idr_release(*a, bind); 82 if (!ovr) {
79 if (!ovr) 83 tcf_idr_release(*a, bind);
80 return -EEXIST; 84 return -EEXIST;
85 }
86 } else {
87 return err;
81 } 88 }
82 89
83 p = to_tcf_csum(*a); 90 p = to_tcf_csum(*a);
84 ASSERT_RTNL();
85 91
86 params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); 92 params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
87 if (unlikely(!params_new)) { 93 if (unlikely(!params_new)) {
88 if (ret == ACT_P_CREATED) 94 tcf_idr_release(*a, bind);
89 tcf_idr_release(*a, bind);
90 return -ENOMEM; 95 return -ENOMEM;
91 } 96 }
92 params_old = rtnl_dereference(p->params);
93
94 params_new->action = parm->action;
95 params_new->update_flags = parm->update_flags; 97 params_new->update_flags = parm->update_flags;
96 rcu_assign_pointer(p->params, params_new); 98
97 if (params_old) 99 spin_lock_bh(&p->tcf_lock);
98 kfree_rcu(params_old, rcu); 100 p->tcf_action = parm->action;
101 rcu_swap_protected(p->params, params_new,
102 lockdep_is_held(&p->tcf_lock));
103 spin_unlock_bh(&p->tcf_lock);
104
105 if (params_new)
106 kfree_rcu(params_new, rcu);
99 107
100 if (ret == ACT_P_CREATED) 108 if (ret == ACT_P_CREATED)
101 tcf_idr_insert(tn, *a); 109 tcf_idr_insert(tn, *a);
@@ -547,23 +555,22 @@ fail:
547 return 0; 555 return 0;
548} 556}
549 557
550static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, 558static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
551 struct tcf_result *res) 559 struct tcf_result *res)
552{ 560{
553 struct tcf_csum *p = to_tcf_csum(a); 561 struct tcf_csum *p = to_tcf_csum(a);
554 struct tcf_csum_params *params; 562 struct tcf_csum_params *params;
555 u32 update_flags; 563 u32 update_flags;
556 int action; 564 int action;
557 565
558 rcu_read_lock(); 566 params = rcu_dereference_bh(p->params);
559 params = rcu_dereference(p->params);
560 567
561 tcf_lastuse_update(&p->tcf_tm); 568 tcf_lastuse_update(&p->tcf_tm);
562 bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); 569 bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
563 570
564 action = params->action; 571 action = READ_ONCE(p->tcf_action);
565 if (unlikely(action == TC_ACT_SHOT)) 572 if (unlikely(action == TC_ACT_SHOT))
566 goto drop_stats; 573 goto drop;
567 574
568 update_flags = params->update_flags; 575 update_flags = params->update_flags;
569 switch (tc_skb_protocol(skb)) { 576 switch (tc_skb_protocol(skb)) {
@@ -577,16 +584,11 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
577 break; 584 break;
578 } 585 }
579 586
580unlock:
581 rcu_read_unlock();
582 return action; 587 return action;
583 588
584drop: 589drop:
585 action = TC_ACT_SHOT;
586
587drop_stats:
588 qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats)); 590 qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
589 goto unlock; 591 return TC_ACT_SHOT;
590} 592}
591 593
592static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, 594static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
@@ -597,13 +599,15 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
597 struct tcf_csum_params *params; 599 struct tcf_csum_params *params;
598 struct tc_csum opt = { 600 struct tc_csum opt = {
599 .index = p->tcf_index, 601 .index = p->tcf_index,
600 .refcnt = p->tcf_refcnt - ref, 602 .refcnt = refcount_read(&p->tcf_refcnt) - ref,
601 .bindcnt = p->tcf_bindcnt - bind, 603 .bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
602 }; 604 };
603 struct tcf_t t; 605 struct tcf_t t;
604 606
605 params = rtnl_dereference(p->params); 607 spin_lock_bh(&p->tcf_lock);
606 opt.action = params->action; 608 params = rcu_dereference_protected(p->params,
609 lockdep_is_held(&p->tcf_lock));
610 opt.action = p->tcf_action;
607 opt.update_flags = params->update_flags; 611 opt.update_flags = params->update_flags;
608 612
609 if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) 613 if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
@@ -612,10 +616,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
612 tcf_tm_dump(&t, &p->tcf_tm); 616 tcf_tm_dump(&t, &p->tcf_tm);
613 if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) 617 if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
614 goto nla_put_failure; 618 goto nla_put_failure;
619 spin_unlock_bh(&p->tcf_lock);
615 620
616 return skb->len; 621 return skb->len;
617 622
618nla_put_failure: 623nla_put_failure:
624 spin_unlock_bh(&p->tcf_lock);
619 nlmsg_trim(skb, b); 625 nlmsg_trim(skb, b);
620 return -1; 626 return -1;
621} 627}
@@ -657,7 +663,7 @@ static struct tc_action_ops act_csum_ops = {
657 .kind = "csum", 663 .kind = "csum",
658 .type = TCA_ACT_CSUM, 664 .type = TCA_ACT_CSUM,
659 .owner = THIS_MODULE, 665 .owner = THIS_MODULE,
660 .act = tcf_csum, 666 .act = tcf_csum_act,
661 .dump = tcf_csum_dump, 667 .dump = tcf_csum_dump,
662 .init = tcf_csum_init, 668 .init = tcf_csum_init,
663 .cleanup = tcf_csum_cleanup, 669 .cleanup = tcf_csum_cleanup,
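In the csum diff above the fast path drops its own rcu_read_lock()/unlock() pair, reads the action code with READ_ONCE() and takes a single rcu_dereference_bh() snapshot of p->params, while the control path builds a fresh parameter block, swaps it in under tcf_lock with rcu_swap_protected(), and frees the old one with kfree_rcu(). The standalone sketch below, with invented toy_* names, models only that snapshot-and-swap split and frees the old block immediately instead of deferring it.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_params {
	unsigned int update_flags;
};

struct toy_csum {
	pthread_mutex_t lock;			/* control-path lock, like tcf_lock */
	_Atomic(struct toy_params *) params;
	atomic_int action;
};

/* Fast path: one snapshot of the parameter block, no lock taken. */
static int toy_csum_act(struct toy_csum *p)
{
	struct toy_params *params = atomic_load_explicit(&p->params,
							 memory_order_acquire);
	int action = atomic_load(&p->action);

	printf("acting with flags %#x -> action %d\n",
	       params->update_flags, action);
	return action;
}

/* Control path: build a new block and swap it in under the lock. */
static void toy_csum_update(struct toy_csum *p, unsigned int flags, int action)
{
	struct toy_params *newp = malloc(sizeof(*newp));
	struct toy_params *oldp;

	if (!newp)
		return;
	newp->update_flags = flags;

	pthread_mutex_lock(&p->lock);
	atomic_store(&p->action, action);
	oldp = atomic_exchange_explicit(&p->params, newp, memory_order_release);
	pthread_mutex_unlock(&p->lock);

	free(oldp);	/* the kernel defers this with kfree_rcu() instead */
}

int main(void)
{
	struct toy_csum c = { .lock = PTHREAD_MUTEX_INITIALIZER };

	atomic_init(&c.params, NULL);
	atomic_init(&c.action, 0);

	toy_csum_update(&c, 0x1, 0);
	toy_csum_act(&c);
	toy_csum_update(&c, 0x7, 2);
	toy_csum_act(&c);

	free(atomic_load(&c.params));
	return 0;
}

Readers never block on the control path and always see a fully built parameter block; the cost is that the real code must defer freeing the old block until all such readers are done, which is what kfree_rcu() provides and this model omits.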
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 4dc4f153cad8..cd1d9bd32ef9 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -56,7 +56,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
56 56
57static int tcf_gact_init(struct net *net, struct nlattr *nla, 57static int tcf_gact_init(struct net *net, struct nlattr *nla,
58 struct nlattr *est, struct tc_action **a, 58 struct nlattr *est, struct tc_action **a,
59 int ovr, int bind, struct netlink_ext_ack *extack) 59 int ovr, int bind, bool rtnl_held,
60 struct netlink_ext_ack *extack)
60{ 61{
61 struct tc_action_net *tn = net_generic(net, gact_net_id); 62 struct tc_action_net *tn = net_generic(net, gact_net_id);
62 struct nlattr *tb[TCA_GACT_MAX + 1]; 63 struct nlattr *tb[TCA_GACT_MAX + 1];
@@ -90,23 +91,29 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
90 } 91 }
91#endif 92#endif
92 93
93 if (!tcf_idr_check(tn, parm->index, a, bind)) { 94 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
95 if (!err) {
94 ret = tcf_idr_create(tn, parm->index, est, a, 96 ret = tcf_idr_create(tn, parm->index, est, a,
95 &act_gact_ops, bind, true); 97 &act_gact_ops, bind, true);
96 if (ret) 98 if (ret) {
99 tcf_idr_cleanup(tn, parm->index);
97 return ret; 100 return ret;
101 }
98 ret = ACT_P_CREATED; 102 ret = ACT_P_CREATED;
99 } else { 103 } else if (err > 0) {
100 if (bind)/* dont override defaults */ 104 if (bind)/* dont override defaults */
101 return 0; 105 return 0;
102 tcf_idr_release(*a, bind); 106 if (!ovr) {
103 if (!ovr) 107 tcf_idr_release(*a, bind);
104 return -EEXIST; 108 return -EEXIST;
109 }
110 } else {
111 return err;
105 } 112 }
106 113
107 gact = to_gact(*a); 114 gact = to_gact(*a);
108 115
109 ASSERT_RTNL(); 116 spin_lock_bh(&gact->tcf_lock);
110 gact->tcf_action = parm->action; 117 gact->tcf_action = parm->action;
111#ifdef CONFIG_GACT_PROB 118#ifdef CONFIG_GACT_PROB
112 if (p_parm) { 119 if (p_parm) {
@@ -119,13 +126,15 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
119 gact->tcfg_ptype = p_parm->ptype; 126 gact->tcfg_ptype = p_parm->ptype;
120 } 127 }
121#endif 128#endif
129 spin_unlock_bh(&gact->tcf_lock);
130
122 if (ret == ACT_P_CREATED) 131 if (ret == ACT_P_CREATED)
123 tcf_idr_insert(tn, *a); 132 tcf_idr_insert(tn, *a);
124 return ret; 133 return ret;
125} 134}
126 135
127static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, 136static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
128 struct tcf_result *res) 137 struct tcf_result *res)
129{ 138{
130 struct tcf_gact *gact = to_gact(a); 139 struct tcf_gact *gact = to_gact(a);
131 int action = READ_ONCE(gact->tcf_action); 140 int action = READ_ONCE(gact->tcf_action);
@@ -169,12 +178,13 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
169 struct tcf_gact *gact = to_gact(a); 178 struct tcf_gact *gact = to_gact(a);
170 struct tc_gact opt = { 179 struct tc_gact opt = {
171 .index = gact->tcf_index, 180 .index = gact->tcf_index,
172 .refcnt = gact->tcf_refcnt - ref, 181 .refcnt = refcount_read(&gact->tcf_refcnt) - ref,
173 .bindcnt = gact->tcf_bindcnt - bind, 182 .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind,
174 .action = gact->tcf_action,
175 }; 183 };
176 struct tcf_t t; 184 struct tcf_t t;
177 185
186 spin_lock_bh(&gact->tcf_lock);
187 opt.action = gact->tcf_action;
178 if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) 188 if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt))
179 goto nla_put_failure; 189 goto nla_put_failure;
180#ifdef CONFIG_GACT_PROB 190#ifdef CONFIG_GACT_PROB
@@ -192,9 +202,12 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
192 tcf_tm_dump(&t, &gact->tcf_tm); 202 tcf_tm_dump(&t, &gact->tcf_tm);
193 if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) 203 if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD))
194 goto nla_put_failure; 204 goto nla_put_failure;
205 spin_unlock_bh(&gact->tcf_lock);
206
195 return skb->len; 207 return skb->len;
196 208
197nla_put_failure: 209nla_put_failure:
210 spin_unlock_bh(&gact->tcf_lock);
198 nlmsg_trim(skb, b); 211 nlmsg_trim(skb, b);
199 return -1; 212 return -1;
200} 213}
@@ -234,7 +247,7 @@ static struct tc_action_ops act_gact_ops = {
234 .kind = "gact", 247 .kind = "gact",
235 .type = TCA_ACT_GACT, 248 .type = TCA_ACT_GACT,
236 .owner = THIS_MODULE, 249 .owner = THIS_MODULE,
237 .act = tcf_gact, 250 .act = tcf_gact_act,
238 .stats_update = tcf_gact_stats_update, 251 .stats_update = tcf_gact_stats_update,
239 .dump = tcf_gact_dump, 252 .dump = tcf_gact_dump,
240 .init = tcf_gact_init, 253 .init = tcf_gact_init,
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 20d7d36b2fc9..196430aefe87 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -265,10 +265,8 @@ static const char *ife_meta_id2name(u32 metaid)
265#endif 265#endif
266 266
267/* called when adding new meta information 267/* called when adding new meta information
268 * under ife->tcf_lock for existing action
269*/ 268*/
270static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, 269static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held)
271 void *val, int len, bool exists)
272{ 270{
273 struct tcf_meta_ops *ops = find_ife_oplist(metaid); 271 struct tcf_meta_ops *ops = find_ife_oplist(metaid);
274 int ret = 0; 272 int ret = 0;
@@ -276,13 +274,11 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
276 if (!ops) { 274 if (!ops) {
277 ret = -ENOENT; 275 ret = -ENOENT;
278#ifdef CONFIG_MODULES 276#ifdef CONFIG_MODULES
279 if (exists) 277 if (rtnl_held)
280 spin_unlock_bh(&ife->tcf_lock); 278 rtnl_unlock();
281 rtnl_unlock();
282 request_module("ife-meta-%s", ife_meta_id2name(metaid)); 279 request_module("ife-meta-%s", ife_meta_id2name(metaid));
283 rtnl_lock(); 280 if (rtnl_held)
284 if (exists) 281 rtnl_lock();
285 spin_lock_bh(&ife->tcf_lock);
286 ops = find_ife_oplist(metaid); 282 ops = find_ife_oplist(metaid);
287#endif 283#endif
288 } 284 }
@@ -299,24 +295,17 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
299} 295}
300 296
301/* called when adding new meta information 297/* called when adding new meta information
302 * under ife->tcf_lock for existing action
303*/ 298*/
304static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, 299static int __add_metainfo(const struct tcf_meta_ops *ops,
305 int len, bool atomic) 300 struct tcf_ife_info *ife, u32 metaid, void *metaval,
301 int len, bool atomic, bool exists)
306{ 302{
307 struct tcf_meta_info *mi = NULL; 303 struct tcf_meta_info *mi = NULL;
308 struct tcf_meta_ops *ops = find_ife_oplist(metaid);
309 int ret = 0; 304 int ret = 0;
310 305
311 if (!ops)
312 return -ENOENT;
313
314 mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL); 306 mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL);
315 if (!mi) { 307 if (!mi)
316 /*put back what find_ife_oplist took */
317 module_put(ops->owner);
318 return -ENOMEM; 308 return -ENOMEM;
319 }
320 309
321 mi->metaid = metaid; 310 mi->metaid = metaid;
322 mi->ops = ops; 311 mi->ops = ops;
@@ -324,17 +313,35 @@ static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
324 ret = ops->alloc(mi, metaval, atomic ? GFP_ATOMIC : GFP_KERNEL); 313 ret = ops->alloc(mi, metaval, atomic ? GFP_ATOMIC : GFP_KERNEL);
325 if (ret != 0) { 314 if (ret != 0) {
326 kfree(mi); 315 kfree(mi);
327 module_put(ops->owner);
328 return ret; 316 return ret;
329 } 317 }
330 } 318 }
331 319
320 if (exists)
321 spin_lock_bh(&ife->tcf_lock);
332 list_add_tail(&mi->metalist, &ife->metalist); 322 list_add_tail(&mi->metalist, &ife->metalist);
323 if (exists)
324 spin_unlock_bh(&ife->tcf_lock);
333 325
334 return ret; 326 return ret;
335} 327}
336 328
337static int use_all_metadata(struct tcf_ife_info *ife) 329static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
330 int len, bool exists)
331{
332 const struct tcf_meta_ops *ops = find_ife_oplist(metaid);
333 int ret;
334
335 if (!ops)
336 return -ENOENT;
337 ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists);
338 if (ret)
339 /*put back what find_ife_oplist took */
340 module_put(ops->owner);
341 return ret;
342}
343
344static int use_all_metadata(struct tcf_ife_info *ife, bool exists)
338{ 345{
339 struct tcf_meta_ops *o; 346 struct tcf_meta_ops *o;
340 int rc = 0; 347 int rc = 0;
@@ -342,7 +349,7 @@ static int use_all_metadata(struct tcf_ife_info *ife)
342 349
343 read_lock(&ife_mod_lock); 350 read_lock(&ife_mod_lock);
344 list_for_each_entry(o, &ifeoplist, list) { 351 list_for_each_entry(o, &ifeoplist, list) {
345 rc = add_metainfo(ife, o->metaid, NULL, 0, true); 352 rc = __add_metainfo(o, ife, o->metaid, NULL, 0, true, exists);
346 if (rc == 0) 353 if (rc == 0)
347 installed += 1; 354 installed += 1;
348 } 355 }
@@ -419,9 +426,8 @@ static void tcf_ife_cleanup(struct tc_action *a)
419 kfree_rcu(p, rcu); 426 kfree_rcu(p, rcu);
420} 427}
421 428
422/* under ife->tcf_lock for existing action */
423static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, 429static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
424 bool exists) 430 bool exists, bool rtnl_held)
425{ 431{
426 int len = 0; 432 int len = 0;
427 int rc = 0; 433 int rc = 0;
@@ -433,7 +439,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
433 val = nla_data(tb[i]); 439 val = nla_data(tb[i]);
434 len = nla_len(tb[i]); 440 len = nla_len(tb[i]);
435 441
436 rc = load_metaops_and_vet(ife, i, val, len, exists); 442 rc = load_metaops_and_vet(i, val, len, rtnl_held);
437 if (rc != 0) 443 if (rc != 0)
438 return rc; 444 return rc;
439 445
@@ -448,12 +454,13 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
448 454
449static int tcf_ife_init(struct net *net, struct nlattr *nla, 455static int tcf_ife_init(struct net *net, struct nlattr *nla,
450 struct nlattr *est, struct tc_action **a, 456 struct nlattr *est, struct tc_action **a,
451 int ovr, int bind, struct netlink_ext_ack *extack) 457 int ovr, int bind, bool rtnl_held,
458 struct netlink_ext_ack *extack)
452{ 459{
453 struct tc_action_net *tn = net_generic(net, ife_net_id); 460 struct tc_action_net *tn = net_generic(net, ife_net_id);
454 struct nlattr *tb[TCA_IFE_MAX + 1]; 461 struct nlattr *tb[TCA_IFE_MAX + 1];
455 struct nlattr *tb2[IFE_META_MAX + 1]; 462 struct nlattr *tb2[IFE_META_MAX + 1];
456 struct tcf_ife_params *p, *p_old; 463 struct tcf_ife_params *p;
457 struct tcf_ife_info *ife; 464 struct tcf_ife_info *ife;
458 u16 ife_type = ETH_P_IFE; 465 u16 ife_type = ETH_P_IFE;
459 struct tc_ife *parm; 466 struct tc_ife *parm;
@@ -483,7 +490,12 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
483 if (!p) 490 if (!p)
484 return -ENOMEM; 491 return -ENOMEM;
485 492
486 exists = tcf_idr_check(tn, parm->index, a, bind); 493 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
494 if (err < 0) {
495 kfree(p);
496 return err;
497 }
498 exists = err;
487 if (exists && bind) { 499 if (exists && bind) {
488 kfree(p); 500 kfree(p);
489 return 0; 501 return 0;
@@ -493,16 +505,15 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
493 ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, 505 ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops,
494 bind, true); 506 bind, true);
495 if (ret) { 507 if (ret) {
508 tcf_idr_cleanup(tn, parm->index);
496 kfree(p); 509 kfree(p);
497 return ret; 510 return ret;
498 } 511 }
499 ret = ACT_P_CREATED; 512 ret = ACT_P_CREATED;
500 } else { 513 } else if (!ovr) {
501 tcf_idr_release(*a, bind); 514 tcf_idr_release(*a, bind);
502 if (!ovr) { 515 kfree(p);
503 kfree(p); 516 return -EEXIST;
504 return -EEXIST;
505 }
506 } 517 }
507 518
508 ife = to_ife(*a); 519 ife = to_ife(*a);
@@ -531,8 +542,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
531 p->eth_type = ife_type; 542 p->eth_type = ife_type;
532 } 543 }
533 544
534 if (exists)
535 spin_lock_bh(&ife->tcf_lock);
536 545
537 if (ret == ACT_P_CREATED) 546 if (ret == ACT_P_CREATED)
538 INIT_LIST_HEAD(&ife->metalist); 547 INIT_LIST_HEAD(&ife->metalist);
@@ -542,16 +551,12 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
542 NULL, NULL); 551 NULL, NULL);
543 if (err) { 552 if (err) {
544metadata_parse_err: 553metadata_parse_err:
545 if (ret == ACT_P_CREATED) 554 tcf_idr_release(*a, bind);
546 tcf_idr_release(*a, bind);
547
548 if (exists)
549 spin_unlock_bh(&ife->tcf_lock);
550 kfree(p); 555 kfree(p);
551 return err; 556 return err;
552 } 557 }
553 558
554 err = populate_metalist(ife, tb2, exists); 559 err = populate_metalist(ife, tb2, exists, rtnl_held);
555 if (err) 560 if (err)
556 goto metadata_parse_err; 561 goto metadata_parse_err;
557 562
@@ -561,26 +566,24 @@ metadata_parse_err:
561 * as we can. You better have at least one else we are 566 * as we can. You better have at least one else we are
562 * going to bail out 567 * going to bail out
563 */ 568 */
564 err = use_all_metadata(ife); 569 err = use_all_metadata(ife, exists);
565 if (err) { 570 if (err) {
566 if (ret == ACT_P_CREATED) 571 tcf_idr_release(*a, bind);
567 tcf_idr_release(*a, bind);
568
569 if (exists)
570 spin_unlock_bh(&ife->tcf_lock);
571 kfree(p); 572 kfree(p);
572 return err; 573 return err;
573 } 574 }
574 } 575 }
575 576
577 if (exists)
578 spin_lock_bh(&ife->tcf_lock);
576 ife->tcf_action = parm->action; 579 ife->tcf_action = parm->action;
580 /* protected by tcf_lock when modifying existing action */
581 rcu_swap_protected(ife->params, p, 1);
582
577 if (exists) 583 if (exists)
578 spin_unlock_bh(&ife->tcf_lock); 584 spin_unlock_bh(&ife->tcf_lock);
579 585 if (p)
580 p_old = rtnl_dereference(ife->params); 586 kfree_rcu(p, rcu);
581 rcu_assign_pointer(ife->params, p);
582 if (p_old)
583 kfree_rcu(p_old, rcu);
584 587
585 if (ret == ACT_P_CREATED) 588 if (ret == ACT_P_CREATED)
586 tcf_idr_insert(tn, *a); 589 tcf_idr_insert(tn, *a);
@@ -593,16 +596,20 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
593{ 596{
594 unsigned char *b = skb_tail_pointer(skb); 597 unsigned char *b = skb_tail_pointer(skb);
595 struct tcf_ife_info *ife = to_ife(a); 598 struct tcf_ife_info *ife = to_ife(a);
596 struct tcf_ife_params *p = rtnl_dereference(ife->params); 599 struct tcf_ife_params *p;
597 struct tc_ife opt = { 600 struct tc_ife opt = {
598 .index = ife->tcf_index, 601 .index = ife->tcf_index,
599 .refcnt = ife->tcf_refcnt - ref, 602 .refcnt = refcount_read(&ife->tcf_refcnt) - ref,
600 .bindcnt = ife->tcf_bindcnt - bind, 603 .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind,
601 .action = ife->tcf_action,
602 .flags = p->flags,
603 }; 604 };
604 struct tcf_t t; 605 struct tcf_t t;
605 606
607 spin_lock_bh(&ife->tcf_lock);
608 opt.action = ife->tcf_action;
609 p = rcu_dereference_protected(ife->params,
610 lockdep_is_held(&ife->tcf_lock));
611 opt.flags = p->flags;
612
606 if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt)) 613 if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt))
607 goto nla_put_failure; 614 goto nla_put_failure;
608 615
@@ -628,9 +635,11 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
628 pr_info("Failed to dump metalist\n"); 635 pr_info("Failed to dump metalist\n");
629 } 636 }
630 637
638 spin_unlock_bh(&ife->tcf_lock);
631 return skb->len; 639 return skb->len;
632 640
633nla_put_failure: 641nla_put_failure:
642 spin_unlock_bh(&ife->tcf_lock);
634 nlmsg_trim(skb, b); 643 nlmsg_trim(skb, b);
635 return -1; 644 return -1;
636} 645}
@@ -813,14 +822,11 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
813 struct tcf_ife_params *p; 822 struct tcf_ife_params *p;
814 int ret; 823 int ret;
815 824
816 rcu_read_lock(); 825 p = rcu_dereference_bh(ife->params);
817 p = rcu_dereference(ife->params);
818 if (p->flags & IFE_ENCODE) { 826 if (p->flags & IFE_ENCODE) {
819 ret = tcf_ife_encode(skb, a, res, p); 827 ret = tcf_ife_encode(skb, a, res, p);
820 rcu_read_unlock();
821 return ret; 828 return ret;
822 } 829 }
823 rcu_read_unlock();
824 830
825 return tcf_ife_decode(skb, a, res); 831 return tcf_ife_decode(skb, a, res);
826} 832}
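
The act_ife hunks above replace the RTNL-protected parameter update with the action's own tcf_lock plus RCU. A minimal sketch of that pattern, reusing the names from the hunks (ife, p, parm) and not a drop-in copy of the patch:

	/* Swap in freshly allocated params 'p'; readers use
	 * rcu_dereference_bh(ife->params) in the datapath.
	 */
	if (exists)
		spin_lock_bh(&ife->tcf_lock);
	ife->tcf_action = parm->action;
	/* tcf_lock is held when an existing action is being modified */
	rcu_swap_protected(ife->params, p, 1);
	if (exists)
		spin_unlock_bh(&ife->tcf_lock);
	if (p)			/* 'p' now points at the old params, if any */
		kfree_rcu(p, rcu);

The old parameters are only freed after an RCU grace period, so tcf_ife_act() can keep dereferencing them without taking the spinlock.
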
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 14c312d7908f..23273b5303fd 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -119,13 +119,18 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
119 if (tb[TCA_IPT_INDEX] != NULL) 119 if (tb[TCA_IPT_INDEX] != NULL)
120 index = nla_get_u32(tb[TCA_IPT_INDEX]); 120 index = nla_get_u32(tb[TCA_IPT_INDEX]);
121 121
122 exists = tcf_idr_check(tn, index, a, bind); 122 err = tcf_idr_check_alloc(tn, &index, a, bind);
123 if (err < 0)
124 return err;
125 exists = err;
123 if (exists && bind) 126 if (exists && bind)
124 return 0; 127 return 0;
125 128
126 if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { 129 if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) {
127 if (exists) 130 if (exists)
128 tcf_idr_release(*a, bind); 131 tcf_idr_release(*a, bind);
132 else
133 tcf_idr_cleanup(tn, index);
129 return -EINVAL; 134 return -EINVAL;
130 } 135 }
131 136
@@ -133,22 +138,27 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
133 if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { 138 if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) {
134 if (exists) 139 if (exists)
135 tcf_idr_release(*a, bind); 140 tcf_idr_release(*a, bind);
141 else
142 tcf_idr_cleanup(tn, index);
136 return -EINVAL; 143 return -EINVAL;
137 } 144 }
138 145
139 if (!exists) { 146 if (!exists) {
140 ret = tcf_idr_create(tn, index, est, a, ops, bind, 147 ret = tcf_idr_create(tn, index, est, a, ops, bind,
141 false); 148 false);
142 if (ret) 149 if (ret) {
150 tcf_idr_cleanup(tn, index);
143 return ret; 151 return ret;
152 }
144 ret = ACT_P_CREATED; 153 ret = ACT_P_CREATED;
145 } else { 154 } else {
146 if (bind)/* dont override defaults */ 155 if (bind)/* dont override defaults */
147 return 0; 156 return 0;
148 tcf_idr_release(*a, bind);
149 157
150 if (!ovr) 158 if (!ovr) {
159 tcf_idr_release(*a, bind);
151 return -EEXIST; 160 return -EEXIST;
161 }
152 } 162 }
153 hook = nla_get_u32(tb[TCA_IPT_HOOK]); 163 hook = nla_get_u32(tb[TCA_IPT_HOOK]);
154 164
@@ -196,7 +206,8 @@ err1:
196 206
197static int tcf_ipt_init(struct net *net, struct nlattr *nla, 207static int tcf_ipt_init(struct net *net, struct nlattr *nla,
198 struct nlattr *est, struct tc_action **a, int ovr, 208 struct nlattr *est, struct tc_action **a, int ovr,
199 int bind, struct netlink_ext_ack *extack) 209 int bind, bool rtnl_held,
210 struct netlink_ext_ack *extack)
200{ 211{
201 return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, 212 return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
202 bind); 213 bind);
@@ -204,14 +215,15 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla,
204 215
205static int tcf_xt_init(struct net *net, struct nlattr *nla, 216static int tcf_xt_init(struct net *net, struct nlattr *nla,
206 struct nlattr *est, struct tc_action **a, int ovr, 217 struct nlattr *est, struct tc_action **a, int ovr,
207 int bind, struct netlink_ext_ack *extack) 218 int bind, bool unlocked,
219 struct netlink_ext_ack *extack)
208{ 220{
209 return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, 221 return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
210 bind); 222 bind);
211} 223}
212 224
213static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, 225static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
214 struct tcf_result *res) 226 struct tcf_result *res)
215{ 227{
216 int ret = 0, result = 0; 228 int ret = 0, result = 0;
217 struct tcf_ipt *ipt = to_ipt(a); 229 struct tcf_ipt *ipt = to_ipt(a);
@@ -276,12 +288,13 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind,
276 * for foolproof you need to not assume this 288 * for foolproof you need to not assume this
277 */ 289 */
278 290
291 spin_lock_bh(&ipt->tcf_lock);
279 t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); 292 t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
280 if (unlikely(!t)) 293 if (unlikely(!t))
281 goto nla_put_failure; 294 goto nla_put_failure;
282 295
283 c.bindcnt = ipt->tcf_bindcnt - bind; 296 c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind;
284 c.refcnt = ipt->tcf_refcnt - ref; 297 c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref;
285 strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); 298 strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
286 299
287 if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || 300 if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) ||
@@ -295,10 +308,12 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind,
295 if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) 308 if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD))
296 goto nla_put_failure; 309 goto nla_put_failure;
297 310
311 spin_unlock_bh(&ipt->tcf_lock);
298 kfree(t); 312 kfree(t);
299 return skb->len; 313 return skb->len;
300 314
301nla_put_failure: 315nla_put_failure:
316 spin_unlock_bh(&ipt->tcf_lock);
302 nlmsg_trim(skb, b); 317 nlmsg_trim(skb, b);
303 kfree(t); 318 kfree(t);
304 return -1; 319 return -1;
@@ -326,7 +341,7 @@ static struct tc_action_ops act_ipt_ops = {
326 .kind = "ipt", 341 .kind = "ipt",
327 .type = TCA_ACT_IPT, 342 .type = TCA_ACT_IPT,
328 .owner = THIS_MODULE, 343 .owner = THIS_MODULE,
329 .act = tcf_ipt, 344 .act = tcf_ipt_act,
330 .dump = tcf_ipt_dump, 345 .dump = tcf_ipt_dump,
331 .cleanup = tcf_ipt_release, 346 .cleanup = tcf_ipt_release,
332 .init = tcf_ipt_init, 347 .init = tcf_ipt_init,
@@ -376,7 +391,7 @@ static struct tc_action_ops act_xt_ops = {
376 .kind = "xt", 391 .kind = "xt",
377 .type = TCA_ACT_XT, 392 .type = TCA_ACT_XT,
378 .owner = THIS_MODULE, 393 .owner = THIS_MODULE,
379 .act = tcf_ipt, 394 .act = tcf_ipt_act,
380 .dump = tcf_ipt_dump, 395 .dump = tcf_ipt_dump,
381 .cleanup = tcf_ipt_release, 396 .cleanup = tcf_ipt_release,
382 .init = tcf_xt_init, 397 .init = tcf_xt_init,
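
Most init() paths in this series move from tcf_idr_check() to tcf_idr_check_alloc(), which also reserves the action index when nothing is found. A condensed sketch of the resulting control flow, with per-action attribute validation and error labels omitted; names follow the hunks above:

	err = tcf_idr_check_alloc(tn, &index, a, bind);
	if (err < 0)
		return err;		/* IDR lookup/allocation error */
	exists = err;			/* 0: index reserved, 1: action found */
	if (exists && bind)
		return 0;

	if (!exists) {
		ret = tcf_idr_create(tn, index, est, a, ops, bind, false);
		if (ret) {
			tcf_idr_cleanup(tn, index);	/* release the reserved index */
			return ret;
		}
		ret = ACT_P_CREATED;
	} else if (!ovr) {
		tcf_idr_release(*a, bind);	/* drop the reference taken by check_alloc */
		return -EEXIST;
	}

Later validation failures follow the same rule visible in the hunks: tcf_idr_release() if the action already existed, tcf_idr_cleanup() if only the index had been reserved.
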
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index fd34015331ab..8bf66d0a6800 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -25,10 +25,12 @@
25#include <net/net_namespace.h> 25#include <net/net_namespace.h>
26#include <net/netlink.h> 26#include <net/netlink.h>
27#include <net/pkt_sched.h> 27#include <net/pkt_sched.h>
28#include <net/pkt_cls.h>
28#include <linux/tc_act/tc_mirred.h> 29#include <linux/tc_act/tc_mirred.h>
29#include <net/tc_act/tc_mirred.h> 30#include <net/tc_act/tc_mirred.h>
30 31
31static LIST_HEAD(mirred_list); 32static LIST_HEAD(mirred_list);
33static DEFINE_SPINLOCK(mirred_list_lock);
32 34
33static bool tcf_mirred_is_act_redirect(int action) 35static bool tcf_mirred_is_act_redirect(int action)
34{ 36{
@@ -49,13 +51,35 @@ static bool tcf_mirred_act_wants_ingress(int action)
49 } 51 }
50} 52}
51 53
54static bool tcf_mirred_can_reinsert(int action)
55{
56 switch (action) {
57 case TC_ACT_SHOT:
58 case TC_ACT_STOLEN:
59 case TC_ACT_QUEUED:
60 case TC_ACT_TRAP:
61 return true;
62 }
63 return false;
64}
65
66static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m)
67{
68 return rcu_dereference_protected(m->tcfm_dev,
69 lockdep_is_held(&m->tcf_lock));
70}
71
52static void tcf_mirred_release(struct tc_action *a) 72static void tcf_mirred_release(struct tc_action *a)
53{ 73{
54 struct tcf_mirred *m = to_mirred(a); 74 struct tcf_mirred *m = to_mirred(a);
55 struct net_device *dev; 75 struct net_device *dev;
56 76
77 spin_lock(&mirred_list_lock);
57 list_del(&m->tcfm_list); 78 list_del(&m->tcfm_list);
58 dev = rtnl_dereference(m->tcfm_dev); 79 spin_unlock(&mirred_list_lock);
80
81 /* last reference to action, no need to lock */
82 dev = rcu_dereference_protected(m->tcfm_dev, 1);
59 if (dev) 83 if (dev)
60 dev_put(dev); 84 dev_put(dev);
61} 85}
@@ -68,8 +92,9 @@ static unsigned int mirred_net_id;
68static struct tc_action_ops act_mirred_ops; 92static struct tc_action_ops act_mirred_ops;
69 93
70static int tcf_mirred_init(struct net *net, struct nlattr *nla, 94static int tcf_mirred_init(struct net *net, struct nlattr *nla,
71 struct nlattr *est, struct tc_action **a, int ovr, 95 struct nlattr *est, struct tc_action **a,
72 int bind, struct netlink_ext_ack *extack) 96 int ovr, int bind, bool rtnl_held,
97 struct netlink_ext_ack *extack)
73{ 98{
74 struct tc_action_net *tn = net_generic(net, mirred_net_id); 99 struct tc_action_net *tn = net_generic(net, mirred_net_id);
75 struct nlattr *tb[TCA_MIRRED_MAX + 1]; 100 struct nlattr *tb[TCA_MIRRED_MAX + 1];
@@ -78,7 +103,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
78 struct tcf_mirred *m; 103 struct tcf_mirred *m;
79 struct net_device *dev; 104 struct net_device *dev;
80 bool exists = false; 105 bool exists = false;
81 int ret; 106 int ret, err;
82 107
83 if (!nla) { 108 if (!nla) {
84 NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); 109 NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed");
@@ -93,7 +118,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
93 } 118 }
94 parm = nla_data(tb[TCA_MIRRED_PARMS]); 119 parm = nla_data(tb[TCA_MIRRED_PARMS]);
95 120
96 exists = tcf_idr_check(tn, parm->index, a, bind); 121 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
122 if (err < 0)
123 return err;
124 exists = err;
97 if (exists && bind) 125 if (exists && bind)
98 return 0; 126 return 0;
99 127
@@ -106,76 +134,83 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
106 default: 134 default:
107 if (exists) 135 if (exists)
108 tcf_idr_release(*a, bind); 136 tcf_idr_release(*a, bind);
137 else
138 tcf_idr_cleanup(tn, parm->index);
109 NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); 139 NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option");
110 return -EINVAL; 140 return -EINVAL;
111 } 141 }
112 if (parm->ifindex) {
113 dev = __dev_get_by_index(net, parm->ifindex);
114 if (dev == NULL) {
115 if (exists)
116 tcf_idr_release(*a, bind);
117 return -ENODEV;
118 }
119 mac_header_xmit = dev_is_mac_header_xmit(dev);
120 } else {
121 dev = NULL;
122 }
123 142
124 if (!exists) { 143 if (!exists) {
125 if (!dev) { 144 if (!parm->ifindex) {
145 tcf_idr_cleanup(tn, parm->index);
126 NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); 146 NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
127 return -EINVAL; 147 return -EINVAL;
128 } 148 }
129 ret = tcf_idr_create(tn, parm->index, est, a, 149 ret = tcf_idr_create(tn, parm->index, est, a,
130 &act_mirred_ops, bind, true); 150 &act_mirred_ops, bind, true);
131 if (ret) 151 if (ret) {
152 tcf_idr_cleanup(tn, parm->index);
132 return ret; 153 return ret;
154 }
133 ret = ACT_P_CREATED; 155 ret = ACT_P_CREATED;
134 } else { 156 } else if (!ovr) {
135 tcf_idr_release(*a, bind); 157 tcf_idr_release(*a, bind);
136 if (!ovr) 158 return -EEXIST;
137 return -EEXIST;
138 } 159 }
139 m = to_mirred(*a); 160 m = to_mirred(*a);
140 161
141 ASSERT_RTNL(); 162 spin_lock_bh(&m->tcf_lock);
142 m->tcf_action = parm->action; 163 m->tcf_action = parm->action;
143 m->tcfm_eaction = parm->eaction; 164 m->tcfm_eaction = parm->eaction;
144 if (dev != NULL) { 165
145 if (ret != ACT_P_CREATED) 166 if (parm->ifindex) {
146 dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); 167 dev = dev_get_by_index(net, parm->ifindex);
147 dev_hold(dev); 168 if (!dev) {
148 rcu_assign_pointer(m->tcfm_dev, dev); 169 spin_unlock_bh(&m->tcf_lock);
170 tcf_idr_release(*a, bind);
171 return -ENODEV;
172 }
173 mac_header_xmit = dev_is_mac_header_xmit(dev);
174 rcu_swap_protected(m->tcfm_dev, dev,
175 lockdep_is_held(&m->tcf_lock));
176 if (dev)
177 dev_put(dev);
149 m->tcfm_mac_header_xmit = mac_header_xmit; 178 m->tcfm_mac_header_xmit = mac_header_xmit;
150 } 179 }
180 spin_unlock_bh(&m->tcf_lock);
151 181
152 if (ret == ACT_P_CREATED) { 182 if (ret == ACT_P_CREATED) {
183 spin_lock(&mirred_list_lock);
153 list_add(&m->tcfm_list, &mirred_list); 184 list_add(&m->tcfm_list, &mirred_list);
185 spin_unlock(&mirred_list_lock);
186
154 tcf_idr_insert(tn, *a); 187 tcf_idr_insert(tn, *a);
155 } 188 }
156 189
157 return ret; 190 return ret;
158} 191}
159 192
160static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, 193static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
161 struct tcf_result *res) 194 struct tcf_result *res)
162{ 195{
163 struct tcf_mirred *m = to_mirred(a); 196 struct tcf_mirred *m = to_mirred(a);
197 struct sk_buff *skb2 = skb;
164 bool m_mac_header_xmit; 198 bool m_mac_header_xmit;
165 struct net_device *dev; 199 struct net_device *dev;
166 struct sk_buff *skb2;
167 int retval, err = 0; 200 int retval, err = 0;
201 bool use_reinsert;
202 bool want_ingress;
203 bool is_redirect;
168 int m_eaction; 204 int m_eaction;
169 int mac_len; 205 int mac_len;
170 206
171 tcf_lastuse_update(&m->tcf_tm); 207 tcf_lastuse_update(&m->tcf_tm);
172 bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); 208 bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
173 209
174 rcu_read_lock();
175 m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); 210 m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
176 m_eaction = READ_ONCE(m->tcfm_eaction); 211 m_eaction = READ_ONCE(m->tcfm_eaction);
177 retval = READ_ONCE(m->tcf_action); 212 retval = READ_ONCE(m->tcf_action);
178 dev = rcu_dereference(m->tcfm_dev); 213 dev = rcu_dereference_bh(m->tcfm_dev);
179 if (unlikely(!dev)) { 214 if (unlikely(!dev)) {
180 pr_notice_once("tc mirred: target device is gone\n"); 215 pr_notice_once("tc mirred: target device is gone\n");
181 goto out; 216 goto out;
@@ -187,16 +222,25 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
187 goto out; 222 goto out;
188 } 223 }
189 224
190 skb2 = skb_clone(skb, GFP_ATOMIC); 225 /* we could easily avoid the clone only if called by ingress and clsact;
191 if (!skb2) 226 * since we can't easily detect the clsact caller, skip clone only for
192 goto out; 227 * ingress - that covers the TC S/W datapath.
228 */
229 is_redirect = tcf_mirred_is_act_redirect(m_eaction);
230 use_reinsert = skb_at_tc_ingress(skb) && is_redirect &&
231 tcf_mirred_can_reinsert(retval);
232 if (!use_reinsert) {
233 skb2 = skb_clone(skb, GFP_ATOMIC);
234 if (!skb2)
235 goto out;
236 }
193 237
194 /* If action's target direction differs than filter's direction, 238 /* If action's target direction differs than filter's direction,
195 * and devices expect a mac header on xmit, then mac push/pull is 239 * and devices expect a mac header on xmit, then mac push/pull is
196 * needed. 240 * needed.
197 */ 241 */
198 if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) && 242 want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
199 m_mac_header_xmit) { 243 if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) {
200 if (!skb_at_tc_ingress(skb)) { 244 if (!skb_at_tc_ingress(skb)) {
201 /* caught at egress, act ingress: pull mac */ 245 /* caught at egress, act ingress: pull mac */
202 mac_len = skb_network_header(skb) - skb_mac_header(skb); 246 mac_len = skb_network_header(skb) - skb_mac_header(skb);
@@ -207,15 +251,23 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
207 } 251 }
208 } 252 }
209 253
254 skb2->skb_iif = skb->dev->ifindex;
255 skb2->dev = dev;
256
210 /* mirror is always swallowed */ 257 /* mirror is always swallowed */
211 if (tcf_mirred_is_act_redirect(m_eaction)) { 258 if (is_redirect) {
212 skb2->tc_redirected = 1; 259 skb2->tc_redirected = 1;
213 skb2->tc_from_ingress = skb2->tc_at_ingress; 260 skb2->tc_from_ingress = skb2->tc_at_ingress;
261
262 /* let's the caller reinsert the packet, if possible */
263 if (use_reinsert) {
264 res->ingress = want_ingress;
265 res->qstats = this_cpu_ptr(m->common.cpu_qstats);
266 return TC_ACT_REINSERT;
267 }
214 } 268 }
215 269
216 skb2->skb_iif = skb->dev->ifindex; 270 if (!want_ingress)
217 skb2->dev = dev;
218 if (!tcf_mirred_act_wants_ingress(m_eaction))
219 err = dev_queue_xmit(skb2); 271 err = dev_queue_xmit(skb2);
220 else 272 else
221 err = netif_receive_skb(skb2); 273 err = netif_receive_skb(skb2);
@@ -226,7 +278,6 @@ out:
226 if (tcf_mirred_is_act_redirect(m_eaction)) 278 if (tcf_mirred_is_act_redirect(m_eaction))
227 retval = TC_ACT_SHOT; 279 retval = TC_ACT_SHOT;
228 } 280 }
229 rcu_read_unlock();
230 281
231 return retval; 282 return retval;
232} 283}
@@ -246,26 +297,33 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
246{ 297{
247 unsigned char *b = skb_tail_pointer(skb); 298 unsigned char *b = skb_tail_pointer(skb);
248 struct tcf_mirred *m = to_mirred(a); 299 struct tcf_mirred *m = to_mirred(a);
249 struct net_device *dev = rtnl_dereference(m->tcfm_dev);
250 struct tc_mirred opt = { 300 struct tc_mirred opt = {
251 .index = m->tcf_index, 301 .index = m->tcf_index,
252 .action = m->tcf_action, 302 .refcnt = refcount_read(&m->tcf_refcnt) - ref,
253 .refcnt = m->tcf_refcnt - ref, 303 .bindcnt = atomic_read(&m->tcf_bindcnt) - bind,
254 .bindcnt = m->tcf_bindcnt - bind,
255 .eaction = m->tcfm_eaction,
256 .ifindex = dev ? dev->ifindex : 0,
257 }; 304 };
305 struct net_device *dev;
258 struct tcf_t t; 306 struct tcf_t t;
259 307
308 spin_lock_bh(&m->tcf_lock);
309 opt.action = m->tcf_action;
310 opt.eaction = m->tcfm_eaction;
311 dev = tcf_mirred_dev_dereference(m);
312 if (dev)
313 opt.ifindex = dev->ifindex;
314
260 if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) 315 if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
261 goto nla_put_failure; 316 goto nla_put_failure;
262 317
263 tcf_tm_dump(&t, &m->tcf_tm); 318 tcf_tm_dump(&t, &m->tcf_tm);
264 if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) 319 if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD))
265 goto nla_put_failure; 320 goto nla_put_failure;
321 spin_unlock_bh(&m->tcf_lock);
322
266 return skb->len; 323 return skb->len;
267 324
268nla_put_failure: 325nla_put_failure:
326 spin_unlock_bh(&m->tcf_lock);
269 nlmsg_trim(skb, b); 327 nlmsg_trim(skb, b);
270 return -1; 328 return -1;
271} 329}
@@ -296,15 +354,19 @@ static int mirred_device_event(struct notifier_block *unused,
296 354
297 ASSERT_RTNL(); 355 ASSERT_RTNL();
298 if (event == NETDEV_UNREGISTER) { 356 if (event == NETDEV_UNREGISTER) {
357 spin_lock(&mirred_list_lock);
299 list_for_each_entry(m, &mirred_list, tcfm_list) { 358 list_for_each_entry(m, &mirred_list, tcfm_list) {
300 if (rcu_access_pointer(m->tcfm_dev) == dev) { 359 spin_lock_bh(&m->tcf_lock);
360 if (tcf_mirred_dev_dereference(m) == dev) {
301 dev_put(dev); 361 dev_put(dev);
302 /* Note : no rcu grace period necessary, as 362 /* Note : no rcu grace period necessary, as
303 * net_device are already rcu protected. 363 * net_device are already rcu protected.
304 */ 364 */
305 RCU_INIT_POINTER(m->tcfm_dev, NULL); 365 RCU_INIT_POINTER(m->tcfm_dev, NULL);
306 } 366 }
367 spin_unlock_bh(&m->tcf_lock);
307 } 368 }
369 spin_unlock(&mirred_list_lock);
308 } 370 }
309 371
310 return NOTIFY_DONE; 372 return NOTIFY_DONE;
@@ -317,15 +379,27 @@ static struct notifier_block mirred_device_notifier = {
317static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) 379static struct net_device *tcf_mirred_get_dev(const struct tc_action *a)
318{ 380{
319 struct tcf_mirred *m = to_mirred(a); 381 struct tcf_mirred *m = to_mirred(a);
382 struct net_device *dev;
320 383
321 return rtnl_dereference(m->tcfm_dev); 384 rcu_read_lock();
385 dev = rcu_dereference(m->tcfm_dev);
386 if (dev)
387 dev_hold(dev);
388 rcu_read_unlock();
389
390 return dev;
391}
392
393static void tcf_mirred_put_dev(struct net_device *dev)
394{
395 dev_put(dev);
322} 396}
323 397
324static struct tc_action_ops act_mirred_ops = { 398static struct tc_action_ops act_mirred_ops = {
325 .kind = "mirred", 399 .kind = "mirred",
326 .type = TCA_ACT_MIRRED, 400 .type = TCA_ACT_MIRRED,
327 .owner = THIS_MODULE, 401 .owner = THIS_MODULE,
328 .act = tcf_mirred, 402 .act = tcf_mirred_act,
329 .stats_update = tcf_stats_update, 403 .stats_update = tcf_stats_update,
330 .dump = tcf_mirred_dump, 404 .dump = tcf_mirred_dump,
331 .cleanup = tcf_mirred_release, 405 .cleanup = tcf_mirred_release,
@@ -334,6 +408,7 @@ static struct tc_action_ops act_mirred_ops = {
334 .lookup = tcf_mirred_search, 408 .lookup = tcf_mirred_search,
335 .size = sizeof(struct tcf_mirred), 409 .size = sizeof(struct tcf_mirred),
336 .get_dev = tcf_mirred_get_dev, 410 .get_dev = tcf_mirred_get_dev,
411 .put_dev = tcf_mirred_put_dev,
337}; 412};
338 413
339static __net_init int mirred_init_net(struct net *net) 414static __net_init int mirred_init_net(struct net *net)
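
Because mirred can now be rebound to a new device without RTNL, the ops table above grows a put_dev() to pair with get_dev(), which takes a device reference under rcu_read_lock(). A hypothetical caller (the surrounding offload code is illustrative, not part of this patch) would use the pair roughly like this:

	struct net_device *dev;

	dev = a->ops->get_dev ? a->ops->get_dev(a) : NULL;
	if (dev) {
		/* get_dev() did dev_hold(), so the device stays valid here
		 * even if the action is rebound or released concurrently.
		 */
		/* ... program hardware offload against dev ... */
		a->ops->put_dev(dev);	/* pairs with the hold in get_dev() */
	}
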
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 4b5848b6c252..4313aa102440 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -38,7 +38,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
38 38
39static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, 39static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
40 struct tc_action **a, int ovr, int bind, 40 struct tc_action **a, int ovr, int bind,
41 struct netlink_ext_ack *extack) 41 bool rtnl_held, struct netlink_ext_ack *extack)
42{ 42{
43 struct tc_action_net *tn = net_generic(net, nat_net_id); 43 struct tc_action_net *tn = net_generic(net, nat_net_id);
44 struct nlattr *tb[TCA_NAT_MAX + 1]; 44 struct nlattr *tb[TCA_NAT_MAX + 1];
@@ -57,18 +57,24 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
57 return -EINVAL; 57 return -EINVAL;
58 parm = nla_data(tb[TCA_NAT_PARMS]); 58 parm = nla_data(tb[TCA_NAT_PARMS]);
59 59
60 if (!tcf_idr_check(tn, parm->index, a, bind)) { 60 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
61 if (!err) {
61 ret = tcf_idr_create(tn, parm->index, est, a, 62 ret = tcf_idr_create(tn, parm->index, est, a,
62 &act_nat_ops, bind, false); 63 &act_nat_ops, bind, false);
63 if (ret) 64 if (ret) {
65 tcf_idr_cleanup(tn, parm->index);
64 return ret; 66 return ret;
67 }
65 ret = ACT_P_CREATED; 68 ret = ACT_P_CREATED;
66 } else { 69 } else if (err > 0) {
67 if (bind) 70 if (bind)
68 return 0; 71 return 0;
69 tcf_idr_release(*a, bind); 72 if (!ovr) {
70 if (!ovr) 73 tcf_idr_release(*a, bind);
71 return -EEXIST; 74 return -EEXIST;
75 }
76 } else {
77 return err;
72 } 78 }
73 p = to_tcf_nat(*a); 79 p = to_tcf_nat(*a);
74 80
@@ -87,8 +93,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
87 return ret; 93 return ret;
88} 94}
89 95
90static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, 96static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a,
91 struct tcf_result *res) 97 struct tcf_result *res)
92{ 98{
93 struct tcf_nat *p = to_tcf_nat(a); 99 struct tcf_nat *p = to_tcf_nat(a);
94 struct iphdr *iph; 100 struct iphdr *iph;
@@ -257,8 +263,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
257 263
258 .index = p->tcf_index, 264 .index = p->tcf_index,
259 .action = p->tcf_action, 265 .action = p->tcf_action,
260 .refcnt = p->tcf_refcnt - ref, 266 .refcnt = refcount_read(&p->tcf_refcnt) - ref,
261 .bindcnt = p->tcf_bindcnt - bind, 267 .bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
262 }; 268 };
263 struct tcf_t t; 269 struct tcf_t t;
264 270
@@ -298,7 +304,7 @@ static struct tc_action_ops act_nat_ops = {
298 .kind = "nat", 304 .kind = "nat",
299 .type = TCA_ACT_NAT, 305 .type = TCA_ACT_NAT,
300 .owner = THIS_MODULE, 306 .owner = THIS_MODULE,
301 .act = tcf_nat, 307 .act = tcf_nat_act,
302 .dump = tcf_nat_dump, 308 .dump = tcf_nat_dump,
303 .init = tcf_nat_init, 309 .init = tcf_nat_init,
304 .walk = tcf_nat_walker, 310 .walk = tcf_nat_walker,
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 8a925c72db5f..107034070019 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -132,20 +132,23 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
132 132
133static int tcf_pedit_init(struct net *net, struct nlattr *nla, 133static int tcf_pedit_init(struct net *net, struct nlattr *nla,
134 struct nlattr *est, struct tc_action **a, 134 struct nlattr *est, struct tc_action **a,
135 int ovr, int bind, struct netlink_ext_ack *extack) 135 int ovr, int bind, bool rtnl_held,
136 struct netlink_ext_ack *extack)
136{ 137{
137 struct tc_action_net *tn = net_generic(net, pedit_net_id); 138 struct tc_action_net *tn = net_generic(net, pedit_net_id);
138 struct nlattr *tb[TCA_PEDIT_MAX + 1]; 139 struct nlattr *tb[TCA_PEDIT_MAX + 1];
139 struct nlattr *pattr;
140 struct tc_pedit *parm;
141 int ret = 0, err;
142 struct tcf_pedit *p;
143 struct tc_pedit_key *keys = NULL; 140 struct tc_pedit_key *keys = NULL;
144 struct tcf_pedit_key_ex *keys_ex; 141 struct tcf_pedit_key_ex *keys_ex;
142 struct tc_pedit *parm;
143 struct nlattr *pattr;
144 struct tcf_pedit *p;
145 int ret = 0, err;
145 int ksize; 146 int ksize;
146 147
147 if (nla == NULL) 148 if (!nla) {
149 NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed");
148 return -EINVAL; 150 return -EINVAL;
151 }
149 152
150 err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); 153 err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL);
151 if (err < 0) 154 if (err < 0)
@@ -154,59 +157,68 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
154 pattr = tb[TCA_PEDIT_PARMS]; 157 pattr = tb[TCA_PEDIT_PARMS];
155 if (!pattr) 158 if (!pattr)
156 pattr = tb[TCA_PEDIT_PARMS_EX]; 159 pattr = tb[TCA_PEDIT_PARMS_EX];
157 if (!pattr) 160 if (!pattr) {
161 NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute");
158 return -EINVAL; 162 return -EINVAL;
163 }
159 164
160 parm = nla_data(pattr); 165 parm = nla_data(pattr);
161 ksize = parm->nkeys * sizeof(struct tc_pedit_key); 166 ksize = parm->nkeys * sizeof(struct tc_pedit_key);
162 if (nla_len(pattr) < sizeof(*parm) + ksize) 167 if (nla_len(pattr) < sizeof(*parm) + ksize) {
168 NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid");
163 return -EINVAL; 169 return -EINVAL;
170 }
164 171
165 keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); 172 keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
166 if (IS_ERR(keys_ex)) 173 if (IS_ERR(keys_ex))
167 return PTR_ERR(keys_ex); 174 return PTR_ERR(keys_ex);
168 175
169 if (!tcf_idr_check(tn, parm->index, a, bind)) { 176 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
170 if (!parm->nkeys) 177 if (!err) {
171 return -EINVAL; 178 if (!parm->nkeys) {
179 tcf_idr_cleanup(tn, parm->index);
180 NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
181 ret = -EINVAL;
182 goto out_free;
183 }
172 ret = tcf_idr_create(tn, parm->index, est, a, 184 ret = tcf_idr_create(tn, parm->index, est, a,
173 &act_pedit_ops, bind, false); 185 &act_pedit_ops, bind, false);
174 if (ret) 186 if (ret) {
175 return ret; 187 tcf_idr_cleanup(tn, parm->index);
176 p = to_pedit(*a); 188 goto out_free;
177 keys = kmalloc(ksize, GFP_KERNEL);
178 if (keys == NULL) {
179 tcf_idr_release(*a, bind);
180 kfree(keys_ex);
181 return -ENOMEM;
182 } 189 }
183 ret = ACT_P_CREATED; 190 ret = ACT_P_CREATED;
184 } else { 191 } else if (err > 0) {
185 if (bind) 192 if (bind)
186 return 0; 193 goto out_free;
187 tcf_idr_release(*a, bind); 194 if (!ovr) {
188 if (!ovr) 195 ret = -EEXIST;
189 return -EEXIST; 196 goto out_release;
190 p = to_pedit(*a);
191 if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
192 keys = kmalloc(ksize, GFP_KERNEL);
193 if (!keys) {
194 kfree(keys_ex);
195 return -ENOMEM;
196 }
197 } 197 }
198 } else {
199 return err;
198 } 200 }
199 201
202 p = to_pedit(*a);
200 spin_lock_bh(&p->tcf_lock); 203 spin_lock_bh(&p->tcf_lock);
201 p->tcfp_flags = parm->flags; 204
202 p->tcf_action = parm->action; 205 if (ret == ACT_P_CREATED ||
203 if (keys) { 206 (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) {
207 keys = kmalloc(ksize, GFP_ATOMIC);
208 if (!keys) {
209 spin_unlock_bh(&p->tcf_lock);
210 ret = -ENOMEM;
211 goto out_release;
212 }
204 kfree(p->tcfp_keys); 213 kfree(p->tcfp_keys);
205 p->tcfp_keys = keys; 214 p->tcfp_keys = keys;
206 p->tcfp_nkeys = parm->nkeys; 215 p->tcfp_nkeys = parm->nkeys;
207 } 216 }
208 memcpy(p->tcfp_keys, parm->keys, ksize); 217 memcpy(p->tcfp_keys, parm->keys, ksize);
209 218
219 p->tcfp_flags = parm->flags;
220 p->tcf_action = parm->action;
221
210 kfree(p->tcfp_keys_ex); 222 kfree(p->tcfp_keys_ex);
211 p->tcfp_keys_ex = keys_ex; 223 p->tcfp_keys_ex = keys_ex;
212 224
@@ -214,12 +226,20 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
214 if (ret == ACT_P_CREATED) 226 if (ret == ACT_P_CREATED)
215 tcf_idr_insert(tn, *a); 227 tcf_idr_insert(tn, *a);
216 return ret; 228 return ret;
229
230out_release:
231 tcf_idr_release(*a, bind);
232out_free:
233 kfree(keys_ex);
234 return ret;
235
217} 236}
218 237
219static void tcf_pedit_cleanup(struct tc_action *a) 238static void tcf_pedit_cleanup(struct tc_action *a)
220{ 239{
221 struct tcf_pedit *p = to_pedit(a); 240 struct tcf_pedit *p = to_pedit(a);
222 struct tc_pedit_key *keys = p->tcfp_keys; 241 struct tc_pedit_key *keys = p->tcfp_keys;
242
223 kfree(keys); 243 kfree(keys);
224 kfree(p->tcfp_keys_ex); 244 kfree(p->tcfp_keys_ex);
225} 245}
@@ -263,13 +283,13 @@ static int pedit_skb_hdr_offset(struct sk_buff *skb,
263 default: 283 default:
264 ret = -EINVAL; 284 ret = -EINVAL;
265 break; 285 break;
266 }; 286 }
267 287
268 return ret; 288 return ret;
269} 289}
270 290
271static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, 291static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a,
272 struct tcf_result *res) 292 struct tcf_result *res)
273{ 293{
274 struct tcf_pedit *p = to_pedit(a); 294 struct tcf_pedit *p = to_pedit(a);
275 int i; 295 int i;
@@ -284,11 +304,12 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
284 if (p->tcfp_nkeys > 0) { 304 if (p->tcfp_nkeys > 0) {
285 struct tc_pedit_key *tkey = p->tcfp_keys; 305 struct tc_pedit_key *tkey = p->tcfp_keys;
286 struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex; 306 struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
287 enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; 307 enum pedit_header_type htype =
308 TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
288 enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; 309 enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
289 310
290 for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { 311 for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
291 u32 *ptr, _data; 312 u32 *ptr, hdata;
292 int offset = tkey->off; 313 int offset = tkey->off;
293 int hoffset; 314 int hoffset;
294 u32 val; 315 u32 val;
@@ -303,39 +324,39 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
303 324
304 rc = pedit_skb_hdr_offset(skb, htype, &hoffset); 325 rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
305 if (rc) { 326 if (rc) {
306 pr_info("tc filter pedit bad header type specified (0x%x)\n", 327 pr_info("tc action pedit bad header type specified (0x%x)\n",
307 htype); 328 htype);
308 goto bad; 329 goto bad;
309 } 330 }
310 331
311 if (tkey->offmask) { 332 if (tkey->offmask) {
312 char *d, _d; 333 u8 *d, _d;
313 334
314 if (!offset_valid(skb, hoffset + tkey->at)) { 335 if (!offset_valid(skb, hoffset + tkey->at)) {
315 pr_info("tc filter pedit 'at' offset %d out of bounds\n", 336 pr_info("tc action pedit 'at' offset %d out of bounds\n",
316 hoffset + tkey->at); 337 hoffset + tkey->at);
317 goto bad; 338 goto bad;
318 } 339 }
319 d = skb_header_pointer(skb, hoffset + tkey->at, 1, 340 d = skb_header_pointer(skb, hoffset + tkey->at,
320 &_d); 341 sizeof(_d), &_d);
321 if (!d) 342 if (!d)
322 goto bad; 343 goto bad;
323 offset += (*d & tkey->offmask) >> tkey->shift; 344 offset += (*d & tkey->offmask) >> tkey->shift;
324 } 345 }
325 346
326 if (offset % 4) { 347 if (offset % 4) {
327 pr_info("tc filter pedit" 348 pr_info("tc action pedit offset must be on 32 bit boundaries\n");
328 " offset must be on 32 bit boundaries\n");
329 goto bad; 349 goto bad;
330 } 350 }
331 351
332 if (!offset_valid(skb, hoffset + offset)) { 352 if (!offset_valid(skb, hoffset + offset)) {
333 pr_info("tc filter pedit offset %d out of bounds\n", 353 pr_info("tc action pedit offset %d out of bounds\n",
334 hoffset + offset); 354 hoffset + offset);
335 goto bad; 355 goto bad;
336 } 356 }
337 357
338 ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data); 358 ptr = skb_header_pointer(skb, hoffset + offset,
359 sizeof(hdata), &hdata);
339 if (!ptr) 360 if (!ptr)
340 goto bad; 361 goto bad;
341 /* just do it, baby */ 362 /* just do it, baby */
@@ -347,19 +368,20 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
347 val = (*ptr + tkey->val) & ~tkey->mask; 368 val = (*ptr + tkey->val) & ~tkey->mask;
348 break; 369 break;
349 default: 370 default:
350 pr_info("tc filter pedit bad command (%d)\n", 371 pr_info("tc action pedit bad command (%d)\n",
351 cmd); 372 cmd);
352 goto bad; 373 goto bad;
353 } 374 }
354 375
355 *ptr = ((*ptr & tkey->mask) ^ val); 376 *ptr = ((*ptr & tkey->mask) ^ val);
356 if (ptr == &_data) 377 if (ptr == &hdata)
357 skb_store_bits(skb, hoffset + offset, ptr, 4); 378 skb_store_bits(skb, hoffset + offset, ptr, 4);
358 } 379 }
359 380
360 goto done; 381 goto done;
361 } else 382 } else {
362 WARN(1, "pedit BUG: index %d\n", p->tcf_index); 383 WARN(1, "pedit BUG: index %d\n", p->tcf_index);
384 }
363 385
364bad: 386bad:
365 p->tcf_qstats.overlimits++; 387 p->tcf_qstats.overlimits++;
@@ -385,14 +407,15 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
385 if (unlikely(!opt)) 407 if (unlikely(!opt))
386 return -ENOBUFS; 408 return -ENOBUFS;
387 409
410 spin_lock_bh(&p->tcf_lock);
388 memcpy(opt->keys, p->tcfp_keys, 411 memcpy(opt->keys, p->tcfp_keys,
389 p->tcfp_nkeys * sizeof(struct tc_pedit_key)); 412 p->tcfp_nkeys * sizeof(struct tc_pedit_key));
390 opt->index = p->tcf_index; 413 opt->index = p->tcf_index;
391 opt->nkeys = p->tcfp_nkeys; 414 opt->nkeys = p->tcfp_nkeys;
392 opt->flags = p->tcfp_flags; 415 opt->flags = p->tcfp_flags;
393 opt->action = p->tcf_action; 416 opt->action = p->tcf_action;
394 opt->refcnt = p->tcf_refcnt - ref; 417 opt->refcnt = refcount_read(&p->tcf_refcnt) - ref;
395 opt->bindcnt = p->tcf_bindcnt - bind; 418 opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind;
396 419
397 if (p->tcfp_keys_ex) { 420 if (p->tcfp_keys_ex) {
398 tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys); 421 tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys);
@@ -407,11 +430,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
407 tcf_tm_dump(&t, &p->tcf_tm); 430 tcf_tm_dump(&t, &p->tcf_tm);
408 if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) 431 if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
409 goto nla_put_failure; 432 goto nla_put_failure;
433 spin_unlock_bh(&p->tcf_lock);
410 434
411 kfree(opt); 435 kfree(opt);
412 return skb->len; 436 return skb->len;
413 437
414nla_put_failure: 438nla_put_failure:
439 spin_unlock_bh(&p->tcf_lock);
415 nlmsg_trim(skb, b); 440 nlmsg_trim(skb, b);
416 kfree(opt); 441 kfree(opt);
417 return -1; 442 return -1;
@@ -439,7 +464,7 @@ static struct tc_action_ops act_pedit_ops = {
439 .kind = "pedit", 464 .kind = "pedit",
440 .type = TCA_ACT_PEDIT, 465 .type = TCA_ACT_PEDIT,
441 .owner = THIS_MODULE, 466 .owner = THIS_MODULE,
442 .act = tcf_pedit, 467 .act = tcf_pedit_act,
443 .dump = tcf_pedit_dump, 468 .dump = tcf_pedit_dump,
444 .cleanup = tcf_pedit_cleanup, 469 .cleanup = tcf_pedit_cleanup,
445 .init = tcf_pedit_init, 470 .init = tcf_pedit_init,
@@ -483,4 +508,3 @@ static void __exit pedit_cleanup_module(void)
483 508
484module_init(pedit_init_module); 509module_init(pedit_init_module);
485module_exit(pedit_cleanup_module); 510module_exit(pedit_cleanup_module);
486
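
The pedit rewrite also tightens its header accesses: skb_header_pointer() is given a properly typed local buffer (hdata, _d) and its size, instead of a bare char buffer and a literal length. Sketched for one 32-bit word, assuming hoffset, offset, tkey and val as in the hunks above:

	u32 hdata, *ptr;

	ptr = skb_header_pointer(skb, hoffset + offset,
				 sizeof(hdata), &hdata);
	if (!ptr)
		goto bad;	/* offset lies beyond the packet */
	*ptr = (*ptr & tkey->mask) ^ val;
	if (ptr == &hdata)	/* data was in a fragment: copy it back */
		skb_store_bits(skb, hoffset + offset, ptr, sizeof(hdata));
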
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 4e72bc2a0dfb..5d8bfa878477 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -56,7 +56,7 @@ struct tc_police_compat {
56static unsigned int police_net_id; 56static unsigned int police_net_id;
57static struct tc_action_ops act_police_ops; 57static struct tc_action_ops act_police_ops;
58 58
59static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, 59static int tcf_police_walker(struct net *net, struct sk_buff *skb,
60 struct netlink_callback *cb, int type, 60 struct netlink_callback *cb, int type,
61 const struct tc_action_ops *ops, 61 const struct tc_action_ops *ops,
62 struct netlink_ext_ack *extack) 62 struct netlink_ext_ack *extack)
@@ -73,9 +73,9 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
73 [TCA_POLICE_RESULT] = { .type = NLA_U32 }, 73 [TCA_POLICE_RESULT] = { .type = NLA_U32 },
74}; 74};
75 75
76static int tcf_act_police_init(struct net *net, struct nlattr *nla, 76static int tcf_police_init(struct net *net, struct nlattr *nla,
77 struct nlattr *est, struct tc_action **a, 77 struct nlattr *est, struct tc_action **a,
78 int ovr, int bind, 78 int ovr, int bind, bool rtnl_held,
79 struct netlink_ext_ack *extack) 79 struct netlink_ext_ack *extack)
80{ 80{
81 int ret = 0, err; 81 int ret = 0, err;
@@ -101,20 +101,24 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
101 return -EINVAL; 101 return -EINVAL;
102 102
103 parm = nla_data(tb[TCA_POLICE_TBF]); 103 parm = nla_data(tb[TCA_POLICE_TBF]);
104 exists = tcf_idr_check(tn, parm->index, a, bind); 104 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
105 if (err < 0)
106 return err;
107 exists = err;
105 if (exists && bind) 108 if (exists && bind)
106 return 0; 109 return 0;
107 110
108 if (!exists) { 111 if (!exists) {
109 ret = tcf_idr_create(tn, parm->index, NULL, a, 112 ret = tcf_idr_create(tn, parm->index, NULL, a,
110 &act_police_ops, bind, false); 113 &act_police_ops, bind, false);
111 if (ret) 114 if (ret) {
115 tcf_idr_cleanup(tn, parm->index);
112 return ret; 116 return ret;
117 }
113 ret = ACT_P_CREATED; 118 ret = ACT_P_CREATED;
114 } else { 119 } else if (!ovr) {
115 tcf_idr_release(*a, bind); 120 tcf_idr_release(*a, bind);
116 if (!ovr) 121 return -EEXIST;
117 return -EEXIST;
118 } 122 }
119 123
120 police = to_police(*a); 124 police = to_police(*a);
@@ -195,12 +199,11 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
195failure: 199failure:
196 qdisc_put_rtab(P_tab); 200 qdisc_put_rtab(P_tab);
197 qdisc_put_rtab(R_tab); 201 qdisc_put_rtab(R_tab);
198 if (ret == ACT_P_CREATED) 202 tcf_idr_release(*a, bind);
199 tcf_idr_release(*a, bind);
200 return err; 203 return err;
201} 204}
202 205
203static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, 206static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
204 struct tcf_result *res) 207 struct tcf_result *res)
205{ 208{
206 struct tcf_police *police = to_police(a); 209 struct tcf_police *police = to_police(a);
@@ -264,21 +267,22 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
264 return police->tcf_action; 267 return police->tcf_action;
265} 268}
266 269
267static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, 270static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
268 int bind, int ref) 271 int bind, int ref)
269{ 272{
270 unsigned char *b = skb_tail_pointer(skb); 273 unsigned char *b = skb_tail_pointer(skb);
271 struct tcf_police *police = to_police(a); 274 struct tcf_police *police = to_police(a);
272 struct tc_police opt = { 275 struct tc_police opt = {
273 .index = police->tcf_index, 276 .index = police->tcf_index,
274 .action = police->tcf_action, 277 .refcnt = refcount_read(&police->tcf_refcnt) - ref,
275 .mtu = police->tcfp_mtu, 278 .bindcnt = atomic_read(&police->tcf_bindcnt) - bind,
276 .burst = PSCHED_NS2TICKS(police->tcfp_burst),
277 .refcnt = police->tcf_refcnt - ref,
278 .bindcnt = police->tcf_bindcnt - bind,
279 }; 279 };
280 struct tcf_t t; 280 struct tcf_t t;
281 281
282 spin_lock_bh(&police->tcf_lock);
283 opt.action = police->tcf_action;
284 opt.mtu = police->tcfp_mtu;
285 opt.burst = PSCHED_NS2TICKS(police->tcfp_burst);
282 if (police->rate_present) 286 if (police->rate_present)
283 psched_ratecfg_getrate(&opt.rate, &police->rate); 287 psched_ratecfg_getrate(&opt.rate, &police->rate);
284 if (police->peak_present) 288 if (police->peak_present)
@@ -298,10 +302,12 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a,
298 t.expires = jiffies_to_clock_t(police->tcf_tm.expires); 302 t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
299 if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) 303 if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
300 goto nla_put_failure; 304 goto nla_put_failure;
305 spin_unlock_bh(&police->tcf_lock);
301 306
302 return skb->len; 307 return skb->len;
303 308
304nla_put_failure: 309nla_put_failure:
310 spin_unlock_bh(&police->tcf_lock);
305 nlmsg_trim(skb, b); 311 nlmsg_trim(skb, b);
306 return -1; 312 return -1;
307} 313}
@@ -322,10 +328,10 @@ static struct tc_action_ops act_police_ops = {
322 .kind = "police", 328 .kind = "police",
323 .type = TCA_ID_POLICE, 329 .type = TCA_ID_POLICE,
324 .owner = THIS_MODULE, 330 .owner = THIS_MODULE,
325 .act = tcf_act_police, 331 .act = tcf_police_act,
326 .dump = tcf_act_police_dump, 332 .dump = tcf_police_dump,
327 .init = tcf_act_police_init, 333 .init = tcf_police_init,
328 .walk = tcf_act_police_walker, 334 .walk = tcf_police_walker,
329 .lookup = tcf_police_search, 335 .lookup = tcf_police_search,
330 .size = sizeof(struct tcf_police), 336 .size = sizeof(struct tcf_police),
331}; 337};
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 5db358497c9e..44e9c00657bc 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -37,15 +37,17 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
37 37
38static int tcf_sample_init(struct net *net, struct nlattr *nla, 38static int tcf_sample_init(struct net *net, struct nlattr *nla,
39 struct nlattr *est, struct tc_action **a, int ovr, 39 struct nlattr *est, struct tc_action **a, int ovr,
40 int bind, struct netlink_ext_ack *extack) 40 int bind, bool rtnl_held,
41 struct netlink_ext_ack *extack)
41{ 42{
42 struct tc_action_net *tn = net_generic(net, sample_net_id); 43 struct tc_action_net *tn = net_generic(net, sample_net_id);
43 struct nlattr *tb[TCA_SAMPLE_MAX + 1]; 44 struct nlattr *tb[TCA_SAMPLE_MAX + 1];
44 struct psample_group *psample_group; 45 struct psample_group *psample_group;
45 struct tc_sample *parm; 46 struct tc_sample *parm;
47 u32 psample_group_num;
46 struct tcf_sample *s; 48 struct tcf_sample *s;
47 bool exists = false; 49 bool exists = false;
48 int ret; 50 int ret, err;
49 51
50 if (!nla) 52 if (!nla)
51 return -EINVAL; 53 return -EINVAL;
@@ -58,38 +60,46 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
58 60
59 parm = nla_data(tb[TCA_SAMPLE_PARMS]); 61 parm = nla_data(tb[TCA_SAMPLE_PARMS]);
60 62
61 exists = tcf_idr_check(tn, parm->index, a, bind); 63 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
64 if (err < 0)
65 return err;
66 exists = err;
62 if (exists && bind) 67 if (exists && bind)
63 return 0; 68 return 0;
64 69
65 if (!exists) { 70 if (!exists) {
66 ret = tcf_idr_create(tn, parm->index, est, a, 71 ret = tcf_idr_create(tn, parm->index, est, a,
67 &act_sample_ops, bind, false); 72 &act_sample_ops, bind, false);
68 if (ret) 73 if (ret) {
74 tcf_idr_cleanup(tn, parm->index);
69 return ret; 75 return ret;
76 }
70 ret = ACT_P_CREATED; 77 ret = ACT_P_CREATED;
71 } else { 78 } else if (!ovr) {
72 tcf_idr_release(*a, bind); 79 tcf_idr_release(*a, bind);
73 if (!ovr) 80 return -EEXIST;
74 return -EEXIST;
75 } 81 }
76 s = to_sample(*a);
77 82
78 s->tcf_action = parm->action; 83 psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
79 s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); 84 psample_group = psample_group_get(net, psample_group_num);
80 s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
81 psample_group = psample_group_get(net, s->psample_group_num);
82 if (!psample_group) { 85 if (!psample_group) {
83 if (ret == ACT_P_CREATED) 86 tcf_idr_release(*a, bind);
84 tcf_idr_release(*a, bind);
85 return -ENOMEM; 87 return -ENOMEM;
86 } 88 }
89
90 s = to_sample(*a);
91
92 spin_lock_bh(&s->tcf_lock);
93 s->tcf_action = parm->action;
94 s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
95 s->psample_group_num = psample_group_num;
87 RCU_INIT_POINTER(s->psample_group, psample_group); 96 RCU_INIT_POINTER(s->psample_group, psample_group);
88 97
89 if (tb[TCA_SAMPLE_TRUNC_SIZE]) { 98 if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
90 s->truncate = true; 99 s->truncate = true;
91 s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); 100 s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
92 } 101 }
102 spin_unlock_bh(&s->tcf_lock);
93 103
94 if (ret == ACT_P_CREATED) 104 if (ret == ACT_P_CREATED)
95 tcf_idr_insert(tn, *a); 105 tcf_idr_insert(tn, *a);
@@ -101,7 +111,8 @@ static void tcf_sample_cleanup(struct tc_action *a)
101 struct tcf_sample *s = to_sample(a); 111 struct tcf_sample *s = to_sample(a);
102 struct psample_group *psample_group; 112 struct psample_group *psample_group;
103 113
104 psample_group = rtnl_dereference(s->psample_group); 114 /* last reference to action, no need to lock */
115 psample_group = rcu_dereference_protected(s->psample_group, 1);
105 RCU_INIT_POINTER(s->psample_group, NULL); 116 RCU_INIT_POINTER(s->psample_group, NULL);
106 if (psample_group) 117 if (psample_group)
107 psample_group_put(psample_group); 118 psample_group_put(psample_group);
@@ -136,8 +147,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
136 bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); 147 bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
137 retval = READ_ONCE(s->tcf_action); 148 retval = READ_ONCE(s->tcf_action);
138 149
139 rcu_read_lock(); 150 psample_group = rcu_dereference_bh(s->psample_group);
140 psample_group = rcu_dereference(s->psample_group);
141 151
142 /* randomly sample packets according to rate */ 152 /* randomly sample packets according to rate */
143 if (psample_group && (prandom_u32() % s->rate == 0)) { 153 if (psample_group && (prandom_u32() % s->rate == 0)) {
@@ -161,7 +171,6 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
161 skb_pull(skb, skb->mac_len); 171 skb_pull(skb, skb->mac_len);
162 } 172 }
163 173
164 rcu_read_unlock();
165 return retval; 174 return retval;
166} 175}
167 176
@@ -172,12 +181,13 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
172 struct tcf_sample *s = to_sample(a); 181 struct tcf_sample *s = to_sample(a);
173 struct tc_sample opt = { 182 struct tc_sample opt = {
174 .index = s->tcf_index, 183 .index = s->tcf_index,
175 .action = s->tcf_action, 184 .refcnt = refcount_read(&s->tcf_refcnt) - ref,
176 .refcnt = s->tcf_refcnt - ref, 185 .bindcnt = atomic_read(&s->tcf_bindcnt) - bind,
177 .bindcnt = s->tcf_bindcnt - bind,
178 }; 186 };
179 struct tcf_t t; 187 struct tcf_t t;
180 188
189 spin_lock_bh(&s->tcf_lock);
190 opt.action = s->tcf_action;
181 if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) 191 if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
182 goto nla_put_failure; 192 goto nla_put_failure;
183 193
@@ -194,9 +204,12 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
194 204
195 if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) 205 if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
196 goto nla_put_failure; 206 goto nla_put_failure;
207 spin_unlock_bh(&s->tcf_lock);
208
197 return skb->len; 209 return skb->len;
198 210
199nla_put_failure: 211nla_put_failure:
212 spin_unlock_bh(&s->tcf_lock);
200 nlmsg_trim(skb, b); 213 nlmsg_trim(skb, b);
201 return -1; 214 return -1;
202} 215}
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 98c4afe7c15b..52400d49f81f 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -28,8 +28,8 @@ static unsigned int simp_net_id;
28static struct tc_action_ops act_simp_ops; 28static struct tc_action_ops act_simp_ops;
29 29
30#define SIMP_MAX_DATA 32 30#define SIMP_MAX_DATA 32
31static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, 31static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a,
32 struct tcf_result *res) 32 struct tcf_result *res)
33{ 33{
34 struct tcf_defact *d = to_defact(a); 34 struct tcf_defact *d = to_defact(a);
35 35
@@ -79,7 +79,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
79 79
80static int tcf_simp_init(struct net *net, struct nlattr *nla, 80static int tcf_simp_init(struct net *net, struct nlattr *nla,
81 struct nlattr *est, struct tc_action **a, 81 struct nlattr *est, struct tc_action **a,
82 int ovr, int bind, struct netlink_ext_ack *extack) 82 int ovr, int bind, bool rtnl_held,
83 struct netlink_ext_ack *extack)
83{ 84{
84 struct tc_action_net *tn = net_generic(net, simp_net_id); 85 struct tc_action_net *tn = net_generic(net, simp_net_id);
85 struct nlattr *tb[TCA_DEF_MAX + 1]; 86 struct nlattr *tb[TCA_DEF_MAX + 1];
@@ -99,21 +100,28 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
99 return -EINVAL; 100 return -EINVAL;
100 101
101 parm = nla_data(tb[TCA_DEF_PARMS]); 102 parm = nla_data(tb[TCA_DEF_PARMS]);
102 exists = tcf_idr_check(tn, parm->index, a, bind); 103 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
104 if (err < 0)
105 return err;
106 exists = err;
103 if (exists && bind) 107 if (exists && bind)
104 return 0; 108 return 0;
105 109
106 if (tb[TCA_DEF_DATA] == NULL) { 110 if (tb[TCA_DEF_DATA] == NULL) {
107 if (exists) 111 if (exists)
108 tcf_idr_release(*a, bind); 112 tcf_idr_release(*a, bind);
113 else
114 tcf_idr_cleanup(tn, parm->index);
109 return -EINVAL; 115 return -EINVAL;
110 } 116 }
111 117
112 if (!exists) { 118 if (!exists) {
113 ret = tcf_idr_create(tn, parm->index, est, a, 119 ret = tcf_idr_create(tn, parm->index, est, a,
114 &act_simp_ops, bind, false); 120 &act_simp_ops, bind, false);
115 if (ret) 121 if (ret) {
122 tcf_idr_cleanup(tn, parm->index);
116 return ret; 123 return ret;
124 }
117 125
118 d = to_defact(*a); 126 d = to_defact(*a);
119 ret = alloc_defdata(d, tb[TCA_DEF_DATA]); 127 ret = alloc_defdata(d, tb[TCA_DEF_DATA]);
@@ -126,9 +134,10 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
126 } else { 134 } else {
127 d = to_defact(*a); 135 d = to_defact(*a);
128 136
129 tcf_idr_release(*a, bind); 137 if (!ovr) {
130 if (!ovr) 138 tcf_idr_release(*a, bind);
131 return -EEXIST; 139 return -EEXIST;
140 }
132 141
133 reset_policy(d, tb[TCA_DEF_DATA], parm); 142 reset_policy(d, tb[TCA_DEF_DATA], parm);
134 } 143 }
@@ -145,12 +154,13 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
145 struct tcf_defact *d = to_defact(a); 154 struct tcf_defact *d = to_defact(a);
146 struct tc_defact opt = { 155 struct tc_defact opt = {
147 .index = d->tcf_index, 156 .index = d->tcf_index,
148 .refcnt = d->tcf_refcnt - ref, 157 .refcnt = refcount_read(&d->tcf_refcnt) - ref,
149 .bindcnt = d->tcf_bindcnt - bind, 158 .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
150 .action = d->tcf_action,
151 }; 159 };
152 struct tcf_t t; 160 struct tcf_t t;
153 161
162 spin_lock_bh(&d->tcf_lock);
163 opt.action = d->tcf_action;
154 if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || 164 if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) ||
155 nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) 165 nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata))
156 goto nla_put_failure; 166 goto nla_put_failure;
@@ -158,9 +168,12 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
158 tcf_tm_dump(&t, &d->tcf_tm); 168 tcf_tm_dump(&t, &d->tcf_tm);
159 if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) 169 if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD))
160 goto nla_put_failure; 170 goto nla_put_failure;
171 spin_unlock_bh(&d->tcf_lock);
172
161 return skb->len; 173 return skb->len;
162 174
163nla_put_failure: 175nla_put_failure:
176 spin_unlock_bh(&d->tcf_lock);
164 nlmsg_trim(skb, b); 177 nlmsg_trim(skb, b);
165 return -1; 178 return -1;
166} 179}
@@ -187,7 +200,7 @@ static struct tc_action_ops act_simp_ops = {
187 .kind = "simple", 200 .kind = "simple",
188 .type = TCA_ACT_SIMP, 201 .type = TCA_ACT_SIMP,
189 .owner = THIS_MODULE, 202 .owner = THIS_MODULE,
190 .act = tcf_simp, 203 .act = tcf_simp_act,
191 .dump = tcf_simp_dump, 204 .dump = tcf_simp_dump,
192 .cleanup = tcf_simp_release, 205 .cleanup = tcf_simp_release,
193 .init = tcf_simp_init, 206 .init = tcf_simp_init,
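
The act_simple conversion above moves from tcf_idr_check() to tcf_idr_check_alloc(), which either takes a reference on an existing action (returning 1) or reserves the requested index for a new one (returning 0). Each later failure path therefore has to unwind the right way: release the reference when the action already existed, or clean up the reserved index when it did not. A minimal standalone C model of that bookkeeping (userspace sketch with illustrative names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct slot { bool reserved; int refcnt; };

static struct slot table[8];

/* Models tcf_idr_check_alloc(): returns 1 when an action already exists
 * (a reference is taken), 0 when the index was only reserved.
 */
static int check_alloc(unsigned int index)
{
    if (table[index].refcnt > 0) {
        table[index].refcnt++;
        return 1;
    }
    table[index].reserved = true;
    return 0;
}

/* Stand-ins for tcf_idr_release() and tcf_idr_cleanup(). */
static void release(unsigned int index) { table[index].refcnt--; }
static void cleanup(unsigned int index) { table[index].reserved = false; }

static int action_init(unsigned int index, bool valid_attrs)
{
    int exists = check_alloc(index);

    if (!valid_attrs) {
        /* Same shape as the error paths added in the hunks above. */
        if (exists)
            release(index);
        else
            cleanup(index);
        return -1;
    }
    if (!exists) {
        /* Stands in for tcf_idr_create(). */
        table[index].reserved = false;
        table[index].refcnt = 1;
    }
    return 0;
}

int main(void)
{
    action_init(3, false); /* reservation is cleaned up, slot stays free */
    action_init(3, true);  /* action is created */
    action_init(3, false); /* existing action: reference dropped, it survives */
    printf("slot 3: reserved=%d refcnt=%d\n", table[3].reserved, table[3].refcnt);
    return 0;
}
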
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 6138d1d71900..73e44ce2a883 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -23,6 +23,9 @@
23#include <linux/rtnetlink.h> 23#include <linux/rtnetlink.h>
24#include <net/netlink.h> 24#include <net/netlink.h>
25#include <net/pkt_sched.h> 25#include <net/pkt_sched.h>
26#include <net/ip.h>
27#include <net/ipv6.h>
28#include <net/dsfield.h>
26 29
27#include <linux/tc_act/tc_skbedit.h> 30#include <linux/tc_act/tc_skbedit.h>
28#include <net/tc_act/tc_skbedit.h> 31#include <net/tc_act/tc_skbedit.h>
@@ -30,29 +33,54 @@
30static unsigned int skbedit_net_id; 33static unsigned int skbedit_net_id;
31static struct tc_action_ops act_skbedit_ops; 34static struct tc_action_ops act_skbedit_ops;
32 35
33static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, 36static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a,
34 struct tcf_result *res) 37 struct tcf_result *res)
35{ 38{
36 struct tcf_skbedit *d = to_skbedit(a); 39 struct tcf_skbedit *d = to_skbedit(a);
40 struct tcf_skbedit_params *params;
41 int action;
37 42
38 spin_lock(&d->tcf_lock);
39 tcf_lastuse_update(&d->tcf_tm); 43 tcf_lastuse_update(&d->tcf_tm);
40 bstats_update(&d->tcf_bstats, skb); 44 bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
41 45
42 if (d->flags & SKBEDIT_F_PRIORITY) 46 params = rcu_dereference_bh(d->params);
43 skb->priority = d->priority; 47 action = READ_ONCE(d->tcf_action);
44 if (d->flags & SKBEDIT_F_QUEUE_MAPPING && 48
45 skb->dev->real_num_tx_queues > d->queue_mapping) 49 if (params->flags & SKBEDIT_F_PRIORITY)
46 skb_set_queue_mapping(skb, d->queue_mapping); 50 skb->priority = params->priority;
47 if (d->flags & SKBEDIT_F_MARK) { 51 if (params->flags & SKBEDIT_F_INHERITDSFIELD) {
48 skb->mark &= ~d->mask; 52 int wlen = skb_network_offset(skb);
49 skb->mark |= d->mark & d->mask; 53
54 switch (tc_skb_protocol(skb)) {
55 case htons(ETH_P_IP):
56 wlen += sizeof(struct iphdr);
57 if (!pskb_may_pull(skb, wlen))
58 goto err;
59 skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
60 break;
61
62 case htons(ETH_P_IPV6):
63 wlen += sizeof(struct ipv6hdr);
64 if (!pskb_may_pull(skb, wlen))
65 goto err;
66 skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
67 break;
68 }
50 } 69 }
51 if (d->flags & SKBEDIT_F_PTYPE) 70 if (params->flags & SKBEDIT_F_QUEUE_MAPPING &&
52 skb->pkt_type = d->ptype; 71 skb->dev->real_num_tx_queues > params->queue_mapping)
72 skb_set_queue_mapping(skb, params->queue_mapping);
73 if (params->flags & SKBEDIT_F_MARK) {
74 skb->mark &= ~params->mask;
75 skb->mark |= params->mark & params->mask;
76 }
77 if (params->flags & SKBEDIT_F_PTYPE)
78 skb->pkt_type = params->ptype;
79 return action;
53 80
54 spin_unlock(&d->tcf_lock); 81err:
55 return d->tcf_action; 82 qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
83 return TC_ACT_SHOT;
56} 84}
57 85
58static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { 86static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
@@ -62,13 +90,16 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
62 [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, 90 [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
63 [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, 91 [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) },
64 [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, 92 [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) },
93 [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) },
65}; 94};
66 95
67static int tcf_skbedit_init(struct net *net, struct nlattr *nla, 96static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
68 struct nlattr *est, struct tc_action **a, 97 struct nlattr *est, struct tc_action **a,
69 int ovr, int bind, struct netlink_ext_ack *extack) 98 int ovr, int bind, bool rtnl_held,
99 struct netlink_ext_ack *extack)
70{ 100{
71 struct tc_action_net *tn = net_generic(net, skbedit_net_id); 101 struct tc_action_net *tn = net_generic(net, skbedit_net_id);
102 struct tcf_skbedit_params *params_old, *params_new;
72 struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; 103 struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
73 struct tc_skbedit *parm; 104 struct tc_skbedit *parm;
74 struct tcf_skbedit *d; 105 struct tcf_skbedit *d;
@@ -114,52 +145,76 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
114 mask = nla_data(tb[TCA_SKBEDIT_MASK]); 145 mask = nla_data(tb[TCA_SKBEDIT_MASK]);
115 } 146 }
116 147
148 if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
149 u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);
150
151 if (*pure_flags & SKBEDIT_F_INHERITDSFIELD)
152 flags |= SKBEDIT_F_INHERITDSFIELD;
153 }
154
117 parm = nla_data(tb[TCA_SKBEDIT_PARMS]); 155 parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
118 156
119 exists = tcf_idr_check(tn, parm->index, a, bind); 157 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
158 if (err < 0)
159 return err;
160 exists = err;
120 if (exists && bind) 161 if (exists && bind)
121 return 0; 162 return 0;
122 163
123 if (!flags) { 164 if (!flags) {
124 if (exists) 165 if (exists)
125 tcf_idr_release(*a, bind); 166 tcf_idr_release(*a, bind);
167 else
168 tcf_idr_cleanup(tn, parm->index);
126 return -EINVAL; 169 return -EINVAL;
127 } 170 }
128 171
129 if (!exists) { 172 if (!exists) {
130 ret = tcf_idr_create(tn, parm->index, est, a, 173 ret = tcf_idr_create(tn, parm->index, est, a,
131 &act_skbedit_ops, bind, false); 174 &act_skbedit_ops, bind, true);
132 if (ret) 175 if (ret) {
176 tcf_idr_cleanup(tn, parm->index);
133 return ret; 177 return ret;
178 }
134 179
135 d = to_skbedit(*a); 180 d = to_skbedit(*a);
136 ret = ACT_P_CREATED; 181 ret = ACT_P_CREATED;
137 } else { 182 } else {
138 d = to_skbedit(*a); 183 d = to_skbedit(*a);
139 tcf_idr_release(*a, bind); 184 if (!ovr) {
140 if (!ovr) 185 tcf_idr_release(*a, bind);
141 return -EEXIST; 186 return -EEXIST;
187 }
142 } 188 }
143 189
144 spin_lock_bh(&d->tcf_lock); 190 ASSERT_RTNL();
191
192 params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
193 if (unlikely(!params_new)) {
194 if (ret == ACT_P_CREATED)
195 tcf_idr_release(*a, bind);
196 return -ENOMEM;
197 }
145 198
146 d->flags = flags; 199 params_new->flags = flags;
147 if (flags & SKBEDIT_F_PRIORITY) 200 if (flags & SKBEDIT_F_PRIORITY)
148 d->priority = *priority; 201 params_new->priority = *priority;
149 if (flags & SKBEDIT_F_QUEUE_MAPPING) 202 if (flags & SKBEDIT_F_QUEUE_MAPPING)
150 d->queue_mapping = *queue_mapping; 203 params_new->queue_mapping = *queue_mapping;
151 if (flags & SKBEDIT_F_MARK) 204 if (flags & SKBEDIT_F_MARK)
152 d->mark = *mark; 205 params_new->mark = *mark;
153 if (flags & SKBEDIT_F_PTYPE) 206 if (flags & SKBEDIT_F_PTYPE)
154 d->ptype = *ptype; 207 params_new->ptype = *ptype;
155 /* default behaviour is to use all the bits */ 208 /* default behaviour is to use all the bits */
156 d->mask = 0xffffffff; 209 params_new->mask = 0xffffffff;
157 if (flags & SKBEDIT_F_MASK) 210 if (flags & SKBEDIT_F_MASK)
158 d->mask = *mask; 211 params_new->mask = *mask;
159 212
160 d->tcf_action = parm->action; 213 d->tcf_action = parm->action;
161 214 params_old = rtnl_dereference(d->params);
162 spin_unlock_bh(&d->tcf_lock); 215 rcu_assign_pointer(d->params, params_new);
216 if (params_old)
217 kfree_rcu(params_old, rcu);
163 218
164 if (ret == ACT_P_CREATED) 219 if (ret == ACT_P_CREATED)
165 tcf_idr_insert(tn, *a); 220 tcf_idr_insert(tn, *a);
@@ -171,30 +226,39 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
171{ 226{
172 unsigned char *b = skb_tail_pointer(skb); 227 unsigned char *b = skb_tail_pointer(skb);
173 struct tcf_skbedit *d = to_skbedit(a); 228 struct tcf_skbedit *d = to_skbedit(a);
229 struct tcf_skbedit_params *params;
174 struct tc_skbedit opt = { 230 struct tc_skbedit opt = {
175 .index = d->tcf_index, 231 .index = d->tcf_index,
176 .refcnt = d->tcf_refcnt - ref, 232 .refcnt = refcount_read(&d->tcf_refcnt) - ref,
177 .bindcnt = d->tcf_bindcnt - bind, 233 .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
178 .action = d->tcf_action, 234 .action = d->tcf_action,
179 }; 235 };
236 u64 pure_flags = 0;
180 struct tcf_t t; 237 struct tcf_t t;
181 238
239 params = rtnl_dereference(d->params);
240
182 if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) 241 if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
183 goto nla_put_failure; 242 goto nla_put_failure;
184 if ((d->flags & SKBEDIT_F_PRIORITY) && 243 if ((params->flags & SKBEDIT_F_PRIORITY) &&
185 nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority)) 244 nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority))
186 goto nla_put_failure; 245 goto nla_put_failure;
187 if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && 246 if ((params->flags & SKBEDIT_F_QUEUE_MAPPING) &&
188 nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping)) 247 nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping))
189 goto nla_put_failure; 248 goto nla_put_failure;
190 if ((d->flags & SKBEDIT_F_MARK) && 249 if ((params->flags & SKBEDIT_F_MARK) &&
191 nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark)) 250 nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark))
192 goto nla_put_failure; 251 goto nla_put_failure;
193 if ((d->flags & SKBEDIT_F_PTYPE) && 252 if ((params->flags & SKBEDIT_F_PTYPE) &&
194 nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype)) 253 nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype))
195 goto nla_put_failure; 254 goto nla_put_failure;
196 if ((d->flags & SKBEDIT_F_MASK) && 255 if ((params->flags & SKBEDIT_F_MASK) &&
197 nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask)) 256 nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask))
257 goto nla_put_failure;
258 if (params->flags & SKBEDIT_F_INHERITDSFIELD)
259 pure_flags |= SKBEDIT_F_INHERITDSFIELD;
260 if (pure_flags != 0 &&
261 nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags))
198 goto nla_put_failure; 262 goto nla_put_failure;
199 263
200 tcf_tm_dump(&t, &d->tcf_tm); 264 tcf_tm_dump(&t, &d->tcf_tm);
@@ -207,6 +271,16 @@ nla_put_failure:
207 return -1; 271 return -1;
208} 272}
209 273
274static void tcf_skbedit_cleanup(struct tc_action *a)
275{
276 struct tcf_skbedit *d = to_skbedit(a);
277 struct tcf_skbedit_params *params;
278
279 params = rcu_dereference_protected(d->params, 1);
280 if (params)
281 kfree_rcu(params, rcu);
282}
283
210static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, 284static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
211 struct netlink_callback *cb, int type, 285 struct netlink_callback *cb, int type,
212 const struct tc_action_ops *ops, 286 const struct tc_action_ops *ops,
@@ -229,9 +303,10 @@ static struct tc_action_ops act_skbedit_ops = {
229 .kind = "skbedit", 303 .kind = "skbedit",
230 .type = TCA_ACT_SKBEDIT, 304 .type = TCA_ACT_SKBEDIT,
231 .owner = THIS_MODULE, 305 .owner = THIS_MODULE,
232 .act = tcf_skbedit, 306 .act = tcf_skbedit_act,
233 .dump = tcf_skbedit_dump, 307 .dump = tcf_skbedit_dump,
234 .init = tcf_skbedit_init, 308 .init = tcf_skbedit_init,
309 .cleanup = tcf_skbedit_cleanup,
235 .walk = tcf_skbedit_walker, 310 .walk = tcf_skbedit_walker,
236 .lookup = tcf_skbedit_search, 311 .lookup = tcf_skbedit_search,
237 .size = sizeof(struct tcf_skbedit), 312 .size = sizeof(struct tcf_skbedit),
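
The new SKBEDIT_F_INHERITDSFIELD branch copies the packet's DS field into skb->priority: ipv4_get_dsfield()/ipv6_get_dsfield() return the full 8-bit ToS/Traffic Class byte, and shifting it right by two drops the ECN bits, leaving the 6-bit DSCP codepoint. A tiny standalone check of that arithmetic (illustrative sketch only):

#include <stdio.h>
#include <stdint.h>

/* The kernel helpers return the whole 8-bit field: DSCP in the upper six
 * bits, ECN in the lower two. The hunk above assigns dsfield >> 2, i.e.
 * the bare DSCP codepoint, to skb->priority.
 */
static unsigned int priority_from_dsfield(uint8_t dsfield)
{
    return dsfield >> 2;
}

int main(void)
{
    /* 0xb8 is DSCP 46 (EF) with ECN bits 00. */
    printf("dsfield 0xb8 -> priority %u\n", priority_from_dsfield(0xb8));
    /* Setting the ECN bits does not change the derived priority. */
    printf("dsfield 0xbb -> priority %u\n", priority_from_dsfield(0xbb));
    return 0;
}
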
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index ad050d7d4b46..588077fafd6c 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -24,7 +24,7 @@ static unsigned int skbmod_net_id;
24static struct tc_action_ops act_skbmod_ops; 24static struct tc_action_ops act_skbmod_ops;
25 25
26#define MAX_EDIT_LEN ETH_HLEN 26#define MAX_EDIT_LEN ETH_HLEN
27static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, 27static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a,
28 struct tcf_result *res) 28 struct tcf_result *res)
29{ 29{
30 struct tcf_skbmod *d = to_skbmod(a); 30 struct tcf_skbmod *d = to_skbmod(a);
@@ -41,20 +41,14 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
41 * then MAX_EDIT_LEN needs to change appropriately 41 * then MAX_EDIT_LEN needs to change appropriately
42 */ 42 */
43 err = skb_ensure_writable(skb, MAX_EDIT_LEN); 43 err = skb_ensure_writable(skb, MAX_EDIT_LEN);
44 if (unlikely(err)) { /* best policy is to drop on the floor */ 44 if (unlikely(err)) /* best policy is to drop on the floor */
45 qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); 45 goto drop;
46 return TC_ACT_SHOT;
47 }
48 46
49 rcu_read_lock();
50 action = READ_ONCE(d->tcf_action); 47 action = READ_ONCE(d->tcf_action);
51 if (unlikely(action == TC_ACT_SHOT)) { 48 if (unlikely(action == TC_ACT_SHOT))
52 qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); 49 goto drop;
53 rcu_read_unlock();
54 return action;
55 }
56 50
57 p = rcu_dereference(d->skbmod_p); 51 p = rcu_dereference_bh(d->skbmod_p);
58 flags = p->flags; 52 flags = p->flags;
59 if (flags & SKBMOD_F_DMAC) 53 if (flags & SKBMOD_F_DMAC)
60 ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); 54 ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst);
@@ -62,7 +56,6 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
62 ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src); 56 ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src);
63 if (flags & SKBMOD_F_ETYPE) 57 if (flags & SKBMOD_F_ETYPE)
64 eth_hdr(skb)->h_proto = p->eth_type; 58 eth_hdr(skb)->h_proto = p->eth_type;
65 rcu_read_unlock();
66 59
67 if (flags & SKBMOD_F_SWAPMAC) { 60 if (flags & SKBMOD_F_SWAPMAC) {
68 u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */ 61 u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */
@@ -73,6 +66,10 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
73 } 66 }
74 67
75 return action; 68 return action;
69
70drop:
71 qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
72 return TC_ACT_SHOT;
76} 73}
77 74
78static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { 75static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
@@ -84,7 +81,8 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
84 81
85static int tcf_skbmod_init(struct net *net, struct nlattr *nla, 82static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
86 struct nlattr *est, struct tc_action **a, 83 struct nlattr *est, struct tc_action **a,
87 int ovr, int bind, struct netlink_ext_ack *extack) 84 int ovr, int bind, bool rtnl_held,
85 struct netlink_ext_ack *extack)
88{ 86{
89 struct tc_action_net *tn = net_generic(net, skbmod_net_id); 87 struct tc_action_net *tn = net_generic(net, skbmod_net_id);
90 struct nlattr *tb[TCA_SKBMOD_MAX + 1]; 88 struct nlattr *tb[TCA_SKBMOD_MAX + 1];
@@ -127,46 +125,50 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
127 if (parm->flags & SKBMOD_F_SWAPMAC) 125 if (parm->flags & SKBMOD_F_SWAPMAC)
128 lflags = SKBMOD_F_SWAPMAC; 126 lflags = SKBMOD_F_SWAPMAC;
129 127
130 exists = tcf_idr_check(tn, parm->index, a, bind); 128 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
129 if (err < 0)
130 return err;
131 exists = err;
131 if (exists && bind) 132 if (exists && bind)
132 return 0; 133 return 0;
133 134
134 if (!lflags) { 135 if (!lflags) {
135 if (exists) 136 if (exists)
136 tcf_idr_release(*a, bind); 137 tcf_idr_release(*a, bind);
138 else
139 tcf_idr_cleanup(tn, parm->index);
137 return -EINVAL; 140 return -EINVAL;
138 } 141 }
139 142
140 if (!exists) { 143 if (!exists) {
141 ret = tcf_idr_create(tn, parm->index, est, a, 144 ret = tcf_idr_create(tn, parm->index, est, a,
142 &act_skbmod_ops, bind, true); 145 &act_skbmod_ops, bind, true);
143 if (ret) 146 if (ret) {
147 tcf_idr_cleanup(tn, parm->index);
144 return ret; 148 return ret;
149 }
145 150
146 ret = ACT_P_CREATED; 151 ret = ACT_P_CREATED;
147 } else { 152 } else if (!ovr) {
148 tcf_idr_release(*a, bind); 153 tcf_idr_release(*a, bind);
149 if (!ovr) 154 return -EEXIST;
150 return -EEXIST;
151 } 155 }
152 156
153 d = to_skbmod(*a); 157 d = to_skbmod(*a);
154 158
155 ASSERT_RTNL();
156 p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); 159 p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
157 if (unlikely(!p)) { 160 if (unlikely(!p)) {
158 if (ret == ACT_P_CREATED) 161 tcf_idr_release(*a, bind);
159 tcf_idr_release(*a, bind);
160 return -ENOMEM; 162 return -ENOMEM;
161 } 163 }
162 164
163 p->flags = lflags; 165 p->flags = lflags;
164 d->tcf_action = parm->action; 166 d->tcf_action = parm->action;
165 167
166 p_old = rtnl_dereference(d->skbmod_p);
167
168 if (ovr) 168 if (ovr)
169 spin_lock_bh(&d->tcf_lock); 169 spin_lock_bh(&d->tcf_lock);
170 /* Protected by tcf_lock if overwriting existing action. */
171 p_old = rcu_dereference_protected(d->skbmod_p, 1);
170 172
171 if (lflags & SKBMOD_F_DMAC) 173 if (lflags & SKBMOD_F_DMAC)
172 ether_addr_copy(p->eth_dst, daddr); 174 ether_addr_copy(p->eth_dst, daddr);
@@ -202,15 +204,18 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
202{ 204{
203 struct tcf_skbmod *d = to_skbmod(a); 205 struct tcf_skbmod *d = to_skbmod(a);
204 unsigned char *b = skb_tail_pointer(skb); 206 unsigned char *b = skb_tail_pointer(skb);
205 struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); 207 struct tcf_skbmod_params *p;
206 struct tc_skbmod opt = { 208 struct tc_skbmod opt = {
207 .index = d->tcf_index, 209 .index = d->tcf_index,
208 .refcnt = d->tcf_refcnt - ref, 210 .refcnt = refcount_read(&d->tcf_refcnt) - ref,
209 .bindcnt = d->tcf_bindcnt - bind, 211 .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
210 .action = d->tcf_action,
211 }; 212 };
212 struct tcf_t t; 213 struct tcf_t t;
213 214
215 spin_lock_bh(&d->tcf_lock);
216 opt.action = d->tcf_action;
217 p = rcu_dereference_protected(d->skbmod_p,
218 lockdep_is_held(&d->tcf_lock));
214 opt.flags = p->flags; 219 opt.flags = p->flags;
215 if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) 220 if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
216 goto nla_put_failure; 221 goto nla_put_failure;
@@ -228,8 +233,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
228 if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) 233 if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
229 goto nla_put_failure; 234 goto nla_put_failure;
230 235
236 spin_unlock_bh(&d->tcf_lock);
231 return skb->len; 237 return skb->len;
232nla_put_failure: 238nla_put_failure:
239 spin_unlock_bh(&d->tcf_lock);
233 nlmsg_trim(skb, b); 240 nlmsg_trim(skb, b);
234 return -1; 241 return -1;
235} 242}
@@ -256,7 +263,7 @@ static struct tc_action_ops act_skbmod_ops = {
256 .kind = "skbmod", 263 .kind = "skbmod",
257 .type = TCA_ACT_SKBMOD, 264 .type = TCA_ACT_SKBMOD,
258 .owner = THIS_MODULE, 265 .owner = THIS_MODULE,
259 .act = tcf_skbmod_run, 266 .act = tcf_skbmod_act,
260 .dump = tcf_skbmod_dump, 267 .dump = tcf_skbmod_dump,
261 .init = tcf_skbmod_init, 268 .init = tcf_skbmod_init,
262 .cleanup = tcf_skbmod_cleanup, 269 .cleanup = tcf_skbmod_cleanup,
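
The SWAPMAC branch kept as context above swaps the source and destination MACs through a u16 tmpaddr[ETH_ALEN / 2] scratch buffer because ether_addr_copy() requires 16-bit-aligned operands. A standalone model of the swap (plain memcpy in userspace, illustrative only):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ETH_ALEN 6

int main(void)
{
    uint8_t dst[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
    uint8_t src[ETH_ALEN] = { 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb };
    /* Matches the kernel scratch buffer: three u16s keep 2-byte alignment. */
    uint16_t tmpaddr[ETH_ALEN / 2];

    memcpy(tmpaddr, dst, ETH_ALEN);
    memcpy(dst, src, ETH_ALEN);
    memcpy(src, tmpaddr, ETH_ALEN);

    printf("dst is now %02x:%02x:%02x:...\n", dst[0], dst[1], dst[2]);
    printf("src is now %02x:%02x:%02x:...\n", src[0], src[1], src[2]);
    return 0;
}
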
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 626dac81a48a..420759153d5f 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -13,6 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/rtnetlink.h> 15#include <linux/rtnetlink.h>
16#include <net/geneve.h>
16#include <net/netlink.h> 17#include <net/netlink.h>
17#include <net/pkt_sched.h> 18#include <net/pkt_sched.h>
18#include <net/dst.h> 19#include <net/dst.h>
@@ -30,13 +31,11 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
30 struct tcf_tunnel_key_params *params; 31 struct tcf_tunnel_key_params *params;
31 int action; 32 int action;
32 33
33 rcu_read_lock(); 34 params = rcu_dereference_bh(t->params);
34
35 params = rcu_dereference(t->params);
36 35
37 tcf_lastuse_update(&t->tcf_tm); 36 tcf_lastuse_update(&t->tcf_tm);
38 bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); 37 bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
39 action = params->action; 38 action = READ_ONCE(t->tcf_action);
40 39
41 switch (params->tcft_action) { 40 switch (params->tcft_action) {
42 case TCA_TUNNEL_KEY_ACT_RELEASE: 41 case TCA_TUNNEL_KEY_ACT_RELEASE:
@@ -52,11 +51,138 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
52 break; 51 break;
53 } 52 }
54 53
55 rcu_read_unlock();
56
57 return action; 54 return action;
58} 55}
59 56
57static const struct nla_policy
58enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
59 [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
60};
61
62static const struct nla_policy
63geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = {
64 [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
65 [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
66 [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY,
67 .len = 128 },
68};
69
70static int
71tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
72 struct netlink_ext_ack *extack)
73{
74 struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1];
75 int err, data_len, opt_len;
76 u8 *data;
77
78 err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX,
79 nla, geneve_opt_policy, extack);
80 if (err < 0)
81 return err;
82
83 if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] ||
84 !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] ||
85 !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) {
86 NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data");
87 return -EINVAL;
88 }
89
90 data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
91 data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
92 if (data_len < 4) {
93 NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long");
94 return -ERANGE;
95 }
96 if (data_len % 4) {
97 NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long");
98 return -ERANGE;
99 }
100
101 opt_len = sizeof(struct geneve_opt) + data_len;
102 if (dst) {
103 struct geneve_opt *opt = dst;
104
105 WARN_ON(dst_len < opt_len);
106
107 opt->opt_class =
108 nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]);
109 opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]);
110 opt->length = data_len / 4; /* length is in units of 4 bytes */
111 opt->r1 = 0;
112 opt->r2 = 0;
113 opt->r3 = 0;
114
115 memcpy(opt + 1, data, data_len);
116 }
117
118 return opt_len;
119}
120
121static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
122 int dst_len, struct netlink_ext_ack *extack)
123{
124 int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
125 const struct nlattr *attr, *head = nla_data(nla);
126
127 err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
128 enc_opts_policy, extack);
129 if (err)
130 return err;
131
132 nla_for_each_attr(attr, head, len, rem) {
133 switch (nla_type(attr)) {
134 case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
135 opt_len = tunnel_key_copy_geneve_opt(attr, dst,
136 dst_len, extack);
137 if (opt_len < 0)
138 return opt_len;
139 opts_len += opt_len;
140 if (dst) {
141 dst_len -= opt_len;
142 dst += opt_len;
143 }
144 break;
145 }
146 }
147
148 if (!opts_len) {
149 NL_SET_ERR_MSG(extack, "Empty list of tunnel options");
150 return -EINVAL;
151 }
152
153 if (rem > 0) {
154 NL_SET_ERR_MSG(extack, "Trailing data after parsing tunnel key options attributes");
155 return -EINVAL;
156 }
157
158 return opts_len;
159}
160
161static int tunnel_key_get_opts_len(struct nlattr *nla,
162 struct netlink_ext_ack *extack)
163{
164 return tunnel_key_copy_opts(nla, NULL, 0, extack);
165}
166
167static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
168 int opts_len, struct netlink_ext_ack *extack)
169{
170 info->options_len = opts_len;
171 switch (nla_type(nla_data(nla))) {
172 case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
173#if IS_ENABLED(CONFIG_INET)
174 info->key.tun_flags |= TUNNEL_GENEVE_OPT;
175 return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
176 opts_len, extack);
177#else
178 return -EAFNOSUPPORT;
179#endif
180 default:
181 NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type");
182 return -EINVAL;
183 }
184}
185
60static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { 186static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
61 [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) }, 187 [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) },
62 [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, 188 [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
@@ -66,39 +192,53 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
66 [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, 192 [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
67 [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16}, 193 [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16},
68 [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 }, 194 [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 },
195 [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED },
196 [TCA_TUNNEL_KEY_ENC_TOS] = { .type = NLA_U8 },
197 [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 },
69}; 198};
70 199
71static int tunnel_key_init(struct net *net, struct nlattr *nla, 200static int tunnel_key_init(struct net *net, struct nlattr *nla,
72 struct nlattr *est, struct tc_action **a, 201 struct nlattr *est, struct tc_action **a,
73 int ovr, int bind, struct netlink_ext_ack *extack) 202 int ovr, int bind, bool rtnl_held,
203 struct netlink_ext_ack *extack)
74{ 204{
75 struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); 205 struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
76 struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; 206 struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
77 struct tcf_tunnel_key_params *params_old;
78 struct tcf_tunnel_key_params *params_new; 207 struct tcf_tunnel_key_params *params_new;
79 struct metadata_dst *metadata = NULL; 208 struct metadata_dst *metadata = NULL;
80 struct tc_tunnel_key *parm; 209 struct tc_tunnel_key *parm;
81 struct tcf_tunnel_key *t; 210 struct tcf_tunnel_key *t;
82 bool exists = false; 211 bool exists = false;
83 __be16 dst_port = 0; 212 __be16 dst_port = 0;
213 int opts_len = 0;
84 __be64 key_id; 214 __be64 key_id;
85 __be16 flags; 215 __be16 flags;
216 u8 tos, ttl;
86 int ret = 0; 217 int ret = 0;
87 int err; 218 int err;
88 219
89 if (!nla) 220 if (!nla) {
221 NL_SET_ERR_MSG(extack, "Tunnel requires attributes to be passed");
90 return -EINVAL; 222 return -EINVAL;
223 }
91 224
92 err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy, 225 err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy,
93 NULL); 226 extack);
94 if (err < 0) 227 if (err < 0) {
228 NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes");
95 return err; 229 return err;
230 }
96 231
97 if (!tb[TCA_TUNNEL_KEY_PARMS]) 232 if (!tb[TCA_TUNNEL_KEY_PARMS]) {
233 NL_SET_ERR_MSG(extack, "Missing tunnel key parameters");
98 return -EINVAL; 234 return -EINVAL;
235 }
99 236
100 parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); 237 parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
101 exists = tcf_idr_check(tn, parm->index, a, bind); 238 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
239 if (err < 0)
240 return err;
241 exists = err;
102 if (exists && bind) 242 if (exists && bind)
103 return 0; 243 return 0;
104 244
@@ -107,6 +247,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
107 break; 247 break;
108 case TCA_TUNNEL_KEY_ACT_SET: 248 case TCA_TUNNEL_KEY_ACT_SET:
109 if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { 249 if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
250 NL_SET_ERR_MSG(extack, "Missing tunnel key id");
110 ret = -EINVAL; 251 ret = -EINVAL;
111 goto err_out; 252 goto err_out;
112 } 253 }
@@ -121,6 +262,22 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
121 if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) 262 if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
122 dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); 263 dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
123 264
265 if (tb[TCA_TUNNEL_KEY_ENC_OPTS]) {
266 opts_len = tunnel_key_get_opts_len(tb[TCA_TUNNEL_KEY_ENC_OPTS],
267 extack);
268 if (opts_len < 0) {
269 ret = opts_len;
270 goto err_out;
271 }
272 }
273
274 tos = 0;
275 if (tb[TCA_TUNNEL_KEY_ENC_TOS])
276 tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]);
277 ttl = 0;
278 if (tb[TCA_TUNNEL_KEY_ENC_TTL])
279 ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]);
280
124 if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && 281 if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
125 tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { 282 tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
126 __be32 saddr; 283 __be32 saddr;
@@ -129,9 +286,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
129 saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); 286 saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
130 daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); 287 daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
131 288
132 metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, 289 metadata = __ip_tun_set_dst(saddr, daddr, tos, ttl,
133 dst_port, flags, 290 dst_port, flags,
134 key_id, 0); 291 key_id, opts_len);
135 } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && 292 } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
136 tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { 293 tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
137 struct in6_addr saddr; 294 struct in6_addr saddr;
@@ -140,19 +297,33 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
140 saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); 297 saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
141 daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); 298 daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
142 299
143 metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port, 300 metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port,
144 0, flags, 301 0, flags,
145 key_id, 0); 302 key_id, 0);
303 } else {
304 NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst");
305 ret = -EINVAL;
306 goto err_out;
146 } 307 }
147 308
148 if (!metadata) { 309 if (!metadata) {
149 ret = -EINVAL; 310 NL_SET_ERR_MSG(extack, "Cannot allocate tunnel metadata dst");
311 ret = -ENOMEM;
150 goto err_out; 312 goto err_out;
151 } 313 }
152 314
315 if (opts_len) {
316 ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS],
317 &metadata->u.tun_info,
318 opts_len, extack);
319 if (ret < 0)
320 goto err_out;
321 }
322
153 metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; 323 metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
154 break; 324 break;
155 default: 325 default:
326 NL_SET_ERR_MSG(extack, "Unknown tunnel key action");
156 ret = -EINVAL; 327 ret = -EINVAL;
157 goto err_out; 328 goto err_out;
158 } 329 }
@@ -160,36 +331,36 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
160 if (!exists) { 331 if (!exists) {
161 ret = tcf_idr_create(tn, parm->index, est, a, 332 ret = tcf_idr_create(tn, parm->index, est, a,
162 &act_tunnel_key_ops, bind, true); 333 &act_tunnel_key_ops, bind, true);
163 if (ret) 334 if (ret) {
164 return ret; 335 NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
336 goto err_out;
337 }
165 338
166 ret = ACT_P_CREATED; 339 ret = ACT_P_CREATED;
167 } else { 340 } else if (!ovr) {
168 tcf_idr_release(*a, bind); 341 tcf_idr_release(*a, bind);
169 if (!ovr) 342 NL_SET_ERR_MSG(extack, "TC IDR already exists");
170 return -EEXIST; 343 return -EEXIST;
171 } 344 }
172 345
173 t = to_tunnel_key(*a); 346 t = to_tunnel_key(*a);
174 347
175 ASSERT_RTNL();
176 params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); 348 params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
177 if (unlikely(!params_new)) { 349 if (unlikely(!params_new)) {
178 if (ret == ACT_P_CREATED) 350 tcf_idr_release(*a, bind);
179 tcf_idr_release(*a, bind); 351 NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters");
180 return -ENOMEM; 352 return -ENOMEM;
181 } 353 }
182
183 params_old = rtnl_dereference(t->params);
184
185 params_new->action = parm->action;
186 params_new->tcft_action = parm->t_action; 354 params_new->tcft_action = parm->t_action;
187 params_new->tcft_enc_metadata = metadata; 355 params_new->tcft_enc_metadata = metadata;
188 356
189 rcu_assign_pointer(t->params, params_new); 357 spin_lock_bh(&t->tcf_lock);
190 358 t->tcf_action = parm->action;
191 if (params_old) 359 rcu_swap_protected(t->params, params_new,
192 kfree_rcu(params_old, rcu); 360 lockdep_is_held(&t->tcf_lock));
361 spin_unlock_bh(&t->tcf_lock);
362 if (params_new)
363 kfree_rcu(params_new, rcu);
193 364
194 if (ret == ACT_P_CREATED) 365 if (ret == ACT_P_CREATED)
195 tcf_idr_insert(tn, *a); 366 tcf_idr_insert(tn, *a);
@@ -199,6 +370,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
199err_out: 370err_out:
200 if (exists) 371 if (exists)
201 tcf_idr_release(*a, bind); 372 tcf_idr_release(*a, bind);
373 else
374 tcf_idr_cleanup(tn, parm->index);
202 return ret; 375 return ret;
203} 376}
204 377
@@ -216,6 +389,61 @@ static void tunnel_key_release(struct tc_action *a)
216 } 389 }
217} 390}
218 391
392static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
393 const struct ip_tunnel_info *info)
394{
395 int len = info->options_len;
396 u8 *src = (u8 *)(info + 1);
397 struct nlattr *start;
398
399 start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE);
400 if (!start)
401 return -EMSGSIZE;
402
403 while (len > 0) {
404 struct geneve_opt *opt = (struct geneve_opt *)src;
405
406 if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS,
407 opt->opt_class) ||
408 nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE,
409 opt->type) ||
410 nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA,
411 opt->length * 4, opt + 1))
412 return -EMSGSIZE;
413
414 len -= sizeof(struct geneve_opt) + opt->length * 4;
415 src += sizeof(struct geneve_opt) + opt->length * 4;
416 }
417
418 nla_nest_end(skb, start);
419 return 0;
420}
421
422static int tunnel_key_opts_dump(struct sk_buff *skb,
423 const struct ip_tunnel_info *info)
424{
425 struct nlattr *start;
426 int err;
427
428 if (!info->options_len)
429 return 0;
430
431 start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS);
432 if (!start)
433 return -EMSGSIZE;
434
435 if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
436 err = tunnel_key_geneve_opts_dump(skb, info);
437 if (err)
438 return err;
439 } else {
440 return -EINVAL;
441 }
442
443 nla_nest_end(skb, start);
444 return 0;
445}
446
219static int tunnel_key_dump_addresses(struct sk_buff *skb, 447static int tunnel_key_dump_addresses(struct sk_buff *skb,
220 const struct ip_tunnel_info *info) 448 const struct ip_tunnel_info *info)
221{ 449{
@@ -252,22 +480,24 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
252 struct tcf_tunnel_key_params *params; 480 struct tcf_tunnel_key_params *params;
253 struct tc_tunnel_key opt = { 481 struct tc_tunnel_key opt = {
254 .index = t->tcf_index, 482 .index = t->tcf_index,
255 .refcnt = t->tcf_refcnt - ref, 483 .refcnt = refcount_read(&t->tcf_refcnt) - ref,
256 .bindcnt = t->tcf_bindcnt - bind, 484 .bindcnt = atomic_read(&t->tcf_bindcnt) - bind,
257 }; 485 };
258 struct tcf_t tm; 486 struct tcf_t tm;
259 487
260 params = rtnl_dereference(t->params); 488 spin_lock_bh(&t->tcf_lock);
261 489 params = rcu_dereference_protected(t->params,
490 lockdep_is_held(&t->tcf_lock));
491 opt.action = t->tcf_action;
262 opt.t_action = params->tcft_action; 492 opt.t_action = params->tcft_action;
263 opt.action = params->action;
264 493
265 if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) 494 if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
266 goto nla_put_failure; 495 goto nla_put_failure;
267 496
268 if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { 497 if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) {
269 struct ip_tunnel_key *key = 498 struct ip_tunnel_info *info =
270 &params->tcft_enc_metadata->u.tun_info.key; 499 &params->tcft_enc_metadata->u.tun_info;
500 struct ip_tunnel_key *key = &info->key;
271 __be32 key_id = tunnel_id_to_key32(key->tun_id); 501 __be32 key_id = tunnel_id_to_key32(key->tun_id);
272 502
273 if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || 503 if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
@@ -275,7 +505,14 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
275 &params->tcft_enc_metadata->u.tun_info) || 505 &params->tcft_enc_metadata->u.tun_info) ||
276 nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) || 506 nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) ||
277 nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, 507 nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
278 !(key->tun_flags & TUNNEL_CSUM))) 508 !(key->tun_flags & TUNNEL_CSUM)) ||
509 tunnel_key_opts_dump(skb, info))
510 goto nla_put_failure;
511
512 if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos))
513 goto nla_put_failure;
514
515 if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl))
279 goto nla_put_failure; 516 goto nla_put_failure;
280 } 517 }
281 518
@@ -283,10 +520,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
283 if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), 520 if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
284 &tm, TCA_TUNNEL_KEY_PAD)) 521 &tm, TCA_TUNNEL_KEY_PAD))
285 goto nla_put_failure; 522 goto nla_put_failure;
523 spin_unlock_bh(&t->tcf_lock);
286 524
287 return skb->len; 525 return skb->len;
288 526
289nla_put_failure: 527nla_put_failure:
528 spin_unlock_bh(&t->tcf_lock);
290 nlmsg_trim(skb, b); 529 nlmsg_trim(skb, b);
291 return -1; 530 return -1;
292} 531}
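
tunnel_key_copy_geneve_opt() above accepts a payload only if it is at least 4 bytes and a multiple of 4, and accounts sizeof(struct geneve_opt) + data_len per option, with opt->length stored in 4-byte units. A standalone sketch of that sizing rule (the header layout is mirrored here for illustration; the real definition lives in the geneve headers):

#include <stdint.h>
#include <stdio.h>

/* Illustrative mirror of the 4-byte geneve option header: 2-byte class,
 * 1-byte type, one byte holding reserved bits plus a 5-bit length counted
 * in 4-byte units; the payload follows immediately.
 */
struct geneve_opt_hdr {
    uint16_t opt_class;
    uint8_t  type;
    uint8_t  rsvd_and_length;
};

static int geneve_opt_len(int data_len)
{
    if (data_len < 4 || data_len % 4)
        return -1; /* the patch returns -ERANGE for these */
    return (int)sizeof(struct geneve_opt_hdr) + data_len;
}

int main(void)
{
    printf("8-byte payload   -> %d bytes per option\n", geneve_opt_len(8));
    printf("6-byte payload   -> %d (rejected)\n", geneve_opt_len(6));
    printf("128-byte payload -> %d bytes (the policy caps data at 128)\n",
           geneve_opt_len(128));
    return 0;
}
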
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 1fb39e1f9d07..033d273afe50 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -22,8 +22,8 @@
22static unsigned int vlan_net_id; 22static unsigned int vlan_net_id;
23static struct tc_action_ops act_vlan_ops; 23static struct tc_action_ops act_vlan_ops;
24 24
25static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, 25static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
26 struct tcf_result *res) 26 struct tcf_result *res)
27{ 27{
28 struct tcf_vlan *v = to_vlan(a); 28 struct tcf_vlan *v = to_vlan(a);
29 struct tcf_vlan_params *p; 29 struct tcf_vlan_params *p;
@@ -40,11 +40,9 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
40 if (skb_at_tc_ingress(skb)) 40 if (skb_at_tc_ingress(skb))
41 skb_push_rcsum(skb, skb->mac_len); 41 skb_push_rcsum(skb, skb->mac_len);
42 42
43 rcu_read_lock();
44
45 action = READ_ONCE(v->tcf_action); 43 action = READ_ONCE(v->tcf_action);
46 44
47 p = rcu_dereference(v->vlan_p); 45 p = rcu_dereference_bh(v->vlan_p);
48 46
49 switch (p->tcfv_action) { 47 switch (p->tcfv_action) {
50 case TCA_VLAN_ACT_POP: 48 case TCA_VLAN_ACT_POP:
@@ -61,7 +59,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
61 case TCA_VLAN_ACT_MODIFY: 59 case TCA_VLAN_ACT_MODIFY:
62 /* No-op if no vlan tag (either hw-accel or in-payload) */ 60 /* No-op if no vlan tag (either hw-accel or in-payload) */
63 if (!skb_vlan_tagged(skb)) 61 if (!skb_vlan_tagged(skb))
64 goto unlock; 62 goto out;
65 /* extract existing tag (and guarantee no hw-accel tag) */ 63 /* extract existing tag (and guarantee no hw-accel tag) */
66 if (skb_vlan_tag_present(skb)) { 64 if (skb_vlan_tag_present(skb)) {
67 tci = skb_vlan_tag_get(skb); 65 tci = skb_vlan_tag_get(skb);
@@ -86,18 +84,15 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
86 BUG(); 84 BUG();
87 } 85 }
88 86
89 goto unlock; 87out:
90
91drop:
92 action = TC_ACT_SHOT;
93 qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
94
95unlock:
96 rcu_read_unlock();
97 if (skb_at_tc_ingress(skb)) 88 if (skb_at_tc_ingress(skb))
98 skb_pull_rcsum(skb, skb->mac_len); 89 skb_pull_rcsum(skb, skb->mac_len);
99 90
100 return action; 91 return action;
92
93drop:
94 qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
95 return TC_ACT_SHOT;
101} 96}
102 97
103static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { 98static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
@@ -109,11 +104,12 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
109 104
110static int tcf_vlan_init(struct net *net, struct nlattr *nla, 105static int tcf_vlan_init(struct net *net, struct nlattr *nla,
111 struct nlattr *est, struct tc_action **a, 106 struct nlattr *est, struct tc_action **a,
112 int ovr, int bind, struct netlink_ext_ack *extack) 107 int ovr, int bind, bool rtnl_held,
108 struct netlink_ext_ack *extack)
113{ 109{
114 struct tc_action_net *tn = net_generic(net, vlan_net_id); 110 struct tc_action_net *tn = net_generic(net, vlan_net_id);
115 struct nlattr *tb[TCA_VLAN_MAX + 1]; 111 struct nlattr *tb[TCA_VLAN_MAX + 1];
116 struct tcf_vlan_params *p, *p_old; 112 struct tcf_vlan_params *p;
117 struct tc_vlan *parm; 113 struct tc_vlan *parm;
118 struct tcf_vlan *v; 114 struct tcf_vlan *v;
119 int action; 115 int action;
@@ -133,7 +129,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
133 if (!tb[TCA_VLAN_PARMS]) 129 if (!tb[TCA_VLAN_PARMS])
134 return -EINVAL; 130 return -EINVAL;
135 parm = nla_data(tb[TCA_VLAN_PARMS]); 131 parm = nla_data(tb[TCA_VLAN_PARMS]);
136 exists = tcf_idr_check(tn, parm->index, a, bind); 132 err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
133 if (err < 0)
134 return err;
135 exists = err;
137 if (exists && bind) 136 if (exists && bind)
138 return 0; 137 return 0;
139 138
@@ -145,12 +144,16 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
145 if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { 144 if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
146 if (exists) 145 if (exists)
147 tcf_idr_release(*a, bind); 146 tcf_idr_release(*a, bind);
147 else
148 tcf_idr_cleanup(tn, parm->index);
148 return -EINVAL; 149 return -EINVAL;
149 } 150 }
150 push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); 151 push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
151 if (push_vid >= VLAN_VID_MASK) { 152 if (push_vid >= VLAN_VID_MASK) {
152 if (exists) 153 if (exists)
153 tcf_idr_release(*a, bind); 154 tcf_idr_release(*a, bind);
155 else
156 tcf_idr_cleanup(tn, parm->index);
154 return -ERANGE; 157 return -ERANGE;
155 } 158 }
156 159
@@ -163,6 +166,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
163 default: 166 default:
164 if (exists) 167 if (exists)
165 tcf_idr_release(*a, bind); 168 tcf_idr_release(*a, bind);
169 else
170 tcf_idr_cleanup(tn, parm->index);
166 return -EPROTONOSUPPORT; 171 return -EPROTONOSUPPORT;
167 } 172 }
168 } else { 173 } else {
@@ -175,6 +180,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
175 default: 180 default:
176 if (exists) 181 if (exists)
177 tcf_idr_release(*a, bind); 182 tcf_idr_release(*a, bind);
183 else
184 tcf_idr_cleanup(tn, parm->index);
178 return -EINVAL; 185 return -EINVAL;
179 } 186 }
180 action = parm->v_action; 187 action = parm->v_action;
@@ -182,39 +189,37 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
182 if (!exists) { 189 if (!exists) {
183 ret = tcf_idr_create(tn, parm->index, est, a, 190 ret = tcf_idr_create(tn, parm->index, est, a,
184 &act_vlan_ops, bind, true); 191 &act_vlan_ops, bind, true);
185 if (ret) 192 if (ret) {
193 tcf_idr_cleanup(tn, parm->index);
186 return ret; 194 return ret;
195 }
187 196
188 ret = ACT_P_CREATED; 197 ret = ACT_P_CREATED;
189 } else { 198 } else if (!ovr) {
190 tcf_idr_release(*a, bind); 199 tcf_idr_release(*a, bind);
191 if (!ovr) 200 return -EEXIST;
192 return -EEXIST;
193 } 201 }
194 202
195 v = to_vlan(*a); 203 v = to_vlan(*a);
196 204
197 ASSERT_RTNL();
198 p = kzalloc(sizeof(*p), GFP_KERNEL); 205 p = kzalloc(sizeof(*p), GFP_KERNEL);
199 if (!p) { 206 if (!p) {
200 if (ret == ACT_P_CREATED) 207 tcf_idr_release(*a, bind);
201 tcf_idr_release(*a, bind);
202 return -ENOMEM; 208 return -ENOMEM;
203 } 209 }
204 210
205 v->tcf_action = parm->action;
206
207 p_old = rtnl_dereference(v->vlan_p);
208
209 p->tcfv_action = action; 211 p->tcfv_action = action;
210 p->tcfv_push_vid = push_vid; 212 p->tcfv_push_vid = push_vid;
211 p->tcfv_push_prio = push_prio; 213 p->tcfv_push_prio = push_prio;
212 p->tcfv_push_proto = push_proto; 214 p->tcfv_push_proto = push_proto;
213 215
214 rcu_assign_pointer(v->vlan_p, p); 216 spin_lock_bh(&v->tcf_lock);
217 v->tcf_action = parm->action;
218 rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
219 spin_unlock_bh(&v->tcf_lock);
215 220
216 if (p_old) 221 if (p)
217 kfree_rcu(p_old, rcu); 222 kfree_rcu(p, rcu);
218 223
219 if (ret == ACT_P_CREATED) 224 if (ret == ACT_P_CREATED)
220 tcf_idr_insert(tn, *a); 225 tcf_idr_insert(tn, *a);
@@ -236,16 +241,18 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
236{ 241{
237 unsigned char *b = skb_tail_pointer(skb); 242 unsigned char *b = skb_tail_pointer(skb);
238 struct tcf_vlan *v = to_vlan(a); 243 struct tcf_vlan *v = to_vlan(a);
239 struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); 244 struct tcf_vlan_params *p;
240 struct tc_vlan opt = { 245 struct tc_vlan opt = {
241 .index = v->tcf_index, 246 .index = v->tcf_index,
242 .refcnt = v->tcf_refcnt - ref, 247 .refcnt = refcount_read(&v->tcf_refcnt) - ref,
243 .bindcnt = v->tcf_bindcnt - bind, 248 .bindcnt = atomic_read(&v->tcf_bindcnt) - bind,
244 .action = v->tcf_action,
245 .v_action = p->tcfv_action,
246 }; 249 };
247 struct tcf_t t; 250 struct tcf_t t;
248 251
252 spin_lock_bh(&v->tcf_lock);
253 opt.action = v->tcf_action;
254 p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock));
255 opt.v_action = p->tcfv_action;
249 if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) 256 if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
250 goto nla_put_failure; 257 goto nla_put_failure;
251 258
@@ -261,9 +268,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
261 tcf_tm_dump(&t, &v->tcf_tm); 268 tcf_tm_dump(&t, &v->tcf_tm);
262 if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) 269 if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
263 goto nla_put_failure; 270 goto nla_put_failure;
271 spin_unlock_bh(&v->tcf_lock);
272
264 return skb->len; 273 return skb->len;
265 274
266nla_put_failure: 275nla_put_failure:
276 spin_unlock_bh(&v->tcf_lock);
267 nlmsg_trim(skb, b); 277 nlmsg_trim(skb, b);
268 return -1; 278 return -1;
269} 279}
@@ -290,7 +300,7 @@ static struct tc_action_ops act_vlan_ops = {
290 .kind = "vlan", 300 .kind = "vlan",
291 .type = TCA_ACT_VLAN, 301 .type = TCA_ACT_VLAN,
292 .owner = THIS_MODULE, 302 .owner = THIS_MODULE,
293 .act = tcf_vlan, 303 .act = tcf_vlan_act,
294 .dump = tcf_vlan_dump, 304 .dump = tcf_vlan_dump,
295 .init = tcf_vlan_init, 305 .init = tcf_vlan_init,
296 .cleanup = tcf_vlan_cleanup, 306 .cleanup = tcf_vlan_cleanup,
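
In the vlan init path (and likewise in skbmod and tunnel_key) the new parameter block is published with rcu_swap_protected() while tcf_lock is held; after the swap the local variable holds the previous block, which is what the following kfree_rcu() releases. A small standalone model of that swap-then-free-the-old-one idiom (plain pointers and free(), no RCU, illustrative only):

#include <stdio.h>
#include <stdlib.h>

struct vlan_params { int vid; };

/* Models rcu_swap_protected(v->vlan_p, p, ...): publishes the new block and
 * leaves the previously published one in the local variable.
 */
#define SWAP_PROTECTED(published, local)              \
    do {                                              \
        struct vlan_params *__old = (published);      \
        (published) = (local);                        \
        (local) = __old;                              \
    } while (0)

int main(void)
{
    struct vlan_params *published = NULL;
    struct vlan_params *p = malloc(sizeof(*p));

    p->vid = 100;
    SWAP_PROTECTED(published, p);
    free(p); /* p now holds the old value (NULL), like kfree_rcu(p, rcu) on it */

    p = malloc(sizeof(*p));
    p->vid = 200;
    SWAP_PROTECTED(published, p);
    printf("published vid=%d, freeing old vid=%d\n", published->vid, p->vid);
    free(p);         /* frees the vid=100 block, not the one just installed */
    free(published); /* cleanup of the live block, outside the pattern */
    return 0;
}
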
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index cdc3c87c53e6..31bd1439cf60 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -39,7 +39,7 @@ static DEFINE_RWLOCK(cls_mod_lock);
39 39
40/* Find classifier type by string name */ 40/* Find classifier type by string name */
41 41
42static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind) 42static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind)
43{ 43{
44 const struct tcf_proto_ops *t, *res = NULL; 44 const struct tcf_proto_ops *t, *res = NULL;
45 45
@@ -57,6 +57,33 @@ static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
57 return res; 57 return res;
58} 58}
59 59
60static const struct tcf_proto_ops *
61tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack)
62{
63 const struct tcf_proto_ops *ops;
64
65 ops = __tcf_proto_lookup_ops(kind);
66 if (ops)
67 return ops;
68#ifdef CONFIG_MODULES
69 rtnl_unlock();
70 request_module("cls_%s", kind);
71 rtnl_lock();
72 ops = __tcf_proto_lookup_ops(kind);
73 /* We dropped the RTNL semaphore in order to perform
74 * the module load. So, even if we succeeded in loading
75 * the module we have to replay the request. We indicate
76 * this using -EAGAIN.
77 */
78 if (ops) {
79 module_put(ops->owner);
80 return ERR_PTR(-EAGAIN);
81 }
82#endif
83 NL_SET_ERR_MSG(extack, "TC classifier not found");
84 return ERR_PTR(-ENOENT);
85}
86
60/* Register(unregister) new classifier type */ 87/* Register(unregister) new classifier type */
61 88
62int register_tcf_proto_ops(struct tcf_proto_ops *ops) 89int register_tcf_proto_ops(struct tcf_proto_ops *ops)
@@ -133,27 +160,9 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
133 if (!tp) 160 if (!tp)
134 return ERR_PTR(-ENOBUFS); 161 return ERR_PTR(-ENOBUFS);
135 162
136 err = -ENOENT; 163 tp->ops = tcf_proto_lookup_ops(kind, extack);
137 tp->ops = tcf_proto_lookup_ops(kind); 164 if (IS_ERR(tp->ops)) {
138 if (!tp->ops) { 165 err = PTR_ERR(tp->ops);
139#ifdef CONFIG_MODULES
140 rtnl_unlock();
141 request_module("cls_%s", kind);
142 rtnl_lock();
143 tp->ops = tcf_proto_lookup_ops(kind);
144 /* We dropped the RTNL semaphore in order to perform
145 * the module load. So, even if we succeeded in loading
146 * the module we have to replay the request. We indicate
147 * this using -EAGAIN.
148 */
149 if (tp->ops) {
150 module_put(tp->ops->owner);
151 err = -EAGAIN;
152 } else {
153 NL_SET_ERR_MSG(extack, "TC classifier not found");
154 err = -ENOENT;
155 }
156#endif
157 goto errout; 166 goto errout;
158 } 167 }
159 tp->classify = tp->ops->classify; 168 tp->classify = tp->ops->classify;
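
tcf_proto_lookup_ops() now wraps the module autoload dance: when the classifier is missing it drops RTNL, request_module()s cls_<kind>, re-takes RTNL, and even on a successful load returns -EAGAIN so the request is replayed from the top rather than continuing under a lock that was dropped mid-operation. A tiny standalone sketch of that replay contract (the retry is modeled as a plain loop here, illustrative names only):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool module_loaded;

/* Stand-in for the lookup: the first call "loads the module" and asks for a
 * replay; the second call finds the ops and succeeds.
 */
static int lookup_ops(const char *kind)
{
    (void)kind;
    if (!module_loaded) {
        module_loaded = true; /* models request_module("cls_%s", kind) */
        return -EAGAIN;
    }
    return 0;
}

int main(void)
{
    int err;

    do {
        err = lookup_ops("flower");
        printf("lookup returned %d%s\n", err,
               err == -EAGAIN ? " (replaying the request)" : "");
    } while (err == -EAGAIN);

    return 0;
}
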
@@ -195,11 +204,12 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
195 chain = kzalloc(sizeof(*chain), GFP_KERNEL); 204 chain = kzalloc(sizeof(*chain), GFP_KERNEL);
196 if (!chain) 205 if (!chain)
197 return NULL; 206 return NULL;
198 INIT_LIST_HEAD(&chain->filter_chain_list);
199 list_add_tail(&chain->list, &block->chain_list); 207 list_add_tail(&chain->list, &block->chain_list);
200 chain->block = block; 208 chain->block = block;
201 chain->index = chain_index; 209 chain->index = chain_index;
202 chain->refcnt = 1; 210 chain->refcnt = 1;
211 if (!chain->index)
212 block->chain0.chain = chain;
203 return chain; 213 return chain;
204} 214}
205 215
@@ -209,35 +219,28 @@ static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item,
209 if (item->chain_head_change) 219 if (item->chain_head_change)
210 item->chain_head_change(tp_head, item->chain_head_change_priv); 220 item->chain_head_change(tp_head, item->chain_head_change_priv);
211} 221}
212static void tcf_chain_head_change(struct tcf_chain *chain, 222
213 struct tcf_proto *tp_head) 223static void tcf_chain0_head_change(struct tcf_chain *chain,
224 struct tcf_proto *tp_head)
214{ 225{
215 struct tcf_filter_chain_list_item *item; 226 struct tcf_filter_chain_list_item *item;
227 struct tcf_block *block = chain->block;
216 228
217 list_for_each_entry(item, &chain->filter_chain_list, list) 229 if (chain->index)
230 return;
231 list_for_each_entry(item, &block->chain0.filter_chain_list, list)
218 tcf_chain_head_change_item(item, tp_head); 232 tcf_chain_head_change_item(item, tp_head);
219} 233}
220 234
221static void tcf_chain_flush(struct tcf_chain *chain)
222{
223 struct tcf_proto *tp = rtnl_dereference(chain->filter_chain);
224
225 tcf_chain_head_change(chain, NULL);
226 while (tp) {
227 RCU_INIT_POINTER(chain->filter_chain, tp->next);
228 tcf_proto_destroy(tp, NULL);
229 tp = rtnl_dereference(chain->filter_chain);
230 tcf_chain_put(chain);
231 }
232}
233
234static void tcf_chain_destroy(struct tcf_chain *chain) 235static void tcf_chain_destroy(struct tcf_chain *chain)
235{ 236{
236 struct tcf_block *block = chain->block; 237 struct tcf_block *block = chain->block;
237 238
238 list_del(&chain->list); 239 list_del(&chain->list);
240 if (!chain->index)
241 block->chain0.chain = NULL;
239 kfree(chain); 242 kfree(chain);
240 if (list_empty(&block->chain_list)) 243 if (list_empty(&block->chain_list) && block->refcnt == 0)
241 kfree(block); 244 kfree(block);
242} 245}
243 246
@@ -246,28 +249,119 @@ static void tcf_chain_hold(struct tcf_chain *chain)
246 ++chain->refcnt; 249 ++chain->refcnt;
247} 250}
248 251
249struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, 252static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain)
250 bool create) 253{
254 /* In case all the references are action references, this
255 * chain should not be shown to the user.
256 */
257 return chain->refcnt == chain->action_refcnt;
258}
259
260static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block,
261 u32 chain_index)
251{ 262{
252 struct tcf_chain *chain; 263 struct tcf_chain *chain;
253 264
254 list_for_each_entry(chain, &block->chain_list, list) { 265 list_for_each_entry(chain, &block->chain_list, list) {
255 if (chain->index == chain_index) { 266 if (chain->index == chain_index)
256 tcf_chain_hold(chain);
257 return chain; 267 return chain;
258 }
259 } 268 }
269 return NULL;
270}
260 271
261 return create ? tcf_chain_create(block, chain_index) : NULL; 272static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
273 u32 seq, u16 flags, int event, bool unicast);
274
275static struct tcf_chain *__tcf_chain_get(struct tcf_block *block,
276 u32 chain_index, bool create,
277 bool by_act)
278{
279 struct tcf_chain *chain = tcf_chain_lookup(block, chain_index);
280
281 if (chain) {
282 tcf_chain_hold(chain);
283 } else {
284 if (!create)
285 return NULL;
286 chain = tcf_chain_create(block, chain_index);
287 if (!chain)
288 return NULL;
289 }
290
291 if (by_act)
292 ++chain->action_refcnt;
293
294 /* Send notification only in case we got the first
295 * non-action reference. Until then, the chain acts only as
296 * a placeholder for actions pointing to it and user ought
297 * not know about them.
298 */
299 if (chain->refcnt - chain->action_refcnt == 1 && !by_act)
300 tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
301 RTM_NEWCHAIN, false);
302
303 return chain;
262} 304}
263EXPORT_SYMBOL(tcf_chain_get);
264 305
265void tcf_chain_put(struct tcf_chain *chain) 306static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
307 bool create)
266{ 308{
267 if (--chain->refcnt == 0) 309 return __tcf_chain_get(block, chain_index, create, false);
310}
311
312struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index)
313{
314 return __tcf_chain_get(block, chain_index, true, true);
315}
316EXPORT_SYMBOL(tcf_chain_get_by_act);
317
318static void tc_chain_tmplt_del(struct tcf_chain *chain);
319
320static void __tcf_chain_put(struct tcf_chain *chain, bool by_act)
321{
322 if (by_act)
323 chain->action_refcnt--;
324 chain->refcnt--;
325
326 /* The last dropped non-action reference will trigger notification. */
327 if (chain->refcnt - chain->action_refcnt == 0 && !by_act)
328 tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false);
329
330 if (chain->refcnt == 0) {
331 tc_chain_tmplt_del(chain);
268 tcf_chain_destroy(chain); 332 tcf_chain_destroy(chain);
333 }
334}
335
336static void tcf_chain_put(struct tcf_chain *chain)
337{
338 __tcf_chain_put(chain, false);
339}
340
341void tcf_chain_put_by_act(struct tcf_chain *chain)
342{
343 __tcf_chain_put(chain, true);
344}
345EXPORT_SYMBOL(tcf_chain_put_by_act);
346
347static void tcf_chain_put_explicitly_created(struct tcf_chain *chain)
348{
349 if (chain->explicitly_created)
350 tcf_chain_put(chain);
351}
352
353static void tcf_chain_flush(struct tcf_chain *chain)
354{
355 struct tcf_proto *tp = rtnl_dereference(chain->filter_chain);
356
357 tcf_chain0_head_change(chain, NULL);
358 while (tp) {
359 RCU_INIT_POINTER(chain->filter_chain, tp->next);
360 tcf_proto_destroy(tp, NULL);
361 tp = rtnl_dereference(chain->filter_chain);
362 tcf_chain_put(chain);
363 }
269} 364}
270EXPORT_SYMBOL(tcf_chain_put);
271 365
272static bool tcf_block_offload_in_use(struct tcf_block *block) 366static bool tcf_block_offload_in_use(struct tcf_block *block)
273{ 367{
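
__tcf_chain_get() and __tcf_chain_put() above split the chain reference count: action_refcnt tracks references held by actions, and RTM_NEWCHAIN/RTM_DELCHAIN are sent only when the first non-action reference appears or the last one goes away, so chains pinned solely by actions stay invisible to userspace. A small standalone model of that accounting (illustrative only):

#include <stdbool.h>
#include <stdio.h>

struct chain { int refcnt, action_refcnt; };

static void chain_get(struct chain *c, bool by_act)
{
    c->refcnt++;
    if (by_act)
        c->action_refcnt++;
    /* First non-action reference: the chain becomes visible. */
    if (!by_act && c->refcnt - c->action_refcnt == 1)
        printf("notify RTM_NEWCHAIN\n");
}

static void chain_put(struct chain *c, bool by_act)
{
    if (by_act)
        c->action_refcnt--;
    c->refcnt--;
    /* Last non-action reference gone: the chain is hidden again. */
    if (!by_act && c->refcnt - c->action_refcnt == 0)
        printf("notify RTM_DELCHAIN\n");
    if (c->refcnt == 0)
        printf("destroy chain\n");
}

int main(void)
{
    struct chain c = { 0, 0 };

    chain_get(&c, true);  /* held only by an action: stays invisible */
    chain_get(&c, false); /* a filter reference arrives: NEWCHAIN    */
    chain_put(&c, false); /* filter gone: DELCHAIN, still pinned     */
    chain_put(&c, true);  /* action gone too: the chain is destroyed */
    return 0;
}
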
@@ -277,18 +371,21 @@ static bool tcf_block_offload_in_use(struct tcf_block *block)
277static int tcf_block_offload_cmd(struct tcf_block *block, 371static int tcf_block_offload_cmd(struct tcf_block *block,
278 struct net_device *dev, 372 struct net_device *dev,
279 struct tcf_block_ext_info *ei, 373 struct tcf_block_ext_info *ei,
280 enum tc_block_command command) 374 enum tc_block_command command,
375 struct netlink_ext_ack *extack)
281{ 376{
282 struct tc_block_offload bo = {}; 377 struct tc_block_offload bo = {};
283 378
284 bo.command = command; 379 bo.command = command;
285 bo.binder_type = ei->binder_type; 380 bo.binder_type = ei->binder_type;
286 bo.block = block; 381 bo.block = block;
382 bo.extack = extack;
287 return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); 383 return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
288} 384}
289 385
290static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, 386static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
291 struct tcf_block_ext_info *ei) 387 struct tcf_block_ext_info *ei,
388 struct netlink_ext_ack *extack)
292{ 389{
293 struct net_device *dev = q->dev_queue->dev; 390 struct net_device *dev = q->dev_queue->dev;
294 int err; 391 int err;
@@ -299,10 +396,12 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
299 /* If tc offload feature is disabled and the block we try to bind 396 /* If tc offload feature is disabled and the block we try to bind
300 * to already has some offloaded filters, forbid to bind. 397 * to already has some offloaded filters, forbid to bind.
301 */ 398 */
302 if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) 399 if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) {
400 NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled");
303 return -EOPNOTSUPP; 401 return -EOPNOTSUPP;
402 }
304 403
305 err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND); 404 err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
306 if (err == -EOPNOTSUPP) 405 if (err == -EOPNOTSUPP)
307 goto no_offload_dev_inc; 406 goto no_offload_dev_inc;
308 return err; 407 return err;
@@ -322,7 +421,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
322 421
323 if (!dev->netdev_ops->ndo_setup_tc) 422 if (!dev->netdev_ops->ndo_setup_tc)
324 goto no_offload_dev_dec; 423 goto no_offload_dev_dec;
325 err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND); 424 err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
326 if (err == -EOPNOTSUPP) 425 if (err == -EOPNOTSUPP)
327 goto no_offload_dev_dec; 426 goto no_offload_dev_dec;
328 return; 427 return;
@@ -332,10 +431,11 @@ no_offload_dev_dec:
332} 431}
333 432
334static int 433static int
335tcf_chain_head_change_cb_add(struct tcf_chain *chain, 434tcf_chain0_head_change_cb_add(struct tcf_block *block,
336 struct tcf_block_ext_info *ei, 435 struct tcf_block_ext_info *ei,
337 struct netlink_ext_ack *extack) 436 struct netlink_ext_ack *extack)
338{ 437{
438 struct tcf_chain *chain0 = block->chain0.chain;
339 struct tcf_filter_chain_list_item *item; 439 struct tcf_filter_chain_list_item *item;
340 440
341 item = kmalloc(sizeof(*item), GFP_KERNEL); 441 item = kmalloc(sizeof(*item), GFP_KERNEL);
@@ -345,23 +445,25 @@ tcf_chain_head_change_cb_add(struct tcf_chain *chain,
345 } 445 }
346 item->chain_head_change = ei->chain_head_change; 446 item->chain_head_change = ei->chain_head_change;
347 item->chain_head_change_priv = ei->chain_head_change_priv; 447 item->chain_head_change_priv = ei->chain_head_change_priv;
348 if (chain->filter_chain) 448 if (chain0 && chain0->filter_chain)
349 tcf_chain_head_change_item(item, chain->filter_chain); 449 tcf_chain_head_change_item(item, chain0->filter_chain);
350 list_add(&item->list, &chain->filter_chain_list); 450 list_add(&item->list, &block->chain0.filter_chain_list);
351 return 0; 451 return 0;
352} 452}
353 453
354static void 454static void
355tcf_chain_head_change_cb_del(struct tcf_chain *chain, 455tcf_chain0_head_change_cb_del(struct tcf_block *block,
356 struct tcf_block_ext_info *ei) 456 struct tcf_block_ext_info *ei)
357{ 457{
458 struct tcf_chain *chain0 = block->chain0.chain;
358 struct tcf_filter_chain_list_item *item; 459 struct tcf_filter_chain_list_item *item;
359 460
360 list_for_each_entry(item, &chain->filter_chain_list, list) { 461 list_for_each_entry(item, &block->chain0.filter_chain_list, list) {
361 if ((!ei->chain_head_change && !ei->chain_head_change_priv) || 462 if ((!ei->chain_head_change && !ei->chain_head_change_priv) ||
362 (item->chain_head_change == ei->chain_head_change && 463 (item->chain_head_change == ei->chain_head_change &&
363 item->chain_head_change_priv == ei->chain_head_change_priv)) { 464 item->chain_head_change_priv == ei->chain_head_change_priv)) {
364 tcf_chain_head_change_item(item, NULL); 465 if (chain0)
466 tcf_chain_head_change_item(item, NULL);
365 list_del(&item->list); 467 list_del(&item->list);
366 kfree(item); 468 kfree(item);
367 return; 469 return;
@@ -397,8 +499,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
397 struct netlink_ext_ack *extack) 499 struct netlink_ext_ack *extack)
398{ 500{
399 struct tcf_block *block; 501 struct tcf_block *block;
400 struct tcf_chain *chain;
401 int err;
402 502
403 block = kzalloc(sizeof(*block), GFP_KERNEL); 503 block = kzalloc(sizeof(*block), GFP_KERNEL);
404 if (!block) { 504 if (!block) {
@@ -408,14 +508,8 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
408 INIT_LIST_HEAD(&block->chain_list); 508 INIT_LIST_HEAD(&block->chain_list);
409 INIT_LIST_HEAD(&block->cb_list); 509 INIT_LIST_HEAD(&block->cb_list);
410 INIT_LIST_HEAD(&block->owner_list); 510 INIT_LIST_HEAD(&block->owner_list);
511 INIT_LIST_HEAD(&block->chain0.filter_chain_list);
411 512
412 /* Create chain 0 by default, it has to be always present. */
413 chain = tcf_chain_create(block, 0);
414 if (!chain) {
415 NL_SET_ERR_MSG(extack, "Failed to create new tcf chain");
416 err = -ENOMEM;
417 goto err_chain_create;
418 }
419 block->refcnt = 1; 513 block->refcnt = 1;
420 block->net = net; 514 block->net = net;
421 block->index = block_index; 515 block->index = block_index;
@@ -424,10 +518,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
424 if (!tcf_block_shared(block)) 518 if (!tcf_block_shared(block))
425 block->q = q; 519 block->q = q;
426 return block; 520 return block;
427
428err_chain_create:
429 kfree(block);
430 return ERR_PTR(err);
431} 521}
432 522
433static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) 523static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
@@ -509,11 +599,6 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
509 return block; 599 return block;
510} 600}
511 601
512static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
513{
514 return list_first_entry(&block->chain_list, struct tcf_chain, list);
515}
516
517struct tcf_block_owner_item { 602struct tcf_block_owner_item {
518 struct list_head list; 603 struct list_head list;
519 struct Qdisc *q; 604 struct Qdisc *q;
@@ -607,12 +692,11 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
607 692
608 tcf_block_owner_netif_keep_dst(block, q, ei->binder_type); 693 tcf_block_owner_netif_keep_dst(block, q, ei->binder_type);
609 694
610 err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block), 695 err = tcf_chain0_head_change_cb_add(block, ei, extack);
611 ei, extack);
612 if (err) 696 if (err)
613 goto err_chain_head_change_cb_add; 697 goto err_chain0_head_change_cb_add;
614 698
615 err = tcf_block_offload_bind(block, q, ei); 699 err = tcf_block_offload_bind(block, q, ei, extack);
616 if (err) 700 if (err)
617 goto err_block_offload_bind; 701 goto err_block_offload_bind;
618 702
@@ -620,15 +704,14 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
620 return 0; 704 return 0;
621 705
622err_block_offload_bind: 706err_block_offload_bind:
623 tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); 707 tcf_chain0_head_change_cb_del(block, ei);
624err_chain_head_change_cb_add: 708err_chain0_head_change_cb_add:
625 tcf_block_owner_del(block, q, ei->binder_type); 709 tcf_block_owner_del(block, q, ei->binder_type);
626err_block_owner_add: 710err_block_owner_add:
627 if (created) { 711 if (created) {
628 if (tcf_block_shared(block)) 712 if (tcf_block_shared(block))
629 tcf_block_remove(block, net); 713 tcf_block_remove(block, net);
630err_block_insert: 714err_block_insert:
631 kfree(tcf_block_chain_zero(block));
632 kfree(block); 715 kfree(block);
633 } else { 716 } else {
634 block->refcnt--; 717 block->refcnt--;
@@ -668,10 +751,10 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
668 751
669 if (!block) 752 if (!block)
670 return; 753 return;
671 tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); 754 tcf_chain0_head_change_cb_del(block, ei);
672 tcf_block_owner_del(block, q, ei->binder_type); 755 tcf_block_owner_del(block, q, ei->binder_type);
673 756
674 if (--block->refcnt == 0) { 757 if (block->refcnt == 1) {
675 if (tcf_block_shared(block)) 758 if (tcf_block_shared(block))
676 tcf_block_remove(block, block->net); 759 tcf_block_remove(block, block->net);
677 760
@@ -687,13 +770,18 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
687 770
688 tcf_block_offload_unbind(block, q, ei); 771 tcf_block_offload_unbind(block, q, ei);
689 772
690 if (block->refcnt == 0) { 773 if (block->refcnt == 1) {
691 /* At this point, all the chains should have refcnt >= 1. */ 774 /* At this point, all the chains should have refcnt >= 1. */
692 list_for_each_entry_safe(chain, tmp, &block->chain_list, list) 775 list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
776 tcf_chain_put_explicitly_created(chain);
693 tcf_chain_put(chain); 777 tcf_chain_put(chain);
778 }
694 779
695 /* Finally, put chain 0 and allow block to be freed. */ 780 block->refcnt--;
696 tcf_chain_put(tcf_block_chain_zero(block)); 781 if (list_empty(&block->chain_list))
782 kfree(block);
783 } else {
784 block->refcnt--;
697 } 785 }
698} 786}
699EXPORT_SYMBOL(tcf_block_put_ext); 787EXPORT_SYMBOL(tcf_block_put_ext);
@@ -746,18 +834,53 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
746} 834}
747EXPORT_SYMBOL(tcf_block_cb_decref); 835EXPORT_SYMBOL(tcf_block_cb_decref);
748 836
837static int
838tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb,
839 void *cb_priv, bool add, bool offload_in_use,
840 struct netlink_ext_ack *extack)
841{
842 struct tcf_chain *chain;
843 struct tcf_proto *tp;
844 int err;
845
846 list_for_each_entry(chain, &block->chain_list, list) {
847 for (tp = rtnl_dereference(chain->filter_chain); tp;
848 tp = rtnl_dereference(tp->next)) {
849 if (tp->ops->reoffload) {
850 err = tp->ops->reoffload(tp, add, cb, cb_priv,
851 extack);
852 if (err && add)
853 goto err_playback_remove;
854 } else if (add && offload_in_use) {
855 err = -EOPNOTSUPP;
856 NL_SET_ERR_MSG(extack, "Filter HW offload failed - classifier without re-offloading support");
857 goto err_playback_remove;
858 }
859 }
860 }
861
862 return 0;
863
864err_playback_remove:
865 tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use,
866 extack);
867 return err;
868}
869
749struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, 870struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
750 tc_setup_cb_t *cb, void *cb_ident, 871 tc_setup_cb_t *cb, void *cb_ident,
751 void *cb_priv) 872 void *cb_priv,
873 struct netlink_ext_ack *extack)
752{ 874{
753 struct tcf_block_cb *block_cb; 875 struct tcf_block_cb *block_cb;
876 int err;
754 877
755 /* At this point, playback of previous block cb calls is not supported, 878 /* Replay any already present rules */
756 * so forbid to register to block which already has some offloaded 879 err = tcf_block_playback_offloads(block, cb, cb_priv, true,
757 * filters present. 880 tcf_block_offload_in_use(block),
758 */ 881 extack);
759 if (tcf_block_offload_in_use(block)) 882 if (err)
760 return ERR_PTR(-EOPNOTSUPP); 883 return ERR_PTR(err);
761 884
762 block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); 885 block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
763 if (!block_cb) 886 if (!block_cb)
@@ -772,17 +895,22 @@ EXPORT_SYMBOL(__tcf_block_cb_register);
772 895
773int tcf_block_cb_register(struct tcf_block *block, 896int tcf_block_cb_register(struct tcf_block *block,
774 tc_setup_cb_t *cb, void *cb_ident, 897 tc_setup_cb_t *cb, void *cb_ident,
775 void *cb_priv) 898 void *cb_priv, struct netlink_ext_ack *extack)
776{ 899{
777 struct tcf_block_cb *block_cb; 900 struct tcf_block_cb *block_cb;
778 901
779 block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv); 902 block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv,
780 return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0; 903 extack);
904 return PTR_ERR_OR_ZERO(block_cb);
781} 905}
782EXPORT_SYMBOL(tcf_block_cb_register); 906EXPORT_SYMBOL(tcf_block_cb_register);
783 907
784void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) 908void __tcf_block_cb_unregister(struct tcf_block *block,
909 struct tcf_block_cb *block_cb)
785{ 910{
911 tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv,
912 false, tcf_block_offload_in_use(block),
913 NULL);
786 list_del(&block_cb->list); 914 list_del(&block_cb->list);
787 kfree(block_cb); 915 kfree(block_cb);
788} 916}
@@ -796,7 +924,7 @@ void tcf_block_cb_unregister(struct tcf_block *block,
796 block_cb = tcf_block_cb_lookup(block, cb, cb_ident); 924 block_cb = tcf_block_cb_lookup(block, cb, cb_ident);
797 if (!block_cb) 925 if (!block_cb)
798 return; 926 return;
799 __tcf_block_cb_unregister(block_cb); 927 __tcf_block_cb_unregister(block, block_cb);
800} 928}
801EXPORT_SYMBOL(tcf_block_cb_unregister); 929EXPORT_SYMBOL(tcf_block_cb_unregister);
802 930
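A hedged driver-side sketch, not from this patch, of how the extended registration and extack plumbing fit together. The foo_* names are placeholder assumptions; only tcf_block_cb_register()/tcf_block_cb_unregister() and struct tc_block_offload reflect the interfaces touched above.

	/* Hypothetical "foo" driver; assumes <linux/netdevice.h> and
	 * <net/pkt_cls.h>.  A real driver reaches this from .ndo_setup_tc
	 * with type == TC_SETUP_BLOCK.
	 */
	static int foo_setup_tc_cb(enum tc_setup_type type, void *type_data,
				   void *cb_priv)
	{
		/* Per-classifier offload requests (and, with this patch,
		 * replayed pre-existing filters) arrive here.
		 */
		return -EOPNOTSUPP;
	}

	static int foo_setup_tc_block(struct net_device *dev,
				      struct tc_block_offload *bo)
	{
		if (bo->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) {
			NL_SET_ERR_MSG(bo->extack,
				       "Only ingress block offload is supported");
			return -EOPNOTSUPP;
		}

		switch (bo->command) {
		case TC_BLOCK_BIND:
			/* Registration now replays filters already present on
			 * the block through foo_setup_tc_cb(); errors are
			 * reported via the extack carried in bo.
			 */
			return tcf_block_cb_register(bo->block, foo_setup_tc_cb,
						     dev, dev, bo->extack);
		case TC_BLOCK_UNBIND:
			tcf_block_cb_unregister(bo->block, foo_setup_tc_cb, dev);
			return 0;
		default:
			return -EOPNOTSUPP;
		}
	}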
@@ -893,7 +1021,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain,
893 struct tcf_proto *tp) 1021 struct tcf_proto *tp)
894{ 1022{
895 if (*chain_info->pprev == chain->filter_chain) 1023 if (*chain_info->pprev == chain->filter_chain)
896 tcf_chain_head_change(chain, tp); 1024 tcf_chain0_head_change(chain, tp);
897 RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); 1025 RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info));
898 rcu_assign_pointer(*chain_info->pprev, tp); 1026 rcu_assign_pointer(*chain_info->pprev, tp);
899 tcf_chain_hold(chain); 1027 tcf_chain_hold(chain);
@@ -906,7 +1034,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain,
906 struct tcf_proto *next = rtnl_dereference(chain_info->next); 1034 struct tcf_proto *next = rtnl_dereference(chain_info->next);
907 1035
908 if (tp == chain->filter_chain) 1036 if (tp == chain->filter_chain)
909 tcf_chain_head_change(chain, next); 1037 tcf_chain0_head_change(chain, next);
910 RCU_INIT_POINTER(*chain_info->pprev, next); 1038 RCU_INIT_POINTER(*chain_info->pprev, next);
911 tcf_chain_put(chain); 1039 tcf_chain_put(chain);
912} 1040}
@@ -1053,7 +1181,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
1053 for (tp = rtnl_dereference(chain->filter_chain); 1181 for (tp = rtnl_dereference(chain->filter_chain);
1054 tp; tp = rtnl_dereference(tp->next)) 1182 tp; tp = rtnl_dereference(tp->next))
1055 tfilter_notify(net, oskb, n, tp, block, 1183 tfilter_notify(net, oskb, n, tp, block,
1056 q, parent, 0, event, false); 1184 q, parent, NULL, event, false);
1057} 1185}
1058 1186
1059static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, 1187static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
@@ -1182,6 +1310,12 @@ replay:
1182 goto errout; 1310 goto errout;
1183 } 1311 }
1184 1312
1313 if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) {
1314 NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind");
1315 err = -EINVAL;
1316 goto errout;
1317 }
1318
1185 err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, 1319 err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
1186 n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE, 1320 n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE,
1187 extack); 1321 extack);
@@ -1257,6 +1391,13 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
1257 } 1391 }
1258 chain = tcf_chain_get(block, chain_index, false); 1392 chain = tcf_chain_get(block, chain_index, false);
1259 if (!chain) { 1393 if (!chain) {
1394 /* User requested flush on non-existent chain. Nothing to do,
1395 * so just return success.
1396 */
1397 if (prio == 0) {
1398 err = 0;
1399 goto errout;
1400 }
1260 NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); 1401 NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
1261 err = -EINVAL; 1402 err = -EINVAL;
1262 goto errout; 1403 goto errout;
@@ -1444,7 +1585,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
1444 memset(&cb->args[1], 0, 1585 memset(&cb->args[1], 0,
1445 sizeof(cb->args) - sizeof(cb->args[0])); 1586 sizeof(cb->args) - sizeof(cb->args[0]));
1446 if (cb->args[1] == 0) { 1587 if (cb->args[1] == 0) {
1447 if (tcf_fill_node(net, skb, tp, block, q, parent, 0, 1588 if (tcf_fill_node(net, skb, tp, block, q, parent, NULL,
1448 NETLINK_CB(cb->skb).portid, 1589 NETLINK_CB(cb->skb).portid,
1449 cb->nlh->nlmsg_seq, NLM_F_MULTI, 1590 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1450 RTM_NEWTFILTER) <= 0) 1591 RTM_NEWTFILTER) <= 0)
@@ -1463,7 +1604,9 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
1463 arg.w.stop = 0; 1604 arg.w.stop = 0;
1464 arg.w.skip = cb->args[1] - 1; 1605 arg.w.skip = cb->args[1] - 1;
1465 arg.w.count = 0; 1606 arg.w.count = 0;
1607 arg.w.cookie = cb->args[2];
1466 tp->ops->walk(tp, &arg.w); 1608 tp->ops->walk(tp, &arg.w);
1609 cb->args[2] = arg.w.cookie;
1467 cb->args[1] = arg.w.count + 1; 1610 cb->args[1] = arg.w.count + 1;
1468 if (arg.w.stop) 1611 if (arg.w.stop)
1469 return false; 1612 return false;
@@ -1561,14 +1704,334 @@ out:
1561 return skb->len; 1704 return skb->len;
1562} 1705}
1563 1706
1707static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net,
1708 struct sk_buff *skb, struct tcf_block *block,
1709 u32 portid, u32 seq, u16 flags, int event)
1710{
1711 unsigned char *b = skb_tail_pointer(skb);
1712 const struct tcf_proto_ops *ops;
1713 struct nlmsghdr *nlh;
1714 struct tcmsg *tcm;
1715 void *priv;
1716
1717 ops = chain->tmplt_ops;
1718 priv = chain->tmplt_priv;
1719
1720 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1721 if (!nlh)
1722 goto out_nlmsg_trim;
1723 tcm = nlmsg_data(nlh);
1724 tcm->tcm_family = AF_UNSPEC;
1725 tcm->tcm__pad1 = 0;
1726 tcm->tcm__pad2 = 0;
1727 tcm->tcm_handle = 0;
1728 if (block->q) {
1729 tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex;
1730 tcm->tcm_parent = block->q->handle;
1731 } else {
1732 tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK;
1733 tcm->tcm_block_index = block->index;
1734 }
1735
1736 if (nla_put_u32(skb, TCA_CHAIN, chain->index))
1737 goto nla_put_failure;
1738
1739 if (ops) {
1740 if (nla_put_string(skb, TCA_KIND, ops->kind))
1741 goto nla_put_failure;
1742 if (ops->tmplt_dump(skb, net, priv) < 0)
1743 goto nla_put_failure;
1744 }
1745
1746 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1747 return skb->len;
1748
1749out_nlmsg_trim:
1750nla_put_failure:
1751 nlmsg_trim(skb, b);
1752 return -EMSGSIZE;
1753}
1754
1755static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
1756 u32 seq, u16 flags, int event, bool unicast)
1757{
1758 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1759 struct tcf_block *block = chain->block;
1760 struct net *net = block->net;
1761 struct sk_buff *skb;
1762
1763 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1764 if (!skb)
1765 return -ENOBUFS;
1766
1767 if (tc_chain_fill_node(chain, net, skb, block, portid,
1768 seq, flags, event) <= 0) {
1769 kfree_skb(skb);
1770 return -EINVAL;
1771 }
1772
1773 if (unicast)
1774 return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
1775
1776 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO);
1777}
1778
1779static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net,
1780 struct nlattr **tca,
1781 struct netlink_ext_ack *extack)
1782{
1783 const struct tcf_proto_ops *ops;
1784 void *tmplt_priv;
1785
1786 /* If kind is not set, user did not specify template. */
1787 if (!tca[TCA_KIND])
1788 return 0;
1789
1790 ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), extack);
1791 if (IS_ERR(ops))
1792 return PTR_ERR(ops);
1793 if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) {
1794 NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier");
1795 return -EOPNOTSUPP;
1796 }
1797
1798 tmplt_priv = ops->tmplt_create(net, chain, tca, extack);
1799 if (IS_ERR(tmplt_priv)) {
1800 module_put(ops->owner);
1801 return PTR_ERR(tmplt_priv);
1802 }
1803 chain->tmplt_ops = ops;
1804 chain->tmplt_priv = tmplt_priv;
1805 return 0;
1806}
1807
1808static void tc_chain_tmplt_del(struct tcf_chain *chain)
1809{
1810 const struct tcf_proto_ops *ops = chain->tmplt_ops;
1811
 1812	/* If template ops are not set, there is no work to do for us. */
1813 if (!ops)
1814 return;
1815
1816 ops->tmplt_destroy(chain->tmplt_priv);
1817 module_put(ops->owner);
1818}
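tc_chain_tmplt_add() above only accepts a template if the classifier provides all three template callbacks. A hedged sketch of the corresponding tcf_proto_ops wiring follows; the example_* names are placeholders and the callback signatures are inferred from the call sites in this hunk.

	/* Placeholder classifier; only the template-related members are shown.
	 * Signatures follow the uses above: ops->tmplt_create(net, chain, tca,
	 * extack), ops->tmplt_destroy(priv), ops->tmplt_dump(skb, net, priv).
	 */
	static void *example_tmplt_create(struct net *net, struct tcf_chain *chain,
					  struct nlattr **tca,
					  struct netlink_ext_ack *extack);
	static void example_tmplt_destroy(void *tmplt_priv);
	static int example_tmplt_dump(struct sk_buff *skb, struct net *net,
				      void *tmplt_priv);

	static struct tcf_proto_ops example_ops __read_mostly = {
		.kind		= "example",
		.owner		= THIS_MODULE,
		/* .classify / .init / .change / .delete / .walk / .dump ... */
		.tmplt_create	= example_tmplt_create,
		.tmplt_destroy	= example_tmplt_destroy,
		.tmplt_dump	= example_tmplt_dump,
	};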
1819
1820/* Add/delete/get a chain */
1821
1822static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n,
1823 struct netlink_ext_ack *extack)
1824{
1825 struct net *net = sock_net(skb->sk);
1826 struct nlattr *tca[TCA_MAX + 1];
1827 struct tcmsg *t;
1828 u32 parent;
1829 u32 chain_index;
1830 struct Qdisc *q = NULL;
1831 struct tcf_chain *chain = NULL;
1832 struct tcf_block *block;
1833 unsigned long cl;
1834 int err;
1835
1836 if (n->nlmsg_type != RTM_GETCHAIN &&
1837 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1838 return -EPERM;
1839
1840replay:
1841 err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
1842 if (err < 0)
1843 return err;
1844
1845 t = nlmsg_data(n);
1846 parent = t->tcm_parent;
1847 cl = 0;
1848
1849 block = tcf_block_find(net, &q, &parent, &cl,
1850 t->tcm_ifindex, t->tcm_block_index, extack);
1851 if (IS_ERR(block))
1852 return PTR_ERR(block);
1853
1854 chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
1855 if (chain_index > TC_ACT_EXT_VAL_MASK) {
1856 NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
1857 return -EINVAL;
1858 }
1859 chain = tcf_chain_lookup(block, chain_index);
1860 if (n->nlmsg_type == RTM_NEWCHAIN) {
1861 if (chain) {
1862 if (tcf_chain_held_by_acts_only(chain)) {
1863 /* The chain exists only because there is
1864 * some action referencing it.
1865 */
1866 tcf_chain_hold(chain);
1867 } else {
1868 NL_SET_ERR_MSG(extack, "Filter chain already exists");
1869 return -EEXIST;
1870 }
1871 } else {
1872 if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1873 NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain");
1874 return -ENOENT;
1875 }
1876 chain = tcf_chain_create(block, chain_index);
1877 if (!chain) {
1878 NL_SET_ERR_MSG(extack, "Failed to create filter chain");
1879 return -ENOMEM;
1880 }
1881 }
1882 } else {
1883 if (!chain || tcf_chain_held_by_acts_only(chain)) {
1884 NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
1885 return -EINVAL;
1886 }
1887 tcf_chain_hold(chain);
1888 }
1889
1890 switch (n->nlmsg_type) {
1891 case RTM_NEWCHAIN:
1892 err = tc_chain_tmplt_add(chain, net, tca, extack);
1893 if (err)
1894 goto errout;
1895 /* In case the chain was successfully added, take a reference
1896 * to the chain. This ensures that an empty chain
1897 * does not disappear at the end of this function.
1898 */
1899 tcf_chain_hold(chain);
1900 chain->explicitly_created = true;
1901 tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
1902 RTM_NEWCHAIN, false);
1903 break;
1904 case RTM_DELCHAIN:
1905 /* Flush the chain first as the user requested chain removal. */
1906 tcf_chain_flush(chain);
1907 /* In case the chain was successfully deleted, put a reference
1908 * to the chain previously taken during addition.
1909 */
1910 tcf_chain_put_explicitly_created(chain);
1911 chain->explicitly_created = false;
1912 break;
1913 case RTM_GETCHAIN:
1914 err = tc_chain_notify(chain, skb, n->nlmsg_seq,
1915 n->nlmsg_seq, n->nlmsg_type, true);
1916 if (err < 0)
1917 NL_SET_ERR_MSG(extack, "Failed to send chain notify message");
1918 break;
1919 default:
1920 err = -EOPNOTSUPP;
1921 NL_SET_ERR_MSG(extack, "Unsupported message type");
1922 goto errout;
1923 }
1924
1925errout:
1926 tcf_chain_put(chain);
1927 if (err == -EAGAIN)
1928 /* Replay the request. */
1929 goto replay;
1930 return err;
1931}
1932
1933/* called with RTNL */
1934static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
1935{
1936 struct net *net = sock_net(skb->sk);
1937 struct nlattr *tca[TCA_MAX + 1];
1938 struct Qdisc *q = NULL;
1939 struct tcf_block *block;
1940 struct tcf_chain *chain;
1941 struct tcmsg *tcm = nlmsg_data(cb->nlh);
1942 long index_start;
1943 long index;
1944 u32 parent;
1945 int err;
1946
1947 if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1948 return skb->len;
1949
1950 err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL);
1951 if (err)
1952 return err;
1953
1954 if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
1955 block = tcf_block_lookup(net, tcm->tcm_block_index);
1956 if (!block)
1957 goto out;
1958 /* If we work with block index, q is NULL and parent value
1959 * will never be used in the following code. The check
1960 * in tcf_fill_node prevents it. However, compiler does not
1961 * see that far, so set parent to zero to silence the warning
1962 * about parent being uninitialized.
1963 */
1964 parent = 0;
1965 } else {
1966 const struct Qdisc_class_ops *cops;
1967 struct net_device *dev;
1968 unsigned long cl = 0;
1969
1970 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1971 if (!dev)
1972 return skb->len;
1973
1974 parent = tcm->tcm_parent;
1975 if (!parent) {
1976 q = dev->qdisc;
1977 parent = q->handle;
1978 } else {
1979 q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
1980 }
1981 if (!q)
1982 goto out;
1983 cops = q->ops->cl_ops;
1984 if (!cops)
1985 goto out;
1986 if (!cops->tcf_block)
1987 goto out;
1988 if (TC_H_MIN(tcm->tcm_parent)) {
1989 cl = cops->find(q, tcm->tcm_parent);
1990 if (cl == 0)
1991 goto out;
1992 }
1993 block = cops->tcf_block(q, cl, NULL);
1994 if (!block)
1995 goto out;
1996 if (tcf_block_shared(block))
1997 q = NULL;
1998 }
1999
2000 index_start = cb->args[0];
2001 index = 0;
2002
2003 list_for_each_entry(chain, &block->chain_list, list) {
2004 if ((tca[TCA_CHAIN] &&
2005 nla_get_u32(tca[TCA_CHAIN]) != chain->index))
2006 continue;
2007 if (index < index_start) {
2008 index++;
2009 continue;
2010 }
2011 if (tcf_chain_held_by_acts_only(chain))
2012 continue;
2013 err = tc_chain_fill_node(chain, net, skb, block,
2014 NETLINK_CB(cb->skb).portid,
2015 cb->nlh->nlmsg_seq, NLM_F_MULTI,
2016 RTM_NEWCHAIN);
2017 if (err <= 0)
2018 break;
2019 index++;
2020 }
2021
2022 cb->args[0] = index;
2023
2024out:
 2025	/* If we made no progress, the error (EMSGSIZE) is real */
2026 if (skb->len == 0 && err)
2027 return err;
2028 return skb->len;
2029}
2030
1564void tcf_exts_destroy(struct tcf_exts *exts) 2031void tcf_exts_destroy(struct tcf_exts *exts)
1565{ 2032{
1566#ifdef CONFIG_NET_CLS_ACT 2033#ifdef CONFIG_NET_CLS_ACT
1567 LIST_HEAD(actions); 2034 tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
1568
1569 ASSERT_RTNL();
1570 tcf_exts_to_list(exts, &actions);
1571 tcf_action_destroy(&actions, TCA_ACT_UNBIND);
1572 kfree(exts->actions); 2035 kfree(exts->actions);
1573 exts->nr_actions = 0; 2036 exts->nr_actions = 0;
1574#endif 2037#endif
@@ -1587,7 +2050,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
1587 if (exts->police && tb[exts->police]) { 2050 if (exts->police && tb[exts->police]) {
1588 act = tcf_action_init_1(net, tp, tb[exts->police], 2051 act = tcf_action_init_1(net, tp, tb[exts->police],
1589 rate_tlv, "police", ovr, 2052 rate_tlv, "police", ovr,
1590 TCA_ACT_BIND, extack); 2053 TCA_ACT_BIND, true, extack);
1591 if (IS_ERR(act)) 2054 if (IS_ERR(act))
1592 return PTR_ERR(act); 2055 return PTR_ERR(act);
1593 2056
@@ -1595,17 +2058,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
1595 exts->actions[0] = act; 2058 exts->actions[0] = act;
1596 exts->nr_actions = 1; 2059 exts->nr_actions = 1;
1597 } else if (exts->action && tb[exts->action]) { 2060 } else if (exts->action && tb[exts->action]) {
1598 LIST_HEAD(actions); 2061 int err;
1599 int err, i = 0;
1600 2062
1601 err = tcf_action_init(net, tp, tb[exts->action], 2063 err = tcf_action_init(net, tp, tb[exts->action],
1602 rate_tlv, NULL, ovr, TCA_ACT_BIND, 2064 rate_tlv, NULL, ovr, TCA_ACT_BIND,
1603 &actions, &attr_size, extack); 2065 exts->actions, &attr_size, true,
1604 if (err) 2066 extack);
2067 if (err < 0)
1605 return err; 2068 return err;
1606 list_for_each_entry(act, &actions, list) 2069 exts->nr_actions = err;
1607 exts->actions[i++] = act;
1608 exts->nr_actions = i;
1609 } 2070 }
1610 exts->net = net; 2071 exts->net = net;
1611 } 2072 }
@@ -1654,14 +2115,11 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
1654 * tc data even if iproute2 was newer - jhs 2115 * tc data even if iproute2 was newer - jhs
1655 */ 2116 */
1656 if (exts->type != TCA_OLD_COMPAT) { 2117 if (exts->type != TCA_OLD_COMPAT) {
1657 LIST_HEAD(actions);
1658
1659 nest = nla_nest_start(skb, exts->action); 2118 nest = nla_nest_start(skb, exts->action);
1660 if (nest == NULL) 2119 if (nest == NULL)
1661 goto nla_put_failure; 2120 goto nla_put_failure;
1662 2121
1663 tcf_exts_to_list(exts, &actions); 2122 if (tcf_action_dump(skb, exts->actions, 0, 0) < 0)
1664 if (tcf_action_dump(skb, &actions, 0, 0) < 0)
1665 goto nla_put_failure; 2123 goto nla_put_failure;
1666 nla_nest_end(skb, nest); 2124 nla_nest_end(skb, nest);
1667 } else if (exts->police) { 2125 } else if (exts->police) {
@@ -1718,6 +2176,7 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts,
1718 if (!dev) 2176 if (!dev)
1719 continue; 2177 continue;
1720 ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); 2178 ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop);
2179 a->ops->put_dev(dev);
1721 if (ret < 0) 2180 if (ret < 0)
1722 return ret; 2181 return ret;
1723 ok_count += ret; 2182 ok_count += ret;
@@ -1786,6 +2245,10 @@ static int __init tc_filter_init(void)
1786 rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0); 2245 rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
1787 rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, 2246 rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
1788 tc_dump_tfilter, 0); 2247 tc_dump_tfilter, 0);
2248 rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0);
2249 rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0);
2250 rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain,
2251 tc_dump_chain, 0);
1789 2252
1790 return 0; 2253 return 0;
1791 2254
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 95367f37098d..6a5dce8baf19 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -324,4 +324,3 @@ static void __exit exit_basic(void)
324module_init(init_basic) 324module_init(init_basic)
325module_exit(exit_basic) 325module_exit(exit_basic)
326MODULE_LICENSE("GPL"); 326MODULE_LICENSE("GPL");
327
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 1aa7f6511065..fa6fe2fe0f32 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -43,6 +43,7 @@ struct cls_bpf_prog {
43 struct tcf_result res; 43 struct tcf_result res;
44 bool exts_integrated; 44 bool exts_integrated;
45 u32 gen_flags; 45 u32 gen_flags;
46 unsigned int in_hw_count;
46 struct tcf_exts exts; 47 struct tcf_exts exts;
47 u32 handle; 48 u32 handle;
48 u16 bpf_num_ops; 49 u16 bpf_num_ops;
@@ -174,6 +175,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
174 cls_bpf_offload_cmd(tp, oldprog, prog, extack); 175 cls_bpf_offload_cmd(tp, oldprog, prog, extack);
175 return err; 176 return err;
176 } else if (err > 0) { 177 } else if (err > 0) {
178 prog->in_hw_count = err;
177 tcf_block_offload_inc(block, &prog->gen_flags); 179 tcf_block_offload_inc(block, &prog->gen_flags);
178 } 180 }
179 } 181 }
@@ -347,12 +349,10 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
347 if (bpf_size != nla_len(tb[TCA_BPF_OPS])) 349 if (bpf_size != nla_len(tb[TCA_BPF_OPS]))
348 return -EINVAL; 350 return -EINVAL;
349 351
350 bpf_ops = kzalloc(bpf_size, GFP_KERNEL); 352 bpf_ops = kmemdup(nla_data(tb[TCA_BPF_OPS]), bpf_size, GFP_KERNEL);
351 if (bpf_ops == NULL) 353 if (bpf_ops == NULL)
352 return -ENOMEM; 354 return -ENOMEM;
353 355
354 memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
355
356 fprog_tmp.len = bpf_num_ops; 356 fprog_tmp.len = bpf_num_ops;
357 fprog_tmp.filter = bpf_ops; 357 fprog_tmp.filter = bpf_ops;
358 358
@@ -652,6 +652,42 @@ skip:
652 } 652 }
653} 653}
654 654
655static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
656 void *cb_priv, struct netlink_ext_ack *extack)
657{
658 struct cls_bpf_head *head = rtnl_dereference(tp->root);
659 struct tcf_block *block = tp->chain->block;
660 struct tc_cls_bpf_offload cls_bpf = {};
661 struct cls_bpf_prog *prog;
662 int err;
663
664 list_for_each_entry(prog, &head->plist, link) {
665 if (tc_skip_hw(prog->gen_flags))
666 continue;
667
668 tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags,
669 extack);
670 cls_bpf.command = TC_CLSBPF_OFFLOAD;
671 cls_bpf.exts = &prog->exts;
672 cls_bpf.prog = add ? prog->filter : NULL;
673 cls_bpf.oldprog = add ? NULL : prog->filter;
674 cls_bpf.name = prog->bpf_name;
675 cls_bpf.exts_integrated = prog->exts_integrated;
676
677 err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv);
678 if (err) {
679 if (add && tc_skip_sw(prog->gen_flags))
680 return err;
681 continue;
682 }
683
684 tc_cls_offload_cnt_update(block, &prog->in_hw_count,
685 &prog->gen_flags, add);
686 }
687
688 return 0;
689}
690
655static struct tcf_proto_ops cls_bpf_ops __read_mostly = { 691static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
656 .kind = "bpf", 692 .kind = "bpf",
657 .owner = THIS_MODULE, 693 .owner = THIS_MODULE,
@@ -662,6 +698,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
662 .change = cls_bpf_change, 698 .change = cls_bpf_change,
663 .delete = cls_bpf_delete, 699 .delete = cls_bpf_delete,
664 .walk = cls_bpf_walk, 700 .walk = cls_bpf_walk,
701 .reoffload = cls_bpf_reoffload,
665 .dump = cls_bpf_dump, 702 .dump = cls_bpf_dump,
666 .bind_class = cls_bpf_bind_class, 703 .bind_class = cls_bpf_bind_class,
667}; 704};
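cls_bpf_reoffload() relies on tc_cls_offload_cnt_update(), which lives in include/net/pkt_cls.h rather than in this file. Roughly, and hedged as a reconstruction rather than a verbatim copy, it keeps the per-filter hardware counter and the block-wide offload counter consistent while callbacks are replayed:

	static inline void
	tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt,
				  u32 *flags, bool add)
	{
		if (add) {
			if (!*cnt)
				tcf_block_offload_inc(block, flags);
			(*cnt)++;
		} else {
			(*cnt)--;
			if (!*cnt)
				tcf_block_offload_dec(block, flags);
		}
	}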
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 2b5be42a9f1c..6fd9bdd93796 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -24,6 +24,7 @@
24#include <net/pkt_cls.h> 24#include <net/pkt_cls.h>
25#include <net/ip.h> 25#include <net/ip.h>
26#include <net/flow_dissector.h> 26#include <net/flow_dissector.h>
27#include <net/geneve.h>
27 28
28#include <net/dst.h> 29#include <net/dst.h>
29#include <net/dst_metadata.h> 30#include <net/dst_metadata.h>
@@ -35,6 +36,7 @@ struct fl_flow_key {
35 struct flow_dissector_key_basic basic; 36 struct flow_dissector_key_basic basic;
36 struct flow_dissector_key_eth_addrs eth; 37 struct flow_dissector_key_eth_addrs eth;
37 struct flow_dissector_key_vlan vlan; 38 struct flow_dissector_key_vlan vlan;
39 struct flow_dissector_key_vlan cvlan;
38 union { 40 union {
39 struct flow_dissector_key_ipv4_addrs ipv4; 41 struct flow_dissector_key_ipv4_addrs ipv4;
40 struct flow_dissector_key_ipv6_addrs ipv6; 42 struct flow_dissector_key_ipv6_addrs ipv6;
@@ -51,6 +53,8 @@ struct fl_flow_key {
51 struct flow_dissector_key_mpls mpls; 53 struct flow_dissector_key_mpls mpls;
52 struct flow_dissector_key_tcp tcp; 54 struct flow_dissector_key_tcp tcp;
53 struct flow_dissector_key_ip ip; 55 struct flow_dissector_key_ip ip;
56 struct flow_dissector_key_ip enc_ip;
57 struct flow_dissector_key_enc_opts enc_opts;
54} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ 58} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
55 59
56struct fl_flow_mask_range { 60struct fl_flow_mask_range {
@@ -66,10 +70,17 @@ struct fl_flow_mask {
66 struct rhashtable_params filter_ht_params; 70 struct rhashtable_params filter_ht_params;
67 struct flow_dissector dissector; 71 struct flow_dissector dissector;
68 struct list_head filters; 72 struct list_head filters;
69 struct rcu_head rcu; 73 struct rcu_work rwork;
70 struct list_head list; 74 struct list_head list;
71}; 75};
72 76
77struct fl_flow_tmplt {
78 struct fl_flow_key dummy_key;
79 struct fl_flow_key mask;
80 struct flow_dissector dissector;
81 struct tcf_chain *chain;
82};
83
73struct cls_fl_head { 84struct cls_fl_head {
74 struct rhashtable ht; 85 struct rhashtable ht;
75 struct list_head masks; 86 struct list_head masks;
@@ -87,6 +98,7 @@ struct cls_fl_filter {
87 struct list_head list; 98 struct list_head list;
88 u32 handle; 99 u32 handle;
89 u32 flags; 100 u32 flags;
101 unsigned int in_hw_count;
90 struct rcu_work rwork; 102 struct rcu_work rwork;
91 struct net_device *hw_dev; 103 struct net_device *hw_dev;
92}; 104};
@@ -144,6 +156,23 @@ static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key,
144 *lmkey++ = *lkey++ & *lmask++; 156 *lmkey++ = *lkey++ & *lmask++;
145} 157}
146 158
159static bool fl_mask_fits_tmplt(struct fl_flow_tmplt *tmplt,
160 struct fl_flow_mask *mask)
161{
162 const long *lmask = fl_key_get_start(&mask->key, mask);
163 const long *ltmplt;
164 int i;
165
166 if (!tmplt)
167 return true;
168 ltmplt = fl_key_get_start(&tmplt->mask, mask);
169 for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) {
170 if (~*ltmplt++ & *lmask++)
171 return false;
172 }
173 return true;
174}
175
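As a worked illustration (not part of the patch) of the bitwise check above: the loop rejects a filter mask that sets any bit the template mask leaves clear.

	/*
	 * template mask:  eth.dst = ff:ff:ff:ff:ff:ff, everything else 0
	 *
	 *   filter mask eth.dst = ff:ff:ff:ff:ff:00  ->  ~tmplt & mask == 0, fits
	 *   filter mask ipv4.dst = 255.255.255.255   ->  ~tmplt & mask != 0, rejected
	 */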
147static void fl_clear_masked_range(struct fl_flow_key *key, 176static void fl_clear_masked_range(struct fl_flow_key *key,
148 struct fl_flow_mask *mask) 177 struct fl_flow_mask *mask)
149{ 178{
@@ -203,6 +232,20 @@ static int fl_init(struct tcf_proto *tp)
203 return rhashtable_init(&head->ht, &mask_ht_params); 232 return rhashtable_init(&head->ht, &mask_ht_params);
204} 233}
205 234
235static void fl_mask_free(struct fl_flow_mask *mask)
236{
237 rhashtable_destroy(&mask->ht);
238 kfree(mask);
239}
240
241static void fl_mask_free_work(struct work_struct *work)
242{
243 struct fl_flow_mask *mask = container_of(to_rcu_work(work),
244 struct fl_flow_mask, rwork);
245
246 fl_mask_free(mask);
247}
248
206static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask, 249static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
207 bool async) 250 bool async)
208{ 251{
@@ -210,12 +253,11 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
210 return false; 253 return false;
211 254
212 rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params); 255 rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params);
213 rhashtable_destroy(&mask->ht);
214 list_del_rcu(&mask->list); 256 list_del_rcu(&mask->list);
215 if (async) 257 if (async)
216 kfree_rcu(mask, rcu); 258 tcf_queue_work(&mask->rwork, fl_mask_free_work);
217 else 259 else
218 kfree(mask); 260 fl_mask_free(mask);
219 261
220 return true; 262 return true;
221} 263}
@@ -276,6 +318,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
276 fl_hw_destroy_filter(tp, f, NULL); 318 fl_hw_destroy_filter(tp, f, NULL);
277 return err; 319 return err;
278 } else if (err > 0) { 320 } else if (err > 0) {
321 f->in_hw_count = err;
279 tcf_block_offload_inc(block, &f->flags); 322 tcf_block_offload_inc(block, &f->flags);
280 } 323 }
281 324
@@ -434,6 +477,28 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
434 [TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 }, 477 [TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 },
435 [TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 }, 478 [TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 },
436 [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 }, 479 [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 },
480 [TCA_FLOWER_KEY_CVLAN_ID] = { .type = NLA_U16 },
481 [TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 },
482 [TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 },
483 [TCA_FLOWER_KEY_ENC_IP_TOS] = { .type = NLA_U8 },
484 [TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 },
485 [TCA_FLOWER_KEY_ENC_IP_TTL] = { .type = NLA_U8 },
486 [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
487 [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED },
488 [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED },
489};
490
491static const struct nla_policy
492enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
493 [TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
494};
495
496static const struct nla_policy
497geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = {
498 [TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
499 [TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
500 [TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY,
501 .len = 128 },
437}; 502};
438 503
439static void fl_set_key_val(struct nlattr **tb, 504static void fl_set_key_val(struct nlattr **tb,
@@ -485,22 +550,26 @@ static int fl_set_key_mpls(struct nlattr **tb,
485} 550}
486 551
487static void fl_set_key_vlan(struct nlattr **tb, 552static void fl_set_key_vlan(struct nlattr **tb,
553 __be16 ethertype,
554 int vlan_id_key, int vlan_prio_key,
488 struct flow_dissector_key_vlan *key_val, 555 struct flow_dissector_key_vlan *key_val,
489 struct flow_dissector_key_vlan *key_mask) 556 struct flow_dissector_key_vlan *key_mask)
490{ 557{
491#define VLAN_PRIORITY_MASK 0x7 558#define VLAN_PRIORITY_MASK 0x7
492 559
493 if (tb[TCA_FLOWER_KEY_VLAN_ID]) { 560 if (tb[vlan_id_key]) {
494 key_val->vlan_id = 561 key_val->vlan_id =
495 nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK; 562 nla_get_u16(tb[vlan_id_key]) & VLAN_VID_MASK;
496 key_mask->vlan_id = VLAN_VID_MASK; 563 key_mask->vlan_id = VLAN_VID_MASK;
497 } 564 }
498 if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) { 565 if (tb[vlan_prio_key]) {
499 key_val->vlan_priority = 566 key_val->vlan_priority =
500 nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) & 567 nla_get_u8(tb[vlan_prio_key]) &
501 VLAN_PRIORITY_MASK; 568 VLAN_PRIORITY_MASK;
502 key_mask->vlan_priority = VLAN_PRIORITY_MASK; 569 key_mask->vlan_priority = VLAN_PRIORITY_MASK;
503 } 570 }
571 key_val->vlan_tpid = ethertype;
572 key_mask->vlan_tpid = cpu_to_be16(~0);
504} 573}
505 574
506static void fl_set_key_flag(u32 flower_key, u32 flower_mask, 575static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
@@ -538,17 +607,156 @@ static int fl_set_key_flags(struct nlattr **tb,
538 return 0; 607 return 0;
539} 608}
540 609
541static void fl_set_key_ip(struct nlattr **tb, 610static void fl_set_key_ip(struct nlattr **tb, bool encap,
542 struct flow_dissector_key_ip *key, 611 struct flow_dissector_key_ip *key,
543 struct flow_dissector_key_ip *mask) 612 struct flow_dissector_key_ip *mask)
544{ 613{
545 fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS, 614 int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
546 &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK, 615 int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
547 sizeof(key->tos)); 616 int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
617 int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
548 618
549 fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, 619 fl_set_key_val(tb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos));
550 &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK, 620 fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl));
551 sizeof(key->ttl)); 621}
622
623static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
624 int depth, int option_len,
625 struct netlink_ext_ack *extack)
626{
627 struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1];
628 struct nlattr *class = NULL, *type = NULL, *data = NULL;
629 struct geneve_opt *opt;
630 int err, data_len = 0;
631
632 if (option_len > sizeof(struct geneve_opt))
633 data_len = option_len - sizeof(struct geneve_opt);
634
635 opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len];
636 memset(opt, 0xff, option_len);
637 opt->length = data_len / 4;
638 opt->r1 = 0;
639 opt->r2 = 0;
640 opt->r3 = 0;
641
 642	/* If no mask has been provided, we assume an exact match. */
643 if (!depth)
644 return sizeof(struct geneve_opt) + data_len;
645
646 if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_GENEVE) {
647 NL_SET_ERR_MSG(extack, "Non-geneve option type for mask");
648 return -EINVAL;
649 }
650
651 err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX,
652 nla, geneve_opt_policy, extack);
653 if (err < 0)
654 return err;
655
656 /* We are not allowed to omit any of CLASS, TYPE or DATA
657 * fields from the key.
658 */
659 if (!option_len &&
660 (!tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] ||
661 !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] ||
662 !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA])) {
663 NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data");
664 return -EINVAL;
665 }
666
667 /* Omitting any of CLASS, TYPE or DATA fields is allowed
668 * for the mask.
669 */
670 if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA]) {
671 int new_len = key->enc_opts.len;
672
673 data = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA];
674 data_len = nla_len(data);
675 if (data_len < 4) {
676 NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long");
677 return -ERANGE;
678 }
679 if (data_len % 4) {
680 NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long");
681 return -ERANGE;
682 }
683
684 new_len += sizeof(struct geneve_opt) + data_len;
685 BUILD_BUG_ON(FLOW_DIS_TUN_OPTS_MAX != IP_TUNNEL_OPTS_MAX);
686 if (new_len > FLOW_DIS_TUN_OPTS_MAX) {
687 NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size");
688 return -ERANGE;
689 }
690 opt->length = data_len / 4;
691 memcpy(opt->opt_data, nla_data(data), data_len);
692 }
693
694 if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS]) {
695 class = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS];
696 opt->opt_class = nla_get_be16(class);
697 }
698
699 if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE]) {
700 type = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE];
701 opt->type = nla_get_u8(type);
702 }
703
704 return sizeof(struct geneve_opt) + data_len;
705}
706
707static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
708 struct fl_flow_key *mask,
709 struct netlink_ext_ack *extack)
710{
711 const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL;
712 int option_len, key_depth, msk_depth = 0;
713
714 nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]);
715
716 if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) {
717 nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
718 msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
719 }
720
721 nla_for_each_attr(nla_opt_key, nla_enc_key,
722 nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) {
723 switch (nla_type(nla_opt_key)) {
724 case TCA_FLOWER_KEY_ENC_OPTS_GENEVE:
725 option_len = 0;
726 key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
727 option_len = fl_set_geneve_opt(nla_opt_key, key,
728 key_depth, option_len,
729 extack);
730 if (option_len < 0)
731 return option_len;
732
733 key->enc_opts.len += option_len;
734 /* At the same time we need to parse through the mask
735 * in order to verify exact and mask attribute lengths.
736 */
737 mask->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
738 option_len = fl_set_geneve_opt(nla_opt_msk, mask,
739 msk_depth, option_len,
740 extack);
741 if (option_len < 0)
742 return option_len;
743
744 mask->enc_opts.len += option_len;
745 if (key->enc_opts.len != mask->enc_opts.len) {
746 NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
747 return -EINVAL;
748 }
749
750 if (msk_depth)
751 nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
752 break;
753 default:
754 NL_SET_ERR_MSG(extack, "Unknown tunnel option type");
755 return -EINVAL;
756 }
757 }
758
759 return 0;
552} 760}
553 761
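For reference, the attribute nesting that fl_set_enc_opt() walks, as implied by the policies added above and the length checks in fl_set_geneve_opt():

	/*
	 * TCA_FLOWER_KEY_ENC_OPTS (nested)
	 *   TCA_FLOWER_KEY_ENC_OPTS_GENEVE (nested, one per option)
	 *     TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS   u16
	 *     TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE    u8
	 *     TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA    binary, 4..128 bytes,
	 *                                           multiple of 4
	 * TCA_FLOWER_KEY_ENC_OPTS_MASK (nested, same layout; CLASS/TYPE/DATA
	 *                               may be omitted in the mask)
	 */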
554static int fl_set_key(struct net *net, struct nlattr **tb, 762static int fl_set_key(struct net *net, struct nlattr **tb,
@@ -577,12 +785,28 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
577 if (tb[TCA_FLOWER_KEY_ETH_TYPE]) { 785 if (tb[TCA_FLOWER_KEY_ETH_TYPE]) {
578 ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]); 786 ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
579 787
580 if (ethertype == htons(ETH_P_8021Q)) { 788 if (eth_type_vlan(ethertype)) {
581 fl_set_key_vlan(tb, &key->vlan, &mask->vlan); 789 fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID,
582 fl_set_key_val(tb, &key->basic.n_proto, 790 TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan,
583 TCA_FLOWER_KEY_VLAN_ETH_TYPE, 791 &mask->vlan);
584 &mask->basic.n_proto, TCA_FLOWER_UNSPEC, 792
585 sizeof(key->basic.n_proto)); 793 if (tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]) {
794 ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]);
795 if (eth_type_vlan(ethertype)) {
796 fl_set_key_vlan(tb, ethertype,
797 TCA_FLOWER_KEY_CVLAN_ID,
798 TCA_FLOWER_KEY_CVLAN_PRIO,
799 &key->cvlan, &mask->cvlan);
800 fl_set_key_val(tb, &key->basic.n_proto,
801 TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
802 &mask->basic.n_proto,
803 TCA_FLOWER_UNSPEC,
804 sizeof(key->basic.n_proto));
805 } else {
806 key->basic.n_proto = ethertype;
807 mask->basic.n_proto = cpu_to_be16(~0);
808 }
809 }
586 } else { 810 } else {
587 key->basic.n_proto = ethertype; 811 key->basic.n_proto = ethertype;
588 mask->basic.n_proto = cpu_to_be16(~0); 812 mask->basic.n_proto = cpu_to_be16(~0);
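A worked example (not part of the patch; the attribute values are illustrative) of how the new QinQ handling fills the keys for a double-tagged match:

	/*
	 * For a filter matching 802.1ad outer tag 100, inner 802.1Q tag 200,
	 * inner payload IPv4 (TCA_FLOWER_KEY_ETH_TYPE = ETH_P_8021AD,
	 * TCA_FLOWER_KEY_VLAN_ETH_TYPE = ETH_P_8021Q,
	 * TCA_FLOWER_KEY_CVLAN_ETH_TYPE = ETH_P_IP):
	 *
	 *   key->vlan  = { vlan_id 100, vlan_tpid ETH_P_8021AD }
	 *   key->cvlan = { vlan_id 200, vlan_tpid ETH_P_8021Q }
	 *   key->basic.n_proto = ETH_P_IP
	 */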
@@ -594,7 +818,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
594 fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, 818 fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
595 &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, 819 &mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
596 sizeof(key->basic.ip_proto)); 820 sizeof(key->basic.ip_proto));
597 fl_set_key_ip(tb, &key->ip, &mask->ip); 821 fl_set_key_ip(tb, false, &key->ip, &mask->ip);
598 } 822 }
599 823
600 if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { 824 if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) {
@@ -729,6 +953,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
729 &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, 953 &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
730 sizeof(key->enc_tp.dst)); 954 sizeof(key->enc_tp.dst));
731 955
956 fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip);
957
958 if (tb[TCA_FLOWER_KEY_ENC_OPTS]) {
959 ret = fl_set_enc_opt(tb, key, mask, extack);
960 if (ret)
961 return ret;
962 }
963
732 if (tb[TCA_FLOWER_KEY_FLAGS]) 964 if (tb[TCA_FLOWER_KEY_FLAGS])
733 ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); 965 ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
734 966
@@ -780,47 +1012,54 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
780 FL_KEY_SET(keys, cnt, id, member); \ 1012 FL_KEY_SET(keys, cnt, id, member); \
781 } while(0); 1013 } while(0);
782 1014
783static void fl_init_dissector(struct fl_flow_mask *mask) 1015static void fl_init_dissector(struct flow_dissector *dissector,
1016 struct fl_flow_key *mask)
784{ 1017{
785 struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; 1018 struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
786 size_t cnt = 0; 1019 size_t cnt = 0;
787 1020
788 FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); 1021 FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
789 FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); 1022 FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
790 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1023 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
791 FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); 1024 FLOW_DISSECTOR_KEY_ETH_ADDRS, eth);
792 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1025 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
793 FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); 1026 FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
794 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1027 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
795 FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); 1028 FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
796 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1029 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
797 FLOW_DISSECTOR_KEY_PORTS, tp); 1030 FLOW_DISSECTOR_KEY_PORTS, tp);
798 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1031 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
799 FLOW_DISSECTOR_KEY_IP, ip); 1032 FLOW_DISSECTOR_KEY_IP, ip);
800 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1033 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
801 FLOW_DISSECTOR_KEY_TCP, tcp); 1034 FLOW_DISSECTOR_KEY_TCP, tcp);
802 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1035 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
803 FLOW_DISSECTOR_KEY_ICMP, icmp); 1036 FLOW_DISSECTOR_KEY_ICMP, icmp);
804 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1037 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
805 FLOW_DISSECTOR_KEY_ARP, arp); 1038 FLOW_DISSECTOR_KEY_ARP, arp);
806 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1039 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
807 FLOW_DISSECTOR_KEY_MPLS, mpls); 1040 FLOW_DISSECTOR_KEY_MPLS, mpls);
808 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1041 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
809 FLOW_DISSECTOR_KEY_VLAN, vlan); 1042 FLOW_DISSECTOR_KEY_VLAN, vlan);
810 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1043 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
1044 FLOW_DISSECTOR_KEY_CVLAN, cvlan);
1045 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
811 FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); 1046 FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
812 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1047 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
813 FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4); 1048 FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4);
814 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1049 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
815 FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6); 1050 FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6);
816 if (FL_KEY_IS_MASKED(&mask->key, enc_ipv4) || 1051 if (FL_KEY_IS_MASKED(mask, enc_ipv4) ||
817 FL_KEY_IS_MASKED(&mask->key, enc_ipv6)) 1052 FL_KEY_IS_MASKED(mask, enc_ipv6))
818 FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL, 1053 FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL,
819 enc_control); 1054 enc_control);
820 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 1055 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
821 FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); 1056 FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
1057 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
1058 FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
1059 FL_KEY_SET_IF_MASKED(mask, keys, cnt,
1060 FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts);
822 1061
823 skb_flow_dissector_init(&mask->dissector, keys, cnt); 1062 skb_flow_dissector_init(dissector, keys, cnt);
824} 1063}
825 1064
826static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, 1065static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
@@ -839,7 +1078,7 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
839 if (err) 1078 if (err)
840 goto errout_free; 1079 goto errout_free;
841 1080
842 fl_init_dissector(newmask); 1081 fl_init_dissector(&newmask->dissector, &newmask->key);
843 1082
844 INIT_LIST_HEAD_RCU(&newmask->filters); 1083 INIT_LIST_HEAD_RCU(&newmask->filters);
845 1084
@@ -888,6 +1127,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
888 struct cls_fl_filter *f, struct fl_flow_mask *mask, 1127 struct cls_fl_filter *f, struct fl_flow_mask *mask,
889 unsigned long base, struct nlattr **tb, 1128 unsigned long base, struct nlattr **tb,
890 struct nlattr *est, bool ovr, 1129 struct nlattr *est, bool ovr,
1130 struct fl_flow_tmplt *tmplt,
891 struct netlink_ext_ack *extack) 1131 struct netlink_ext_ack *extack)
892{ 1132{
893 int err; 1133 int err;
@@ -908,6 +1148,11 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
908 fl_mask_update_range(mask); 1148 fl_mask_update_range(mask);
909 fl_set_masked_key(&f->mkey, &f->key, mask); 1149 fl_set_masked_key(&f->mkey, &f->key, mask);
910 1150
1151 if (!fl_mask_fits_tmplt(tmplt, mask)) {
1152 NL_SET_ERR_MSG_MOD(extack, "Mask does not fit the template");
1153 return -EINVAL;
1154 }
1155
911 return 0; 1156 return 0;
912} 1157}
913 1158
@@ -973,7 +1218,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
973 } 1218 }
974 1219
975 err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr, 1220 err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr,
976 extack); 1221 tp->chain->tmplt_priv, extack);
977 if (err) 1222 if (err)
978 goto errout_idr; 1223 goto errout_idr;
979 1224
@@ -1058,20 +1303,146 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
1058{ 1303{
1059 struct cls_fl_head *head = rtnl_dereference(tp->root); 1304 struct cls_fl_head *head = rtnl_dereference(tp->root);
1060 struct cls_fl_filter *f; 1305 struct cls_fl_filter *f;
1306
1307 arg->count = arg->skip;
1308
1309 while ((f = idr_get_next_ul(&head->handle_idr,
1310 &arg->cookie)) != NULL) {
1311 if (arg->fn(tp, f, arg) < 0) {
1312 arg->stop = 1;
1313 break;
1314 }
1315 arg->cookie = f->handle + 1;
1316 arg->count++;
1317 }
1318}
1319
1320static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
1321 void *cb_priv, struct netlink_ext_ack *extack)
1322{
1323 struct cls_fl_head *head = rtnl_dereference(tp->root);
1324 struct tc_cls_flower_offload cls_flower = {};
1325 struct tcf_block *block = tp->chain->block;
1061 struct fl_flow_mask *mask; 1326 struct fl_flow_mask *mask;
1327 struct cls_fl_filter *f;
1328 int err;
1062 1329
1063 list_for_each_entry_rcu(mask, &head->masks, list) { 1330 list_for_each_entry(mask, &head->masks, list) {
1064 list_for_each_entry_rcu(f, &mask->filters, list) { 1331 list_for_each_entry(f, &mask->filters, list) {
1065 if (arg->count < arg->skip) 1332 if (tc_skip_hw(f->flags))
1066 goto skip; 1333 continue;
1067 if (arg->fn(tp, f, arg) < 0) { 1334
1068 arg->stop = 1; 1335 tc_cls_common_offload_init(&cls_flower.common, tp,
1069 break; 1336 f->flags, extack);
1337 cls_flower.command = add ?
1338 TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY;
1339 cls_flower.cookie = (unsigned long)f;
1340 cls_flower.dissector = &mask->dissector;
1341 cls_flower.mask = &mask->key;
1342 cls_flower.key = &f->mkey;
1343 cls_flower.exts = &f->exts;
1344 cls_flower.classid = f->res.classid;
1345
1346 err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
1347 if (err) {
1348 if (add && tc_skip_sw(f->flags))
1349 return err;
1350 continue;
1070 } 1351 }
1071skip: 1352
1072 arg->count++; 1353 tc_cls_offload_cnt_update(block, &f->in_hw_count,
1354 &f->flags, add);
1073 } 1355 }
1074 } 1356 }
1357
1358 return 0;
1359}
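The ->reoffload() callback added above for flower (and the analogous ones added for matchall and u32 further down in this diff) replays every existing filter to a single block callback: TC_CLSFLOWER_REPLACE commands when add is true, TC_CLSFLOWER_DESTROY when it is false, with skip_hw filters skipped entirely. A callback error is only propagated when adding a skip_sw filter, since such a filter has no software fallback; in all other cases the error is ignored and the filter's in_hw_count is simply left untouched.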
1360
1361static void fl_hw_create_tmplt(struct tcf_chain *chain,
1362 struct fl_flow_tmplt *tmplt)
1363{
1364 struct tc_cls_flower_offload cls_flower = {};
1365 struct tcf_block *block = chain->block;
1366 struct tcf_exts dummy_exts = { 0, };
1367
1368 cls_flower.common.chain_index = chain->index;
1369 cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE;
1370 cls_flower.cookie = (unsigned long) tmplt;
1371 cls_flower.dissector = &tmplt->dissector;
1372 cls_flower.mask = &tmplt->mask;
1373 cls_flower.key = &tmplt->dummy_key;
1374 cls_flower.exts = &dummy_exts;
1375
 1376	/* We don't care if any of the drivers fails to handle this
 1377	 * call. It serves just as a hint to them.
1378 */
1379 tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
1380 &cls_flower, false);
1381}
1382
1383static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
1384 struct fl_flow_tmplt *tmplt)
1385{
1386 struct tc_cls_flower_offload cls_flower = {};
1387 struct tcf_block *block = chain->block;
1388
1389 cls_flower.common.chain_index = chain->index;
1390 cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY;
1391 cls_flower.cookie = (unsigned long) tmplt;
1392
1393 tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
1394 &cls_flower, false);
1395}
1396
1397static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
1398 struct nlattr **tca,
1399 struct netlink_ext_ack *extack)
1400{
1401 struct fl_flow_tmplt *tmplt;
1402 struct nlattr **tb;
1403 int err;
1404
1405 if (!tca[TCA_OPTIONS])
1406 return ERR_PTR(-EINVAL);
1407
1408 tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
1409 if (!tb)
1410 return ERR_PTR(-ENOBUFS);
1411 err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS],
1412 fl_policy, NULL);
1413 if (err)
1414 goto errout_tb;
1415
1416 tmplt = kzalloc(sizeof(*tmplt), GFP_KERNEL);
1417 if (!tmplt) {
1418 err = -ENOMEM;
1419 goto errout_tb;
1420 }
1421 tmplt->chain = chain;
1422 err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack);
1423 if (err)
1424 goto errout_tmplt;
1425 kfree(tb);
1426
1427 fl_init_dissector(&tmplt->dissector, &tmplt->mask);
1428
1429 fl_hw_create_tmplt(chain, tmplt);
1430
1431 return tmplt;
1432
1433errout_tmplt:
1434 kfree(tmplt);
1435errout_tb:
1436 kfree(tb);
1437 return ERR_PTR(err);
1438}
1439
1440static void fl_tmplt_destroy(void *tmplt_priv)
1441{
1442 struct fl_flow_tmplt *tmplt = tmplt_priv;
1443
1444 fl_hw_destroy_tmplt(tmplt->chain, tmplt);
1445 kfree(tmplt);
1075} 1446}
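Taken together, the template hooks give each chain an optional flower template: fl_tmplt_create() parses a key/mask from TCA_OPTIONS without installing any filter, fl_set_parms() later rejects filters whose mask sets bits outside the template's mask (the fl_mask_fits_tmplt() check added earlier in this diff), and the TMPLT_CREATE/TMPLT_DESTROY offload commands merely advertise the expected key layout to drivers, which are free to ignore them.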
1076 1447
1077static int fl_dump_key_val(struct sk_buff *skb, 1448static int fl_dump_key_val(struct sk_buff *skb,
@@ -1128,20 +1499,24 @@ static int fl_dump_key_mpls(struct sk_buff *skb,
1128 return 0; 1499 return 0;
1129} 1500}
1130 1501
1131static int fl_dump_key_ip(struct sk_buff *skb, 1502static int fl_dump_key_ip(struct sk_buff *skb, bool encap,
1132 struct flow_dissector_key_ip *key, 1503 struct flow_dissector_key_ip *key,
1133 struct flow_dissector_key_ip *mask) 1504 struct flow_dissector_key_ip *mask)
1134{ 1505{
1135 if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos, 1506 int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
1136 TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) || 1507 int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
1137 fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl, 1508 int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
1138 TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl))) 1509 int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
1510
1511 if (fl_dump_key_val(skb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)) ||
1512 fl_dump_key_val(skb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)))
1139 return -1; 1513 return -1;
1140 1514
1141 return 0; 1515 return 0;
1142} 1516}
1143 1517
1144static int fl_dump_key_vlan(struct sk_buff *skb, 1518static int fl_dump_key_vlan(struct sk_buff *skb,
1519 int vlan_id_key, int vlan_prio_key,
1145 struct flow_dissector_key_vlan *vlan_key, 1520 struct flow_dissector_key_vlan *vlan_key,
1146 struct flow_dissector_key_vlan *vlan_mask) 1521 struct flow_dissector_key_vlan *vlan_mask)
1147{ 1522{
@@ -1150,13 +1525,13 @@ static int fl_dump_key_vlan(struct sk_buff *skb,
1150 if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask))) 1525 if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask)))
1151 return 0; 1526 return 0;
1152 if (vlan_mask->vlan_id) { 1527 if (vlan_mask->vlan_id) {
1153 err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID, 1528 err = nla_put_u16(skb, vlan_id_key,
1154 vlan_key->vlan_id); 1529 vlan_key->vlan_id);
1155 if (err) 1530 if (err)
1156 return err; 1531 return err;
1157 } 1532 }
1158 if (vlan_mask->vlan_priority) { 1533 if (vlan_mask->vlan_priority) {
1159 err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO, 1534 err = nla_put_u8(skb, vlan_prio_key,
1160 vlan_key->vlan_priority); 1535 vlan_key->vlan_priority);
1161 if (err) 1536 if (err)
1162 return err; 1537 return err;
@@ -1203,29 +1578,86 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
1203 return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); 1578 return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask);
1204} 1579}
1205 1580
1206static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, 1581static int fl_dump_key_geneve_opt(struct sk_buff *skb,
1207 struct sk_buff *skb, struct tcmsg *t) 1582 struct flow_dissector_key_enc_opts *enc_opts)
1208{ 1583{
1209 struct cls_fl_filter *f = fh; 1584 struct geneve_opt *opt;
1210 struct nlattr *nest; 1585 struct nlattr *nest;
1211 struct fl_flow_key *key, *mask; 1586 int opt_off = 0;
1212 1587
1213 if (!f) 1588 nest = nla_nest_start(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE);
1214 return skb->len; 1589 if (!nest)
1590 goto nla_put_failure;
1215 1591
1216 t->tcm_handle = f->handle; 1592 while (enc_opts->len > opt_off) {
1593 opt = (struct geneve_opt *)&enc_opts->data[opt_off];
1217 1594
1218 nest = nla_nest_start(skb, TCA_OPTIONS); 1595 if (nla_put_be16(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS,
1596 opt->opt_class))
1597 goto nla_put_failure;
1598 if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE,
1599 opt->type))
1600 goto nla_put_failure;
1601 if (nla_put(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA,
1602 opt->length * 4, opt->opt_data))
1603 goto nla_put_failure;
1604
1605 opt_off += sizeof(struct geneve_opt) + opt->length * 4;
1606 }
1607 nla_nest_end(skb, nest);
1608 return 0;
1609
1610nla_put_failure:
1611 nla_nest_cancel(skb, nest);
1612 return -EMSGSIZE;
1613}
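The loop above relies on struct geneve_opt counting its length field in 4-octet words, covering only the option payload and not the 4-byte option header, which is why both the TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA size and the opt_off advance use opt->length * 4.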
1614
1615static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
1616 struct flow_dissector_key_enc_opts *enc_opts)
1617{
1618 struct nlattr *nest;
1619 int err;
1620
1621 if (!enc_opts->len)
1622 return 0;
1623
1624 nest = nla_nest_start(skb, enc_opt_type);
1219 if (!nest) 1625 if (!nest)
1220 goto nla_put_failure; 1626 goto nla_put_failure;
1221 1627
1222 if (f->res.classid && 1628 switch (enc_opts->dst_opt_type) {
1223 nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) 1629 case TUNNEL_GENEVE_OPT:
1630 err = fl_dump_key_geneve_opt(skb, enc_opts);
1631 if (err)
1632 goto nla_put_failure;
1633 break;
1634 default:
1224 goto nla_put_failure; 1635 goto nla_put_failure;
1636 }
1637 nla_nest_end(skb, nest);
1638 return 0;
1225 1639
1226 key = &f->key; 1640nla_put_failure:
1227 mask = &f->mask->key; 1641 nla_nest_cancel(skb, nest);
1642 return -EMSGSIZE;
1643}
1228 1644
1645static int fl_dump_key_enc_opt(struct sk_buff *skb,
1646 struct flow_dissector_key_enc_opts *key_opts,
1647 struct flow_dissector_key_enc_opts *msk_opts)
1648{
1649 int err;
1650
1651 err = fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS, key_opts);
1652 if (err)
1653 return err;
1654
1655 return fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS_MASK, msk_opts);
1656}
1657
1658static int fl_dump_key(struct sk_buff *skb, struct net *net,
1659 struct fl_flow_key *key, struct fl_flow_key *mask)
1660{
1229 if (mask->indev_ifindex) { 1661 if (mask->indev_ifindex) {
1230 struct net_device *dev; 1662 struct net_device *dev;
1231 1663
@@ -1234,9 +1666,6 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
1234 goto nla_put_failure; 1666 goto nla_put_failure;
1235 } 1667 }
1236 1668
1237 if (!tc_skip_hw(f->flags))
1238 fl_hw_update_stats(tp, f);
1239
1240 if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, 1669 if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
1241 mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, 1670 mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
1242 sizeof(key->eth.dst)) || 1671 sizeof(key->eth.dst)) ||
@@ -1251,15 +1680,36 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
1251 if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls)) 1680 if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls))
1252 goto nla_put_failure; 1681 goto nla_put_failure;
1253 1682
1254 if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) 1683 if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_VLAN_ID,
1684 TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, &mask->vlan))
1685 goto nla_put_failure;
1686
1687 if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_CVLAN_ID,
1688 TCA_FLOWER_KEY_CVLAN_PRIO,
1689 &key->cvlan, &mask->cvlan) ||
1690 (mask->cvlan.vlan_tpid &&
1691 nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
1692 key->cvlan.vlan_tpid)))
1255 goto nla_put_failure; 1693 goto nla_put_failure;
1256 1694
1695 if (mask->basic.n_proto) {
1696 if (mask->cvlan.vlan_tpid) {
1697 if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
1698 key->basic.n_proto))
1699 goto nla_put_failure;
1700 } else if (mask->vlan.vlan_tpid) {
1701 if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
1702 key->basic.n_proto))
1703 goto nla_put_failure;
1704 }
1705 }
1706
1257 if ((key->basic.n_proto == htons(ETH_P_IP) || 1707 if ((key->basic.n_proto == htons(ETH_P_IP) ||
1258 key->basic.n_proto == htons(ETH_P_IPV6)) && 1708 key->basic.n_proto == htons(ETH_P_IPV6)) &&
1259 (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, 1709 (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
1260 &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, 1710 &mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
1261 sizeof(key->basic.ip_proto)) || 1711 sizeof(key->basic.ip_proto)) ||
1262 fl_dump_key_ip(skb, &key->ip, &mask->ip))) 1712 fl_dump_key_ip(skb, false, &key->ip, &mask->ip)))
1263 goto nla_put_failure; 1713 goto nla_put_failure;
1264 1714
1265 if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && 1715 if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
@@ -1384,12 +1834,49 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
1384 TCA_FLOWER_KEY_ENC_UDP_DST_PORT, 1834 TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
1385 &mask->enc_tp.dst, 1835 &mask->enc_tp.dst,
1386 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, 1836 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
1387 sizeof(key->enc_tp.dst))) 1837 sizeof(key->enc_tp.dst)) ||
1838 fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip) ||
1839 fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts))
1388 goto nla_put_failure; 1840 goto nla_put_failure;
1389 1841
1390 if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) 1842 if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
1391 goto nla_put_failure; 1843 goto nla_put_failure;
1392 1844
1845 return 0;
1846
1847nla_put_failure:
1848 return -EMSGSIZE;
1849}
1850
1851static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
1852 struct sk_buff *skb, struct tcmsg *t)
1853{
1854 struct cls_fl_filter *f = fh;
1855 struct nlattr *nest;
1856 struct fl_flow_key *key, *mask;
1857
1858 if (!f)
1859 return skb->len;
1860
1861 t->tcm_handle = f->handle;
1862
1863 nest = nla_nest_start(skb, TCA_OPTIONS);
1864 if (!nest)
1865 goto nla_put_failure;
1866
1867 if (f->res.classid &&
1868 nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid))
1869 goto nla_put_failure;
1870
1871 key = &f->key;
1872 mask = &f->mask->key;
1873
1874 if (fl_dump_key(skb, net, key, mask))
1875 goto nla_put_failure;
1876
1877 if (!tc_skip_hw(f->flags))
1878 fl_hw_update_stats(tp, f);
1879
1393 if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) 1880 if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
1394 goto nla_put_failure; 1881 goto nla_put_failure;
1395 1882
@@ -1408,6 +1895,31 @@ nla_put_failure:
1408 return -1; 1895 return -1;
1409} 1896}
1410 1897
1898static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv)
1899{
1900 struct fl_flow_tmplt *tmplt = tmplt_priv;
1901 struct fl_flow_key *key, *mask;
1902 struct nlattr *nest;
1903
1904 nest = nla_nest_start(skb, TCA_OPTIONS);
1905 if (!nest)
1906 goto nla_put_failure;
1907
1908 key = &tmplt->dummy_key;
1909 mask = &tmplt->mask;
1910
1911 if (fl_dump_key(skb, net, key, mask))
1912 goto nla_put_failure;
1913
1914 nla_nest_end(skb, nest);
1915
1916 return skb->len;
1917
1918nla_put_failure:
1919 nla_nest_cancel(skb, nest);
1920 return -EMSGSIZE;
1921}
1922
1411static void fl_bind_class(void *fh, u32 classid, unsigned long cl) 1923static void fl_bind_class(void *fh, u32 classid, unsigned long cl)
1412{ 1924{
1413 struct cls_fl_filter *f = fh; 1925 struct cls_fl_filter *f = fh;
@@ -1425,8 +1937,12 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
1425 .change = fl_change, 1937 .change = fl_change,
1426 .delete = fl_delete, 1938 .delete = fl_delete,
1427 .walk = fl_walk, 1939 .walk = fl_walk,
1940 .reoffload = fl_reoffload,
1428 .dump = fl_dump, 1941 .dump = fl_dump,
1429 .bind_class = fl_bind_class, 1942 .bind_class = fl_bind_class,
1943 .tmplt_create = fl_tmplt_create,
1944 .tmplt_destroy = fl_tmplt_destroy,
1945 .tmplt_dump = fl_tmplt_dump,
1430 .owner = THIS_MODULE, 1946 .owner = THIS_MODULE,
1431}; 1947};
1432 1948
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 47b207ef7762..856fa79d4ffd 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,6 +21,7 @@ struct cls_mall_head {
21 struct tcf_result res; 21 struct tcf_result res;
22 u32 handle; 22 u32 handle;
23 u32 flags; 23 u32 flags;
24 unsigned int in_hw_count;
24 struct rcu_work rwork; 25 struct rcu_work rwork;
25}; 26};
26 27
@@ -95,6 +96,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
95 mall_destroy_hw_filter(tp, head, cookie, NULL); 96 mall_destroy_hw_filter(tp, head, cookie, NULL);
96 return err; 97 return err;
97 } else if (err > 0) { 98 } else if (err > 0) {
99 head->in_hw_count = err;
98 tcf_block_offload_inc(block, &head->flags); 100 tcf_block_offload_inc(block, &head->flags);
99 } 101 }
100 102
@@ -111,6 +113,8 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
111 if (!head) 113 if (!head)
112 return; 114 return;
113 115
116 tcf_unbind_filter(tp, &head->res);
117
114 if (!tc_skip_hw(head->flags)) 118 if (!tc_skip_hw(head->flags))
115 mall_destroy_hw_filter(tp, head, (unsigned long) head, extack); 119 mall_destroy_hw_filter(tp, head, (unsigned long) head, extack);
116 120
@@ -235,6 +239,35 @@ skip:
235 arg->count++; 239 arg->count++;
236} 240}
237 241
242static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
243 void *cb_priv, struct netlink_ext_ack *extack)
244{
245 struct cls_mall_head *head = rtnl_dereference(tp->root);
246 struct tc_cls_matchall_offload cls_mall = {};
247 struct tcf_block *block = tp->chain->block;
248 int err;
249
250 if (tc_skip_hw(head->flags))
251 return 0;
252
253 tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
254 cls_mall.command = add ?
255 TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY;
256 cls_mall.exts = &head->exts;
257 cls_mall.cookie = (unsigned long)head;
258
259 err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv);
260 if (err) {
261 if (add && tc_skip_sw(head->flags))
262 return err;
263 return 0;
264 }
265
266 tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add);
267
268 return 0;
269}
270
238static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, 271static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
239 struct sk_buff *skb, struct tcmsg *t) 272 struct sk_buff *skb, struct tcmsg *t)
240{ 273{
@@ -289,6 +322,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly = {
289 .change = mall_change, 322 .change = mall_change,
290 .delete = mall_delete, 323 .delete = mall_delete,
291 .walk = mall_walk, 324 .walk = mall_walk,
325 .reoffload = mall_reoffload,
292 .dump = mall_dump, 326 .dump = mall_dump,
293 .bind_class = mall_bind_class, 327 .bind_class = mall_bind_class,
294 .owner = THIS_MODULE, 328 .owner = THIS_MODULE,
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 32f4bbd82f35..9ccc93f257db 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -447,11 +447,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
447 tcf_bind_filter(tp, &cr.res, base); 447 tcf_bind_filter(tp, &cr.res, base);
448 } 448 }
449 449
450 if (old_r)
451 tcf_exts_change(&r->exts, &e);
452 else
453 tcf_exts_change(&cr.exts, &e);
454
455 if (old_r && old_r != r) { 450 if (old_r && old_r != r) {
456 err = tcindex_filter_result_init(old_r); 451 err = tcindex_filter_result_init(old_r);
457 if (err < 0) { 452 if (err < 0) {
@@ -462,12 +457,15 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
462 457
463 oldp = p; 458 oldp = p;
464 r->res = cr.res; 459 r->res = cr.res;
460 tcf_exts_change(&r->exts, &e);
461
465 rcu_assign_pointer(tp->root, cp); 462 rcu_assign_pointer(tp->root, cp);
466 463
467 if (r == &new_filter_result) { 464 if (r == &new_filter_result) {
468 struct tcindex_filter *nfp; 465 struct tcindex_filter *nfp;
469 struct tcindex_filter __rcu **fp; 466 struct tcindex_filter __rcu **fp;
470 467
468 f->result.res = r->res;
471 tcf_exts_change(&f->result.exts, &r->exts); 469 tcf_exts_change(&f->result.exts, &r->exts);
472 470
473 fp = cp->h + (handle % cp->hash); 471 fp = cp->h + (handle % cp->hash);
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index fb861f90fde6..f218ccf1e2d9 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -62,6 +62,7 @@ struct tc_u_knode {
62 struct tc_u32_pcnt __percpu *pf; 62 struct tc_u32_pcnt __percpu *pf;
63#endif 63#endif
64 u32 flags; 64 u32 flags;
65 unsigned int in_hw_count;
65#ifdef CONFIG_CLS_U32_MARK 66#ifdef CONFIG_CLS_U32_MARK
66 u32 val; 67 u32 val;
67 u32 mask; 68 u32 mask;
@@ -571,6 +572,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
571 u32_remove_hw_knode(tp, n, NULL); 572 u32_remove_hw_knode(tp, n, NULL);
572 return err; 573 return err;
573 } else if (err > 0) { 574 } else if (err > 0) {
575 n->in_hw_count = err;
574 tcf_block_offload_inc(block, &n->flags); 576 tcf_block_offload_inc(block, &n->flags);
575 } 577 }
576 578
@@ -912,6 +914,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
912 struct nlattr *opt = tca[TCA_OPTIONS]; 914 struct nlattr *opt = tca[TCA_OPTIONS];
913 struct nlattr *tb[TCA_U32_MAX + 1]; 915 struct nlattr *tb[TCA_U32_MAX + 1];
914 u32 htid, flags = 0; 916 u32 htid, flags = 0;
917 size_t sel_size;
915 int err; 918 int err;
916#ifdef CONFIG_CLS_U32_PERF 919#ifdef CONFIG_CLS_U32_PERF
917 size_t size; 920 size_t size;
@@ -1074,8 +1077,13 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
1074 } 1077 }
1075 1078
1076 s = nla_data(tb[TCA_U32_SEL]); 1079 s = nla_data(tb[TCA_U32_SEL]);
1080 sel_size = struct_size(s, keys, s->nkeys);
1081 if (nla_len(tb[TCA_U32_SEL]) < sel_size) {
1082 err = -EINVAL;
1083 goto erridr;
1084 }
1077 1085
1078 n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); 1086 n = kzalloc(offsetof(typeof(*n), sel) + sel_size, GFP_KERNEL);
1079 if (n == NULL) { 1087 if (n == NULL) {
1080 err = -ENOBUFS; 1088 err = -ENOBUFS;
1081 goto erridr; 1089 goto erridr;
@@ -1090,7 +1098,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
1090 } 1098 }
1091#endif 1099#endif
1092 1100
1093 memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); 1101 memcpy(&n->sel, s, sel_size);
1094 RCU_INIT_POINTER(n->ht_up, ht); 1102 RCU_INIT_POINTER(n->ht_up, ht);
1095 n->handle = handle; 1103 n->handle = handle;
1096 n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; 1104 n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
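The new sel_size bound makes the selector self-consistent before it is used: struct_size(s, keys, s->nkeys) evaluates to sizeof(*s) + s->nkeys * sizeof(struct tc_u32_key) with overflow protection, and the nla_len() comparison rejects any TCA_U32_SEL attribute whose payload is shorter than the selector it claims to describe, so the allocation and the later memcpy() of sel_size bytes never read past the received attribute.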
@@ -1199,6 +1207,114 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
1199 } 1207 }
1200} 1208}
1201 1209
1210static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
1211 bool add, tc_setup_cb_t *cb, void *cb_priv,
1212 struct netlink_ext_ack *extack)
1213{
1214 struct tc_cls_u32_offload cls_u32 = {};
1215 int err;
1216
1217 tc_cls_common_offload_init(&cls_u32.common, tp, ht->flags, extack);
1218 cls_u32.command = add ? TC_CLSU32_NEW_HNODE : TC_CLSU32_DELETE_HNODE;
1219 cls_u32.hnode.divisor = ht->divisor;
1220 cls_u32.hnode.handle = ht->handle;
1221 cls_u32.hnode.prio = ht->prio;
1222
1223 err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
1224 if (err && add && tc_skip_sw(ht->flags))
1225 return err;
1226
1227 return 0;
1228}
1229
1230static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
1231 bool add, tc_setup_cb_t *cb, void *cb_priv,
1232 struct netlink_ext_ack *extack)
1233{
1234 struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
1235 struct tcf_block *block = tp->chain->block;
1236 struct tc_cls_u32_offload cls_u32 = {};
1237 int err;
1238
1239 tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack);
1240 cls_u32.command = add ?
1241 TC_CLSU32_REPLACE_KNODE : TC_CLSU32_DELETE_KNODE;
1242 cls_u32.knode.handle = n->handle;
1243
1244 if (add) {
1245 cls_u32.knode.fshift = n->fshift;
1246#ifdef CONFIG_CLS_U32_MARK
1247 cls_u32.knode.val = n->val;
1248 cls_u32.knode.mask = n->mask;
1249#else
1250 cls_u32.knode.val = 0;
1251 cls_u32.knode.mask = 0;
1252#endif
1253 cls_u32.knode.sel = &n->sel;
1254 cls_u32.knode.exts = &n->exts;
1255 if (n->ht_down)
1256 cls_u32.knode.link_handle = ht->handle;
1257 }
1258
1259 err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
1260 if (err) {
1261 if (add && tc_skip_sw(n->flags))
1262 return err;
1263 return 0;
1264 }
1265
1266 tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add);
1267
1268 return 0;
1269}
1270
1271static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
1272 void *cb_priv, struct netlink_ext_ack *extack)
1273{
1274 struct tc_u_common *tp_c = tp->data;
1275 struct tc_u_hnode *ht;
1276 struct tc_u_knode *n;
1277 unsigned int h;
1278 int err;
1279
1280 for (ht = rtnl_dereference(tp_c->hlist);
1281 ht;
1282 ht = rtnl_dereference(ht->next)) {
1283 if (ht->prio != tp->prio)
1284 continue;
1285
1286 /* When adding filters to a new dev, try to offload the
1287 * hashtable first. When removing, do the filters before the
1288 * hashtable.
1289 */
1290 if (add && !tc_skip_hw(ht->flags)) {
1291 err = u32_reoffload_hnode(tp, ht, add, cb, cb_priv,
1292 extack);
1293 if (err)
1294 return err;
1295 }
1296
1297 for (h = 0; h <= ht->divisor; h++) {
1298 for (n = rtnl_dereference(ht->ht[h]);
1299 n;
1300 n = rtnl_dereference(n->next)) {
1301 if (tc_skip_hw(n->flags))
1302 continue;
1303
1304 err = u32_reoffload_knode(tp, n, add, cb,
1305 cb_priv, extack);
1306 if (err)
1307 return err;
1308 }
1309 }
1310
1311 if (!add && !tc_skip_hw(ht->flags))
1312 u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack);
1313 }
1314
1315 return 0;
1316}
1317
1202static void u32_bind_class(void *fh, u32 classid, unsigned long cl) 1318static void u32_bind_class(void *fh, u32 classid, unsigned long cl)
1203{ 1319{
1204 struct tc_u_knode *n = fh; 1320 struct tc_u_knode *n = fh;
@@ -1336,6 +1452,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = {
1336 .change = u32_change, 1452 .change = u32_change,
1337 .delete = u32_delete, 1453 .delete = u32_delete,
1338 .walk = u32_walk, 1454 .walk = u32_walk,
1455 .reoffload = u32_reoffload,
1339 .dump = u32_dump, 1456 .dump = u32_dump,
1340 .bind_class = u32_bind_class, 1457 .bind_class = u32_bind_class,
1341 .owner = THIS_MODULE, 1458 .owner = THIS_MODULE,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 54eca685420f..98541c6399db 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -596,12 +596,19 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
596 return HRTIMER_NORESTART; 596 return HRTIMER_NORESTART;
597} 597}
598 598
599void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) 599void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
600 clockid_t clockid)
600{ 601{
601 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); 602 hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
602 wd->timer.function = qdisc_watchdog; 603 wd->timer.function = qdisc_watchdog;
603 wd->qdisc = qdisc; 604 wd->qdisc = qdisc;
604} 605}
606EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
607
608void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
609{
610 qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
611}
605EXPORT_SYMBOL(qdisc_watchdog_init); 612EXPORT_SYMBOL(qdisc_watchdog_init);
606 613
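A qdisc that wants its watchdog armed against a clock other than CLOCK_MONOTONIC can now call the new helper directly from its init path. A minimal sketch, assuming a hypothetical qdisc whose private struct foo_sched_data embeds a struct qdisc_watchdog and which wants TAI-based scheduling:

	static int foo_init(struct Qdisc *sch, struct nlattr *opt,
			    struct netlink_ext_ack *extack)
	{
		struct foo_sched_data *q = qdisc_priv(sch);

		/* arm the watchdog timer against CLOCK_TAI instead of the default */
		qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_TAI);
		return 0;
	}

Existing callers are unaffected, since qdisc_watchdog_init() now simply forwards CLOCK_MONOTONIC to the same helper.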
607void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) 614void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
new file mode 100644
index 000000000000..c07c30b916d5
--- /dev/null
+++ b/net/sched/sch_cake.c
@@ -0,0 +1,3034 @@
1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3/* COMMON Applications Kept Enhanced (CAKE) discipline
4 *
5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11 *
12 * The CAKE Principles:
13 * (or, how to have your cake and eat it too)
14 *
15 * This is a combination of several shaping, AQM and FQ techniques into one
16 * easy-to-use package:
17 *
18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19 * equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
20 * eliminating the need for any sort of burst parameter (eg. token bucket
21 * depth). Burst support is limited to that necessary to overcome scheduling
22 * latency.
23 *
24 * - A Diffserv-aware priority queue, giving more priority to certain classes,
25 * up to a specified fraction of bandwidth. Above that bandwidth threshold,
26 * the priority is reduced to avoid starving other tins.
27 *
28 * - Each priority tin has a separate Flow Queue system, to isolate traffic
29 * flows from each other. This prevents a burst on one flow from increasing
30 * the delay to another. Flows are distributed to queues using a
31 * set-associative hash function.
32 *
33 * - Each queue is actively managed by Cobalt, which is a combination of the
34 * Codel and Blue AQM algorithms. This serves flows fairly, and signals
35 * congestion early via ECN (if available) and/or packet drops, to keep
36 * latency low. The codel parameters are auto-tuned based on the bandwidth
37 * setting, as is necessary at low bandwidths.
38 *
39 * The configuration parameters are kept deliberately simple for ease of use.
40 * Everything has sane defaults. Complete generality of configuration is *not*
41 * a goal.
42 *
43 * The priority queue operates according to a weighted DRR scheme, combined with
44 * a bandwidth tracker which reuses the shaper logic to detect which side of the
45 * bandwidth sharing threshold the tin is operating. This determines whether a
46 * priority-based weight (high) or a bandwidth-based weight (low) is used for
47 * that tin in the current pass.
48 *
49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50 * granted us permission to leverage.
51 */
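A note on the deficit-mode shaper described above: as the per-tin comment further down in this file puts it, the next permitted transmission time is simply time_next = time_this + ((len * rate_ns) >> rate_shft), i.e. each packet pushes the earliest departure of the next one forward by its own serialisation time at the configured rate. Because nothing is banked between packets, no token-bucket depth or burst parameter is needed; the only burst that can occur is whatever the scheduling latency allows.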
52
53#include <linux/module.h>
54#include <linux/types.h>
55#include <linux/kernel.h>
56#include <linux/jiffies.h>
57#include <linux/string.h>
58#include <linux/in.h>
59#include <linux/errno.h>
60#include <linux/init.h>
61#include <linux/skbuff.h>
62#include <linux/jhash.h>
63#include <linux/slab.h>
64#include <linux/vmalloc.h>
65#include <linux/reciprocal_div.h>
66#include <net/netlink.h>
67#include <linux/if_vlan.h>
68#include <net/pkt_sched.h>
69#include <net/pkt_cls.h>
70#include <net/tcp.h>
71#include <net/flow_dissector.h>
72
73#if IS_ENABLED(CONFIG_NF_CONNTRACK)
74#include <net/netfilter/nf_conntrack_core.h>
75#endif
76
77#define CAKE_SET_WAYS (8)
78#define CAKE_MAX_TINS (8)
79#define CAKE_QUEUES (1024)
80#define CAKE_FLOW_MASK 63
81#define CAKE_FLOW_NAT_FLAG 64
82
83/* struct cobalt_params - contains codel and blue parameters
84 * @interval: codel initial drop rate
85 * @target: maximum persistent sojourn time & blue update rate
86 * @mtu_time: serialisation delay of maximum-size packet
87 * @p_inc: increment of blue drop probability (0.32 fxp)
88 * @p_dec: decrement of blue drop probability (0.32 fxp)
89 */
90struct cobalt_params {
91 u64 interval;
92 u64 target;
93 u64 mtu_time;
94 u32 p_inc;
95 u32 p_dec;
96};
97
98/* struct cobalt_vars - contains codel and blue variables
99 * @count: codel dropping frequency
100 * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1
101 * @drop_next: time to drop next packet, or when we dropped last
102 * @blue_timer: Blue time to next drop
103 * @p_drop: BLUE drop probability (0.32 fxp)
104 * @dropping: set if in dropping state
105 * @ecn_marked: set if marked
106 */
107struct cobalt_vars {
108 u32 count;
109 u32 rec_inv_sqrt;
110 ktime_t drop_next;
111 ktime_t blue_timer;
112 u32 p_drop;
113 bool dropping;
114 bool ecn_marked;
115};
116
117enum {
118 CAKE_SET_NONE = 0,
119 CAKE_SET_SPARSE,
120 CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121 CAKE_SET_BULK,
122 CAKE_SET_DECAYING
123};
124
125struct cake_flow {
126 /* this stuff is all needed per-flow at dequeue time */
127 struct sk_buff *head;
128 struct sk_buff *tail;
129 struct list_head flowchain;
130 s32 deficit;
131 u32 dropped;
132 struct cobalt_vars cvars;
133 u16 srchost; /* index into cake_host table */
134 u16 dsthost;
135 u8 set;
136}; /* please try to keep this structure <= 64 bytes */
137
138struct cake_host {
139 u32 srchost_tag;
140 u32 dsthost_tag;
141 u16 srchost_refcnt;
142 u16 dsthost_refcnt;
143};
144
145struct cake_heap_entry {
146 u16 t:3, b:10;
147};
148
149struct cake_tin_data {
150 struct cake_flow flows[CAKE_QUEUES];
151 u32 backlogs[CAKE_QUEUES];
152 u32 tags[CAKE_QUEUES]; /* for set association */
153 u16 overflow_idx[CAKE_QUEUES];
154 struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155 u16 flow_quantum;
156
157 struct cobalt_params cparams;
158 u32 drop_overlimit;
159 u16 bulk_flow_count;
160 u16 sparse_flow_count;
161 u16 decaying_flow_count;
162 u16 unresponsive_flow_count;
163
164 u32 max_skblen;
165
166 struct list_head new_flows;
167 struct list_head old_flows;
168 struct list_head decaying_flows;
169
170 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171 ktime_t time_next_packet;
172 u64 tin_rate_ns;
173 u64 tin_rate_bps;
174 u16 tin_rate_shft;
175
176 u16 tin_quantum_prio;
177 u16 tin_quantum_band;
178 s32 tin_deficit;
179 u32 tin_backlog;
180 u32 tin_dropped;
181 u32 tin_ecn_mark;
182
183 u32 packets;
184 u64 bytes;
185
186 u32 ack_drops;
187
188 /* moving averages */
189 u64 avge_delay;
190 u64 peak_delay;
191 u64 base_delay;
192
193 /* hash function stats */
194 u32 way_directs;
195 u32 way_hits;
196 u32 way_misses;
197 u32 way_collisions;
198}; /* number of tins is small, so size of this struct doesn't matter much */
199
200struct cake_sched_data {
201 struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 struct tcf_block *block;
203 struct cake_tin_data *tins;
204
205 struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 u16 overflow_timeout;
207
208 u16 tin_cnt;
209 u8 tin_mode;
210 u8 flow_mode;
211 u8 ack_filter;
212 u8 atm_mode;
213
214 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
215 u16 rate_shft;
216 ktime_t time_next_packet;
217 ktime_t failsafe_next_packet;
218 u64 rate_ns;
219 u64 rate_bps;
220 u16 rate_flags;
221 s16 rate_overhead;
222 u16 rate_mpu;
223 u64 interval;
224 u64 target;
225
226 /* resource tracking */
227 u32 buffer_used;
228 u32 buffer_max_used;
229 u32 buffer_limit;
230 u32 buffer_config_limit;
231
232 /* indices for dequeue */
233 u16 cur_tin;
234 u16 cur_flow;
235
236 struct qdisc_watchdog watchdog;
237 const u8 *tin_index;
238 const u8 *tin_order;
239
240 /* bandwidth capacity estimate */
241 ktime_t last_packet_time;
242 ktime_t avg_window_begin;
243 u64 avg_packet_interval;
244 u64 avg_window_bytes;
245 u64 avg_peak_bandwidth;
246 ktime_t last_reconfig_time;
247
248 /* packet length stats */
249 u32 avg_netoff;
250 u16 max_netlen;
251 u16 max_adjlen;
252 u16 min_netlen;
253 u16 min_adjlen;
254};
255
256enum {
257 CAKE_FLAG_OVERHEAD = BIT(0),
258 CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
259 CAKE_FLAG_INGRESS = BIT(2),
260 CAKE_FLAG_WASH = BIT(3),
261 CAKE_FLAG_SPLIT_GSO = BIT(4)
262};
263
264/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
265 * obtain the best features of each. Codel is excellent on flows which
266 * respond to congestion signals in a TCP-like way. BLUE is more effective on
267 * unresponsive flows.
268 */
269
270struct cobalt_skb_cb {
271 ktime_t enqueue_time;
272 u32 adjusted_len;
273};
274
275static u64 us_to_ns(u64 us)
276{
277 return us * NSEC_PER_USEC;
278}
279
280static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
281{
282 qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
283 return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
284}
285
286static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
287{
288 return get_cobalt_cb(skb)->enqueue_time;
289}
290
291static void cobalt_set_enqueue_time(struct sk_buff *skb,
292 ktime_t now)
293{
294 get_cobalt_cb(skb)->enqueue_time = now;
295}
296
297static u16 quantum_div[CAKE_QUEUES + 1] = {0};
298
299/* Diffserv lookup tables */
300
301static const u8 precedence[] = {
302 0, 0, 0, 0, 0, 0, 0, 0,
303 1, 1, 1, 1, 1, 1, 1, 1,
304 2, 2, 2, 2, 2, 2, 2, 2,
305 3, 3, 3, 3, 3, 3, 3, 3,
306 4, 4, 4, 4, 4, 4, 4, 4,
307 5, 5, 5, 5, 5, 5, 5, 5,
308 6, 6, 6, 6, 6, 6, 6, 6,
309 7, 7, 7, 7, 7, 7, 7, 7,
310};
311
312static const u8 diffserv8[] = {
313 2, 5, 1, 2, 4, 2, 2, 2,
314 0, 2, 1, 2, 1, 2, 1, 2,
315 5, 2, 4, 2, 4, 2, 4, 2,
316 3, 2, 3, 2, 3, 2, 3, 2,
317 6, 2, 3, 2, 3, 2, 3, 2,
318 6, 2, 2, 2, 6, 2, 6, 2,
319 7, 2, 2, 2, 2, 2, 2, 2,
320 7, 2, 2, 2, 2, 2, 2, 2,
321};
322
323static const u8 diffserv4[] = {
324 0, 2, 0, 0, 2, 0, 0, 0,
325 1, 0, 0, 0, 0, 0, 0, 0,
326 2, 0, 2, 0, 2, 0, 2, 0,
327 2, 0, 2, 0, 2, 0, 2, 0,
328 3, 0, 2, 0, 2, 0, 2, 0,
329 3, 0, 0, 0, 3, 0, 3, 0,
330 3, 0, 0, 0, 0, 0, 0, 0,
331 3, 0, 0, 0, 0, 0, 0, 0,
332};
333
334static const u8 diffserv3[] = {
335 0, 0, 0, 0, 2, 0, 0, 0,
336 1, 0, 0, 0, 0, 0, 0, 0,
337 0, 0, 0, 0, 0, 0, 0, 0,
338 0, 0, 0, 0, 0, 0, 0, 0,
339 0, 0, 0, 0, 0, 0, 0, 0,
340 0, 0, 0, 0, 2, 0, 2, 0,
341 2, 0, 0, 0, 0, 0, 0, 0,
342 2, 0, 0, 0, 0, 0, 0, 0,
343};
344
345static const u8 besteffort[] = {
346 0, 0, 0, 0, 0, 0, 0, 0,
347 0, 0, 0, 0, 0, 0, 0, 0,
348 0, 0, 0, 0, 0, 0, 0, 0,
349 0, 0, 0, 0, 0, 0, 0, 0,
350 0, 0, 0, 0, 0, 0, 0, 0,
351 0, 0, 0, 0, 0, 0, 0, 0,
352 0, 0, 0, 0, 0, 0, 0, 0,
353 0, 0, 0, 0, 0, 0, 0, 0,
354};
355
356/* tin priority order for stats dumping */
357
358static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
359static const u8 bulk_order[] = {1, 0, 2, 3};
360
361#define REC_INV_SQRT_CACHE (16)
362static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
363
364/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
365 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
366 *
367 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
368 */
369
370static void cobalt_newton_step(struct cobalt_vars *vars)
371{
372 u32 invsqrt, invsqrt2;
373 u64 val;
374
375 invsqrt = vars->rec_inv_sqrt;
376 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
377 val = (3LL << 32) - ((u64)vars->count * invsqrt2);
378
379 val >>= 2; /* avoid overflow in following multiply */
380 val = (val * invsqrt) >> (32 - 2 + 1);
381
382 vars->rec_inv_sqrt = val;
383}
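In conventional notation, the step above is one Newton-Raphson iteration for the reciprocal square root; a LaTeX sketch of what the Q0.32 fixed-point arithmetic computes:

	x_{k+1} = \tfrac{1}{2} x_k \left(3 - c\, x_k^2\right)

where c is vars->count and x_k is the real value represented by rec_inv_sqrt; the iteration converges to 1/\sqrt{c}, and the shifts by 32 and by (32 - 2 + 1) merely rescale the intermediate products back into the 32-bit fixed-point representation.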
384
385static void cobalt_invsqrt(struct cobalt_vars *vars)
386{
387 if (vars->count < REC_INV_SQRT_CACHE)
388 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
389 else
390 cobalt_newton_step(vars);
391}
392
393/* There is a big difference in timing between the accurate values placed in
394 * the cache and the approximations given by a single Newton step for small
395 * count values, particularly when stepping from count 1 to 2 or vice versa.
396 * Above 16, a single Newton step gives sufficient accuracy in either
397 * direction, given the precision stored.
398 *
399 * The magnitude of the error when stepping up to count 2 is such as to give
400 * the value that *should* have been produced at count 4.
401 */
402
403static void cobalt_cache_init(void)
404{
405 struct cobalt_vars v;
406
407 memset(&v, 0, sizeof(v));
408 v.rec_inv_sqrt = ~0U;
409 cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
410
411 for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
412 cobalt_newton_step(&v);
413 cobalt_newton_step(&v);
414 cobalt_newton_step(&v);
415 cobalt_newton_step(&v);
416
417 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
418 }
419}
420
421static void cobalt_vars_init(struct cobalt_vars *vars)
422{
423 memset(vars, 0, sizeof(*vars));
424
425 if (!cobalt_rec_inv_sqrt_cache[0]) {
426 cobalt_cache_init();
427 cobalt_rec_inv_sqrt_cache[0] = ~0;
428 }
429}
430
431/* CoDel control_law is t + interval/sqrt(count)
432 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
433 * both sqrt() and divide operation.
434 */
435static ktime_t cobalt_control(ktime_t t,
436 u64 interval,
437 u32 rec_inv_sqrt)
438{
439 return ktime_add_ns(t, reciprocal_scale(interval,
440 rec_inv_sqrt));
441}
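Since rec_inv_sqrt is the Q0.32 encoding of 1/sqrt(count), the multiply-and-shift performed by reciprocal_scale() yields approximately interval / sqrt(count), so this helper implements the CoDel control law t_next = t + interval/\sqrt{count} without any division or square root at run time.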
442
443/* Call this when a packet had to be dropped due to queue overflow. Returns
444 * true if the BLUE state was quiescent before but active after this call.
445 */
446static bool cobalt_queue_full(struct cobalt_vars *vars,
447 struct cobalt_params *p,
448 ktime_t now)
449{
450 bool up = false;
451
452 if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
453 up = !vars->p_drop;
454 vars->p_drop += p->p_inc;
455 if (vars->p_drop < p->p_inc)
456 vars->p_drop = ~0;
457 vars->blue_timer = now;
458 }
459 vars->dropping = true;
460 vars->drop_next = now;
461 if (!vars->count)
462 vars->count = 1;
463
464 return up;
465}
466
467/* Call this when the queue was serviced but turned out to be empty. Returns
468 * true if the BLUE state was active before but quiescent after this call.
469 */
470static bool cobalt_queue_empty(struct cobalt_vars *vars,
471 struct cobalt_params *p,
472 ktime_t now)
473{
474 bool down = false;
475
476 if (vars->p_drop &&
477 ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
478 if (vars->p_drop < p->p_dec)
479 vars->p_drop = 0;
480 else
481 vars->p_drop -= p->p_dec;
482 vars->blue_timer = now;
483 down = !vars->p_drop;
484 }
485 vars->dropping = false;
486
487 if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
488 vars->count--;
489 cobalt_invsqrt(vars);
490 vars->drop_next = cobalt_control(vars->drop_next,
491 p->interval,
492 vars->rec_inv_sqrt);
493 }
494
495 return down;
496}
497
498/* Call this with a freshly dequeued packet for possible congestion marking.
499 * Returns true as an instruction to drop the packet, false for delivery.
500 */
501static bool cobalt_should_drop(struct cobalt_vars *vars,
502 struct cobalt_params *p,
503 ktime_t now,
504 struct sk_buff *skb,
505 u32 bulk_flows)
506{
507 bool next_due, over_target, drop = false;
508 ktime_t schedule;
509 u64 sojourn;
510
511/* The 'schedule' variable records, in its sign, whether 'now' is before or
512 * after 'drop_next'. This allows 'drop_next' to be updated before the next
513 * scheduling decision is actually branched, without destroying that
514 * information. Similarly, the first 'schedule' value calculated is preserved
515 * in the boolean 'next_due'.
516 *
517 * As for 'drop_next', we take advantage of the fact that 'interval' is both
518 * the delay between first exceeding 'target' and the first signalling event,
519 * *and* the scaling factor for the signalling frequency. It's therefore very
520 * natural to use a single mechanism for both purposes, and eliminates a
521 * significant amount of reference Codel's spaghetti code. To help with this,
522 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
523 * as possible to 1.0 in fixed-point.
524 */
525
526 sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
527 schedule = ktime_sub(now, vars->drop_next);
528 over_target = sojourn > p->target &&
529 sojourn > p->mtu_time * bulk_flows * 2 &&
530 sojourn > p->mtu_time * 4;
531 next_due = vars->count && ktime_to_ns(schedule) >= 0;
532
533 vars->ecn_marked = false;
534
535 if (over_target) {
536 if (!vars->dropping) {
537 vars->dropping = true;
538 vars->drop_next = cobalt_control(now,
539 p->interval,
540 vars->rec_inv_sqrt);
541 }
542 if (!vars->count)
543 vars->count = 1;
544 } else if (vars->dropping) {
545 vars->dropping = false;
546 }
547
548 if (next_due && vars->dropping) {
549 /* Use ECN mark if possible, otherwise drop */
550 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
551
552 vars->count++;
553 if (!vars->count)
554 vars->count--;
555 cobalt_invsqrt(vars);
556 vars->drop_next = cobalt_control(vars->drop_next,
557 p->interval,
558 vars->rec_inv_sqrt);
559 schedule = ktime_sub(now, vars->drop_next);
560 } else {
561 while (next_due) {
562 vars->count--;
563 cobalt_invsqrt(vars);
564 vars->drop_next = cobalt_control(vars->drop_next,
565 p->interval,
566 vars->rec_inv_sqrt);
567 schedule = ktime_sub(now, vars->drop_next);
568 next_due = vars->count && ktime_to_ns(schedule) >= 0;
569 }
570 }
571
572 /* Simple BLUE implementation. Lack of ECN is deliberate. */
573 if (vars->p_drop)
574 drop |= (prandom_u32() < vars->p_drop);
575
576 /* Overload the drop_next field as an activity timeout */
577 if (!vars->count)
578 vars->drop_next = ktime_add_ns(now, p->interval);
579 else if (ktime_to_ns(schedule) > 0 && !drop)
580 vars->drop_next = now;
581
582 return drop;
583}
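Summarising the decision above: a dequeued packet becomes a marking/dropping candidate only while its sojourn time exceeds the target and both MTU-scaled floors; once in the dropping state, signals are emitted on the CoDel schedule tracked in drop_next, preferring an ECN mark over an actual drop, with the signalling interval shrinking as 1/sqrt(count). Independently, BLUE's p_drop adds a random drop probability for queues that have recently overflowed, and for an idle queue drop_next doubles as an activity timeout.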
584
585static void cake_update_flowkeys(struct flow_keys *keys,
586 const struct sk_buff *skb)
587{
588#if IS_ENABLED(CONFIG_NF_CONNTRACK)
589 struct nf_conntrack_tuple tuple = {};
590 bool rev = !skb->_nfct;
591
592 if (tc_skb_protocol(skb) != htons(ETH_P_IP))
593 return;
594
595 if (!nf_ct_get_tuple_skb(&tuple, skb))
596 return;
597
598 keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
599 keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
600
601 if (keys->ports.ports) {
602 keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
603 keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
604 }
605#endif
606}
607
 608/* Several of Cake's flow modes are multi-bit values; the masked checks
 609 * below for them therefore also match triple-isolate mode.
610 */
611
612static bool cake_dsrc(int flow_mode)
613{
614 return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
615}
616
617static bool cake_ddst(int flow_mode)
618{
619 return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
620}
621
622static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
623 int flow_mode, u16 flow_override, u16 host_override)
624{
625 u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
626 u16 reduced_hash, srchost_idx, dsthost_idx;
627 struct flow_keys keys, host_keys;
628
629 if (unlikely(flow_mode == CAKE_FLOW_NONE))
630 return 0;
631
632 /* If both overrides are set we can skip packet dissection entirely */
633 if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
634 (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
635 goto skip_hash;
636
637 skb_flow_dissect_flow_keys(skb, &keys,
638 FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
639
640 if (flow_mode & CAKE_FLOW_NAT_FLAG)
641 cake_update_flowkeys(&keys, skb);
642
643 /* flow_hash_from_keys() sorts the addresses by value, so we have
644 * to preserve their order in a separate data structure to treat
645 * src and dst host addresses as independently selectable.
646 */
647 host_keys = keys;
648 host_keys.ports.ports = 0;
649 host_keys.basic.ip_proto = 0;
650 host_keys.keyid.keyid = 0;
651 host_keys.tags.flow_label = 0;
652
653 switch (host_keys.control.addr_type) {
654 case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
655 host_keys.addrs.v4addrs.src = 0;
656 dsthost_hash = flow_hash_from_keys(&host_keys);
657 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
658 host_keys.addrs.v4addrs.dst = 0;
659 srchost_hash = flow_hash_from_keys(&host_keys);
660 break;
661
662 case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
663 memset(&host_keys.addrs.v6addrs.src, 0,
664 sizeof(host_keys.addrs.v6addrs.src));
665 dsthost_hash = flow_hash_from_keys(&host_keys);
666 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
667 memset(&host_keys.addrs.v6addrs.dst, 0,
668 sizeof(host_keys.addrs.v6addrs.dst));
669 srchost_hash = flow_hash_from_keys(&host_keys);
670 break;
671
672 default:
673 dsthost_hash = 0;
674 srchost_hash = 0;
675 }
676
677 /* This *must* be after the above switch, since as a
678 * side-effect it sorts the src and dst addresses.
679 */
680 if (flow_mode & CAKE_FLOW_FLOWS)
681 flow_hash = flow_hash_from_keys(&keys);
682
683skip_hash:
684 if (flow_override)
685 flow_hash = flow_override - 1;
686 if (host_override) {
687 dsthost_hash = host_override - 1;
688 srchost_hash = host_override - 1;
689 }
690
691 if (!(flow_mode & CAKE_FLOW_FLOWS)) {
692 if (flow_mode & CAKE_FLOW_SRC_IP)
693 flow_hash ^= srchost_hash;
694
695 if (flow_mode & CAKE_FLOW_DST_IP)
696 flow_hash ^= dsthost_hash;
697 }
698
699 reduced_hash = flow_hash % CAKE_QUEUES;
700
701 /* set-associative hashing */
702 /* fast path if no hash collision (direct lookup succeeds) */
703 if (likely(q->tags[reduced_hash] == flow_hash &&
704 q->flows[reduced_hash].set)) {
705 q->way_directs++;
706 } else {
707 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
708 u32 outer_hash = reduced_hash - inner_hash;
709 bool allocate_src = false;
710 bool allocate_dst = false;
711 u32 i, k;
712
713 /* check if any active queue in the set is reserved for
714 * this flow.
715 */
716 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
717 i++, k = (k + 1) % CAKE_SET_WAYS) {
718 if (q->tags[outer_hash + k] == flow_hash) {
719 if (i)
720 q->way_hits++;
721
722 if (!q->flows[outer_hash + k].set) {
723 /* need to increment host refcnts */
724 allocate_src = cake_dsrc(flow_mode);
725 allocate_dst = cake_ddst(flow_mode);
726 }
727
728 goto found;
729 }
730 }
731
732 /* no queue is reserved for this flow, look for an
733 * empty one.
734 */
735 for (i = 0; i < CAKE_SET_WAYS;
736 i++, k = (k + 1) % CAKE_SET_WAYS) {
737 if (!q->flows[outer_hash + k].set) {
738 q->way_misses++;
739 allocate_src = cake_dsrc(flow_mode);
740 allocate_dst = cake_ddst(flow_mode);
741 goto found;
742 }
743 }
744
745 /* With no empty queues, default to the original
746 * queue, accept the collision, update the host tags.
747 */
748 q->way_collisions++;
749 q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
750 q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
751 allocate_src = cake_dsrc(flow_mode);
752 allocate_dst = cake_ddst(flow_mode);
753found:
754 /* reserve queue for future packets in same flow */
755 reduced_hash = outer_hash + k;
756 q->tags[reduced_hash] = flow_hash;
757
758 if (allocate_src) {
759 srchost_idx = srchost_hash % CAKE_QUEUES;
760 inner_hash = srchost_idx % CAKE_SET_WAYS;
761 outer_hash = srchost_idx - inner_hash;
762 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
763 i++, k = (k + 1) % CAKE_SET_WAYS) {
764 if (q->hosts[outer_hash + k].srchost_tag ==
765 srchost_hash)
766 goto found_src;
767 }
768 for (i = 0; i < CAKE_SET_WAYS;
769 i++, k = (k + 1) % CAKE_SET_WAYS) {
770 if (!q->hosts[outer_hash + k].srchost_refcnt)
771 break;
772 }
773 q->hosts[outer_hash + k].srchost_tag = srchost_hash;
774found_src:
775 srchost_idx = outer_hash + k;
776 q->hosts[srchost_idx].srchost_refcnt++;
777 q->flows[reduced_hash].srchost = srchost_idx;
778 }
779
780 if (allocate_dst) {
781 dsthost_idx = dsthost_hash % CAKE_QUEUES;
782 inner_hash = dsthost_idx % CAKE_SET_WAYS;
783 outer_hash = dsthost_idx - inner_hash;
784 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
785 i++, k = (k + 1) % CAKE_SET_WAYS) {
786 if (q->hosts[outer_hash + k].dsthost_tag ==
787 dsthost_hash)
788 goto found_dst;
789 }
790 for (i = 0; i < CAKE_SET_WAYS;
791 i++, k = (k + 1) % CAKE_SET_WAYS) {
792 if (!q->hosts[outer_hash + k].dsthost_refcnt)
793 break;
794 }
795 q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
796found_dst:
797 dsthost_idx = outer_hash + k;
798 q->hosts[dsthost_idx].dsthost_refcnt++;
799 q->flows[reduced_hash].dsthost = dsthost_idx;
800 }
801 }
802
803 return reduced_hash;
804}
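A worked example of the set-associative lookup above, using the constants defined earlier (CAKE_QUEUES = 1024, CAKE_SET_WAYS = 8): a flow whose hash reduces to 530 gets inner_hash = 530 % 8 = 2 and outer_hash = 528, so the search probes slots 530, 531, ..., 535 and then wraps to 528 and 529. A slot in that set already carrying the same tag is reused (a way hit), an empty slot is claimed (a way miss), and only if all eight ways belong to other flows does the packet fall back to its original slot and count a collision.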
805
806/* helper functions : might be changed when/if skb use a standard list_head */
807/* remove one skb from head of slot queue */
808
809static struct sk_buff *dequeue_head(struct cake_flow *flow)
810{
811 struct sk_buff *skb = flow->head;
812
813 if (skb) {
814 flow->head = skb->next;
815 skb->next = NULL;
816 }
817
818 return skb;
819}
820
821/* add skb to flow queue (tail add) */
822
823static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
824{
825 if (!flow->head)
826 flow->head = skb;
827 else
828 flow->tail->next = skb;
829 flow->tail = skb;
830 skb->next = NULL;
831}
832
833static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
834 struct ipv6hdr *buf)
835{
836 unsigned int offset = skb_network_offset(skb);
837 struct iphdr *iph;
838
839 iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
840
841 if (!iph)
842 return NULL;
843
844 if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
845 return skb_header_pointer(skb, offset + iph->ihl * 4,
846 sizeof(struct ipv6hdr), buf);
847
848 else if (iph->version == 4)
849 return iph;
850
851 else if (iph->version == 6)
852 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
853 buf);
854
855 return NULL;
856}
857
858static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
859 void *buf, unsigned int bufsize)
860{
861 unsigned int offset = skb_network_offset(skb);
862 const struct ipv6hdr *ipv6h;
863 const struct tcphdr *tcph;
864 const struct iphdr *iph;
865 struct ipv6hdr _ipv6h;
866 struct tcphdr _tcph;
867
868 ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
869
870 if (!ipv6h)
871 return NULL;
872
873 if (ipv6h->version == 4) {
874 iph = (struct iphdr *)ipv6h;
875 offset += iph->ihl * 4;
876
877 /* special-case 6in4 tunnelling, as that is a common way to get
878 * v6 connectivity in the home
879 */
880 if (iph->protocol == IPPROTO_IPV6) {
881 ipv6h = skb_header_pointer(skb, offset,
882 sizeof(_ipv6h), &_ipv6h);
883
884 if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
885 return NULL;
886
887 offset += sizeof(struct ipv6hdr);
888
889 } else if (iph->protocol != IPPROTO_TCP) {
890 return NULL;
891 }
892
893 } else if (ipv6h->version == 6) {
894 if (ipv6h->nexthdr != IPPROTO_TCP)
895 return NULL;
896
897 offset += sizeof(struct ipv6hdr);
898 } else {
899 return NULL;
900 }
901
902 tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
903 if (!tcph)
904 return NULL;
905
906 return skb_header_pointer(skb, offset,
907 min(__tcp_hdrlen(tcph), bufsize), buf);
908}
909
910static const void *cake_get_tcpopt(const struct tcphdr *tcph,
911 int code, int *oplen)
912{
913 /* inspired by tcp_parse_options in tcp_input.c */
914 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
915 const u8 *ptr = (const u8 *)(tcph + 1);
916
917 while (length > 0) {
918 int opcode = *ptr++;
919 int opsize;
920
921 if (opcode == TCPOPT_EOL)
922 break;
923 if (opcode == TCPOPT_NOP) {
924 length--;
925 continue;
926 }
927 opsize = *ptr++;
928 if (opsize < 2 || opsize > length)
929 break;
930
931 if (opcode == code) {
932 *oplen = opsize;
933 return ptr;
934 }
935
936 ptr += opsize - 2;
937 length -= opsize;
938 }
939
940 return NULL;
941}
942
943/* Compare two SACK sequences. A sequence is considered greater if it SACKs more
 944 * bytes than the other. In the case where both sequences ACK bytes that the
 945 * other doesn't, A is considered greater. DSACKs in A also make A be
946 * considered greater.
947 *
948 * @return -1, 0 or 1 as normal compare functions
949 */
950static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
951 const struct tcphdr *tcph_b)
952{
953 const struct tcp_sack_block_wire *sack_a, *sack_b;
954 u32 ack_seq_a = ntohl(tcph_a->ack_seq);
955 u32 bytes_a = 0, bytes_b = 0;
956 int oplen_a, oplen_b;
957 bool first = true;
958
959 sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
960 sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
961
962 /* pointers point to option contents */
963 oplen_a -= TCPOLEN_SACK_BASE;
964 oplen_b -= TCPOLEN_SACK_BASE;
965
966 if (sack_a && oplen_a >= sizeof(*sack_a) &&
967 (!sack_b || oplen_b < sizeof(*sack_b)))
968 return -1;
969 else if (sack_b && oplen_b >= sizeof(*sack_b) &&
970 (!sack_a || oplen_a < sizeof(*sack_a)))
971 return 1;
972 else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
973 (!sack_b || oplen_b < sizeof(*sack_b)))
974 return 0;
975
976 while (oplen_a >= sizeof(*sack_a)) {
977 const struct tcp_sack_block_wire *sack_tmp = sack_b;
978 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
979 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
980 int oplen_tmp = oplen_b;
981 bool found = false;
982
983 /* DSACK; always considered greater to prevent dropping */
984 if (before(start_a, ack_seq_a))
985 return -1;
986
987 bytes_a += end_a - start_a;
988
989 while (oplen_tmp >= sizeof(*sack_tmp)) {
990 u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
991 u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
992
993 /* first time through we count the total size */
994 if (first)
995 bytes_b += end_b - start_b;
996
997 if (!after(start_b, start_a) && !before(end_b, end_a)) {
998 found = true;
999 if (!first)
1000 break;
1001 }
1002 oplen_tmp -= sizeof(*sack_tmp);
1003 sack_tmp++;
1004 }
1005
1006 if (!found)
1007 return -1;
1008
1009 oplen_a -= sizeof(*sack_a);
1010 sack_a++;
1011 first = false;
1012 }
1013
1014 /* If we made it this far, all ranges SACKed by A are covered by B, so
1015 * either the SACKs are equal, or B SACKs more bytes.
1016 */
1017 return bytes_b > bytes_a ? 1 : 0;
1018}
1019
1020static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1021 u32 *tsval, u32 *tsecr)
1022{
1023 const u8 *ptr;
1024 int opsize;
1025
1026 ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1027
1028 if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1029 *tsval = get_unaligned_be32(ptr);
1030 *tsecr = get_unaligned_be32(ptr + 4);
1031 }
1032}
1033
1034static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1035 u32 tstamp_new, u32 tsecr_new)
1036{
1037 /* inspired by tcp_parse_options in tcp_input.c */
1038 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1039 const u8 *ptr = (const u8 *)(tcph + 1);
1040 u32 tstamp, tsecr;
1041
1042 /* 3 reserved flags must be unset to avoid future breakage
1043 * ACK must be set
1044 * ECE/CWR are handled separately
1045 * All other flags URG/PSH/RST/SYN/FIN must be unset
1046 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1047 * 0x00C00000 = CWR/ECE (handled separately)
1048 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1049 */
1050 if (((tcp_flag_word(tcph) &
1051 cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1052 return false;
1053
1054 while (length > 0) {
1055 int opcode = *ptr++;
1056 int opsize;
1057
1058 if (opcode == TCPOPT_EOL)
1059 break;
1060 if (opcode == TCPOPT_NOP) {
1061 length--;
1062 continue;
1063 }
1064 opsize = *ptr++;
1065 if (opsize < 2 || opsize > length)
1066 break;
1067
1068 switch (opcode) {
1069 case TCPOPT_MD5SIG: /* doesn't influence state */
1070 break;
1071
1072 case TCPOPT_SACK: /* stricter checking performed later */
1073 if (opsize % 8 != 2)
1074 return false;
1075 break;
1076
1077 case TCPOPT_TIMESTAMP:
1078 /* only drop timestamps lower than new */
1079 if (opsize != TCPOLEN_TIMESTAMP)
1080 return false;
1081 tstamp = get_unaligned_be32(ptr);
1082 tsecr = get_unaligned_be32(ptr + 4);
1083 if (after(tstamp, tstamp_new) ||
1084 after(tsecr, tsecr_new))
1085 return false;
1086 break;
1087
1088 case TCPOPT_MSS: /* these should only be set on SYN */
1089 case TCPOPT_WINDOW:
1090 case TCPOPT_SACK_PERM:
1091 case TCPOPT_FASTOPEN:
1092 case TCPOPT_EXP:
1093 default: /* don't drop if any unknown options are present */
1094 return false;
1095 }
1096
1097 ptr += opsize - 2;
1098 length -= opsize;
1099 }
1100
1101 return true;
1102}
1103
1104static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1105 struct cake_flow *flow)
1106{
1107 bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1108 struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1109 struct sk_buff *skb_check, *skb_prev = NULL;
1110 const struct ipv6hdr *ipv6h, *ipv6h_check;
1111 unsigned char _tcph[64], _tcph_check[64];
1112 const struct tcphdr *tcph, *tcph_check;
1113 const struct iphdr *iph, *iph_check;
1114 struct ipv6hdr _iph, _iph_check;
1115 const struct sk_buff *skb;
1116 int seglen, num_found = 0;
1117 u32 tstamp = 0, tsecr = 0;
1118 __be32 elig_flags = 0;
1119 int sack_comp;
1120
1121 /* no other possible ACKs to filter */
1122 if (flow->head == flow->tail)
1123 return NULL;
1124
1125 skb = flow->tail;
1126 tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1127 iph = cake_get_iphdr(skb, &_iph);
1128 if (!tcph)
1129 return NULL;
1130
1131 cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1132
1133 /* the 'triggering' packet need only have the ACK flag set.
1134 * also check that SYN is not set, as there won't be any previous ACKs.
1135 */
1136 if ((tcp_flag_word(tcph) &
1137 (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1138 return NULL;
1139
1140 /* the 'triggering' ACK is at the tail of the queue; we have already
1141 * returned if it is the only packet in the flow. Loop through the rest
1142 * of the queue looking for pure ACKs with the same 5-tuple as the
1143 * triggering one.
1144 */
1145 for (skb_check = flow->head;
1146 skb_check && skb_check != skb;
1147 skb_prev = skb_check, skb_check = skb_check->next) {
1148 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1149 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1150 sizeof(_tcph_check));
1151
1152 /* only TCP packets with matching 5-tuple are eligible, and only
1153 * drop safe headers
1154 */
1155 if (!tcph_check || iph->version != iph_check->version ||
1156 tcph_check->source != tcph->source ||
1157 tcph_check->dest != tcph->dest)
1158 continue;
1159
1160 if (iph_check->version == 4) {
1161 if (iph_check->saddr != iph->saddr ||
1162 iph_check->daddr != iph->daddr)
1163 continue;
1164
1165 seglen = ntohs(iph_check->tot_len) -
1166 (4 * iph_check->ihl);
1167 } else if (iph_check->version == 6) {
1168 ipv6h = (struct ipv6hdr *)iph;
1169 ipv6h_check = (struct ipv6hdr *)iph_check;
1170
1171 if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1172 ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1173 continue;
1174
1175 seglen = ntohs(ipv6h_check->payload_len);
1176 } else {
1177 WARN_ON(1); /* shouldn't happen */
1178 continue;
1179 }
1180
1181 /* If the ECE/CWR flags changed from the previous eligible
1182 * packet in the same flow, we should no longer be dropping that
1183 * previous packet as this would lose information.
1184 */
1185 if (elig_ack && (tcp_flag_word(tcph_check) &
1186 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1187 elig_ack = NULL;
1188 elig_ack_prev = NULL;
1189 num_found--;
1190 }
1191
1192 /* Check TCP options and flags, don't drop ACKs with segment
1193 * data, and don't drop ACKs with a higher cumulative ACK
1194 * counter than the triggering packet. Check ACK seqno here to
1195 * avoid parsing SACK options of packets we are going to exclude
1196 * anyway.
1197 */
1198 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1199 (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1200 after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1201 continue;
1202
1203 /* Check SACK options. The triggering packet must SACK more data
1204 * than the ACK under consideration, or SACK the same range but
1205 * have a larger cumulative ACK counter. The latter is a
1206 * pathological case, but is contained in the following check
1207 * anyway, just to be safe.
1208 */
1209 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1210
1211 if (sack_comp < 0 ||
1212 (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1213 sack_comp == 0))
1214 continue;
1215
1216 /* At this point we have found an eligible pure ACK to drop; if
1217 * we are in aggressive mode, we are done. Otherwise, keep
1218 * searching unless this is the second eligible ACK we
1219 * found.
1220 *
1221 * Since we want to drop the ACK closest to the head of the queue,
1222 * save the first eligible ACK we find, even if we need to loop
1223 * again.
1224 */
1225 if (!elig_ack) {
1226 elig_ack = skb_check;
1227 elig_ack_prev = skb_prev;
1228 elig_flags = (tcp_flag_word(tcph_check)
1229 & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1230 }
1231
1232 if (num_found++ > 0)
1233 goto found;
1234 }
1235
1236 /* We made it through the queue without finding two eligible ACKs. If
1237 * we found a single eligible ACK we can drop it in aggressive mode if
1238 * we can guarantee that this does not interfere with ECN flag
1239 * information. We ensure this by dropping it only if the enqueued
1240 * packet is consecutive with the eligible ACK, and their flags match.
1241 */
1242 if (elig_ack && aggressive && elig_ack->next == skb &&
1243 (elig_flags == (tcp_flag_word(tcph) &
1244 (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1245 goto found;
1246
1247 return NULL;
1248
1249found:
1250 if (elig_ack_prev)
1251 elig_ack_prev->next = elig_ack->next;
1252 else
1253 flow->head = elig_ack->next;
1254
1255 elig_ack->next = NULL;
1256
1257 return elig_ack;
1258}
1259
1260static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1261{
1262 avg -= avg >> shift;
1263 avg += sample >> shift;
1264 return avg;
1265}
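/* With weight w = 1/2^shift this is the usual exponentially weighted moving
 * average avg <- (1 - w) * avg + w * sample, computed with shifts only;
 * e.g. shift = 8 folds roughly 1/256th of each new sample into the average.
 */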
1266
1267static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1268{
1269 if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1270 len -= off;
1271
1272 if (q->max_netlen < len)
1273 q->max_netlen = len;
1274 if (q->min_netlen > len)
1275 q->min_netlen = len;
1276
1277 len += q->rate_overhead;
1278
1279 if (len < q->rate_mpu)
1280 len = q->rate_mpu;
1281
1282 if (q->atm_mode == CAKE_ATM_ATM) {
1283 len += 47;
1284 len /= 48;
1285 len *= 53;
1286 } else if (q->atm_mode == CAKE_ATM_PTM) {
1287 /* Add one byte per 64 bytes or part thereof.
1288 * This is conservative and easier to calculate than the
1289 * precise value.
1290 */
1291 len += (len + 63) / 64;
1292 }
1293
1294 if (q->max_adjlen < len)
1295 q->max_adjlen = len;
1296 if (q->min_adjlen > len)
1297 q->min_adjlen = len;
1298
1299 return len;
1300}
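/* Worked example of the framing compensation: in CAKE_ATM_ATM mode a packet
 * that is 1500 bytes after the overhead/mpu adjustments above occupies
 * (1500 + 47) / 48 = 32 ATM cells, i.e. 32 * 53 = 1696 bytes on the wire,
 * which is exactly what the "+47, /48, *53" sequence computes. In
 * CAKE_ATM_PTM mode the 64b/65b encoding is approximated by one extra byte
 * per started 64-byte block, so the same packet is charged 1500 + 24 = 1524.
 */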
1301
1302static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1303{
1304 const struct skb_shared_info *shinfo = skb_shinfo(skb);
1305 unsigned int hdr_len, last_len = 0;
1306 u32 off = skb_network_offset(skb);
1307 u32 len = qdisc_pkt_len(skb);
1308 u16 segs = 1;
1309
1310 q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1311
1312 if (!shinfo->gso_size)
1313 return cake_calc_overhead(q, len, off);
1314
1315 /* borrowed from qdisc_pkt_len_init() */
1316 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1317
1318 /* + transport layer */
1319 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1320 SKB_GSO_TCPV6))) {
1321 const struct tcphdr *th;
1322 struct tcphdr _tcphdr;
1323
1324 th = skb_header_pointer(skb, skb_transport_offset(skb),
1325 sizeof(_tcphdr), &_tcphdr);
1326 if (likely(th))
1327 hdr_len += __tcp_hdrlen(th);
1328 } else {
1329 struct udphdr _udphdr;
1330
1331 if (skb_header_pointer(skb, skb_transport_offset(skb),
1332 sizeof(_udphdr), &_udphdr))
1333 hdr_len += sizeof(struct udphdr);
1334 }
1335
1336 if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1337 segs = DIV_ROUND_UP(skb->len - hdr_len,
1338 shinfo->gso_size);
1339 else
1340 segs = shinfo->gso_segs;
1341
1342 len = shinfo->gso_size + hdr_len;
1343 last_len = skb->len - shinfo->gso_size * (segs - 1);
1344
1345 return (cake_calc_overhead(q, len, off) * (segs - 1) +
1346 cake_calc_overhead(q, last_len, off));
1347}
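/* For GSO super-packets the overhead is therefore charged per segment:
 * each of the first (segs - 1) segments is billed as gso_size plus the
 * MAC/IP/transport header length, and the final segment as whatever length
 * remains, so the shaper accounts approximately the bytes the wire would
 * carry after segmentation rather than the aggregate skb length.
 */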
1348
1349static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1350{
1351 struct cake_heap_entry ii = q->overflow_heap[i];
1352 struct cake_heap_entry jj = q->overflow_heap[j];
1353
1354 q->overflow_heap[i] = jj;
1355 q->overflow_heap[j] = ii;
1356
1357 q->tins[ii.t].overflow_idx[ii.b] = j;
1358 q->tins[jj.t].overflow_idx[jj.b] = i;
1359}
1360
1361static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1362{
1363 struct cake_heap_entry ii = q->overflow_heap[i];
1364
1365 return q->tins[ii.t].backlogs[ii.b];
1366}
1367
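/* The overflow heap is an array-backed binary max-heap over all tin/flow
 * pairs, keyed by per-flow backlog: the children of index i live at
 * 2*i + 1 and 2*i + 2 and the parent at (i - 1) >> 1. cake_heapify() sifts
 * an entry down after its backlog shrinks, cake_heapify_up() sifts it up
 * after it grows, so the root always identifies the longest queue for
 * cake_drop() to prune.
 */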
1368static void cake_heapify(struct cake_sched_data *q, u16 i)
1369{
1370 static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1371 u32 mb = cake_heap_get_backlog(q, i);
1372 u32 m = i;
1373
1374 while (m < a) {
1375 u32 l = m + m + 1;
1376 u32 r = l + 1;
1377
1378 if (l < a) {
1379 u32 lb = cake_heap_get_backlog(q, l);
1380
1381 if (lb > mb) {
1382 m = l;
1383 mb = lb;
1384 }
1385 }
1386
1387 if (r < a) {
1388 u32 rb = cake_heap_get_backlog(q, r);
1389
1390 if (rb > mb) {
1391 m = r;
1392 mb = rb;
1393 }
1394 }
1395
1396 if (m != i) {
1397 cake_heap_swap(q, i, m);
1398 i = m;
1399 } else {
1400 break;
1401 }
1402 }
1403}
1404
1405static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1406{
1407 while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1408 u16 p = (i - 1) >> 1;
1409 u32 ib = cake_heap_get_backlog(q, i);
1410 u32 pb = cake_heap_get_backlog(q, p);
1411
1412 if (ib > pb) {
1413 cake_heap_swap(q, i, p);
1414 i = p;
1415 } else {
1416 break;
1417 }
1418 }
1419}
1420
1421static int cake_advance_shaper(struct cake_sched_data *q,
1422 struct cake_tin_data *b,
1423 struct sk_buff *skb,
1424 ktime_t now, bool drop)
1425{
1426 u32 len = get_cobalt_cb(skb)->adjusted_len;
1427
1428 /* charge packet bandwidth to this tin
1429 * and to the global shaper.
1430 */
1431 if (q->rate_ns) {
1432 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1433 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1434 u64 failsafe_dur = global_dur + (global_dur >> 1);
1435
1436 if (ktime_before(b->time_next_packet, now))
1437 b->time_next_packet = ktime_add_ns(b->time_next_packet,
1438 tin_dur);
1439
1440 else if (ktime_before(b->time_next_packet,
1441 ktime_add_ns(now, tin_dur)))
1442 b->time_next_packet = ktime_add_ns(now, tin_dur);
1443
1444 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1445 global_dur);
1446 if (!drop)
1447 q->failsafe_next_packet = \
1448 ktime_add_ns(q->failsafe_next_packet,
1449 failsafe_dur);
1450 }
1451 return len;
1452}
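/* The shaper is thus a virtual clock: each packet charged to it pushes the
 * tin's and the global next-transmission times forward by roughly
 * len * rate_ns >> rate_shft nanoseconds, the packet's serialisation delay
 * at the respective configured rate. The failsafe clock is charged 1.5x
 * that duration but only for packets actually delivered (drop == false);
 * since dequeue may proceed once either the main or the failsafe deadline
 * has passed, charging drops to the main clock alone (as ingress mode does)
 * cannot stall the qdisc indefinitely.
 */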
1453
1454static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1455{
1456 struct cake_sched_data *q = qdisc_priv(sch);
1457 ktime_t now = ktime_get();
1458 u32 idx = 0, tin = 0, len;
1459 struct cake_heap_entry qq;
1460 struct cake_tin_data *b;
1461 struct cake_flow *flow;
1462 struct sk_buff *skb;
1463
1464 if (!q->overflow_timeout) {
1465 int i;
1466 /* Build fresh max-heap */
1467 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1468 cake_heapify(q, i);
1469 }
1470 q->overflow_timeout = 65535;
1471
1472 /* select longest queue for pruning */
1473 qq = q->overflow_heap[0];
1474 tin = qq.t;
1475 idx = qq.b;
1476
1477 b = &q->tins[tin];
1478 flow = &b->flows[idx];
1479 skb = dequeue_head(flow);
1480 if (unlikely(!skb)) {
1481 /* heap has gone wrong, rebuild it next time */
1482 q->overflow_timeout = 0;
1483 return idx + (tin << 16);
1484 }
1485
1486 if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1487 b->unresponsive_flow_count++;
1488
1489 len = qdisc_pkt_len(skb);
1490 q->buffer_used -= skb->truesize;
1491 b->backlogs[idx] -= len;
1492 b->tin_backlog -= len;
1493 sch->qstats.backlog -= len;
1494 qdisc_tree_reduce_backlog(sch, 1, len);
1495
1496 flow->dropped++;
1497 b->tin_dropped++;
1498 sch->qstats.drops++;
1499
1500 if (q->rate_flags & CAKE_FLAG_INGRESS)
1501 cake_advance_shaper(q, b, skb, now, true);
1502
1503 __qdisc_drop(skb, to_free);
1504 sch->q.qlen--;
1505
1506 cake_heapify(q, 0);
1507
1508 return idx + (tin << 16);
1509}
1510
1511static void cake_wash_diffserv(struct sk_buff *skb)
1512{
1513 switch (skb->protocol) {
1514 case htons(ETH_P_IP):
1515 ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1516 break;
1517 case htons(ETH_P_IPV6):
1518 ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1519 break;
1520 default:
1521 break;
1522 }
1523}
1524
1525static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1526{
1527 u8 dscp;
1528
1529 switch (skb->protocol) {
1530 case htons(ETH_P_IP):
1531 dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1532 if (wash && dscp)
1533 ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1534 return dscp;
1535
1536 case htons(ETH_P_IPV6):
1537 dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1538 if (wash && dscp)
1539 ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1540 return dscp;
1541
1542 case htons(ETH_P_ARP):
1543 return 0x38; /* CS7 - Net Control */
1544
1545 default:
1546 /* If there is no Diffserv field, treat as best-effort */
1547 return 0;
1548 }
1549}
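/* The ">> 2" above strips the two ECN bits from the traffic-class byte, so
 * the value returned is the bare 6-bit DSCP used to index q->tin_index[].
 * When washing, ipv4/ipv6_change_dsfield() is passed INET_ECN_MASK as the
 * preserved mask, so only the DSCP bits are zeroed and ECN marking survives.
 */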
1550
1551static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1552 struct sk_buff *skb)
1553{
1554 struct cake_sched_data *q = qdisc_priv(sch);
1555 u32 tin;
1556
1557 if (TC_H_MAJ(skb->priority) == sch->handle &&
1558 TC_H_MIN(skb->priority) > 0 &&
1559 TC_H_MIN(skb->priority) <= q->tin_cnt) {
1560 tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1561
1562 if (q->rate_flags & CAKE_FLAG_WASH)
1563 cake_wash_diffserv(skb);
1564 } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
1565 /* extract the Diffserv Precedence field, if it exists */
1566 /* and clear DSCP bits if washing */
1567 tin = q->tin_index[cake_handle_diffserv(skb,
1568 q->rate_flags & CAKE_FLAG_WASH)];
1569 if (unlikely(tin >= q->tin_cnt))
1570 tin = 0;
1571 } else {
1572 tin = 0;
1573 if (q->rate_flags & CAKE_FLAG_WASH)
1574 cake_wash_diffserv(skb);
1575 }
1576
1577 return &q->tins[tin];
1578}
1579
1580static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1581 struct sk_buff *skb, int flow_mode, int *qerr)
1582{
1583 struct cake_sched_data *q = qdisc_priv(sch);
1584 struct tcf_proto *filter;
1585 struct tcf_result res;
1586 u16 flow = 0, host = 0;
1587 int result;
1588
1589 filter = rcu_dereference_bh(q->filter_list);
1590 if (!filter)
1591 goto hash;
1592
1593 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1594 result = tcf_classify(skb, filter, &res, false);
1595
1596 if (result >= 0) {
1597#ifdef CONFIG_NET_CLS_ACT
1598 switch (result) {
1599 case TC_ACT_STOLEN:
1600 case TC_ACT_QUEUED:
1601 case TC_ACT_TRAP:
1602 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1603 /* fall through */
1604 case TC_ACT_SHOT:
1605 return 0;
1606 }
1607#endif
1608 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1609 flow = TC_H_MIN(res.classid);
1610 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1611 host = TC_H_MAJ(res.classid) >> 16;
1612 }
1613hash:
1614 *t = cake_select_tin(sch, skb);
1615 return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1616}
1617
1618static void cake_reconfigure(struct Qdisc *sch);
1619
1620static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1621 struct sk_buff **to_free)
1622{
1623 struct cake_sched_data *q = qdisc_priv(sch);
1624 int len = qdisc_pkt_len(skb);
1625 int uninitialized_var(ret);
1626 struct sk_buff *ack = NULL;
1627 ktime_t now = ktime_get();
1628 struct cake_tin_data *b;
1629 struct cake_flow *flow;
1630 u32 idx;
1631
1632 /* choose flow to insert into */
1633 idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1634 if (idx == 0) {
1635 if (ret & __NET_XMIT_BYPASS)
1636 qdisc_qstats_drop(sch);
1637 __qdisc_drop(skb, to_free);
1638 return ret;
1639 }
1640 idx--;
1641 flow = &b->flows[idx];
1642
1643 /* ensure shaper state isn't stale */
1644 if (!b->tin_backlog) {
1645 if (ktime_before(b->time_next_packet, now))
1646 b->time_next_packet = now;
1647
1648 if (!sch->q.qlen) {
1649 if (ktime_before(q->time_next_packet, now)) {
1650 q->failsafe_next_packet = now;
1651 q->time_next_packet = now;
1652 } else if (ktime_after(q->time_next_packet, now) &&
1653 ktime_after(q->failsafe_next_packet, now)) {
1654 u64 next = \
1655 min(ktime_to_ns(q->time_next_packet),
1656 ktime_to_ns(
1657 q->failsafe_next_packet));
1658 sch->qstats.overlimits++;
1659 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1660 }
1661 }
1662 }
1663
1664 if (unlikely(len > b->max_skblen))
1665 b->max_skblen = len;
1666
1667 if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1668 struct sk_buff *segs, *nskb;
1669 netdev_features_t features = netif_skb_features(skb);
1670 unsigned int slen = 0;
1671
1672 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1673 if (IS_ERR_OR_NULL(segs))
1674 return qdisc_drop(skb, sch, to_free);
1675
1676 while (segs) {
1677 nskb = segs->next;
1678 segs->next = NULL;
1679 qdisc_skb_cb(segs)->pkt_len = segs->len;
1680 cobalt_set_enqueue_time(segs, now);
1681 get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1682 segs);
1683 flow_queue_add(flow, segs);
1684
1685 sch->q.qlen++;
1686 slen += segs->len;
1687 q->buffer_used += segs->truesize;
1688 b->packets++;
1689 segs = nskb;
1690 }
1691
1692 /* stats */
1693 b->bytes += slen;
1694 b->backlogs[idx] += slen;
1695 b->tin_backlog += slen;
1696 sch->qstats.backlog += slen;
1697 q->avg_window_bytes += slen;
1698
1699 qdisc_tree_reduce_backlog(sch, 1, len);
1700 consume_skb(skb);
1701 } else {
1702 /* not splitting */
1703 cobalt_set_enqueue_time(skb, now);
1704 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1705 flow_queue_add(flow, skb);
1706
1707 if (q->ack_filter)
1708 ack = cake_ack_filter(q, flow);
1709
1710 if (ack) {
1711 b->ack_drops++;
1712 sch->qstats.drops++;
1713 b->bytes += qdisc_pkt_len(ack);
1714 len -= qdisc_pkt_len(ack);
1715 q->buffer_used += skb->truesize - ack->truesize;
1716 if (q->rate_flags & CAKE_FLAG_INGRESS)
1717 cake_advance_shaper(q, b, ack, now, true);
1718
1719 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1720 consume_skb(ack);
1721 } else {
1722 sch->q.qlen++;
1723 q->buffer_used += skb->truesize;
1724 }
1725
1726 /* stats */
1727 b->packets++;
1728 b->bytes += len;
1729 b->backlogs[idx] += len;
1730 b->tin_backlog += len;
1731 sch->qstats.backlog += len;
1732 q->avg_window_bytes += len;
1733 }
1734
1735 if (q->overflow_timeout)
1736 cake_heapify_up(q, b->overflow_idx[idx]);
1737
1738 /* incoming bandwidth capacity estimate */
1739 if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1740 u64 packet_interval = \
1741 ktime_to_ns(ktime_sub(now, q->last_packet_time));
1742
1743 if (packet_interval > NSEC_PER_SEC)
1744 packet_interval = NSEC_PER_SEC;
1745
1746 /* filter out short-term bursts, eg. wifi aggregation */
1747 q->avg_packet_interval = \
1748 cake_ewma(q->avg_packet_interval,
1749 packet_interval,
1750 (packet_interval > q->avg_packet_interval ?
1751 2 : 8));
1752
1753 q->last_packet_time = now;
1754
1755 if (packet_interval > q->avg_packet_interval) {
1756 u64 window_interval = \
1757 ktime_to_ns(ktime_sub(now,
1758 q->avg_window_begin));
1759 u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1760
1761 do_div(b, window_interval);
1762 q->avg_peak_bandwidth =
1763 cake_ewma(q->avg_peak_bandwidth, b,
1764 b > q->avg_peak_bandwidth ? 2 : 8);
1765 q->avg_window_bytes = 0;
1766 q->avg_window_begin = now;
1767
1768 if (ktime_after(now,
1769 ktime_add_ms(q->last_reconfig_time,
1770 250))) {
1771 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1772 cake_reconfigure(sch);
1773 }
1774 }
1775 } else {
1776 q->avg_window_bytes = 0;
1777 q->last_packet_time = now;
1778 }
1779
1780 /* flowchain */
1781 if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1782 struct cake_host *srchost = &b->hosts[flow->srchost];
1783 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1784 u16 host_load = 1;
1785
1786 if (!flow->set) {
1787 list_add_tail(&flow->flowchain, &b->new_flows);
1788 } else {
1789 b->decaying_flow_count--;
1790 list_move_tail(&flow->flowchain, &b->new_flows);
1791 }
1792 flow->set = CAKE_SET_SPARSE;
1793 b->sparse_flow_count++;
1794
1795 if (cake_dsrc(q->flow_mode))
1796 host_load = max(host_load, srchost->srchost_refcnt);
1797
1798 if (cake_ddst(q->flow_mode))
1799 host_load = max(host_load, dsthost->dsthost_refcnt);
1800
1801 flow->deficit = (b->flow_quantum *
1802 quantum_div[host_load]) >> 16;
1803 } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1804 /* this flow was empty, accounted as a sparse flow, but is actually
1805 * in the bulk rotation.
1806 */
1807 flow->set = CAKE_SET_BULK;
1808 b->sparse_flow_count--;
1809 b->bulk_flow_count++;
1810 }
1811
1812 if (q->buffer_used > q->buffer_max_used)
1813 q->buffer_max_used = q->buffer_used;
1814
1815 if (q->buffer_used > q->buffer_limit) {
1816 u32 dropped = 0;
1817
1818 while (q->buffer_used > q->buffer_limit) {
1819 dropped++;
1820 cake_drop(sch, to_free);
1821 }
1822 b->drop_overlimit += dropped;
1823 }
1824 return NET_XMIT_SUCCESS;
1825}
1826
1827static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1828{
1829 struct cake_sched_data *q = qdisc_priv(sch);
1830 struct cake_tin_data *b = &q->tins[q->cur_tin];
1831 struct cake_flow *flow = &b->flows[q->cur_flow];
1832 struct sk_buff *skb = NULL;
1833 u32 len;
1834
1835 if (flow->head) {
1836 skb = dequeue_head(flow);
1837 len = qdisc_pkt_len(skb);
1838 b->backlogs[q->cur_flow] -= len;
1839 b->tin_backlog -= len;
1840 sch->qstats.backlog -= len;
1841 q->buffer_used -= skb->truesize;
1842 sch->q.qlen--;
1843
1844 if (q->overflow_timeout)
1845 cake_heapify(q, b->overflow_idx[q->cur_flow]);
1846 }
1847 return skb;
1848}
1849
1850/* Discard leftover packets from a tin no longer in use. */
1851static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1852{
1853 struct cake_sched_data *q = qdisc_priv(sch);
1854 struct sk_buff *skb;
1855
1856 q->cur_tin = tin;
1857 for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1858 while (!!(skb = cake_dequeue_one(sch)))
1859 kfree_skb(skb);
1860}
1861
1862static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1863{
1864 struct cake_sched_data *q = qdisc_priv(sch);
1865 struct cake_tin_data *b = &q->tins[q->cur_tin];
1866 struct cake_host *srchost, *dsthost;
1867 ktime_t now = ktime_get();
1868 struct cake_flow *flow;
1869 struct list_head *head;
1870 bool first_flow = true;
1871 struct sk_buff *skb;
1872 u16 host_load;
1873 u64 delay;
1874 u32 len;
1875
1876begin:
1877 if (!sch->q.qlen)
1878 return NULL;
1879
1880 /* global hard shaper */
1881 if (ktime_after(q->time_next_packet, now) &&
1882 ktime_after(q->failsafe_next_packet, now)) {
1883 u64 next = min(ktime_to_ns(q->time_next_packet),
1884 ktime_to_ns(q->failsafe_next_packet));
1885
1886 sch->qstats.overlimits++;
1887 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1888 return NULL;
1889 }
1890
1891 /* Choose a class to work on. */
1892 if (!q->rate_ns) {
1893 /* In unlimited mode, can't rely on shaper timings, just balance
1894 * with DRR
1895 */
1896 bool wrapped = false, empty = true;
1897
1898 while (b->tin_deficit < 0 ||
1899 !(b->sparse_flow_count + b->bulk_flow_count)) {
1900 if (b->tin_deficit <= 0)
1901 b->tin_deficit += b->tin_quantum_band;
1902 if (b->sparse_flow_count + b->bulk_flow_count)
1903 empty = false;
1904
1905 q->cur_tin++;
1906 b++;
1907 if (q->cur_tin >= q->tin_cnt) {
1908 q->cur_tin = 0;
1909 b = q->tins;
1910
1911 if (wrapped) {
1912 /* It's possible for q->qlen to be
1913 * nonzero when we actually have no
1914 * packets anywhere.
1915 */
1916 if (empty)
1917 return NULL;
1918 } else {
1919 wrapped = true;
1920 }
1921 }
1922 }
1923 } else {
1924 /* In shaped mode, choose:
1925 * - Highest-priority tin with queue and meeting schedule, or
1926 * - The earliest-scheduled tin with queue.
1927 */
1928 ktime_t best_time = KTIME_MAX;
1929 int tin, best_tin = 0;
1930
1931 for (tin = 0; tin < q->tin_cnt; tin++) {
1932 b = q->tins + tin;
1933 if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1934 ktime_t time_to_pkt = \
1935 ktime_sub(b->time_next_packet, now);
1936
1937 if (ktime_to_ns(time_to_pkt) <= 0 ||
1938 ktime_compare(time_to_pkt,
1939 best_time) <= 0) {
1940 best_time = time_to_pkt;
1941 best_tin = tin;
1942 }
1943 }
1944 }
1945
1946 q->cur_tin = best_tin;
1947 b = q->tins + best_tin;
1948
1949 /* No point in going further if no packets to deliver. */
1950 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1951 return NULL;
1952 }
1953
1954retry:
1955 /* service this class */
1956 head = &b->decaying_flows;
1957 if (!first_flow || list_empty(head)) {
1958 head = &b->new_flows;
1959 if (list_empty(head)) {
1960 head = &b->old_flows;
1961 if (unlikely(list_empty(head))) {
1962 head = &b->decaying_flows;
1963 if (unlikely(list_empty(head)))
1964 goto begin;
1965 }
1966 }
1967 }
1968 flow = list_first_entry(head, struct cake_flow, flowchain);
1969 q->cur_flow = flow - b->flows;
1970 first_flow = false;
1971
1972 /* triple isolation (modified DRR++) */
1973 srchost = &b->hosts[flow->srchost];
1974 dsthost = &b->hosts[flow->dsthost];
1975 host_load = 1;
1976
1977 if (cake_dsrc(q->flow_mode))
1978 host_load = max(host_load, srchost->srchost_refcnt);
1979
1980 if (cake_ddst(q->flow_mode))
1981 host_load = max(host_load, dsthost->dsthost_refcnt);
1982
1983 WARN_ON(host_load > CAKE_QUEUES);
1984
1985 /* flow isolation (DRR++) */
1986 if (flow->deficit <= 0) {
1987 /* The shifted prandom_u32() is a way to apply dithering to
1988 * avoid accumulating roundoff errors
1989 */
1990 flow->deficit += (b->flow_quantum * quantum_div[host_load] +
1991 (prandom_u32() >> 16)) >> 16;
1992 list_move_tail(&flow->flowchain, &b->old_flows);
1993
1994 /* Keep all flows with deficits out of the sparse and decaying
1995 * rotations. No non-empty flow can go into the decaying
1996 * rotation, so they can't get deficits
1997 */
1998 if (flow->set == CAKE_SET_SPARSE) {
1999 if (flow->head) {
2000 b->sparse_flow_count--;
2001 b->bulk_flow_count++;
2002 flow->set = CAKE_SET_BULK;
2003 } else {
2004 /* we've moved it to the bulk rotation for
2005 * correct deficit accounting but we still want
2006 * to count it as a sparse flow, not a bulk one.
2007 */
2008 flow->set = CAKE_SET_SPARSE_WAIT;
2009 }
2010 }
2011 goto retry;
2012 }
2013
2014 /* Retrieve a packet via the AQM */
2015 while (1) {
2016 skb = cake_dequeue_one(sch);
2017 if (!skb) {
2018 /* this queue was actually empty */
2019 if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2020 b->unresponsive_flow_count--;
2021
2022 if (flow->cvars.p_drop || flow->cvars.count ||
2023 ktime_before(now, flow->cvars.drop_next)) {
2024 /* keep in the flowchain until the state has
2025 * decayed to rest
2026 */
2027 list_move_tail(&flow->flowchain,
2028 &b->decaying_flows);
2029 if (flow->set == CAKE_SET_BULK) {
2030 b->bulk_flow_count--;
2031 b->decaying_flow_count++;
2032 } else if (flow->set == CAKE_SET_SPARSE ||
2033 flow->set == CAKE_SET_SPARSE_WAIT) {
2034 b->sparse_flow_count--;
2035 b->decaying_flow_count++;
2036 }
2037 flow->set = CAKE_SET_DECAYING;
2038 } else {
2039 /* remove empty queue from the flowchain */
2040 list_del_init(&flow->flowchain);
2041 if (flow->set == CAKE_SET_SPARSE ||
2042 flow->set == CAKE_SET_SPARSE_WAIT)
2043 b->sparse_flow_count--;
2044 else if (flow->set == CAKE_SET_BULK)
2045 b->bulk_flow_count--;
2046 else
2047 b->decaying_flow_count--;
2048
2049 flow->set = CAKE_SET_NONE;
2050 srchost->srchost_refcnt--;
2051 dsthost->dsthost_refcnt--;
2052 }
2053 goto begin;
2054 }
2055
2056 /* Last packet in queue may be marked, shouldn't be dropped */
2057 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2058 (b->bulk_flow_count *
2059 !!(q->rate_flags &
2060 CAKE_FLAG_INGRESS))) ||
2061 !flow->head)
2062 break;
2063
2064 /* drop this packet, get another one */
2065 if (q->rate_flags & CAKE_FLAG_INGRESS) {
2066 len = cake_advance_shaper(q, b, skb,
2067 now, true);
2068 flow->deficit -= len;
2069 b->tin_deficit -= len;
2070 }
2071 flow->dropped++;
2072 b->tin_dropped++;
2073 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2074 qdisc_qstats_drop(sch);
2075 kfree_skb(skb);
2076 if (q->rate_flags & CAKE_FLAG_INGRESS)
2077 goto retry;
2078 }
2079
2080 b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2081 qdisc_bstats_update(sch, skb);
2082
2083 /* collect delay stats */
2084 delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2085 b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2086 b->peak_delay = cake_ewma(b->peak_delay, delay,
2087 delay > b->peak_delay ? 2 : 8);
2088 b->base_delay = cake_ewma(b->base_delay, delay,
2089 delay < b->base_delay ? 2 : 8);
2090
2091 len = cake_advance_shaper(q, b, skb, now, false);
2092 flow->deficit -= len;
2093 b->tin_deficit -= len;
2094
2095 if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2096 u64 next = min(ktime_to_ns(q->time_next_packet),
2097 ktime_to_ns(q->failsafe_next_packet));
2098
2099 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2100 } else if (!sch->q.qlen) {
2101 int i;
2102
2103 for (i = 0; i < q->tin_cnt; i++) {
2104 if (q->tins[i].decaying_flow_count) {
2105 ktime_t next = \
2106 ktime_add_ns(now,
2107 q->tins[i].cparams.target);
2108
2109 qdisc_watchdog_schedule_ns(&q->watchdog,
2110 ktime_to_ns(next));
2111 break;
2112 }
2113 }
2114 }
2115
2116 if (q->overflow_timeout)
2117 q->overflow_timeout--;
2118
2119 return skb;
2120}
2121
2122static void cake_reset(struct Qdisc *sch)
2123{
2124 u32 c;
2125
2126 for (c = 0; c < CAKE_MAX_TINS; c++)
2127 cake_clear_tin(sch, c);
2128}
2129
2130static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2131 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 },
2132 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2133 [TCA_CAKE_ATM] = { .type = NLA_U32 },
2134 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 },
2135 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 },
2136 [TCA_CAKE_RTT] = { .type = NLA_U32 },
2137 [TCA_CAKE_TARGET] = { .type = NLA_U32 },
2138 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 },
2139 [TCA_CAKE_MEMORY] = { .type = NLA_U32 },
2140 [TCA_CAKE_NAT] = { .type = NLA_U32 },
2141 [TCA_CAKE_RAW] = { .type = NLA_U32 },
2142 [TCA_CAKE_WASH] = { .type = NLA_U32 },
2143 [TCA_CAKE_MPU] = { .type = NLA_U32 },
2144 [TCA_CAKE_INGRESS] = { .type = NLA_U32 },
2145 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
2146};
2147
2148static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2149 u64 target_ns, u64 rtt_est_ns)
2150{
2151 /* convert byte-rate into time-per-byte
2152 * so it will always unwedge in reasonable time.
2153 */
2154 static const u64 MIN_RATE = 64;
2155 u32 byte_target = mtu;
2156 u64 byte_target_ns;
2157 u8 rate_shft = 0;
2158 u64 rate_ns = 0;
2159
2160 b->flow_quantum = 1514;
2161 if (rate) {
2162 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2163 rate_shft = 34;
2164 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2165 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2166 while (!!(rate_ns >> 34)) {
2167 rate_ns >>= 1;
2168 rate_shft--;
2169 }
2170 } /* else unlimited, ie. zero delay */
2171
2172 b->tin_rate_bps = rate;
2173 b->tin_rate_ns = rate_ns;
2174 b->tin_rate_shft = rate_shft;
2175
2176 byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2177
2178 b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2179 b->cparams.interval = max(rtt_est_ns +
2180 b->cparams.target - target_ns,
2181 b->cparams.target * 2);
2182 b->cparams.mtu_time = byte_target_ns;
2183 b->cparams.p_inc = 1 << 24; /* 1/256 */
2184 b->cparams.p_dec = 1 << 20; /* 1/4096 */
2185}
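/* Illustrative numbers for the conversion above: for a byte rate of
 * 12,500,000 (i.e. 100 Mbit/s), rate_ns starts as (10^9 << 34) / 12.5e6
 * and is halved, with rate_shft decremented from 34, until it fits in
 * 34 bits; len * rate_ns >> rate_shft then evaluates to len * 80 ns, the
 * time it takes to serialise len bytes at that rate, while keeping the
 * intermediate products within 64 bits for realistic packet lengths.
 */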
2186
2187static int cake_config_besteffort(struct Qdisc *sch)
2188{
2189 struct cake_sched_data *q = qdisc_priv(sch);
2190 struct cake_tin_data *b = &q->tins[0];
2191 u32 mtu = psched_mtu(qdisc_dev(sch));
2192 u64 rate = q->rate_bps;
2193
2194 q->tin_cnt = 1;
2195
2196 q->tin_index = besteffort;
2197 q->tin_order = normal_order;
2198
2199 cake_set_rate(b, rate, mtu,
2200 us_to_ns(q->target), us_to_ns(q->interval));
2201 b->tin_quantum_band = 65535;
2202 b->tin_quantum_prio = 65535;
2203
2204 return 0;
2205}
2206
2207static int cake_config_precedence(struct Qdisc *sch)
2208{
2209 /* convert high-level (user visible) parameters into internal format */
2210 struct cake_sched_data *q = qdisc_priv(sch);
2211 u32 mtu = psched_mtu(qdisc_dev(sch));
2212 u64 rate = q->rate_bps;
2213 u32 quantum1 = 256;
2214 u32 quantum2 = 256;
2215 u32 i;
2216
2217 q->tin_cnt = 8;
2218 q->tin_index = precedence;
2219 q->tin_order = normal_order;
2220
2221 for (i = 0; i < q->tin_cnt; i++) {
2222 struct cake_tin_data *b = &q->tins[i];
2223
2224 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2225 us_to_ns(q->interval));
2226
2227 b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2228 b->tin_quantum_band = max_t(u16, 1U, quantum2);
2229
2230 /* calculate next class's parameters */
2231 rate *= 7;
2232 rate >>= 3;
2233
2234 quantum1 *= 3;
2235 quantum1 >>= 1;
2236
2237 quantum2 *= 7;
2238 quantum2 >>= 3;
2239 }
2240
2241 return 0;
2242}
2243
2244/* List of known Diffserv codepoints:
2245 *
2246 * Least Effort (CS1)
2247 * Best Effort (CS0)
2248 * Max Reliability & LLT "Lo" (TOS1)
2249 * Max Throughput (TOS2)
2250 * Min Delay (TOS4)
2251 * LLT "La" (TOS5)
2252 * Assured Forwarding 1 (AF1x) - x3
2253 * Assured Forwarding 2 (AF2x) - x3
2254 * Assured Forwarding 3 (AF3x) - x3
2255 * Assured Forwarding 4 (AF4x) - x3
2256 * Precedence Class 2 (CS2)
2257 * Precedence Class 3 (CS3)
2258 * Precedence Class 4 (CS4)
2259 * Precedence Class 5 (CS5)
2260 * Precedence Class 6 (CS6)
2261 * Precedence Class 7 (CS7)
2262 * Voice Admit (VA)
2263 * Expedited Forwarding (EF)
2264
2265 * Total 25 codepoints.
2266 */
2267
2268/* List of traffic classes in RFC 4594:
2269 * (roughly descending order of contended priority)
2270 * (roughly ascending order of uncontended throughput)
2271 *
2272 * Network Control (CS6,CS7) - routing traffic
2273 * Telephony (EF,VA) - aka. VoIP streams
2274 * Signalling (CS5) - VoIP setup
2275 * Multimedia Conferencing (AF4x) - aka. video calls
2276 * Realtime Interactive (CS4) - eg. games
2277 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch
2278 * Broadcast Video (CS3)
2279 * Low Latency Data (AF2x,TOS4) - eg. database
2280 * Ops, Admin, Management (CS2,TOS1) - eg. ssh
2281 * Standard Service (CS0 & unrecognised codepoints)
2282 * High Throughput Data (AF1x,TOS2) - eg. web traffic
2283 * Low Priority Data (CS1) - eg. BitTorrent
2284
2285 * Total 12 traffic classes.
2286 */
2287
2288static int cake_config_diffserv8(struct Qdisc *sch)
2289{
2290/* Pruned list of traffic classes for typical applications:
2291 *
2292 * Network Control (CS6, CS7)
2293 * Minimum Latency (EF, VA, CS5, CS4)
2294 * Interactive Shell (CS2, TOS1)
2295 * Low Latency Transactions (AF2x, TOS4)
2296 * Video Streaming (AF4x, AF3x, CS3)
2297 * Bog Standard (CS0 etc.)
2298 * High Throughput (AF1x, TOS2)
2299 * Background Traffic (CS1)
2300 *
2301 * Total 8 traffic classes.
2302 */
2303
2304 struct cake_sched_data *q = qdisc_priv(sch);
2305 u32 mtu = psched_mtu(qdisc_dev(sch));
2306 u64 rate = q->rate_bps;
2307 u32 quantum1 = 256;
2308 u32 quantum2 = 256;
2309 u32 i;
2310
2311 q->tin_cnt = 8;
2312
2313 /* codepoint to class mapping */
2314 q->tin_index = diffserv8;
2315 q->tin_order = normal_order;
2316
2317 /* class characteristics */
2318 for (i = 0; i < q->tin_cnt; i++) {
2319 struct cake_tin_data *b = &q->tins[i];
2320
2321 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2322 us_to_ns(q->interval));
2323
2324 b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2325 b->tin_quantum_band = max_t(u16, 1U, quantum2);
2326
2327 /* calculate next class's parameters */
2328 rate *= 7;
2329 rate >>= 3;
2330
2331 quantum1 *= 3;
2332 quantum1 >>= 1;
2333
2334 quantum2 *= 7;
2335 quantum2 >>= 3;
2336 }
2337
2338 return 0;
2339}
2340
2341static int cake_config_diffserv4(struct Qdisc *sch)
2342{
2343/* Further pruned list of traffic classes for four-class system:
2344 *
2345 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4)
2346 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2347 * Best Effort (CS0, AF1x, TOS2, and those not specified)
2348 * Background Traffic (CS1)
2349 *
2350 * Total 4 traffic classes.
2351 */
2352
2353 struct cake_sched_data *q = qdisc_priv(sch);
2354 u32 mtu = psched_mtu(qdisc_dev(sch));
2355 u64 rate = q->rate_bps;
2356 u32 quantum = 1024;
2357
2358 q->tin_cnt = 4;
2359
2360 /* codepoint to class mapping */
2361 q->tin_index = diffserv4;
2362 q->tin_order = bulk_order;
2363
2364 /* class characteristics */
2365 cake_set_rate(&q->tins[0], rate, mtu,
2366 us_to_ns(q->target), us_to_ns(q->interval));
2367 cake_set_rate(&q->tins[1], rate >> 4, mtu,
2368 us_to_ns(q->target), us_to_ns(q->interval));
2369 cake_set_rate(&q->tins[2], rate >> 1, mtu,
2370 us_to_ns(q->target), us_to_ns(q->interval));
2371 cake_set_rate(&q->tins[3], rate >> 2, mtu,
2372 us_to_ns(q->target), us_to_ns(q->interval));
2373
2374 /* priority weights */
2375 q->tins[0].tin_quantum_prio = quantum;
2376 q->tins[1].tin_quantum_prio = quantum >> 4;
2377 q->tins[2].tin_quantum_prio = quantum << 2;
2378 q->tins[3].tin_quantum_prio = quantum << 4;
2379
2380 /* bandwidth-sharing weights */
2381 q->tins[0].tin_quantum_band = quantum;
2382 q->tins[1].tin_quantum_band = quantum >> 4;
2383 q->tins[2].tin_quantum_band = quantum >> 1;
2384 q->tins[3].tin_quantum_band = quantum >> 2;
2385
2386 return 0;
2387}
2388
2389static int cake_config_diffserv3(struct Qdisc *sch)
2390{
2391/* Simplified Diffserv structure with 3 tins.
2392 * Low Priority (CS1)
2393 * Best Effort
2394 * Latency Sensitive (TOS4, VA, EF, CS6, CS7)
2395 */
2396 struct cake_sched_data *q = qdisc_priv(sch);
2397 u32 mtu = psched_mtu(qdisc_dev(sch));
2398 u64 rate = q->rate_bps;
2399 u32 quantum = 1024;
2400
2401 q->tin_cnt = 3;
2402
2403 /* codepoint to class mapping */
2404 q->tin_index = diffserv3;
2405 q->tin_order = bulk_order;
2406
2407 /* class characteristics */
2408 cake_set_rate(&q->tins[0], rate, mtu,
2409 us_to_ns(q->target), us_to_ns(q->interval));
2410 cake_set_rate(&q->tins[1], rate >> 4, mtu,
2411 us_to_ns(q->target), us_to_ns(q->interval));
2412 cake_set_rate(&q->tins[2], rate >> 2, mtu,
2413 us_to_ns(q->target), us_to_ns(q->interval));
2414
2415 /* priority weights */
2416 q->tins[0].tin_quantum_prio = quantum;
2417 q->tins[1].tin_quantum_prio = quantum >> 4;
2418 q->tins[2].tin_quantum_prio = quantum << 4;
2419
2420 /* bandwidth-sharing weights */
2421 q->tins[0].tin_quantum_band = quantum;
2422 q->tins[1].tin_quantum_band = quantum >> 4;
2423 q->tins[2].tin_quantum_band = quantum >> 2;
2424
2425 return 0;
2426}
2427
2428static void cake_reconfigure(struct Qdisc *sch)
2429{
2430 struct cake_sched_data *q = qdisc_priv(sch);
2431 int c, ft;
2432
2433 switch (q->tin_mode) {
2434 case CAKE_DIFFSERV_BESTEFFORT:
2435 ft = cake_config_besteffort(sch);
2436 break;
2437
2438 case CAKE_DIFFSERV_PRECEDENCE:
2439 ft = cake_config_precedence(sch);
2440 break;
2441
2442 case CAKE_DIFFSERV_DIFFSERV8:
2443 ft = cake_config_diffserv8(sch);
2444 break;
2445
2446 case CAKE_DIFFSERV_DIFFSERV4:
2447 ft = cake_config_diffserv4(sch);
2448 break;
2449
2450 case CAKE_DIFFSERV_DIFFSERV3:
2451 default:
2452 ft = cake_config_diffserv3(sch);
2453 break;
2454 }
2455
2456 for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2457 cake_clear_tin(sch, c);
2458 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2459 }
2460
2461 q->rate_ns = q->tins[ft].tin_rate_ns;
2462 q->rate_shft = q->tins[ft].tin_rate_shft;
2463
2464 if (q->buffer_config_limit) {
2465 q->buffer_limit = q->buffer_config_limit;
2466 } else if (q->rate_bps) {
2467 u64 t = q->rate_bps * q->interval;
2468
2469 do_div(t, USEC_PER_SEC / 4);
2470 q->buffer_limit = max_t(u32, t, 4U << 20);
2471 } else {
2472 q->buffer_limit = ~0;
2473 }
2474
2475 sch->flags &= ~TCQ_F_CAN_BYPASS;
2476
2477 q->buffer_limit = min(q->buffer_limit,
2478 max(sch->limit * psched_mtu(qdisc_dev(sch)),
2479 q->buffer_config_limit));
2480}
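/* In the absence of an explicit memory limit the buffer is therefore sized
 * to four times the bandwidth-delay product (rate_bps * interval, interval
 * being in microseconds), floored at 4 MByte in the rate-limited case or
 * effectively unlimited otherwise, and finally capped at sch->limit packets
 * of MTU size (or the configured memory limit, whichever is larger).
 */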
2481
2482static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2483 struct netlink_ext_ack *extack)
2484{
2485 struct cake_sched_data *q = qdisc_priv(sch);
2486 struct nlattr *tb[TCA_CAKE_MAX + 1];
2487 int err;
2488
2489 if (!opt)
2490 return -EINVAL;
2491
2492 err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
2493 if (err < 0)
2494 return err;
2495
2496 if (tb[TCA_CAKE_NAT]) {
2497#if IS_ENABLED(CONFIG_NF_CONNTRACK)
2498 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2499 q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2500 !!nla_get_u32(tb[TCA_CAKE_NAT]);
2501#else
2502 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2503 "No conntrack support in kernel");
2504 return -EOPNOTSUPP;
2505#endif
2506 }
2507
2508 if (tb[TCA_CAKE_BASE_RATE64])
2509 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2510
2511 if (tb[TCA_CAKE_DIFFSERV_MODE])
2512 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2513
2514 if (tb[TCA_CAKE_WASH]) {
2515 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2516 q->rate_flags |= CAKE_FLAG_WASH;
2517 else
2518 q->rate_flags &= ~CAKE_FLAG_WASH;
2519 }
2520
2521 if (tb[TCA_CAKE_FLOW_MODE])
2522 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2523 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2524 CAKE_FLOW_MASK));
2525
2526 if (tb[TCA_CAKE_ATM])
2527 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2528
2529 if (tb[TCA_CAKE_OVERHEAD]) {
2530 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2531 q->rate_flags |= CAKE_FLAG_OVERHEAD;
2532
2533 q->max_netlen = 0;
2534 q->max_adjlen = 0;
2535 q->min_netlen = ~0;
2536 q->min_adjlen = ~0;
2537 }
2538
2539 if (tb[TCA_CAKE_RAW]) {
2540 q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2541
2542 q->max_netlen = 0;
2543 q->max_adjlen = 0;
2544 q->min_netlen = ~0;
2545 q->min_adjlen = ~0;
2546 }
2547
2548 if (tb[TCA_CAKE_MPU])
2549 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2550
2551 if (tb[TCA_CAKE_RTT]) {
2552 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2553
2554 if (!q->interval)
2555 q->interval = 1;
2556 }
2557
2558 if (tb[TCA_CAKE_TARGET]) {
2559 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2560
2561 if (!q->target)
2562 q->target = 1;
2563 }
2564
2565 if (tb[TCA_CAKE_AUTORATE]) {
2566 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2567 q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2568 else
2569 q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2570 }
2571
2572 if (tb[TCA_CAKE_INGRESS]) {
2573 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2574 q->rate_flags |= CAKE_FLAG_INGRESS;
2575 else
2576 q->rate_flags &= ~CAKE_FLAG_INGRESS;
2577 }
2578
2579 if (tb[TCA_CAKE_ACK_FILTER])
2580 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2581
2582 if (tb[TCA_CAKE_MEMORY])
2583 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2584
2585 if (tb[TCA_CAKE_SPLIT_GSO]) {
2586 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2587 q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2588 else
2589 q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2590 }
2591
2592 if (q->tins) {
2593 sch_tree_lock(sch);
2594 cake_reconfigure(sch);
2595 sch_tree_unlock(sch);
2596 }
2597
2598 return 0;
2599}
2600
2601static void cake_destroy(struct Qdisc *sch)
2602{
2603 struct cake_sched_data *q = qdisc_priv(sch);
2604
2605 qdisc_watchdog_cancel(&q->watchdog);
2606 tcf_block_put(q->block);
2607 kvfree(q->tins);
2608}
2609
2610static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2611 struct netlink_ext_ack *extack)
2612{
2613 struct cake_sched_data *q = qdisc_priv(sch);
2614 int i, j, err;
2615
2616 sch->limit = 10240;
2617 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2618 q->flow_mode = CAKE_FLOW_TRIPLE;
2619
2620 q->rate_bps = 0; /* unlimited by default */
2621
2622 q->interval = 100000; /* 100ms default */
2623 q->target = 5000; /* 5ms: codel RFC argues
2624 * for 5 to 10% of interval
2625 */
2626 q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2627 q->cur_tin = 0;
2628 q->cur_flow = 0;
2629
2630 qdisc_watchdog_init(&q->watchdog, sch);
2631
2632 if (opt) {
2633 int err = cake_change(sch, opt, extack);
2634
2635 if (err)
2636 return err;
2637 }
2638
2639 err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2640 if (err)
2641 return err;
2642
2643 quantum_div[0] = ~0;
2644 for (i = 1; i <= CAKE_QUEUES; i++)
2645 quantum_div[i] = 65535 / i;
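	/* quantum_div[i] is approximately 2^16 / i; multiplying a flow quantum
	 * by quantum_div[host_load] and shifting right by 16, as the enqueue
	 * and dequeue paths do, divides the quantum by the number of active
	 * flows on the busiest associated host without a per-packet division.
	 */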
2646
2647 q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data),
2648 GFP_KERNEL);
2649 if (!q->tins)
2650 goto nomem;
2651
2652 for (i = 0; i < CAKE_MAX_TINS; i++) {
2653 struct cake_tin_data *b = q->tins + i;
2654
2655 INIT_LIST_HEAD(&b->new_flows);
2656 INIT_LIST_HEAD(&b->old_flows);
2657 INIT_LIST_HEAD(&b->decaying_flows);
2658 b->sparse_flow_count = 0;
2659 b->bulk_flow_count = 0;
2660 b->decaying_flow_count = 0;
2661
2662 for (j = 0; j < CAKE_QUEUES; j++) {
2663 struct cake_flow *flow = b->flows + j;
2664 u32 k = j * CAKE_MAX_TINS + i;
2665
2666 INIT_LIST_HEAD(&flow->flowchain);
2667 cobalt_vars_init(&flow->cvars);
2668
2669 q->overflow_heap[k].t = i;
2670 q->overflow_heap[k].b = j;
2671 b->overflow_idx[j] = k;
2672 }
2673 }
2674
2675 cake_reconfigure(sch);
2676 q->avg_peak_bandwidth = q->rate_bps;
2677 q->min_netlen = ~0;
2678 q->min_adjlen = ~0;
2679 return 0;
2680
2681nomem:
2682 cake_destroy(sch);
2683 return -ENOMEM;
2684}
2685
2686static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2687{
2688 struct cake_sched_data *q = qdisc_priv(sch);
2689 struct nlattr *opts;
2690
2691 opts = nla_nest_start(skb, TCA_OPTIONS);
2692 if (!opts)
2693 goto nla_put_failure;
2694
2695 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2696 TCA_CAKE_PAD))
2697 goto nla_put_failure;
2698
2699 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2700 q->flow_mode & CAKE_FLOW_MASK))
2701 goto nla_put_failure;
2702
2703 if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2704 goto nla_put_failure;
2705
2706 if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2707 goto nla_put_failure;
2708
2709 if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2710 goto nla_put_failure;
2711
2712 if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2713 !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2714 goto nla_put_failure;
2715
2716 if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2717 !!(q->rate_flags & CAKE_FLAG_INGRESS)))
2718 goto nla_put_failure;
2719
2720 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2721 goto nla_put_failure;
2722
2723 if (nla_put_u32(skb, TCA_CAKE_NAT,
2724 !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2725 goto nla_put_failure;
2726
2727 if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2728 goto nla_put_failure;
2729
2730 if (nla_put_u32(skb, TCA_CAKE_WASH,
2731 !!(q->rate_flags & CAKE_FLAG_WASH)))
2732 goto nla_put_failure;
2733
2734 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2735 goto nla_put_failure;
2736
2737 if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2738 if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2739 goto nla_put_failure;
2740
2741 if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2742 goto nla_put_failure;
2743
2744 if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2745 goto nla_put_failure;
2746
2747 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2748 !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2749 goto nla_put_failure;
2750
2751 return nla_nest_end(skb, opts);
2752
2753nla_put_failure:
2754 return -1;
2755}
2756
2757static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2758{
2759 struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
2760 struct cake_sched_data *q = qdisc_priv(sch);
2761 struct nlattr *tstats, *ts;
2762 int i;
2763
2764 if (!stats)
2765 return -1;
2766
2767#define PUT_STAT_U32(attr, data) do { \
2768 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2769 goto nla_put_failure; \
2770 } while (0)
2771#define PUT_STAT_U64(attr, data) do { \
2772 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2773 data, TCA_CAKE_STATS_PAD)) \
2774 goto nla_put_failure; \
2775 } while (0)
2776
2777 PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2778 PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2779 PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2780 PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2781 PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2782 PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2783 PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2784 PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2785
2786#undef PUT_STAT_U32
2787#undef PUT_STAT_U64
2788
2789 tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
2790 if (!tstats)
2791 goto nla_put_failure;
2792
2793#define PUT_TSTAT_U32(attr, data) do { \
2794 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2795 goto nla_put_failure; \
2796 } while (0)
2797#define PUT_TSTAT_U64(attr, data) do { \
2798 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2799 data, TCA_CAKE_TIN_STATS_PAD)) \
2800 goto nla_put_failure; \
2801 } while (0)
2802
2803 for (i = 0; i < q->tin_cnt; i++) {
2804 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2805
2806 ts = nla_nest_start(d->skb, i + 1);
2807 if (!ts)
2808 goto nla_put_failure;
2809
2810 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2811 PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2812 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2813
2814 PUT_TSTAT_U32(TARGET_US,
2815 ktime_to_us(ns_to_ktime(b->cparams.target)));
2816 PUT_TSTAT_U32(INTERVAL_US,
2817 ktime_to_us(ns_to_ktime(b->cparams.interval)));
2818
2819 PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2820 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2821 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2822 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2823
2824 PUT_TSTAT_U32(PEAK_DELAY_US,
2825 ktime_to_us(ns_to_ktime(b->peak_delay)));
2826 PUT_TSTAT_U32(AVG_DELAY_US,
2827 ktime_to_us(ns_to_ktime(b->avge_delay)));
2828 PUT_TSTAT_U32(BASE_DELAY_US,
2829 ktime_to_us(ns_to_ktime(b->base_delay)));
2830
2831 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2832 PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2833 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2834
2835 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2836 b->decaying_flow_count);
2837 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2838 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2839 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2840
2841 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2842 nla_nest_end(d->skb, ts);
2843 }
2844
2845#undef PUT_TSTAT_U32
2846#undef PUT_TSTAT_U64
2847
2848 nla_nest_end(d->skb, tstats);
2849 return nla_nest_end(d->skb, stats);
2850
2851nla_put_failure:
2852 nla_nest_cancel(d->skb, stats);
2853 return -1;
2854}
2855
2856static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2857{
2858 return NULL;
2859}
2860
2861static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2862{
2863 return 0;
2864}
2865
2866static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2867 u32 classid)
2868{
2869 return 0;
2870}
2871
2872static void cake_unbind(struct Qdisc *q, unsigned long cl)
2873{
2874}
2875
2876static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2877 struct netlink_ext_ack *extack)
2878{
2879 struct cake_sched_data *q = qdisc_priv(sch);
2880
2881 if (cl)
2882 return NULL;
2883 return q->block;
2884}
2885
2886static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2887 struct sk_buff *skb, struct tcmsg *tcm)
2888{
2889 tcm->tcm_handle |= TC_H_MIN(cl);
2890 return 0;
2891}
2892
2893static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2894 struct gnet_dump *d)
2895{
2896 struct cake_sched_data *q = qdisc_priv(sch);
2897 const struct cake_flow *flow = NULL;
2898 struct gnet_stats_queue qs = { 0 };
2899 struct nlattr *stats;
2900 u32 idx = cl - 1;
2901
2902 if (idx < CAKE_QUEUES * q->tin_cnt) {
2903 const struct cake_tin_data *b = \
2904 &q->tins[q->tin_order[idx / CAKE_QUEUES]];
2905 const struct sk_buff *skb;
2906
2907 flow = &b->flows[idx % CAKE_QUEUES];
2908
2909 if (flow->head) {
2910 sch_tree_lock(sch);
2911 skb = flow->head;
2912 while (skb) {
2913 qs.qlen++;
2914 skb = skb->next;
2915 }
2916 sch_tree_unlock(sch);
2917 }
2918 qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2919 qs.drops = flow->dropped;
2920 }
2921 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2922 return -1;
2923 if (flow) {
2924 ktime_t now = ktime_get();
2925
2926 stats = nla_nest_start(d->skb, TCA_STATS_APP);
2927 if (!stats)
2928 return -1;
2929
2930#define PUT_STAT_U32(attr, data) do { \
2931 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2932 goto nla_put_failure; \
2933 } while (0)
2934#define PUT_STAT_S32(attr, data) do { \
2935 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2936 goto nla_put_failure; \
2937 } while (0)
2938
2939 PUT_STAT_S32(DEFICIT, flow->deficit);
2940 PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2941 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2942 PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2943 if (flow->cvars.p_drop) {
2944 PUT_STAT_S32(BLUE_TIMER_US,
2945 ktime_to_us(
2946 ktime_sub(now,
2947 flow->cvars.blue_timer)));
2948 }
2949 if (flow->cvars.dropping) {
2950 PUT_STAT_S32(DROP_NEXT_US,
2951 ktime_to_us(
2952 ktime_sub(now,
2953 flow->cvars.drop_next)));
2954 }
2955
2956 if (nla_nest_end(d->skb, stats) < 0)
2957 return -1;
2958 }
2959
2960 return 0;
2961
2962nla_put_failure:
2963 nla_nest_cancel(d->skb, stats);
2964 return -1;
2965}
2966
2967static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2968{
2969 struct cake_sched_data *q = qdisc_priv(sch);
2970 unsigned int i, j;
2971
2972 if (arg->stop)
2973 return;
2974
2975 for (i = 0; i < q->tin_cnt; i++) {
2976 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2977
2978 for (j = 0; j < CAKE_QUEUES; j++) {
2979 if (list_empty(&b->flows[j].flowchain) ||
2980 arg->count < arg->skip) {
2981 arg->count++;
2982 continue;
2983 }
2984 if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
2985 arg->stop = 1;
2986 break;
2987 }
2988 arg->count++;
2989 }
2990 }
2991}
2992
2993static const struct Qdisc_class_ops cake_class_ops = {
2994 .leaf = cake_leaf,
2995 .find = cake_find,
2996 .tcf_block = cake_tcf_block,
2997 .bind_tcf = cake_bind,
2998 .unbind_tcf = cake_unbind,
2999 .dump = cake_dump_class,
3000 .dump_stats = cake_dump_class_stats,
3001 .walk = cake_walk,
3002};
3003
3004static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3005 .cl_ops = &cake_class_ops,
3006 .id = "cake",
3007 .priv_size = sizeof(struct cake_sched_data),
3008 .enqueue = cake_enqueue,
3009 .dequeue = cake_dequeue,
3010 .peek = qdisc_peek_dequeued,
3011 .init = cake_init,
3012 .reset = cake_reset,
3013 .destroy = cake_destroy,
3014 .change = cake_change,
3015 .dump = cake_dump,
3016 .dump_stats = cake_dump_stats,
3017 .owner = THIS_MODULE,
3018};
3019
3020static int __init cake_module_init(void)
3021{
3022 return register_qdisc(&cake_qdisc_ops);
3023}
3024
3025static void __exit cake_module_exit(void)
3026{
3027 unregister_qdisc(&cake_qdisc_ops);
3028}
3029
3030module_init(cake_module_init)
3031module_exit(cake_module_exit)
3032MODULE_AUTHOR("Jonathan Morton");
3033MODULE_LICENSE("Dual BSD/GPL");
3034MODULE_DESCRIPTION("The CAKE shaper.");
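
For context on the TCA_STATS_APP nesting used by cake_dump_class_stats() above, here is a minimal sketch of the same open/put/close-or-cancel pattern. The TCA_FOO_STATS_* ids are hypothetical stand-ins, not the real TCA_CAKE_STATS_* attributes used by the PUT_STAT_* macros.

#include <net/netlink.h>
#include <linux/gen_stats.h>	/* TCA_STATS_APP */

/* Hypothetical attribute ids, defined only for this sketch. */
enum { TCA_FOO_STATS_DEFICIT = 1, TCA_FOO_STATS_DROPS = 2 };

static int foo_dump_app_stats(struct sk_buff *skb, u32 deficit, u32 drops)
{
	struct nlattr *nest;

	/* Open the nested TCA_STATS_APP container. */
	nest = nla_nest_start(skb, TCA_STATS_APP);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u32(skb, TCA_FOO_STATS_DEFICIT, deficit) ||
	    nla_put_u32(skb, TCA_FOO_STATS_DROPS, drops))
		goto nla_put_failure;

	/* Commit the nest; its total length is patched into the header here. */
	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	/* Roll the message back so no partial nest reaches userspace. */
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}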
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index cdd96b9a27bc..e26a24017faa 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -78,18 +78,42 @@ struct cbs_sched_data {
78 s64 sendslope; /* in bytes/s */ 78 s64 sendslope; /* in bytes/s */
79 s64 idleslope; /* in bytes/s */ 79 s64 idleslope; /* in bytes/s */
80 struct qdisc_watchdog watchdog; 80 struct qdisc_watchdog watchdog;
81 int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch); 81 int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch,
82 struct sk_buff **to_free);
82 struct sk_buff *(*dequeue)(struct Qdisc *sch); 83 struct sk_buff *(*dequeue)(struct Qdisc *sch);
84 struct Qdisc *qdisc;
83}; 85};
84 86
85static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch) 87static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch,
88 struct Qdisc *child,
89 struct sk_buff **to_free)
86{ 90{
87 return qdisc_enqueue_tail(skb, sch); 91 int err;
92
93 err = child->ops->enqueue(skb, child, to_free);
94 if (err != NET_XMIT_SUCCESS)
95 return err;
96
97 qdisc_qstats_backlog_inc(sch, skb);
98 sch->q.qlen++;
99
100 return NET_XMIT_SUCCESS;
88} 101}
89 102
90static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) 103static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch,
104 struct sk_buff **to_free)
91{ 105{
92 struct cbs_sched_data *q = qdisc_priv(sch); 106 struct cbs_sched_data *q = qdisc_priv(sch);
107 struct Qdisc *qdisc = q->qdisc;
108
109 return cbs_child_enqueue(skb, sch, qdisc, to_free);
110}
111
112static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch,
113 struct sk_buff **to_free)
114{
115 struct cbs_sched_data *q = qdisc_priv(sch);
116 struct Qdisc *qdisc = q->qdisc;
93 117
94 if (sch->q.qlen == 0 && q->credits > 0) { 118 if (sch->q.qlen == 0 && q->credits > 0) {
95 /* We need to stop accumulating credits when there's 119 /* We need to stop accumulating credits when there's
@@ -99,7 +123,7 @@ static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch)
99 q->last = ktime_get_ns(); 123 q->last = ktime_get_ns();
100 } 124 }
101 125
102 return qdisc_enqueue_tail(skb, sch); 126 return cbs_child_enqueue(skb, sch, qdisc, to_free);
103} 127}
104 128
105static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, 129static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -107,7 +131,7 @@ static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
107{ 131{
108 struct cbs_sched_data *q = qdisc_priv(sch); 132 struct cbs_sched_data *q = qdisc_priv(sch);
109 133
110 return q->enqueue(skb, sch); 134 return q->enqueue(skb, sch, to_free);
111} 135}
112 136
113/* timediff is in ns, slope is in bytes/s */ 137/* timediff is in ns, slope is in bytes/s */
@@ -132,9 +156,25 @@ static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate)
132 return div64_s64(len * slope, port_rate); 156 return div64_s64(len * slope, port_rate);
133} 157}
134 158
159static struct sk_buff *cbs_child_dequeue(struct Qdisc *sch, struct Qdisc *child)
160{
161 struct sk_buff *skb;
162
163 skb = child->ops->dequeue(child);
164 if (!skb)
165 return NULL;
166
167 qdisc_qstats_backlog_dec(sch, skb);
168 qdisc_bstats_update(sch, skb);
169 sch->q.qlen--;
170
171 return skb;
172}
173
135static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) 174static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
136{ 175{
137 struct cbs_sched_data *q = qdisc_priv(sch); 176 struct cbs_sched_data *q = qdisc_priv(sch);
177 struct Qdisc *qdisc = q->qdisc;
138 s64 now = ktime_get_ns(); 178 s64 now = ktime_get_ns();
139 struct sk_buff *skb; 179 struct sk_buff *skb;
140 s64 credits; 180 s64 credits;
@@ -157,8 +197,7 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
157 return NULL; 197 return NULL;
158 } 198 }
159 } 199 }
160 200 skb = cbs_child_dequeue(sch, qdisc);
161 skb = qdisc_dequeue_head(sch);
162 if (!skb) 201 if (!skb)
163 return NULL; 202 return NULL;
164 203
@@ -178,7 +217,10 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
178 217
179static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch) 218static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch)
180{ 219{
181 return qdisc_dequeue_head(sch); 220 struct cbs_sched_data *q = qdisc_priv(sch);
221 struct Qdisc *qdisc = q->qdisc;
222
223 return cbs_child_dequeue(sch, qdisc);
182} 224}
183 225
184static struct sk_buff *cbs_dequeue(struct Qdisc *sch) 226static struct sk_buff *cbs_dequeue(struct Qdisc *sch)
@@ -310,6 +352,13 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
310 return -EINVAL; 352 return -EINVAL;
311 } 353 }
312 354
355 q->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
356 sch->handle, extack);
357 if (!q->qdisc)
358 return -ENOMEM;
359
360 qdisc_hash_add(q->qdisc, false);
361
313 q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); 362 q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
314 363
315 q->enqueue = cbs_enqueue_soft; 364 q->enqueue = cbs_enqueue_soft;
@@ -328,6 +377,9 @@ static void cbs_destroy(struct Qdisc *sch)
328 qdisc_watchdog_cancel(&q->watchdog); 377 qdisc_watchdog_cancel(&q->watchdog);
329 378
330 cbs_disable_offload(dev, q); 379 cbs_disable_offload(dev, q);
380
381 if (q->qdisc)
382 qdisc_destroy(q->qdisc);
331} 383}
332 384
333static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) 385static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -356,8 +408,72 @@ nla_put_failure:
356 return -1; 408 return -1;
357} 409}
358 410
411static int cbs_dump_class(struct Qdisc *sch, unsigned long cl,
412 struct sk_buff *skb, struct tcmsg *tcm)
413{
414 struct cbs_sched_data *q = qdisc_priv(sch);
415
416 if (cl != 1 || !q->qdisc) /* only one class */
417 return -ENOENT;
418
419 tcm->tcm_handle |= TC_H_MIN(1);
420 tcm->tcm_info = q->qdisc->handle;
421
422 return 0;
423}
424
425static int cbs_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
426 struct Qdisc **old, struct netlink_ext_ack *extack)
427{
428 struct cbs_sched_data *q = qdisc_priv(sch);
429
430 if (!new) {
431 new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
432 sch->handle, NULL);
433 if (!new)
434 new = &noop_qdisc;
435 }
436
437 *old = qdisc_replace(sch, new, &q->qdisc);
438 return 0;
439}
440
441static struct Qdisc *cbs_leaf(struct Qdisc *sch, unsigned long arg)
442{
443 struct cbs_sched_data *q = qdisc_priv(sch);
444
445 return q->qdisc;
446}
447
448static unsigned long cbs_find(struct Qdisc *sch, u32 classid)
449{
450 return 1;
451}
452
453static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker)
454{
455 if (!walker->stop) {
456 if (walker->count >= walker->skip) {
457 if (walker->fn(sch, 1, walker) < 0) {
458 walker->stop = 1;
459 return;
460 }
461 }
462 walker->count++;
463 }
464}
465
466static const struct Qdisc_class_ops cbs_class_ops = {
467 .graft = cbs_graft,
468 .leaf = cbs_leaf,
469 .find = cbs_find,
470 .walk = cbs_walk,
471 .dump = cbs_dump_class,
472};
473
359static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { 474static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
360 .id = "cbs", 475 .id = "cbs",
476 .cl_ops = &cbs_class_ops,
361 .priv_size = sizeof(struct cbs_sched_data), 477 .priv_size = sizeof(struct cbs_sched_data),
362 .enqueue = cbs_enqueue, 478 .enqueue = cbs_enqueue,
363 .dequeue = cbs_dequeue, 479 .dequeue = cbs_dequeue,
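
The child-qdisc plumbing above leaves the credit arithmetic untouched. As a rough standalone sketch of how those helpers behave, assuming the unit conventions stated in the hunk (timediff in ns, slopes and port_rate in bytes/s); timediff_to_credits() is reconstructed from those comments rather than copied, so treat it as approximate.

#include <stdint.h>
#include <stdio.h>

/* Reconstructed from the unit comments above: credits gained or lost while
 * timediff_ns elapses at a given slope (bytes/s).
 */
static int64_t timediff_to_credits(int64_t timediff_ns, int64_t slope)
{
	return timediff_ns * slope / 1000000000LL;
}

/* Mirrors credits_from_len() above: credit cost of len bytes on the wire. */
static int64_t credits_from_len(unsigned int len, int64_t slope,
				int64_t port_rate)
{
	return (int64_t)len * slope / port_rate;
}

int main(void)
{
	int64_t port_rate = 125000000;		   /* 1 Gbit/s link, bytes/s */
	int64_t idleslope = 12500000;		   /* 100 Mbit/s reservation */
	int64_t sendslope = idleslope - port_rate; /* negative here          */

	/* Credits gained while blocked for 100 us with a frame queued. */
	printf("gain: %lld bytes\n",
	       (long long)timediff_to_credits(100000, idleslope));

	/* Credits consumed while a 1500-byte frame is transmitted. */
	printf("cost: %lld bytes\n",
	       (long long)credits_from_len(1500, sendslope, port_rate));
	return 0;
}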
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
new file mode 100644
index 000000000000..1538d6fa8165
--- /dev/null
+++ b/net/sched/sch_etf.c
@@ -0,0 +1,484 @@
1// SPDX-License-Identifier: GPL-2.0
2
3/* net/sched/sch_etf.c Earliest TxTime First queueing discipline.
4 *
5 * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
6 * Vinicius Costa Gomes <vinicius.gomes@intel.com>
7 */
8
9#include <linux/module.h>
10#include <linux/types.h>
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/errno.h>
14#include <linux/errqueue.h>
15#include <linux/rbtree.h>
16#include <linux/skbuff.h>
17#include <linux/posix-timers.h>
18#include <net/netlink.h>
19#include <net/sch_generic.h>
20#include <net/pkt_sched.h>
21#include <net/sock.h>
22
23#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
24#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON)
25
26struct etf_sched_data {
27 bool offload;
28 bool deadline_mode;
29 int clockid;
30 int queue;
31 s32 delta; /* in ns */
32 ktime_t last; /* The txtime of the last skb sent to the netdevice. */
33 struct rb_root head;
34 struct qdisc_watchdog watchdog;
35 ktime_t (*get_time)(void);
36};
37
38static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = {
39 [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) },
40};
41
42static inline int validate_input_params(struct tc_etf_qopt *qopt,
43 struct netlink_ext_ack *extack)
44{
 45 /* Check if params comply with the following rules:
46 * * Clockid and delta must be valid.
47 *
48 * * Dynamic clockids are not supported.
49 *
50 * * Delta must be a positive integer.
51 *
52 * Also note that for the HW offload case, we must
53 * expect that system clocks have been synchronized to PHC.
54 */
55 if (qopt->clockid < 0) {
56 NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported");
57 return -ENOTSUPP;
58 }
59
60 if (qopt->clockid != CLOCK_TAI) {
61 NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used");
62 return -EINVAL;
63 }
64
65 if (qopt->delta < 0) {
66 NL_SET_ERR_MSG(extack, "Delta must be positive");
67 return -EINVAL;
68 }
69
70 return 0;
71}
72
73static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
74{
75 struct etf_sched_data *q = qdisc_priv(sch);
76 ktime_t txtime = nskb->tstamp;
77 struct sock *sk = nskb->sk;
78 ktime_t now;
79
80 if (!sk)
81 return false;
82
83 if (!sock_flag(sk, SOCK_TXTIME))
84 return false;
85
86 /* We don't perform crosstimestamping.
87 * Drop if packet's clockid differs from qdisc's.
88 */
89 if (sk->sk_clockid != q->clockid)
90 return false;
91
92 if (sk->sk_txtime_deadline_mode != q->deadline_mode)
93 return false;
94
95 now = q->get_time();
96 if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
97 return false;
98
99 return true;
100}
101
102static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch)
103{
104 struct etf_sched_data *q = qdisc_priv(sch);
105 struct rb_node *p;
106
107 p = rb_first(&q->head);
108 if (!p)
109 return NULL;
110
111 return rb_to_skb(p);
112}
113
114static void reset_watchdog(struct Qdisc *sch)
115{
116 struct etf_sched_data *q = qdisc_priv(sch);
117 struct sk_buff *skb = etf_peek_timesortedlist(sch);
118 ktime_t next;
119
120 if (!skb)
121 return;
122
123 next = ktime_sub_ns(skb->tstamp, q->delta);
124 qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
125}
126
127static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
128{
129 struct sock_exterr_skb *serr;
130 struct sk_buff *clone;
131 ktime_t txtime = skb->tstamp;
132
133 if (!skb->sk || !(skb->sk->sk_txtime_report_errors))
134 return;
135
136 clone = skb_clone(skb, GFP_ATOMIC);
137 if (!clone)
138 return;
139
140 serr = SKB_EXT_ERR(clone);
141 serr->ee.ee_errno = err;
142 serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME;
143 serr->ee.ee_type = 0;
144 serr->ee.ee_code = code;
145 serr->ee.ee_pad = 0;
146 serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */
147 serr->ee.ee_info = txtime; /* low part of tstamp */
148
149 if (sock_queue_err_skb(skb->sk, clone))
150 kfree_skb(clone);
151}
152
153static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
154 struct sk_buff **to_free)
155{
156 struct etf_sched_data *q = qdisc_priv(sch);
157 struct rb_node **p = &q->head.rb_node, *parent = NULL;
158 ktime_t txtime = nskb->tstamp;
159
160 if (!is_packet_valid(sch, nskb)) {
161 report_sock_error(nskb, EINVAL,
162 SO_EE_CODE_TXTIME_INVALID_PARAM);
163 return qdisc_drop(nskb, sch, to_free);
164 }
165
166 while (*p) {
167 struct sk_buff *skb;
168
169 parent = *p;
170 skb = rb_to_skb(parent);
171 if (ktime_after(txtime, skb->tstamp))
172 p = &parent->rb_right;
173 else
174 p = &parent->rb_left;
175 }
176 rb_link_node(&nskb->rbnode, parent, p);
177 rb_insert_color(&nskb->rbnode, &q->head);
178
179 qdisc_qstats_backlog_inc(sch, nskb);
180 sch->q.qlen++;
181
182 /* Now we may need to re-arm the qdisc watchdog for the next packet. */
183 reset_watchdog(sch);
184
185 return NET_XMIT_SUCCESS;
186}
187
188static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
189 bool drop)
190{
191 struct etf_sched_data *q = qdisc_priv(sch);
192
193 rb_erase(&skb->rbnode, &q->head);
194
195 /* The rbnode field in the skb re-uses these fields; now that
196 * we are done with the rbnode, reset them.
197 */
198 skb->next = NULL;
199 skb->prev = NULL;
200 skb->dev = qdisc_dev(sch);
201
202 qdisc_qstats_backlog_dec(sch, skb);
203
204 if (drop) {
205 struct sk_buff *to_free = NULL;
206
207 report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
208
209 qdisc_drop(skb, sch, &to_free);
210 kfree_skb_list(to_free);
211 qdisc_qstats_overlimit(sch);
212 } else {
213 qdisc_bstats_update(sch, skb);
214
215 q->last = skb->tstamp;
216 }
217
218 sch->q.qlen--;
219}
220
221static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
222{
223 struct etf_sched_data *q = qdisc_priv(sch);
224 struct sk_buff *skb;
225 ktime_t now, next;
226
227 skb = etf_peek_timesortedlist(sch);
228 if (!skb)
229 return NULL;
230
231 now = q->get_time();
232
233 /* Drop if packet has expired while in queue. */
234 if (ktime_before(skb->tstamp, now)) {
235 timesortedlist_erase(sch, skb, true);
236 skb = NULL;
237 goto out;
238 }
239
240 /* When in deadline mode, dequeue as soon as possible and change the
241 * txtime from deadline to (now + delta).
242 */
243 if (q->deadline_mode) {
244 timesortedlist_erase(sch, skb, false);
245 skb->tstamp = now;
246 goto out;
247 }
248
249 next = ktime_sub_ns(skb->tstamp, q->delta);
250
251 /* Dequeue only if now is within the [txtime - delta, txtime] range. */
252 if (ktime_after(now, next))
253 timesortedlist_erase(sch, skb, false);
254 else
255 skb = NULL;
256
257out:
258 /* Now we may need to re-arm the qdisc watchdog for the next packet. */
259 reset_watchdog(sch);
260
261 return skb;
262}
263
264static void etf_disable_offload(struct net_device *dev,
265 struct etf_sched_data *q)
266{
267 struct tc_etf_qopt_offload etf = { };
268 const struct net_device_ops *ops;
269 int err;
270
271 if (!q->offload)
272 return;
273
274 ops = dev->netdev_ops;
275 if (!ops->ndo_setup_tc)
276 return;
277
278 etf.queue = q->queue;
279 etf.enable = 0;
280
281 err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf);
282 if (err < 0)
283 pr_warn("Couldn't disable ETF offload for queue %d\n",
284 etf.queue);
285}
286
287static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q,
288 struct netlink_ext_ack *extack)
289{
290 const struct net_device_ops *ops = dev->netdev_ops;
291 struct tc_etf_qopt_offload etf = { };
292 int err;
293
294 if (q->offload)
295 return 0;
296
297 if (!ops->ndo_setup_tc) {
298 NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload");
299 return -EOPNOTSUPP;
300 }
301
302 etf.queue = q->queue;
303 etf.enable = 1;
304
305 err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf);
306 if (err < 0) {
307 NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload");
308 return err;
309 }
310
311 return 0;
312}
313
314static int etf_init(struct Qdisc *sch, struct nlattr *opt,
315 struct netlink_ext_ack *extack)
316{
317 struct etf_sched_data *q = qdisc_priv(sch);
318 struct net_device *dev = qdisc_dev(sch);
319 struct nlattr *tb[TCA_ETF_MAX + 1];
320 struct tc_etf_qopt *qopt;
321 int err;
322
323 if (!opt) {
324 NL_SET_ERR_MSG(extack,
325 "Missing ETF qdisc options which are mandatory");
326 return -EINVAL;
327 }
328
329 err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack);
330 if (err < 0)
331 return err;
332
333 if (!tb[TCA_ETF_PARMS]) {
334 NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters");
335 return -EINVAL;
336 }
337
338 qopt = nla_data(tb[TCA_ETF_PARMS]);
339
340 pr_debug("delta %d clockid %d offload %s deadline %s\n",
341 qopt->delta, qopt->clockid,
342 OFFLOAD_IS_ON(qopt) ? "on" : "off",
343 DEADLINE_MODE_IS_ON(qopt) ? "on" : "off");
344
345 err = validate_input_params(qopt, extack);
346 if (err < 0)
347 return err;
348
349 q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
350
351 if (OFFLOAD_IS_ON(qopt)) {
352 err = etf_enable_offload(dev, q, extack);
353 if (err < 0)
354 return err;
355 }
356
357 /* Everything went OK, save the parameters used. */
358 q->delta = qopt->delta;
359 q->clockid = qopt->clockid;
360 q->offload = OFFLOAD_IS_ON(qopt);
361 q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
362
363 switch (q->clockid) {
364 case CLOCK_REALTIME:
365 q->get_time = ktime_get_real;
366 break;
367 case CLOCK_MONOTONIC:
368 q->get_time = ktime_get;
369 break;
370 case CLOCK_BOOTTIME:
371 q->get_time = ktime_get_boottime;
372 break;
373 case CLOCK_TAI:
374 q->get_time = ktime_get_clocktai;
375 break;
376 default:
377 NL_SET_ERR_MSG(extack, "Clockid is not supported");
378 return -ENOTSUPP;
379 }
380
381 qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid);
382
383 return 0;
384}
385
386static void timesortedlist_clear(struct Qdisc *sch)
387{
388 struct etf_sched_data *q = qdisc_priv(sch);
389 struct rb_node *p = rb_first(&q->head);
390
391 while (p) {
392 struct sk_buff *skb = rb_to_skb(p);
393
394 p = rb_next(p);
395
396 rb_erase(&skb->rbnode, &q->head);
397 rtnl_kfree_skbs(skb, skb);
398 sch->q.qlen--;
399 }
400}
401
402static void etf_reset(struct Qdisc *sch)
403{
404 struct etf_sched_data *q = qdisc_priv(sch);
405
406 /* Only cancel watchdog if it's been initialized. */
407 if (q->watchdog.qdisc == sch)
408 qdisc_watchdog_cancel(&q->watchdog);
409
410 /* No matter which mode we are on, it's safe to clear both lists. */
411 timesortedlist_clear(sch);
412 __qdisc_reset_queue(&sch->q);
413
414 sch->qstats.backlog = 0;
415 sch->q.qlen = 0;
416
417 q->last = 0;
418}
419
420static void etf_destroy(struct Qdisc *sch)
421{
422 struct etf_sched_data *q = qdisc_priv(sch);
423 struct net_device *dev = qdisc_dev(sch);
424
425 /* Only cancel watchdog if it's been initialized. */
426 if (q->watchdog.qdisc == sch)
427 qdisc_watchdog_cancel(&q->watchdog);
428
429 etf_disable_offload(dev, q);
430}
431
432static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
433{
434 struct etf_sched_data *q = qdisc_priv(sch);
435 struct tc_etf_qopt opt = { };
436 struct nlattr *nest;
437
438 nest = nla_nest_start(skb, TCA_OPTIONS);
439 if (!nest)
440 goto nla_put_failure;
441
442 opt.delta = q->delta;
443 opt.clockid = q->clockid;
444 if (q->offload)
445 opt.flags |= TC_ETF_OFFLOAD_ON;
446
447 if (q->deadline_mode)
448 opt.flags |= TC_ETF_DEADLINE_MODE_ON;
449
450 if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
451 goto nla_put_failure;
452
453 return nla_nest_end(skb, nest);
454
455nla_put_failure:
456 nla_nest_cancel(skb, nest);
457 return -1;
458}
459
460static struct Qdisc_ops etf_qdisc_ops __read_mostly = {
461 .id = "etf",
462 .priv_size = sizeof(struct etf_sched_data),
463 .enqueue = etf_enqueue_timesortedlist,
464 .dequeue = etf_dequeue_timesortedlist,
465 .peek = etf_peek_timesortedlist,
466 .init = etf_init,
467 .reset = etf_reset,
468 .destroy = etf_destroy,
469 .dump = etf_dump,
470 .owner = THIS_MODULE,
471};
472
473static int __init etf_module_init(void)
474{
475 return register_qdisc(&etf_qdisc_ops);
476}
477
478static void __exit etf_module_exit(void)
479{
480 unregister_qdisc(&etf_qdisc_ops);
481}
482module_init(etf_module_init)
483module_exit(etf_module_exit)
484MODULE_LICENSE("GPL");
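
sch_etf only dequeues packets whose txtime was stamped by the sending socket (see is_packet_valid() above). A userspace sketch of the matching socket setup follows, assuming the SO_TXTIME/SCM_TXTIME interface added in the same series; the struct and flag names are recalled from linux/net_tstamp.h and may differ slightly, and error handling is omitted for brevity.

#include <stdint.h>
#include <string.h>
#include <time.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/net_tstamp.h>

/* One-time setup; the clockid must match the qdisc's (CLOCK_TAI, per
 * validate_input_params() above), and any deadline-mode flag would have to
 * match q->deadline_mode.
 */
static int enable_txtime(int fd)
{
	struct sock_txtime cfg = { .clockid = CLOCK_TAI, .flags = 0 };

	return setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
}

static ssize_t send_at(int fd, const void *buf, size_t len,
		       const struct sockaddr *dst, socklen_t dstlen,
		       uint64_t txtime_ns)
{
	union {
		char buf[CMSG_SPACE(sizeof(uint64_t))];
		struct cmsghdr align;	/* keep the buffer cmsg-aligned */
	} control = { 0 };
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_name = (void *)dst, .msg_namelen = dstlen,
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control.buf,
		.msg_controllen = sizeof(control.buf),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	/* The absolute transmit time, in CLOCK_TAI nanoseconds, travels as an
	 * SCM_TXTIME control message and ends up in skb->tstamp, which is what
	 * etf_enqueue_timesortedlist() sorts on.
	 */
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_TXTIME;
	cm->cmsg_len = CMSG_LEN(sizeof(txtime_ns));
	memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));

	return sendmsg(fd, &msg, 0);
}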
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index cd2e0e342fb6..6c0a9d5dbf94 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -479,24 +479,28 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
479 q->cparams.mtu = psched_mtu(qdisc_dev(sch)); 479 q->cparams.mtu = psched_mtu(qdisc_dev(sch));
480 480
481 if (opt) { 481 if (opt) {
482 int err = fq_codel_change(sch, opt, extack); 482 err = fq_codel_change(sch, opt, extack);
483 if (err) 483 if (err)
484 return err; 484 goto init_failure;
485 } 485 }
486 486
487 err = tcf_block_get(&q->block, &q->filter_list, sch, extack); 487 err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
488 if (err) 488 if (err)
489 return err; 489 goto init_failure;
490 490
491 if (!q->flows) { 491 if (!q->flows) {
492 q->flows = kvcalloc(q->flows_cnt, 492 q->flows = kvcalloc(q->flows_cnt,
493 sizeof(struct fq_codel_flow), 493 sizeof(struct fq_codel_flow),
494 GFP_KERNEL); 494 GFP_KERNEL);
495 if (!q->flows) 495 if (!q->flows) {
496 return -ENOMEM; 496 err = -ENOMEM;
497 goto init_failure;
498 }
497 q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL); 499 q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL);
498 if (!q->backlogs) 500 if (!q->backlogs) {
499 return -ENOMEM; 501 err = -ENOMEM;
502 goto alloc_failure;
503 }
500 for (i = 0; i < q->flows_cnt; i++) { 504 for (i = 0; i < q->flows_cnt; i++) {
501 struct fq_codel_flow *flow = q->flows + i; 505 struct fq_codel_flow *flow = q->flows + i;
502 506
@@ -509,6 +513,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
509 else 513 else
510 sch->flags &= ~TCQ_F_CAN_BYPASS; 514 sch->flags &= ~TCQ_F_CAN_BYPASS;
511 return 0; 515 return 0;
516
517alloc_failure:
518 kvfree(q->flows);
519 q->flows = NULL;
520init_failure:
521 q->flows_cnt = 0;
522 return err;
512} 523}
513 524
514static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) 525static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
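
The hunk above replaces early returns with a single unwind path so a failed backlogs allocation no longer leaks q->flows, and so a later destroy cannot walk flows_cnt entries that were never allocated. The pattern in isolation, with generic names standing in for the fq_codel fields:

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/mm.h>		/* kvcalloc(), kvfree() */
#include <linux/slab.h>

struct foo_sched {
	u64 *flows;		/* stands in for the per-flow state array */
	u32 *backlogs;
	u32 flows_cnt;
};

static int foo_init(struct foo_sched *q, u32 flows_cnt)
{
	int err;

	q->flows_cnt = flows_cnt;

	q->flows = kvcalloc(flows_cnt, sizeof(*q->flows), GFP_KERNEL);
	if (!q->flows) {
		err = -ENOMEM;
		goto init_failure;
	}

	q->backlogs = kvcalloc(flows_cnt, sizeof(u32), GFP_KERNEL);
	if (!q->backlogs) {
		err = -ENOMEM;
		goto alloc_failure;	/* must undo the first allocation */
	}

	return 0;

alloc_failure:
	kvfree(q->flows);
	q->flows = NULL;
init_failure:
	q->flows_cnt = 0;	/* keeps the destroy path away from freed memory */
	return err;
}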
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3ae9877ea205..3278a76f6861 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1385,8 +1385,8 @@ hfsc_schedule_watchdog(struct Qdisc *sch)
1385 if (next_time == 0 || next_time > q->root.cl_cfmin) 1385 if (next_time == 0 || next_time > q->root.cl_cfmin)
1386 next_time = q->root.cl_cfmin; 1386 next_time = q->root.cl_cfmin;
1387 } 1387 }
1388 WARN_ON(next_time == 0); 1388 if (next_time)
1389 qdisc_watchdog_schedule(&q->watchdog, next_time); 1389 qdisc_watchdog_schedule(&q->watchdog, next_time);
1390} 1390}
1391 1391
1392static int 1392static int
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 2a4ab7caf553..43c4bfe625a9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -126,7 +126,6 @@ struct htb_class {
126 126
127 union { 127 union {
128 struct htb_class_leaf { 128 struct htb_class_leaf {
129 struct list_head drop_list;
130 int deficit[TC_HTB_MAXDEPTH]; 129 int deficit[TC_HTB_MAXDEPTH];
131 struct Qdisc *q; 130 struct Qdisc *q;
132 } leaf; 131 } leaf;
@@ -171,7 +170,6 @@ struct htb_sched {
171 struct qdisc_watchdog watchdog; 170 struct qdisc_watchdog watchdog;
172 171
173 s64 now; /* cached dequeue time */ 172 s64 now; /* cached dequeue time */
174 struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
175 173
176 /* time of nearest event per level (row) */ 174 /* time of nearest event per level (row) */
177 s64 near_ev_cache[TC_HTB_MAXDEPTH]; 175 s64 near_ev_cache[TC_HTB_MAXDEPTH];
@@ -562,8 +560,6 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
562 if (!cl->prio_activity) { 560 if (!cl->prio_activity) {
563 cl->prio_activity = 1 << cl->prio; 561 cl->prio_activity = 1 << cl->prio;
564 htb_activate_prios(q, cl); 562 htb_activate_prios(q, cl);
565 list_add_tail(&cl->un.leaf.drop_list,
566 q->drops + cl->prio);
567 } 563 }
568} 564}
569 565
@@ -579,7 +575,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
579 575
580 htb_deactivate_prios(q, cl); 576 htb_deactivate_prios(q, cl);
581 cl->prio_activity = 0; 577 cl->prio_activity = 0;
582 list_del_init(&cl->un.leaf.drop_list);
583} 578}
584 579
585static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, 580static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
@@ -981,7 +976,6 @@ static void htb_reset(struct Qdisc *sch)
981 else { 976 else {
982 if (cl->un.leaf.q) 977 if (cl->un.leaf.q)
983 qdisc_reset(cl->un.leaf.q); 978 qdisc_reset(cl->un.leaf.q);
984 INIT_LIST_HEAD(&cl->un.leaf.drop_list);
985 } 979 }
986 cl->prio_activity = 0; 980 cl->prio_activity = 0;
987 cl->cmode = HTB_CAN_SEND; 981 cl->cmode = HTB_CAN_SEND;
@@ -993,8 +987,6 @@ static void htb_reset(struct Qdisc *sch)
993 sch->qstats.backlog = 0; 987 sch->qstats.backlog = 0;
994 memset(q->hlevel, 0, sizeof(q->hlevel)); 988 memset(q->hlevel, 0, sizeof(q->hlevel));
995 memset(q->row_mask, 0, sizeof(q->row_mask)); 989 memset(q->row_mask, 0, sizeof(q->row_mask));
996 for (i = 0; i < TC_HTB_NUMPRIO; i++)
997 INIT_LIST_HEAD(q->drops + i);
998} 990}
999 991
1000static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { 992static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
@@ -1024,7 +1016,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1024 struct nlattr *tb[TCA_HTB_MAX + 1]; 1016 struct nlattr *tb[TCA_HTB_MAX + 1];
1025 struct tc_htb_glob *gopt; 1017 struct tc_htb_glob *gopt;
1026 int err; 1018 int err;
1027 int i;
1028 1019
1029 qdisc_watchdog_init(&q->watchdog, sch); 1020 qdisc_watchdog_init(&q->watchdog, sch);
1030 INIT_WORK(&q->work, htb_work_func); 1021 INIT_WORK(&q->work, htb_work_func);
@@ -1050,8 +1041,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1050 err = qdisc_class_hash_init(&q->clhash); 1041 err = qdisc_class_hash_init(&q->clhash);
1051 if (err < 0) 1042 if (err < 0)
1052 return err; 1043 return err;
1053 for (i = 0; i < TC_HTB_NUMPRIO; i++)
1054 INIT_LIST_HEAD(q->drops + i);
1055 1044
1056 qdisc_skb_head_init(&q->direct_queue); 1045 qdisc_skb_head_init(&q->direct_queue);
1057 1046
@@ -1224,7 +1213,6 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1224 1213
1225 parent->level = 0; 1214 parent->level = 0;
1226 memset(&parent->un.inner, 0, sizeof(parent->un.inner)); 1215 memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1227 INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1228 parent->un.leaf.q = new_q ? new_q : &noop_qdisc; 1216 parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1229 parent->tokens = parent->buffer; 1217 parent->tokens = parent->buffer;
1230 parent->ctokens = parent->cbuffer; 1218 parent->ctokens = parent->cbuffer;
@@ -1418,7 +1406,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
1418 } 1406 }
1419 1407
1420 cl->children = 0; 1408 cl->children = 0;
1421 INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1422 RB_CLEAR_NODE(&cl->pq_node); 1409 RB_CLEAR_NODE(&cl->pq_node);
1423 1410
1424 for (prio = 0; prio < TC_HTB_NUMPRIO; prio++) 1411 for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 7d6801fc5340..ad18a2052416 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -68,6 +68,11 @@
68 Fabio Ludovici <fabio.ludovici at yahoo.it> 68 Fabio Ludovici <fabio.ludovici at yahoo.it>
69*/ 69*/
70 70
71struct disttable {
72 u32 size;
73 s16 table[0];
74};
75
71struct netem_sched_data { 76struct netem_sched_data {
72 /* internal t(ime)fifo qdisc uses t_root and sch->limit */ 77 /* internal t(ime)fifo qdisc uses t_root and sch->limit */
73 struct rb_root t_root; 78 struct rb_root t_root;
@@ -99,10 +104,7 @@ struct netem_sched_data {
99 u32 rho; 104 u32 rho;
100 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; 105 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
101 106
102 struct disttable { 107 struct disttable *delay_dist;
103 u32 size;
104 s16 table[0];
105 } *delay_dist;
106 108
107 enum { 109 enum {
108 CLG_RANDOM, 110 CLG_RANDOM,
@@ -142,6 +144,7 @@ struct netem_sched_data {
142 s32 bytes_left; 144 s32 bytes_left;
143 } slot; 145 } slot;
144 146
147 struct disttable *slot_dist;
145}; 148};
146 149
147/* Time stamp put into socket buffer control block 150/* Time stamp put into socket buffer control block
@@ -180,7 +183,7 @@ static u32 get_crandom(struct crndstate *state)
180 u64 value, rho; 183 u64 value, rho;
181 unsigned long answer; 184 unsigned long answer;
182 185
183 if (state->rho == 0) /* no correlation */ 186 if (!state || state->rho == 0) /* no correlation */
184 return prandom_u32(); 187 return prandom_u32();
185 188
186 value = prandom_u32(); 189 value = prandom_u32();
@@ -601,10 +604,19 @@ finish_segs:
601 604
602static void get_slot_next(struct netem_sched_data *q, u64 now) 605static void get_slot_next(struct netem_sched_data *q, u64 now)
603{ 606{
604 q->slot.slot_next = now + q->slot_config.min_delay + 607 s64 next_delay;
605 (prandom_u32() * 608
606 (q->slot_config.max_delay - 609 if (!q->slot_dist)
607 q->slot_config.min_delay) >> 32); 610 next_delay = q->slot_config.min_delay +
611 (prandom_u32() *
612 (q->slot_config.max_delay -
613 q->slot_config.min_delay) >> 32);
614 else
615 next_delay = tabledist(q->slot_config.dist_delay,
616 (s32)(q->slot_config.dist_jitter),
617 NULL, q->slot_dist);
618
619 q->slot.slot_next = now + next_delay;
608 q->slot.packets_left = q->slot_config.max_packets; 620 q->slot.packets_left = q->slot_config.max_packets;
609 q->slot.bytes_left = q->slot_config.max_bytes; 621 q->slot.bytes_left = q->slot_config.max_bytes;
610} 622}
@@ -721,9 +733,9 @@ static void dist_free(struct disttable *d)
721 * signed 16 bit values. 733 * signed 16 bit values.
722 */ 734 */
723 735
724static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) 736static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
737 const struct nlattr *attr)
725{ 738{
726 struct netem_sched_data *q = qdisc_priv(sch);
727 size_t n = nla_len(attr)/sizeof(__s16); 739 size_t n = nla_len(attr)/sizeof(__s16);
728 const __s16 *data = nla_data(attr); 740 const __s16 *data = nla_data(attr);
729 spinlock_t *root_lock; 741 spinlock_t *root_lock;
@@ -744,7 +756,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
744 root_lock = qdisc_root_sleeping_lock(sch); 756 root_lock = qdisc_root_sleeping_lock(sch);
745 757
746 spin_lock_bh(root_lock); 758 spin_lock_bh(root_lock);
747 swap(q->delay_dist, d); 759 swap(*tbl, d);
748 spin_unlock_bh(root_lock); 760 spin_unlock_bh(root_lock);
749 761
750 dist_free(d); 762 dist_free(d);
@@ -762,7 +774,8 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
762 q->slot_config.max_bytes = INT_MAX; 774 q->slot_config.max_bytes = INT_MAX;
763 q->slot.packets_left = q->slot_config.max_packets; 775 q->slot.packets_left = q->slot_config.max_packets;
764 q->slot.bytes_left = q->slot_config.max_bytes; 776 q->slot.bytes_left = q->slot_config.max_bytes;
765 if (q->slot_config.min_delay | q->slot_config.max_delay) 777 if (q->slot_config.min_delay | q->slot_config.max_delay |
778 q->slot_config.dist_jitter)
766 q->slot.slot_next = ktime_get_ns(); 779 q->slot.slot_next = ktime_get_ns();
767 else 780 else
768 q->slot.slot_next = 0; 781 q->slot.slot_next = 0;
@@ -926,16 +939,17 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
926 } 939 }
927 940
928 if (tb[TCA_NETEM_DELAY_DIST]) { 941 if (tb[TCA_NETEM_DELAY_DIST]) {
929 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); 942 ret = get_dist_table(sch, &q->delay_dist,
930 if (ret) { 943 tb[TCA_NETEM_DELAY_DIST]);
931 /* recover clg and loss_model, in case of 944 if (ret)
932 * q->clg and q->loss_model were modified 945 goto get_table_failure;
933 * in get_loss_clg() 946 }
934 */ 947
935 q->clg = old_clg; 948 if (tb[TCA_NETEM_SLOT_DIST]) {
936 q->loss_model = old_loss_model; 949 ret = get_dist_table(sch, &q->slot_dist,
937 return ret; 950 tb[TCA_NETEM_SLOT_DIST]);
938 } 951 if (ret)
952 goto get_table_failure;
939 } 953 }
940 954
941 sch->limit = qopt->limit; 955 sch->limit = qopt->limit;
@@ -983,6 +997,15 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
983 get_slot(q, tb[TCA_NETEM_SLOT]); 997 get_slot(q, tb[TCA_NETEM_SLOT]);
984 998
985 return ret; 999 return ret;
1000
1001get_table_failure:
1002 /* recover clg and loss_model, in case of
1003 * q->clg and q->loss_model were modified
1004 * in get_loss_clg()
1005 */
1006 q->clg = old_clg;
1007 q->loss_model = old_loss_model;
1008 return ret;
986} 1009}
987 1010
988static int netem_init(struct Qdisc *sch, struct nlattr *opt, 1011static int netem_init(struct Qdisc *sch, struct nlattr *opt,
@@ -1011,6 +1034,7 @@ static void netem_destroy(struct Qdisc *sch)
1011 if (q->qdisc) 1034 if (q->qdisc)
1012 qdisc_destroy(q->qdisc); 1035 qdisc_destroy(q->qdisc);
1013 dist_free(q->delay_dist); 1036 dist_free(q->delay_dist);
1037 dist_free(q->slot_dist);
1014} 1038}
1015 1039
1016static int dump_loss_model(const struct netem_sched_data *q, 1040static int dump_loss_model(const struct netem_sched_data *q,
@@ -1127,7 +1151,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1127 if (dump_loss_model(q, skb) != 0) 1151 if (dump_loss_model(q, skb) != 0)
1128 goto nla_put_failure; 1152 goto nla_put_failure;
1129 1153
1130 if (q->slot_config.min_delay | q->slot_config.max_delay) { 1154 if (q->slot_config.min_delay | q->slot_config.max_delay |
1155 q->slot_config.dist_jitter) {
1131 slot = q->slot_config; 1156 slot = q->slot_config;
1132 if (slot.max_packets == INT_MAX) 1157 if (slot.max_packets == INT_MAX)
1133 slot.max_packets = 0; 1158 slot.max_packets = 0;
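
get_slot_next() above now draws slot gaps from a distribution table when one is configured. A sketch of how such a table is typically consumed follows; the scale factor and the uniform fallback are assumptions, since tabledist() itself is not part of this hunk and also mixes in correlated randomness via the crndstate argument that is passed as NULL above.

#include <linux/types.h>
#include <linux/random.h>	/* prandom_u32() */

#define DIST_SCALE 8192		/* assumed normalization of the s16 samples */

struct disttable {
	u32 size;
	s16 table[0];
};

/* mu and sigma play the roles of dist_delay and dist_jitter above. */
static s64 dist_sample(s64 mu, s32 sigma, const struct disttable *d)
{
	u32 rnd = prandom_u32();

	if (sigma <= 0)
		return mu;

	if (!d)		/* no table configured: uniform jitter around mu */
		return mu + (s64)(rnd % (2 * (u32)sigma)) - sigma;

	/* Table lookup: a signed, pre-sampled value scaled by the jitter. */
	return mu + (s64)d->table[rnd % d->size] * sigma / DIST_SCALE;
}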
diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
new file mode 100644
index 000000000000..52c0b6d8f1d7
--- /dev/null
+++ b/net/sched/sch_skbprio.c
@@ -0,0 +1,320 @@
1/*
2 * net/sched/sch_skbprio.c SKB Priority Queue.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Nishanth Devarajan, <ndev2021@gmail.com>
10 * Cody Doucette, <doucette@bu.edu>
11 * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
12 */
13
14#include <linux/string.h>
15#include <linux/module.h>
16#include <linux/slab.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/errno.h>
20#include <linux/skbuff.h>
21#include <net/pkt_sched.h>
22#include <net/sch_generic.h>
23#include <net/inet_ecn.h>
24
25/* SKB Priority Queue
26 * =================================
27 *
28 * Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes
29 * packets according to their skb->priority field. Under congestion,
30 * Skbprio drops already-enqueued lower priority packets to make space
31 * available for higher priority packets; it was conceived as a solution
32 * for denial-of-service defenses that need to route packets with different
 33 * priorities as a means to overcome DoS attacks.
34 */
35
36struct skbprio_sched_data {
37 /* Queue state. */
38 struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY];
39 struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY];
40 u16 highest_prio;
41 u16 lowest_prio;
42};
43
44static u16 calc_new_high_prio(const struct skbprio_sched_data *q)
45{
46 int prio;
47
48 for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) {
49 if (!skb_queue_empty(&q->qdiscs[prio]))
50 return prio;
51 }
52
53 /* SKB queue is empty, return 0 (default highest priority setting). */
54 return 0;
55}
56
57static u16 calc_new_low_prio(const struct skbprio_sched_data *q)
58{
59 int prio;
60
61 for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) {
62 if (!skb_queue_empty(&q->qdiscs[prio]))
63 return prio;
64 }
65
66 /* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1
67 * (default lowest priority setting).
68 */
69 return SKBPRIO_MAX_PRIORITY - 1;
70}
71
72static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
73 struct sk_buff **to_free)
74{
75 const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1;
76 struct skbprio_sched_data *q = qdisc_priv(sch);
77 struct sk_buff_head *qdisc;
78 struct sk_buff_head *lp_qdisc;
79 struct sk_buff *to_drop;
80 u16 prio, lp;
81
82 /* Obtain the priority of @skb. */
83 prio = min(skb->priority, max_priority);
84
85 qdisc = &q->qdiscs[prio];
86 if (sch->q.qlen < sch->limit) {
87 __skb_queue_tail(qdisc, skb);
88 qdisc_qstats_backlog_inc(sch, skb);
89 q->qstats[prio].backlog += qdisc_pkt_len(skb);
90
91 /* Check to update highest and lowest priorities. */
92 if (prio > q->highest_prio)
93 q->highest_prio = prio;
94
95 if (prio < q->lowest_prio)
96 q->lowest_prio = prio;
97
98 sch->q.qlen++;
99 return NET_XMIT_SUCCESS;
100 }
101
102 /* If this packet has the lowest priority, drop it. */
103 lp = q->lowest_prio;
104 if (prio <= lp) {
105 q->qstats[prio].drops++;
106 q->qstats[prio].overlimits++;
107 return qdisc_drop(skb, sch, to_free);
108 }
109
110 __skb_queue_tail(qdisc, skb);
111 qdisc_qstats_backlog_inc(sch, skb);
112 q->qstats[prio].backlog += qdisc_pkt_len(skb);
113
114 /* Drop the packet at the tail of the lowest priority qdisc. */
115 lp_qdisc = &q->qdiscs[lp];
116 to_drop = __skb_dequeue_tail(lp_qdisc);
117 BUG_ON(!to_drop);
118 qdisc_qstats_backlog_dec(sch, to_drop);
119 qdisc_drop(to_drop, sch, to_free);
120
121 q->qstats[lp].backlog -= qdisc_pkt_len(to_drop);
122 q->qstats[lp].drops++;
123 q->qstats[lp].overlimits++;
124
125 /* Check to update highest and lowest priorities. */
126 if (skb_queue_empty(lp_qdisc)) {
127 if (q->lowest_prio == q->highest_prio) {
128 /* The incoming packet is the only packet in queue. */
129 BUG_ON(sch->q.qlen != 1);
130 q->lowest_prio = prio;
131 q->highest_prio = prio;
132 } else {
133 q->lowest_prio = calc_new_low_prio(q);
134 }
135 }
136
137 if (prio > q->highest_prio)
138 q->highest_prio = prio;
139
140 return NET_XMIT_CN;
141}
142
143static struct sk_buff *skbprio_dequeue(struct Qdisc *sch)
144{
145 struct skbprio_sched_data *q = qdisc_priv(sch);
146 struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio];
147 struct sk_buff *skb = __skb_dequeue(hpq);
148
149 if (unlikely(!skb))
150 return NULL;
151
152 sch->q.qlen--;
153 qdisc_qstats_backlog_dec(sch, skb);
154 qdisc_bstats_update(sch, skb);
155
156 q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb);
157
158 /* Update highest priority field. */
159 if (skb_queue_empty(hpq)) {
160 if (q->lowest_prio == q->highest_prio) {
161 BUG_ON(sch->q.qlen);
162 q->highest_prio = 0;
163 q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
164 } else {
165 q->highest_prio = calc_new_high_prio(q);
166 }
167 }
168 return skb;
169}
170
171static int skbprio_change(struct Qdisc *sch, struct nlattr *opt,
172 struct netlink_ext_ack *extack)
173{
174 struct tc_skbprio_qopt *ctl = nla_data(opt);
175
176 sch->limit = ctl->limit;
177 return 0;
178}
179
180static int skbprio_init(struct Qdisc *sch, struct nlattr *opt,
181 struct netlink_ext_ack *extack)
182{
183 struct skbprio_sched_data *q = qdisc_priv(sch);
184 int prio;
185
186 /* Initialise all queues, one for each possible priority. */
187 for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
188 __skb_queue_head_init(&q->qdiscs[prio]);
189
190 memset(&q->qstats, 0, sizeof(q->qstats));
191 q->highest_prio = 0;
192 q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
193 sch->limit = 64;
194 if (!opt)
195 return 0;
196
197 return skbprio_change(sch, opt, extack);
198}
199
200static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb)
201{
202 struct tc_skbprio_qopt opt;
203
204 opt.limit = sch->limit;
205
206 if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
207 return -1;
208
209 return skb->len;
210}
211
212static void skbprio_reset(struct Qdisc *sch)
213{
214 struct skbprio_sched_data *q = qdisc_priv(sch);
215 int prio;
216
217 sch->qstats.backlog = 0;
218 sch->q.qlen = 0;
219
220 for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
221 __skb_queue_purge(&q->qdiscs[prio]);
222
223 memset(&q->qstats, 0, sizeof(q->qstats));
224 q->highest_prio = 0;
225 q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
226}
227
228static void skbprio_destroy(struct Qdisc *sch)
229{
230 struct skbprio_sched_data *q = qdisc_priv(sch);
231 int prio;
232
233 for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
234 __skb_queue_purge(&q->qdiscs[prio]);
235}
236
237static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg)
238{
239 return NULL;
240}
241
242static unsigned long skbprio_find(struct Qdisc *sch, u32 classid)
243{
244 return 0;
245}
246
247static int skbprio_dump_class(struct Qdisc *sch, unsigned long cl,
248 struct sk_buff *skb, struct tcmsg *tcm)
249{
250 tcm->tcm_handle |= TC_H_MIN(cl);
251 return 0;
252}
253
254static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
255 struct gnet_dump *d)
256{
257 struct skbprio_sched_data *q = qdisc_priv(sch);
258 if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1],
259 q->qstats[cl - 1].qlen) < 0)
260 return -1;
261 return 0;
262}
263
264static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
265{
266 unsigned int i;
267
268 if (arg->stop)
269 return;
270
271 for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) {
272 if (arg->count < arg->skip) {
273 arg->count++;
274 continue;
275 }
276 if (arg->fn(sch, i + 1, arg) < 0) {
277 arg->stop = 1;
278 break;
279 }
280 arg->count++;
281 }
282}
283
284static const struct Qdisc_class_ops skbprio_class_ops = {
285 .leaf = skbprio_leaf,
286 .find = skbprio_find,
287 .dump = skbprio_dump_class,
288 .dump_stats = skbprio_dump_class_stats,
289 .walk = skbprio_walk,
290};
291
292static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = {
293 .cl_ops = &skbprio_class_ops,
294 .id = "skbprio",
295 .priv_size = sizeof(struct skbprio_sched_data),
296 .enqueue = skbprio_enqueue,
297 .dequeue = skbprio_dequeue,
298 .peek = qdisc_peek_dequeued,
299 .init = skbprio_init,
300 .reset = skbprio_reset,
301 .change = skbprio_change,
302 .dump = skbprio_dump,
303 .destroy = skbprio_destroy,
304 .owner = THIS_MODULE,
305};
306
307static int __init skbprio_module_init(void)
308{
309 return register_qdisc(&skbprio_qdisc_ops);
310}
311
312static void __exit skbprio_module_exit(void)
313{
314 unregister_qdisc(&skbprio_qdisc_ops);
315}
316
317module_init(skbprio_module_init)
318module_exit(skbprio_module_exit)
319
320MODULE_LICENSE("GPL");
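
A standalone walk-through of the skbprio overflow policy above, using plain counters in place of the per-priority skb queues: with limit = 3, enqueuing priorities 1, 1, 1 and then 4 drops one queued priority-1 packet to make room for the priority-4 one.

#include <stdio.h>

#define NPRIO 64	/* stands in for SKBPRIO_MAX_PRIORITY */

int main(void)
{
	unsigned int qlen[NPRIO] = {0}, total = 0, limit = 3;
	unsigned int input[] = { 1, 1, 1, 4 };
	unsigned int lowest = NPRIO - 1, highest = 0;

	for (unsigned int i = 0; i < sizeof(input) / sizeof(input[0]); i++) {
		unsigned int prio = input[i];

		if (total == limit) {
			if (prio <= lowest) {	/* incoming is lowest: drop it */
				printf("drop incoming prio %u\n", prio);
				continue;
			}
			qlen[lowest]--;		/* drop tail of lowest-prio queue */
			printf("drop queued prio %u\n", lowest);
			total--;
			while (lowest < NPRIO - 1 && !qlen[lowest])
				lowest++;	/* recompute lowest non-empty */
		}
		qlen[prio]++;
		total++;
		if (prio > highest)
			highest = prio;
		if (prio < lowest)
			lowest = prio;
	}
	printf("qlen[1]=%u qlen[4]=%u highest=%u lowest=%u\n",
	       qlen[1], qlen[4], highest, lowest);
	return 0;
}

Running the sketch prints "drop queued prio 1" followed by qlen[1]=2 qlen[4]=1 highest=4 lowest=1, matching the behaviour of skbprio_enqueue() and calc_new_low_prio() above.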
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index c740b189d4ba..950ecf6e7439 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -41,8 +41,8 @@ config SCTP_DBG_OBJCNT
41 bool "SCTP: Debug object counts" 41 bool "SCTP: Debug object counts"
42 depends on PROC_FS 42 depends on PROC_FS
43 help 43 help
44 If you say Y, this will enable debugging support for counting the 44 If you say Y, this will enable debugging support for counting the
45 type of objects that are currently allocated. This is useful for 45 type of objects that are currently allocated. This is useful for
46 identifying memory leaks. This debug information can be viewed by 46 identifying memory leaks. This debug information can be viewed by
47 'cat /proc/net/sctp/sctp_dbg_objcnt' 47 'cat /proc/net/sctp/sctp_dbg_objcnt'
48 48
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5d5a16204d50..297d9cf960b9 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -115,6 +115,9 @@ static struct sctp_association *sctp_association_init(
115 /* Initialize path max retrans value. */ 115 /* Initialize path max retrans value. */
116 asoc->pathmaxrxt = sp->pathmaxrxt; 116 asoc->pathmaxrxt = sp->pathmaxrxt;
117 117
118 asoc->flowlabel = sp->flowlabel;
119 asoc->dscp = sp->dscp;
120
118 /* Initialize default path MTU. */ 121 /* Initialize default path MTU. */
119 asoc->pathmtu = sp->pathmtu; 122 asoc->pathmtu = sp->pathmtu;
120 123
@@ -647,6 +650,18 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
647 peer->sackdelay = asoc->sackdelay; 650 peer->sackdelay = asoc->sackdelay;
648 peer->sackfreq = asoc->sackfreq; 651 peer->sackfreq = asoc->sackfreq;
649 652
653 if (addr->sa.sa_family == AF_INET6) {
654 __be32 info = addr->v6.sin6_flowinfo;
655
656 if (info) {
657 peer->flowlabel = ntohl(info & IPV6_FLOWLABEL_MASK);
658 peer->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
659 } else {
660 peer->flowlabel = asoc->flowlabel;
661 }
662 }
663 peer->dscp = asoc->dscp;
664
650 /* Enable/disable heartbeat, SACK delay, and path MTU discovery 665 /* Enable/disable heartbeat, SACK delay, and path MTU discovery
651 * based on association setting. 666 * based on association setting.
652 */ 667 */
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 79daa98208c3..ce8087846f05 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -237,7 +237,9 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
237 /* Account for a different sized first fragment */ 237 /* Account for a different sized first fragment */
238 if (msg_len >= first_len) { 238 if (msg_len >= first_len) {
239 msg->can_delay = 0; 239 msg->can_delay = 0;
240 SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS); 240 if (msg_len > first_len)
241 SCTP_INC_STATS(sock_net(asoc->base.sk),
242 SCTP_MIB_FRAGUSRMSGS);
241 } else { 243 } else {
242 /* Which may be the only one... */ 244 /* Which may be the only one... */
243 first_len = msg_len; 245 first_len = msg_len;
@@ -323,7 +325,8 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
323 if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && 325 if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
324 time_after(jiffies, chunk->msg->expires_at)) { 326 time_after(jiffies, chunk->msg->expires_at)) {
325 struct sctp_stream_out *streamout = 327 struct sctp_stream_out *streamout =
326 &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; 328 SCTP_SO(&chunk->asoc->stream,
329 chunk->sinfo.sinfo_stream);
327 330
328 if (chunk->sent_count) { 331 if (chunk->sent_count) {
329 chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; 332 chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++;
@@ -337,7 +340,8 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
337 } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && 340 } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) &&
338 chunk->sent_count > chunk->sinfo.sinfo_timetolive) { 341 chunk->sent_count > chunk->sinfo.sinfo_timetolive) {
339 struct sctp_stream_out *streamout = 342 struct sctp_stream_out *streamout =
340 &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; 343 SCTP_SO(&chunk->asoc->stream,
344 chunk->sinfo.sinfo_stream);
341 345
342 chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; 346 chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
343 streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; 347 streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index ba8a6e6c36fa..9bbc5f92c941 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -56,6 +56,7 @@
56#include <net/sctp/sm.h> 56#include <net/sctp/sm.h>
57#include <net/sctp/checksum.h> 57#include <net/sctp/checksum.h>
58#include <net/net_namespace.h> 58#include <net/net_namespace.h>
59#include <linux/rhashtable.h>
59 60
60/* Forward declarations for internal helpers. */ 61/* Forward declarations for internal helpers. */
61static int sctp_rcv_ootb(struct sk_buff *); 62static int sctp_rcv_ootb(struct sk_buff *);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 7339918a805d..fc6c5e4bffa5 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -209,12 +209,17 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
209 struct sock *sk = skb->sk; 209 struct sock *sk = skb->sk;
210 struct ipv6_pinfo *np = inet6_sk(sk); 210 struct ipv6_pinfo *np = inet6_sk(sk);
211 struct flowi6 *fl6 = &transport->fl.u.ip6; 211 struct flowi6 *fl6 = &transport->fl.u.ip6;
212 __u8 tclass = np->tclass;
212 int res; 213 int res;
213 214
214 pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, 215 pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
215 skb->len, &fl6->saddr, &fl6->daddr); 216 skb->len, &fl6->saddr, &fl6->daddr);
216 217
217 IP6_ECN_flow_xmit(sk, fl6->flowlabel); 218 if (transport->dscp & SCTP_DSCP_SET_MASK)
219 tclass = transport->dscp & SCTP_DSCP_VAL_MASK;
220
221 if (INET_ECN_is_capable(tclass))
222 IP6_ECN_flow_xmit(sk, fl6->flowlabel);
218 223
219 if (!(transport->param_flags & SPP_PMTUD_ENABLE)) 224 if (!(transport->param_flags & SPP_PMTUD_ENABLE))
220 skb->ignore_df = 1; 225 skb->ignore_df = 1;
@@ -223,7 +228,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
223 228
224 rcu_read_lock(); 229 rcu_read_lock();
225 res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), 230 res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
226 np->tclass); 231 tclass);
227 rcu_read_unlock(); 232 rcu_read_unlock();
228 return res; 233 return res;
229} 234}
@@ -254,6 +259,17 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
254 fl6->flowi6_oif = daddr->v6.sin6_scope_id; 259 fl6->flowi6_oif = daddr->v6.sin6_scope_id;
255 else if (asoc) 260 else if (asoc)
256 fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if; 261 fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if;
262 if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
263 fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);
264
265 if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
266 struct ip6_flowlabel *flowlabel;
267
268 flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
269 if (!flowlabel)
270 goto out;
271 fl6_sock_release(flowlabel);
272 }
257 273
258 pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); 274 pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr);
259 275
@@ -1010,7 +1026,7 @@ static const struct proto_ops inet6_seqpacket_ops = {
1010 .socketpair = sock_no_socketpair, 1026 .socketpair = sock_no_socketpair,
1011 .accept = inet_accept, 1027 .accept = inet_accept,
1012 .getname = sctp_getname, 1028 .getname = sctp_getname,
1013 .poll_mask = sctp_poll_mask, 1029 .poll = sctp_poll,
1014 .ioctl = inet6_ioctl, 1030 .ioctl = inet6_ioctl,
1015 .listen = sctp_inet_listen, 1031 .listen = sctp_inet_listen,
1016 .shutdown = inet_shutdown, 1032 .shutdown = inet_shutdown,
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index d68aa33485a9..d74d00b29942 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -80,7 +80,7 @@ static inline void sctp_outq_head_data(struct sctp_outq *q,
80 q->out_qlen += ch->skb->len; 80 q->out_qlen += ch->skb->len;
81 81
82 stream = sctp_chunk_stream_no(ch); 82 stream = sctp_chunk_stream_no(ch);
83 oute = q->asoc->stream.out[stream].ext; 83 oute = SCTP_SO(&q->asoc->stream, stream)->ext;
84 list_add(&ch->stream_list, &oute->outq); 84 list_add(&ch->stream_list, &oute->outq);
85} 85}
86 86
@@ -101,7 +101,7 @@ static inline void sctp_outq_tail_data(struct sctp_outq *q,
101 q->out_qlen += ch->skb->len; 101 q->out_qlen += ch->skb->len;
102 102
103 stream = sctp_chunk_stream_no(ch); 103 stream = sctp_chunk_stream_no(ch);
104 oute = q->asoc->stream.out[stream].ext; 104 oute = SCTP_SO(&q->asoc->stream, stream)->ext;
105 list_add_tail(&ch->stream_list, &oute->outq); 105 list_add_tail(&ch->stream_list, &oute->outq);
106} 106}
107 107
@@ -372,7 +372,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
372 sctp_insert_list(&asoc->outqueue.abandoned, 372 sctp_insert_list(&asoc->outqueue.abandoned,
373 &chk->transmitted_list); 373 &chk->transmitted_list);
374 374
375 streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; 375 streamout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream);
376 asoc->sent_cnt_removable--; 376 asoc->sent_cnt_removable--;
377 asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; 377 asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
378 streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; 378 streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
@@ -416,7 +416,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
416 asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; 416 asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
417 if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) { 417 if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) {
418 struct sctp_stream_out *streamout = 418 struct sctp_stream_out *streamout =
419 &asoc->stream.out[chk->sinfo.sinfo_stream]; 419 SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream);
420 420
421 streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; 421 streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
422 } 422 }
@@ -1082,6 +1082,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
1082 /* Finally, transmit new packets. */ 1082 /* Finally, transmit new packets. */
1083 while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) { 1083 while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) {
1084 __u32 sid = ntohs(chunk->subh.data_hdr->stream); 1084 __u32 sid = ntohs(chunk->subh.data_hdr->stream);
1085 __u8 stream_state = SCTP_SO(&ctx->asoc->stream, sid)->state;
1085 1086
1086 /* Has this chunk expired? */ 1087 /* Has this chunk expired? */
1087 if (sctp_chunk_abandoned(chunk)) { 1088 if (sctp_chunk_abandoned(chunk)) {
@@ -1091,7 +1092,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
1091 continue; 1092 continue;
1092 } 1093 }
1093 1094
1094 if (ctx->asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) { 1095 if (stream_state == SCTP_STREAM_CLOSED) {
1095 sctp_outq_head_data(ctx->q, chunk); 1096 sctp_outq_head_data(ctx->q, chunk);
1096 break; 1097 break;
1097 } 1098 }
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5dffbc493008..e948db29ab53 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -426,13 +426,16 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
426 struct dst_entry *dst = NULL; 426 struct dst_entry *dst = NULL;
427 union sctp_addr *daddr = &t->ipaddr; 427 union sctp_addr *daddr = &t->ipaddr;
428 union sctp_addr dst_saddr; 428 union sctp_addr dst_saddr;
429 __u8 tos = inet_sk(sk)->tos;
429 430
431 if (t->dscp & SCTP_DSCP_SET_MASK)
432 tos = t->dscp & SCTP_DSCP_VAL_MASK;
430 memset(fl4, 0x0, sizeof(struct flowi4)); 433 memset(fl4, 0x0, sizeof(struct flowi4));
431 fl4->daddr = daddr->v4.sin_addr.s_addr; 434 fl4->daddr = daddr->v4.sin_addr.s_addr;
432 fl4->fl4_dport = daddr->v4.sin_port; 435 fl4->fl4_dport = daddr->v4.sin_port;
433 fl4->flowi4_proto = IPPROTO_SCTP; 436 fl4->flowi4_proto = IPPROTO_SCTP;
434 if (asoc) { 437 if (asoc) {
435 fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk); 438 fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos);
436 fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; 439 fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
437 fl4->fl4_sport = htons(asoc->base.bind_addr.port); 440 fl4->fl4_sport = htons(asoc->base.bind_addr.port);
438 } 441 }
@@ -495,7 +498,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
495 fl4->fl4_sport = laddr->a.v4.sin_port; 498 fl4->fl4_sport = laddr->a.v4.sin_port;
496 flowi4_update_output(fl4, 499 flowi4_update_output(fl4,
497 asoc->base.sk->sk_bound_dev_if, 500 asoc->base.sk->sk_bound_dev_if,
498 RT_CONN_FLAGS(asoc->base.sk), 501 RT_CONN_FLAGS_TOS(asoc->base.sk, tos),
499 daddr->v4.sin_addr.s_addr, 502 daddr->v4.sin_addr.s_addr,
500 laddr->a.v4.sin_addr.s_addr); 503 laddr->a.v4.sin_addr.s_addr);
501 504
@@ -971,16 +974,21 @@ static inline int sctp_v4_xmit(struct sk_buff *skb,
971 struct sctp_transport *transport) 974 struct sctp_transport *transport)
972{ 975{
973 struct inet_sock *inet = inet_sk(skb->sk); 976 struct inet_sock *inet = inet_sk(skb->sk);
977 __u8 dscp = inet->tos;
974 978
975 pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, 979 pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
976 skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr); 980 skb->len, &transport->fl.u.ip4.saddr,
981 &transport->fl.u.ip4.daddr);
982
983 if (transport->dscp & SCTP_DSCP_SET_MASK)
984 dscp = transport->dscp & SCTP_DSCP_VAL_MASK;
977 985
978 inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ? 986 inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ?
979 IP_PMTUDISC_DO : IP_PMTUDISC_DONT; 987 IP_PMTUDISC_DO : IP_PMTUDISC_DONT;
980 988
981 SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); 989 SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS);
982 990
983 return ip_queue_xmit(&inet->sk, skb, &transport->fl); 991 return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp);
984} 992}
985 993
986static struct sctp_af sctp_af_inet; 994static struct sctp_af sctp_af_inet;
@@ -1016,7 +1024,7 @@ static const struct proto_ops inet_seqpacket_ops = {
1016 .socketpair = sock_no_socketpair, 1024 .socketpair = sock_no_socketpair,
1017 .accept = inet_accept, 1025 .accept = inet_accept,
1018 .getname = inet_getname, /* Semantics are different. */ 1026 .getname = inet_getname, /* Semantics are different. */
1019 .poll_mask = sctp_poll_mask, 1027 .poll = sctp_poll,
1020 .ioctl = inet_ioctl, 1028 .ioctl = inet_ioctl,
1021 .listen = sctp_inet_listen, 1029 .listen = sctp_inet_listen,
1022 .shutdown = inet_shutdown, /* Looks harmless. */ 1030 .shutdown = inet_shutdown, /* Looks harmless. */
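
Note: the tos/dscp handling above relies on a set-flag-plus-value encoding in t->dscp (and likewise t->flowlabel): the low bits carry the value, a separate bit records that a per-transport override exists. A minimal, self-contained sketch of that pattern follows; the mask values here are illustrative only, the kernel's own masks are defined in include/net/sctp/structs.h.

	/* Sketch of the set-flag/value encoding assumed by the new DSCP override.
	 * Mask values are illustrative; the kernel defines its own elsewhere.
	 */
	#include <stdint.h>
	#include <stdio.h>

	#define DSCP_SET_MASK 0x01u   /* "an override was configured" flag        */
	#define DSCP_VAL_MASK 0xfcu   /* DSCP lives in the top six bits of the TOS */

	static uint8_t effective_tos(uint8_t sk_tos, uint16_t trans_dscp)
	{
		/* mirrors sctp_v4_get_dst(): start from the socket's tos and let a
		 * per-transport DSCP, if present, take precedence
		 */
		uint8_t tos = sk_tos;

		if (trans_dscp & DSCP_SET_MASK)
			tos = trans_dscp & DSCP_VAL_MASK;
		return tos;
	}

	int main(void)
	{
		printf("no override:   0x%02x\n", effective_tos(0x10, 0));
		printf("with override: 0x%02x\n",
		       effective_tos(0x10, (46 << 2) | DSCP_SET_MASK)); /* EF = 46 */
		return 0;
	}
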
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 298112ca8c06..85d393090238 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1827,4 +1827,3 @@ nomem:
1827 error = -ENOMEM; 1827 error = -ENOMEM;
1828 goto out; 1828 goto out;
1829} 1829}
1830
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index d20f7addee19..e96b15a66aba 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -66,6 +66,7 @@
66#include <linux/slab.h> 66#include <linux/slab.h>
67#include <linux/file.h> 67#include <linux/file.h>
68#include <linux/compat.h> 68#include <linux/compat.h>
69#include <linux/rhashtable.h>
69 70
70#include <net/ip.h> 71#include <net/ip.h>
71#include <net/icmp.h> 72#include <net/icmp.h>
@@ -1696,6 +1697,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
1696 struct sctp_association *asoc; 1697 struct sctp_association *asoc;
1697 enum sctp_scope scope; 1698 enum sctp_scope scope;
1698 struct cmsghdr *cmsg; 1699 struct cmsghdr *cmsg;
1700 __be32 flowinfo = 0;
1699 struct sctp_af *af; 1701 struct sctp_af *af;
1700 int err; 1702 int err;
1701 1703
@@ -1780,6 +1782,9 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
1780 if (!cmsgs->addrs_msg) 1782 if (!cmsgs->addrs_msg)
1781 return 0; 1783 return 0;
1782 1784
1785 if (daddr->sa.sa_family == AF_INET6)
1786 flowinfo = daddr->v6.sin6_flowinfo;
1787
1783 /* sendv addr list parse */ 1788 /* sendv addr list parse */
1784 for_each_cmsghdr(cmsg, cmsgs->addrs_msg) { 1789 for_each_cmsghdr(cmsg, cmsgs->addrs_msg) {
1785 struct sctp_transport *transport; 1790 struct sctp_transport *transport;
@@ -1812,6 +1817,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
1812 } 1817 }
1813 1818
1814 dlen = sizeof(struct in6_addr); 1819 dlen = sizeof(struct in6_addr);
1820 daddr->v6.sin6_flowinfo = flowinfo;
1815 daddr->v6.sin6_family = AF_INET6; 1821 daddr->v6.sin6_family = AF_INET6;
1816 daddr->v6.sin6_port = htons(asoc->peer.port); 1822 daddr->v6.sin6_port = htons(asoc->peer.port);
1817 memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen); 1823 memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen);
@@ -1905,7 +1911,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
1905 goto err; 1911 goto err;
1906 } 1912 }
1907 1913
1908 if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) { 1914 if (unlikely(!SCTP_SO(&asoc->stream, sinfo->sinfo_stream)->ext)) {
1909 err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); 1915 err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream);
1910 if (err) 1916 if (err)
1911 goto err; 1917 goto err;
@@ -2392,6 +2398,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
2392 * uint32_t spp_pathmtu; 2398 * uint32_t spp_pathmtu;
2393 * uint32_t spp_sackdelay; 2399 * uint32_t spp_sackdelay;
2394 * uint32_t spp_flags; 2400 * uint32_t spp_flags;
2401 * uint32_t spp_ipv6_flowlabel;
2402 * uint8_t spp_dscp;
2395 * }; 2403 * };
2396 * 2404 *
2397 * spp_assoc_id - (one-to-many style socket) This is filled in the 2405 * spp_assoc_id - (one-to-many style socket) This is filled in the
@@ -2471,6 +2479,45 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
2471 * also that this field is mutually exclusive to 2479 * also that this field is mutually exclusive to
2472 * SPP_SACKDELAY_ENABLE, setting both will have undefined 2480 * SPP_SACKDELAY_ENABLE, setting both will have undefined
2473 * results. 2481 * results.
2482 *
2483 * SPP_IPV6_FLOWLABEL: Setting this flag enables the
2484 * setting of the IPV6 flow label value. The value is
2485 * contained in the spp_ipv6_flowlabel field.
2486 * Upon retrieval, this flag will be set to indicate that
2487 * the spp_ipv6_flowlabel field has a valid value returned.
2488 * If a specific destination address is set (in the
2489 * spp_address field), then the value returned is that of
2490 * the address. If just an association is specified (and
2491 * no address), then the association's default flow label
2492 * is returned. If neither an association nor a destination
2493 * is specified, then the socket's default flow label is
2494 * returned. For non-IPv6 sockets, this flag will be left
2495 * cleared.
2496 *
2497 * SPP_DSCP: Setting this flag enables the setting of the
2498 * Differentiated Services Code Point (DSCP) value
2499 * associated with either the association or a specific
2500 * address. The value is obtained in the spp_dscp field.
2501 * Upon retrieval, this flag will be set to indicate that
2502 * the spp_dscp field has a valid value returned. If a
2503 * specific destination address is set when called (in the
2504 * spp_address field), then that specific destination
2505 * address's DSCP value is returned. If just an association
2506 * is specified, then the association's default DSCP is
2507 * returned. If neither an association nor a destination is
2508 * specified, then the socket's default DSCP is returned.
2509 *
2510 * spp_ipv6_flowlabel
2511 * - This field is used in conjunction with the
2512 * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
2513 * The 20 least significant bits are used for the flow
2514 * label. This setting has precedence over any IPv6-layer
2515 * setting.
2516 *
2517 * spp_dscp - This field is used in conjunction with the SPP_DSCP flag
2518 * and contains the DSCP. The 6 most significant bits are
2519 * used for the DSCP. This setting has precedence over any
2520 * IPv4- or IPv6- layer setting.
2474 */ 2521 */
2475static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, 2522static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
2476 struct sctp_transport *trans, 2523 struct sctp_transport *trans,
@@ -2610,6 +2657,51 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
2610 } 2657 }
2611 } 2658 }
2612 2659
2660 if (params->spp_flags & SPP_IPV6_FLOWLABEL) {
2661 if (trans && trans->ipaddr.sa.sa_family == AF_INET6) {
2662 trans->flowlabel = params->spp_ipv6_flowlabel &
2663 SCTP_FLOWLABEL_VAL_MASK;
2664 trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
2665 } else if (asoc) {
2666 list_for_each_entry(trans,
2667 &asoc->peer.transport_addr_list,
2668 transports) {
2669 if (trans->ipaddr.sa.sa_family != AF_INET6)
2670 continue;
2671 trans->flowlabel = params->spp_ipv6_flowlabel &
2672 SCTP_FLOWLABEL_VAL_MASK;
2673 trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
2674 }
2675 asoc->flowlabel = params->spp_ipv6_flowlabel &
2676 SCTP_FLOWLABEL_VAL_MASK;
2677 asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
2678 } else if (sctp_opt2sk(sp)->sk_family == AF_INET6) {
2679 sp->flowlabel = params->spp_ipv6_flowlabel &
2680 SCTP_FLOWLABEL_VAL_MASK;
2681 sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
2682 }
2683 }
2684
2685 if (params->spp_flags & SPP_DSCP) {
2686 if (trans) {
2687 trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
2688 trans->dscp |= SCTP_DSCP_SET_MASK;
2689 } else if (asoc) {
2690 list_for_each_entry(trans,
2691 &asoc->peer.transport_addr_list,
2692 transports) {
2693 trans->dscp = params->spp_dscp &
2694 SCTP_DSCP_VAL_MASK;
2695 trans->dscp |= SCTP_DSCP_SET_MASK;
2696 }
2697 asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
2698 asoc->dscp |= SCTP_DSCP_SET_MASK;
2699 } else {
2700 sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
2701 sp->dscp |= SCTP_DSCP_SET_MASK;
2702 }
2703 }
2704
2613 return 0; 2705 return 0;
2614} 2706}
2615 2707
@@ -2624,11 +2716,18 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk,
2624 int error; 2716 int error;
2625 int hb_change, pmtud_change, sackdelay_change; 2717 int hb_change, pmtud_change, sackdelay_change;
2626 2718
2627 if (optlen != sizeof(struct sctp_paddrparams)) 2719 if (optlen == sizeof(params)) {
2720 if (copy_from_user(&params, optval, optlen))
2721 return -EFAULT;
2722 } else if (optlen == ALIGN(offsetof(struct sctp_paddrparams,
2723 spp_ipv6_flowlabel), 4)) {
2724 if (copy_from_user(&params, optval, optlen))
2725 return -EFAULT;
2726 if (params.spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL))
2727 return -EINVAL;
2728 } else {
2628 return -EINVAL; 2729 return -EINVAL;
2629 2730 }
2630 if (copy_from_user(&params, optval, optlen))
2631 return -EFAULT;
2632 2731
2633 /* Validate flags and value parameters. */ 2732 /* Validate flags and value parameters. */
2634 hb_change = params.spp_flags & SPP_HB; 2733 hb_change = params.spp_flags & SPP_HB;
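
Note: the optlen handling above keeps the old ABI working now that struct sctp_paddrparams has grown: the kernel accepts either the legacy size or the full size, and rejects the new flags when the caller passed the short struct. The same pattern in isolation, as a user-space sketch with made-up struct and flag names:

	/* Sketch of the two-size compatibility check used above; struct layouts
	 * and the flag value are illustrative only.
	 */
	#include <errno.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	struct old_params { uint32_t flags; uint32_t pathmtu; };
	struct new_params { uint32_t flags; uint32_t pathmtu;
			    uint32_t flowlabel; uint8_t dscp; };

	#define FLAG_NEEDS_NEW_FIELDS 0x100u

	static int parse_params(const void *optval, size_t optlen,
				struct new_params *out)
	{
		memset(out, 0, sizeof(*out));

		if (optlen == sizeof(struct new_params)) {
			memcpy(out, optval, optlen);  /* full, current layout */
		} else if (optlen == sizeof(struct old_params)) {
			memcpy(out, optval, optlen);  /* legacy layout; new fields stay 0 */
			if (out->flags & FLAG_NEEDS_NEW_FIELDS)
				return -EINVAL;       /* old caller cannot request new fields */
		} else {
			return -EINVAL;
		}
		return 0;
	}
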
@@ -4169,6 +4268,28 @@ out:
4169 return retval; 4268 return retval;
4170} 4269}
4171 4270
4271static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval,
4272 unsigned int optlen)
4273{
4274 int val;
4275
4276 if (!sctp_style(sk, TCP))
4277 return -EOPNOTSUPP;
4278
4279 if (sctp_sk(sk)->ep->base.bind_addr.port)
4280 return -EFAULT;
4281
4282 if (optlen < sizeof(int))
4283 return -EINVAL;
4284
4285 if (get_user(val, (int __user *)optval))
4286 return -EFAULT;
4287
4288 sctp_sk(sk)->reuse = !!val;
4289
4290 return 0;
4291}
4292
4172/* API 6.2 setsockopt(), getsockopt() 4293/* API 6.2 setsockopt(), getsockopt()
4173 * 4294 *
4174 * Applications use setsockopt() and getsockopt() to set or retrieve 4295 * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4363,6 +4484,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
4363 retval = sctp_setsockopt_interleaving_supported(sk, optval, 4484 retval = sctp_setsockopt_interleaving_supported(sk, optval,
4364 optlen); 4485 optlen);
4365 break; 4486 break;
4487 case SCTP_REUSE_PORT:
4488 retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
4489 break;
4366 default: 4490 default:
4367 retval = -ENOPROTOOPT; 4491 retval = -ENOPROTOOPT;
4368 break; 4492 break;
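
Note: from user space, the new SCTP_REUSE_PORT option only applies to one-to-one (TCP-style) SCTP sockets and has to be set before the port is bound. A usage sketch, assuming uapi headers that already export SCTP_REUSE_PORT (error handling trimmed):

	#include <netinet/in.h>
	#include <netinet/sctp.h>
	#include <sys/socket.h>

	int open_reusable_sctp_listener(unsigned short port)
	{
		struct sockaddr_in addr = {
			.sin_family = AF_INET,
			.sin_port   = htons(port),
			.sin_addr   = { htonl(INADDR_ANY) },
		};
		int one = 1;
		int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);

		if (fd < 0)
			return -1;
		/* must precede bind(): the kernel rejects the option once a port is bound */
		setsockopt(fd, IPPROTO_SCTP, SCTP_REUSE_PORT, &one, sizeof(one));
		bind(fd, (struct sockaddr *)&addr, sizeof(addr));
		listen(fd, 5);
		return fd;
	}
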
@@ -5427,6 +5551,45 @@ out:
5427 * also that this field is mutually exclusive to 5551 * also that this field is mutually exclusive to
5428 * SPP_SACKDELAY_ENABLE, setting both will have undefined 5552 * SPP_SACKDELAY_ENABLE, setting both will have undefined
5429 * results. 5553 * results.
5554 *
5555 * SPP_IPV6_FLOWLABEL: Setting this flag enables the
5556 * setting of the IPV6 flow label value. The value is
5557 * contained in the spp_ipv6_flowlabel field.
5558 * Upon retrieval, this flag will be set to indicate that
5559 * the spp_ipv6_flowlabel field has a valid value returned.
5560 * If a specific destination address is set (in the
5561 * spp_address field), then the value returned is that of
5562 * the address. If just an association is specified (and
5563 * no address), then the association's default flow label
5564 * is returned. If neither an association nor a destination
5565 * is specified, then the socket's default flow label is
5566 * returned. For non-IPv6 sockets, this flag will be left
5567 * cleared.
5568 *
5569 * SPP_DSCP: Setting this flag enables the setting of the
5570 * Differentiated Services Code Point (DSCP) value
5571 * associated with either the association or a specific
5572 * address. The value is obtained in the spp_dscp field.
5573 * Upon retrieval, this flag will be set to indicate that
5574 * the spp_dscp field has a valid value returned. If a
5575 * specific destination address is set when called (in the
5576 * spp_address field), then that specific destination
5577 * address's DSCP value is returned. If just an association
5578 * is specified, then the association's default DSCP is
5579 * returned. If neither an association nor a destination is
5580 * specified, then the socket's default DSCP is returned.
5581 *
5582 * spp_ipv6_flowlabel
5583 * - This field is used in conjunction with the
5584 * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
5585 * The 20 least significant bits are used for the flow
5586 * label. This setting has precedence over any IPv6-layer
5587 * setting.
5588 *
5589 * spp_dscp - This field is used in conjunction with the SPP_DSCP flag
5590 * and contains the DSCP. The 6 most significant bits are
5591 * used for the DSCP. This setting has precedence over any
5592 * IPv4- or IPv6- layer setting.
5430 */ 5593 */
5431static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, 5594static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
5432 char __user *optval, int __user *optlen) 5595 char __user *optval, int __user *optlen)
@@ -5436,9 +5599,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
5436 struct sctp_association *asoc = NULL; 5599 struct sctp_association *asoc = NULL;
5437 struct sctp_sock *sp = sctp_sk(sk); 5600 struct sctp_sock *sp = sctp_sk(sk);
5438 5601
5439 if (len < sizeof(struct sctp_paddrparams)) 5602 if (len >= sizeof(params))
5603 len = sizeof(params);
5604 else if (len >= ALIGN(offsetof(struct sctp_paddrparams,
5605 spp_ipv6_flowlabel), 4))
5606 len = ALIGN(offsetof(struct sctp_paddrparams,
5607 spp_ipv6_flowlabel), 4);
5608 else
5440 return -EINVAL; 5609 return -EINVAL;
5441 len = sizeof(struct sctp_paddrparams); 5610
5442 if (copy_from_user(&params, optval, len)) 5611 if (copy_from_user(&params, optval, len))
5443 return -EFAULT; 5612 return -EFAULT;
5444 5613
@@ -5473,6 +5642,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
5473 5642
5474 /*draft-11 doesn't say what to return in spp_flags*/ 5643 /*draft-11 doesn't say what to return in spp_flags*/
5475 params.spp_flags = trans->param_flags; 5644 params.spp_flags = trans->param_flags;
5645 if (trans->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
5646 params.spp_ipv6_flowlabel = trans->flowlabel &
5647 SCTP_FLOWLABEL_VAL_MASK;
5648 params.spp_flags |= SPP_IPV6_FLOWLABEL;
5649 }
5650 if (trans->dscp & SCTP_DSCP_SET_MASK) {
5651 params.spp_dscp = trans->dscp & SCTP_DSCP_VAL_MASK;
5652 params.spp_flags |= SPP_DSCP;
5653 }
5476 } else if (asoc) { 5654 } else if (asoc) {
5477 /* Fetch association values. */ 5655 /* Fetch association values. */
5478 params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); 5656 params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
@@ -5482,6 +5660,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
5482 5660
5483 /*draft-11 doesn't say what to return in spp_flags*/ 5661 /*draft-11 doesn't say what to return in spp_flags*/
5484 params.spp_flags = asoc->param_flags; 5662 params.spp_flags = asoc->param_flags;
5663 if (asoc->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
5664 params.spp_ipv6_flowlabel = asoc->flowlabel &
5665 SCTP_FLOWLABEL_VAL_MASK;
5666 params.spp_flags |= SPP_IPV6_FLOWLABEL;
5667 }
5668 if (asoc->dscp & SCTP_DSCP_SET_MASK) {
5669 params.spp_dscp = asoc->dscp & SCTP_DSCP_VAL_MASK;
5670 params.spp_flags |= SPP_DSCP;
5671 }
5485 } else { 5672 } else {
5486 /* Fetch socket values. */ 5673 /* Fetch socket values. */
5487 params.spp_hbinterval = sp->hbinterval; 5674 params.spp_hbinterval = sp->hbinterval;
@@ -5491,6 +5678,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
5491 5678
5492 /*draft-11 doesn't say what to return in spp_flags*/ 5679 /*draft-11 doesn't say what to return in spp_flags*/
5493 params.spp_flags = sp->param_flags; 5680 params.spp_flags = sp->param_flags;
5681 if (sp->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
5682 params.spp_ipv6_flowlabel = sp->flowlabel &
5683 SCTP_FLOWLABEL_VAL_MASK;
5684 params.spp_flags |= SPP_IPV6_FLOWLABEL;
5685 }
5686 if (sp->dscp & SCTP_DSCP_SET_MASK) {
5687 params.spp_dscp = sp->dscp & SCTP_DSCP_VAL_MASK;
5688 params.spp_flags |= SPP_DSCP;
5689 }
5494 } 5690 }
5495 5691
5496 if (copy_to_user(optval, &params, len)) 5692 if (copy_to_user(optval, &params, len))
@@ -6958,7 +7154,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len,
6958 if (!asoc || params.sprstat_sid >= asoc->stream.outcnt) 7154 if (!asoc || params.sprstat_sid >= asoc->stream.outcnt)
6959 goto out; 7155 goto out;
6960 7156
6961 streamoute = asoc->stream.out[params.sprstat_sid].ext; 7157 streamoute = SCTP_SO(&asoc->stream, params.sprstat_sid)->ext;
6962 if (!streamoute) { 7158 if (!streamoute) {
6963 /* Not allocated yet, means all stats are 0 */ 7159 /* Not allocated yet, means all stats are 0 */
6964 params.sprstat_abandoned_unsent = 0; 7160 params.sprstat_abandoned_unsent = 0;
@@ -7196,6 +7392,26 @@ out:
7196 return retval; 7392 return retval;
7197} 7393}
7198 7394
7395static int sctp_getsockopt_reuse_port(struct sock *sk, int len,
7396 char __user *optval,
7397 int __user *optlen)
7398{
7399 int val;
7400
7401 if (len < sizeof(int))
7402 return -EINVAL;
7403
7404 len = sizeof(int);
7405 val = sctp_sk(sk)->reuse;
7406 if (put_user(len, optlen))
7407 return -EFAULT;
7408
7409 if (copy_to_user(optval, &val, len))
7410 return -EFAULT;
7411
7412 return 0;
7413}
7414
7199static int sctp_getsockopt(struct sock *sk, int level, int optname, 7415static int sctp_getsockopt(struct sock *sk, int level, int optname,
7200 char __user *optval, int __user *optlen) 7416 char __user *optval, int __user *optlen)
7201{ 7417{
@@ -7391,6 +7607,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
7391 retval = sctp_getsockopt_interleaving_supported(sk, len, optval, 7607 retval = sctp_getsockopt_interleaving_supported(sk, len, optval,
7392 optlen); 7608 optlen);
7393 break; 7609 break;
7610 case SCTP_REUSE_PORT:
7611 retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
7612 break;
7394 default: 7613 default:
7395 retval = -ENOPROTOOPT; 7614 retval = -ENOPROTOOPT;
7396 break; 7615 break;
@@ -7428,6 +7647,7 @@ static struct sctp_bind_bucket *sctp_bucket_create(
7428 7647
7429static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) 7648static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
7430{ 7649{
7650 bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse);
7431 struct sctp_bind_hashbucket *head; /* hash list */ 7651 struct sctp_bind_hashbucket *head; /* hash list */
7432 struct sctp_bind_bucket *pp; 7652 struct sctp_bind_bucket *pp;
7433 unsigned short snum; 7653 unsigned short snum;
@@ -7500,13 +7720,11 @@ pp_found:
7500 * used by other socket (pp->owner not empty); that other 7720 * used by other socket (pp->owner not empty); that other
7501 * socket is going to be sk2. 7721 * socket is going to be sk2.
7502 */ 7722 */
7503 int reuse = sk->sk_reuse;
7504 struct sock *sk2; 7723 struct sock *sk2;
7505 7724
7506 pr_debug("%s: found a possible match\n", __func__); 7725 pr_debug("%s: found a possible match\n", __func__);
7507 7726
7508 if (pp->fastreuse && sk->sk_reuse && 7727 if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING)
7509 sk->sk_state != SCTP_SS_LISTENING)
7510 goto success; 7728 goto success;
7511 7729
7512 /* Run through the list of sockets bound to the port 7730 /* Run through the list of sockets bound to the port
@@ -7524,7 +7742,7 @@ pp_found:
7524 ep2 = sctp_sk(sk2)->ep; 7742 ep2 = sctp_sk(sk2)->ep;
7525 7743
7526 if (sk == sk2 || 7744 if (sk == sk2 ||
7527 (reuse && sk2->sk_reuse && 7745 (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) &&
7528 sk2->sk_state != SCTP_SS_LISTENING)) 7746 sk2->sk_state != SCTP_SS_LISTENING))
7529 continue; 7747 continue;
7530 7748
@@ -7548,12 +7766,12 @@ pp_not_found:
7548 * SO_REUSEADDR on this socket -sk-). 7766 * SO_REUSEADDR on this socket -sk-).
7549 */ 7767 */
7550 if (hlist_empty(&pp->owner)) { 7768 if (hlist_empty(&pp->owner)) {
7551 if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING) 7769 if (reuse && sk->sk_state != SCTP_SS_LISTENING)
7552 pp->fastreuse = 1; 7770 pp->fastreuse = 1;
7553 else 7771 else
7554 pp->fastreuse = 0; 7772 pp->fastreuse = 0;
7555 } else if (pp->fastreuse && 7773 } else if (pp->fastreuse &&
7556 (!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING)) 7774 (!reuse || sk->sk_state == SCTP_SS_LISTENING))
7557 pp->fastreuse = 0; 7775 pp->fastreuse = 0;
7558 7776
7559 /* We are set, so fill up all the data in the hash table 7777 /* We are set, so fill up all the data in the hash table
@@ -7684,7 +7902,7 @@ int sctp_inet_listen(struct socket *sock, int backlog)
7684 err = 0; 7902 err = 0;
7685 sctp_unhash_endpoint(ep); 7903 sctp_unhash_endpoint(ep);
7686 sk->sk_state = SCTP_SS_CLOSED; 7904 sk->sk_state = SCTP_SS_CLOSED;
7687 if (sk->sk_reuse) 7905 if (sk->sk_reuse || sctp_sk(sk)->reuse)
7688 sctp_sk(sk)->bind_hash->fastreuse = 1; 7906 sctp_sk(sk)->bind_hash->fastreuse = 1;
7689 goto out; 7907 goto out;
7690 } 7908 }
@@ -7717,12 +7935,14 @@ out:
7717 * here, again, by modeling the current TCP/UDP code. We don't have 7935 * here, again, by modeling the current TCP/UDP code. We don't have
7718 * a good way to test with it yet. 7936 * a good way to test with it yet.
7719 */ 7937 */
7720__poll_t sctp_poll_mask(struct socket *sock, __poll_t events) 7938__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
7721{ 7939{
7722 struct sock *sk = sock->sk; 7940 struct sock *sk = sock->sk;
7723 struct sctp_sock *sp = sctp_sk(sk); 7941 struct sctp_sock *sp = sctp_sk(sk);
7724 __poll_t mask; 7942 __poll_t mask;
7725 7943
7944 poll_wait(file, sk_sleep(sk), wait);
7945
7726 sock_rps_record_flow(sk); 7946 sock_rps_record_flow(sk);
7727 7947
7728 /* A TCP-style listening socket becomes readable when the accept queue 7948 /* A TCP-style listening socket becomes readable when the accept queue
@@ -8549,6 +8769,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
8549 newsk->sk_no_check_tx = sk->sk_no_check_tx; 8769 newsk->sk_no_check_tx = sk->sk_no_check_tx;
8550 newsk->sk_no_check_rx = sk->sk_no_check_rx; 8770 newsk->sk_no_check_rx = sk->sk_no_check_rx;
8551 newsk->sk_reuse = sk->sk_reuse; 8771 newsk->sk_reuse = sk->sk_reuse;
8772 sctp_sk(newsk)->reuse = sp->reuse;
8552 8773
8553 newsk->sk_shutdown = sk->sk_shutdown; 8774 newsk->sk_shutdown = sk->sk_shutdown;
8554 newsk->sk_destruct = sctp_destruct_sock; 8775 newsk->sk_destruct = sctp_destruct_sock;
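
Note: per the comment blocks above, the new spp_dscp and spp_ipv6_flowlabel fields are driven through SCTP_PEER_ADDR_PARAMS. A user-space sketch, assuming headers that carry the extended struct sctp_paddrparams and the SPP_DSCP/SPP_IPV6_FLOWLABEL flags (error handling trimmed):

	#include <string.h>
	#include <netinet/in.h>
	#include <netinet/sctp.h>
	#include <sys/socket.h>

	int set_assoc_qos(int fd, sctp_assoc_t assoc_id)
	{
		struct sctp_paddrparams p;

		memset(&p, 0, sizeof(p));
		p.spp_assoc_id       = assoc_id;
		p.spp_flags          = SPP_DSCP | SPP_IPV6_FLOWLABEL;
		p.spp_dscp           = 46 << 2;   /* EF, carried in the upper six bits */
		p.spp_ipv6_flowlabel = 0x12345;   /* 20-bit flow label */

		/* leaving spp_address zeroed applies the values association-wide */
		return setsockopt(fd, IPPROTO_SCTP, SCTP_PEER_ADDR_PARAMS,
				  &p, sizeof(p));
	}
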
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index f1f1d1b232ba..ffb940d3b57c 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -37,6 +37,53 @@
37#include <net/sctp/sm.h> 37#include <net/sctp/sm.h>
38#include <net/sctp/stream_sched.h> 38#include <net/sctp/stream_sched.h>
39 39
40static struct flex_array *fa_alloc(size_t elem_size, size_t elem_count,
41 gfp_t gfp)
42{
43 struct flex_array *result;
44 int err;
45
46 result = flex_array_alloc(elem_size, elem_count, gfp);
47 if (result) {
48 err = flex_array_prealloc(result, 0, elem_count, gfp);
49 if (err) {
50 flex_array_free(result);
51 result = NULL;
52 }
53 }
54
55 return result;
56}
57
58static void fa_free(struct flex_array *fa)
59{
60 if (fa)
61 flex_array_free(fa);
62}
63
64static void fa_copy(struct flex_array *fa, struct flex_array *from,
65 size_t index, size_t count)
66{
67 void *elem;
68
69 while (count--) {
70 elem = flex_array_get(from, index);
71 flex_array_put(fa, index, elem, 0);
72 index++;
73 }
74}
75
76static void fa_zero(struct flex_array *fa, size_t index, size_t count)
77{
78 void *elem;
79
80 while (count--) {
81 elem = flex_array_get(fa, index);
82 memset(elem, 0, fa->element_size);
83 index++;
84 }
85}
86
40/* Migrates chunks from stream queues to new stream queues if needed, 87/* Migrates chunks from stream queues to new stream queues if needed,
41 * but not across associations. Also, removes those chunks to streams 88 * but not across associations. Also, removes those chunks to streams
42 * higher than the new max. 89 * higher than the new max.
@@ -78,34 +125,33 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream,
78 * sctp_stream_update will swap ->out pointers. 125 * sctp_stream_update will swap ->out pointers.
79 */ 126 */
80 for (i = 0; i < outcnt; i++) { 127 for (i = 0; i < outcnt; i++) {
81 kfree(new->out[i].ext); 128 kfree(SCTP_SO(new, i)->ext);
82 new->out[i].ext = stream->out[i].ext; 129 SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext;
83 stream->out[i].ext = NULL; 130 SCTP_SO(stream, i)->ext = NULL;
84 } 131 }
85 } 132 }
86 133
87 for (i = outcnt; i < stream->outcnt; i++) 134 for (i = outcnt; i < stream->outcnt; i++)
88 kfree(stream->out[i].ext); 135 kfree(SCTP_SO(stream, i)->ext);
89} 136}
90 137
91static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, 138static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
92 gfp_t gfp) 139 gfp_t gfp)
93{ 140{
94 struct sctp_stream_out *out; 141 struct flex_array *out;
142 size_t elem_size = sizeof(struct sctp_stream_out);
95 143
96 out = kmalloc_array(outcnt, sizeof(*out), gfp); 144 out = fa_alloc(elem_size, outcnt, gfp);
97 if (!out) 145 if (!out)
98 return -ENOMEM; 146 return -ENOMEM;
99 147
100 if (stream->out) { 148 if (stream->out) {
101 memcpy(out, stream->out, min(outcnt, stream->outcnt) * 149 fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
102 sizeof(*out)); 150 fa_free(stream->out);
103 kfree(stream->out);
104 } 151 }
105 152
106 if (outcnt > stream->outcnt) 153 if (outcnt > stream->outcnt)
107 memset(out + stream->outcnt, 0, 154 fa_zero(out, stream->outcnt, (outcnt - stream->outcnt));
108 (outcnt - stream->outcnt) * sizeof(*out));
109 155
110 stream->out = out; 156 stream->out = out;
111 157
@@ -115,22 +161,20 @@ static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
115static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, 161static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt,
116 gfp_t gfp) 162 gfp_t gfp)
117{ 163{
118 struct sctp_stream_in *in; 164 struct flex_array *in;
119 165 size_t elem_size = sizeof(struct sctp_stream_in);
120 in = kmalloc_array(incnt, sizeof(*stream->in), gfp);
121 166
167 in = fa_alloc(elem_size, incnt, gfp);
122 if (!in) 168 if (!in)
123 return -ENOMEM; 169 return -ENOMEM;
124 170
125 if (stream->in) { 171 if (stream->in) {
126 memcpy(in, stream->in, min(incnt, stream->incnt) * 172 fa_copy(in, stream->in, 0, min(incnt, stream->incnt));
127 sizeof(*in)); 173 fa_free(stream->in);
128 kfree(stream->in);
129 } 174 }
130 175
131 if (incnt > stream->incnt) 176 if (incnt > stream->incnt)
132 memset(in + stream->incnt, 0, 177 fa_zero(in, stream->incnt, (incnt - stream->incnt));
133 (incnt - stream->incnt) * sizeof(*in));
134 178
135 stream->in = in; 179 stream->in = in;
136 180
@@ -162,7 +206,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
162 206
163 stream->outcnt = outcnt; 207 stream->outcnt = outcnt;
164 for (i = 0; i < stream->outcnt; i++) 208 for (i = 0; i < stream->outcnt; i++)
165 stream->out[i].state = SCTP_STREAM_OPEN; 209 SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
166 210
167 sched->init(stream); 211 sched->init(stream);
168 212
@@ -174,7 +218,7 @@ in:
174 ret = sctp_stream_alloc_in(stream, incnt, gfp); 218 ret = sctp_stream_alloc_in(stream, incnt, gfp);
175 if (ret) { 219 if (ret) {
176 sched->free(stream); 220 sched->free(stream);
177 kfree(stream->out); 221 fa_free(stream->out);
178 stream->out = NULL; 222 stream->out = NULL;
179 stream->outcnt = 0; 223 stream->outcnt = 0;
180 goto out; 224 goto out;
@@ -193,7 +237,7 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid)
193 soute = kzalloc(sizeof(*soute), GFP_KERNEL); 237 soute = kzalloc(sizeof(*soute), GFP_KERNEL);
194 if (!soute) 238 if (!soute)
195 return -ENOMEM; 239 return -ENOMEM;
196 stream->out[sid].ext = soute; 240 SCTP_SO(stream, sid)->ext = soute;
197 241
198 return sctp_sched_init_sid(stream, sid, GFP_KERNEL); 242 return sctp_sched_init_sid(stream, sid, GFP_KERNEL);
199} 243}
@@ -205,9 +249,9 @@ void sctp_stream_free(struct sctp_stream *stream)
205 249
206 sched->free(stream); 250 sched->free(stream);
207 for (i = 0; i < stream->outcnt; i++) 251 for (i = 0; i < stream->outcnt; i++)
208 kfree(stream->out[i].ext); 252 kfree(SCTP_SO(stream, i)->ext);
209 kfree(stream->out); 253 fa_free(stream->out);
210 kfree(stream->in); 254 fa_free(stream->in);
211} 255}
212 256
213void sctp_stream_clear(struct sctp_stream *stream) 257void sctp_stream_clear(struct sctp_stream *stream)
@@ -215,12 +259,12 @@ void sctp_stream_clear(struct sctp_stream *stream)
215 int i; 259 int i;
216 260
217 for (i = 0; i < stream->outcnt; i++) { 261 for (i = 0; i < stream->outcnt; i++) {
218 stream->out[i].mid = 0; 262 SCTP_SO(stream, i)->mid = 0;
219 stream->out[i].mid_uo = 0; 263 SCTP_SO(stream, i)->mid_uo = 0;
220 } 264 }
221 265
222 for (i = 0; i < stream->incnt; i++) 266 for (i = 0; i < stream->incnt; i++)
223 stream->in[i].mid = 0; 267 SCTP_SI(stream, i)->mid = 0;
224} 268}
225 269
226void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) 270void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
@@ -273,8 +317,8 @@ static bool sctp_stream_outq_is_empty(struct sctp_stream *stream,
273 for (i = 0; i < str_nums; i++) { 317 for (i = 0; i < str_nums; i++) {
274 __u16 sid = ntohs(str_list[i]); 318 __u16 sid = ntohs(str_list[i]);
275 319
276 if (stream->out[sid].ext && 320 if (SCTP_SO(stream, sid)->ext &&
277 !list_empty(&stream->out[sid].ext->outq)) 321 !list_empty(&SCTP_SO(stream, sid)->ext->outq))
278 return false; 322 return false;
279 } 323 }
280 324
@@ -361,11 +405,11 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
361 if (out) { 405 if (out) {
362 if (str_nums) 406 if (str_nums)
363 for (i = 0; i < str_nums; i++) 407 for (i = 0; i < str_nums; i++)
364 stream->out[str_list[i]].state = 408 SCTP_SO(stream, str_list[i])->state =
365 SCTP_STREAM_CLOSED; 409 SCTP_STREAM_CLOSED;
366 else 410 else
367 for (i = 0; i < stream->outcnt; i++) 411 for (i = 0; i < stream->outcnt; i++)
368 stream->out[i].state = SCTP_STREAM_CLOSED; 412 SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED;
369 } 413 }
370 414
371 asoc->strreset_chunk = chunk; 415 asoc->strreset_chunk = chunk;
@@ -380,11 +424,11 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
380 424
381 if (str_nums) 425 if (str_nums)
382 for (i = 0; i < str_nums; i++) 426 for (i = 0; i < str_nums; i++)
383 stream->out[str_list[i]].state = 427 SCTP_SO(stream, str_list[i])->state =
384 SCTP_STREAM_OPEN; 428 SCTP_STREAM_OPEN;
385 else 429 else
386 for (i = 0; i < stream->outcnt; i++) 430 for (i = 0; i < stream->outcnt; i++)
387 stream->out[i].state = SCTP_STREAM_OPEN; 431 SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
388 432
389 goto out; 433 goto out;
390 } 434 }
@@ -418,7 +462,7 @@ int sctp_send_reset_assoc(struct sctp_association *asoc)
418 462
419 /* Block further xmit of data until this request is completed */ 463 /* Block further xmit of data until this request is completed */
420 for (i = 0; i < stream->outcnt; i++) 464 for (i = 0; i < stream->outcnt; i++)
421 stream->out[i].state = SCTP_STREAM_CLOSED; 465 SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED;
422 466
423 asoc->strreset_chunk = chunk; 467 asoc->strreset_chunk = chunk;
424 sctp_chunk_hold(asoc->strreset_chunk); 468 sctp_chunk_hold(asoc->strreset_chunk);
@@ -429,7 +473,7 @@ int sctp_send_reset_assoc(struct sctp_association *asoc)
429 asoc->strreset_chunk = NULL; 473 asoc->strreset_chunk = NULL;
430 474
431 for (i = 0; i < stream->outcnt; i++) 475 for (i = 0; i < stream->outcnt; i++)
432 stream->out[i].state = SCTP_STREAM_OPEN; 476 SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
433 477
434 return retval; 478 return retval;
435 } 479 }
@@ -609,10 +653,10 @@ struct sctp_chunk *sctp_process_strreset_outreq(
609 } 653 }
610 654
611 for (i = 0; i < nums; i++) 655 for (i = 0; i < nums; i++)
612 stream->in[ntohs(str_p[i])].mid = 0; 656 SCTP_SI(stream, ntohs(str_p[i]))->mid = 0;
613 } else { 657 } else {
614 for (i = 0; i < stream->incnt; i++) 658 for (i = 0; i < stream->incnt; i++)
615 stream->in[i].mid = 0; 659 SCTP_SI(stream, i)->mid = 0;
616 } 660 }
617 661
618 result = SCTP_STRRESET_PERFORMED; 662 result = SCTP_STRRESET_PERFORMED;
@@ -683,11 +727,11 @@ struct sctp_chunk *sctp_process_strreset_inreq(
683 727
684 if (nums) 728 if (nums)
685 for (i = 0; i < nums; i++) 729 for (i = 0; i < nums; i++)
686 stream->out[ntohs(str_p[i])].state = 730 SCTP_SO(stream, ntohs(str_p[i]))->state =
687 SCTP_STREAM_CLOSED; 731 SCTP_STREAM_CLOSED;
688 else 732 else
689 for (i = 0; i < stream->outcnt; i++) 733 for (i = 0; i < stream->outcnt; i++)
690 stream->out[i].state = SCTP_STREAM_CLOSED; 734 SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED;
691 735
692 asoc->strreset_chunk = chunk; 736 asoc->strreset_chunk = chunk;
693 asoc->strreset_outstanding = 1; 737 asoc->strreset_outstanding = 1;
@@ -786,11 +830,11 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
786 * incoming and outgoing streams. 830 * incoming and outgoing streams.
787 */ 831 */
788 for (i = 0; i < stream->outcnt; i++) { 832 for (i = 0; i < stream->outcnt; i++) {
789 stream->out[i].mid = 0; 833 SCTP_SO(stream, i)->mid = 0;
790 stream->out[i].mid_uo = 0; 834 SCTP_SO(stream, i)->mid_uo = 0;
791 } 835 }
792 for (i = 0; i < stream->incnt; i++) 836 for (i = 0; i < stream->incnt; i++)
793 stream->in[i].mid = 0; 837 SCTP_SI(stream, i)->mid = 0;
794 838
795 result = SCTP_STRRESET_PERFORMED; 839 result = SCTP_STRRESET_PERFORMED;
796 840
@@ -979,15 +1023,18 @@ struct sctp_chunk *sctp_process_strreset_resp(
979 sizeof(__u16); 1023 sizeof(__u16);
980 1024
981 if (result == SCTP_STRRESET_PERFORMED) { 1025 if (result == SCTP_STRRESET_PERFORMED) {
1026 struct sctp_stream_out *sout;
982 if (nums) { 1027 if (nums) {
983 for (i = 0; i < nums; i++) { 1028 for (i = 0; i < nums; i++) {
984 stream->out[ntohs(str_p[i])].mid = 0; 1029 sout = SCTP_SO(stream, ntohs(str_p[i]));
985 stream->out[ntohs(str_p[i])].mid_uo = 0; 1030 sout->mid = 0;
1031 sout->mid_uo = 0;
986 } 1032 }
987 } else { 1033 } else {
988 for (i = 0; i < stream->outcnt; i++) { 1034 for (i = 0; i < stream->outcnt; i++) {
989 stream->out[i].mid = 0; 1035 sout = SCTP_SO(stream, i);
990 stream->out[i].mid_uo = 0; 1036 sout->mid = 0;
1037 sout->mid_uo = 0;
991 } 1038 }
992 } 1039 }
993 1040
@@ -995,7 +1042,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
995 } 1042 }
996 1043
997 for (i = 0; i < stream->outcnt; i++) 1044 for (i = 0; i < stream->outcnt; i++)
998 stream->out[i].state = SCTP_STREAM_OPEN; 1045 SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
999 1046
1000 *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, 1047 *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags,
1001 nums, str_p, GFP_ATOMIC); 1048 nums, str_p, GFP_ATOMIC);
@@ -1050,15 +1097,15 @@ struct sctp_chunk *sctp_process_strreset_resp(
1050 asoc->adv_peer_ack_point = asoc->ctsn_ack_point; 1097 asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
1051 1098
1052 for (i = 0; i < stream->outcnt; i++) { 1099 for (i = 0; i < stream->outcnt; i++) {
1053 stream->out[i].mid = 0; 1100 SCTP_SO(stream, i)->mid = 0;
1054 stream->out[i].mid_uo = 0; 1101 SCTP_SO(stream, i)->mid_uo = 0;
1055 } 1102 }
1056 for (i = 0; i < stream->incnt; i++) 1103 for (i = 0; i < stream->incnt; i++)
1057 stream->in[i].mid = 0; 1104 SCTP_SI(stream, i)->mid = 0;
1058 } 1105 }
1059 1106
1060 for (i = 0; i < stream->outcnt; i++) 1107 for (i = 0; i < stream->outcnt; i++)
1061 stream->out[i].state = SCTP_STREAM_OPEN; 1108 SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
1062 1109
1063 *evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags, 1110 *evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags,
1064 stsn, rtsn, GFP_ATOMIC); 1111 stsn, rtsn, GFP_ATOMIC);
@@ -1072,7 +1119,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
1072 1119
1073 if (result == SCTP_STRRESET_PERFORMED) 1120 if (result == SCTP_STRRESET_PERFORMED)
1074 for (i = number; i < stream->outcnt; i++) 1121 for (i = number; i < stream->outcnt; i++)
1075 stream->out[i].state = SCTP_STREAM_OPEN; 1122 SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
1076 else 1123 else
1077 stream->outcnt = number; 1124 stream->outcnt = number;
1078 1125
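
Note: the SCTP_SO()/SCTP_SI() accessors that replace the direct stream->out[i]/stream->in[i] indexing are thin wrappers over flex_array_get(); roughly the shape below, though the authoritative definitions live in include/net/sctp/structs.h rather than in this hunk:

	/* Approximate shape of the accessors used throughout this conversion;
	 * see include/net/sctp/structs.h for the real definitions.
	 */
	static inline struct sctp_stream_out *sctp_stream_out(
		struct sctp_stream *stream, __u16 sid)
	{
		return flex_array_get(stream->out, sid);
	}

	static inline struct sctp_stream_in *sctp_stream_in(
		struct sctp_stream *stream, __u16 sid)
	{
		return flex_array_get(stream->in, sid);
	}

	#define SCTP_SO(s, i) sctp_stream_out((s), (i))
	#define SCTP_SI(s, i) sctp_stream_in((s), (i))
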
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index d3764c181299..0a78cdf86463 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -197,7 +197,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_partial(
197 __u32 next_fsn = 0; 197 __u32 next_fsn = 0;
198 int is_last = 0; 198 int is_last = 0;
199 199
200 sin = sctp_stream_in(ulpq->asoc, event->stream); 200 sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
201 201
202 skb_queue_walk(&ulpq->reasm, pos) { 202 skb_queue_walk(&ulpq->reasm, pos) {
203 struct sctp_ulpevent *cevent = sctp_skb2event(pos); 203 struct sctp_ulpevent *cevent = sctp_skb2event(pos);
@@ -278,7 +278,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled(
278 __u32 pd_len = 0; 278 __u32 pd_len = 0;
279 __u32 mid = 0; 279 __u32 mid = 0;
280 280
281 sin = sctp_stream_in(ulpq->asoc, event->stream); 281 sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
282 282
283 skb_queue_walk(&ulpq->reasm, pos) { 283 skb_queue_walk(&ulpq->reasm, pos) {
284 struct sctp_ulpevent *cevent = sctp_skb2event(pos); 284 struct sctp_ulpevent *cevent = sctp_skb2event(pos);
@@ -368,7 +368,7 @@ static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq,
368 368
369 sctp_intl_store_reasm(ulpq, event); 369 sctp_intl_store_reasm(ulpq, event);
370 370
371 sin = sctp_stream_in(ulpq->asoc, event->stream); 371 sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
372 if (sin->pd_mode && event->mid == sin->mid && 372 if (sin->pd_mode && event->mid == sin->mid &&
373 event->fsn == sin->fsn) 373 event->fsn == sin->fsn)
374 retval = sctp_intl_retrieve_partial(ulpq, event); 374 retval = sctp_intl_retrieve_partial(ulpq, event);
@@ -575,7 +575,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo(
575 __u32 next_fsn = 0; 575 __u32 next_fsn = 0;
576 int is_last = 0; 576 int is_last = 0;
577 577
578 sin = sctp_stream_in(ulpq->asoc, event->stream); 578 sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
579 579
580 skb_queue_walk(&ulpq->reasm_uo, pos) { 580 skb_queue_walk(&ulpq->reasm_uo, pos) {
581 struct sctp_ulpevent *cevent = sctp_skb2event(pos); 581 struct sctp_ulpevent *cevent = sctp_skb2event(pos);
@@ -659,7 +659,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo(
659 __u32 pd_len = 0; 659 __u32 pd_len = 0;
660 __u32 mid = 0; 660 __u32 mid = 0;
661 661
662 sin = sctp_stream_in(ulpq->asoc, event->stream); 662 sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
663 663
664 skb_queue_walk(&ulpq->reasm_uo, pos) { 664 skb_queue_walk(&ulpq->reasm_uo, pos) {
665 struct sctp_ulpevent *cevent = sctp_skb2event(pos); 665 struct sctp_ulpevent *cevent = sctp_skb2event(pos);
@@ -750,7 +750,7 @@ static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq,
750 750
751 sctp_intl_store_reasm_uo(ulpq, event); 751 sctp_intl_store_reasm_uo(ulpq, event);
752 752
753 sin = sctp_stream_in(ulpq->asoc, event->stream); 753 sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
754 if (sin->pd_mode_uo && event->mid == sin->mid_uo && 754 if (sin->pd_mode_uo && event->mid == sin->mid_uo &&
755 event->fsn == sin->fsn_uo) 755 event->fsn == sin->fsn_uo)
756 retval = sctp_intl_retrieve_partial_uo(ulpq, event); 756 retval = sctp_intl_retrieve_partial_uo(ulpq, event);
@@ -774,7 +774,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq)
774 skb_queue_walk(&ulpq->reasm_uo, pos) { 774 skb_queue_walk(&ulpq->reasm_uo, pos) {
775 struct sctp_ulpevent *cevent = sctp_skb2event(pos); 775 struct sctp_ulpevent *cevent = sctp_skb2event(pos);
776 776
777 csin = sctp_stream_in(ulpq->asoc, cevent->stream); 777 csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream);
778 if (csin->pd_mode_uo) 778 if (csin->pd_mode_uo)
779 continue; 779 continue;
780 780
@@ -875,7 +875,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq)
875 skb_queue_walk(&ulpq->reasm, pos) { 875 skb_queue_walk(&ulpq->reasm, pos) {
876 struct sctp_ulpevent *cevent = sctp_skb2event(pos); 876 struct sctp_ulpevent *cevent = sctp_skb2event(pos);
877 877
878 csin = sctp_stream_in(ulpq->asoc, cevent->stream); 878 csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream);
879 if (csin->pd_mode) 879 if (csin->pd_mode)
880 continue; 880 continue;
881 881
@@ -1053,7 +1053,7 @@ static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
1053 __u16 sid; 1053 __u16 sid;
1054 1054
1055 for (sid = 0; sid < stream->incnt; sid++) { 1055 for (sid = 0; sid < stream->incnt; sid++) {
1056 struct sctp_stream_in *sin = &stream->in[sid]; 1056 struct sctp_stream_in *sin = SCTP_SI(stream, sid);
1057 __u32 mid; 1057 __u32 mid;
1058 1058
1059 if (sin->pd_mode_uo) { 1059 if (sin->pd_mode_uo) {
@@ -1247,7 +1247,7 @@ static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk)
1247static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, 1247static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid,
1248 __u8 flags) 1248 __u8 flags)
1249{ 1249{
1250 struct sctp_stream_in *sin = sctp_stream_in(ulpq->asoc, sid); 1250 struct sctp_stream_in *sin = sctp_stream_in(&ulpq->asoc->stream, sid);
1251 struct sctp_stream *stream = &ulpq->asoc->stream; 1251 struct sctp_stream *stream = &ulpq->asoc->stream;
1252 1252
1253 if (flags & SCTP_FTSN_U_BIT) { 1253 if (flags & SCTP_FTSN_U_BIT) {
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index f5fcd425232a..a6c04a94b08f 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -161,7 +161,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc,
161 161
162 /* Give the next scheduler a clean slate. */ 162 /* Give the next scheduler a clean slate. */
163 for (i = 0; i < asoc->stream.outcnt; i++) { 163 for (i = 0; i < asoc->stream.outcnt; i++) {
164 void *p = asoc->stream.out[i].ext; 164 void *p = SCTP_SO(&asoc->stream, i)->ext;
165 165
166 if (!p) 166 if (!p)
167 continue; 167 continue;
@@ -175,7 +175,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc,
175 asoc->outqueue.sched = n; 175 asoc->outqueue.sched = n;
176 n->init(&asoc->stream); 176 n->init(&asoc->stream);
177 for (i = 0; i < asoc->stream.outcnt; i++) { 177 for (i = 0; i < asoc->stream.outcnt; i++) {
178 if (!asoc->stream.out[i].ext) 178 if (!SCTP_SO(&asoc->stream, i)->ext)
179 continue; 179 continue;
180 180
181 ret = n->init_sid(&asoc->stream, i, GFP_KERNEL); 181 ret = n->init_sid(&asoc->stream, i, GFP_KERNEL);
@@ -217,7 +217,7 @@ int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid,
217 if (sid >= asoc->stream.outcnt) 217 if (sid >= asoc->stream.outcnt)
218 return -EINVAL; 218 return -EINVAL;
219 219
220 if (!asoc->stream.out[sid].ext) { 220 if (!SCTP_SO(&asoc->stream, sid)->ext) {
221 int ret; 221 int ret;
222 222
223 ret = sctp_stream_init_ext(&asoc->stream, sid); 223 ret = sctp_stream_init_ext(&asoc->stream, sid);
@@ -234,7 +234,7 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
234 if (sid >= asoc->stream.outcnt) 234 if (sid >= asoc->stream.outcnt)
235 return -EINVAL; 235 return -EINVAL;
236 236
237 if (!asoc->stream.out[sid].ext) 237 if (!SCTP_SO(&asoc->stream, sid)->ext)
238 return 0; 238 return 0;
239 239
240 return asoc->outqueue.sched->get(&asoc->stream, sid, value); 240 return asoc->outqueue.sched->get(&asoc->stream, sid, value);
@@ -252,7 +252,7 @@ void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch)
252 * priority stream comes in. 252 * priority stream comes in.
253 */ 253 */
254 sid = sctp_chunk_stream_no(ch); 254 sid = sctp_chunk_stream_no(ch);
255 sout = &q->asoc->stream.out[sid]; 255 sout = SCTP_SO(&q->asoc->stream, sid);
256 q->asoc->stream.out_curr = sout; 256 q->asoc->stream.out_curr = sout;
257 return; 257 return;
258 } 258 }
@@ -272,8 +272,9 @@ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch)
272int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) 272int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp)
273{ 273{
274 struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); 274 struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
275 struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext;
275 276
276 INIT_LIST_HEAD(&stream->out[sid].ext->outq); 277 INIT_LIST_HEAD(&ext->outq);
277 return sched->init_sid(stream, sid, gfp); 278 return sched->init_sid(stream, sid, gfp);
278} 279}
279 280
diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c
index 7997d35dd0fd..2245083a98f2 100644
--- a/net/sctp/stream_sched_prio.c
+++ b/net/sctp/stream_sched_prio.c
@@ -75,10 +75,10 @@ static struct sctp_stream_priorities *sctp_sched_prio_get_head(
75 75
76 /* No luck. So we search on all streams now. */ 76 /* No luck. So we search on all streams now. */
77 for (i = 0; i < stream->outcnt; i++) { 77 for (i = 0; i < stream->outcnt; i++) {
78 if (!stream->out[i].ext) 78 if (!SCTP_SO(stream, i)->ext)
79 continue; 79 continue;
80 80
81 p = stream->out[i].ext->prio_head; 81 p = SCTP_SO(stream, i)->ext->prio_head;
82 if (!p) 82 if (!p)
83 /* Means all other streams won't be initialized 83 /* Means all other streams won't be initialized
84 * as well. 84 * as well.
@@ -165,7 +165,7 @@ static void sctp_sched_prio_sched(struct sctp_stream *stream,
165static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, 165static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid,
166 __u16 prio, gfp_t gfp) 166 __u16 prio, gfp_t gfp)
167{ 167{
168 struct sctp_stream_out *sout = &stream->out[sid]; 168 struct sctp_stream_out *sout = SCTP_SO(stream, sid);
169 struct sctp_stream_out_ext *soute = sout->ext; 169 struct sctp_stream_out_ext *soute = sout->ext;
170 struct sctp_stream_priorities *prio_head, *old; 170 struct sctp_stream_priorities *prio_head, *old;
171 bool reschedule = false; 171 bool reschedule = false;
@@ -186,7 +186,7 @@ static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid,
186 return 0; 186 return 0;
187 187
188 for (i = 0; i < stream->outcnt; i++) { 188 for (i = 0; i < stream->outcnt; i++) {
189 soute = stream->out[i].ext; 189 soute = SCTP_SO(stream, i)->ext;
190 if (soute && soute->prio_head == old) 190 if (soute && soute->prio_head == old)
191 /* It's still in use, nothing else to do here. */ 191 /* It's still in use, nothing else to do here. */
192 return 0; 192 return 0;
@@ -201,7 +201,7 @@ static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid,
201static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid, 201static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid,
202 __u16 *value) 202 __u16 *value)
203{ 203{
204 *value = stream->out[sid].ext->prio_head->prio; 204 *value = SCTP_SO(stream, sid)->ext->prio_head->prio;
205 return 0; 205 return 0;
206} 206}
207 207
@@ -215,7 +215,7 @@ static int sctp_sched_prio_init(struct sctp_stream *stream)
215static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, 215static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid,
216 gfp_t gfp) 216 gfp_t gfp)
217{ 217{
218 INIT_LIST_HEAD(&stream->out[sid].ext->prio_list); 218 INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->prio_list);
219 return sctp_sched_prio_set(stream, sid, 0, gfp); 219 return sctp_sched_prio_set(stream, sid, 0, gfp);
220} 220}
221 221
@@ -233,9 +233,9 @@ static void sctp_sched_prio_free(struct sctp_stream *stream)
233 */ 233 */
234 sctp_sched_prio_unsched_all(stream); 234 sctp_sched_prio_unsched_all(stream);
235 for (i = 0; i < stream->outcnt; i++) { 235 for (i = 0; i < stream->outcnt; i++) {
236 if (!stream->out[i].ext) 236 if (!SCTP_SO(stream, i)->ext)
237 continue; 237 continue;
238 prio = stream->out[i].ext->prio_head; 238 prio = SCTP_SO(stream, i)->ext->prio_head;
239 if (prio && list_empty(&prio->prio_sched)) 239 if (prio && list_empty(&prio->prio_sched))
240 list_add(&prio->prio_sched, &list); 240 list_add(&prio->prio_sched, &list);
241 } 241 }
@@ -255,7 +255,7 @@ static void sctp_sched_prio_enqueue(struct sctp_outq *q,
255 ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); 255 ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
256 sid = sctp_chunk_stream_no(ch); 256 sid = sctp_chunk_stream_no(ch);
257 stream = &q->asoc->stream; 257 stream = &q->asoc->stream;
258 sctp_sched_prio_sched(stream, stream->out[sid].ext); 258 sctp_sched_prio_sched(stream, SCTP_SO(stream, sid)->ext);
259} 259}
260 260
261static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q) 261static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q)
@@ -297,7 +297,7 @@ static void sctp_sched_prio_dequeue_done(struct sctp_outq *q,
297 * this priority. 297 * this priority.
298 */ 298 */
299 sid = sctp_chunk_stream_no(ch); 299 sid = sctp_chunk_stream_no(ch);
300 soute = q->asoc->stream.out[sid].ext; 300 soute = SCTP_SO(&q->asoc->stream, sid)->ext;
301 prio = soute->prio_head; 301 prio = soute->prio_head;
302 302
303 sctp_sched_prio_next_stream(prio); 303 sctp_sched_prio_next_stream(prio);
@@ -317,7 +317,7 @@ static void sctp_sched_prio_sched_all(struct sctp_stream *stream)
317 __u16 sid; 317 __u16 sid;
318 318
319 sid = sctp_chunk_stream_no(ch); 319 sid = sctp_chunk_stream_no(ch);
320 sout = &stream->out[sid]; 320 sout = SCTP_SO(stream, sid);
321 if (sout->ext) 321 if (sout->ext)
322 sctp_sched_prio_sched(stream, sout->ext); 322 sctp_sched_prio_sched(stream, sout->ext);
323 } 323 }
diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c
index 1155692448f1..52ba743fa7a7 100644
--- a/net/sctp/stream_sched_rr.c
+++ b/net/sctp/stream_sched_rr.c
@@ -100,7 +100,7 @@ static int sctp_sched_rr_init(struct sctp_stream *stream)
100static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid, 100static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid,
101 gfp_t gfp) 101 gfp_t gfp)
102{ 102{
103 INIT_LIST_HEAD(&stream->out[sid].ext->rr_list); 103 INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->rr_list);
104 104
105 return 0; 105 return 0;
106} 106}
@@ -120,7 +120,7 @@ static void sctp_sched_rr_enqueue(struct sctp_outq *q,
120 ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); 120 ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
121 sid = sctp_chunk_stream_no(ch); 121 sid = sctp_chunk_stream_no(ch);
122 stream = &q->asoc->stream; 122 stream = &q->asoc->stream;
123 sctp_sched_rr_sched(stream, stream->out[sid].ext); 123 sctp_sched_rr_sched(stream, SCTP_SO(stream, sid)->ext);
124} 124}
125 125
126static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q) 126static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q)
@@ -154,7 +154,7 @@ static void sctp_sched_rr_dequeue_done(struct sctp_outq *q,
154 154
155 /* Last chunk on that msg, move to the next stream */ 155 /* Last chunk on that msg, move to the next stream */
156 sid = sctp_chunk_stream_no(ch); 156 sid = sctp_chunk_stream_no(ch);
157 soute = q->asoc->stream.out[sid].ext; 157 soute = SCTP_SO(&q->asoc->stream, sid)->ext;
158 158
159 sctp_sched_rr_next_stream(&q->asoc->stream); 159 sctp_sched_rr_next_stream(&q->asoc->stream);
160 160
@@ -173,7 +173,7 @@ static void sctp_sched_rr_sched_all(struct sctp_stream *stream)
173 __u16 sid; 173 __u16 sid;
174 174
175 sid = sctp_chunk_stream_no(ch); 175 sid = sctp_chunk_stream_no(ch);
176 soute = stream->out[sid].ext; 176 soute = SCTP_SO(stream, sid)->ext;
177 if (soute) 177 if (soute)
178 sctp_sched_rr_sched(stream, soute); 178 sctp_sched_rr_sched(stream, soute);
179 } 179 }
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 445b7ef61677..12cac85da994 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -282,7 +282,7 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
282 282
283 if (dst) { 283 if (dst) {
284 /* Re-fetch, as under layers may have a higher minimum size */ 284 /* Re-fetch, as under layers may have a higher minimum size */
285 pmtu = SCTP_TRUNC4(dst_mtu(dst)); 285 pmtu = sctp_dst_mtu(dst);
286 change = t->pathmtu != pmtu; 286 change = t->pathmtu != pmtu;
287 } 287 }
288 t->pathmtu = pmtu; 288 t->pathmtu = pmtu;
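
Note: sctp_dst_mtu() is not shown in this hunk; it is expected to clamp the route MTU to SCTP's minimum segment size before truncating to a 4-byte multiple, roughly as sketched below (see include/net/sctp/sctp.h for the real definition):

	/* Approximate shape of the helper referenced above. */
	static inline __u32 sctp_dst_mtu(const struct dst_entry *dst)
	{
		return SCTP_TRUNC4(max_t(__u32, dst_mtu(dst),
					 SCTP_DEFAULT_MINSEGMENT));
	}
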
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 188104654b54..4df96b4b8130 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_SMC) += smc.o 1obj-$(CONFIG_SMC) += smc.o
2obj-$(CONFIG_SMC_DIAG) += smc_diag.o 2obj-$(CONFIG_SMC_DIAG) += smc_diag.o
3smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o 3smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
4smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o 4smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index da7f02edcd37..2d8a1e15e4f9 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -23,6 +23,7 @@
23#include <linux/workqueue.h> 23#include <linux/workqueue.h>
24#include <linux/in.h> 24#include <linux/in.h>
25#include <linux/sched/signal.h> 25#include <linux/sched/signal.h>
26#include <linux/if_vlan.h>
26 27
27#include <net/sock.h> 28#include <net/sock.h>
28#include <net/tcp.h> 29#include <net/tcp.h>
@@ -35,6 +36,7 @@
35#include "smc_cdc.h" 36#include "smc_cdc.h"
36#include "smc_core.h" 37#include "smc_core.h"
37#include "smc_ib.h" 38#include "smc_ib.h"
39#include "smc_ism.h"
38#include "smc_pnet.h" 40#include "smc_pnet.h"
39#include "smc_tx.h" 41#include "smc_tx.h"
40#include "smc_rx.h" 42#include "smc_rx.h"
@@ -45,6 +47,7 @@ static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
45 */ 47 */
46 48
47static void smc_tcp_listen_work(struct work_struct *); 49static void smc_tcp_listen_work(struct work_struct *);
50static void smc_connect_work(struct work_struct *);
48 51
49static void smc_set_keepalive(struct sock *sk, int val) 52static void smc_set_keepalive(struct sock *sk, int val)
50{ 53{
@@ -122,6 +125,12 @@ static int smc_release(struct socket *sock)
122 goto out; 125 goto out;
123 126
124 smc = smc_sk(sk); 127 smc = smc_sk(sk);
128
129 /* cleanup for a dangling non-blocking connect */
130 flush_work(&smc->connect_work);
131 kfree(smc->connect_info);
132 smc->connect_info = NULL;
133
125 if (sk->sk_state == SMC_LISTEN) 134 if (sk->sk_state == SMC_LISTEN)
126 /* smc_close_non_accepted() is called and acquires 135 /* smc_close_non_accepted() is called and acquires
127 * sock lock for child sockets again 136 * sock lock for child sockets again
@@ -140,7 +149,8 @@ static int smc_release(struct socket *sock)
140 smc->clcsock = NULL; 149 smc->clcsock = NULL;
141 } 150 }
142 if (smc->use_fallback) { 151 if (smc->use_fallback) {
143 sock_put(sk); /* passive closing */ 152 if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
153 sock_put(sk); /* passive closing */
144 sk->sk_state = SMC_CLOSED; 154 sk->sk_state = SMC_CLOSED;
145 sk->sk_state_change(sk); 155 sk->sk_state_change(sk);
146 } 156 }
@@ -186,6 +196,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
186 sk->sk_protocol = protocol; 196 sk->sk_protocol = protocol;
187 smc = smc_sk(sk); 197 smc = smc_sk(sk);
188 INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); 198 INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
199 INIT_WORK(&smc->connect_work, smc_connect_work);
189 INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); 200 INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
190 INIT_LIST_HEAD(&smc->accept_q); 201 INIT_LIST_HEAD(&smc->accept_q);
191 spin_lock_init(&smc->accept_q_lock); 202 spin_lock_init(&smc->accept_q_lock);
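
Note: the connect_work added here supports non-blocking connect() on AF_SMC sockets: the CLC handshake can be deferred to the work item while the caller polls for completion, and smc_release() now flushes any dangling attempt. From user space that follows the familiar pattern; a sketch (error handling trimmed, AF_SMC constants come from the linux/smc.h uapi header and are not used directly below):

	#include <errno.h>
	#include <fcntl.h>
	#include <poll.h>
	#include <sys/socket.h>

	int connect_nonblock(int fd, const struct sockaddr *sa, socklen_t len)
	{
		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
		socklen_t elen = sizeof(int);
		int err = 0;

		fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
		if (connect(fd, sa, len) == 0)
			return 0;
		if (errno != EINPROGRESS)
			return -1;
		poll(&pfd, 1, -1);                          /* wait for the handshake */
		getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &elen);
		return err ? -1 : 0;
	}
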
@@ -333,20 +344,17 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
333 344
334 rc = smc_ib_modify_qp_rts(link); 345 rc = smc_ib_modify_qp_rts(link);
335 if (rc) 346 if (rc)
336 return SMC_CLC_DECL_INTERR; 347 return SMC_CLC_DECL_ERR_RDYLNK;
337 348
338 smc_wr_remember_qp_attr(link); 349 smc_wr_remember_qp_attr(link);
339 350
340 if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) 351 if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
341 return SMC_CLC_DECL_INTERR; 352 return SMC_CLC_DECL_ERR_REGRMB;
342 353
343 /* send CONFIRM LINK response over RoCE fabric */ 354 /* send CONFIRM LINK response over RoCE fabric */
344 rc = smc_llc_send_confirm_link(link, 355 rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
345 link->smcibdev->mac[link->ibport - 1],
346 &link->smcibdev->gid[link->ibport - 1],
347 SMC_LLC_RESP);
348 if (rc < 0) 356 if (rc < 0)
349 return SMC_CLC_DECL_TCL; 357 return SMC_CLC_DECL_TIMEOUT_CL;
350 358
351 /* receive ADD LINK request from server over RoCE fabric */ 359 /* receive ADD LINK request from server over RoCE fabric */
352 rest = wait_for_completion_interruptible_timeout(&link->llc_add, 360 rest = wait_for_completion_interruptible_timeout(&link->llc_add,
@@ -362,18 +370,17 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
362 /* send add link reject message, only one link supported for now */ 370 /* send add link reject message, only one link supported for now */
363 rc = smc_llc_send_add_link(link, 371 rc = smc_llc_send_add_link(link,
364 link->smcibdev->mac[link->ibport - 1], 372 link->smcibdev->mac[link->ibport - 1],
365 &link->smcibdev->gid[link->ibport - 1], 373 link->gid, SMC_LLC_RESP);
366 SMC_LLC_RESP);
367 if (rc < 0) 374 if (rc < 0)
368 return SMC_CLC_DECL_TCL; 375 return SMC_CLC_DECL_TIMEOUT_AL;
369 376
370 smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); 377 smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
371 378
372 return 0; 379 return 0;
373} 380}
374 381
375static void smc_conn_save_peer_info(struct smc_sock *smc, 382static void smcr_conn_save_peer_info(struct smc_sock *smc,
376 struct smc_clc_msg_accept_confirm *clc) 383 struct smc_clc_msg_accept_confirm *clc)
377{ 384{
378 int bufsize = smc_uncompress_bufsize(clc->rmbe_size); 385 int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
379 386
@@ -384,6 +391,28 @@ static void smc_conn_save_peer_info(struct smc_sock *smc,
384 smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); 391 smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
385} 392}
386 393
394static void smcd_conn_save_peer_info(struct smc_sock *smc,
395 struct smc_clc_msg_accept_confirm *clc)
396{
397 int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
398
399 smc->conn.peer_rmbe_idx = clc->dmbe_idx;
400 smc->conn.peer_token = clc->token;
401 /* msg header takes up space in the buffer */
402 smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
403 atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
404 smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
405}
406
407static void smc_conn_save_peer_info(struct smc_sock *smc,
408 struct smc_clc_msg_accept_confirm *clc)
409{
410 if (smc->conn.lgr->is_smcd)
411 smcd_conn_save_peer_info(smc, clc);
412 else
413 smcr_conn_save_peer_info(smc, clc);
414}
415
387static void smc_link_save_peer_info(struct smc_link *link, 416static void smc_link_save_peer_info(struct smc_link *link,
388 struct smc_clc_msg_accept_confirm *clc) 417 struct smc_clc_msg_accept_confirm *clc)
389{ 418{
@@ -395,9 +424,10 @@ static void smc_link_save_peer_info(struct smc_link *link,
395} 424}
396 425
397/* fall back during connect */ 426/* fall back during connect */
398static int smc_connect_fallback(struct smc_sock *smc) 427static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
399{ 428{
400 smc->use_fallback = true; 429 smc->use_fallback = true;
430 smc->fallback_rsn = reason_code;
401 smc_copy_sock_settings_to_clc(smc); 431 smc_copy_sock_settings_to_clc(smc);
402 if (smc->sk.sk_state == SMC_INIT) 432 if (smc->sk.sk_state == SMC_INIT)
403 smc->sk.sk_state = SMC_ACTIVE; 433 smc->sk.sk_state = SMC_ACTIVE;
@@ -409,14 +439,20 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
409{ 439{
410 int rc; 440 int rc;
411 441
412 if (reason_code < 0) /* error, fallback is not possible */ 442 if (reason_code < 0) { /* error, fallback is not possible */
443 if (smc->sk.sk_state == SMC_INIT)
444 sock_put(&smc->sk); /* passive closing */
413 return reason_code; 445 return reason_code;
414 if (reason_code != SMC_CLC_DECL_REPLY) { 446 }
447 if (reason_code != SMC_CLC_DECL_PEERDECL) {
415 rc = smc_clc_send_decline(smc, reason_code); 448 rc = smc_clc_send_decline(smc, reason_code);
416 if (rc < 0) 449 if (rc < 0) {
450 if (smc->sk.sk_state == SMC_INIT)
451 sock_put(&smc->sk); /* passive closing */
417 return rc; 452 return rc;
453 }
418 } 454 }
419 return smc_connect_fallback(smc); 455 return smc_connect_fallback(smc, reason_code);
420} 456}
421 457
422/* abort connecting */ 458/* abort connecting */
@@ -427,15 +463,13 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code,
427 smc_lgr_forget(smc->conn.lgr); 463 smc_lgr_forget(smc->conn.lgr);
428 mutex_unlock(&smc_create_lgr_pending); 464 mutex_unlock(&smc_create_lgr_pending);
429 smc_conn_free(&smc->conn); 465 smc_conn_free(&smc->conn);
430 if (reason_code < 0 && smc->sk.sk_state == SMC_INIT)
431 sock_put(&smc->sk); /* passive closing */
432 return reason_code; 466 return reason_code;
433} 467}
434 468
435/* check if there is a rdma device available for this connection. */ 469/* check if there is a rdma device available for this connection. */
436/* called for connect and listen */ 470/* called for connect and listen */
437static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, 471static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
438 u8 *ibport) 472 u8 *ibport, unsigned short vlan_id, u8 gid[])
439{ 473{
440 int reason_code = 0; 474 int reason_code = 0;
441 475
@@ -443,22 +477,59 @@ static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
443 * within same PNETID that also contains the ethernet device 477 * within same PNETID that also contains the ethernet device
444 * used for the internal TCP socket 478 * used for the internal TCP socket
445 */ 479 */
446 smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport); 480 smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
481 gid);
447 if (!(*ibdev)) 482 if (!(*ibdev))
448 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ 483 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
449 484
450 return reason_code; 485 return reason_code;
451} 486}
452 487
488/* check if there is an ISM device available for this connection. */
489/* called for connect and listen */
490static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
491{
492 /* Find ISM device with same PNETID as connecting interface */
493 smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
494 if (!(*ismdev))
495 return SMC_CLC_DECL_CNFERR; /* configuration error */
496 return 0;
497}
498
499/* Check for VLAN ID and register it on ISM device just for CLC handshake */
500static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
501 struct smcd_dev *ismdev,
502 unsigned short vlan_id)
503{
504 if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
505 return SMC_CLC_DECL_CNFERR;
506 return 0;
507}
508
509/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
510 * used, the VLAN ID will be registered again during the connection setup.
511 */
512static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
513 struct smcd_dev *ismdev,
514 unsigned short vlan_id)
515{
516 if (!is_smcd)
517 return 0;
518 if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
519 return SMC_CLC_DECL_CNFERR;
520 return 0;
521}
522
453/* CLC handshake during connect */ 523/* CLC handshake during connect */
454static int smc_connect_clc(struct smc_sock *smc, 524static int smc_connect_clc(struct smc_sock *smc, int smc_type,
455 struct smc_clc_msg_accept_confirm *aclc, 525 struct smc_clc_msg_accept_confirm *aclc,
456 struct smc_ib_device *ibdev, u8 ibport) 526 struct smc_ib_device *ibdev, u8 ibport,
527 u8 gid[], struct smcd_dev *ismdev)
457{ 528{
458 int rc = 0; 529 int rc = 0;
459 530
460 /* do inband token exchange */ 531 /* do inband token exchange */
461 rc = smc_clc_send_proposal(smc, ibdev, ibport); 532 rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
462 if (rc) 533 if (rc)
463 return rc; 534 return rc;
464 /* receive SMC Accept CLC message */ 535 /* receive SMC Accept CLC message */
@@ -475,8 +546,8 @@ static int smc_connect_rdma(struct smc_sock *smc,
475 int reason_code = 0; 546 int reason_code = 0;
476 547
477 mutex_lock(&smc_create_lgr_pending); 548 mutex_lock(&smc_create_lgr_pending);
478 local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl, 549 local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
479 aclc->hdr.flag); 550 ibport, &aclc->lcl, NULL, 0);
480 if (local_contact < 0) { 551 if (local_contact < 0) {
481 if (local_contact == -ENOMEM) 552 if (local_contact == -ENOMEM)
482 reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ 553 reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
@@ -491,14 +562,14 @@ static int smc_connect_rdma(struct smc_sock *smc,
491 smc_conn_save_peer_info(smc, aclc); 562 smc_conn_save_peer_info(smc, aclc);
492 563
493 /* create send buffer and rmb */ 564 /* create send buffer and rmb */
494 if (smc_buf_create(smc)) 565 if (smc_buf_create(smc, false))
495 return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); 566 return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
496 567
497 if (local_contact == SMC_FIRST_CONTACT) 568 if (local_contact == SMC_FIRST_CONTACT)
498 smc_link_save_peer_info(link, aclc); 569 smc_link_save_peer_info(link, aclc);
499 570
500 if (smc_rmb_rtoken_handling(&smc->conn, aclc)) 571 if (smc_rmb_rtoken_handling(&smc->conn, aclc))
501 return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, 572 return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
502 local_contact); 573 local_contact);
503 574
504 smc_close_init(smc); 575 smc_close_init(smc);
@@ -506,12 +577,12 @@ static int smc_connect_rdma(struct smc_sock *smc,
506 577
507 if (local_contact == SMC_FIRST_CONTACT) { 578 if (local_contact == SMC_FIRST_CONTACT) {
508 if (smc_ib_ready_link(link)) 579 if (smc_ib_ready_link(link))
509 return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, 580 return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
510 local_contact); 581 local_contact);
511 } else { 582 } else {
512 if (!smc->conn.rmb_desc->reused && 583 if (!smc->conn.rmb_desc->reused &&
513 smc_reg_rmb(link, smc->conn.rmb_desc, true)) 584 smc_reg_rmb(link, smc->conn.rmb_desc, true))
514 return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, 585 return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
515 local_contact); 586 local_contact);
516 } 587 }
517 smc_rmb_sync_sg_for_device(&smc->conn); 588 smc_rmb_sync_sg_for_device(&smc->conn);
@@ -538,44 +609,145 @@ static int smc_connect_rdma(struct smc_sock *smc,
538 return 0; 609 return 0;
539} 610}
540 611
612/* setup for ISM connection of client */
613static int smc_connect_ism(struct smc_sock *smc,
614 struct smc_clc_msg_accept_confirm *aclc,
615 struct smcd_dev *ismdev)
616{
617 int local_contact = SMC_FIRST_CONTACT;
618 int rc = 0;
619
620 mutex_lock(&smc_create_lgr_pending);
621 local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0,
622 NULL, ismdev, aclc->gid);
623 if (local_contact < 0)
624 return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0);
625
626 /* Create send and receive buffers */
627 if (smc_buf_create(smc, true))
628 return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
629
630 smc_conn_save_peer_info(smc, aclc);
631 smc_close_init(smc);
632 smc_rx_init(smc);
633 smc_tx_init(smc);
634
635 rc = smc_clc_send_confirm(smc);
636 if (rc)
637 return smc_connect_abort(smc, rc, local_contact);
638 mutex_unlock(&smc_create_lgr_pending);
639
640 smc_copy_sock_settings_to_clc(smc);
641 if (smc->sk.sk_state == SMC_INIT)
642 smc->sk.sk_state = SMC_ACTIVE;
643
644 return 0;
645}
646
541/* perform steps before actually connecting */ 647/* perform steps before actually connecting */
542static int __smc_connect(struct smc_sock *smc) 648static int __smc_connect(struct smc_sock *smc)
543{ 649{
650 bool ism_supported = false, rdma_supported = false;
544 struct smc_clc_msg_accept_confirm aclc; 651 struct smc_clc_msg_accept_confirm aclc;
545 struct smc_ib_device *ibdev; 652 struct smc_ib_device *ibdev;
653 struct smcd_dev *ismdev;
654 u8 gid[SMC_GID_SIZE];
655 unsigned short vlan;
656 int smc_type;
546 int rc = 0; 657 int rc = 0;
547 u8 ibport; 658 u8 ibport;
548 659
549 sock_hold(&smc->sk); /* sock put in passive closing */ 660 sock_hold(&smc->sk); /* sock put in passive closing */
550 661
551 if (smc->use_fallback) 662 if (smc->use_fallback)
552 return smc_connect_fallback(smc); 663 return smc_connect_fallback(smc, smc->fallback_rsn);
553 664
554 /* if peer has not signalled SMC-capability, fall back */ 665 /* if peer has not signalled SMC-capability, fall back */
555 if (!tcp_sk(smc->clcsock->sk)->syn_smc) 666 if (!tcp_sk(smc->clcsock->sk)->syn_smc)
556 return smc_connect_fallback(smc); 667 return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
557 668
558 /* IPSec connections opt out of SMC-R optimizations */ 669 /* IPSec connections opt out of SMC-R optimizations */
559 if (using_ipsec(smc)) 670 if (using_ipsec(smc))
560 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); 671 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
561 672
562 /* check if a RDMA device is available; if not, fall back */ 673 /* check for VLAN ID */
563 if (smc_check_rdma(smc, &ibdev, &ibport)) 674 if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
564 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); 675 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
565 676
677 /* check if there is an ism device available */
678 if (!smc_check_ism(smc, &ismdev) &&
679 !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
680 /* ISM is supported for this connection */
681 ism_supported = true;
682 smc_type = SMC_TYPE_D;
683 }
684
685 /* check if there is a rdma device available */
686 if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
687 /* RDMA is supported for this connection */
688 rdma_supported = true;
689 if (ism_supported)
690 smc_type = SMC_TYPE_B; /* both */
691 else
692 smc_type = SMC_TYPE_R; /* only RDMA */
693 }
694
695 /* if neither ISM nor RDMA are supported, fallback */
696 if (!rdma_supported && !ism_supported)
697 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
698
566 /* perform CLC handshake */ 699 /* perform CLC handshake */
567 rc = smc_connect_clc(smc, &aclc, ibdev, ibport); 700 rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
568 if (rc) 701 if (rc) {
702 smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
569 return smc_connect_decline_fallback(smc, rc); 703 return smc_connect_decline_fallback(smc, rc);
704 }
570 705
571 /* connect using rdma */ 706 /* depending on previous steps, connect using rdma or ism */
572 rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); 707 if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
573 if (rc) 708 rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
709 else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
710 rc = smc_connect_ism(smc, &aclc, ismdev);
711 else
712 rc = SMC_CLC_DECL_MODEUNSUPP;
713 if (rc) {
714 smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
574 return smc_connect_decline_fallback(smc, rc); 715 return smc_connect_decline_fallback(smc, rc);
716 }
575 717
718 smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
576 return 0; 719 return 0;
577} 720}
578 721
722static void smc_connect_work(struct work_struct *work)
723{
724 struct smc_sock *smc = container_of(work, struct smc_sock,
725 connect_work);
726 int rc;
727
728 lock_sock(&smc->sk);
729 rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
730 smc->connect_info->alen, smc->connect_info->flags);
731 if (smc->clcsock->sk->sk_err) {
732 smc->sk.sk_err = smc->clcsock->sk->sk_err;
733 goto out;
734 }
735 if (rc < 0) {
736 smc->sk.sk_err = -rc;
737 goto out;
738 }
739
740 rc = __smc_connect(smc);
741 if (rc < 0)
742 smc->sk.sk_err = -rc;
743
744out:
745 smc->sk.sk_state_change(&smc->sk);
746 kfree(smc->connect_info);
747 smc->connect_info = NULL;
748 release_sock(&smc->sk);
749}
750
579static int smc_connect(struct socket *sock, struct sockaddr *addr, 751static int smc_connect(struct socket *sock, struct sockaddr *addr,
580 int alen, int flags) 752 int alen, int flags)
581{ 753{
@@ -605,15 +777,32 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
605 777
606 smc_copy_sock_settings_to_clc(smc); 778 smc_copy_sock_settings_to_clc(smc);
607 tcp_sk(smc->clcsock->sk)->syn_smc = 1; 779 tcp_sk(smc->clcsock->sk)->syn_smc = 1;
608 rc = kernel_connect(smc->clcsock, addr, alen, flags); 780 if (flags & O_NONBLOCK) {
609 if (rc) 781 if (smc->connect_info) {
610 goto out; 782 rc = -EALREADY;
783 goto out;
784 }
785 smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
786 if (!smc->connect_info) {
787 rc = -ENOMEM;
788 goto out;
789 }
790 smc->connect_info->alen = alen;
791 smc->connect_info->flags = flags ^ O_NONBLOCK;
792 memcpy(&smc->connect_info->addr, addr, alen);
793 schedule_work(&smc->connect_work);
794 rc = -EINPROGRESS;
795 } else {
796 rc = kernel_connect(smc->clcsock, addr, alen, flags);
797 if (rc)
798 goto out;
611 799
612 rc = __smc_connect(smc); 800 rc = __smc_connect(smc);
613 if (rc < 0) 801 if (rc < 0)
614 goto out; 802 goto out;
615 else 803 else
616 rc = 0; /* success cases including fallback */ 804 rc = 0; /* success cases including fallback */
805 }
617 806
618out: 807out:
619 release_sock(sk); 808 release_sock(sk);
@@ -758,15 +947,12 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
758 link = &lgr->lnk[SMC_SINGLE_LINK]; 947 link = &lgr->lnk[SMC_SINGLE_LINK];
759 948
760 if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) 949 if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
761 return SMC_CLC_DECL_INTERR; 950 return SMC_CLC_DECL_ERR_REGRMB;
762 951
763 /* send CONFIRM LINK request to client over the RoCE fabric */ 952 /* send CONFIRM LINK request to client over the RoCE fabric */
764 rc = smc_llc_send_confirm_link(link, 953 rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
765 link->smcibdev->mac[link->ibport - 1],
766 &link->smcibdev->gid[link->ibport - 1],
767 SMC_LLC_REQ);
768 if (rc < 0) 954 if (rc < 0)
769 return SMC_CLC_DECL_TCL; 955 return SMC_CLC_DECL_TIMEOUT_CL;
770 956
771 /* receive CONFIRM LINK response from client over the RoCE fabric */ 957 /* receive CONFIRM LINK response from client over the RoCE fabric */
772 rest = wait_for_completion_interruptible_timeout( 958 rest = wait_for_completion_interruptible_timeout(
@@ -786,10 +972,9 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
786 /* send ADD LINK request to client over the RoCE fabric */ 972 /* send ADD LINK request to client over the RoCE fabric */
787 rc = smc_llc_send_add_link(link, 973 rc = smc_llc_send_add_link(link,
788 link->smcibdev->mac[link->ibport - 1], 974 link->smcibdev->mac[link->ibport - 1],
789 &link->smcibdev->gid[link->ibport - 1], 975 link->gid, SMC_LLC_REQ);
790 SMC_LLC_REQ);
791 if (rc < 0) 976 if (rc < 0)
792 return SMC_CLC_DECL_TCL; 977 return SMC_CLC_DECL_TIMEOUT_AL;
793 978
794 /* receive ADD LINK response from client over the RoCE fabric */ 979 /* receive ADD LINK response from client over the RoCE fabric */
795 rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp, 980 rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
@@ -864,7 +1049,8 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
864 } 1049 }
865 smc_conn_free(&new_smc->conn); 1050 smc_conn_free(&new_smc->conn);
866 new_smc->use_fallback = true; 1051 new_smc->use_fallback = true;
867 if (reason_code && reason_code != SMC_CLC_DECL_REPLY) { 1052 new_smc->fallback_rsn = reason_code;
1053 if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
868 if (smc_clc_send_decline(new_smc, reason_code) < 0) { 1054 if (smc_clc_send_decline(new_smc, reason_code) < 0) {
869 smc_listen_out_err(new_smc); 1055 smc_listen_out_err(new_smc);
870 return; 1056 return;
@@ -894,7 +1080,8 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc,
894 int *local_contact) 1080 int *local_contact)
895{ 1081{
896 /* allocate connection / link group */ 1082 /* allocate connection / link group */
897 *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0); 1083 *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport,
1084 &pclc->lcl, NULL, 0);
898 if (*local_contact < 0) { 1085 if (*local_contact < 0) {
899 if (*local_contact == -ENOMEM) 1086 if (*local_contact == -ENOMEM)
900 return SMC_CLC_DECL_MEM;/* insufficient memory*/ 1087 return SMC_CLC_DECL_MEM;/* insufficient memory*/
@@ -902,12 +1089,50 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc,
902 } 1089 }
903 1090
904 /* create send buffer and rmb */ 1091 /* create send buffer and rmb */
905 if (smc_buf_create(new_smc)) 1092 if (smc_buf_create(new_smc, false))
906 return SMC_CLC_DECL_MEM; 1093 return SMC_CLC_DECL_MEM;
907 1094
908 return 0; 1095 return 0;
909} 1096}
910 1097
1098/* listen worker: initialize connection and buffers for SMC-D */
1099static int smc_listen_ism_init(struct smc_sock *new_smc,
1100 struct smc_clc_msg_proposal *pclc,
1101 struct smcd_dev *ismdev,
1102 int *local_contact)
1103{
1104 struct smc_clc_msg_smcd *pclc_smcd;
1105
1106 pclc_smcd = smc_get_clc_msg_smcd(pclc);
1107 *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL,
1108 ismdev, pclc_smcd->gid);
1109 if (*local_contact < 0) {
1110 if (*local_contact == -ENOMEM)
1111 return SMC_CLC_DECL_MEM;/* insufficient memory*/
1112 return SMC_CLC_DECL_INTERR; /* other error */
1113 }
1114
1115 /* Check if peer can be reached via ISM device */
1116 if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1117 new_smc->conn.lgr->vlan_id,
1118 new_smc->conn.lgr->smcd)) {
1119 if (*local_contact == SMC_FIRST_CONTACT)
1120 smc_lgr_forget(new_smc->conn.lgr);
1121 smc_conn_free(&new_smc->conn);
1122 return SMC_CLC_DECL_CNFERR;
1123 }
1124
1125 /* Create send and receive buffers */
1126 if (smc_buf_create(new_smc, true)) {
1127 if (*local_contact == SMC_FIRST_CONTACT)
1128 smc_lgr_forget(new_smc->conn.lgr);
1129 smc_conn_free(&new_smc->conn);
1130 return SMC_CLC_DECL_MEM;
1131 }
1132
1133 return 0;
1134}
1135
911/* listen worker: register buffers */ 1136/* listen worker: register buffers */
912static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) 1137static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
913{ 1138{
@@ -916,7 +1141,7 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
916 if (local_contact != SMC_FIRST_CONTACT) { 1141 if (local_contact != SMC_FIRST_CONTACT) {
917 if (!new_smc->conn.rmb_desc->reused) { 1142 if (!new_smc->conn.rmb_desc->reused) {
918 if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) 1143 if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
919 return SMC_CLC_DECL_INTERR; 1144 return SMC_CLC_DECL_ERR_REGRMB;
920 } 1145 }
921 } 1146 }
922 smc_rmb_sync_sg_for_device(&new_smc->conn); 1147 smc_rmb_sync_sg_for_device(&new_smc->conn);
@@ -936,13 +1161,13 @@ static void smc_listen_rdma_finish(struct smc_sock *new_smc,
936 smc_link_save_peer_info(link, cclc); 1161 smc_link_save_peer_info(link, cclc);
937 1162
938 if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { 1163 if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
939 reason_code = SMC_CLC_DECL_INTERR; 1164 reason_code = SMC_CLC_DECL_ERR_RTOK;
940 goto decline; 1165 goto decline;
941 } 1166 }
942 1167
943 if (local_contact == SMC_FIRST_CONTACT) { 1168 if (local_contact == SMC_FIRST_CONTACT) {
944 if (smc_ib_ready_link(link)) { 1169 if (smc_ib_ready_link(link)) {
945 reason_code = SMC_CLC_DECL_INTERR; 1170 reason_code = SMC_CLC_DECL_ERR_RDYLNK;
946 goto decline; 1171 goto decline;
947 } 1172 }
948 /* QP confirmation over RoCE fabric */ 1173 /* QP confirmation over RoCE fabric */
@@ -966,8 +1191,11 @@ static void smc_listen_work(struct work_struct *work)
966 struct smc_clc_msg_accept_confirm cclc; 1191 struct smc_clc_msg_accept_confirm cclc;
967 struct smc_clc_msg_proposal *pclc; 1192 struct smc_clc_msg_proposal *pclc;
968 struct smc_ib_device *ibdev; 1193 struct smc_ib_device *ibdev;
1194 bool ism_supported = false;
1195 struct smcd_dev *ismdev;
969 u8 buf[SMC_CLC_MAX_LEN]; 1196 u8 buf[SMC_CLC_MAX_LEN];
970 int local_contact = 0; 1197 int local_contact = 0;
1198 unsigned short vlan;
971 int reason_code = 0; 1199 int reason_code = 0;
972 int rc = 0; 1200 int rc = 0;
973 u8 ibport; 1201 u8 ibport;
@@ -980,6 +1208,7 @@ static void smc_listen_work(struct work_struct *work)
980 /* check if peer is smc capable */ 1208 /* check if peer is smc capable */
981 if (!tcp_sk(newclcsock->sk)->syn_smc) { 1209 if (!tcp_sk(newclcsock->sk)->syn_smc) {
982 new_smc->use_fallback = true; 1210 new_smc->use_fallback = true;
1211 new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
983 smc_listen_out_connected(new_smc); 1212 smc_listen_out_connected(new_smc);
984 return; 1213 return;
985 } 1214 }
@@ -1006,15 +1235,26 @@ static void smc_listen_work(struct work_struct *work)
1006 smc_rx_init(new_smc); 1235 smc_rx_init(new_smc);
1007 smc_tx_init(new_smc); 1236 smc_tx_init(new_smc);
1008 1237
1238 /* check if ISM is available */
1239 if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
1240 !smc_check_ism(new_smc, &ismdev) &&
1241 !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
1242 ism_supported = true;
1243 }
1244
1009 /* check if RDMA is available */ 1245 /* check if RDMA is available */
1010 if (smc_check_rdma(new_smc, &ibdev, &ibport) || 1246 if (!ism_supported &&
1011 smc_listen_rdma_check(new_smc, pclc) || 1247 ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
1012 smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, 1248 smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
1013 &local_contact) || 1249 smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
1014 smc_listen_rdma_reg(new_smc, local_contact)) { 1250 smc_listen_rdma_check(new_smc, pclc) ||
1251 smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
1252 &local_contact) ||
1253 smc_listen_rdma_reg(new_smc, local_contact))) {
1015 /* SMC not supported, decline */ 1254 /* SMC not supported, decline */
1016 mutex_unlock(&smc_create_lgr_pending); 1255 mutex_unlock(&smc_create_lgr_pending);
1017 smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact); 1256 smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
1257 local_contact);
1018 return; 1258 return;
1019 } 1259 }
1020 1260
@@ -1036,7 +1276,8 @@ static void smc_listen_work(struct work_struct *work)
1036 } 1276 }
1037 1277
1038 /* finish worker */ 1278 /* finish worker */
1039 smc_listen_rdma_finish(new_smc, &cclc, local_contact); 1279 if (!ism_supported)
1280 smc_listen_rdma_finish(new_smc, &cclc, local_contact);
1040 smc_conn_save_peer_info(new_smc, &cclc); 1281 smc_conn_save_peer_info(new_smc, &cclc);
1041 mutex_unlock(&smc_create_lgr_pending); 1282 mutex_unlock(&smc_create_lgr_pending);
1042 smc_listen_out_connected(new_smc); 1283 smc_listen_out_connected(new_smc);
@@ -1060,9 +1301,12 @@ static void smc_tcp_listen_work(struct work_struct *work)
1060 1301
1061 new_smc->listen_smc = lsmc; 1302 new_smc->listen_smc = lsmc;
1062 new_smc->use_fallback = lsmc->use_fallback; 1303 new_smc->use_fallback = lsmc->use_fallback;
1304 new_smc->fallback_rsn = lsmc->fallback_rsn;
1063 sock_hold(lsk); /* sock_put in smc_listen_work */ 1305 sock_hold(lsk); /* sock_put in smc_listen_work */
1064 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); 1306 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1065 smc_copy_sock_settings_to_smc(new_smc); 1307 smc_copy_sock_settings_to_smc(new_smc);
1308 new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1309 new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1066 sock_hold(&new_smc->sk); /* sock_put in passive closing */ 1310 sock_hold(&new_smc->sk); /* sock_put in passive closing */
1067 if (!schedule_work(&new_smc->smc_listen_work)) 1311 if (!schedule_work(&new_smc->smc_listen_work))
1068 sock_put(&new_smc->sk); 1312 sock_put(&new_smc->sk);
@@ -1214,6 +1458,7 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1214 if (msg->msg_flags & MSG_FASTOPEN) { 1458 if (msg->msg_flags & MSG_FASTOPEN) {
1215 if (sk->sk_state == SMC_INIT) { 1459 if (sk->sk_state == SMC_INIT) {
1216 smc->use_fallback = true; 1460 smc->use_fallback = true;
1461 smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1217 } else { 1462 } else {
1218 rc = -EINVAL; 1463 rc = -EINVAL;
1219 goto out; 1464 goto out;
@@ -1273,40 +1518,26 @@ static __poll_t smc_accept_poll(struct sock *parent)
1273 return mask; 1518 return mask;
1274} 1519}
1275 1520
1276static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) 1521static __poll_t smc_poll(struct file *file, struct socket *sock,
1522 poll_table *wait)
1277{ 1523{
1278 struct sock *sk = sock->sk; 1524 struct sock *sk = sock->sk;
1279 __poll_t mask = 0; 1525 __poll_t mask = 0;
1280 struct smc_sock *smc; 1526 struct smc_sock *smc;
1281 int rc;
1282 1527
1283 if (!sk) 1528 if (!sk)
1284 return EPOLLNVAL; 1529 return EPOLLNVAL;
1285 1530
1286 smc = smc_sk(sock->sk); 1531 smc = smc_sk(sock->sk);
1287 sock_hold(sk);
1288 lock_sock(sk);
1289 if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { 1532 if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
1290 /* delegate to CLC child sock */ 1533 /* delegate to CLC child sock */
1291 release_sock(sk); 1534 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1292 mask = smc->clcsock->ops->poll_mask(smc->clcsock, events);
1293 lock_sock(sk);
1294 sk->sk_err = smc->clcsock->sk->sk_err; 1535 sk->sk_err = smc->clcsock->sk->sk_err;
1295 if (sk->sk_err) { 1536 if (sk->sk_err)
1296 mask |= EPOLLERR; 1537 mask |= EPOLLERR;
1297 } else {
1298 /* if non-blocking connect finished ... */
1299 if (sk->sk_state == SMC_INIT &&
1300 mask & EPOLLOUT &&
1301 smc->clcsock->sk->sk_state != TCP_CLOSE) {
1302 rc = __smc_connect(smc);
1303 if (rc < 0)
1304 mask |= EPOLLERR;
1305 /* success cases including fallback */
1306 mask |= EPOLLOUT | EPOLLWRNORM;
1307 }
1308 }
1309 } else { 1538 } else {
1539 if (sk->sk_state != SMC_CLOSED)
1540 sock_poll_wait(file, wait);
1310 if (sk->sk_err) 1541 if (sk->sk_err)
1311 mask |= EPOLLERR; 1542 mask |= EPOLLERR;
1312 if ((sk->sk_shutdown == SHUTDOWN_MASK) || 1543 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
@@ -1332,10 +1563,7 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
1332 } 1563 }
1333 if (smc->conn.urg_state == SMC_URG_VALID) 1564 if (smc->conn.urg_state == SMC_URG_VALID)
1334 mask |= EPOLLPRI; 1565 mask |= EPOLLPRI;
1335
1336 } 1566 }
1337 release_sock(sk);
1338 sock_put(sk);
1339 1567
1340 return mask; 1568 return mask;
1341} 1569}
@@ -1355,8 +1583,7 @@ static int smc_shutdown(struct socket *sock, int how)
1355 lock_sock(sk); 1583 lock_sock(sk);
1356 1584
1357 rc = -ENOTCONN; 1585 rc = -ENOTCONN;
1358 if ((sk->sk_state != SMC_LISTEN) && 1586 if ((sk->sk_state != SMC_ACTIVE) &&
1359 (sk->sk_state != SMC_ACTIVE) &&
1360 (sk->sk_state != SMC_PEERCLOSEWAIT1) && 1587 (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1361 (sk->sk_state != SMC_PEERCLOSEWAIT2) && 1588 (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1362 (sk->sk_state != SMC_APPCLOSEWAIT1) && 1589 (sk->sk_state != SMC_APPCLOSEWAIT1) &&
@@ -1415,7 +1642,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
1415 1642
1416 if (optlen < sizeof(int)) 1643 if (optlen < sizeof(int))
1417 return -EINVAL; 1644 return -EINVAL;
1418 get_user(val, (int __user *)optval); 1645 if (get_user(val, (int __user *)optval))
1646 return -EFAULT;
1419 1647
1420 lock_sock(sk); 1648 lock_sock(sk);
1421 switch (optname) { 1649 switch (optname) {
@@ -1427,6 +1655,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
1427 /* option not supported by SMC */ 1655 /* option not supported by SMC */
1428 if (sk->sk_state == SMC_INIT) { 1656 if (sk->sk_state == SMC_INIT) {
1429 smc->use_fallback = true; 1657 smc->use_fallback = true;
1658 smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1430 } else { 1659 } else {
1431 if (!smc->use_fallback) 1660 if (!smc->use_fallback)
1432 rc = -EINVAL; 1661 rc = -EINVAL;
@@ -1478,15 +1707,22 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
1478 1707
1479 smc = smc_sk(sock->sk); 1708 smc = smc_sk(sock->sk);
1480 conn = &smc->conn; 1709 conn = &smc->conn;
1710 lock_sock(&smc->sk);
1481 if (smc->use_fallback) { 1711 if (smc->use_fallback) {
1482 if (!smc->clcsock) 1712 if (!smc->clcsock) {
1713 release_sock(&smc->sk);
1483 return -EBADF; 1714 return -EBADF;
1484 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); 1715 }
1716 answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1717 release_sock(&smc->sk);
1718 return answ;
1485 } 1719 }
1486 switch (cmd) { 1720 switch (cmd) {
1487 case SIOCINQ: /* same as FIONREAD */ 1721 case SIOCINQ: /* same as FIONREAD */
1488 if (smc->sk.sk_state == SMC_LISTEN) 1722 if (smc->sk.sk_state == SMC_LISTEN) {
1723 release_sock(&smc->sk);
1489 return -EINVAL; 1724 return -EINVAL;
1725 }
1490 if (smc->sk.sk_state == SMC_INIT || 1726 if (smc->sk.sk_state == SMC_INIT ||
1491 smc->sk.sk_state == SMC_CLOSED) 1727 smc->sk.sk_state == SMC_CLOSED)
1492 answ = 0; 1728 answ = 0;
@@ -1495,8 +1731,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
1495 break; 1731 break;
1496 case SIOCOUTQ: 1732 case SIOCOUTQ:
1497 /* output queue size (not send + not acked) */ 1733 /* output queue size (not send + not acked) */
1498 if (smc->sk.sk_state == SMC_LISTEN) 1734 if (smc->sk.sk_state == SMC_LISTEN) {
1735 release_sock(&smc->sk);
1499 return -EINVAL; 1736 return -EINVAL;
1737 }
1500 if (smc->sk.sk_state == SMC_INIT || 1738 if (smc->sk.sk_state == SMC_INIT ||
1501 smc->sk.sk_state == SMC_CLOSED) 1739 smc->sk.sk_state == SMC_CLOSED)
1502 answ = 0; 1740 answ = 0;
@@ -1506,8 +1744,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
1506 break; 1744 break;
1507 case SIOCOUTQNSD: 1745 case SIOCOUTQNSD:
1508 /* output queue size (not send only) */ 1746 /* output queue size (not send only) */
1509 if (smc->sk.sk_state == SMC_LISTEN) 1747 if (smc->sk.sk_state == SMC_LISTEN) {
1748 release_sock(&smc->sk);
1510 return -EINVAL; 1749 return -EINVAL;
1750 }
1511 if (smc->sk.sk_state == SMC_INIT || 1751 if (smc->sk.sk_state == SMC_INIT ||
1512 smc->sk.sk_state == SMC_CLOSED) 1752 smc->sk.sk_state == SMC_CLOSED)
1513 answ = 0; 1753 answ = 0;
@@ -1515,25 +1755,25 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
1515 answ = smc_tx_prepared_sends(&smc->conn); 1755 answ = smc_tx_prepared_sends(&smc->conn);
1516 break; 1756 break;
1517 case SIOCATMARK: 1757 case SIOCATMARK:
1518 if (smc->sk.sk_state == SMC_LISTEN) 1758 if (smc->sk.sk_state == SMC_LISTEN) {
1759 release_sock(&smc->sk);
1519 return -EINVAL; 1760 return -EINVAL;
1761 }
1520 if (smc->sk.sk_state == SMC_INIT || 1762 if (smc->sk.sk_state == SMC_INIT ||
1521 smc->sk.sk_state == SMC_CLOSED) { 1763 smc->sk.sk_state == SMC_CLOSED) {
1522 answ = 0; 1764 answ = 0;
1523 } else { 1765 } else {
1524 smc_curs_write(&cons, 1766 smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1525 smc_curs_read(&conn->local_tx_ctrl.cons, conn), 1767 smc_curs_copy(&urg, &conn->urg_curs, conn);
1526 conn);
1527 smc_curs_write(&urg,
1528 smc_curs_read(&conn->urg_curs, conn),
1529 conn);
1530 answ = smc_curs_diff(conn->rmb_desc->len, 1768 answ = smc_curs_diff(conn->rmb_desc->len,
1531 &cons, &urg) == 1; 1769 &cons, &urg) == 1;
1532 } 1770 }
1533 break; 1771 break;
1534 default: 1772 default:
1773 release_sock(&smc->sk);
1535 return -ENOIOCTLCMD; 1774 return -ENOIOCTLCMD;
1536 } 1775 }
1776 release_sock(&smc->sk);
1537 1777
1538 return put_user(answ, (int __user *)arg); 1778 return put_user(answ, (int __user *)arg);
1539} 1779}
@@ -1619,7 +1859,7 @@ static const struct proto_ops smc_sock_ops = {
1619 .socketpair = sock_no_socketpair, 1859 .socketpair = sock_no_socketpair,
1620 .accept = smc_accept, 1860 .accept = smc_accept,
1621 .getname = smc_getname, 1861 .getname = smc_getname,
1622 .poll_mask = smc_poll_mask, 1862 .poll = smc_poll,
1623 .ioctl = smc_ioctl, 1863 .ioctl = smc_ioctl,
1624 .listen = smc_listen, 1864 .listen = smc_listen,
1625 .shutdown = smc_shutdown, 1865 .shutdown = smc_shutdown,
@@ -1657,6 +1897,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
1657 /* create internal TCP socket for CLC handshake and fallback */ 1897 /* create internal TCP socket for CLC handshake and fallback */
1658 smc = smc_sk(sk); 1898 smc = smc_sk(sk);
1659 smc->use_fallback = false; /* assume rdma capability first */ 1899 smc->use_fallback = false; /* assume rdma capability first */
1900 smc->fallback_rsn = 0;
1660 rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, 1901 rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1661 &smc->clcsock); 1902 &smc->clcsock);
1662 if (rc) { 1903 if (rc) {
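The af_smc.c hunks above add a deferred connect path: with O_NONBLOCK the address is copied into smc->connect_info, smc_connect_work() is scheduled, and the caller gets -EINPROGRESS; the worker then runs kernel_connect() plus __smc_connect() and signals completion through sk_state_change(), which the reworked smc_poll() reports as EPOLLOUT. A minimal userspace sketch of how that behaves, not part of the patch (AF_SMC and SMCPROTO_SMC values as defined by the kernel headers, error handling abbreviated):

#define _GNU_SOURCE
#include <sys/socket.h>
#include <netinet/in.h>
#include <poll.h>
#include <errno.h>

#ifndef AF_SMC
#define AF_SMC		43	/* from linux/socket.h */
#endif
#define SMCPROTO_SMC	0	/* SMC protocol, IPv4 (net/smc/smc.h) */

/* returns the connected fd, or a negative errno */
static int smc_connect_nonblock(const struct sockaddr_in *peer)
{
	int fd = socket(AF_SMC, SOCK_STREAM | SOCK_NONBLOCK, SMCPROTO_SMC);
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
	socklen_t len = sizeof(int);
	int err = 0;

	if (fd < 0)
		return -errno;
	if (connect(fd, (const struct sockaddr *)peer, sizeof(*peer)) < 0 &&
	    errno != EINPROGRESS)
		return -errno;		/* immediate failure */
	/* smc_connect_work() ends with sk_state_change(), waking pollers */
	poll(&pfd, 1, -1);
	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
	return err ? -err : fd;
}

A blocking connect still takes the old synchronous kernel_connect() + __smc_connect() branch.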
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 51ae1f10d81a..08786ace6010 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -21,8 +21,6 @@
21#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ 21#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */
22#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ 22#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */
23 23
24#define SMC_MAX_PORTS 2 /* Max # of ports */
25
26extern struct proto smc_proto; 24extern struct proto smc_proto;
27extern struct proto smc_proto6; 25extern struct proto smc_proto6;
28 26
@@ -185,6 +183,17 @@ struct smc_connection {
185 spinlock_t acurs_lock; /* protect cursors */ 183 spinlock_t acurs_lock; /* protect cursors */
186#endif 184#endif
187 struct work_struct close_work; /* peer sent some closing */ 185 struct work_struct close_work; /* peer sent some closing */
186 struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */
187 u8 rx_off; /* receive offset:
188 * 0 for SMC-R, 32 for SMC-D
189 */
190 u64 peer_token; /* SMC-D token of peer */
191};
192
193struct smc_connect_info {
194 int flags;
195 int alen;
196 struct sockaddr addr;
188}; 197};
189 198
190struct smc_sock { /* smc sock container */ 199struct smc_sock { /* smc sock container */
@@ -192,11 +201,15 @@ struct smc_sock { /* smc sock container */
192 struct socket *clcsock; /* internal tcp socket */ 201 struct socket *clcsock; /* internal tcp socket */
193 struct smc_connection conn; /* smc connection */ 202 struct smc_connection conn; /* smc connection */
194 struct smc_sock *listen_smc; /* listen parent */ 203 struct smc_sock *listen_smc; /* listen parent */
204 struct smc_connect_info *connect_info; /* connect address & flags */
205 struct work_struct connect_work; /* handle non-blocking connect*/
195 struct work_struct tcp_listen_work;/* handle tcp socket accepts */ 206 struct work_struct tcp_listen_work;/* handle tcp socket accepts */
196 struct work_struct smc_listen_work;/* prepare new accept socket */ 207 struct work_struct smc_listen_work;/* prepare new accept socket */
197 struct list_head accept_q; /* sockets to be accepted */ 208 struct list_head accept_q; /* sockets to be accepted */
198 spinlock_t accept_q_lock; /* protects accept_q */ 209 spinlock_t accept_q_lock; /* protects accept_q */
199 bool use_fallback; /* fallback to tcp */ 210 bool use_fallback; /* fallback to tcp */
211 int fallback_rsn; /* reason for fallback */
212 u32 peer_diagnosis; /* decline reason from peer */
200 int sockopt_defer_accept; 213 int sockopt_defer_accept;
201 /* sockopt TCP_DEFER_ACCEPT 214 /* sockopt TCP_DEFER_ACCEPT
202 * value 215 * value
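struct smc_connect_info above is allocated in smc_connect() as kzalloc(alen + 2 * sizeof(int)), so the trailing addr member acts as a variable-length area sized by the caller's address length (e.g. a struct sockaddr_in6 for SMCPROTO_SMC6). A small userspace sketch of that sizing, with a hypothetical alloc_connect_info() helper that is not part of the patch; it assumes addr starts right after the two ints, which holds on the usual Linux ABIs:

#include <assert.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>

struct smc_connect_info {
	int flags;
	int alen;
	struct sockaddr addr;	/* really 'alen' bytes, may exceed sizeof() */
};

static struct smc_connect_info *alloc_connect_info(const struct sockaddr *sa,
						   socklen_t alen, int flags)
{
	/* same sizing as the smc_connect() hunk: two ints plus the address */
	struct smc_connect_info *ci = calloc(1, alen + 2 * sizeof(int));

	static_assert(offsetof(struct smc_connect_info, addr) == 2 * sizeof(int),
		      "addr must directly follow flags and alen");
	if (!ci)
		return NULL;
	ci->flags = flags;	/* the kernel stores flags ^ O_NONBLOCK here */
	ci->alen = alen;
	memcpy(&ci->addr, sa, alen);
	return ci;
}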
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index a7e8d63fc8ae..ed5dcf03fe0b 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -34,14 +34,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
34 enum ib_wc_status wc_status) 34 enum ib_wc_status wc_status)
35{ 35{
36 struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd; 36 struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
37 struct smc_connection *conn = cdcpend->conn;
37 struct smc_sock *smc; 38 struct smc_sock *smc;
38 int diff; 39 int diff;
39 40
40 if (!cdcpend->conn) 41 if (!conn)
41 /* already dismissed */ 42 /* already dismissed */
42 return; 43 return;
43 44
44 smc = container_of(cdcpend->conn, struct smc_sock, conn); 45 smc = container_of(conn, struct smc_sock, conn);
45 bh_lock_sock(&smc->sk); 46 bh_lock_sock(&smc->sk);
46 if (!wc_status) { 47 if (!wc_status) {
47 diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, 48 diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len,
@@ -52,9 +53,7 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
52 atomic_add(diff, &cdcpend->conn->sndbuf_space); 53 atomic_add(diff, &cdcpend->conn->sndbuf_space);
53 /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ 54 /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
54 smp_mb__after_atomic(); 55 smp_mb__after_atomic();
55 smc_curs_write(&cdcpend->conn->tx_curs_fin, 56 smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn);
56 smc_curs_read(&cdcpend->cursor, cdcpend->conn),
57 cdcpend->conn);
58 } 57 }
59 smc_tx_sndbuf_nonfull(smc); 58 smc_tx_sndbuf_nonfull(smc);
60 bh_unlock_sock(&smc->sk); 59 bh_unlock_sock(&smc->sk);
@@ -110,14 +109,13 @@ int smc_cdc_msg_send(struct smc_connection *conn,
110 &conn->local_tx_ctrl, conn); 109 &conn->local_tx_ctrl, conn);
111 rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); 110 rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
112 if (!rc) 111 if (!rc)
113 smc_curs_write(&conn->rx_curs_confirmed, 112 smc_curs_copy(&conn->rx_curs_confirmed,
114 smc_curs_read(&conn->local_tx_ctrl.cons, conn), 113 &conn->local_tx_ctrl.cons, conn);
115 conn);
116 114
117 return rc; 115 return rc;
118} 116}
119 117
120int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) 118static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
121{ 119{
122 struct smc_cdc_tx_pend *pend; 120 struct smc_cdc_tx_pend *pend;
123 struct smc_wr_buf *wr_buf; 121 struct smc_wr_buf *wr_buf;
@@ -130,6 +128,21 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
130 return smc_cdc_msg_send(conn, wr_buf, pend); 128 return smc_cdc_msg_send(conn, wr_buf, pend);
131} 129}
132 130
131int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
132{
133 int rc;
134
135 if (conn->lgr->is_smcd) {
136 spin_lock_bh(&conn->send_lock);
137 rc = smcd_cdc_msg_send(conn);
138 spin_unlock_bh(&conn->send_lock);
139 } else {
140 rc = smcr_cdc_get_slot_and_msg_send(conn);
141 }
142
143 return rc;
144}
145
133static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend, 146static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
134 unsigned long data) 147 unsigned long data)
135{ 148{
@@ -157,6 +170,44 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
157 (unsigned long)conn); 170 (unsigned long)conn);
158} 171}
159 172
173/* Send a SMC-D CDC header.
174 * This increments the free space available in our send buffer.
175 * Also update the confirmed receive buffer with what was sent to the peer.
176 */
177int smcd_cdc_msg_send(struct smc_connection *conn)
178{
179 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
180 struct smcd_cdc_msg cdc;
181 int rc, diff;
182
183 memset(&cdc, 0, sizeof(cdc));
184 cdc.common.type = SMC_CDC_MSG_TYPE;
185 cdc.prod_wrap = conn->local_tx_ctrl.prod.wrap;
186 cdc.prod_count = conn->local_tx_ctrl.prod.count;
187
188 cdc.cons_wrap = conn->local_tx_ctrl.cons.wrap;
189 cdc.cons_count = conn->local_tx_ctrl.cons.count;
190 cdc.prod_flags = conn->local_tx_ctrl.prod_flags;
191 cdc.conn_state_flags = conn->local_tx_ctrl.conn_state_flags;
192 rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1);
193 if (rc)
194 return rc;
195 smc_curs_copy(&conn->rx_curs_confirmed, &conn->local_tx_ctrl.cons,
196 conn);
197 /* Calculate transmitted data and increment free send buffer space */
198 diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
199 &conn->tx_curs_sent);
200 /* increased by confirmed number of bytes */
201 smp_mb__before_atomic();
202 atomic_add(diff, &conn->sndbuf_space);
203 /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
204 smp_mb__after_atomic();
205 smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn);
206
207 smc_tx_sndbuf_nonfull(smc);
208 return rc;
209}
210
160/********************************* receive ***********************************/ 211/********************************* receive ***********************************/
161 212
162static inline bool smc_cdc_before(u16 seq1, u16 seq2) 213static inline bool smc_cdc_before(u16 seq1, u16 seq2)
@@ -171,14 +222,12 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
171 char *base; 222 char *base;
172 223
173 /* new data included urgent business */ 224 /* new data included urgent business */
174 smc_curs_write(&conn->urg_curs, 225 smc_curs_copy(&conn->urg_curs, &conn->local_rx_ctrl.prod, conn);
175 smc_curs_read(&conn->local_rx_ctrl.prod, conn),
176 conn);
177 conn->urg_state = SMC_URG_VALID; 226 conn->urg_state = SMC_URG_VALID;
178 if (!sock_flag(&smc->sk, SOCK_URGINLINE)) 227 if (!sock_flag(&smc->sk, SOCK_URGINLINE))
179 /* we'll skip the urgent byte, so don't account for it */ 228 /* we'll skip the urgent byte, so don't account for it */
180 (*diff_prod)--; 229 (*diff_prod)--;
181 base = (char *)conn->rmb_desc->cpu_addr; 230 base = (char *)conn->rmb_desc->cpu_addr + conn->rx_off;
182 if (conn->urg_curs.count) 231 if (conn->urg_curs.count)
183 conn->urg_rx_byte = *(base + conn->urg_curs.count - 1); 232 conn->urg_rx_byte = *(base + conn->urg_curs.count - 1);
184 else 233 else
@@ -193,12 +242,8 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
193 struct smc_connection *conn = &smc->conn; 242 struct smc_connection *conn = &smc->conn;
194 int diff_cons, diff_prod; 243 int diff_cons, diff_prod;
195 244
196 smc_curs_write(&prod_old, 245 smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn);
197 smc_curs_read(&conn->local_rx_ctrl.prod, conn), 246 smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn);
198 conn);
199 smc_curs_write(&cons_old,
200 smc_curs_read(&conn->local_rx_ctrl.cons, conn),
201 conn);
202 smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn); 247 smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
203 248
204 diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old, 249 diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
@@ -233,7 +278,8 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
233 /* force immediate tx of current consumer cursor, but 278 /* force immediate tx of current consumer cursor, but
234 * under send_lock to guarantee arrival in seqno-order 279 * under send_lock to guarantee arrival in seqno-order
235 */ 280 */
236 smc_tx_sndbuf_nonempty(conn); 281 if (smc->sk.sk_state != SMC_INIT)
282 smc_tx_sndbuf_nonempty(conn);
237 } 283 }
238 } 284 }
239 285
@@ -276,6 +322,34 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
276 sock_put(&smc->sk); /* no free sk in softirq-context */ 322 sock_put(&smc->sk); /* no free sk in softirq-context */
277} 323}
278 324
325/* Schedule a tasklet for this connection. Triggered from the ISM device IRQ
326 * handler to indicate update in the DMBE.
327 *
328 * Context:
329 * - tasklet context
330 */
331static void smcd_cdc_rx_tsklet(unsigned long data)
332{
333 struct smc_connection *conn = (struct smc_connection *)data;
334 struct smcd_cdc_msg cdc;
335 struct smc_sock *smc;
336
337 if (!conn)
338 return;
339
340 memcpy(&cdc, conn->rmb_desc->cpu_addr, sizeof(cdc));
341 smc = container_of(conn, struct smc_sock, conn);
342 smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc);
343}
344
345/* Initialize receive tasklet. Called from ISM device IRQ handler to start
346 * receiver side.
347 */
348void smcd_cdc_rx_init(struct smc_connection *conn)
349{
350 tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn);
351}
352
279/***************************** init, exit, misc ******************************/ 353/***************************** init, exit, misc ******************************/
280 354
281static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) 355static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
@@ -292,7 +366,7 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
292 return; /* invalid message */ 366 return; /* invalid message */
293 367
294 /* lookup connection */ 368 /* lookup connection */
295 lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); 369 lgr = smc_get_lgr(link);
296 read_lock_bh(&lgr->conns_lock); 370 read_lock_bh(&lgr->conns_lock);
297 conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); 371 conn = smc_lgr_find_conn(ntohl(cdc->token), lgr);
298 read_unlock_bh(&lgr->conns_lock); 372 read_unlock_bh(&lgr->conns_lock);
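smcd_cdc_rx_init() above only initializes the per-connection tasklet; per the comment on smcd_cdc_rx_tsklet(), scheduling happens from the ISM device IRQ handler, which is outside this file. A generic sketch of that lifecycle, with the hypothetical my_conn, my_dmb_irq() and friends standing in for the real SMC-D structures; only the tasklet_*() calls are the actual kernel API:

#include <linux/interrupt.h>

struct my_conn {
	struct tasklet_struct rx_tsklet;
	/* DMB descriptor, socket back-pointer, ... */
};

/* softirq context: copy the CDC header out of the DMB and run the
 * receive path, as smcd_cdc_rx_tsklet() does for SMC-D
 */
static void my_rx_tsklet(unsigned long data)
{
	struct my_conn *conn = (struct my_conn *)data;

	(void)conn;	/* process received data for this connection here */
}

static void my_conn_init(struct my_conn *conn)
{
	tasklet_init(&conn->rx_tsklet, my_rx_tsklet, (unsigned long)conn);
}

/* called from the device IRQ handler when a DMBE signals new data */
static void my_dmb_irq(struct my_conn *conn)
{
	tasklet_schedule(&conn->rx_tsklet);
}

static void my_conn_destroy(struct my_conn *conn)
{
	tasklet_kill(&conn->rx_tsklet);	/* waits for a running instance */
}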
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index f60082fee5b8..934df4473a7c 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -50,6 +50,20 @@ struct smc_cdc_msg {
50 u8 reserved[18]; 50 u8 reserved[18];
51} __packed; /* format defined in RFC7609 */ 51} __packed; /* format defined in RFC7609 */
52 52
53/* CDC message for SMC-D */
54struct smcd_cdc_msg {
55 struct smc_wr_rx_hdr common; /* Type = 0xFE */
56 u8 res1[7];
57 u16 prod_wrap;
58 u32 prod_count;
59 u8 res2[2];
60 u16 cons_wrap;
61 u32 cons_count;
62 struct smc_cdc_producer_flags prod_flags;
63 struct smc_cdc_conn_state_flags conn_state_flags;
64 u8 res3[8];
65} __packed;
66
53static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) 67static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
54{ 68{
55 return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort || 69 return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort ||
@@ -90,47 +104,34 @@ static inline u64 smc_curs_read(union smc_host_cursor *curs,
90#endif 104#endif
91} 105}
92 106
93static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs, 107/* Copy cursor src into tgt */
94 struct smc_connection *conn) 108static inline void smc_curs_copy(union smc_host_cursor *tgt,
95{ 109 union smc_host_cursor *src,
96#ifndef KERNEL_HAS_ATOMIC64 110 struct smc_connection *conn)
97 unsigned long flags;
98 u64 ret;
99
100 spin_lock_irqsave(&conn->acurs_lock, flags);
101 ret = curs->acurs;
102 spin_unlock_irqrestore(&conn->acurs_lock, flags);
103 return ret;
104#else
105 return atomic64_read(&curs->acurs);
106#endif
107}
108
109static inline void smc_curs_write(union smc_host_cursor *curs, u64 val,
110 struct smc_connection *conn)
111{ 111{
112#ifndef KERNEL_HAS_ATOMIC64 112#ifndef KERNEL_HAS_ATOMIC64
113 unsigned long flags; 113 unsigned long flags;
114 114
115 spin_lock_irqsave(&conn->acurs_lock, flags); 115 spin_lock_irqsave(&conn->acurs_lock, flags);
116 curs->acurs = val; 116 tgt->acurs = src->acurs;
117 spin_unlock_irqrestore(&conn->acurs_lock, flags); 117 spin_unlock_irqrestore(&conn->acurs_lock, flags);
118#else 118#else
119 atomic64_set(&curs->acurs, val); 119 atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
120#endif 120#endif
121} 121}
122 122
123static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val, 123static inline void smc_curs_copy_net(union smc_cdc_cursor *tgt,
124 struct smc_connection *conn) 124 union smc_cdc_cursor *src,
125 struct smc_connection *conn)
125{ 126{
126#ifndef KERNEL_HAS_ATOMIC64 127#ifndef KERNEL_HAS_ATOMIC64
127 unsigned long flags; 128 unsigned long flags;
128 129
129 spin_lock_irqsave(&conn->acurs_lock, flags); 130 spin_lock_irqsave(&conn->acurs_lock, flags);
130 curs->acurs = val; 131 tgt->acurs = src->acurs;
131 spin_unlock_irqrestore(&conn->acurs_lock, flags); 132 spin_unlock_irqrestore(&conn->acurs_lock, flags);
132#else 133#else
133 atomic64_set(&curs->acurs, val); 134 atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
134#endif 135#endif
135} 136}
136 137
@@ -165,7 +166,7 @@ static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
165{ 166{
166 union smc_host_cursor temp; 167 union smc_host_cursor temp;
167 168
168 smc_curs_write(&temp, smc_curs_read(local, conn), conn); 169 smc_curs_copy(&temp, local, conn);
169 peer->count = htonl(temp.count); 170 peer->count = htonl(temp.count);
170 peer->wrap = htons(temp.wrap); 171 peer->wrap = htons(temp.wrap);
171 /* peer->reserved = htons(0); must be ensured by caller */ 172 /* peer->reserved = htons(0); must be ensured by caller */
@@ -192,8 +193,8 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
192 union smc_host_cursor temp, old; 193 union smc_host_cursor temp, old;
193 union smc_cdc_cursor net; 194 union smc_cdc_cursor net;
194 195
195 smc_curs_write(&old, smc_curs_read(local, conn), conn); 196 smc_curs_copy(&old, local, conn);
196 smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn); 197 smc_curs_copy_net(&net, peer, conn);
197 temp.count = ntohl(net.count); 198 temp.count = ntohl(net.count);
198 temp.wrap = ntohs(net.wrap); 199 temp.wrap = ntohs(net.wrap);
199 if ((old.wrap > temp.wrap) && temp.wrap) 200 if ((old.wrap > temp.wrap) && temp.wrap)
@@ -201,12 +202,12 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
201 if ((old.wrap == temp.wrap) && 202 if ((old.wrap == temp.wrap) &&
202 (old.count > temp.count)) 203 (old.count > temp.count))
203 return; 204 return;
204 smc_curs_write(local, smc_curs_read(&temp, conn), conn); 205 smc_curs_copy(local, &temp, conn);
205} 206}
206 207
207static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, 208static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local,
208 struct smc_cdc_msg *peer, 209 struct smc_cdc_msg *peer,
209 struct smc_connection *conn) 210 struct smc_connection *conn)
210{ 211{
211 local->common.type = peer->common.type; 212 local->common.type = peer->common.type;
212 local->len = peer->len; 213 local->len = peer->len;
@@ -218,6 +219,27 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
218 local->conn_state_flags = peer->conn_state_flags; 219 local->conn_state_flags = peer->conn_state_flags;
219} 220}
220 221
222static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local,
223 struct smcd_cdc_msg *peer)
224{
225 local->prod.wrap = peer->prod_wrap;
226 local->prod.count = peer->prod_count;
227 local->cons.wrap = peer->cons_wrap;
228 local->cons.count = peer->cons_count;
229 local->prod_flags = peer->prod_flags;
230 local->conn_state_flags = peer->conn_state_flags;
231}
232
233static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
234 struct smc_cdc_msg *peer,
235 struct smc_connection *conn)
236{
237 if (conn->lgr->is_smcd)
238 smcd_cdc_msg_to_host(local, (struct smcd_cdc_msg *)peer);
239 else
240 smcr_cdc_msg_to_host(local, peer, conn);
241}
242
221struct smc_cdc_tx_pend; 243struct smc_cdc_tx_pend;
222 244
223int smc_cdc_get_free_slot(struct smc_connection *conn, 245int smc_cdc_get_free_slot(struct smc_connection *conn,
@@ -227,6 +249,8 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
227int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, 249int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
228 struct smc_cdc_tx_pend *pend); 250 struct smc_cdc_tx_pend *pend);
229int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); 251int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
252int smcd_cdc_msg_send(struct smc_connection *conn);
230int smc_cdc_init(void) __init; 253int smc_cdc_init(void) __init;
254void smcd_cdc_rx_init(struct smc_connection *conn);
231 255
232#endif /* SMC_CDC_H */ 256#endif /* SMC_CDC_H */
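The new struct smcd_cdc_msg above is a fixed 32-byte header: that is what the rx_off member added to struct smc_connection ("0 for SMC-R, 32 for SMC-D") reserves, and what smcd_conn_save_peer_info() subtracts from the peer buffer size. A standalone sketch that re-declares the layout with fixed-width types just to check the size; the two one-byte flag fields stand in for the kernel's bit-field structs, and this mirror struct is an assumption, not shared with the kernel header:

#include <assert.h>
#include <stdint.h>

struct smcd_cdc_msg_layout {
	uint8_t  type;			/* smc_wr_rx_hdr, 0xFE for CDC */
	uint8_t  res1[7];
	uint16_t prod_wrap;
	uint32_t prod_count;
	uint8_t  res2[2];
	uint16_t cons_wrap;
	uint32_t cons_count;
	uint8_t  prod_flags;		/* smc_cdc_producer_flags */
	uint8_t  conn_state_flags;	/* smc_cdc_conn_state_flags */
	uint8_t  res3[8];
} __attribute__((packed));

static_assert(sizeof(struct smcd_cdc_msg_layout) == 32,
	      "SMC-D CDC header must fit the 32-byte rx_off reserve");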
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 717449b1da0b..83aba9ade060 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -23,9 +23,15 @@
23#include "smc_core.h" 23#include "smc_core.h"
24#include "smc_clc.h" 24#include "smc_clc.h"
25#include "smc_ib.h" 25#include "smc_ib.h"
26#include "smc_ism.h"
27
28#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68
29#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48
26 30
27/* eye catcher "SMCR" EBCDIC for CLC messages */ 31/* eye catcher "SMCR" EBCDIC for CLC messages */
28static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; 32static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
33/* eye catcher "SMCD" EBCDIC for CLC messages */
34static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'};
29 35
30/* check if received message has a correct header length and contains valid 36/* check if received message has a correct header length and contains valid
31 * heading and trailing eyecatchers 37 * heading and trailing eyecatchers
@@ -38,10 +44,14 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
38 struct smc_clc_msg_decline *dclc; 44 struct smc_clc_msg_decline *dclc;
39 struct smc_clc_msg_trail *trl; 45 struct smc_clc_msg_trail *trl;
40 46
41 if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) 47 if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
48 memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
42 return false; 49 return false;
43 switch (clcm->type) { 50 switch (clcm->type) {
44 case SMC_CLC_PROPOSAL: 51 case SMC_CLC_PROPOSAL:
52 if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D &&
53 clcm->path != SMC_TYPE_B)
54 return false;
45 pclc = (struct smc_clc_msg_proposal *)clcm; 55 pclc = (struct smc_clc_msg_proposal *)clcm;
46 pclc_prfx = smc_clc_proposal_get_prefix(pclc); 56 pclc_prfx = smc_clc_proposal_get_prefix(pclc);
47 if (ntohs(pclc->hdr.length) != 57 if (ntohs(pclc->hdr.length) !=
@@ -56,10 +66,16 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
 		break;
 	case SMC_CLC_ACCEPT:
 	case SMC_CLC_CONFIRM:
+		if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D)
+			return false;
 		clc = (struct smc_clc_msg_accept_confirm *)clcm;
-		if (ntohs(clc->hdr.length) != sizeof(*clc))
+		if ((clcm->path == SMC_TYPE_R &&
+		     ntohs(clc->hdr.length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) ||
+		    (clcm->path == SMC_TYPE_D &&
+		     ntohs(clc->hdr.length) != SMCD_CLC_ACCEPT_CONFIRM_LEN))
 			return false;
-		trl = &clc->trl;
+		trl = (struct smc_clc_msg_trail *)
+			((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl));
 		break;
 	case SMC_CLC_DECLINE:
 		dclc = (struct smc_clc_msg_decline *)clcm;
@@ -70,7 +86,8 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
 	default:
 		return false;
 	}
-	if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
+	if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
+	    memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
 		return false;
 	return true;
 }
@@ -250,6 +267,7 @@ out:
 int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
		     u8 expected_type)
 {
+	long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo;
 	struct sock *clc_sk = smc->clcsock->sk;
 	struct smc_clc_msg_hdr *clcm = buf;
 	struct msghdr msg = {NULL, 0};
@@ -295,6 +313,9 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	datlen = ntohs(clcm->length);
 	if ((len < sizeof(struct smc_clc_msg_hdr)) ||
 	    (datlen > buflen) ||
+	    (clcm->version != SMC_CLC_V1) ||
+	    (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D &&
+	     clcm->path != SMC_TYPE_B) ||
 	    ((clcm->type != SMC_CLC_DECLINE) &&
 	     (clcm->type != expected_type))) {
 		smc->sk.sk_err = EPROTO;
@@ -306,7 +327,6 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	memset(&msg, 0, sizeof(struct msghdr));
 	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, datlen);
 	krflags = MSG_WAITALL;
-	smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
 	len = sock_recvmsg(smc->clcsock, &msg, krflags);
 	if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) {
 		smc->sk.sk_err = EPROTO;
@@ -314,7 +334,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 		goto out;
 	}
 	if (clcm->type == SMC_CLC_DECLINE) {
-		reason_code = SMC_CLC_DECL_REPLY;
+		struct smc_clc_msg_decline *dclc;
+
+		dclc = (struct smc_clc_msg_decline *)clcm;
+		reason_code = SMC_CLC_DECL_PEERDECL;
+		smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
 		if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
 			smc->conn.lgr->sync_err = 1;
 			smc_lgr_terminate(smc->conn.lgr);
@@ -322,6 +346,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	}
 
 out:
+	smc->clcsock->sk->sk_rcvtimeo = rcvtimeo;
 	return reason_code;
 }
 
@@ -356,17 +381,18 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
 }
 
 /* send CLC PROPOSAL message across internal TCP socket */
-int smc_clc_send_proposal(struct smc_sock *smc,
-			  struct smc_ib_device *smcibdev,
-			  u8 ibport)
+int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
+			  struct smc_ib_device *ibdev, u8 ibport, u8 gid[],
+			  struct smcd_dev *ismdev)
 {
 	struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX];
 	struct smc_clc_msg_proposal_prefix pclc_prfx;
+	struct smc_clc_msg_smcd pclc_smcd;
 	struct smc_clc_msg_proposal pclc;
 	struct smc_clc_msg_trail trl;
 	int len, i, plen, rc;
 	int reason_code = 0;
-	struct kvec vec[4];
+	struct kvec vec[5];
 	struct msghdr msg;
 
 	/* retrieve ip prefixes for CLC proposal msg */
@@ -381,18 +407,34 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 	memset(&pclc, 0, sizeof(pclc));
 	memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	pclc.hdr.type = SMC_CLC_PROPOSAL;
-	pclc.hdr.length = htons(plen);
 	pclc.hdr.version = SMC_CLC_V1;		/* SMC version */
-	memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
-	memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
-	memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
-	pclc.iparea_offset = htons(0);
+	pclc.hdr.path = smc_type;
+	if (smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B) {
+		/* add SMC-R specifics */
+		memcpy(pclc.lcl.id_for_peer, local_systemid,
+		       sizeof(local_systemid));
+		memcpy(&pclc.lcl.gid, gid, SMC_GID_SIZE);
+		memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN);
+		pclc.iparea_offset = htons(0);
+	}
+	if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
+		/* add SMC-D specifics */
+		memset(&pclc_smcd, 0, sizeof(pclc_smcd));
+		plen += sizeof(pclc_smcd);
+		pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET);
+		pclc_smcd.gid = ismdev->local_gid;
+	}
+	pclc.hdr.length = htons(plen);
 
 	memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	memset(&msg, 0, sizeof(msg));
 	i = 0;
 	vec[i].iov_base = &pclc;
 	vec[i++].iov_len = sizeof(pclc);
+	if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
+		vec[i].iov_base = &pclc_smcd;
+		vec[i++].iov_len = sizeof(pclc_smcd);
+	}
 	vec[i].iov_base = &pclc_prfx;
 	vec[i++].iov_len = sizeof(pclc_prfx);
 	if (pclc_prfx.ipv6_prefixes_cnt > 0) {
@@ -428,35 +470,55 @@ int smc_clc_send_confirm(struct smc_sock *smc)
 	struct kvec vec;
 	int len;
 
-	link = &conn->lgr->lnk[SMC_SINGLE_LINK];
 	/* send SMC Confirm CLC msg */
 	memset(&cclc, 0, sizeof(cclc));
-	memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	cclc.hdr.type = SMC_CLC_CONFIRM;
-	cclc.hdr.length = htons(sizeof(cclc));
 	cclc.hdr.version = SMC_CLC_V1;		/* SMC version */
-	memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
-	memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
-	       SMC_GID_SIZE);
-	memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
-	hton24(cclc.qpn, link->roce_qp->qp_num);
-	cclc.rmb_rkey =
-		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
-	cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
-	cclc.rmbe_alert_token = htonl(conn->alert_token_local);
-	cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
-	cclc.rmbe_size = conn->rmbe_size_short;
-	cclc.rmb_dma_addr = cpu_to_be64(
-		(u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
-	hton24(cclc.psn, link->psn_initial);
-
-	memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	if (smc->conn.lgr->is_smcd) {
+		/* SMC-D specific settings */
+		memcpy(cclc.hdr.eyecatcher, SMCD_EYECATCHER,
+		       sizeof(SMCD_EYECATCHER));
+		cclc.hdr.path = SMC_TYPE_D;
+		cclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
+		cclc.gid = conn->lgr->smcd->local_gid;
+		cclc.token = conn->rmb_desc->token;
+		cclc.dmbe_size = conn->rmbe_size_short;
+		cclc.dmbe_idx = 0;
+		memcpy(&cclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
+		memcpy(cclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
+		       sizeof(SMCD_EYECATCHER));
+	} else {
+		/* SMC-R specific settings */
+		link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+		memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER,
+		       sizeof(SMC_EYECATCHER));
+		cclc.hdr.path = SMC_TYPE_R;
+		cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+		memcpy(cclc.lcl.id_for_peer, local_systemid,
+		       sizeof(local_systemid));
+		memcpy(&cclc.lcl.gid, link->gid, SMC_GID_SIZE);
+		memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
+		       ETH_ALEN);
+		hton24(cclc.qpn, link->roce_qp->qp_num);
+		cclc.rmb_rkey =
+			htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+		cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
+		cclc.rmbe_alert_token = htonl(conn->alert_token_local);
+		cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
+		cclc.rmbe_size = conn->rmbe_size_short;
+		cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
+				(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
+		hton24(cclc.psn, link->psn_initial);
+		memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
+		       sizeof(SMC_EYECATCHER));
+	}
 
 	memset(&msg, 0, sizeof(msg));
 	vec.iov_base = &cclc;
-	vec.iov_len = sizeof(cclc);
-	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
-	if (len < sizeof(cclc)) {
+	vec.iov_len = ntohs(cclc.hdr.length);
+	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
			     ntohs(cclc.hdr.length));
+	if (len < ntohs(cclc.hdr.length)) {
 		if (len >= 0) {
 			reason_code = -ENETUNREACH;
 			smc->sk.sk_err = -reason_code;
@@ -479,35 +541,57 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
 	int rc = 0;
 	int len;
 
-	link = &conn->lgr->lnk[SMC_SINGLE_LINK];
 	memset(&aclc, 0, sizeof(aclc));
-	memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	aclc.hdr.type = SMC_CLC_ACCEPT;
-	aclc.hdr.length = htons(sizeof(aclc));
 	aclc.hdr.version = SMC_CLC_V1;		/* SMC version */
 	if (srv_first_contact)
 		aclc.hdr.flag = 1;
-	memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
-	memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
-	       SMC_GID_SIZE);
-	memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
-	hton24(aclc.qpn, link->roce_qp->qp_num);
-	aclc.rmb_rkey =
-		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
-	aclc.rmbe_idx = 1;			/* as long as 1 RMB = 1 RMBE */
-	aclc.rmbe_alert_token = htonl(conn->alert_token_local);
-	aclc.qp_mtu = link->path_mtu;
-	aclc.rmbe_size = conn->rmbe_size_short,
-	aclc.rmb_dma_addr = cpu_to_be64(
-		(u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
-	hton24(aclc.psn, link->psn_initial);
-	memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+	if (new_smc->conn.lgr->is_smcd) {
+		/* SMC-D specific settings */
+		aclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
+		memcpy(aclc.hdr.eyecatcher, SMCD_EYECATCHER,
+		       sizeof(SMCD_EYECATCHER));
+		aclc.hdr.path = SMC_TYPE_D;
+		aclc.gid = conn->lgr->smcd->local_gid;
+		aclc.token = conn->rmb_desc->token;
+		aclc.dmbe_size = conn->rmbe_size_short;
+		aclc.dmbe_idx = 0;
+		memcpy(&aclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
+		memcpy(aclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
+		       sizeof(SMCD_EYECATCHER));
+	} else {
+		/* SMC-R specific settings */
+		aclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+		memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER,
+		       sizeof(SMC_EYECATCHER));
+		aclc.hdr.path = SMC_TYPE_R;
+		link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+		memcpy(aclc.lcl.id_for_peer, local_systemid,
+		       sizeof(local_systemid));
+		memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE);
+		memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
+		       ETH_ALEN);
+		hton24(aclc.qpn, link->roce_qp->qp_num);
+		aclc.rmb_rkey =
+			htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+		aclc.rmbe_idx = 1;		/* as long as 1 RMB = 1 RMBE */
+		aclc.rmbe_alert_token = htonl(conn->alert_token_local);
+		aclc.qp_mtu = link->path_mtu;
+		aclc.rmbe_size = conn->rmbe_size_short,
+		aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
+				(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
+		hton24(aclc.psn, link->psn_initial);
+		memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
+		       sizeof(SMC_EYECATCHER));
+	}
 
 	memset(&msg, 0, sizeof(msg));
 	vec.iov_base = &aclc;
-	vec.iov_len = sizeof(aclc);
-	len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
-	if (len < sizeof(aclc)) {
+	vec.iov_len = ntohs(aclc.hdr.length);
+	len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1,
			     ntohs(aclc.hdr.length));
+	if (len < ntohs(aclc.hdr.length)) {
 		if (len >= 0)
 			new_smc->sk.sk_err = EPROTO;
 		else
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 41ff9ea96139..18da89b681c2 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -23,17 +23,26 @@
 #define SMC_CLC_DECLINE		0x04
 
 #define SMC_CLC_V1		0x1		/* SMC version */
+#define SMC_TYPE_R		0		/* SMC-R only */
+#define SMC_TYPE_D		1		/* SMC-D only */
+#define SMC_TYPE_B		3		/* SMC-R and SMC-D */
 #define CLC_WAIT_TIME		(6 * HZ)	/* max. wait time on clcsock */
 #define SMC_CLC_DECL_MEM	0x01010000  /* insufficient memory resources */
-#define SMC_CLC_DECL_TIMEOUT	0x02000000  /* timeout */
+#define SMC_CLC_DECL_TIMEOUT_CL	0x02010000  /* timeout w4 QP confirm link */
+#define SMC_CLC_DECL_TIMEOUT_AL	0x02020000  /* timeout w4 QP add link */
 #define SMC_CLC_DECL_CNFERR	0x03000000  /* configuration error */
-#define SMC_CLC_DECL_IPSEC	0x03030000  /* IPsec usage */
+#define SMC_CLC_DECL_PEERNOSMC	0x03010000  /* peer did not indicate SMC */
+#define SMC_CLC_DECL_IPSEC	0x03020000  /* IPsec usage */
+#define SMC_CLC_DECL_NOSMCDEV	0x03030000  /* no SMC device found */
+#define SMC_CLC_DECL_MODEUNSUPP	0x03040000  /* smc modes do not match (R or D)*/
+#define SMC_CLC_DECL_RMBE_EC	0x03050000  /* peer has eyecatcher in RMBE */
+#define SMC_CLC_DECL_OPTUNSUPP	0x03060000  /* fastopen sockopt not supported */
 #define SMC_CLC_DECL_SYNCERR	0x04000000  /* synchronization error */
-#define SMC_CLC_DECL_REPLY	0x06000000  /* reply to a received decline */
+#define SMC_CLC_DECL_PEERDECL	0x05000000  /* peer declined during handshake */
 #define SMC_CLC_DECL_INTERR	0x99990000  /* internal error */
-#define SMC_CLC_DECL_TCL	0x02040000  /* timeout w4 QP confirm */
-#define SMC_CLC_DECL_SEND	0x07000000  /* sending problem */
-#define SMC_CLC_DECL_RMBE_EC	0x08000000  /* peer has eyecatcher in RMBE */
+#define SMC_CLC_DECL_ERR_RTOK	0x99990001  /* rtoken handling failed */
+#define SMC_CLC_DECL_ERR_RDYLNK	0x99990002  /* ib ready link failed */
+#define SMC_CLC_DECL_ERR_REGRMB	0x99990003  /* reg rmb failed */
 
 struct smc_clc_msg_hdr {	/* header1 of clc messages */
 	u8 eyecatcher[4];	/* eye catcher */
@@ -42,9 +51,11 @@ struct smc_clc_msg_hdr { /* header1 of clc messages */
 #if defined(__BIG_ENDIAN_BITFIELD)
 	u8 version : 4,
 	   flag    : 1,
-	   rsvd    : 3;
+	   rsvd    : 1,
+	   path    : 2;
 #elif defined(__LITTLE_ENDIAN_BITFIELD)
-	u8 rsvd    : 3,
+	u8 path    : 2,
+	   rsvd    : 1,
 	   flag    : 1,
 	   version : 4;
 #endif
@@ -77,6 +88,11 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
 	u8 ipv6_prefixes_cnt;	/* number of IPv6 prefixes in prefix array */
 } __aligned(4);
 
+struct smc_clc_msg_smcd {	/* SMC-D GID information */
+	u64 gid;		/* ISM GID of requestor */
+	u8 res[32];
+};
+
 struct smc_clc_msg_proposal {	/* clc proposal message sent by Linux */
 	struct smc_clc_msg_hdr hdr;
 	struct smc_clc_msg_local lcl;
@@ -94,23 +110,45 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
 
 struct smc_clc_msg_accept_confirm {	/* clc accept / confirm message */
 	struct smc_clc_msg_hdr hdr;
-	struct smc_clc_msg_local lcl;
-	u8 qpn[3];		/* QP number */
-	__be32 rmb_rkey;	/* RMB rkey */
-	u8 rmbe_idx;		/* Index of RMBE in RMB */
-	__be32 rmbe_alert_token;/* unique connection id */
+	union {
+		struct { /* SMC-R */
+			struct smc_clc_msg_local lcl;
+			u8 qpn[3];		/* QP number */
+			__be32 rmb_rkey;	/* RMB rkey */
+			u8 rmbe_idx;		/* Index of RMBE in RMB */
+			__be32 rmbe_alert_token;/* unique connection id */
 #if defined(__BIG_ENDIAN_BITFIELD)
-	u8 rmbe_size : 4,	/* RMBE buf size (compressed notation) */
-	   qp_mtu   : 4;	/* QP mtu */
+			u8 rmbe_size : 4,	/* buf size (compressed) */
+			   qp_mtu   : 4;	/* QP mtu */
 #elif defined(__LITTLE_ENDIAN_BITFIELD)
-	u8 qp_mtu   : 4,
-	   rmbe_size : 4;
+			u8 qp_mtu   : 4,
+			   rmbe_size : 4;
 #endif
-	u8 reserved;
-	__be64 rmb_dma_addr;	/* RMB virtual address */
-	u8 reserved2;
-	u8 psn[3];		/* initial packet sequence number */
-	struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+			u8 reserved;
+			__be64 rmb_dma_addr;	/* RMB virtual address */
+			u8 reserved2;
+			u8 psn[3];		/* packet sequence number */
+			struct smc_clc_msg_trail smcr_trl;
+						/* eye catcher "SMCR" EBCDIC */
+		} __packed;
+		struct { /* SMC-D */
+			u64 gid;		/* Sender GID */
+			u64 token;		/* DMB token */
+			u8 dmbe_idx;		/* DMBE index */
+#if defined(__BIG_ENDIAN_BITFIELD)
+			u8 dmbe_size : 4,	/* buf size (compressed) */
+			   reserved3 : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+			u8 reserved3 : 4,
+			   dmbe_size : 4;
+#endif
+			u16 reserved4;
+			u32 linkid;		/* Link identifier */
+			u32 reserved5[3];
+			struct smc_clc_msg_trail smcd_trl;
+						/* eye catcher "SMCD" EBCDIC */
+		} __packed;
+	};
 } __packed;			/* format defined in RFC7609 */
 
 struct smc_clc_msg_decline {	/* clc decline message */
@@ -129,13 +167,26 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
 	((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
 }
 
+/* get SMC-D info from proposal message */
+static inline struct smc_clc_msg_smcd *
+smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop)
+{
+	if (ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd))
+		return NULL;
+
+	return (struct smc_clc_msg_smcd *)(prop + 1);
+}
+
+struct smcd_dev;
+
 int smc_clc_prfx_match(struct socket *clcsock,
		       struct smc_clc_msg_proposal_prefix *prop);
 int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
		     u8 expected_type);
 int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
-int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
-			  u8 ibport);
+int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
+			  struct smc_ib_device *smcibdev, u8 ibport, u8 gid[],
+			  struct smcd_dev *ismdev);
 int smc_clc_send_confirm(struct smc_sock *smc);
 int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
 
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index fa41d9881741..ac961dfb1ea1 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -107,6 +107,8 @@ static void smc_close_active_abort(struct smc_sock *smc)
 	}
 	switch (sk->sk_state) {
 	case SMC_INIT:
+		sk->sk_state = SMC_PEERABORTWAIT;
+		break;
 	case SMC_ACTIVE:
 		sk->sk_state = SMC_PEERABORTWAIT;
 		release_sock(sk);
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index add82b0266f3..e871368500e3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -16,6 +16,7 @@
 #include <net/tcp.h>
 #include <net/sock.h>
 #include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
 
 #include "smc.h"
 #include "smc_clc.h"
@@ -25,10 +26,12 @@
25#include "smc_llc.h" 26#include "smc_llc.h"
26#include "smc_cdc.h" 27#include "smc_cdc.h"
27#include "smc_close.h" 28#include "smc_close.h"
29#include "smc_ism.h"
28 30
29#define SMC_LGR_NUM_INCR 256 31#define SMC_LGR_NUM_INCR 256
30#define SMC_LGR_FREE_DELAY_SERV (600 * HZ) 32#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
31#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) 33#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
34#define SMC_LGR_FREE_DELAY_FAST (8 * HZ)
32 35
33static struct smc_lgr_list smc_lgr_list = { /* established link groups */ 36static struct smc_lgr_list smc_lgr_list = { /* established link groups */
34 .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), 37 .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
@@ -46,8 +49,13 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
  * otherwise there is a risk of out-of-sync link groups.
  */
 	mod_delayed_work(system_wq, &lgr->free_work,
-			 lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT :
-						 SMC_LGR_FREE_DELAY_SERV);
+			 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
+			 SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV);
+}
+
+void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
+{
+	mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST);
 }
 
 /* Register connection's alert token in our lookup structure.
@@ -132,6 +140,20 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
 	smc_lgr_schedule_free_work(lgr);
 }
 
+/* Send delete link, either as client to request the initiation
+ * of the DELETE LINK sequence from server; or as server to
+ * initiate the delete processing. See smc_llc_rx_delete_link().
+ */
+static int smc_link_send_delete(struct smc_link *lnk)
+{
+	if (lnk->state == SMC_LNK_ACTIVE &&
+	    !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) {
+		smc_llc_link_deleting(lnk);
+		return 0;
+	}
+	return -ENOTCONN;
+}
+
 static void smc_lgr_free_work(struct work_struct *work)
 {
 	struct smc_link_group *lgr = container_of(to_delayed_work(work),
@@ -152,17 +174,30 @@ static void smc_lgr_free_work(struct work_struct *work)
 		list_del_init(&lgr->list); /* remove from smc_lgr_list */
 free:
 	spin_unlock_bh(&smc_lgr_list.lock);
+
+	if (!lgr->is_smcd && !lgr->terminating) {
+		/* try to send del link msg, on error free lgr immediately */
+		if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) {
+			/* reschedule in case we never receive a response */
+			smc_lgr_schedule_free_work(lgr);
+			return;
+		}
+	}
+
 	if (!delayed_work_pending(&lgr->free_work)) {
-		if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
-			smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+		struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
+		if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
+			smc_llc_link_inactive(lnk);
 		smc_lgr_free(lgr);
 	}
 }
 
 /* create a new SMC link group */
-static int smc_lgr_create(struct smc_sock *smc,
+static int smc_lgr_create(struct smc_sock *smc, bool is_smcd,
 			  struct smc_ib_device *smcibdev, u8 ibport,
-			  char *peer_systemid, unsigned short vlan_id)
+			  char *peer_systemid, unsigned short vlan_id,
+			  struct smcd_dev *smcismdev, u64 peer_gid)
 {
 	struct smc_link_group *lgr;
 	struct smc_link *lnk;
@@ -170,17 +205,23 @@ static int smc_lgr_create(struct smc_sock *smc,
 	int rc = 0;
 	int i;
 
+	if (is_smcd && vlan_id) {
+		rc = smc_ism_get_vlan(smcismdev, vlan_id);
+		if (rc)
+			goto out;
+	}
+
 	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
 	if (!lgr) {
 		rc = -ENOMEM;
 		goto out;
 	}
-	lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+	lgr->is_smcd = is_smcd;
 	lgr->sync_err = 0;
-	memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
 	lgr->vlan_id = vlan_id;
 	rwlock_init(&lgr->sndbufs_lock);
 	rwlock_init(&lgr->rmbs_lock);
+	rwlock_init(&lgr->conns_lock);
 	for (i = 0; i < SMC_RMBE_SIZES; i++) {
 		INIT_LIST_HEAD(&lgr->sndbufs[i]);
 		INIT_LIST_HEAD(&lgr->rmbs[i]);
@@ -189,36 +230,48 @@ static int smc_lgr_create(struct smc_sock *smc,
 	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
 	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
 	lgr->conns_all = RB_ROOT;
-
-	lnk = &lgr->lnk[SMC_SINGLE_LINK];
-	/* initialize link */
-	lnk->state = SMC_LNK_ACTIVATING;
-	lnk->link_id = SMC_SINGLE_LINK;
-	lnk->smcibdev = smcibdev;
-	lnk->ibport = ibport;
-	lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
-	if (!smcibdev->initialized)
-		smc_ib_setup_per_ibdev(smcibdev);
-	get_random_bytes(rndvec, sizeof(rndvec));
-	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
-	rc = smc_llc_link_init(lnk);
-	if (rc)
-		goto free_lgr;
-	rc = smc_wr_alloc_link_mem(lnk);
-	if (rc)
-		goto clear_llc_lnk;
-	rc = smc_ib_create_protection_domain(lnk);
-	if (rc)
-		goto free_link_mem;
-	rc = smc_ib_create_queue_pair(lnk);
-	if (rc)
-		goto dealloc_pd;
-	rc = smc_wr_create_link(lnk);
-	if (rc)
-		goto destroy_qp;
-
+	if (is_smcd) {
+		/* SMC-D specific settings */
+		lgr->peer_gid = peer_gid;
+		lgr->smcd = smcismdev;
+	} else {
+		/* SMC-R specific settings */
+		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+		memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
+
+		lnk = &lgr->lnk[SMC_SINGLE_LINK];
+		/* initialize link */
+		lnk->state = SMC_LNK_ACTIVATING;
+		lnk->link_id = SMC_SINGLE_LINK;
+		lnk->smcibdev = smcibdev;
+		lnk->ibport = ibport;
+		lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
+		if (!smcibdev->initialized)
+			smc_ib_setup_per_ibdev(smcibdev);
+		get_random_bytes(rndvec, sizeof(rndvec));
+		lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
+			(rndvec[2] << 16);
+		rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
+					  vlan_id, lnk->gid, &lnk->sgid_index);
+		if (rc)
+			goto free_lgr;
+		rc = smc_llc_link_init(lnk);
+		if (rc)
+			goto free_lgr;
+		rc = smc_wr_alloc_link_mem(lnk);
+		if (rc)
+			goto clear_llc_lnk;
+		rc = smc_ib_create_protection_domain(lnk);
+		if (rc)
+			goto free_link_mem;
+		rc = smc_ib_create_queue_pair(lnk);
+		if (rc)
+			goto dealloc_pd;
+		rc = smc_wr_create_link(lnk);
+		if (rc)
+			goto destroy_qp;
+	}
 	smc->conn.lgr = lgr;
-	rwlock_init(&lgr->conns_lock);
 	spin_lock_bh(&smc_lgr_list.lock);
 	list_add(&lgr->list, &smc_lgr_list.list);
 	spin_unlock_bh(&smc_lgr_list.lock);
@@ -264,7 +317,12 @@ void smc_conn_free(struct smc_connection *conn)
 {
 	if (!conn->lgr)
 		return;
-	smc_cdc_tx_dismiss_slots(conn);
+	if (conn->lgr->is_smcd) {
+		smc_ism_unset_conn(conn);
+		tasklet_kill(&conn->rx_tsklet);
+	} else {
+		smc_cdc_tx_dismiss_slots(conn);
+	}
 	smc_lgr_unregister_conn(conn);
 	smc_buf_unuse(conn);
 }
@@ -280,8 +338,8 @@ static void smc_link_clear(struct smc_link *lnk)
 	smc_wr_free_link_mem(lnk);
 }
 
-static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
-			 struct smc_buf_desc *buf_desc)
+static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
+			  struct smc_buf_desc *buf_desc)
 {
 	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
 
@@ -301,6 +359,28 @@ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
 	kfree(buf_desc);
 }
 
+static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
+			  struct smc_buf_desc *buf_desc)
+{
+	if (is_dmb) {
+		/* restore original buf len */
+		buf_desc->len += sizeof(struct smcd_cdc_msg);
+		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
+	} else {
+		kfree(buf_desc->cpu_addr);
+	}
+	kfree(buf_desc);
+}
+
+static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
+			 struct smc_buf_desc *buf_desc)
+{
+	if (lgr->is_smcd)
+		smcd_buf_free(lgr, is_rmb, buf_desc);
+	else
+		smcr_buf_free(lgr, is_rmb, buf_desc);
+}
+
 static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
 {
 	struct smc_buf_desc *buf_desc, *bf_desc;
@@ -332,7 +412,10 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr)
 void smc_lgr_free(struct smc_link_group *lgr)
 {
 	smc_lgr_free_bufs(lgr);
-	smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
+	if (lgr->is_smcd)
+		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+	else
+		smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
 	kfree(lgr);
 }
 
@@ -357,7 +440,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr)
 	lgr->terminating = 1;
 	if (!list_empty(&lgr->list)) /* forget lgr */
 		list_del_init(&lgr->list);
-	smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+	if (!lgr->is_smcd)
+		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
 
 	write_lock_bh(&lgr->conns_lock);
 	node = rb_first(&lgr->conns_all);
@@ -374,7 +458,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr)
 		node = rb_first(&lgr->conns_all);
 	}
 	write_unlock_bh(&lgr->conns_lock);
-	wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
+	if (!lgr->is_smcd)
+		wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
 	smc_lgr_schedule_free_work(lgr);
 }
 
@@ -392,17 +477,44 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
 
 	spin_lock_bh(&smc_lgr_list.lock);
 	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
-		if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
+		if (!lgr->is_smcd &&
+		    lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
 		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
 			__smc_lgr_terminate(lgr);
 	}
 	spin_unlock_bh(&smc_lgr_list.lock);
 }
 
+/* Called when SMC-D device is terminated or peer is lost */
+void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid)
+{
+	struct smc_link_group *lgr, *l;
+	LIST_HEAD(lgr_free_list);
+
+	/* run common cleanup function and build free list */
+	spin_lock_bh(&smc_lgr_list.lock);
+	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
+		if (lgr->is_smcd && lgr->smcd == dev &&
+		    (!peer_gid || lgr->peer_gid == peer_gid) &&
+		    !list_empty(&lgr->list)) {
+			__smc_lgr_terminate(lgr);
+			list_move(&lgr->list, &lgr_free_list);
+		}
+	}
+	spin_unlock_bh(&smc_lgr_list.lock);
+
+	/* cancel the regular free workers and actually free lgrs */
+	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
+		list_del_init(&lgr->list);
+		cancel_delayed_work_sync(&lgr->free_work);
+		smc_lgr_free(lgr);
+	}
+}
+
 /* Determine vlan of internal TCP socket.
  * @vlan_id: address to store the determined vlan id into
  */
-static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
+int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
 {
 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
 	struct net_device *ndev;
@@ -446,41 +558,30 @@ out:
 	return rc;
 }
 
-/* determine the link gid matching the vlan id of the link group */
-static int smc_link_determine_gid(struct smc_link_group *lgr)
-{
-	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
-	struct ib_gid_attr gattr;
-	union ib_gid gid;
-	int i;
-
-	if (!lgr->vlan_id) {
-		lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
-		return 0;
-	}
-
-	for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
-	     i++) {
-		if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
-				 &gattr))
-			continue;
-		if (gattr.ndev) {
-			if (is_vlan_dev(gattr.ndev) &&
-			    vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
-				lnk->gid = gid;
-				dev_put(gattr.ndev);
-				return 0;
-			}
-			dev_put(gattr.ndev);
-		}
-	}
-	return -ENODEV;
+static bool smcr_lgr_match(struct smc_link_group *lgr,
+			   struct smc_clc_msg_local *lcl,
+			   enum smc_lgr_role role)
+{
+	return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
+		       SMC_SYSTEMID_LEN) &&
+		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
+			SMC_GID_SIZE) &&
+		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
+			sizeof(lcl->mac)) &&
+		lgr->role == role;
+}
+
+static bool smcd_lgr_match(struct smc_link_group *lgr,
+			   struct smcd_dev *smcismdev, u64 peer_gid)
+{
+	return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
 }
 
 /* create a new SMC connection (and a new link group if necessary) */
-int smc_conn_create(struct smc_sock *smc,
+int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
 		    struct smc_ib_device *smcibdev, u8 ibport,
-		    struct smc_clc_msg_local *lcl, int srv_first_contact)
+		    struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
+		    u64 peer_gid)
 {
 	struct smc_connection *conn = &smc->conn;
 	int local_contact = SMC_FIRST_CONTACT;
@@ -502,17 +603,12 @@ int smc_conn_create(struct smc_sock *smc,
 	spin_lock_bh(&smc_lgr_list.lock);
 	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
 		write_lock_bh(&lgr->conns_lock);
-		if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
-			    SMC_SYSTEMID_LEN) &&
-		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
-			    SMC_GID_SIZE) &&
-		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
-			    sizeof(lcl->mac)) &&
+		if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) :
+		     smcr_lgr_match(lgr, lcl, role)) &&
 		    !lgr->sync_err &&
-		    (lgr->role == role) &&
-		    (lgr->vlan_id == vlan_id) &&
-		    ((role == SMC_CLNT) ||
-		     (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
+		    lgr->vlan_id == vlan_id &&
+		    (role == SMC_CLNT ||
+		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
 			/* link group found */
 			local_contact = SMC_REUSE_CONTACT;
 			conn->lgr = lgr;
@@ -535,16 +631,19 @@ int smc_conn_create(struct smc_sock *smc,
 
 create:
 	if (local_contact == SMC_FIRST_CONTACT) {
-		rc = smc_lgr_create(smc, smcibdev, ibport,
-				    lcl->id_for_peer, vlan_id);
+		rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport,
+				    lcl->id_for_peer, vlan_id, smcd, peer_gid);
 		if (rc)
 			goto out;
 		smc_lgr_register_conn(conn); /* add smc conn to lgr */
-		rc = smc_link_determine_gid(conn->lgr);
 	}
 	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
 	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
 	conn->urg_state = SMC_URG_READ;
+	if (is_smcd) {
+		conn->rx_off = sizeof(struct smcd_cdc_msg);
+		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
+	}
 #ifndef KERNEL_HAS_ATOMIC64
 	spin_lock_init(&conn->acurs_lock);
 #endif
@@ -609,8 +708,8 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size)
 	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
 }
 
-static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
-					       bool is_rmb, int bufsize)
+static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
+						bool is_rmb, int bufsize)
 {
 	struct smc_buf_desc *buf_desc;
 	struct smc_link *lnk;
@@ -668,7 +767,44 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
 	return buf_desc;
 }
 
+#define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
+
+static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
+						bool is_dmb, int bufsize)
+{
+	struct smc_buf_desc *buf_desc;
+	int rc;
+
+	if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
+		return ERR_PTR(-EAGAIN);
+
+	/* try to alloc a new DMB */
+	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
+	if (!buf_desc)
+		return ERR_PTR(-ENOMEM);
+	if (is_dmb) {
+		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
+		if (rc) {
+			kfree(buf_desc);
+			return ERR_PTR(-EAGAIN);
+		}
+		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
+		/* CDC header stored in buf. So, pretend it was smaller */
+		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
+	} else {
+		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
+					     __GFP_NOWARN | __GFP_NORETRY |
+					     __GFP_NOMEMALLOC);
+		if (!buf_desc->cpu_addr) {
+			kfree(buf_desc);
+			return ERR_PTR(-EAGAIN);
+		}
+		buf_desc->len = bufsize;
+	}
+	return buf_desc;
+}
+
-static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
+static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
 {
 	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
 	struct smc_connection *conn = &smc->conn;
@@ -706,7 +842,11 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
 			break; /* found reusable slot */
 		}
 
-		buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize);
+		if (is_smcd)
+			buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
+		else
+			buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);
+
 		if (PTR_ERR(buf_desc) == -ENOMEM)
 			break;
 		if (IS_ERR(buf_desc))
@@ -727,7 +867,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
 		conn->rmbe_size_short = bufsize_short;
 		smc->sk.sk_rcvbuf = bufsize * 2;
 		atomic_set(&conn->bytes_to_rcv, 0);
-		conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
+		conn->rmbe_update_limit =
+			smc_rmb_wnd_update_limit(buf_desc->len);
+		if (is_smcd)
+			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
 	} else {
 		conn->sndbuf_desc = buf_desc;
 		smc->sk.sk_sndbuf = bufsize * 2;
@@ -740,6 +883,8 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
 {
 	struct smc_link_group *lgr = conn->lgr;
 
+	if (!conn->lgr || conn->lgr->is_smcd)
+		return;
 	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->sndbuf_desc, DMA_TO_DEVICE);
 }
@@ -748,6 +893,8 @@ void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
 {
 	struct smc_link_group *lgr = conn->lgr;
 
+	if (!conn->lgr || conn->lgr->is_smcd)
+		return;
 	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->sndbuf_desc, DMA_TO_DEVICE);
 }
@@ -756,6 +903,8 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
 {
 	struct smc_link_group *lgr = conn->lgr;
 
+	if (!conn->lgr || conn->lgr->is_smcd)
+		return;
 	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->rmb_desc, DMA_FROM_DEVICE);
 }
@@ -764,6 +913,8 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
 {
 	struct smc_link_group *lgr = conn->lgr;
 
+	if (!conn->lgr || conn->lgr->is_smcd)
+		return;
 	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->rmb_desc, DMA_FROM_DEVICE);
 }
@@ -774,16 +925,16 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
  * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
  * extra RMB for every connection in a link group
  */
-int smc_buf_create(struct smc_sock *smc)
+int smc_buf_create(struct smc_sock *smc, bool is_smcd)
 {
 	int rc;
 
 	/* create send buffer */
-	rc = __smc_buf_create(smc, false);
+	rc = __smc_buf_create(smc, is_smcd, false);
 	if (rc)
 		return rc;
 	/* create rmb */
-	rc = __smc_buf_create(smc, true);
+	rc = __smc_buf_create(smc, is_smcd, true);
 	if (rc)
 		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
 	return rc;
@@ -865,7 +1016,14 @@ void smc_core_exit(void)
 	spin_unlock_bh(&smc_lgr_list.lock);
 	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
 		list_del_init(&lgr->list);
-		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+		if (!lgr->is_smcd) {
+			struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
+			if (lnk->state == SMC_LNK_ACTIVE)
+				smc_llc_send_delete_link(lnk, SMC_LLC_REQ,
+							 false);
+			smc_llc_link_inactive(lnk);
+		}
 		cancel_delayed_work_sync(&lgr->free_work);
 		smc_lgr_free(lgr);	/* free link group */
 	}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 93cb3523bf50..c156674733c9 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -34,7 +34,8 @@ enum smc_lgr_role { /* possible roles of a link group */
 enum smc_link_state {			/* possible states of a link */
 	SMC_LNK_INACTIVE,	/* link is inactive */
 	SMC_LNK_ACTIVATING,	/* link is being activated */
-	SMC_LNK_ACTIVE		/* link is active */
+	SMC_LNK_ACTIVE,		/* link is active */
+	SMC_LNK_DELETING,	/* link is being deleted */
 };
 
 #define SMC_WR_BUF_SIZE		48	/* size of work request buffer */
@@ -84,14 +85,15 @@ struct smc_link {
 	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
 	enum smc_wr_reg_state	wr_reg_state;	/* state of wr_reg request */
 
-	union ib_gid		gid;		/* gid matching used vlan id */
+	u8			gid[SMC_GID_SIZE];/* gid matching used vlan id*/
+	u8			sgid_index;	/* gid index for vlan id */
 	u32			peer_qpn;	/* QP number of peer */
 	enum ib_mtu		path_mtu;	/* used mtu */
 	enum ib_mtu		peer_mtu;	/* mtu size of peer */
 	u32			psn_initial;	/* QP tx initial packet seqno */
 	u32			peer_psn;	/* QP rx initial packet seqno */
 	u8			peer_mac[ETH_ALEN];	/* = gid[8:10||13:15] */
-	u8			peer_gid[sizeof(union ib_gid)];	/* gid of peer*/
+	u8			peer_gid[SMC_GID_SIZE];	/* gid of peer*/
 	u8			link_id;	/* unique # within link group */
 
 	enum smc_link_state	state;		/* state of link */
@@ -124,15 +126,28 @@ struct smc_buf_desc {
 	void			*cpu_addr;	/* virtual address of buffer */
 	struct page		*pages;
 	int			len;		/* length of buffer */
-	struct sg_table		sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */
-	struct ib_mr		*mr_rx[SMC_LINKS_PER_LGR_MAX];
-						/* for rmb only: memory region
-						 * incl. rkey provided to peer
-						 */
-	u32			order;		/* allocation order */
 	u32			used;		/* currently used / unused */
 	u8			reused	: 1;	/* new created / reused */
 	u8			regerr	: 1;	/* err during registration */
+	union {
+		struct { /* SMC-R */
+			struct sg_table	sgt[SMC_LINKS_PER_LGR_MAX];
+					/* virtual buffer */
+			struct ib_mr	*mr_rx[SMC_LINKS_PER_LGR_MAX];
+					/* for rmb only: memory region
+					 * incl. rkey provided to peer
+					 */
+			u32		order;	/* allocation order */
+		};
+		struct { /* SMC-D */
+			unsigned short	sba_idx;
+					/* SBA index number */
+			u64		token;
+					/* DMB token number */
+			dma_addr_t	dma_addr;
+					/* DMA address */
+		};
+	};
 };
 
 struct smc_rtoken {				/* address/key of remote RMB */
@@ -148,12 +163,10 @@ struct smc_rtoken { /* address/key of remote RMB */
  * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
  */
 
+struct smcd_dev;
+
 struct smc_link_group {
 	struct list_head	list;
-	enum smc_lgr_role	role;		/* client or server */
-	struct smc_link		lnk[SMC_LINKS_PER_LGR_MAX];	/* smc link */
-	char			peer_systemid[SMC_SYSTEMID_LEN];
-						/* unique system_id of peer */
 	struct rb_root		conns_all;	/* connection tree */
 	rwlock_t		conns_lock;	/* protects conns_all */
 	unsigned int		conns_num;	/* current # of connections */
@@ -163,17 +176,34 @@ struct smc_link_group {
 	rwlock_t		sndbufs_lock;	/* protects tx buffers */
 	struct list_head	rmbs[SMC_RMBE_SIZES];	/* rx buffers */
 	rwlock_t		rmbs_lock;	/* protects rx buffers */
-	struct smc_rtoken	rtokens[SMC_RMBS_PER_LGR_MAX]
-				       [SMC_LINKS_PER_LGR_MAX];
-						/* remote addr/key pairs */
-	unsigned long		rtokens_used_mask[BITS_TO_LONGS(
-							SMC_RMBS_PER_LGR_MAX)];
-						/* used rtoken elements */
 
 	u8			id[SMC_LGR_ID_SIZE];	/* unique lgr id */
 	struct delayed_work	free_work;	/* delayed freeing of an lgr */
 	u8			sync_err : 1;	/* lgr no longer fits to peer */
 	u8			terminating : 1;/* lgr is terminating */
+
+	bool			is_smcd;	/* SMC-R or SMC-D */
+	union {
+		struct { /* SMC-R */
+			enum smc_lgr_role	role;
+						/* client or server */
+			struct smc_link		lnk[SMC_LINKS_PER_LGR_MAX];
+						/* smc link */
+			char			peer_systemid[SMC_SYSTEMID_LEN];
+						/* unique system_id of peer */
+			struct smc_rtoken	rtokens[SMC_RMBS_PER_LGR_MAX]
+						[SMC_LINKS_PER_LGR_MAX];
+						/* remote addr/key pairs */
+			DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX);
+						/* used rtoken elements */
+		};
+		struct { /* SMC-D */
+			u64			peer_gid;
+						/* Peer GID (remote) */
+			struct smcd_dev		*smcd;
+						/* ISM device for VLAN reg. */
+		};
+	};
 };
 
 /* Find the connection associated with the given alert token in the link group.
@@ -217,7 +247,8 @@ void smc_lgr_free(struct smc_link_group *lgr);
 void smc_lgr_forget(struct smc_link_group *lgr);
 void smc_lgr_terminate(struct smc_link_group *lgr);
 void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
-int smc_buf_create(struct smc_sock *smc);
+void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid);
+int smc_buf_create(struct smc_sock *smc, bool is_smcd);
 int smc_uncompress_bufsize(u8 compressed);
 int smc_rmb_rtoken_handling(struct smc_connection *conn,
			    struct smc_clc_msg_accept_confirm *clc);
@@ -227,9 +258,19 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
 void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
 void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
 void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
+int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id);
+
 void smc_conn_free(struct smc_connection *conn);
-int smc_conn_create(struct smc_sock *smc,
+int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
 		    struct smc_ib_device *smcibdev, u8 ibport,
-		    struct smc_clc_msg_local *lcl, int srv_first_contact);
+		    struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
+		    u64 peer_gid);
+void smcd_conn_free(struct smc_connection *conn);
+void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
 void smc_core_exit(void);
+
+static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
+{
+	return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+}
 #endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 839354402215..dbf64a93d68a 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -79,6 +79,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
 			   struct nlattr *bc)
 {
 	struct smc_sock *smc = smc_sk(sk);
+	struct smc_diag_fallback fallback;
 	struct user_namespace *user_ns;
 	struct smc_diag_msg *r;
 	struct nlmsghdr *nlh;
@@ -91,11 +92,21 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
91 r = nlmsg_data(nlh); 92 r = nlmsg_data(nlh);
92 smc_diag_msg_common_fill(r, sk); 93 smc_diag_msg_common_fill(r, sk);
93 r->diag_state = sk->sk_state; 94 r->diag_state = sk->sk_state;
94 r->diag_fallback = smc->use_fallback; 95 if (smc->use_fallback)
96 r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP;
97 else if (smc->conn.lgr && smc->conn.lgr->is_smcd)
98 r->diag_mode = SMC_DIAG_MODE_SMCD;
99 else
100 r->diag_mode = SMC_DIAG_MODE_SMCR;
95 user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); 101 user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
96 if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) 102 if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
97 goto errout; 103 goto errout;
98 104
105 fallback.reason = smc->fallback_rsn;
106 fallback.peer_diagnosis = smc->peer_diagnosis;
107 if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0)
108 goto errout;
109
99 if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && 110 if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) &&
100 smc->conn.alert_token_local) { 111 smc->conn.alert_token_local) {
101 struct smc_connection *conn = &smc->conn; 112 struct smc_connection *conn = &smc->conn;
@@ -136,7 +147,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
136 goto errout; 147 goto errout;
137 } 148 }
138 149
139 if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr && 150 if (smc->conn.lgr && !smc->conn.lgr->is_smcd &&
151 (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) &&
140 !list_empty(&smc->conn.lgr->list)) { 152 !list_empty(&smc->conn.lgr->list)) {
141 struct smc_diag_lgrinfo linfo = { 153 struct smc_diag_lgrinfo linfo = {
142 .role = smc->conn.lgr->role, 154 .role = smc->conn.lgr->role,
@@ -148,13 +160,28 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
148 smc->conn.lgr->lnk[0].smcibdev->ibdev->name, 160 smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
149 sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name)); 161 sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
150 smc_gid_be16_convert(linfo.lnk[0].gid, 162 smc_gid_be16_convert(linfo.lnk[0].gid,
151 smc->conn.lgr->lnk[0].gid.raw); 163 smc->conn.lgr->lnk[0].gid);
152 smc_gid_be16_convert(linfo.lnk[0].peer_gid, 164 smc_gid_be16_convert(linfo.lnk[0].peer_gid,
153 smc->conn.lgr->lnk[0].peer_gid); 165 smc->conn.lgr->lnk[0].peer_gid);
154 166
155 if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) 167 if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
156 goto errout; 168 goto errout;
157 } 169 }
170 if (smc->conn.lgr && smc->conn.lgr->is_smcd &&
171 (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
172 !list_empty(&smc->conn.lgr->list)) {
173 struct smc_connection *conn = &smc->conn;
174 struct smcd_diag_dmbinfo dinfo = {
175 .linkid = *((u32 *)conn->lgr->id),
176 .peer_gid = conn->lgr->peer_gid,
177 .my_gid = conn->lgr->smcd->local_gid,
178 .token = conn->rmb_desc->token,
179 .peer_token = conn->peer_token
180 };
181
182 if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0)
183 goto errout;
184 }
158 185
159 nlmsg_end(skb, nlh); 186 nlmsg_end(skb, nlh);
160 return 0; 187 return 0;
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 0eed7ab9f28b..e519ef29c0ff 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -16,6 +16,7 @@
16#include <linux/workqueue.h> 16#include <linux/workqueue.h>
17#include <linux/scatterlist.h> 17#include <linux/scatterlist.h>
18#include <rdma/ib_verbs.h> 18#include <rdma/ib_verbs.h>
19#include <rdma/ib_cache.h>
19 20
20#include "smc_pnet.h" 21#include "smc_pnet.h"
21#include "smc_ib.h" 22#include "smc_ib.h"
@@ -68,7 +69,7 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
68 qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); 69 qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
69 qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; 70 qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
70 rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport); 71 rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
71 rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, 0, 1, 0); 72 rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0);
72 rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid); 73 rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
73 memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, 74 memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
74 sizeof(lnk->peer_mac)); 75 sizeof(lnk->peer_mac));
@@ -112,8 +113,7 @@ int smc_ib_modify_qp_reset(struct smc_link *lnk)
112 113
113int smc_ib_ready_link(struct smc_link *lnk) 114int smc_ib_ready_link(struct smc_link *lnk)
114{ 115{
115 struct smc_link_group *lgr = 116 struct smc_link_group *lgr = smc_get_lgr(lnk);
116 container_of(lnk, struct smc_link_group, lnk[0]);
117 int rc = 0; 117 int rc = 0;
118 118
119 rc = smc_ib_modify_qp_init(lnk); 119 rc = smc_ib_modify_qp_init(lnk);
@@ -143,6 +143,95 @@ out:
143 return rc; 143 return rc;
144} 144}
145 145
146static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
147{
148 const struct ib_gid_attr *attr;
149 int rc = 0;
150
151 attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
152 if (IS_ERR(attr))
153 return -ENODEV;
154
155 if (attr->ndev)
156 memcpy(smcibdev->mac[ibport - 1], attr->ndev->dev_addr,
157 ETH_ALEN);
158 else
159 rc = -ENODEV;
160
161 rdma_put_gid_attr(attr);
162 return rc;
163}
164
165/* Create an identifier unique for this instance of SMC-R.
166 * The MAC-address of the first active registered IB device
167 * plus a random 2-byte number is used to create this identifier.
168 * This name is delivered to the peer during connection initialization.
169 */
170static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
171 u8 ibport)
172{
173 memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
174 sizeof(smcibdev->mac[ibport - 1]));
175 get_random_bytes(&local_systemid[0], 2);
176}
177
178bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
179{
180 return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
181}
182
183/* determine the gid for an ib-device port and vlan id */
184int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
185 unsigned short vlan_id, u8 gid[], u8 *sgid_index)
186{
187 const struct ib_gid_attr *attr;
188 int i;
189
190 for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
191 attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
192 if (IS_ERR(attr))
193 continue;
194
195 if (attr->ndev &&
196 ((!vlan_id && !is_vlan_dev(attr->ndev)) ||
197 (vlan_id && is_vlan_dev(attr->ndev) &&
198 vlan_dev_vlan_id(attr->ndev) == vlan_id)) &&
199 attr->gid_type == IB_GID_TYPE_ROCE) {
200 if (gid)
201 memcpy(gid, &attr->gid, SMC_GID_SIZE);
202 if (sgid_index)
203 *sgid_index = attr->index;
204 rdma_put_gid_attr(attr);
205 return 0;
206 }
207 rdma_put_gid_attr(attr);
208 }
209 return -ENODEV;
210}
211
212static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
213{
214 int rc;
215
216 memset(&smcibdev->pattr[ibport - 1], 0,
217 sizeof(smcibdev->pattr[ibport - 1]));
218 rc = ib_query_port(smcibdev->ibdev, ibport,
219 &smcibdev->pattr[ibport - 1]);
220 if (rc)
221 goto out;
222 /* the SMC protocol requires specification of the RoCE MAC address */
223 rc = smc_ib_fill_mac(smcibdev, ibport);
224 if (rc)
225 goto out;
226 if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
227 sizeof(local_systemid)) &&
228 smc_ib_port_active(smcibdev, ibport))
229 /* create unique system identifier */
230 smc_ib_define_local_systemid(smcibdev, ibport);
231out:
232 return rc;
233}
234
146/* process context wrapper for might_sleep smc_ib_remember_port_attr */ 235/* process context wrapper for might_sleep smc_ib_remember_port_attr */
147static void smc_ib_port_event_work(struct work_struct *work) 236static void smc_ib_port_event_work(struct work_struct *work)
148{ 237{
@@ -370,62 +459,6 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
370 buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; 459 buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0;
371} 460}
372 461
373static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
374{
375 struct ib_gid_attr gattr;
376 int rc;
377
378 rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
379 &smcibdev->gid[ibport - 1], &gattr);
380 if (rc || !gattr.ndev)
381 return -ENODEV;
382
383 memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN);
384 dev_put(gattr.ndev);
385 return 0;
386}
387
388/* Create an identifier unique for this instance of SMC-R.
389 * The MAC-address of the first active registered IB device
390 * plus a random 2-byte number is used to create this identifier.
391 * This name is delivered to the peer during connection initialization.
392 */
393static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
394 u8 ibport)
395{
396 memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
397 sizeof(smcibdev->mac[ibport - 1]));
398 get_random_bytes(&local_systemid[0], 2);
399}
400
401bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
402{
403 return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
404}
405
406int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
407{
408 int rc;
409
410 memset(&smcibdev->pattr[ibport - 1], 0,
411 sizeof(smcibdev->pattr[ibport - 1]));
412 rc = ib_query_port(smcibdev->ibdev, ibport,
413 &smcibdev->pattr[ibport - 1]);
414 if (rc)
415 goto out;
416 /* the SMC protocol requires specification of the RoCE MAC address */
417 rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
418 if (rc)
419 goto out;
420 if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
421 sizeof(local_systemid)) &&
422 smc_ib_port_active(smcibdev, ibport))
423 /* create unique system identifier */
424 smc_ib_define_local_systemid(smcibdev, ibport);
425out:
426 return rc;
427}
428
429long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) 462long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
430{ 463{
431 struct ib_cq_init_attr cqattr = { 464 struct ib_cq_init_attr cqattr = {
@@ -454,9 +487,6 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
454 smcibdev->roce_cq_recv = NULL; 487 smcibdev->roce_cq_recv = NULL;
455 goto err; 488 goto err;
456 } 489 }
457 INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
458 smc_ib_global_event_handler);
459 ib_register_event_handler(&smcibdev->event_handler);
460 smc_wr_add_dev(smcibdev); 490 smc_wr_add_dev(smcibdev);
461 smcibdev->initialized = 1; 491 smcibdev->initialized = 1;
462 return rc; 492 return rc;
@@ -472,7 +502,6 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
472 return; 502 return;
473 smcibdev->initialized = 0; 503 smcibdev->initialized = 0;
474 smc_wr_remove_dev(smcibdev); 504 smc_wr_remove_dev(smcibdev);
475 ib_unregister_event_handler(&smcibdev->event_handler);
476 ib_destroy_cq(smcibdev->roce_cq_recv); 505 ib_destroy_cq(smcibdev->roce_cq_recv);
477 ib_destroy_cq(smcibdev->roce_cq_send); 506 ib_destroy_cq(smcibdev->roce_cq_send);
478} 507}
@@ -483,6 +512,8 @@ static struct ib_client smc_ib_client;
483static void smc_ib_add_dev(struct ib_device *ibdev) 512static void smc_ib_add_dev(struct ib_device *ibdev)
484{ 513{
485 struct smc_ib_device *smcibdev; 514 struct smc_ib_device *smcibdev;
515 u8 port_cnt;
516 int i;
486 517
487 if (ibdev->node_type != RDMA_NODE_IB_CA) 518 if (ibdev->node_type != RDMA_NODE_IB_CA)
488 return; 519 return;
@@ -498,6 +529,21 @@ static void smc_ib_add_dev(struct ib_device *ibdev)
498 list_add_tail(&smcibdev->list, &smc_ib_devices.list); 529 list_add_tail(&smcibdev->list, &smc_ib_devices.list);
499 spin_unlock(&smc_ib_devices.lock); 530 spin_unlock(&smc_ib_devices.lock);
500 ib_set_client_data(ibdev, &smc_ib_client, smcibdev); 531 ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
532 INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
533 smc_ib_global_event_handler);
534 ib_register_event_handler(&smcibdev->event_handler);
535
536 /* trigger reading of the port attributes */
537 port_cnt = smcibdev->ibdev->phys_port_cnt;
538 for (i = 0;
539 i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
540 i++) {
541 set_bit(i, &smcibdev->port_event_mask);
542 /* determine pnetids of the port */
543 smc_pnetid_by_dev_port(ibdev->dev.parent, i,
544 smcibdev->pnetid[i]);
545 }
546 schedule_work(&smcibdev->port_event_work);
501} 547}
502 548
503/* callback function for ib_register_client() */ 549/* callback function for ib_register_client() */
@@ -512,6 +558,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
512 spin_unlock(&smc_ib_devices.lock); 558 spin_unlock(&smc_ib_devices.lock);
513 smc_pnet_remove_by_ibdev(smcibdev); 559 smc_pnet_remove_by_ibdev(smcibdev);
514 smc_ib_cleanup_per_ibdev(smcibdev); 560 smc_ib_cleanup_per_ibdev(smcibdev);
561 ib_unregister_event_handler(&smcibdev->event_handler);
515 kfree(smcibdev); 562 kfree(smcibdev);
516} 563}
517 564
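smc_ib_determine_gid() is the part of this patch that makes SMC-R link setup VLAN-aware: it walks the port's GID table with rdma_get_gid_attr()/rdma_put_gid_attr() and hands back both the RoCE GID and its table index, which smc_ib_modify_qp_rtr() now passes to rdma_ah_set_grh(). A caller-side sketch, assuming the link carries the gid and sgid_index fields used elsewhere in the patch; example_* is an illustrative name:

static int example_pick_link_gid(struct smc_ib_device *smcibdev, u8 ibport,
				 unsigned short vlan_id,
				 struct smc_link *lnk)
{
	u8 gid[SMC_GID_SIZE];
	u8 sgid_index;
	int rc;

	rc = smc_ib_determine_gid(smcibdev, ibport, vlan_id, gid,
				  &sgid_index);
	if (rc)
		return rc;	/* no active RoCE GID for this VLAN */

	memcpy(lnk->gid, gid, SMC_GID_SIZE);	/* sent in CONFIRM/ADD LINK */
	lnk->sgid_index = sgid_index;		/* consumed by rdma_ah_set_grh() */
	return 0;
}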
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index e90630dadf8e..bac7fd65a4c0 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -15,6 +15,7 @@
15#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/if_ether.h> 16#include <linux/if_ether.h>
17#include <rdma/ib_verbs.h> 17#include <rdma/ib_verbs.h>
18#include <net/smc.h>
18 19
19#define SMC_MAX_PORTS 2 /* Max # of ports */ 20#define SMC_MAX_PORTS 2 /* Max # of ports */
20#define SMC_GID_SIZE sizeof(union ib_gid) 21#define SMC_GID_SIZE sizeof(union ib_gid)
@@ -39,7 +40,8 @@ struct smc_ib_device { /* ib-device infos for smc */
39 struct tasklet_struct recv_tasklet; /* called by recv cq handler */ 40 struct tasklet_struct recv_tasklet; /* called by recv cq handler */
40 char mac[SMC_MAX_PORTS][ETH_ALEN]; 41 char mac[SMC_MAX_PORTS][ETH_ALEN];
41 /* mac address per port*/ 42 /* mac address per port*/
42 union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */ 43 u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN];
44 /* pnetid per port */
43 u8 initialized : 1; /* ib dev CQ, evthdl done */ 45 u8 initialized : 1; /* ib dev CQ, evthdl done */
44 struct work_struct port_event_work; 46 struct work_struct port_event_work;
45 unsigned long port_event_mask; 47 unsigned long port_event_mask;
@@ -51,7 +53,6 @@ struct smc_link;
51int smc_ib_register_client(void) __init; 53int smc_ib_register_client(void) __init;
52void smc_ib_unregister_client(void); 54void smc_ib_unregister_client(void);
53bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); 55bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
54int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
55int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, 56int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev,
56 struct smc_buf_desc *buf_slot, 57 struct smc_buf_desc *buf_slot,
57 enum dma_data_direction data_direction); 58 enum dma_data_direction data_direction);
@@ -75,4 +76,6 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev,
75void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, 76void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
76 struct smc_buf_desc *buf_slot, 77 struct smc_buf_desc *buf_slot,
77 enum dma_data_direction data_direction); 78 enum dma_data_direction data_direction);
79int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
80 unsigned short vlan_id, u8 gid[], u8 *sgid_index);
78#endif 81#endif
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
new file mode 100644
index 000000000000..e36f21ce7252
--- /dev/null
+++ b/net/smc/smc_ism.c
@@ -0,0 +1,348 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Shared Memory Communications Direct over ISM devices (SMC-D)
3 *
4 * Functions for ISM device.
5 *
6 * Copyright IBM Corp. 2018
7 */
8
9#include <linux/spinlock.h>
10#include <linux/slab.h>
11#include <asm/page.h>
12
13#include "smc.h"
14#include "smc_core.h"
15#include "smc_ism.h"
16#include "smc_pnet.h"
17
18struct smcd_dev_list smcd_dev_list = {
19 .list = LIST_HEAD_INIT(smcd_dev_list.list),
20 .lock = __SPIN_LOCK_UNLOCKED(smcd_dev_list.lock)
21};
22
23/* Test if an ISM communication is possible. */
24int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd)
25{
26 return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0,
27 vlan_id);
28}
29
30int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos,
31 void *data, size_t len)
32{
33 int rc;
34
35 rc = smcd->ops->move_data(smcd, pos->token, pos->index, pos->signal,
36 pos->offset, data, len);
37
38 return rc < 0 ? rc : 0;
39}
40
41/* Set a connection using this DMBE. */
42void smc_ism_set_conn(struct smc_connection *conn)
43{
44 unsigned long flags;
45
46 spin_lock_irqsave(&conn->lgr->smcd->lock, flags);
47 conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn;
48 spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags);
49}
50
51/* Unset a connection using this DMBE. */
52void smc_ism_unset_conn(struct smc_connection *conn)
53{
54 unsigned long flags;
55
56 if (!conn->rmb_desc)
57 return;
58
59 spin_lock_irqsave(&conn->lgr->smcd->lock, flags);
60 conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL;
61 spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags);
62}
63
64/* Register a VLAN identifier with the ISM device. Use a reference count
65 * and add a VLAN identifier only when the first DMB using this VLAN is
66 * registered.
67 */
68int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid)
69{
70 struct smc_ism_vlanid *new_vlan, *vlan;
71 unsigned long flags;
72 int rc = 0;
73
74 if (!vlanid) /* No valid vlan id */
75 return -EINVAL;
76
77 /* create new vlan entry, in case we need it */
78 new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL);
79 if (!new_vlan)
80 return -ENOMEM;
81 new_vlan->vlanid = vlanid;
82 refcount_set(&new_vlan->refcnt, 1);
83
84 /* if there is an existing entry, increase count and return */
85 spin_lock_irqsave(&smcd->lock, flags);
86 list_for_each_entry(vlan, &smcd->vlan, list) {
87 if (vlan->vlanid == vlanid) {
88 refcount_inc(&vlan->refcnt);
89 kfree(new_vlan);
90 goto out;
91 }
92 }
93
94 /* no existing entry found.
95 * add new entry to device; might fail, e.g., if HW limit reached
96 */
97 if (smcd->ops->add_vlan_id(smcd, vlanid)) {
98 kfree(new_vlan);
99 rc = -EIO;
100 goto out;
101 }
102 list_add_tail(&new_vlan->list, &smcd->vlan);
103out:
104 spin_unlock_irqrestore(&smcd->lock, flags);
105 return rc;
106}
107
108/* Unregister a VLAN identifier with the ISM device. Use a reference count
109 * and remove a VLAN identifier only when the last DMB using this VLAN is
110 * unregistered.
111 */
112int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid)
113{
114 struct smc_ism_vlanid *vlan;
115 unsigned long flags;
116 bool found = false;
117 int rc = 0;
118
119 if (!vlanid) /* No valid vlan id */
120 return -EINVAL;
121
122 spin_lock_irqsave(&smcd->lock, flags);
123 list_for_each_entry(vlan, &smcd->vlan, list) {
124 if (vlan->vlanid == vlanid) {
125 if (!refcount_dec_and_test(&vlan->refcnt))
126 goto out;
127 found = true;
128 break;
129 }
130 }
131 if (!found) {
132 rc = -ENOENT;
133 goto out; /* VLAN id not in table */
134 }
135
136 /* Found and the last reference just gone */
137 if (smcd->ops->del_vlan_id(smcd, vlanid))
138 rc = -EIO;
139 list_del(&vlan->list);
140 kfree(vlan);
141out:
142 spin_unlock_irqrestore(&smcd->lock, flags);
143 return rc;
144}
145
146int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
147{
148 struct smcd_dmb dmb;
149
150 memset(&dmb, 0, sizeof(dmb));
151 dmb.dmb_tok = dmb_desc->token;
152 dmb.sba_idx = dmb_desc->sba_idx;
153 dmb.cpu_addr = dmb_desc->cpu_addr;
154 dmb.dma_addr = dmb_desc->dma_addr;
155 dmb.dmb_len = dmb_desc->len;
156 return smcd->ops->unregister_dmb(smcd, &dmb);
157}
158
159int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
160 struct smc_buf_desc *dmb_desc)
161{
162 struct smcd_dmb dmb;
163 int rc;
164
165 memset(&dmb, 0, sizeof(dmb));
166 dmb.dmb_len = dmb_len;
167 dmb.sba_idx = dmb_desc->sba_idx;
168 dmb.vlan_id = lgr->vlan_id;
169 dmb.rgid = lgr->peer_gid;
170 rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb);
171 if (!rc) {
172 dmb_desc->sba_idx = dmb.sba_idx;
173 dmb_desc->token = dmb.dmb_tok;
174 dmb_desc->cpu_addr = dmb.cpu_addr;
175 dmb_desc->dma_addr = dmb.dma_addr;
176 dmb_desc->len = dmb.dmb_len;
177 }
178 return rc;
179}
180
181struct smc_ism_event_work {
182 struct work_struct work;
183 struct smcd_dev *smcd;
184 struct smcd_event event;
185};
186
187#define ISM_EVENT_REQUEST 0x0001
188#define ISM_EVENT_RESPONSE 0x0002
189#define ISM_EVENT_REQUEST_IR 0x00000001
190#define ISM_EVENT_CODE_TESTLINK 0x83
191
192static void smcd_handle_sw_event(struct smc_ism_event_work *wrk)
193{
194 union {
195 u64 info;
196 struct {
197 u32 uid;
198 unsigned short vlanid;
199 u16 code;
200 };
201 } ev_info;
202
203 switch (wrk->event.code) {
204 case ISM_EVENT_CODE_TESTLINK: /* Activity timer */
205 ev_info.info = wrk->event.info;
206 if (ev_info.code == ISM_EVENT_REQUEST) {
207 ev_info.code = ISM_EVENT_RESPONSE;
208 wrk->smcd->ops->signal_event(wrk->smcd,
209 wrk->event.tok,
210 ISM_EVENT_REQUEST_IR,
211 ISM_EVENT_CODE_TESTLINK,
212 ev_info.info);
213 }
214 break;
215 }
216}
217
218/* worker for SMC-D events */
219static void smc_ism_event_work(struct work_struct *work)
220{
221 struct smc_ism_event_work *wrk =
222 container_of(work, struct smc_ism_event_work, work);
223
224 switch (wrk->event.type) {
225 case ISM_EVENT_GID: /* GID event, token is peer GID */
226 smc_smcd_terminate(wrk->smcd, wrk->event.tok);
227 break;
228 case ISM_EVENT_DMB:
229 break;
230 case ISM_EVENT_SWR: /* Software defined event */
231 smcd_handle_sw_event(wrk);
232 break;
233 }
234 kfree(wrk);
235}
236
237static void smcd_release(struct device *dev)
238{
239 struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev);
240
241 kfree(smcd->conn);
242 kfree(smcd);
243}
244
245struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
246 const struct smcd_ops *ops, int max_dmbs)
247{
248 struct smcd_dev *smcd;
249
250 smcd = kzalloc(sizeof(*smcd), GFP_KERNEL);
251 if (!smcd)
252 return NULL;
253 smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *),
254 GFP_KERNEL);
255 if (!smcd->conn) {
256 kfree(smcd);
257 return NULL;
258 }
259
260 smcd->dev.parent = parent;
261 smcd->dev.release = smcd_release;
262 device_initialize(&smcd->dev);
263 dev_set_name(&smcd->dev, name);
264 smcd->ops = ops;
265 smc_pnetid_by_dev_port(parent, 0, smcd->pnetid);
266
267 spin_lock_init(&smcd->lock);
268 INIT_LIST_HEAD(&smcd->vlan);
269 smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
270 WQ_MEM_RECLAIM, name);
271 return smcd;
272}
273EXPORT_SYMBOL_GPL(smcd_alloc_dev);
274
275int smcd_register_dev(struct smcd_dev *smcd)
276{
277 spin_lock(&smcd_dev_list.lock);
278 list_add_tail(&smcd->list, &smcd_dev_list.list);
279 spin_unlock(&smcd_dev_list.lock);
280
281 return device_add(&smcd->dev);
282}
283EXPORT_SYMBOL_GPL(smcd_register_dev);
284
285void smcd_unregister_dev(struct smcd_dev *smcd)
286{
287 spin_lock(&smcd_dev_list.lock);
288 list_del(&smcd->list);
289 spin_unlock(&smcd_dev_list.lock);
290 flush_workqueue(smcd->event_wq);
291 destroy_workqueue(smcd->event_wq);
292 smc_smcd_terminate(smcd, 0);
293
294 device_del(&smcd->dev);
295}
296EXPORT_SYMBOL_GPL(smcd_unregister_dev);
297
298void smcd_free_dev(struct smcd_dev *smcd)
299{
300 put_device(&smcd->dev);
301}
302EXPORT_SYMBOL_GPL(smcd_free_dev);
303
304/* SMCD Device event handler. Called from ISM device interrupt handler.
305 * Parameters are smcd device pointer,
306 * - event->type (0 --> DMB, 1 --> GID),
307 * - event->code (event code),
308 * - event->tok (either DMB token when event type 0, or GID when event type 1)
309 * - event->time (time of day)
310 * - event->info (debug info).
311 *
312 * Context:
313 * - Function called in IRQ context from ISM device driver event handler.
314 */
315void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event)
316{
317 struct smc_ism_event_work *wrk;
318
319 /* copy event to event work queue, and let it be handled there */
320 wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC);
321 if (!wrk)
322 return;
323 INIT_WORK(&wrk->work, smc_ism_event_work);
324 wrk->smcd = smcd;
325 wrk->event = *event;
326 queue_work(smcd->event_wq, &wrk->work);
327}
328EXPORT_SYMBOL_GPL(smcd_handle_event);
329
330/* SMCD Device interrupt handler. Called from ISM device interrupt handler.
331 * Parameters are smcd device pointer and DMB number. Find the connection and
332 * schedule the tasklet for this connection.
333 *
334 * Context:
335 * - Function called in IRQ context from ISM device driver IRQ handler.
336 */
337void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
338{
339 struct smc_connection *conn = NULL;
340 unsigned long flags;
341
342 spin_lock_irqsave(&smcd->lock, flags);
343 conn = smcd->conn[dmbno];
344 if (conn)
345 tasklet_schedule(&conn->rx_tsklet);
346 spin_unlock_irqrestore(&smcd->lock, flags);
347}
348EXPORT_SYMBOL_GPL(smcd_handle_irq);
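Taken together, the exported smcd_* symbols above define the contract between an ISM device driver and the new SMC-D core. A hypothetical driver skeleton; the my_* names, the "ism0" device name and the 1024 DMB limit are placeholders, not values mandated by the patch:

static const struct smcd_ops my_ism_ops;	/* query_remote_gid, move_data, ... */
static struct smcd_dev *my_smcd;

static int my_ism_probe(struct device *parent)
{
	my_smcd = smcd_alloc_dev(parent, "ism0", &my_ism_ops, 1024);
	if (!my_smcd)
		return -ENOMEM;
	return smcd_register_dev(my_smcd);	/* device is now eligible for SMC-D */
}

static void my_ism_interrupt(unsigned int dmbno, struct smcd_event *ev)
{
	if (ev)
		smcd_handle_event(my_smcd, ev);	/* deferred to smcd->event_wq */
	else
		smcd_handle_irq(my_smcd, dmbno);	/* schedules the connection's RX tasklet */
}

static void my_ism_remove(void)
{
	smcd_unregister_dev(my_smcd);
	smcd_free_dev(my_smcd);			/* drops the device reference */
}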
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
new file mode 100644
index 000000000000..aee45b860b79
--- /dev/null
+++ b/net/smc/smc_ism.h
@@ -0,0 +1,48 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/* Shared Memory Communications Direct over ISM devices (SMC-D)
3 *
4 * SMC-D ISM device structure definitions.
5 *
6 * Copyright IBM Corp. 2018
7 */
8
9#ifndef SMCD_ISM_H
10#define SMCD_ISM_H
11
12#include <linux/uio.h>
13
14#include "smc.h"
15
16struct smcd_dev_list { /* List of SMCD devices */
17 struct list_head list;
18 spinlock_t lock; /* Protects list of devices */
19};
20
21extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */
22
23struct smc_ism_vlanid { /* VLAN id set on ISM device */
24 struct list_head list;
25 unsigned short vlanid; /* Vlan id */
26 refcount_t refcnt; /* Reference count */
27};
28
29struct smc_ism_position { /* ISM device position to write to */
30 u64 token; /* Token of DMB */
31 u32 offset; /* Offset into DMBE */
32 u8 index; /* Index of DMBE */
33 u8 signal; /* Generate interrupt on owner side */
34};
35
36struct smcd_dev;
37
38int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev);
39void smc_ism_set_conn(struct smc_connection *conn);
40void smc_ism_unset_conn(struct smc_connection *conn);
41int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id);
42int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id);
43int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size,
44 struct smc_buf_desc *dmb_desc);
45int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
46int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos,
47 void *data, size_t len);
48#endif
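struct smc_ism_position is the per-write descriptor consumed by smc_ism_write(); the smcd_tx_ism_write() helper added to smc_tx.c further down fills it in the same way. A condensed sketch, with example_ism_send() as an illustrative name and the offset handling simplified compared to the real helper:

static int example_ism_send(struct smc_connection *conn, void *data,
			    size_t len, u32 offset, int signal)
{
	struct smc_ism_position pos = {
		.token	= conn->peer_token,	/* DMB token of the peer */
		.index	= conn->peer_rmbe_idx,	/* DMBE inside that DMB */
		.offset	= offset,		/* byte offset inside the DMBE */
		.signal	= signal,		/* raise an interrupt at the owner */
	};

	return smc_ism_write(conn->lgr->smcd, &pos, data, len);
}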
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 5800a6b43d83..9c916c709ca7 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -182,12 +182,10 @@ static int smc_llc_add_pending_send(struct smc_link *link,
182} 182}
183 183
184/* high-level API to send LLC confirm link */ 184/* high-level API to send LLC confirm link */
185int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], 185int smc_llc_send_confirm_link(struct smc_link *link,
186 union ib_gid *gid,
187 enum smc_llc_reqresp reqresp) 186 enum smc_llc_reqresp reqresp)
188{ 187{
189 struct smc_link_group *lgr = container_of(link, struct smc_link_group, 188 struct smc_link_group *lgr = smc_get_lgr(link);
190 lnk[SMC_SINGLE_LINK]);
191 struct smc_llc_msg_confirm_link *confllc; 189 struct smc_llc_msg_confirm_link *confllc;
192 struct smc_wr_tx_pend_priv *pend; 190 struct smc_wr_tx_pend_priv *pend;
193 struct smc_wr_buf *wr_buf; 191 struct smc_wr_buf *wr_buf;
@@ -203,8 +201,9 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
203 confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC; 201 confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC;
204 if (reqresp == SMC_LLC_RESP) 202 if (reqresp == SMC_LLC_RESP)
205 confllc->hd.flags |= SMC_LLC_FLAG_RESP; 203 confllc->hd.flags |= SMC_LLC_FLAG_RESP;
206 memcpy(confllc->sender_mac, mac, ETH_ALEN); 204 memcpy(confllc->sender_mac, link->smcibdev->mac[link->ibport - 1],
207 memcpy(confllc->sender_gid, gid, SMC_GID_SIZE); 205 ETH_ALEN);
206 memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE);
208 hton24(confllc->sender_qp_num, link->roce_qp->qp_num); 207 hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
209 confllc->link_num = link->link_id; 208 confllc->link_num = link->link_id;
210 memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); 209 memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE);
@@ -241,8 +240,7 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link,
241 240
242/* prepare an add link message */ 241/* prepare an add link message */
243static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, 242static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc,
244 struct smc_link *link, u8 mac[], 243 struct smc_link *link, u8 mac[], u8 gid[],
245 union ib_gid *gid,
246 enum smc_llc_reqresp reqresp) 244 enum smc_llc_reqresp reqresp)
247{ 245{
248 memset(addllc, 0, sizeof(*addllc)); 246 memset(addllc, 0, sizeof(*addllc));
@@ -259,8 +257,7 @@ static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc,
259} 257}
260 258
261/* send ADD LINK request or response */ 259/* send ADD LINK request or response */
262int smc_llc_send_add_link(struct smc_link *link, u8 mac[], 260int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
263 union ib_gid *gid,
264 enum smc_llc_reqresp reqresp) 261 enum smc_llc_reqresp reqresp)
265{ 262{
266 struct smc_llc_msg_add_link *addllc; 263 struct smc_llc_msg_add_link *addllc;
@@ -281,7 +278,7 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
281/* prepare a delete link message */ 278/* prepare a delete link message */
282static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, 279static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc,
283 struct smc_link *link, 280 struct smc_link *link,
284 enum smc_llc_reqresp reqresp) 281 enum smc_llc_reqresp reqresp, bool orderly)
285{ 282{
286 memset(delllc, 0, sizeof(*delllc)); 283 memset(delllc, 0, sizeof(*delllc));
287 delllc->hd.common.type = SMC_LLC_DELETE_LINK; 284 delllc->hd.common.type = SMC_LLC_DELETE_LINK;
@@ -290,13 +287,14 @@ static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc,
290 delllc->hd.flags |= SMC_LLC_FLAG_RESP; 287 delllc->hd.flags |= SMC_LLC_FLAG_RESP;
291 /* DEL_LINK_ALL because only 1 link supported */ 288 /* DEL_LINK_ALL because only 1 link supported */
292 delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; 289 delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
293 delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; 290 if (orderly)
291 delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
294 delllc->link_num = link->link_id; 292 delllc->link_num = link->link_id;
295} 293}
296 294
297/* send DELETE LINK request or response */ 295/* send DELETE LINK request or response */
298int smc_llc_send_delete_link(struct smc_link *link, 296int smc_llc_send_delete_link(struct smc_link *link,
299 enum smc_llc_reqresp reqresp) 297 enum smc_llc_reqresp reqresp, bool orderly)
300{ 298{
301 struct smc_llc_msg_del_link *delllc; 299 struct smc_llc_msg_del_link *delllc;
302 struct smc_wr_tx_pend_priv *pend; 300 struct smc_wr_tx_pend_priv *pend;
@@ -307,7 +305,7 @@ int smc_llc_send_delete_link(struct smc_link *link,
307 if (rc) 305 if (rc)
308 return rc; 306 return rc;
309 delllc = (struct smc_llc_msg_del_link *)wr_buf; 307 delllc = (struct smc_llc_msg_del_link *)wr_buf;
310 smc_llc_prep_delete_link(delllc, link, reqresp); 308 smc_llc_prep_delete_link(delllc, link, reqresp, orderly);
311 /* send llc message */ 309 /* send llc message */
312 rc = smc_wr_tx_send(link, pend); 310 rc = smc_wr_tx_send(link, pend);
313 return rc; 311 return rc;
@@ -381,11 +379,9 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen)
381static void smc_llc_rx_confirm_link(struct smc_link *link, 379static void smc_llc_rx_confirm_link(struct smc_link *link,
382 struct smc_llc_msg_confirm_link *llc) 380 struct smc_llc_msg_confirm_link *llc)
383{ 381{
384 struct smc_link_group *lgr; 382 struct smc_link_group *lgr = smc_get_lgr(link);
385 int conf_rc; 383 int conf_rc;
386 384
387 lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
388
389 /* RMBE eyecatchers are not supported */ 385 /* RMBE eyecatchers are not supported */
390 if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) 386 if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)
391 conf_rc = 0; 387 conf_rc = 0;
@@ -411,8 +407,7 @@ static void smc_llc_rx_confirm_link(struct smc_link *link,
411static void smc_llc_rx_add_link(struct smc_link *link, 407static void smc_llc_rx_add_link(struct smc_link *link,
412 struct smc_llc_msg_add_link *llc) 408 struct smc_llc_msg_add_link *llc)
413{ 409{
414 struct smc_link_group *lgr = container_of(link, struct smc_link_group, 410 struct smc_link_group *lgr = smc_get_lgr(link);
415 lnk[SMC_SINGLE_LINK]);
416 411
417 if (llc->hd.flags & SMC_LLC_FLAG_RESP) { 412 if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
418 if (link->state == SMC_LNK_ACTIVATING) 413 if (link->state == SMC_LNK_ACTIVATING)
@@ -426,14 +421,12 @@ static void smc_llc_rx_add_link(struct smc_link *link,
426 if (lgr->role == SMC_SERV) { 421 if (lgr->role == SMC_SERV) {
427 smc_llc_prep_add_link(llc, link, 422 smc_llc_prep_add_link(llc, link,
428 link->smcibdev->mac[link->ibport - 1], 423 link->smcibdev->mac[link->ibport - 1],
429 &link->smcibdev->gid[link->ibport - 1], 424 link->gid, SMC_LLC_REQ);
430 SMC_LLC_REQ);
431 425
432 } else { 426 } else {
433 smc_llc_prep_add_link(llc, link, 427 smc_llc_prep_add_link(llc, link,
434 link->smcibdev->mac[link->ibport - 1], 428 link->smcibdev->mac[link->ibport - 1],
435 &link->smcibdev->gid[link->ibport - 1], 429 link->gid, SMC_LLC_RESP);
436 SMC_LLC_RESP);
437 } 430 }
438 smc_llc_send_message(link, llc, sizeof(*llc)); 431 smc_llc_send_message(link, llc, sizeof(*llc));
439 } 432 }
@@ -442,22 +435,23 @@ static void smc_llc_rx_add_link(struct smc_link *link,
442static void smc_llc_rx_delete_link(struct smc_link *link, 435static void smc_llc_rx_delete_link(struct smc_link *link,
443 struct smc_llc_msg_del_link *llc) 436 struct smc_llc_msg_del_link *llc)
444{ 437{
445 struct smc_link_group *lgr = container_of(link, struct smc_link_group, 438 struct smc_link_group *lgr = smc_get_lgr(link);
446 lnk[SMC_SINGLE_LINK]);
447 439
448 if (llc->hd.flags & SMC_LLC_FLAG_RESP) { 440 if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
449 if (lgr->role == SMC_SERV) 441 if (lgr->role == SMC_SERV)
450 smc_lgr_terminate(lgr); 442 smc_lgr_schedule_free_work_fast(lgr);
451 } else { 443 } else {
444 smc_lgr_forget(lgr);
445 smc_llc_link_deleting(link);
452 if (lgr->role == SMC_SERV) { 446 if (lgr->role == SMC_SERV) {
453 smc_lgr_forget(lgr); 447 /* client asks to delete this link, send request */
454 smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ); 448 smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true);
455 smc_llc_send_message(link, llc, sizeof(*llc));
456 } else { 449 } else {
457 smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP); 450 /* server requests to delete this link, send response */
458 smc_llc_send_message(link, llc, sizeof(*llc)); 451 smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true);
459 smc_lgr_terminate(lgr);
460 } 452 }
453 smc_llc_send_message(link, llc, sizeof(*llc));
454 smc_lgr_schedule_free_work_fast(lgr);
461 } 455 }
462} 456}
463 457
@@ -476,17 +470,14 @@ static void smc_llc_rx_test_link(struct smc_link *link,
476static void smc_llc_rx_confirm_rkey(struct smc_link *link, 470static void smc_llc_rx_confirm_rkey(struct smc_link *link,
477 struct smc_llc_msg_confirm_rkey *llc) 471 struct smc_llc_msg_confirm_rkey *llc)
478{ 472{
479 struct smc_link_group *lgr;
480 int rc; 473 int rc;
481 474
482 lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
483
484 if (llc->hd.flags & SMC_LLC_FLAG_RESP) { 475 if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
485 link->llc_confirm_rkey_rc = llc->hd.flags & 476 link->llc_confirm_rkey_rc = llc->hd.flags &
486 SMC_LLC_FLAG_RKEY_NEG; 477 SMC_LLC_FLAG_RKEY_NEG;
487 complete(&link->llc_confirm_rkey); 478 complete(&link->llc_confirm_rkey);
488 } else { 479 } else {
489 rc = smc_rtoken_add(lgr, 480 rc = smc_rtoken_add(smc_get_lgr(link),
490 llc->rtoken[0].rmb_vaddr, 481 llc->rtoken[0].rmb_vaddr,
491 llc->rtoken[0].rmb_key); 482 llc->rtoken[0].rmb_key);
492 483
@@ -514,18 +505,15 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link,
514static void smc_llc_rx_delete_rkey(struct smc_link *link, 505static void smc_llc_rx_delete_rkey(struct smc_link *link,
515 struct smc_llc_msg_delete_rkey *llc) 506 struct smc_llc_msg_delete_rkey *llc)
516{ 507{
517 struct smc_link_group *lgr;
518 u8 err_mask = 0; 508 u8 err_mask = 0;
519 int i, max; 509 int i, max;
520 510
521 lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
522
523 if (llc->hd.flags & SMC_LLC_FLAG_RESP) { 511 if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
524 /* unused as long as we don't send this type of msg */ 512 /* unused as long as we don't send this type of msg */
525 } else { 513 } else {
526 max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); 514 max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX);
527 for (i = 0; i < max; i++) { 515 for (i = 0; i < max; i++) {
528 if (smc_rtoken_delete(lgr, llc->rkey[i])) 516 if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i]))
529 err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); 517 err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i);
530 } 518 }
531 519
@@ -583,12 +571,10 @@ static void smc_llc_testlink_work(struct work_struct *work)
583 struct smc_link *link = container_of(to_delayed_work(work), 571 struct smc_link *link = container_of(to_delayed_work(work),
584 struct smc_link, llc_testlink_wrk); 572 struct smc_link, llc_testlink_wrk);
585 unsigned long next_interval; 573 unsigned long next_interval;
586 struct smc_link_group *lgr;
587 unsigned long expire_time; 574 unsigned long expire_time;
588 u8 user_data[16] = { 0 }; 575 u8 user_data[16] = { 0 };
589 int rc; 576 int rc;
590 577
591 lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
592 if (link->state != SMC_LNK_ACTIVE) 578 if (link->state != SMC_LNK_ACTIVE)
593 return; /* don't reschedule worker */ 579 return; /* don't reschedule worker */
594 expire_time = link->wr_rx_tstamp + link->llc_testlink_time; 580 expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
@@ -602,7 +588,7 @@ static void smc_llc_testlink_work(struct work_struct *work)
602 rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, 588 rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
603 SMC_LLC_WAIT_TIME); 589 SMC_LLC_WAIT_TIME);
604 if (rc <= 0) { 590 if (rc <= 0) {
605 smc_lgr_terminate(lgr); 591 smc_lgr_terminate(smc_get_lgr(link));
606 return; 592 return;
607 } 593 }
608 next_interval = link->llc_testlink_time; 594 next_interval = link->llc_testlink_time;
@@ -613,8 +599,7 @@ out:
613 599
614int smc_llc_link_init(struct smc_link *link) 600int smc_llc_link_init(struct smc_link *link)
615{ 601{
616 struct smc_link_group *lgr = container_of(link, struct smc_link_group, 602 struct smc_link_group *lgr = smc_get_lgr(link);
617 lnk[SMC_SINGLE_LINK]);
618 link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, 603 link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM,
619 *((u32 *)lgr->id), 604 *((u32 *)lgr->id),
620 link->link_id); 605 link->link_id);
@@ -640,6 +625,11 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time)
640 } 625 }
641} 626}
642 627
628void smc_llc_link_deleting(struct smc_link *link)
629{
630 link->state = SMC_LNK_DELETING;
631}
632
643/* called in tasklet context */ 633/* called in tasklet context */
644void smc_llc_link_inactive(struct smc_link *link) 634void smc_llc_link_inactive(struct smc_link *link)
645{ 635{
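The DELETE LINK rework above separates orderly teardown from abnormal termination: smc_llc_send_delete_link() now takes a bool that decides whether SMC_LLC_FLAG_DEL_LINK_ORDERLY is set, and smc_llc_link_deleting() first parks the link in the new SMC_LNK_DELETING state. A minimal caller sketch; example_* is an illustrative name:

static void example_delete_link(struct smc_link *link, bool orderly)
{
	smc_llc_link_deleting(link);	/* link->state = SMC_LNK_DELETING */
	smc_llc_send_delete_link(link, SMC_LLC_REQ, orderly);
}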
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index 65c8645e96a1..9e2ff088e301 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -36,14 +36,15 @@ enum smc_llc_msg_type {
36}; 36};
37 37
38/* transmit */ 38/* transmit */
39int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid, 39int smc_llc_send_confirm_link(struct smc_link *lnk,
40 enum smc_llc_reqresp reqresp); 40 enum smc_llc_reqresp reqresp);
41int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid, 41int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
42 enum smc_llc_reqresp reqresp); 42 enum smc_llc_reqresp reqresp);
43int smc_llc_send_delete_link(struct smc_link *link, 43int smc_llc_send_delete_link(struct smc_link *link,
44 enum smc_llc_reqresp reqresp); 44 enum smc_llc_reqresp reqresp, bool orderly);
45int smc_llc_link_init(struct smc_link *link); 45int smc_llc_link_init(struct smc_link *link);
46void smc_llc_link_active(struct smc_link *link, int testlink_time); 46void smc_llc_link_active(struct smc_link *link, int testlink_time);
47void smc_llc_link_deleting(struct smc_link *link);
47void smc_llc_link_inactive(struct smc_link *link); 48void smc_llc_link_inactive(struct smc_link *link);
48void smc_llc_link_clear(struct smc_link *link); 49void smc_llc_link_clear(struct smc_link *link);
49int smc_llc_do_confirm_rkey(struct smc_link *link, 50int smc_llc_do_confirm_rkey(struct smc_link *link,
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index d7b88b2d1b22..01c6ce042a1c 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -22,13 +22,12 @@
22 22
23#include "smc_pnet.h" 23#include "smc_pnet.h"
24#include "smc_ib.h" 24#include "smc_ib.h"
25 25#include "smc_ism.h"
26#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */
27 26
28static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { 27static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
29 [SMC_PNETID_NAME] = { 28 [SMC_PNETID_NAME] = {
30 .type = NLA_NUL_STRING, 29 .type = NLA_NUL_STRING,
31 .len = SMC_MAX_PNET_ID_LEN - 1 30 .len = SMC_MAX_PNETID_LEN - 1
32 }, 31 },
33 [SMC_PNETID_ETHNAME] = { 32 [SMC_PNETID_ETHNAME] = {
34 .type = NLA_NUL_STRING, 33 .type = NLA_NUL_STRING,
@@ -65,7 +64,7 @@ static struct smc_pnettable {
65 */ 64 */
66struct smc_pnetentry { 65struct smc_pnetentry {
67 struct list_head list; 66 struct list_head list;
68 char pnet_name[SMC_MAX_PNET_ID_LEN + 1]; 67 char pnet_name[SMC_MAX_PNETID_LEN + 1];
69 struct net_device *ndev; 68 struct net_device *ndev;
70 struct smc_ib_device *smcibdev; 69 struct smc_ib_device *smcibdev;
71 u8 ib_port; 70 u8 ib_port;
@@ -209,7 +208,7 @@ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
209 return false; 208 return false;
210 while (--end >= bf && isspace(*end)) 209 while (--end >= bf && isspace(*end))
211 ; 210 ;
212 if (end - bf >= SMC_MAX_PNET_ID_LEN) 211 if (end - bf >= SMC_MAX_PNETID_LEN)
213 return false; 212 return false;
214 while (bf <= end) { 213 while (bf <= end) {
215 if (!isalnum(*bf)) 214 if (!isalnum(*bf))
@@ -358,9 +357,6 @@ static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
358 kfree(pnetelem); 357 kfree(pnetelem);
359 return rc; 358 return rc;
360 } 359 }
361 rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port);
362 if (rc)
363 smc_pnet_remove_by_pnetid(pnetelem->pnet_name);
364 return rc; 360 return rc;
365} 361}
366 362
@@ -485,10 +481,10 @@ static int smc_pnet_netdev_event(struct notifier_block *this,
485 case NETDEV_REBOOT: 481 case NETDEV_REBOOT:
486 case NETDEV_UNREGISTER: 482 case NETDEV_UNREGISTER:
487 smc_pnet_remove_by_ndev(event_dev); 483 smc_pnet_remove_by_ndev(event_dev);
484 return NOTIFY_OK;
488 default: 485 default:
489 break; 486 return NOTIFY_DONE;
490 } 487 }
491 return NOTIFY_DONE;
492} 488}
493 489
494static struct notifier_block smc_netdev_notifier = { 490static struct notifier_block smc_netdev_notifier = {
@@ -515,28 +511,104 @@ void smc_pnet_exit(void)
515 genl_unregister_family(&smc_pnet_nl_family); 511 genl_unregister_family(&smc_pnet_nl_family);
516} 512}
517 513
518/* PNET table analysis for a given sock: 514/* Determine one base device for stacked net devices.
519 * determine ib_device and port belonging to used internal TCP socket 515 * If the lower device level contains more than one devices
520 * ethernet interface. 516 * (for instance with bonding slaves), just the first device
517 * is used to reach a base device.
521 */ 518 */
522void smc_pnet_find_roce_resource(struct sock *sk, 519static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
523 struct smc_ib_device **smcibdev, u8 *ibport)
524{ 520{
525 struct dst_entry *dst = sk_dst_get(sk); 521 int i, nest_lvl;
526 struct smc_pnetentry *pnetelem;
527 522
528 *smcibdev = NULL; 523 rtnl_lock();
529 *ibport = 0; 524 nest_lvl = dev_get_nest_level(ndev);
525 for (i = 0; i < nest_lvl; i++) {
526 struct list_head *lower = &ndev->adj_list.lower;
527
528 if (list_empty(lower))
529 break;
530 lower = lower->next;
531 ndev = netdev_lower_get_next(ndev, &lower);
532 }
533 rtnl_unlock();
534 return ndev;
535}
536
537/* Determine the corresponding IB device port based on the hardware PNETID.
538 * Searching stops at the first matching active IB device port with vlan_id
539 * configured.
540 */
541static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
542 struct smc_ib_device **smcibdev,
543 u8 *ibport, unsigned short vlan_id,
544 u8 gid[])
545{
546 u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
547 struct smc_ib_device *ibdev;
548 int i;
549
550 ndev = pnet_find_base_ndev(ndev);
551 if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
552 ndev_pnetid))
553 return; /* pnetid could not be determined */
554
555 spin_lock(&smc_ib_devices.lock);
556 list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
557 for (i = 1; i <= SMC_MAX_PORTS; i++) {
558 if (!rdma_is_port_valid(ibdev->ibdev, i))
559 continue;
560 if (!memcmp(ibdev->pnetid[i - 1], ndev_pnetid,
561 SMC_MAX_PNETID_LEN) &&
562 smc_ib_port_active(ibdev, i) &&
563 !smc_ib_determine_gid(ibdev, i, vlan_id, gid,
564 NULL)) {
565 *smcibdev = ibdev;
566 *ibport = i;
567 goto out;
568 }
569 }
570 }
571out:
572 spin_unlock(&smc_ib_devices.lock);
573}
574
575static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
576 struct smcd_dev **smcismdev)
577{
578 u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
579 struct smcd_dev *ismdev;
580
581 ndev = pnet_find_base_ndev(ndev);
582 if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
583 ndev_pnetid))
584 return; /* pnetid could not be determined */
585
586 spin_lock(&smcd_dev_list.lock);
587 list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
588 if (!memcmp(ismdev->pnetid, ndev_pnetid, SMC_MAX_PNETID_LEN)) {
589 *smcismdev = ismdev;
590 break;
591 }
592 }
593 spin_unlock(&smcd_dev_list.lock);
594}
595
596/* Lookup of coupled ib_device via SMC pnet table */
597static void smc_pnet_find_roce_by_table(struct net_device *netdev,
598 struct smc_ib_device **smcibdev,
599 u8 *ibport, unsigned short vlan_id,
600 u8 gid[])
601{
602 struct smc_pnetentry *pnetelem;
530 603
531 if (!dst)
532 return;
533 if (!dst->dev)
534 goto out_rel;
535 read_lock(&smc_pnettable.lock); 604 read_lock(&smc_pnettable.lock);
536 list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { 605 list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
537 if (dst->dev == pnetelem->ndev) { 606 if (netdev == pnetelem->ndev) {
538 if (smc_ib_port_active(pnetelem->smcibdev, 607 if (smc_ib_port_active(pnetelem->smcibdev,
539 pnetelem->ib_port)) { 608 pnetelem->ib_port) &&
609 !smc_ib_determine_gid(pnetelem->smcibdev,
610 pnetelem->ib_port, vlan_id,
611 gid, NULL)) {
540 *smcibdev = pnetelem->smcibdev; 612 *smcibdev = pnetelem->smcibdev;
541 *ibport = pnetelem->ib_port; 613 *ibport = pnetelem->ib_port;
542 } 614 }
@@ -544,6 +616,55 @@ void smc_pnet_find_roce_resource(struct sock *sk,
544 } 616 }
545 } 617 }
546 read_unlock(&smc_pnettable.lock); 618 read_unlock(&smc_pnettable.lock);
619}
620
621/* PNET table analysis for a given sock:
622 * determine ib_device and port belonging to used internal TCP socket
623 * ethernet interface.
624 */
625void smc_pnet_find_roce_resource(struct sock *sk,
626 struct smc_ib_device **smcibdev, u8 *ibport,
627 unsigned short vlan_id, u8 gid[])
628{
629 struct dst_entry *dst = sk_dst_get(sk);
630
631 *smcibdev = NULL;
632 *ibport = 0;
633
634 if (!dst)
635 goto out;
636 if (!dst->dev)
637 goto out_rel;
638
639 /* if possible, lookup via hardware-defined pnetid */
640 smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport, vlan_id, gid);
641 if (*smcibdev)
642 goto out_rel;
643
644 /* lookup via SMC PNET table */
645 smc_pnet_find_roce_by_table(dst->dev, smcibdev, ibport, vlan_id, gid);
646
647out_rel:
648 dst_release(dst);
649out:
650 return;
651}
652
653void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev)
654{
655 struct dst_entry *dst = sk_dst_get(sk);
656
657 *smcismdev = NULL;
658 if (!dst)
659 goto out;
660 if (!dst->dev)
661 goto out_rel;
662
663 /* if possible, lookup via hardware-defined pnetid */
664 smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev);
665
547out_rel: 666out_rel:
548 dst_release(dst); 667 dst_release(dst);
668out:
669 return;
549} 670}
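smc_pnet_find_roce_resource() keeps its old role but now resolves device, port and GID in one pass: it first tries the hardware-defined pnetid of the egress net_device's base device and only falls back to the user-configured PNET table, and both paths require an active port with a usable GID for the requested VLAN. A caller-side sketch that assumes the clcsock member of struct smc_sock; example_* is an illustrative name:

static int example_find_rdma_device(struct smc_sock *smc,
				    unsigned short vlan_id)
{
	struct smc_ib_device *ibdev = NULL;
	u8 gid[SMC_GID_SIZE];
	u8 ibport = 0;

	smc_pnet_find_roce_resource(smc->clcsock->sk, &ibdev, &ibport,
				    vlan_id, gid);
	if (!ibdev)
		return -ENODEV;	/* no eligible RoCE device/port found */
	/* ibport and gid are now ready for link group setup */
	return 0;
}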
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
index 5a29519db976..8ff777636e32 100644
--- a/net/smc/smc_pnet.h
+++ b/net/smc/smc_pnet.h
@@ -12,12 +12,29 @@
12#ifndef _SMC_PNET_H 12#ifndef _SMC_PNET_H
13#define _SMC_PNET_H 13#define _SMC_PNET_H
14 14
15#if IS_ENABLED(CONFIG_HAVE_PNETID)
16#include <asm/pnet.h>
17#endif
18
15struct smc_ib_device; 19struct smc_ib_device;
20struct smcd_dev;
21
22static inline int smc_pnetid_by_dev_port(struct device *dev,
23 unsigned short port, u8 *pnetid)
24{
25#if IS_ENABLED(CONFIG_HAVE_PNETID)
26 return pnet_id_by_dev_port(dev, port, pnetid);
27#else
28 return -ENOENT;
29#endif
30}
16 31
17int smc_pnet_init(void) __init; 32int smc_pnet_init(void) __init;
18void smc_pnet_exit(void); 33void smc_pnet_exit(void);
19int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); 34int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev);
20void smc_pnet_find_roce_resource(struct sock *sk, 35void smc_pnet_find_roce_resource(struct sock *sk,
21 struct smc_ib_device **smcibdev, u8 *ibport); 36 struct smc_ib_device **smcibdev, u8 *ibport,
37 unsigned short vlan_id, u8 gid[]);
38void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev);
22 39
23#endif 40#endif
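The new smc_pnetid_by_dev_port() wrapper keeps the hardware pnetid lookup optional: without CONFIG_HAVE_PNETID it simply returns -ENOENT, so callers treat any non-zero return as "no pnetid known". A sketch of the per-port loop pattern that smc_ib_add_dev() uses earlier in this patch; example_* is an illustrative name:

static void example_read_pnetids(struct smc_ib_device *smcibdev)
{
	u8 port_cnt = smcibdev->ibdev->phys_port_cnt;
	int i;

	for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++)
		/* a non-zero return simply leaves the pnetid unset */
		smc_pnetid_by_dev_port(smcibdev->ibdev->dev.parent, i,
				       smcibdev->pnetid[i]);
}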
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 3d77b383cccd..bbcf0fe4ae10 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -82,8 +82,7 @@ static int smc_rx_update_consumer(struct smc_sock *smc,
82 } 82 }
83 } 83 }
84 84
85 smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn), 85 smc_curs_copy(&conn->local_tx_ctrl.cons, &cons, conn);
86 conn);
87 86
88 /* send consumer cursor update if required */ 87 /* send consumer cursor update if required */
89 /* similar to advertising new TCP rcv_wnd if required */ 88 /* similar to advertising new TCP rcv_wnd if required */
@@ -97,8 +96,7 @@ static void smc_rx_update_cons(struct smc_sock *smc, size_t len)
97 struct smc_connection *conn = &smc->conn; 96 struct smc_connection *conn = &smc->conn;
98 union smc_host_cursor cons; 97 union smc_host_cursor cons;
99 98
100 smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), 99 smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
101 conn);
102 smc_rx_update_consumer(smc, cons, len); 100 smc_rx_update_consumer(smc, cons, len);
103} 101}
104 102
@@ -157,10 +155,8 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
157 struct splice_pipe_desc spd; 155 struct splice_pipe_desc spd;
158 struct partial_page partial; 156 struct partial_page partial;
159 struct smc_spd_priv *priv; 157 struct smc_spd_priv *priv;
160 struct page *page;
161 int bytes; 158 int bytes;
162 159
163 page = virt_to_page(smc->conn.rmb_desc->cpu_addr);
164 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 160 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
165 if (!priv) 161 if (!priv)
166 return -ENOMEM; 162 return -ENOMEM;
@@ -172,7 +168,7 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
172 168
173 spd.nr_pages_max = 1; 169 spd.nr_pages_max = 1;
174 spd.nr_pages = 1; 170 spd.nr_pages = 1;
175 spd.pages = &page; 171 spd.pages = &smc->conn.rmb_desc->pages;
176 spd.partial = &partial; 172 spd.partial = &partial;
177 spd.ops = &smc_pipe_ops; 173 spd.ops = &smc_pipe_ops;
178 spd.spd_release = smc_rx_spd_release; 174 spd.spd_release = smc_rx_spd_release;
@@ -245,10 +241,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
245 if (!(flags & MSG_TRUNC)) 241 if (!(flags & MSG_TRUNC))
246 rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1); 242 rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1);
247 len = 1; 243 len = 1;
248 smc_curs_write(&cons, 244 smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
249 smc_curs_read(&conn->local_tx_ctrl.cons,
250 conn),
251 conn);
252 if (smc_curs_diff(conn->rmb_desc->len, &cons, 245 if (smc_curs_diff(conn->rmb_desc->len, &cons,
253 &conn->urg_curs) > 1) 246 &conn->urg_curs) > 1)
254 conn->urg_rx_skip_pend = true; 247 conn->urg_rx_skip_pend = true;
@@ -305,7 +298,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
305 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 298 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
306 299
307 /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ 300 /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
308 rcvbuf_base = conn->rmb_desc->cpu_addr; 301 rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr;
309 302
310 do { /* while (read_remaining) */ 303 do { /* while (read_remaining) */
311 if (read_done >= target || (pipe && read_done)) 304 if (read_done >= target || (pipe && read_done))
@@ -370,9 +363,7 @@ copy:
370 continue; 363 continue;
371 } 364 }
372 365
373 smc_curs_write(&cons, 366 smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
374 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
375 conn);
376 /* subsequent splice() calls pick up where previous left */ 367 /* subsequent splice() calls pick up where previous left */
377 if (splbytes) 368 if (splbytes)
378 smc_curs_add(conn->rmb_desc->len, &cons, splbytes); 369 smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
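The mechanical change in smc_rx.c (and in smc_tx.c below) folds the smc_curs_write(smc_curs_read(...)) pairing into a single smc_curs_copy() helper; the cursor snapshot semantics stay the same. A before/after sketch; example_* is an illustrative name:

static void example_snapshot_cons(struct smc_connection *conn,
				  union smc_host_cursor *cons)
{
	/* old: smc_curs_write(cons,
	 *		       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
	 *		       conn);
	 */
	smc_curs_copy(cons, &conn->local_tx_ctrl.cons, conn);
}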
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index cee666400752..d8366ed51757 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -24,6 +24,7 @@
24#include "smc.h" 24#include "smc.h"
25#include "smc_wr.h" 25#include "smc_wr.h"
26#include "smc_cdc.h" 26#include "smc_cdc.h"
27#include "smc_ism.h"
27#include "smc_tx.h" 28#include "smc_tx.h"
28 29
29#define SMC_TX_WORK_DELAY HZ 30#define SMC_TX_WORK_DELAY HZ
@@ -180,9 +181,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
180 copylen = min_t(size_t, send_remaining, writespace); 181 copylen = min_t(size_t, send_remaining, writespace);
181 /* determine start of sndbuf */ 182 /* determine start of sndbuf */
182 sndbuf_base = conn->sndbuf_desc->cpu_addr; 183 sndbuf_base = conn->sndbuf_desc->cpu_addr;
183 smc_curs_write(&prep, 184 smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
184 smc_curs_read(&conn->tx_curs_prep, conn),
185 conn);
186 tx_cnt_prep = prep.count; 185 tx_cnt_prep = prep.count;
187 /* determine chunks where to write into sndbuf */ 186 /* determine chunks where to write into sndbuf */
188 /* either unwrapped case, or 1st chunk of wrapped case */ 187 /* either unwrapped case, or 1st chunk of wrapped case */
@@ -213,9 +212,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
213 smc_sndbuf_sync_sg_for_device(conn); 212 smc_sndbuf_sync_sg_for_device(conn);
214 /* update cursors */ 213 /* update cursors */
215 smc_curs_add(conn->sndbuf_desc->len, &prep, copylen); 214 smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
216 smc_curs_write(&conn->tx_curs_prep, 215 smc_curs_copy(&conn->tx_curs_prep, &prep, conn);
217 smc_curs_read(&prep, conn),
218 conn);
219 /* increased in send tasklet smc_cdc_tx_handler() */ 216 /* increased in send tasklet smc_cdc_tx_handler() */
220 smp_mb__before_atomic(); 217 smp_mb__before_atomic();
221 atomic_sub(copylen, &conn->sndbuf_space); 218 atomic_sub(copylen, &conn->sndbuf_space);
@@ -250,12 +247,29 @@ out_err:
250 247
251/***************************** sndbuf consumer *******************************/ 248/***************************** sndbuf consumer *******************************/
252 249
250/* sndbuf consumer: actual data transfer of one target chunk with ISM write */
251int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
252 u32 offset, int signal)
253{
254 struct smc_ism_position pos;
255 int rc;
256
257 memset(&pos, 0, sizeof(pos));
258 pos.token = conn->peer_token;
259 pos.index = conn->peer_rmbe_idx;
260 pos.offset = conn->tx_off + offset;
261 pos.signal = signal;
262 rc = smc_ism_write(conn->lgr->smcd, &pos, data, len);
263 if (rc)
264 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
265 return rc;
266}
267
253/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */ 268/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
254static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, 269static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
255 int num_sges, struct ib_sge sges[]) 270 int num_sges, struct ib_sge sges[])
256{ 271{
257 struct smc_link_group *lgr = conn->lgr; 272 struct smc_link_group *lgr = conn->lgr;
258 struct ib_send_wr *failed_wr = NULL;
259 struct ib_rdma_wr rdma_wr; 273 struct ib_rdma_wr rdma_wr;
260 struct smc_link *link; 274 struct smc_link *link;
261 int rc; 275 int rc;
@@ -273,7 +287,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
273 /* offset within RMBE */ 287 /* offset within RMBE */
274 peer_rmbe_offset; 288 peer_rmbe_offset;
275 rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; 289 rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
276 rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr); 290 rc = ib_post_send(link->roce_qp, &rdma_wr.wr, NULL);
277 if (rc) { 291 if (rc) {
278 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; 292 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
279 smc_lgr_terminate(lgr); 293 smc_lgr_terminate(lgr);
@@ -297,26 +311,109 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn,
297 smc_curs_add(conn->sndbuf_desc->len, sent, len); 311 smc_curs_add(conn->sndbuf_desc->len, sent, len);
298} 312}
299 313
314/* SMC-R helper for smc_tx_rdma_writes() */
315static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
316 size_t src_off, size_t src_len,
317 size_t dst_off, size_t dst_len)
318{
319 dma_addr_t dma_addr =
320 sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
321 struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
322 int src_len_sum = src_len, dst_len_sum = dst_len;
323 struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
324 int sent_count = src_off;
325 int srcchunk, dstchunk;
326 int num_sges;
327 int rc;
328
329 for (dstchunk = 0; dstchunk < 2; dstchunk++) {
330 num_sges = 0;
331 for (srcchunk = 0; srcchunk < 2; srcchunk++) {
332 sges[srcchunk].addr = dma_addr + src_off;
333 sges[srcchunk].length = src_len;
334 sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
335 num_sges++;
336
337 src_off += src_len;
338 if (src_off >= conn->sndbuf_desc->len)
339 src_off -= conn->sndbuf_desc->len;
340 /* modulo in send ring */
341 if (src_len_sum == dst_len)
342 break; /* either on 1st or 2nd iteration */
343 /* prepare next (== 2nd) iteration */
344 src_len = dst_len - src_len; /* remainder */
345 src_len_sum += src_len;
346 }
347 rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
348 if (rc)
349 return rc;
350 if (dst_len_sum == len)
351 break; /* either on 1st or 2nd iteration */
352 /* prepare next (== 2nd) iteration */
353 dst_off = 0; /* modulo offset in RMBE ring buffer */
354 dst_len = len - dst_len; /* remainder */
355 dst_len_sum += dst_len;
356 src_len = min_t(int, dst_len, conn->sndbuf_desc->len -
357 sent_count);
358 src_len_sum = src_len;
359 }
360 return 0;
361}
362
363/* SMC-D helper for smc_tx_rdma_writes() */
364static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len,
365 size_t src_off, size_t src_len,
366 size_t dst_off, size_t dst_len)
367{
368 int src_len_sum = src_len, dst_len_sum = dst_len;
369 int srcchunk, dstchunk;
370 int rc;
371
372 for (dstchunk = 0; dstchunk < 2; dstchunk++) {
373 for (srcchunk = 0; srcchunk < 2; srcchunk++) {
374 void *data = conn->sndbuf_desc->cpu_addr + src_off;
375
376 rc = smcd_tx_ism_write(conn, data, src_len, dst_off +
377 sizeof(struct smcd_cdc_msg), 0);
378 if (rc)
379 return rc;
380 dst_off += src_len;
381 src_off += src_len;
382 if (src_off >= conn->sndbuf_desc->len)
383 src_off -= conn->sndbuf_desc->len;
384 /* modulo in send ring */
385 if (src_len_sum == dst_len)
386 break; /* either on 1st or 2nd iteration */
387 /* prepare next (== 2nd) iteration */
388 src_len = dst_len - src_len; /* remainder */
389 src_len_sum += src_len;
390 }
391 if (dst_len_sum == len)
392 break; /* either on 1st or 2nd iteration */
393 /* prepare next (== 2nd) iteration */
394 dst_off = 0; /* modulo offset in RMBE ring buffer */
395 dst_len = len - dst_len; /* remainder */
396 dst_len_sum += dst_len;
397 src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off);
398 src_len_sum = src_len;
399 }
400 return 0;
401}
402
300/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; 403/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
301 * usable snd_wnd as max transmit 404 * usable snd_wnd as max transmit
302 */ 405 */
303static int smc_tx_rdma_writes(struct smc_connection *conn) 406static int smc_tx_rdma_writes(struct smc_connection *conn)
304{ 407{
305 size_t src_off, src_len, dst_off, dst_len; /* current chunk values */ 408 size_t len, src_len, dst_off, dst_len; /* current chunk values */
306 size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk;
307 union smc_host_cursor sent, prep, prod, cons; 409 union smc_host_cursor sent, prep, prod, cons;
308 struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
309 struct smc_link_group *lgr = conn->lgr;
310 struct smc_cdc_producer_flags *pflags; 410 struct smc_cdc_producer_flags *pflags;
311 int to_send, rmbespace; 411 int to_send, rmbespace;
312 struct smc_link *link;
313 dma_addr_t dma_addr;
314 int num_sges;
315 int rc; 412 int rc;
316 413
317 /* source: sndbuf */ 414 /* source: sndbuf */
318 smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); 415 smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
319 smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); 416 smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
320 /* cf. wmem_alloc - (snd_max - snd_una) */ 417 /* cf. wmem_alloc - (snd_max - snd_una) */
321 to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); 418 to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
322 if (to_send <= 0) 419 if (to_send <= 0)
@@ -327,12 +424,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
327 rmbespace = atomic_read(&conn->peer_rmbe_space); 424 rmbespace = atomic_read(&conn->peer_rmbe_space);
328 if (rmbespace <= 0) 425 if (rmbespace <= 0)
329 return 0; 426 return 0;
330 smc_curs_write(&prod, 427 smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
331 smc_curs_read(&conn->local_tx_ctrl.prod, conn), 428 smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
332 conn);
333 smc_curs_write(&cons,
334 smc_curs_read(&conn->local_rx_ctrl.cons, conn),
335 conn);
336 429
337 /* if usable snd_wnd closes ask peer to advertise once it opens again */ 430 /* if usable snd_wnd closes ask peer to advertise once it opens again */
338 pflags = &conn->local_tx_ctrl.prod_flags; 431 pflags = &conn->local_tx_ctrl.prod_flags;
@@ -341,7 +434,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
341 len = min(to_send, rmbespace); 434 len = min(to_send, rmbespace);
342 435
343 /* initialize variables for first iteration of subsequent nested loop */ 436 /* initialize variables for first iteration of subsequent nested loop */
344 link = &lgr->lnk[SMC_SINGLE_LINK];
345 dst_off = prod.count; 437 dst_off = prod.count;
346 if (prod.wrap == cons.wrap) { 438 if (prod.wrap == cons.wrap) {
347 /* the filled destination area is unwrapped, 439 /* the filled destination area is unwrapped,
@@ -358,8 +450,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
358 */ 450 */
359 dst_len = len; 451 dst_len = len;
360 } 452 }
361 dst_len_sum = dst_len;
362 src_off = sent.count;
363 /* dst_len determines the maximum src_len */ 453 /* dst_len determines the maximum src_len */
364 if (sent.count + dst_len <= conn->sndbuf_desc->len) { 454 if (sent.count + dst_len <= conn->sndbuf_desc->len) {
365 /* unwrapped src case: single chunk of entire dst_len */ 455 /* unwrapped src case: single chunk of entire dst_len */
@@ -368,51 +458,23 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
368 /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ 458 /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
369 src_len = conn->sndbuf_desc->len - sent.count; 459 src_len = conn->sndbuf_desc->len - sent.count;
370 } 460 }
371 src_len_sum = src_len; 461
372 dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); 462 if (conn->lgr->is_smcd)
373 for (dstchunk = 0; dstchunk < 2; dstchunk++) { 463 rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len,
374 num_sges = 0; 464 dst_off, dst_len);
375 for (srcchunk = 0; srcchunk < 2; srcchunk++) { 465 else
376 sges[srcchunk].addr = dma_addr + src_off; 466 rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len,
377 sges[srcchunk].length = src_len; 467 dst_off, dst_len);
378 sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; 468 if (rc)
379 num_sges++; 469 return rc;
380 src_off += src_len;
381 if (src_off >= conn->sndbuf_desc->len)
382 src_off -= conn->sndbuf_desc->len;
383 /* modulo in send ring */
384 if (src_len_sum == dst_len)
385 break; /* either on 1st or 2nd iteration */
386 /* prepare next (== 2nd) iteration */
387 src_len = dst_len - src_len; /* remainder */
388 src_len_sum += src_len;
389 }
390 rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
391 if (rc)
392 return rc;
393 if (dst_len_sum == len)
394 break; /* either on 1st or 2nd iteration */
395 /* prepare next (== 2nd) iteration */
396 dst_off = 0; /* modulo offset in RMBE ring buffer */
397 dst_len = len - dst_len; /* remainder */
398 dst_len_sum += dst_len;
399 src_len = min_t(int,
400 dst_len, conn->sndbuf_desc->len - sent.count);
401 src_len_sum = src_len;
402 }
403 470
404 if (conn->urg_tx_pend && len == to_send) 471 if (conn->urg_tx_pend && len == to_send)
405 pflags->urg_data_present = 1; 472 pflags->urg_data_present = 1;
406 smc_tx_advance_cursors(conn, &prod, &sent, len); 473 smc_tx_advance_cursors(conn, &prod, &sent, len);
407 /* update connection's cursors with advanced local cursors */ 474 /* update connection's cursors with advanced local cursors */
408 smc_curs_write(&conn->local_tx_ctrl.prod, 475 smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn);
409 smc_curs_read(&prod, conn),
410 conn);
411 /* dst: peer RMBE */ 476 /* dst: peer RMBE */
412 smc_curs_write(&conn->tx_curs_sent, 477 smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */
413 smc_curs_read(&sent, conn),
414 conn);
415 /* src: local sndbuf */
416 478
417 return 0; 479 return 0;
418} 480}
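
The refactor above splits smc_tx_rdma_writes() into smcr_tx_rdma_writes(), which builds SGEs for one RDMA write per destination chunk, and smcd_tx_rdma_writes(), which issues ISM writes, but both walk the same decomposition: sndbuf and the peer RMBE are ring buffers, so one transfer needs at most two destination chunks, each fed by at most two source chunks. The standalone sketch below reproduces that decomposition with plain memcpy and one general loop; buffer names, sizes, and offsets are made up for the example:

#include <stdio.h>
#include <string.h>

/* Copy len bytes from one ring buffer into another. Each ring wraps at
 * most once per transfer, so the work splits into at most two destination
 * chunks, each fed by at most two source chunks; the kernel helpers above
 * unroll exactly that, while this sketch uses one general loop.
 */
static void ring_copy(char *dst, size_t dsz, size_t doff,
		      const char *src, size_t ssz, size_t soff, size_t len)
{
	while (len) {
		size_t dchunk = dsz - doff;	/* bytes until dst wraps */
		size_t schunk = ssz - soff;	/* bytes until src wraps */
		size_t n = len;

		if (n > dchunk)
			n = dchunk;
		if (n > schunk)
			n = schunk;
		memcpy(dst + doff, src + soff, n);
		doff = (doff + n) % dsz;	/* modulo in destination ring */
		soff = (soff + n) % ssz;	/* modulo in send ring */
		len -= n;
	}
}

int main(void)
{
	char src[9] = "ABCDEFGH", dst[9] = "........";

	/* wrapped source (offset 6) into wrapped destination (offset 5) */
	ring_copy(dst, 8, 5, src, 8, 6, 6);
	printf("%s\n", dst);	/* prints "BCD..GHA" */
	return 0;
}
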
@@ -420,7 +482,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
420/* Wakeup sndbuf consumers from any context (IRQ or process) 482/* Wakeup sndbuf consumers from any context (IRQ or process)
421 * since there is more data to transmit; usable snd_wnd as max transmit 483 * since there is more data to transmit; usable snd_wnd as max transmit
422 */ 484 */
423int smc_tx_sndbuf_nonempty(struct smc_connection *conn) 485static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
424{ 486{
425 struct smc_cdc_producer_flags *pflags; 487 struct smc_cdc_producer_flags *pflags;
426 struct smc_cdc_tx_pend *pend; 488 struct smc_cdc_tx_pend *pend;
@@ -467,6 +529,37 @@ out_unlock:
467 return rc; 529 return rc;
468} 530}
469 531
532static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
533{
534 struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
535 int rc = 0;
536
537 spin_lock_bh(&conn->send_lock);
538 if (!pflags->urg_data_present)
539 rc = smc_tx_rdma_writes(conn);
540 if (!rc)
541 rc = smcd_cdc_msg_send(conn);
542
543 if (!rc && pflags->urg_data_present) {
544 pflags->urg_data_pending = 0;
545 pflags->urg_data_present = 0;
546 }
547 spin_unlock_bh(&conn->send_lock);
548 return rc;
549}
550
551int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
552{
553 int rc;
554
555 if (conn->lgr->is_smcd)
556 rc = smcd_tx_sndbuf_nonempty(conn);
557 else
558 rc = smcr_tx_sndbuf_nonempty(conn);
559
560 return rc;
561}
562
470/* Wakeup sndbuf consumers from process context 563/* Wakeup sndbuf consumers from process context
471 * since there is more data to transmit 564 * since there is more data to transmit
472 */ 565 */
@@ -495,21 +588,23 @@ out:
495 588
496void smc_tx_consumer_update(struct smc_connection *conn, bool force) 589void smc_tx_consumer_update(struct smc_connection *conn, bool force)
497{ 590{
498 union smc_host_cursor cfed, cons; 591 union smc_host_cursor cfed, cons, prod;
592 int sender_free = conn->rmb_desc->len;
499 int to_confirm; 593 int to_confirm;
500 594
501 smc_curs_write(&cons, 595 smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
502 smc_curs_read(&conn->local_tx_ctrl.cons, conn), 596 smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn);
503 conn);
504 smc_curs_write(&cfed,
505 smc_curs_read(&conn->rx_curs_confirmed, conn),
506 conn);
507 to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); 597 to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
598 if (to_confirm > conn->rmbe_update_limit) {
599 smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn);
600 sender_free = conn->rmb_desc->len -
601 smc_curs_diff(conn->rmb_desc->len, &prod, &cfed);
602 }
508 603
509 if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || 604 if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
510 force || 605 force ||
511 ((to_confirm > conn->rmbe_update_limit) && 606 ((to_confirm > conn->rmbe_update_limit) &&
512 ((to_confirm > (conn->rmb_desc->len / 2)) || 607 ((sender_free <= (conn->rmb_desc->len / 2)) ||
513 conn->local_rx_ctrl.prod_flags.write_blocked))) { 608 conn->local_rx_ctrl.prod_flags.write_blocked))) {
514 if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && 609 if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
515 conn->alert_token_local) { /* connection healthy */ 610 conn->alert_token_local) { /* connection healthy */
@@ -517,9 +612,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
517 SMC_TX_WORK_DELAY); 612 SMC_TX_WORK_DELAY);
518 return; 613 return;
519 } 614 }
520 smc_curs_write(&conn->rx_curs_confirmed, 615 smc_curs_copy(&conn->rx_curs_confirmed,
521 smc_curs_read(&conn->local_tx_ctrl.cons, conn), 616 &conn->local_tx_ctrl.cons, conn);
522 conn);
523 conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; 617 conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
524 } 618 }
525 if (conn->local_rx_ctrl.prod_flags.write_blocked && 619 if (conn->local_rx_ctrl.prod_flags.write_blocked &&
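
The smc_tx_consumer_update() hunk above changes the trigger for sending a consumer cursor update: it now derives sender_free, the receive space the peer still believes it has, and forces an update once that drops to half of the RMB or less, instead of keying only on how many bytes are left to confirm. The sketch below shows the wrap-aware cursor arithmetic a smc_curs_diff()-style helper performs; the kernel version also clamps negative differences to zero, which this sketch omits:

#include <stdio.h>

struct curs {
	unsigned short wrap;   /* how many times the ring wrapped */
	unsigned int count;    /* byte offset inside the ring */
};

/* bytes from *old to *cur in a ring of size len, assuming at most one
 * wrap happened between the two snapshots
 */
static unsigned int curs_diff(unsigned int len, const struct curs *old,
			      const struct curs *cur)
{
	if (old->wrap != cur->wrap)
		return len - old->count + cur->count;
	return cur->count - old->count;
}

int main(void)
{
	unsigned int rmb_len = 65536;
	struct curs cfed = { .wrap = 3, .count = 60000 };  /* last confirmed */
	struct curs cons = { .wrap = 4, .count = 1000 };   /* current consumer */

	/* 65536 - 60000 + 1000 = 6536 bytes consumed but not yet confirmed */
	printf("to_confirm=%u\n", curs_diff(rmb_len, &cfed, &cons));
	/* the hunk above derives sender_free the same way, as
	 * rmb_len - curs_diff(rmb_len, &prod, &cfed)
	 */
	return 0;
}
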
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 9d2238909fa0..07e6ad76224a 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -22,8 +22,8 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
22{ 22{
23 union smc_host_cursor sent, prep; 23 union smc_host_cursor sent, prep;
24 24
25 smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); 25 smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
26 smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); 26 smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
27 return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); 27 return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
28} 28}
29 29
@@ -33,5 +33,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
33int smc_tx_sndbuf_nonempty(struct smc_connection *conn); 33int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
34void smc_tx_sndbuf_nonfull(struct smc_sock *smc); 34void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
35void smc_tx_consumer_update(struct smc_connection *conn, bool force); 35void smc_tx_consumer_update(struct smc_connection *conn, bool force);
36int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
37 u32 offset, int signal);
36 38
37#endif /* SMC_TX_H */ 39#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index dbd2605d1962..3c458d279855 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -92,8 +92,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
92 if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) 92 if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
93 return; 93 return;
94 if (wc->status) { 94 if (wc->status) {
95 struct smc_link_group *lgr;
96
97 for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { 95 for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
98 /* clear full struct smc_wr_tx_pend including .priv */ 96 /* clear full struct smc_wr_tx_pend including .priv */
99 memset(&link->wr_tx_pends[i], 0, 97 memset(&link->wr_tx_pends[i], 0,
@@ -103,9 +101,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
103 clear_bit(i, link->wr_tx_mask); 101 clear_bit(i, link->wr_tx_mask);
104 } 102 }
105 /* terminate connections of this link group abnormally */ 103 /* terminate connections of this link group abnormally */
106 lgr = container_of(link, struct smc_link_group, 104 smc_lgr_terminate(smc_get_lgr(link));
107 lnk[SMC_SINGLE_LINK]);
108 smc_lgr_terminate(lgr);
109 } 105 }
110 if (pnd_snd.handler) 106 if (pnd_snd.handler)
111 pnd_snd.handler(&pnd_snd.priv, link, wc->status); 107 pnd_snd.handler(&pnd_snd.priv, link, wc->status);
@@ -186,18 +182,14 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
186 if (rc) 182 if (rc)
187 return rc; 183 return rc;
188 } else { 184 } else {
189 struct smc_link_group *lgr;
190
191 lgr = container_of(link, struct smc_link_group,
192 lnk[SMC_SINGLE_LINK]);
193 rc = wait_event_timeout( 185 rc = wait_event_timeout(
194 link->wr_tx_wait, 186 link->wr_tx_wait,
195 list_empty(&lgr->list) || /* lgr terminated */ 187 link->state == SMC_LNK_INACTIVE ||
196 (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), 188 (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
197 SMC_WR_TX_WAIT_FREE_SLOT_TIME); 189 SMC_WR_TX_WAIT_FREE_SLOT_TIME);
198 if (!rc) { 190 if (!rc) {
199 /* timeout - terminate connections */ 191 /* timeout - terminate connections */
200 smc_lgr_terminate(lgr); 192 smc_lgr_terminate(smc_get_lgr(link));
201 return -EPIPE; 193 return -EPIPE;
202 } 194 }
203 if (idx == link->wr_tx_cnt) 195 if (idx == link->wr_tx_cnt)
@@ -240,22 +232,16 @@ int smc_wr_tx_put_slot(struct smc_link *link,
240 */ 232 */
241int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) 233int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
242{ 234{
243 struct ib_send_wr *failed_wr = NULL;
244 struct smc_wr_tx_pend *pend; 235 struct smc_wr_tx_pend *pend;
245 int rc; 236 int rc;
246 237
247 ib_req_notify_cq(link->smcibdev->roce_cq_send, 238 ib_req_notify_cq(link->smcibdev->roce_cq_send,
248 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); 239 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
249 pend = container_of(priv, struct smc_wr_tx_pend, priv); 240 pend = container_of(priv, struct smc_wr_tx_pend, priv);
250 rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], 241 rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
251 &failed_wr);
252 if (rc) { 242 if (rc) {
253 struct smc_link_group *lgr =
254 container_of(link, struct smc_link_group,
255 lnk[SMC_SINGLE_LINK]);
256
257 smc_wr_tx_put_slot(link, priv); 243 smc_wr_tx_put_slot(link, priv);
258 smc_lgr_terminate(lgr); 244 smc_lgr_terminate(smc_get_lgr(link));
259 } 245 }
260 return rc; 246 return rc;
261} 247}
@@ -263,7 +249,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
263/* Register a memory region and wait for result. */ 249/* Register a memory region and wait for result. */
264int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) 250int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
265{ 251{
266 struct ib_send_wr *failed_wr = NULL;
267 int rc; 252 int rc;
268 253
269 ib_req_notify_cq(link->smcibdev->roce_cq_send, 254 ib_req_notify_cq(link->smcibdev->roce_cq_send,
@@ -272,9 +257,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
272 link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; 257 link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
273 link->wr_reg.mr = mr; 258 link->wr_reg.mr = mr;
274 link->wr_reg.key = mr->rkey; 259 link->wr_reg.key = mr->rkey;
275 failed_wr = &link->wr_reg.wr; 260 rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
276 rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, &failed_wr);
277 WARN_ON(failed_wr != &link->wr_reg.wr);
278 if (rc) 261 if (rc)
279 return rc; 262 return rc;
280 263
@@ -283,11 +266,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
283 SMC_WR_REG_MR_WAIT_TIME); 266 SMC_WR_REG_MR_WAIT_TIME);
284 if (!rc) { 267 if (!rc) {
285 /* timeout - terminate connections */ 268 /* timeout - terminate connections */
286 struct smc_link_group *lgr; 269 smc_lgr_terminate(smc_get_lgr(link));
287
288 lgr = container_of(link, struct smc_link_group,
289 lnk[SMC_SINGLE_LINK]);
290 smc_lgr_terminate(lgr);
291 return -EPIPE; 270 return -EPIPE;
292 } 271 }
293 if (rc == -ERESTARTSYS) 272 if (rc == -ERESTARTSYS)
@@ -380,8 +359,6 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
380 smc_wr_rx_demultiplex(&wc[i]); 359 smc_wr_rx_demultiplex(&wc[i]);
381 smc_wr_rx_post(link); /* refill WR RX */ 360 smc_wr_rx_post(link); /* refill WR RX */
382 } else { 361 } else {
383 struct smc_link_group *lgr;
384
385 /* handle status errors */ 362 /* handle status errors */
386 switch (wc[i].status) { 363 switch (wc[i].status) {
387 case IB_WC_RETRY_EXC_ERR: 364 case IB_WC_RETRY_EXC_ERR:
@@ -390,9 +367,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
390 /* terminate connections of this link group 367 /* terminate connections of this link group
391 * abnormally 368 * abnormally
392 */ 369 */
393 lgr = container_of(link, struct smc_link_group, 370 smc_lgr_terminate(smc_get_lgr(link));
394 lnk[SMC_SINGLE_LINK]);
395 smc_lgr_terminate(lgr);
396 break; 371 break;
397 default: 372 default:
398 smc_wr_rx_post(link); /* refill WR RX */ 373 smc_wr_rx_post(link); /* refill WR RX */
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 210bec3c3ebe..1d85bb14fd6f 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -63,7 +63,6 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
63/* post a new receive work request to fill a completed old work request entry */ 63/* post a new receive work request to fill a completed old work request entry */
64static inline int smc_wr_rx_post(struct smc_link *link) 64static inline int smc_wr_rx_post(struct smc_link *link)
65{ 65{
66 struct ib_recv_wr *bad_recv_wr = NULL;
67 int rc; 66 int rc;
68 u64 wr_id, temp_wr_id; 67 u64 wr_id, temp_wr_id;
69 u32 index; 68 u32 index;
@@ -72,7 +71,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
72 temp_wr_id = wr_id; 71 temp_wr_id = wr_id;
73 index = do_div(temp_wr_id, link->wr_rx_cnt); 72 index = do_div(temp_wr_id, link->wr_rx_cnt);
74 link->wr_rx_ibs[index].wr_id = wr_id; 73 link->wr_rx_ibs[index].wr_id = wr_id;
75 rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr); 74 rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL);
76 return rc; 75 return rc;
77} 76}
78 77
diff --git a/net/socket.c b/net/socket.c
index 8a109012608a..e6945e318f02 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -89,6 +89,7 @@
89#include <linux/magic.h> 89#include <linux/magic.h>
90#include <linux/slab.h> 90#include <linux/slab.h>
91#include <linux/xattr.h> 91#include <linux/xattr.h>
92#include <linux/nospec.h>
92 93
93#include <linux/uaccess.h> 94#include <linux/uaccess.h>
94#include <asm/unistd.h> 95#include <asm/unistd.h>
@@ -117,10 +118,8 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
117static int sock_mmap(struct file *file, struct vm_area_struct *vma); 118static int sock_mmap(struct file *file, struct vm_area_struct *vma);
118 119
119static int sock_close(struct inode *inode, struct file *file); 120static int sock_close(struct inode *inode, struct file *file);
120static struct wait_queue_head *sock_get_poll_head(struct file *file, 121static __poll_t sock_poll(struct file *file,
121 __poll_t events); 122 struct poll_table_struct *wait);
122static __poll_t sock_poll_mask(struct file *file, __poll_t);
123static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait);
124static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 123static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
125#ifdef CONFIG_COMPAT 124#ifdef CONFIG_COMPAT
126static long compat_sock_ioctl(struct file *file, 125static long compat_sock_ioctl(struct file *file,
@@ -143,8 +142,6 @@ static const struct file_operations socket_file_ops = {
143 .llseek = no_llseek, 142 .llseek = no_llseek,
144 .read_iter = sock_read_iter, 143 .read_iter = sock_read_iter,
145 .write_iter = sock_write_iter, 144 .write_iter = sock_write_iter,
146 .get_poll_head = sock_get_poll_head,
147 .poll_mask = sock_poll_mask,
148 .poll = sock_poll, 145 .poll = sock_poll,
149 .unlocked_ioctl = sock_ioctl, 146 .unlocked_ioctl = sock_ioctl,
150#ifdef CONFIG_COMPAT 147#ifdef CONFIG_COMPAT
@@ -255,7 +252,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
255 init_waitqueue_head(&wq->wait); 252 init_waitqueue_head(&wq->wait);
256 wq->fasync_list = NULL; 253 wq->fasync_list = NULL;
257 wq->flags = 0; 254 wq->flags = 0;
258 RCU_INIT_POINTER(ei->socket.wq, wq); 255 ei->socket.wq = wq;
259 256
260 ei->socket.state = SS_UNCONNECTED; 257 ei->socket.state = SS_UNCONNECTED;
261 ei->socket.flags = 0; 258 ei->socket.flags = 0;
@@ -269,11 +266,9 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
269static void sock_destroy_inode(struct inode *inode) 266static void sock_destroy_inode(struct inode *inode)
270{ 267{
271 struct socket_alloc *ei; 268 struct socket_alloc *ei;
272 struct socket_wq *wq;
273 269
274 ei = container_of(inode, struct socket_alloc, vfs_inode); 270 ei = container_of(inode, struct socket_alloc, vfs_inode);
275 wq = rcu_dereference_protected(ei->socket.wq, 1); 271 kfree_rcu(ei->socket.wq, rcu);
276 kfree_rcu(wq, rcu);
277 kmem_cache_free(sock_inode_cachep, ei); 272 kmem_cache_free(sock_inode_cachep, ei);
278} 273}
279 274
@@ -391,39 +386,20 @@ static struct file_system_type sock_fs_type = {
391 386
392struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) 387struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
393{ 388{
394 struct qstr name = { .name = "" };
395 struct path path;
396 struct file *file; 389 struct file *file;
397 390
398 if (dname) { 391 if (!dname)
399 name.name = dname; 392 dname = sock->sk ? sock->sk->sk_prot_creator->name : "";
400 name.len = strlen(name.name);
401 } else if (sock->sk) {
402 name.name = sock->sk->sk_prot_creator->name;
403 name.len = strlen(name.name);
404 }
405 path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
406 if (unlikely(!path.dentry)) {
407 sock_release(sock);
408 return ERR_PTR(-ENOMEM);
409 }
410 path.mnt = mntget(sock_mnt);
411
412 d_instantiate(path.dentry, SOCK_INODE(sock));
413 393
414 file = alloc_file(&path, FMODE_READ | FMODE_WRITE, 394 file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname,
415 &socket_file_ops); 395 O_RDWR | (flags & O_NONBLOCK),
396 &socket_file_ops);
416 if (IS_ERR(file)) { 397 if (IS_ERR(file)) {
417 /* drop dentry, keep inode for a bit */
418 ihold(d_inode(path.dentry));
419 path_put(&path);
420 /* ... and now kill it properly */
421 sock_release(sock); 398 sock_release(sock);
422 return file; 399 return file;
423 } 400 }
424 401
425 sock->file = file; 402 sock->file = file;
426 file->f_flags = O_RDWR | (flags & O_NONBLOCK);
427 file->private_data = sock; 403 file->private_data = sock;
428 return file; 404 return file;
429} 405}
@@ -607,7 +583,7 @@ static void __sock_release(struct socket *sock, struct inode *inode)
607 module_put(owner); 583 module_put(owner);
608 } 584 }
609 585
610 if (rcu_dereference_protected(sock->wq, 1)->fasync_list) 586 if (sock->wq->fasync_list)
611 pr_err("%s: fasync list not empty!\n", __func__); 587 pr_err("%s: fasync list not empty!\n", __func__);
612 588
613 if (!sock->file) { 589 if (!sock->file) {
@@ -1130,48 +1106,25 @@ out_release:
1130} 1106}
1131EXPORT_SYMBOL(sock_create_lite); 1107EXPORT_SYMBOL(sock_create_lite);
1132 1108
1133static struct wait_queue_head *sock_get_poll_head(struct file *file,
1134 __poll_t events)
1135{
1136 struct socket *sock = file->private_data;
1137
1138 if (!sock->ops->poll_mask)
1139 return NULL;
1140 sock_poll_busy_loop(sock, events);
1141 return sk_sleep(sock->sk);
1142}
1143
1144static __poll_t sock_poll_mask(struct file *file, __poll_t events)
1145{
1146 struct socket *sock = file->private_data;
1147
1148 /*
1149 * We need to be sure we are in sync with the socket flags modification.
1150 *
1151 * This memory barrier is paired in the wq_has_sleeper.
1152 */
1153 smp_mb();
1154
1155 /* this socket can poll_ll so tell the system call */
1156 return sock->ops->poll_mask(sock, events) |
1157 (sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0);
1158}
1159
1160/* No kernel lock held - perfect */ 1109/* No kernel lock held - perfect */
1161static __poll_t sock_poll(struct file *file, poll_table *wait) 1110static __poll_t sock_poll(struct file *file, poll_table *wait)
1162{ 1111{
1163 struct socket *sock = file->private_data; 1112 struct socket *sock = file->private_data;
1164 __poll_t events = poll_requested_events(wait), mask = 0; 1113 __poll_t events = poll_requested_events(wait), flag = 0;
1114
1115 if (!sock->ops->poll)
1116 return 0;
1117
1118 if (sk_can_busy_loop(sock->sk)) {
1119 /* poll once if requested by the syscall */
1120 if (events & POLL_BUSY_LOOP)
1121 sk_busy_loop(sock->sk, 1);
1165 1122
1166 if (sock->ops->poll) { 1123 /* if this socket can poll_ll, tell the system call */
1167 sock_poll_busy_loop(sock, events); 1124 flag = POLL_BUSY_LOOP;
1168 mask = sock->ops->poll(file, sock, wait);
1169 } else if (sock->ops->poll_mask) {
1170 sock_poll_wait(file, sock_get_poll_head(file, events), wait);
1171 mask = sock->ops->poll_mask(sock, events);
1172 } 1125 }
1173 1126
1174 return mask | sock_poll_busy_flag(sock); 1127 return sock->ops->poll(file, sock, wait) | flag;
1175} 1128}
1176 1129
1177static int sock_mmap(struct file *file, struct vm_area_struct *vma) 1130static int sock_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1208,7 +1161,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
1208 return -EINVAL; 1161 return -EINVAL;
1209 1162
1210 lock_sock(sk); 1163 lock_sock(sk);
1211 wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk)); 1164 wq = sock->wq;
1212 fasync_helper(fd, filp, on, &wq->fasync_list); 1165 fasync_helper(fd, filp, on, &wq->fasync_list);
1213 1166
1214 if (!wq->fasync_list) 1167 if (!wq->fasync_list)
@@ -2558,6 +2511,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
2558 2511
2559 if (call < 1 || call > SYS_SENDMMSG) 2512 if (call < 1 || call > SYS_SENDMMSG)
2560 return -EINVAL; 2513 return -EINVAL;
2514 call = array_index_nospec(call, SYS_SENDMMSG + 1);
2561 2515
2562 len = nargs[call]; 2516 len = nargs[call];
2563 if (len > sizeof(a)) 2517 if (len > sizeof(a))
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 1a9695183599..da1a676860ca 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -35,7 +35,6 @@ struct _strp_msg {
35 */ 35 */
36 struct strp_msg strp; 36 struct strp_msg strp;
37 int accum_len; 37 int accum_len;
38 int early_eaten;
39}; 38};
40 39
41static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) 40static inline struct _strp_msg *_strp_msg(struct sk_buff *skb)
@@ -115,20 +114,6 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
115 head = strp->skb_head; 114 head = strp->skb_head;
116 if (head) { 115 if (head) {
117 /* Message already in progress */ 116 /* Message already in progress */
118
119 stm = _strp_msg(head);
120 if (unlikely(stm->early_eaten)) {
121 /* Already some number of bytes on the receive sock
122 * data saved in skb_head, just indicate they
123 * are consumed.
124 */
125 eaten = orig_len <= stm->early_eaten ?
126 orig_len : stm->early_eaten;
127 stm->early_eaten -= eaten;
128
129 return eaten;
130 }
131
132 if (unlikely(orig_offset)) { 117 if (unlikely(orig_offset)) {
133 /* Getting data with a non-zero offset when a message is 118 /* Getting data with a non-zero offset when a message is
134 * in progress is not expected. If it does happen, we 119 * in progress is not expected. If it does happen, we
@@ -155,11 +140,13 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
155 /* We are going to append to the frags_list of head. 140 /* We are going to append to the frags_list of head.
156 * Need to unshare the frag_list. 141 * Need to unshare the frag_list.
157 */ 142 */
158 err = skb_unclone(head, GFP_ATOMIC); 143 if (skb_has_frag_list(head)) {
159 if (err) { 144 err = skb_unclone(head, GFP_ATOMIC);
160 STRP_STATS_INCR(strp->stats.mem_fail); 145 if (err) {
161 desc->error = err; 146 STRP_STATS_INCR(strp->stats.mem_fail);
162 return 0; 147 desc->error = err;
148 return 0;
149 }
163 } 150 }
164 151
165 if (unlikely(skb_shinfo(head)->frag_list)) { 152 if (unlikely(skb_shinfo(head)->frag_list)) {
@@ -216,14 +203,16 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
216 memset(stm, 0, sizeof(*stm)); 203 memset(stm, 0, sizeof(*stm));
217 stm->strp.offset = orig_offset + eaten; 204 stm->strp.offset = orig_offset + eaten;
218 } else { 205 } else {
219 /* Unclone since we may be appending to an skb that we 206 /* Unclone if we are appending to an skb that we
220 * already share a frag_list with. 207 * already share a frag_list with.
221 */ 208 */
222 err = skb_unclone(skb, GFP_ATOMIC); 209 if (skb_has_frag_list(skb)) {
223 if (err) { 210 err = skb_unclone(skb, GFP_ATOMIC);
224 STRP_STATS_INCR(strp->stats.mem_fail); 211 if (err) {
225 desc->error = err; 212 STRP_STATS_INCR(strp->stats.mem_fail);
226 break; 213 desc->error = err;
214 break;
215 }
227 } 216 }
228 217
229 stm = _strp_msg(head); 218 stm = _strp_msg(head);
@@ -297,9 +286,9 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
297 } 286 }
298 287
299 stm->accum_len += cand_len; 288 stm->accum_len += cand_len;
289 eaten += cand_len;
300 strp->need_bytes = stm->strp.full_len - 290 strp->need_bytes = stm->strp.full_len -
301 stm->accum_len; 291 stm->accum_len;
302 stm->early_eaten = cand_len;
303 STRP_STATS_ADD(strp->stats.bytes, cand_len); 292 STRP_STATS_ADD(strp->stats.bytes, cand_len);
304 desc->count = 0; /* Stop reading socket */ 293 desc->count = 0; /* Stop reading socket */
305 break; 294 break;
@@ -392,7 +381,7 @@ static int strp_read_sock(struct strparser *strp)
392/* Lower sock lock held */ 381/* Lower sock lock held */
393void strp_data_ready(struct strparser *strp) 382void strp_data_ready(struct strparser *strp)
394{ 383{
395 if (unlikely(strp->stopped)) 384 if (unlikely(strp->stopped) || strp->paused)
396 return; 385 return;
397 386
398 /* This check is needed to synchronize with do_strp_work. 387 /* This check is needed to synchronize with do_strp_work.
@@ -407,9 +396,6 @@ void strp_data_ready(struct strparser *strp)
407 return; 396 return;
408 } 397 }
409 398
410 if (strp->paused)
411 return;
412
413 if (strp->need_bytes) { 399 if (strp->need_bytes) {
414 if (strp_peek_len(strp) < strp->need_bytes) 400 if (strp_peek_len(strp) < strp->need_bytes)
415 return; 401 return;
@@ -422,8 +408,6 @@ EXPORT_SYMBOL_GPL(strp_data_ready);
422 408
423static void do_strp_work(struct strparser *strp) 409static void do_strp_work(struct strparser *strp)
424{ 410{
425 read_descriptor_t rd_desc;
426
427 /* We need the read lock to synchronize with strp_data_ready. We 411 /* We need the read lock to synchronize with strp_data_ready. We
428 * need the socket lock for calling strp_read_sock. 412 * need the socket lock for calling strp_read_sock.
429 */ 413 */
@@ -435,8 +419,6 @@ static void do_strp_work(struct strparser *strp)
435 if (strp->paused) 419 if (strp->paused)
436 goto out; 420 goto out;
437 421
438 rd_desc.arg.data = strp;
439
440 if (strp_read_sock(strp) == -ENOMEM) 422 if (strp_read_sock(strp) == -ENOMEM)
441 queue_work(strp_wq, &strp->work); 423 queue_work(strp_wq, &strp->work);
442 424
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index d2623b9f23d6..305ecea92170 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -50,7 +50,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
50 if (!val) 50 if (!val)
51 goto out_inval; 51 goto out_inval;
52 ret = kstrtoul(val, 0, &num); 52 ret = kstrtoul(val, 0, &num);
53 if (ret == -EINVAL) 53 if (ret)
54 goto out_inval; 54 goto out_inval;
55 nbits = fls(num - 1); 55 nbits = fls(num - 1);
56 if (nbits > MAX_HASHTABLE_BITS || nbits < 2) 56 if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
@@ -253,7 +253,7 @@ rpcauth_list_flavors(rpc_authflavor_t *array, int size)
253EXPORT_SYMBOL_GPL(rpcauth_list_flavors); 253EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
254 254
255struct rpc_auth * 255struct rpc_auth *
256rpcauth_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) 256rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
257{ 257{
258 struct rpc_auth *auth; 258 struct rpc_auth *auth;
259 const struct rpc_authops *ops; 259 const struct rpc_authops *ops;
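
The param_set_hashtbl_sz() fix above stops special-casing -EINVAL: kstrtoul() can also fail with -ERANGE, so any nonzero return is now treated as invalid input. The same rule in a user-space analog, written here with strtoul() and errno rather than the kernel helper:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* parse an unsigned long, rejecting both malformed and out-of-range input */
static int parse_ulong(const char *val, unsigned long *out)
{
	char *end;

	errno = 0;
	*out = strtoul(val, &end, 0);
	if (end == val || *end != '\0')
		return -EINVAL;		/* no digits, or trailing junk */
	if (errno == ERANGE)
		return -ERANGE;		/* value does not fit */
	return 0;
}

int main(void)
{
	const char *samples[] = { "1024", "abc", "999999999999999999999999" };
	unsigned long num;

	for (int i = 0; i < 3; i++) {
		int ret = parse_ulong(samples[i], &num);

		if (ret)		/* any error, not just -EINVAL */
			printf("%s -> error %d\n", samples[i], ret);
		else
			printf("%s -> %lu\n", samples[i], num);
	}
	return 0;
}
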
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index be8f103d22fd..21c0aa0a0d1d 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -284,7 +284,12 @@ err:
284 return p; 284 return p;
285} 285}
286 286
287#define UPCALL_BUF_LEN 128 287/* XXX: Need some documentation about why UPCALL_BUF_LEN is so small.
288 * Is user space expecting no more than UPCALL_BUF_LEN bytes?
289 * Note that there are now _two_ NI_MAXHOST sized data items
290 * being passed in this string.
291 */
292#define UPCALL_BUF_LEN 256
288 293
289struct gss_upcall_msg { 294struct gss_upcall_msg {
290 refcount_t count; 295 refcount_t count;
@@ -456,18 +461,44 @@ static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
456 buflen -= len; 461 buflen -= len;
457 p += len; 462 p += len;
458 gss_msg->msg.len = len; 463 gss_msg->msg.len = len;
464
465 /*
466 * target= is a full service principal that names the remote
467 * identity that we are authenticating to.
468 */
459 if (target_name) { 469 if (target_name) {
460 len = scnprintf(p, buflen, "target=%s ", target_name); 470 len = scnprintf(p, buflen, "target=%s ", target_name);
461 buflen -= len; 471 buflen -= len;
462 p += len; 472 p += len;
463 gss_msg->msg.len += len; 473 gss_msg->msg.len += len;
464 } 474 }
465 if (service_name != NULL) { 475
466 len = scnprintf(p, buflen, "service=%s ", service_name); 476 /*
477 * gssd uses service= and srchost= to select a matching key from
478 * the system's keytab to use as the source principal.
479 *
480 * service= is the service name part of the source principal,
481 * or "*" (meaning choose any).
482 *
483 * srchost= is the hostname part of the source principal. When
484 * not provided, gssd uses the local hostname.
485 */
486 if (service_name) {
487 char *c = strchr(service_name, '@');
488
489 if (!c)
490 len = scnprintf(p, buflen, "service=%s ",
491 service_name);
492 else
493 len = scnprintf(p, buflen,
494 "service=%.*s srchost=%s ",
495 (int)(c - service_name),
496 service_name, c + 1);
467 buflen -= len; 497 buflen -= len;
468 p += len; 498 p += len;
469 gss_msg->msg.len += len; 499 gss_msg->msg.len += len;
470 } 500 }
501
471 if (mech->gm_upcall_enctypes) { 502 if (mech->gm_upcall_enctypes) {
472 len = scnprintf(p, buflen, "enctypes=%s ", 503 len = scnprintf(p, buflen, "enctypes=%s ",
473 mech->gm_upcall_enctypes); 504 mech->gm_upcall_enctypes);
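
With the gss_encode_v1_msg() change above, a service_name of the form "service@hostname" is passed to gssd as separate service= and srchost= keys, while a plain name still goes out as service= alone. A standalone sketch of that split follows, with printf() standing in for the scnprintf() calls into the upcall buffer and an invented hostname:

#include <stdio.h>
#include <string.h>

/* emit the gssd upcall keys for a source-principal hint: "nfs" becomes
 * "service=nfs ", while "nfs@client.example.com" becomes
 * "service=nfs srchost=client.example.com "
 */
static void emit_service(const char *service_name)
{
	const char *c = strchr(service_name, '@');

	if (!c)
		printf("service=%s \n", service_name);
	else
		printf("service=%.*s srchost=%s \n",
		       (int)(c - service_name), service_name, c + 1);
}

int main(void)
{
	emit_service("nfs");
	emit_service("nfs@client.example.com");
	return 0;
}
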
@@ -517,7 +548,7 @@ gss_alloc_msg(struct gss_auth *gss_auth,
517 err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name); 548 err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name);
518 if (err) 549 if (err)
519 goto err_put_pipe_version; 550 goto err_put_pipe_version;
520 }; 551 }
521 kref_get(&gss_auth->kref); 552 kref_get(&gss_auth->kref);
522 return gss_msg; 553 return gss_msg;
523err_put_pipe_version: 554err_put_pipe_version:
@@ -985,7 +1016,7 @@ static void gss_pipe_free(struct gss_pipe *p)
985 * parameters based on the input flavor (which must be a pseudoflavor) 1016 * parameters based on the input flavor (which must be a pseudoflavor)
986 */ 1017 */
987static struct gss_auth * 1018static struct gss_auth *
988gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) 1019gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
989{ 1020{
990 rpc_authflavor_t flavor = args->pseudoflavor; 1021 rpc_authflavor_t flavor = args->pseudoflavor;
991 struct gss_auth *gss_auth; 1022 struct gss_auth *gss_auth;
@@ -1132,7 +1163,7 @@ gss_destroy(struct rpc_auth *auth)
1132 * (which is guaranteed to last as long as any of its descendants). 1163 * (which is guaranteed to last as long as any of its descendants).
1133 */ 1164 */
1134static struct gss_auth * 1165static struct gss_auth *
1135gss_auth_find_or_add_hashed(struct rpc_auth_create_args *args, 1166gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args,
1136 struct rpc_clnt *clnt, 1167 struct rpc_clnt *clnt,
1137 struct gss_auth *new) 1168 struct gss_auth *new)
1138{ 1169{
@@ -1169,7 +1200,8 @@ out:
1169} 1200}
1170 1201
1171static struct gss_auth * 1202static struct gss_auth *
1172gss_create_hashed(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) 1203gss_create_hashed(const struct rpc_auth_create_args *args,
1204 struct rpc_clnt *clnt)
1173{ 1205{
1174 struct gss_auth *gss_auth; 1206 struct gss_auth *gss_auth;
1175 struct gss_auth *new; 1207 struct gss_auth *new;
@@ -1188,7 +1220,7 @@ out:
1188} 1220}
1189 1221
1190static struct rpc_auth * 1222static struct rpc_auth *
1191gss_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) 1223gss_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
1192{ 1224{
1193 struct gss_auth *gss_auth; 1225 struct gss_auth *gss_auth;
1194 struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch); 1226 struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch);
@@ -1571,7 +1603,7 @@ static int gss_cred_is_negative_entry(struct rpc_cred *cred)
1571 if (test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) { 1603 if (test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) {
1572 unsigned long now = jiffies; 1604 unsigned long now = jiffies;
1573 unsigned long begin, expire; 1605 unsigned long begin, expire;
1574 struct gss_cred *gss_cred; 1606 struct gss_cred *gss_cred;
1575 1607
1576 gss_cred = container_of(cred, struct gss_cred, gc_base); 1608 gss_cred = container_of(cred, struct gss_cred, gc_base);
1577 begin = gss_cred->gc_upcall_timestamp; 1609 begin = gss_cred->gc_upcall_timestamp;
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
index 254defe446a7..fe97f3106536 100644
--- a/net/sunrpc/auth_gss/gss_generic_token.c
+++ b/net/sunrpc/auth_gss/gss_generic_token.c
@@ -231,4 +231,3 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
231} 231}
232 232
233EXPORT_SYMBOL_GPL(g_verify_token_header); 233EXPORT_SYMBOL_GPL(g_verify_token_header);
234
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 8654494b4d0a..0220e1ca5280 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -169,7 +169,7 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
169 struct scatterlist sg[1]; 169 struct scatterlist sg[1];
170 int err = -1; 170 int err = -1;
171 u8 *checksumdata; 171 u8 *checksumdata;
172 u8 rc4salt[4]; 172 u8 *rc4salt;
173 struct crypto_ahash *md5; 173 struct crypto_ahash *md5;
174 struct crypto_ahash *hmac_md5; 174 struct crypto_ahash *hmac_md5;
175 struct ahash_request *req; 175 struct ahash_request *req;
@@ -183,14 +183,18 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
183 return GSS_S_FAILURE; 183 return GSS_S_FAILURE;
184 } 184 }
185 185
186 rc4salt = kmalloc_array(4, sizeof(*rc4salt), GFP_NOFS);
187 if (!rc4salt)
188 return GSS_S_FAILURE;
189
186 if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) { 190 if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) {
187 dprintk("%s: invalid usage value %u\n", __func__, usage); 191 dprintk("%s: invalid usage value %u\n", __func__, usage);
188 return GSS_S_FAILURE; 192 goto out_free_rc4salt;
189 } 193 }
190 194
191 checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS); 195 checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
192 if (!checksumdata) 196 if (!checksumdata)
193 return GSS_S_FAILURE; 197 goto out_free_rc4salt;
194 198
195 md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); 199 md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
196 if (IS_ERR(md5)) 200 if (IS_ERR(md5))
@@ -258,6 +262,8 @@ out_free_md5:
258 crypto_free_ahash(md5); 262 crypto_free_ahash(md5);
259out_free_cksum: 263out_free_cksum:
260 kfree(checksumdata); 264 kfree(checksumdata);
265out_free_rc4salt:
266 kfree(rc4salt);
261 return err ? GSS_S_FAILURE : 0; 267 return err ? GSS_S_FAILURE : 0;
262} 268}
263 269
@@ -373,7 +379,6 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
373 struct scatterlist sg[1]; 379 struct scatterlist sg[1];
374 int err = -1; 380 int err = -1;
375 u8 *checksumdata; 381 u8 *checksumdata;
376 unsigned int checksumlen;
377 382
378 if (kctx->gk5e->keyed_cksum == 0) { 383 if (kctx->gk5e->keyed_cksum == 0) {
379 dprintk("%s: expected keyed hash for %s\n", 384 dprintk("%s: expected keyed hash for %s\n",
@@ -393,7 +398,6 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
393 tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); 398 tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
394 if (IS_ERR(tfm)) 399 if (IS_ERR(tfm))
395 goto out_free_cksum; 400 goto out_free_cksum;
396 checksumlen = crypto_ahash_digestsize(tfm);
397 401
398 req = ahash_request_alloc(tfm, GFP_NOFS); 402 req = ahash_request_alloc(tfm, GFP_NOFS);
399 if (!req) 403 if (!req)
@@ -1077,4 +1081,3 @@ out_err:
1077 dprintk("%s: returning %d\n", __func__, err); 1081 dprintk("%s: returning %d\n", __func__, err);
1078 return err; 1082 return err;
1079} 1083}
1080
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 870133146026..f7fe2d2b851f 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -324,4 +324,3 @@ u32 gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e,
324err_out: 324err_out:
325 return ret; 325 return ret;
326} 326}
327
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 94a2b3f082a8..eaad9bc7a0bd 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -229,4 +229,3 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
229 return gss_get_mic_v2(ctx, text, token); 229 return gss_get_mic_v2(ctx, text, token);
230 } 230 }
231} 231}
232
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index b601a73cc9db..ef2b25b86d2f 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -225,4 +225,3 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
225 return gss_verify_mic_v2(ctx, message_buffer, read_token); 225 return gss_verify_mic_v2(ctx, message_buffer, read_token);
226 } 226 }
227} 227}
228
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index a737c2da0837..39a2e672900b 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -440,7 +440,6 @@ static u32
440gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, 440gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
441 struct xdr_buf *buf, struct page **pages) 441 struct xdr_buf *buf, struct page **pages)
442{ 442{
443 int blocksize;
444 u8 *ptr, *plainhdr; 443 u8 *ptr, *plainhdr;
445 s32 now; 444 s32 now;
446 u8 flags = 0x00; 445 u8 flags = 0x00;
@@ -473,7 +472,6 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
473 *ptr++ = 0xff; 472 *ptr++ = 0xff;
474 be16ptr = (__be16 *)ptr; 473 be16ptr = (__be16 *)ptr;
475 474
476 blocksize = crypto_skcipher_blocksize(kctx->acceptor_enc);
477 *be16ptr++ = 0; 475 *be16ptr++ = 0;
478 /* "inner" token header always uses 0 for RRC */ 476 /* "inner" token header always uses 0 for RRC */
479 *be16ptr++ = 0; 477 *be16ptr++ = 0;
@@ -623,4 +621,3 @@ gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf)
623 return gss_unwrap_kerberos_v2(kctx, offset, buf); 621 return gss_unwrap_kerberos_v2(kctx, offset, buf);
624 } 622 }
625} 623}
626
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index 1c7c49dbf8ba..73dcda060335 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -234,6 +234,35 @@ static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg)
234 return 0; 234 return 0;
235} 235}
236 236
237static char *gssp_stringify(struct xdr_netobj *netobj)
238{
239 return kstrndup(netobj->data, netobj->len, GFP_KERNEL);
240}
241
242static void gssp_hostbased_service(char **principal)
243{
244 char *c;
245
246 if (!*principal)
247 return;
248
249 /* terminate and remove realm part */
250 c = strchr(*principal, '@');
251 if (c) {
252 *c = '\0';
253
254 /* change service-hostname delimiter */
255 c = strchr(*principal, '/');
256 if (c)
257 *c = '@';
258 }
259 if (!c) {
260 /* not a service principal */
261 kfree(*principal);
262 *principal = NULL;
263 }
264}
265
237/* 266/*
238 * Public functions 267 * Public functions
239 */ 268 */
@@ -262,6 +291,7 @@ int gssp_accept_sec_context_upcall(struct net *net,
262 */ 291 */
263 .exported_context_token.len = GSSX_max_output_handle_sz, 292 .exported_context_token.len = GSSX_max_output_handle_sz,
264 .mech.len = GSS_OID_MAX_LEN, 293 .mech.len = GSS_OID_MAX_LEN,
294 .targ_name.display_name.len = GSSX_max_princ_sz,
265 .src_name.display_name.len = GSSX_max_princ_sz 295 .src_name.display_name.len = GSSX_max_princ_sz
266 }; 296 };
267 struct gssx_res_accept_sec_context res = { 297 struct gssx_res_accept_sec_context res = {
@@ -275,6 +305,7 @@ int gssp_accept_sec_context_upcall(struct net *net,
275 .rpc_cred = NULL, /* FIXME ? */ 305 .rpc_cred = NULL, /* FIXME ? */
276 }; 306 };
277 struct xdr_netobj client_name = { 0 , NULL }; 307 struct xdr_netobj client_name = { 0 , NULL };
308 struct xdr_netobj target_name = { 0, NULL };
278 int ret; 309 int ret;
279 310
280 if (data->in_handle.len != 0) 311 if (data->in_handle.len != 0)
@@ -285,8 +316,6 @@ int gssp_accept_sec_context_upcall(struct net *net,
285 if (ret) 316 if (ret)
286 return ret; 317 return ret;
287 318
288 /* use nfs/ for targ_name ? */
289
290 ret = gssp_call(net, &msg); 319 ret = gssp_call(net, &msg);
291 320
292 gssp_free_receive_pages(&arg); 321 gssp_free_receive_pages(&arg);
@@ -304,6 +333,7 @@ int gssp_accept_sec_context_upcall(struct net *net,
304 kfree(rctxh.mech.data); 333 kfree(rctxh.mech.data);
305 } 334 }
306 client_name = rctxh.src_name.display_name; 335 client_name = rctxh.src_name.display_name;
336 target_name = rctxh.targ_name.display_name;
307 } 337 }
308 338
309 if (res.options.count == 1) { 339 if (res.options.count == 1) {
@@ -325,32 +355,22 @@ int gssp_accept_sec_context_upcall(struct net *net,
325 } 355 }
326 356
327 /* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */ 357 /* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */
328 if (data->found_creds && client_name.data != NULL) { 358 if (data->found_creds) {
329 char *c; 359 if (client_name.data) {
330 360 data->creds.cr_raw_principal =
331 data->creds.cr_raw_principal = kstrndup(client_name.data, 361 gssp_stringify(&client_name);
332 client_name.len, GFP_KERNEL); 362 data->creds.cr_principal =
333 363 gssp_stringify(&client_name);
334 data->creds.cr_principal = kstrndup(client_name.data, 364 gssp_hostbased_service(&data->creds.cr_principal);
335 client_name.len, GFP_KERNEL); 365 }
336 if (data->creds.cr_principal) { 366 if (target_name.data) {
337 /* terminate and remove realm part */ 367 data->creds.cr_targ_princ =
338 c = strchr(data->creds.cr_principal, '@'); 368 gssp_stringify(&target_name);
339 if (c) { 369 gssp_hostbased_service(&data->creds.cr_targ_princ);
340 *c = '\0';
341
342 /* change service-hostname delimiter */
343 c = strchr(data->creds.cr_principal, '/');
344 if (c) *c = '@';
345 }
346 if (!c) {
347 /* not a service principal */
348 kfree(data->creds.cr_principal);
349 data->creds.cr_principal = NULL;
350 }
351 } 370 }
352 } 371 }
353 kfree(client_name.data); 372 kfree(client_name.data);
373 kfree(target_name.data);
354 374
355 return ret; 375 return ret;
356} 376}
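
gssp_hostbased_service() above turns a Kerberos service principal such as nfs/server.example.test@REALM into the GSS_NT_HOSTBASED_SERVICE form nfs@server.example.test and drops anything that is not a service principal. A user-space rendering of the same string surgery is sketched below; unlike the kernel helper, it returns the result instead of rewriting a char ** in place:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* in place: "svc/host@REALM" -> "svc@host"; frees the buffer and returns
 * NULL when the input is not a service principal, as the helper above does
 */
static char *hostbased_service(char *principal)
{
	char *c;

	if (!principal)
		return NULL;
	c = strchr(principal, '@');
	if (c) {
		*c = '\0';			/* drop the realm */
		c = strchr(principal, '/');
		if (c)
			*c = '@';		/* service/host -> service@host */
	}
	if (!c) {				/* no realm or no '/' part */
		free(principal);
		return NULL;
	}
	return principal;
}

int main(void)
{
	char *p = hostbased_service(strdup("nfs/server.example.test@REALM"));
	char *q = hostbased_service(strdup("someuser@REALM"));

	printf("%s\n", p ? p : "(not a service principal)");
	printf("%s\n", q ? q : "(not a service principal)");
	free(p);
	return 0;
}
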
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 5089dbb96d58..860f2a1bbb67 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1389,7 +1389,7 @@ static void destroy_use_gss_proxy_proc_entry(struct net *net)
1389 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); 1389 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
1390 1390
1391 if (sn->use_gssp_proc) { 1391 if (sn->use_gssp_proc) {
1392 remove_proc_entry("use-gss-proxy", sn->proc_net_rpc); 1392 remove_proc_entry("use-gss-proxy", sn->proc_net_rpc);
1393 clear_gssp_clnt(sn); 1393 clear_gssp_clnt(sn);
1394 } 1394 }
1395} 1395}
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 75d72e109a04..4b48228ee8c7 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -19,7 +19,7 @@ static struct rpc_auth null_auth;
19static struct rpc_cred null_cred; 19static struct rpc_cred null_cred;
20 20
21static struct rpc_auth * 21static struct rpc_auth *
22nul_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) 22nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
23{ 23{
24 atomic_inc(&null_auth.au_count); 24 atomic_inc(&null_auth.au_count);
25 return &null_auth; 25 return &null_auth;
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index dafd6b870ba3..185e56d4f9ae 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -30,7 +30,7 @@ static struct rpc_auth unix_auth;
30static const struct rpc_credops unix_credops; 30static const struct rpc_credops unix_credops;
31 31
32static struct rpc_auth * 32static struct rpc_auth *
33unx_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) 33unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
34{ 34{
35 dprintk("RPC: creating UNIX authenticator for client %p\n", 35 dprintk("RPC: creating UNIX authenticator for client %p\n",
36 clnt); 36 clnt);
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index c2c68a15b59d..3c15a99b9700 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -362,4 +362,3 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
362 wake_up(&bc_serv->sv_cb_waitq); 362 wake_up(&bc_serv->sv_cb_waitq);
363 spin_unlock(&bc_serv->sv_cb_lock); 363 spin_unlock(&bc_serv->sv_cb_lock);
364} 364}
365
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index d839c33ae7d9..8ea2f5fadd96 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -892,7 +892,7 @@ rpc_free_client(struct rpc_clnt *clnt)
892/* 892/*
893 * Free an RPC client 893 * Free an RPC client
894 */ 894 */
895static struct rpc_clnt * 895static struct rpc_clnt *
896rpc_free_auth(struct rpc_clnt *clnt) 896rpc_free_auth(struct rpc_clnt *clnt)
897{ 897{
898 if (clnt->cl_auth == NULL) 898 if (clnt->cl_auth == NULL)
@@ -965,10 +965,20 @@ out:
965} 965}
966EXPORT_SYMBOL_GPL(rpc_bind_new_program); 966EXPORT_SYMBOL_GPL(rpc_bind_new_program);
967 967
968void rpc_task_release_transport(struct rpc_task *task)
969{
970 struct rpc_xprt *xprt = task->tk_xprt;
971
972 if (xprt) {
973 task->tk_xprt = NULL;
974 xprt_put(xprt);
975 }
976}
977EXPORT_SYMBOL_GPL(rpc_task_release_transport);
978
968void rpc_task_release_client(struct rpc_task *task) 979void rpc_task_release_client(struct rpc_task *task)
969{ 980{
970 struct rpc_clnt *clnt = task->tk_client; 981 struct rpc_clnt *clnt = task->tk_client;
971 struct rpc_xprt *xprt = task->tk_xprt;
972 982
973 if (clnt != NULL) { 983 if (clnt != NULL) {
974 /* Remove from client task list */ 984 /* Remove from client task list */
@@ -979,12 +989,14 @@ void rpc_task_release_client(struct rpc_task *task)
979 989
980 rpc_release_client(clnt); 990 rpc_release_client(clnt);
981 } 991 }
992 rpc_task_release_transport(task);
993}
982 994
983 if (xprt != NULL) { 995static
984 task->tk_xprt = NULL; 996void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
985 997{
986 xprt_put(xprt); 998 if (!task->tk_xprt)
987 } 999 task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
988} 1000}
989 1001
990static 1002static
@@ -992,8 +1004,7 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
992{ 1004{
993 1005
994 if (clnt != NULL) { 1006 if (clnt != NULL) {
995 if (task->tk_xprt == NULL) 1007 rpc_task_set_transport(task, clnt);
996 task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
997 task->tk_client = clnt; 1008 task->tk_client = clnt;
998 atomic_inc(&clnt->cl_count); 1009 atomic_inc(&clnt->cl_count);
999 if (clnt->cl_softrtry) 1010 if (clnt->cl_softrtry)
@@ -1512,6 +1523,7 @@ call_start(struct rpc_task *task)
1512 clnt->cl_program->version[clnt->cl_vers]->counts[idx]++; 1523 clnt->cl_program->version[clnt->cl_vers]->counts[idx]++;
1513 clnt->cl_stats->rpccnt++; 1524 clnt->cl_stats->rpccnt++;
1514 task->tk_action = call_reserve; 1525 task->tk_action = call_reserve;
1526 rpc_task_set_transport(task, clnt);
1515} 1527}
1516 1528
1517/* 1529/*
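
The clnt.c hunks above split transport handling out of rpc_task_release_client(): rpc_task_release_transport() drops the task's xprt reference exactly once, rpc_task_set_transport() binds a transport only when none is set, and call_start() now (re)binds one per call. A minimal userspace sketch of that get/put discipline, with hypothetical names standing in for the SUNRPC types:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative sketch, not the kernel code: a task holds one
 * reference-counted transport pointer; set/release helpers mirror the
 * rpc_task_set_transport()/rpc_task_release_transport() split above.
 */
struct xprt { int refcount; };
struct task { struct xprt *xprt; };

static struct xprt *xprt_get(struct xprt *x) { x->refcount++; return x; }
static void xprt_put(struct xprt *x)
{
	if (--x->refcount == 0)
		free(x);
}

/* Bind a transport only if the task does not already have one. */
static void task_set_transport(struct task *t, struct xprt *pool)
{
	if (!t->xprt)
		t->xprt = xprt_get(pool);
}

/* Drop the task's transport reference exactly once. */
static void task_release_transport(struct task *t)
{
	struct xprt *x = t->xprt;

	if (x) {
		t->xprt = NULL;
		xprt_put(x);
	}
}

int main(void)
{
	struct xprt *pool = calloc(1, sizeof(*pool));
	struct task t = { NULL };

	pool->refcount = 1;           /* creator's reference */
	task_set_transport(&t, pool);
	task_set_transport(&t, pool); /* second call is a no-op */
	printf("refcount after set: %d\n", pool->refcount);     /* 2 */
	task_release_transport(&t);
	task_release_transport(&t);   /* safe: already released */
	printf("refcount after release: %d\n", pool->refcount); /* 1 */
	xprt_put(pool);
	return 0;
}

Keeping the clear-and-put in one helper is presumably what lets call_start() rebind a transport for each call without leaking references.
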
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index c526f8fb37c9..c7872bc13860 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -213,7 +213,7 @@ static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt,
213 sn->rpcb_local_clnt = clnt; 213 sn->rpcb_local_clnt = clnt;
214 sn->rpcb_local_clnt4 = clnt4; 214 sn->rpcb_local_clnt4 = clnt4;
215 sn->rpcb_is_af_local = is_af_local ? 1 : 0; 215 sn->rpcb_is_af_local = is_af_local ? 1 : 0;
216 smp_wmb(); 216 smp_wmb();
217 sn->rpcb_users = 1; 217 sn->rpcb_users = 1;
218 dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: " 218 dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: "
219 "%p, rpcb_local_clnt4: %p) for net %x%s\n", 219 "%p, rpcb_local_clnt4: %p) for net %x%s\n",
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index f68aa46c9dd7..71166b393732 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -208,13 +208,39 @@ static void _print_name(struct seq_file *seq, unsigned int op,
208 seq_printf(seq, "\t%12u: ", op); 208 seq_printf(seq, "\t%12u: ", op);
209} 209}
210 210
211void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) 211static void _add_rpc_iostats(struct rpc_iostats *a, struct rpc_iostats *b)
212{
213 a->om_ops += b->om_ops;
214 a->om_ntrans += b->om_ntrans;
215 a->om_timeouts += b->om_timeouts;
216 a->om_bytes_sent += b->om_bytes_sent;
217 a->om_bytes_recv += b->om_bytes_recv;
218 a->om_queue = ktime_add(a->om_queue, b->om_queue);
219 a->om_rtt = ktime_add(a->om_rtt, b->om_rtt);
220 a->om_execute = ktime_add(a->om_execute, b->om_execute);
221}
222
223static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats,
224 int op, const struct rpc_procinfo *procs)
225{
226 _print_name(seq, op, procs);
227 seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n",
228 stats->om_ops,
229 stats->om_ntrans,
230 stats->om_timeouts,
231 stats->om_bytes_sent,
232 stats->om_bytes_recv,
233 ktime_to_ms(stats->om_queue),
234 ktime_to_ms(stats->om_rtt),
235 ktime_to_ms(stats->om_execute));
236}
237
238void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt)
212{ 239{
213 struct rpc_iostats *stats = clnt->cl_metrics;
214 struct rpc_xprt *xprt; 240 struct rpc_xprt *xprt;
215 unsigned int op, maxproc = clnt->cl_maxproc; 241 unsigned int op, maxproc = clnt->cl_maxproc;
216 242
217 if (!stats) 243 if (!clnt->cl_metrics)
218 return; 244 return;
219 245
220 seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS); 246 seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS);
@@ -229,20 +255,18 @@ void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt)
229 255
230 seq_printf(seq, "\tper-op statistics\n"); 256 seq_printf(seq, "\tper-op statistics\n");
231 for (op = 0; op < maxproc; op++) { 257 for (op = 0; op < maxproc; op++) {
232 struct rpc_iostats *metrics = &stats[op]; 258 struct rpc_iostats stats = {};
233 _print_name(seq, op, clnt->cl_procinfo); 259 struct rpc_clnt *next = clnt;
234 seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n", 260 do {
235 metrics->om_ops, 261 _add_rpc_iostats(&stats, &next->cl_metrics[op]);
236 metrics->om_ntrans, 262 if (next == next->cl_parent)
237 metrics->om_timeouts, 263 break;
238 metrics->om_bytes_sent, 264 next = next->cl_parent;
239 metrics->om_bytes_recv, 265 } while (next);
240 ktime_to_ms(metrics->om_queue), 266 _print_rpc_iostats(seq, &stats, op, clnt->cl_procinfo);
241 ktime_to_ms(metrics->om_rtt),
242 ktime_to_ms(metrics->om_execute));
243 } 267 }
244} 268}
245EXPORT_SYMBOL_GPL(rpc_print_iostats); 269EXPORT_SYMBOL_GPL(rpc_clnt_show_stats);
246 270
247/* 271/*
248 * Register/unregister RPC proc files 272 * Register/unregister RPC proc files
@@ -310,4 +334,3 @@ void rpc_proc_exit(struct net *net)
310 dprintk("RPC: unregistering /proc/net/rpc\n"); 334 dprintk("RPC: unregistering /proc/net/rpc\n");
311 remove_proc_entry("rpc", net->proc_net); 335 remove_proc_entry("rpc", net->proc_net);
312} 336}
313
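
rpc_clnt_show_stats() above no longer prints a single client's metrics; for each op it sums the counters over the client and all of its parents via _add_rpc_iostats() before printing. A small standalone sketch of that parent-chain accumulation (field names are trimmed and the types are hypothetical, not the SUNRPC structures):

#include <stdio.h>

struct iostats { unsigned long ops, ntrans, bytes_sent; };
struct clnt {
	struct clnt *parent;       /* the root client points to itself */
	struct iostats metrics[4];
};

static void add_iostats(struct iostats *a, const struct iostats *b)
{
	a->ops += b->ops;
	a->ntrans += b->ntrans;
	a->bytes_sent += b->bytes_sent;
}

/* Sum op's metrics over clnt and every ancestor, stopping at the root. */
static struct iostats sum_op(const struct clnt *clnt, int op)
{
	struct iostats total = { 0, 0, 0 };
	const struct clnt *next = clnt;

	do {
		add_iostats(&total, &next->metrics[op]);
		if (next == next->parent)   /* reached the root */
			break;
		next = next->parent;
	} while (next);
	return total;
}

int main(void)
{
	struct clnt root = { 0 }, child = { 0 };

	root.parent = &root;
	child.parent = &root;
	root.metrics[0]  = (struct iostats){ 5, 6, 700 };
	child.metrics[0] = (struct iostats){ 2, 2, 100 };

	struct iostats t = sum_op(&child, 0);
	printf("ops=%lu ntrans=%lu bytes=%lu\n", t.ops, t.ntrans, t.bytes_sent);
	return 0;
}
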
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index 09a0315ea77b..c9bacb3c930f 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -57,4 +57,3 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
57int rpc_clients_notifier_register(void); 57int rpc_clients_notifier_register(void);
58void rpc_clients_notifier_unregister(void); 58void rpc_clients_notifier_unregister(void);
59#endif /* _NET_SUNRPC_SUNRPC_H */ 59#endif /* _NET_SUNRPC_SUNRPC_H */
60
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 30a4226baf03..d13e05f1a990 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1537,16 +1537,16 @@ EXPORT_SYMBOL_GPL(svc_max_payload);
1537/** 1537/**
1538 * svc_fill_write_vector - Construct data argument for VFS write call 1538 * svc_fill_write_vector - Construct data argument for VFS write call
1539 * @rqstp: svc_rqst to operate on 1539 * @rqstp: svc_rqst to operate on
1540 * @pages: list of pages containing data payload
1540 * @first: buffer containing first section of write payload 1541 * @first: buffer containing first section of write payload
1541 * @total: total number of bytes of write payload 1542 * @total: total number of bytes of write payload
1542 * 1543 *
1543 * Returns the number of elements populated in the data argument array. 1544 * Fills in rqstp::rq_vec, and returns the number of elements.
1544 */ 1545 */
1545unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first, 1546unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages,
1546 size_t total) 1547 struct kvec *first, size_t total)
1547{ 1548{
1548 struct kvec *vec = rqstp->rq_vec; 1549 struct kvec *vec = rqstp->rq_vec;
1549 struct page **pages;
1550 unsigned int i; 1550 unsigned int i;
1551 1551
1552 /* Some types of transport can present the write payload 1552 /* Some types of transport can present the write payload
@@ -1560,14 +1560,11 @@ unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first,
1560 ++i; 1560 ++i;
1561 } 1561 }
1562 1562
1563 WARN_ON_ONCE(rqstp->rq_arg.page_base != 0);
1564 pages = rqstp->rq_arg.pages;
1565 while (total) { 1563 while (total) {
1566 vec[i].iov_base = page_address(*pages); 1564 vec[i].iov_base = page_address(*pages);
1567 vec[i].iov_len = min_t(size_t, total, PAGE_SIZE); 1565 vec[i].iov_len = min_t(size_t, total, PAGE_SIZE);
1568 total -= vec[i].iov_len; 1566 total -= vec[i].iov_len;
1569 ++i; 1567 ++i;
1570
1571 ++pages; 1568 ++pages;
1572 } 1569 }
1573 1570
@@ -1580,65 +1577,48 @@ EXPORT_SYMBOL_GPL(svc_fill_write_vector);
1580 * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call 1577 * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call
1581 * @rqstp: svc_rqst to operate on 1578 * @rqstp: svc_rqst to operate on
1582 * @first: buffer containing first section of pathname 1579 * @first: buffer containing first section of pathname
1580 * @p: buffer containing remaining section of pathname
1583 * @total: total length of the pathname argument 1581 * @total: total length of the pathname argument
1584 * 1582 *
1585 * Returns pointer to a NUL-terminated string, or an ERR_PTR. The buffer is 1583 * The VFS symlink API demands a NUL-terminated pathname in mapped memory.
1586 * released automatically when @rqstp is recycled. 1584 * Returns pointer to a NUL-terminated string, or an ERR_PTR. Caller must free
1585 * the returned string.
1587 */ 1586 */
1588char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, 1587char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first,
1589 size_t total) 1588 void *p, size_t total)
1590{ 1589{
1591 struct xdr_buf *arg = &rqstp->rq_arg; 1590 size_t len, remaining;
1592 struct page **pages; 1591 char *result, *dst;
1593 char *result;
1594
1595 /* VFS API demands a NUL-terminated pathname. This function
1596 * uses a page from @rqstp as the pathname buffer, to enable
1597 * direct placement. Thus the total buffer size is PAGE_SIZE.
1598 * Space in this buffer for NUL-termination requires that we
1599 * cap the size of the returned symlink pathname just a
1600 * little early.
1601 */
1602 if (total > PAGE_SIZE - 1)
1603 return ERR_PTR(-ENAMETOOLONG);
1604 1592
1605 /* Some types of transport can present the pathname entirely 1593 result = kmalloc(total + 1, GFP_KERNEL);
1606 * in rq_arg.pages. If not, then copy the pathname into one 1594 if (!result)
1607 * page. 1595 return ERR_PTR(-ESERVERFAULT);
1608 */
1609 pages = arg->pages;
1610 WARN_ON_ONCE(arg->page_base != 0);
1611 if (first->iov_base == 0) {
1612 result = page_address(*pages);
1613 result[total] = '\0';
1614 } else {
1615 size_t len, remaining;
1616 char *dst;
1617 1596
1618 result = page_address(*(rqstp->rq_next_page++)); 1597 dst = result;
1619 dst = result; 1598 remaining = total;
1620 remaining = total;
1621 1599
1622 len = min_t(size_t, total, first->iov_len); 1600 len = min_t(size_t, total, first->iov_len);
1601 if (len) {
1623 memcpy(dst, first->iov_base, len); 1602 memcpy(dst, first->iov_base, len);
1624 dst += len; 1603 dst += len;
1625 remaining -= len; 1604 remaining -= len;
1605 }
1626 1606
1627 /* No more than one page left */ 1607 if (remaining) {
1628 if (remaining) { 1608 len = min_t(size_t, remaining, PAGE_SIZE);
1629 len = min_t(size_t, remaining, PAGE_SIZE); 1609 memcpy(dst, p, len);
1630 memcpy(dst, page_address(*pages), len); 1610 dst += len;
1631 dst += len;
1632 }
1633
1634 *dst = '\0';
1635 } 1611 }
1636 1612
1637 /* Sanity check: we don't allow the pathname argument to 1613 *dst = '\0';
1614
1615 /* Sanity check: Linux doesn't allow the pathname argument to
1638 * contain a NUL byte. 1616 * contain a NUL byte.
1639 */ 1617 */
1640 if (strlen(result) != total) 1618 if (strlen(result) != total) {
1619 kfree(result);
1641 return ERR_PTR(-EINVAL); 1620 return ERR_PTR(-EINVAL);
1621 }
1642 return result; 1622 return result;
1643} 1623}
1644EXPORT_SYMBOL_GPL(svc_fill_symlink_pathname); 1624EXPORT_SYMBOL_GPL(svc_fill_symlink_pathname);
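
The rewritten svc_fill_symlink_pathname() above no longer borrows a page from the request: it allocates a buffer, copies the head kvec and the remaining page data into it, NUL-terminates, and rejects pathnames with embedded NULs by comparing strlen() against the advertised length. A simplified userspace sketch of that assembly (it omits the PAGE_SIZE cap on the second copy and uses hypothetical names):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *fill_symlink_pathname(const char *first, size_t first_len,
				   const char *rest, size_t total)
{
	size_t len, remaining = total;
	char *result, *dst;

	result = malloc(total + 1);
	if (!result)
		return NULL;

	dst = result;
	len = first_len < total ? first_len : total;
	if (len) {                       /* head fragment */
		memcpy(dst, first, len);
		dst += len;
		remaining -= len;
	}
	if (remaining) {                 /* remainder from the page data */
		memcpy(dst, rest, remaining);
		dst += remaining;
	}
	*dst = '\0';

	if (strlen(result) != total) {   /* embedded NUL: invalid pathname */
		free(result);
		return NULL;
	}
	return result;
}

int main(void)
{
	char *p = fill_symlink_pathname("/tmp/", 5, "target", 11);

	printf("%s\n", p ? p : "(invalid)");
	free(p);
	return 0;
}
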
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 3fabf9f6a0f9..a8db2e3f8904 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -880,7 +880,7 @@ static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req)
880__must_hold(&req->rq_xprt->recv_lock) 880__must_hold(&req->rq_xprt->recv_lock)
881{ 881{
882 struct rpc_task *task = req->rq_task; 882 struct rpc_task *task = req->rq_task;
883 883
884 if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { 884 if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) {
885 spin_unlock(&req->rq_xprt->recv_lock); 885 spin_unlock(&req->rq_xprt->recv_lock);
886 set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); 886 set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate);
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 17fb1e025654..0f7c465d9a5a 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -279,9 +279,7 @@ out_maperr:
279static int 279static int
280fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) 280fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
281{ 281{
282 struct ib_send_wr *bad_wr; 282 return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, NULL);
283
284 return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, &bad_wr);
285} 283}
286 284
287/* Invalidate all memory regions that were registered for "req". 285/* Invalidate all memory regions that were registered for "req".
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c040de196e13..1bb00dd6ccdb 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -464,7 +464,7 @@ out_mapmr_err:
464static int 464static int
465frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) 465frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
466{ 466{
467 struct ib_send_wr *post_wr, *bad_wr; 467 struct ib_send_wr *post_wr;
468 struct rpcrdma_mr *mr; 468 struct rpcrdma_mr *mr;
469 469
470 post_wr = &req->rl_sendctx->sc_wr; 470 post_wr = &req->rl_sendctx->sc_wr;
@@ -486,7 +486,7 @@ frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
486 /* If ib_post_send fails, the next ->send_request for 486 /* If ib_post_send fails, the next ->send_request for
487 * @req will queue these MWs for recovery. 487 * @req will queue these MWs for recovery.
488 */ 488 */
489 return ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); 489 return ib_post_send(ia->ri_id->qp, post_wr, NULL);
490} 490}
491 491
492/* Handle a remotely invalidated mr on the @mrs list 492/* Handle a remotely invalidated mr on the @mrs list
@@ -517,7 +517,8 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
517static void 517static void
518frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) 518frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
519{ 519{
520 struct ib_send_wr *first, **prev, *last, *bad_wr; 520 struct ib_send_wr *first, **prev, *last;
521 const struct ib_send_wr *bad_wr;
521 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 522 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
522 struct rpcrdma_frwr *frwr; 523 struct rpcrdma_frwr *frwr;
523 struct rpcrdma_mr *mr; 524 struct rpcrdma_mr *mr;
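
The fmr_ops.c and frwr_ops.c hunks reflect the 4.19 RDMA core change in which the bad_wr argument of ib_post_send()/ib_post_recv() became a const out-parameter that callers may pass as NULL when they never inspect it. A generic sketch of that optional out-parameter pattern (post_send() here is a hypothetical stand-in, not the verbs API):

#include <stdio.h>

struct wr { int id; struct wr *next; };

static int post_send(struct wr *first, const struct wr **bad)
{
	for (struct wr *w = first; w; w = w->next) {
		if (w->id < 0) {          /* pretend this one fails */
			if (bad)
				*bad = w; /* report it only if asked */
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	struct wr w2 = { -1, NULL }, w1 = { 1, &w2 };
	const struct wr *bad = NULL;

	printf("don't care: %d\n", post_send(&w1, NULL));
	printf("do care:    %d", post_send(&w1, &bad));
	printf(" (bad id %d)\n", bad->id);
	return 0;
}
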
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 357ba90c382d..134bef6a451e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -94,7 +94,6 @@ static int read_reset_stat(struct ctl_table *table, int write,
94 atomic_set(stat, 0); 94 atomic_set(stat, 0);
95 else { 95 else {
96 char str_buf[32]; 96 char str_buf[32];
97 char *data;
98 int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); 97 int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
99 if (len >= 32) 98 if (len >= 32)
100 return -EFAULT; 99 return -EFAULT;
@@ -103,7 +102,6 @@ static int read_reset_stat(struct ctl_table *table, int write,
103 *lenp = 0; 102 *lenp = 0;
104 return 0; 103 return 0;
105 } 104 }
106 data = &str_buf[*ppos];
107 len -= *ppos; 105 len -= *ppos;
108 if (len > *lenp) 106 if (len > *lenp)
109 len = *lenp; 107 len = *lenp;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 841fca143804..b24d5b8f2fee 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -229,11 +229,10 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
229static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, 229static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma,
230 struct svc_rdma_recv_ctxt *ctxt) 230 struct svc_rdma_recv_ctxt *ctxt)
231{ 231{
232 struct ib_recv_wr *bad_recv_wr;
233 int ret; 232 int ret;
234 233
235 svc_xprt_get(&rdma->sc_xprt); 234 svc_xprt_get(&rdma->sc_xprt);
236 ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, &bad_recv_wr); 235 ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL);
237 trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret); 236 trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret);
238 if (ret) 237 if (ret)
239 goto err_post; 238 goto err_post;
@@ -366,9 +365,6 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
366 arg->page_base = 0; 365 arg->page_base = 0;
367 arg->buflen = ctxt->rc_byte_len; 366 arg->buflen = ctxt->rc_byte_len;
368 arg->len = ctxt->rc_byte_len; 367 arg->len = ctxt->rc_byte_len;
369
370 rqstp->rq_respages = &rqstp->rq_pages[0];
371 rqstp->rq_next_page = rqstp->rq_respages + 1;
372} 368}
373 369
374/* This accommodates the largest possible Write chunk, 370/* This accommodates the largest possible Write chunk,
@@ -730,6 +726,12 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
730 726
731 svc_rdma_build_arg_xdr(rqstp, ctxt); 727 svc_rdma_build_arg_xdr(rqstp, ctxt);
732 728
729 /* Prevent svc_xprt_release from releasing pages in rq_pages
730 * if we return 0 or an error.
731 */
732 rqstp->rq_respages = rqstp->rq_pages;
733 rqstp->rq_next_page = rqstp->rq_respages;
734
733 p = (__be32 *)rqstp->rq_arg.head[0].iov_base; 735 p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
734 ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); 736 ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
735 if (ret < 0) 737 if (ret < 0)
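
The recvfrom hunk above points rq_respages and rq_next_page back at the start of rq_pages before parsing, so that an early return of 0 or an error leaves svc_xprt_release() with an empty range of pages to free. A toy sketch of that half-open-range ownership convention (the names are hypothetical, not the svc_rqst fields):

#include <stdio.h>
#include <stdlib.h>

#define NPAGES 4

/* Free the page pointers in the half-open range [respages, next_page). */
static void release_pages(void **respages, void **next_page)
{
	for (void **p = respages; p < next_page; p++) {
		free(*p);
		*p = NULL;
	}
}

int main(void)
{
	void *pages[NPAGES];
	void **respages, **next_page;

	for (int i = 0; i < NPAGES; i++)
		pages[i] = malloc(64);

	/* Early-return path: claim no pages for the reply. */
	respages = pages;
	next_page = respages;
	release_pages(respages, next_page);    /* frees nothing */

	/* Normal path: the last two pages belong to the reply. */
	respages = &pages[2];
	next_page = &pages[NPAGES];
	release_pages(respages, next_page);    /* frees pages[2..3] */

	free(pages[0]);
	free(pages[1]);
	return 0;
}
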
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index ce3ea8419704..dc1951759a8e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -307,7 +307,8 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
307{ 307{
308 struct svcxprt_rdma *rdma = cc->cc_rdma; 308 struct svcxprt_rdma *rdma = cc->cc_rdma;
309 struct svc_xprt *xprt = &rdma->sc_xprt; 309 struct svc_xprt *xprt = &rdma->sc_xprt;
310 struct ib_send_wr *first_wr, *bad_wr; 310 struct ib_send_wr *first_wr;
311 const struct ib_send_wr *bad_wr;
311 struct list_head *tmp; 312 struct list_head *tmp;
312 struct ib_cqe *cqe; 313 struct ib_cqe *cqe;
313 int ret; 314 int ret;
@@ -679,6 +680,7 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
679 struct svc_rdma_read_info *info, 680 struct svc_rdma_read_info *info,
680 __be32 *p) 681 __be32 *p)
681{ 682{
683 unsigned int i;
682 int ret; 684 int ret;
683 685
684 ret = -EINVAL; 686 ret = -EINVAL;
@@ -701,6 +703,12 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
701 info->ri_chunklen += rs_length; 703 info->ri_chunklen += rs_length;
702 } 704 }
703 705
706 /* Pages under I/O have been copied to head->rc_pages.
 707 * Prevent their premature release by svc_xprt_release().
708 */
709 for (i = 0; i < info->ri_readctxt->rc_page_count; i++)
710 rqstp->rq_pages[i] = NULL;
711
704 return ret; 712 return ret;
705} 713}
706 714
@@ -816,7 +824,6 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
816 struct svc_rdma_recv_ctxt *head, __be32 *p) 824 struct svc_rdma_recv_ctxt *head, __be32 *p)
817{ 825{
818 struct svc_rdma_read_info *info; 826 struct svc_rdma_read_info *info;
819 struct page **page;
820 int ret; 827 int ret;
821 828
822 /* The request (with page list) is constructed in 829 /* The request (with page list) is constructed in
@@ -843,27 +850,15 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
843 ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); 850 ret = svc_rdma_build_normal_read_chunk(rqstp, info, p);
844 else 851 else
845 ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); 852 ret = svc_rdma_build_pz_read_chunk(rqstp, info, p);
846
847 /* Mark the start of the pages that can be used for the reply */
848 if (info->ri_pageoff > 0)
849 info->ri_pageno++;
850 rqstp->rq_respages = &rqstp->rq_pages[info->ri_pageno];
851 rqstp->rq_next_page = rqstp->rq_respages + 1;
852
853 if (ret < 0) 853 if (ret < 0)
854 goto out; 854 goto out_err;
855 855
856 ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); 856 ret = svc_rdma_post_chunk_ctxt(&info->ri_cc);
857
858out:
859 /* Read sink pages have been moved from rqstp->rq_pages to
860 * head->rc_arg.pages. Force svc_recv to refill those slots
861 * in rq_pages.
862 */
863 for (page = rqstp->rq_pages; page < rqstp->rq_respages; page++)
864 *page = NULL;
865
866 if (ret < 0) 857 if (ret < 0)
867 svc_rdma_read_info_free(info); 858 goto out_err;
859 return 0;
860
861out_err:
862 svc_rdma_read_info_free(info);
868 return ret; 863 return ret;
869} 864}
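
The read-chunk hunk above replaces the old clear-the-sink-pages-on-every-exit logic with a single out_err label that frees the read-info context whenever either the chunk build or the post fails. The same single-cleanup-path idiom in a standalone sketch (read_chunk()/post_chunk() are hypothetical stand-ins):

#include <stdio.h>
#include <stdlib.h>

struct read_info { int dummy; };

static int read_chunk(struct read_info *info)  { (void)info; return 0; }
static int post_chunk(struct read_info *info)  { (void)info; return -1; }

static int recv_read_chunk(void)
{
	struct read_info *info;
	int ret;

	info = malloc(sizeof(*info));
	if (!info)
		return -1;

	ret = read_chunk(info);
	if (ret < 0)
		goto out_err;

	ret = post_chunk(info);
	if (ret < 0)
		goto out_err;
	return 0;

out_err:
	free(info);      /* one cleanup path for every failure */
	return ret;
}

int main(void)
{
	printf("recv_read_chunk() = %d\n", recv_read_chunk());
	return 0;
}
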
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 4a3efaea277c..8602a5f1b515 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -291,7 +291,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
291 */ 291 */
292int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) 292int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
293{ 293{
294 struct ib_send_wr *bad_wr;
295 int ret; 294 int ret;
296 295
297 might_sleep(); 296 might_sleep();
@@ -311,7 +310,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
311 } 310 }
312 311
313 svc_xprt_get(&rdma->sc_xprt); 312 svc_xprt_get(&rdma->sc_xprt);
314 ret = ib_post_send(rdma->sc_qp, wr, &bad_wr); 313 ret = ib_post_send(rdma->sc_qp, wr, NULL);
315 trace_svcrdma_post_send(wr, ret); 314 trace_svcrdma_post_send(wr, ret);
316 if (ret) { 315 if (ret) {
317 set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); 316 set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
@@ -657,7 +656,9 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
657 ctxt->sc_pages[i] = rqstp->rq_respages[i]; 656 ctxt->sc_pages[i] = rqstp->rq_respages[i];
658 rqstp->rq_respages[i] = NULL; 657 rqstp->rq_respages[i] = NULL;
659 } 658 }
660 rqstp->rq_next_page = rqstp->rq_respages + 1; 659
660 /* Prevent svc_xprt_release from releasing pages in rq_pages */
661 rqstp->rq_next_page = rqstp->rq_respages;
661} 662}
662 663
663/* Prepare the portion of the RPC Reply that will be transmitted 664/* Prepare the portion of the RPC Reply that will be transmitted
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index e9535a66bab0..2848cafd4a17 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -296,7 +296,6 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
296 struct rdma_cm_event *event) 296 struct rdma_cm_event *event)
297{ 297{
298 struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr; 298 struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr;
299 int ret = 0;
300 299
301 trace_svcrdma_cm_event(event, sap); 300 trace_svcrdma_cm_event(event, sap);
302 301
@@ -315,7 +314,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
315 break; 314 break;
316 } 315 }
317 316
318 return ret; 317 return 0;
319} 318}
320 319
321static int rdma_cma_handler(struct rdma_cm_id *cma_id, 320static int rdma_cma_handler(struct rdma_cm_id *cma_id,
@@ -476,7 +475,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
476 475
477 /* Qualify the transport resource defaults with the 476 /* Qualify the transport resource defaults with the
478 * capabilities of this particular device */ 477 * capabilities of this particular device */
479 newxprt->sc_max_send_sges = dev->attrs.max_sge; 478 newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
480 /* transport hdr, head iovec, one page list entry, tail iovec */ 479 /* transport hdr, head iovec, one page list entry, tail iovec */
481 if (newxprt->sc_max_send_sges < 4) { 480 if (newxprt->sc_max_send_sges < 4) {
482 pr_err("svcrdma: too few Send SGEs available (%d)\n", 481 pr_err("svcrdma: too few Send SGEs available (%d)\n",
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 16161a36dc73..956a5ea47b58 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -280,7 +280,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
280 ++xprt->rx_xprt.connect_cookie; 280 ++xprt->rx_xprt.connect_cookie;
281 connstate = -ECONNABORTED; 281 connstate = -ECONNABORTED;
282connected: 282connected:
283 xprt->rx_buf.rb_credits = 1;
284 ep->rep_connected = connstate; 283 ep->rep_connected = connstate;
285 rpcrdma_conn_func(ep); 284 rpcrdma_conn_func(ep);
286 wake_up_all(&ep->rep_connect_wait); 285 wake_up_all(&ep->rep_connect_wait);
@@ -508,7 +507,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
508 unsigned int max_sge; 507 unsigned int max_sge;
509 int rc; 508 int rc;
510 509
511 max_sge = min_t(unsigned int, ia->ri_device->attrs.max_sge, 510 max_sge = min_t(unsigned int, ia->ri_device->attrs.max_send_sge,
512 RPCRDMA_MAX_SEND_SGES); 511 RPCRDMA_MAX_SEND_SGES);
513 if (max_sge < RPCRDMA_MIN_SEND_SGES) { 512 if (max_sge < RPCRDMA_MIN_SEND_SGES) {
514 pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge); 513 pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
@@ -755,6 +754,7 @@ retry:
755 } 754 }
756 755
757 ep->rep_connected = 0; 756 ep->rep_connected = 0;
757 rpcrdma_post_recvs(r_xprt, true);
758 758
759 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); 759 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
760 if (rc) { 760 if (rc) {
@@ -773,8 +773,6 @@ retry:
773 773
774 dprintk("RPC: %s: connected\n", __func__); 774 dprintk("RPC: %s: connected\n", __func__);
775 775
776 rpcrdma_post_recvs(r_xprt, true);
777
778out: 776out:
779 if (rc) 777 if (rc)
780 ep->rep_connected = rc; 778 ep->rep_connected = rc;
@@ -1171,6 +1169,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1171 list_add(&req->rl_list, &buf->rb_send_bufs); 1169 list_add(&req->rl_list, &buf->rb_send_bufs);
1172 } 1170 }
1173 1171
1172 buf->rb_credits = 1;
1174 buf->rb_posted_receives = 0; 1173 buf->rb_posted_receives = 0;
1175 INIT_LIST_HEAD(&buf->rb_recv_bufs); 1174 INIT_LIST_HEAD(&buf->rb_recv_bufs);
1176 1175
@@ -1559,7 +1558,8 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
1559 if (!count) 1558 if (!count)
1560 return; 1559 return;
1561 1560
1562 rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, &bad_wr); 1561 rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
1562 (const struct ib_recv_wr **)&bad_wr);
1563 if (rc) { 1563 if (rc) {
1564 for (wr = bad_wr; wr; wr = wr->next) { 1564 for (wr = bad_wr; wr; wr = wr->next) {
1565 struct rpcrdma_rep *rep; 1565 struct rpcrdma_rep *rep;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 9e1c5024aba9..6b7539c0466e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -3375,4 +3375,3 @@ module_param_named(tcp_max_slot_table_entries, xprt_max_tcp_slot_table_entries,
3375 max_slot_table_size, 0644); 3375 max_slot_table_size, 0644);
3376module_param_named(udp_slot_table_entries, xprt_udp_slot_table_entries, 3376module_param_named(udp_slot_table_entries, xprt_udp_slot_table_entries,
3377 slot_table_size, 0644); 3377 slot_table_size, 0644);
3378
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index f3711176be45..9ee6cfea56dd 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -512,7 +512,7 @@ int tipc_bcast_init(struct net *net)
512 struct tipc_bc_base *bb = NULL; 512 struct tipc_bc_base *bb = NULL;
513 struct tipc_link *l = NULL; 513 struct tipc_link *l = NULL;
514 514
515 bb = kzalloc(sizeof(*bb), GFP_ATOMIC); 515 bb = kzalloc(sizeof(*bb), GFP_KERNEL);
516 if (!bb) 516 if (!bb)
517 goto enomem; 517 goto enomem;
518 tn->bcbase = bb; 518 tn->bcbase = bb;
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 2dfb492a7c94..418f03d0be90 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -395,6 +395,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
395 tipc_net_init(net, node_id, 0); 395 tipc_net_init(net, node_id, 0);
396 } 396 }
397 if (!tipc_own_id(net)) { 397 if (!tipc_own_id(net)) {
398 dev_put(dev);
398 pr_warn("Failed to obtain node identity\n"); 399 pr_warn("Failed to obtain node identity\n");
399 return -EINVAL; 400 return -EINVAL;
400 } 401 }
@@ -610,6 +611,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
610 case NETDEV_CHANGE: 611 case NETDEV_CHANGE:
611 if (netif_carrier_ok(dev)) 612 if (netif_carrier_ok(dev))
612 break; 613 break;
614 /* else: fall through */
613 case NETDEV_UP: 615 case NETDEV_UP:
614 test_and_set_bit_lock(0, &b->up); 616 test_and_set_bit_lock(0, &b->up);
615 break; 617 break;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 9f666e0650e2..2830709957bd 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -133,6 +133,8 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr,
133} 133}
134 134
135/* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer 135/* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer
 136 * Returns true if the message should be dropped by the caller, i.e., if it is a
 137 * trial message or we are still inside the trial period. Otherwise false.
136 */ 138 */
137static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, 139static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d,
138 struct tipc_media_addr *maddr, 140 struct tipc_media_addr *maddr,
@@ -168,8 +170,9 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d,
168 msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); 170 msg_set_type(buf_msg(d->skb), DSC_REQ_MSG);
169 } 171 }
170 172
173 /* Accept regular link requests/responses only after trial period */
171 if (mtyp != DSC_TRIAL_MSG) 174 if (mtyp != DSC_TRIAL_MSG)
172 return false; 175 return trial;
173 176
174 sugg_addr = tipc_node_try_addr(net, peer_id, src); 177 sugg_addr = tipc_node_try_addr(net, peer_id, src);
175 if (sugg_addr) 178 if (sugg_addr)
@@ -284,7 +287,6 @@ static void tipc_disc_timeout(struct timer_list *t)
284{ 287{
285 struct tipc_discoverer *d = from_timer(d, t, timer); 288 struct tipc_discoverer *d = from_timer(d, t, timer);
286 struct tipc_net *tn = tipc_net(d->net); 289 struct tipc_net *tn = tipc_net(d->net);
287 u32 self = tipc_own_addr(d->net);
288 struct tipc_media_addr maddr; 290 struct tipc_media_addr maddr;
289 struct sk_buff *skb = NULL; 291 struct sk_buff *skb = NULL;
290 struct net *net = d->net; 292 struct net *net = d->net;
@@ -298,12 +300,14 @@ static void tipc_disc_timeout(struct timer_list *t)
298 goto exit; 300 goto exit;
299 } 301 }
300 302
301 /* Did we just leave the address trial period ? */ 303 /* Trial period over ? */
302 if (!self && !time_before(jiffies, tn->addr_trial_end)) { 304 if (!time_before(jiffies, tn->addr_trial_end)) {
303 self = tn->trial_addr; 305 /* Did we just leave it ? */
304 tipc_net_finalize(net, self); 306 if (!tipc_own_addr(net))
305 msg_set_prevnode(buf_msg(d->skb), self); 307 tipc_net_finalize(net, tn->trial_addr);
308
306 msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); 309 msg_set_type(buf_msg(d->skb), DSC_REQ_MSG);
310 msg_set_prevnode(buf_msg(d->skb), tipc_own_addr(net));
307 } 311 }
308 312
309 /* Adjust timeout interval according to discovery phase */ 313 /* Adjust timeout interval according to discovery phase */
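
Two behavioural points in the discover.c hunks: regular discovery messages are now dropped while the node is still inside its address trial period, and the timeout handler finalizes the node address only once the trial window has elapsed and no address has been taken yet. A compressed sketch of that logic (plain integers stand in for jiffies and TIPC addresses; the names are made up):

#include <stdbool.h>
#include <stdio.h>

struct disc_state {
	unsigned long trial_end;   /* end of the address trial window */
	unsigned int  own_addr;    /* 0 until finalized */
	unsigned int  trial_addr;
};

/* Returns true if a message must be dropped by the caller. */
static bool handle_msg(struct disc_state *d, bool is_trial_msg,
		       unsigned long now)
{
	bool in_trial = now < d->trial_end;

	if (!is_trial_msg)
		return in_trial;      /* drop regular msgs during the trial */
	/* trial messages would get address-conflict handling here */
	return true;                  /* trial msgs are always consumed */
}

static void timeout(struct disc_state *d, unsigned long now)
{
	if (now >= d->trial_end && !d->own_addr)
		d->own_addr = d->trial_addr;   /* finalize exactly once */
}

int main(void)
{
	struct disc_state d = { .trial_end = 100, .own_addr = 0,
				.trial_addr = 0x1001 };

	printf("drop at t=50: %d\n", handle_msg(&d, false, 50));   /* 1 */
	timeout(&d, 150);
	printf("addr at t=150: 0x%x\n", d.own_addr);               /* 0x1001 */
	printf("drop at t=150: %d\n", handle_msg(&d, false, 150)); /* 0 */
	return 0;
}
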
diff --git a/net/tipc/group.c b/net/tipc/group.c
index d7a7befeddd4..e82f13cb2dc5 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -159,11 +159,6 @@ u32 tipc_group_exclude(struct tipc_group *grp)
159 return 0; 159 return 0;
160} 160}
161 161
162int tipc_group_size(struct tipc_group *grp)
163{
164 return grp->member_cnt;
165}
166
167struct tipc_group *tipc_group_create(struct net *net, u32 portid, 162struct tipc_group *tipc_group_create(struct net *net, u32 portid,
168 struct tipc_group_req *mreq, 163 struct tipc_group_req *mreq,
169 bool *group_is_open) 164 bool *group_is_open)
@@ -232,8 +227,8 @@ void tipc_group_delete(struct net *net, struct tipc_group *grp)
232 kfree(grp); 227 kfree(grp);
233} 228}
234 229
235struct tipc_member *tipc_group_find_member(struct tipc_group *grp, 230static struct tipc_member *tipc_group_find_member(struct tipc_group *grp,
236 u32 node, u32 port) 231 u32 node, u32 port)
237{ 232{
238 struct rb_node *n = grp->members.rb_node; 233 struct rb_node *n = grp->members.rb_node;
239 u64 nkey, key = (u64)node << 32 | port; 234 u64 nkey, key = (u64)node << 32 | port;
@@ -918,3 +913,35 @@ void tipc_group_member_evt(struct tipc_group *grp,
918 } 913 }
919 *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); 914 *sk_rcvbuf = tipc_group_rcvbuf_limit(grp);
920} 915}
916
917int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb)
918{
919 struct nlattr *group = nla_nest_start(skb, TIPC_NLA_SOCK_GROUP);
920
921 if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID,
922 grp->type) ||
923 nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE,
924 grp->instance) ||
925 nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT,
926 grp->bc_snd_nxt))
927 goto group_msg_cancel;
928
929 if (grp->scope == TIPC_NODE_SCOPE)
930 if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE))
931 goto group_msg_cancel;
932
933 if (grp->scope == TIPC_CLUSTER_SCOPE)
934 if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE))
935 goto group_msg_cancel;
936
937 if (*grp->open)
938 if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN))
939 goto group_msg_cancel;
940
941 nla_nest_end(skb, group);
942 return 0;
943
944group_msg_cancel:
945 nla_nest_cancel(skb, group);
946 return -1;
947}
diff --git a/net/tipc/group.h b/net/tipc/group.h
index 5996af6e9f1d..76b4e5a7b39d 100644
--- a/net/tipc/group.h
+++ b/net/tipc/group.h
@@ -72,4 +72,5 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
72 u32 port, struct sk_buff_head *xmitq); 72 u32 port, struct sk_buff_head *xmitq);
73u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); 73u16 tipc_group_bc_snd_nxt(struct tipc_group *grp);
74void tipc_group_update_member(struct tipc_member *m, int len); 74void tipc_group_update_member(struct tipc_member *m, int len);
75int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb);
75#endif 76#endif
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 695acb783969..b1f0bee54eac 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -106,7 +106,8 @@ struct tipc_stats {
106 * @backlogq: queue for messages waiting to be sent 106 * @backlogq: queue for messages waiting to be sent
107 * @snt_nxt: next sequence number to use for outbound messages 107 * @snt_nxt: next sequence number to use for outbound messages
108 * @last_retransmitted: sequence number of most recently retransmitted message 108 * @last_retransmitted: sequence number of most recently retransmitted message
109 * @stale_count: # of identical retransmit requests made by peer 109 * @stale_cnt: counter for number of identical retransmit attempts
110 * @stale_limit: time when repeated identical retransmits must force link reset
110 * @ackers: # of peers that needs to ack each packet before it can be released 111 * @ackers: # of peers that needs to ack each packet before it can be released
111 * @acked: # last packet acked by a certain peer. Used for broadcast. 112 * @acked: # last packet acked by a certain peer. Used for broadcast.
112 * @rcv_nxt: next sequence number to expect for inbound messages 113 * @rcv_nxt: next sequence number to expect for inbound messages
@@ -127,14 +128,17 @@ struct tipc_link {
127 struct net *net; 128 struct net *net;
128 129
129 /* Management and link supervision data */ 130 /* Management and link supervision data */
130 u32 peer_session; 131 u16 peer_session;
131 u32 session; 132 u16 session;
133 u16 snd_nxt_state;
134 u16 rcv_nxt_state;
132 u32 peer_bearer_id; 135 u32 peer_bearer_id;
133 u32 bearer_id; 136 u32 bearer_id;
134 u32 tolerance; 137 u32 tolerance;
135 u32 abort_limit; 138 u32 abort_limit;
136 u32 state; 139 u32 state;
137 u16 peer_caps; 140 u16 peer_caps;
141 bool in_session;
138 bool active; 142 bool active;
139 u32 silent_intv_cnt; 143 u32 silent_intv_cnt;
140 char if_name[TIPC_MAX_IF_NAME]; 144 char if_name[TIPC_MAX_IF_NAME];
@@ -161,7 +165,8 @@ struct tipc_link {
161 u16 snd_nxt; 165 u16 snd_nxt;
162 u16 last_retransm; 166 u16 last_retransm;
163 u16 window; 167 u16 window;
164 u32 stale_count; 168 u16 stale_cnt;
169 unsigned long stale_limit;
165 170
166 /* Reception */ 171 /* Reception */
167 u16 rcv_nxt; 172 u16 rcv_nxt;
@@ -212,11 +217,6 @@ enum {
212 */ 217 */
213#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2) 218#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2)
214 219
215/* Wildcard value for link session numbers. When it is known that
216 * peer endpoint is down, any session number must be accepted.
217 */
218#define ANY_SESSION 0x10000
219
220/* Link FSM states: 220/* Link FSM states:
221 */ 221 */
222enum { 222enum {
@@ -297,11 +297,6 @@ static bool link_is_bc_rcvlink(struct tipc_link *l)
297 return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l)); 297 return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l));
298} 298}
299 299
300int tipc_link_is_active(struct tipc_link *l)
301{
302 return l->active;
303}
304
305void tipc_link_set_active(struct tipc_link *l, bool active) 300void tipc_link_set_active(struct tipc_link *l, bool active)
306{ 301{
307 l->active = active; 302 l->active = active;
@@ -337,6 +332,11 @@ char tipc_link_plane(struct tipc_link *l)
337 return l->net_plane; 332 return l->net_plane;
338} 333}
339 334
335void tipc_link_update_caps(struct tipc_link *l, u16 capabilities)
336{
337 l->peer_caps = capabilities;
338}
339
340void tipc_link_add_bc_peer(struct tipc_link *snd_l, 340void tipc_link_add_bc_peer(struct tipc_link *snd_l,
341 struct tipc_link *uc_l, 341 struct tipc_link *uc_l,
342 struct sk_buff_head *xmitq) 342 struct sk_buff_head *xmitq)
@@ -373,7 +373,7 @@ int tipc_link_bc_peers(struct tipc_link *l)
373 return l->ackers; 373 return l->ackers;
374} 374}
375 375
376u16 link_bc_rcv_gap(struct tipc_link *l) 376static u16 link_bc_rcv_gap(struct tipc_link *l)
377{ 377{
378 struct sk_buff *skb = skb_peek(&l->deferdq); 378 struct sk_buff *skb = skb_peek(&l->deferdq);
379 u16 gap = 0; 379 u16 gap = 0;
@@ -469,7 +469,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
469 l->addr = peer; 469 l->addr = peer;
470 l->peer_caps = peer_caps; 470 l->peer_caps = peer_caps;
471 l->net = net; 471 l->net = net;
472 l->peer_session = ANY_SESSION; 472 l->in_session = false;
473 l->bearer_id = bearer_id; 473 l->bearer_id = bearer_id;
474 l->tolerance = tolerance; 474 l->tolerance = tolerance;
475 l->net_plane = net_plane; 475 l->net_plane = net_plane;
@@ -820,7 +820,7 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
820 * Wake up a number of waiting users, as permitted by available space 820 * Wake up a number of waiting users, as permitted by available space
821 * in the send queue 821 * in the send queue
822 */ 822 */
823void link_prepare_wakeup(struct tipc_link *l) 823static void link_prepare_wakeup(struct tipc_link *l)
824{ 824{
825 struct sk_buff *skb, *tmp; 825 struct sk_buff *skb, *tmp;
826 int imp, i = 0; 826 int imp, i = 0;
@@ -838,7 +838,7 @@ void link_prepare_wakeup(struct tipc_link *l)
838 838
839void tipc_link_reset(struct tipc_link *l) 839void tipc_link_reset(struct tipc_link *l)
840{ 840{
841 l->peer_session = ANY_SESSION; 841 l->in_session = false;
842 l->session++; 842 l->session++;
843 l->mtu = l->advertised_mtu; 843 l->mtu = l->advertised_mtu;
844 __skb_queue_purge(&l->transmq); 844 __skb_queue_purge(&l->transmq);
@@ -857,10 +857,12 @@ void tipc_link_reset(struct tipc_link *l)
857 l->rcv_unacked = 0; 857 l->rcv_unacked = 0;
858 l->snd_nxt = 1; 858 l->snd_nxt = 1;
859 l->rcv_nxt = 1; 859 l->rcv_nxt = 1;
860 l->snd_nxt_state = 1;
861 l->rcv_nxt_state = 1;
860 l->acked = 0; 862 l->acked = 0;
861 l->silent_intv_cnt = 0; 863 l->silent_intv_cnt = 0;
862 l->rst_cnt = 0; 864 l->rst_cnt = 0;
863 l->stale_count = 0; 865 l->stale_cnt = 0;
864 l->bc_peer_is_up = false; 866 l->bc_peer_is_up = false;
865 memset(&l->mon_state, 0, sizeof(l->mon_state)); 867 memset(&l->mon_state, 0, sizeof(l->mon_state));
866 tipc_link_reset_stats(l); 868 tipc_link_reset_stats(l);
@@ -954,7 +956,8 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
954 return rc; 956 return rc;
955} 957}
956 958
957void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) 959static void tipc_link_advance_backlog(struct tipc_link *l,
960 struct sk_buff_head *xmitq)
958{ 961{
959 struct sk_buff *skb, *_skb; 962 struct sk_buff *skb, *_skb;
960 struct tipc_msg *hdr; 963 struct tipc_msg *hdr;
@@ -997,39 +1000,41 @@ static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb)
997 msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr)); 1000 msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr));
998} 1001}
999 1002
1000int tipc_link_retrans(struct tipc_link *l, struct tipc_link *nacker, 1003/* tipc_link_retrans() - retransmit one or more packets
1001 u16 from, u16 to, struct sk_buff_head *xmitq) 1004 * @l: the link to transmit on
1005 * @r: the receiving link ordering the retransmit. Same as l if unicast
1006 * @from: retransmit from (inclusive) this sequence number
1007 * @to: retransmit to (inclusive) this sequence number
 1008 * @xmitq: queue for accumulating the retransmitted packets
1009 */
1010static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
1011 u16 from, u16 to, struct sk_buff_head *xmitq)
1002{ 1012{
1003 struct sk_buff *_skb, *skb = skb_peek(&l->transmq); 1013 struct sk_buff *_skb, *skb = skb_peek(&l->transmq);
1004 struct tipc_msg *hdr;
1005 u16 ack = l->rcv_nxt - 1;
1006 u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; 1014 u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
1015 u16 ack = l->rcv_nxt - 1;
1016 struct tipc_msg *hdr;
1007 1017
1008 if (!skb) 1018 if (!skb)
1009 return 0; 1019 return 0;
1010 1020
1011 /* Detect repeated retransmit failures on same packet */ 1021 /* Detect repeated retransmit failures on same packet */
1012 if (nacker->last_retransm != buf_seqno(skb)) { 1022 if (r->last_retransm != buf_seqno(skb)) {
1013 nacker->last_retransm = buf_seqno(skb); 1023 r->last_retransm = buf_seqno(skb);
1014 nacker->stale_count = 1; 1024 r->stale_limit = jiffies + msecs_to_jiffies(l->tolerance);
1015 } else if (++nacker->stale_count > 100) { 1025 } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) {
1016 link_retransmit_failure(l, skb); 1026 link_retransmit_failure(l, skb);
1017 nacker->stale_count = 0;
1018 if (link_is_bc_sndlink(l)) 1027 if (link_is_bc_sndlink(l))
1019 return TIPC_LINK_DOWN_EVT; 1028 return TIPC_LINK_DOWN_EVT;
1020 return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); 1029 return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
1021 } 1030 }
1022 1031
1023 /* Move forward to where retransmission should start */
1024 skb_queue_walk(&l->transmq, skb) { 1032 skb_queue_walk(&l->transmq, skb) {
1025 if (!less(buf_seqno(skb), from))
1026 break;
1027 }
1028
1029 skb_queue_walk_from(&l->transmq, skb) {
1030 if (more(buf_seqno(skb), to))
1031 break;
1032 hdr = buf_msg(skb); 1033 hdr = buf_msg(skb);
1034 if (less(msg_seqno(hdr), from))
1035 continue;
1036 if (more(msg_seqno(hdr), to))
1037 break;
1033 _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); 1038 _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
1034 if (!_skb) 1039 if (!_skb)
1035 return 0; 1040 return 0;
@@ -1063,6 +1068,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
1063 skb_queue_tail(mc_inputq, skb); 1068 skb_queue_tail(mc_inputq, skb);
1064 return true; 1069 return true;
1065 } 1070 }
1071 /* else: fall through */
1066 case CONN_MANAGER: 1072 case CONN_MANAGER:
1067 skb_queue_tail(inputq, skb); 1073 skb_queue_tail(inputq, skb);
1068 return true; 1074 return true;
@@ -1271,6 +1277,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
1271 1277
1272 /* Forward queues and wake up waiting users */ 1278 /* Forward queues and wake up waiting users */
1273 if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { 1279 if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) {
1280 l->stale_cnt = 0;
1274 tipc_link_advance_backlog(l, xmitq); 1281 tipc_link_advance_backlog(l, xmitq);
1275 if (unlikely(!skb_queue_empty(&l->wakeupq))) 1282 if (unlikely(!skb_queue_empty(&l->wakeupq)))
1276 link_prepare_wakeup(l); 1283 link_prepare_wakeup(l);
@@ -1347,6 +1354,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
1347 msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2); 1354 msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2);
1348 1355
1349 if (mtyp == STATE_MSG) { 1356 if (mtyp == STATE_MSG) {
1357 if (l->peer_caps & TIPC_LINK_PROTO_SEQNO)
1358 msg_set_seqno(hdr, l->snd_nxt_state++);
1350 msg_set_seq_gap(hdr, rcvgap); 1359 msg_set_seq_gap(hdr, rcvgap);
1351 msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); 1360 msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl));
1352 msg_set_probe(hdr, probe); 1361 msg_set_probe(hdr, probe);
@@ -1438,6 +1447,44 @@ tnl:
1438 } 1447 }
1439} 1448}
1440 1449
1450/* tipc_link_validate_msg(): validate message against current link state
1451 * Returns true if message should be accepted, otherwise false
1452 */
1453bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr)
1454{
1455 u16 curr_session = l->peer_session;
1456 u16 session = msg_session(hdr);
1457 int mtyp = msg_type(hdr);
1458
1459 if (msg_user(hdr) != LINK_PROTOCOL)
1460 return true;
1461
1462 switch (mtyp) {
1463 case RESET_MSG:
1464 if (!l->in_session)
1465 return true;
1466 /* Accept only RESET with new session number */
1467 return more(session, curr_session);
1468 case ACTIVATE_MSG:
1469 if (!l->in_session)
1470 return true;
1471 /* Accept only ACTIVATE with new or current session number */
1472 return !less(session, curr_session);
1473 case STATE_MSG:
1474 /* Accept only STATE with current session number */
1475 if (!l->in_session)
1476 return false;
1477 if (session != curr_session)
1478 return false;
1479 if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO))
1480 return true;
1481 /* Accept only STATE with new sequence number */
1482 return !less(msg_seqno(hdr), l->rcv_nxt_state);
1483 default:
1484 return false;
1485 }
1486}
1487
1441/* tipc_link_proto_rcv(): receive link level protocol message : 1488/* tipc_link_proto_rcv(): receive link level protocol message :
1442 * Note that network plane id propagates through the network, and may 1489 * Note that network plane id propagates through the network, and may
1443 * change at any time. The node with lowest numerical id determines 1490 * change at any time. The node with lowest numerical id determines
@@ -1471,17 +1518,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
1471 hdr = buf_msg(skb); 1518 hdr = buf_msg(skb);
1472 data = msg_data(hdr); 1519 data = msg_data(hdr);
1473 1520
1521 if (!tipc_link_validate_msg(l, hdr))
1522 goto exit;
1523
1474 switch (mtyp) { 1524 switch (mtyp) {
1475 case RESET_MSG: 1525 case RESET_MSG:
1476
1477 /* Ignore duplicate RESET with old session number */
1478 if ((less_eq(msg_session(hdr), l->peer_session)) &&
1479 (l->peer_session != ANY_SESSION))
1480 break;
1481 /* fall thru' */
1482
1483 case ACTIVATE_MSG: 1526 case ACTIVATE_MSG:
1484
1485 /* Complete own link name with peer's interface name */ 1527 /* Complete own link name with peer's interface name */
1486 if_name = strrchr(l->name, ':') + 1; 1528 if_name = strrchr(l->name, ':') + 1;
1487 if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME) 1529 if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME)
@@ -1509,12 +1551,14 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
1509 rc = TIPC_LINK_UP_EVT; 1551 rc = TIPC_LINK_UP_EVT;
1510 1552
1511 l->peer_session = msg_session(hdr); 1553 l->peer_session = msg_session(hdr);
1554 l->in_session = true;
1512 l->peer_bearer_id = msg_bearer_id(hdr); 1555 l->peer_bearer_id = msg_bearer_id(hdr);
1513 if (l->mtu > msg_max_pkt(hdr)) 1556 if (l->mtu > msg_max_pkt(hdr))
1514 l->mtu = msg_max_pkt(hdr); 1557 l->mtu = msg_max_pkt(hdr);
1515 break; 1558 break;
1516 1559
1517 case STATE_MSG: 1560 case STATE_MSG:
1561 l->rcv_nxt_state = msg_seqno(hdr) + 1;
1518 1562
1519 /* Update own tolerance if peer indicates a non-zero value */ 1563 /* Update own tolerance if peer indicates a non-zero value */
1520 if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) 1564 if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
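
tipc_link_validate_msg() above gates protocol messages on session state: once a session is established, a RESET must carry a newer session, an ACTIVATE may carry the current or a newer one, and a STATE must match the current session and (when the peer supports TIPC_LINK_PROTO_SEQNO) must not be older than rcv_nxt_state. A simplified standalone sketch of those rules, using wrap-around-safe 16-bit comparisons and dropping the capability check:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum msg_type { RESET_MSG, ACTIVATE_MSG, STATE_MSG };

struct link_state {
	bool     in_session;
	uint16_t peer_session;
	uint16_t rcv_nxt_state;   /* next expected STATE seqno */
};

/* Modulo-2^16 ordering, as in the TIPC more()/less() helpers. */
static bool more(uint16_t a, uint16_t b) { return (int16_t)(a - b) > 0; }
static bool less(uint16_t a, uint16_t b) { return more(b, a); }

static bool validate_msg(const struct link_state *l, enum msg_type type,
			 uint16_t session, uint16_t seqno)
{
	switch (type) {
	case RESET_MSG:
		/* Only a RESET carrying a newer session is accepted. */
		return !l->in_session || more(session, l->peer_session);
	case ACTIVATE_MSG:
		/* ACTIVATE may carry the current or a newer session. */
		return !l->in_session || !less(session, l->peer_session);
	case STATE_MSG:
		/* STATE must match the current session and not be stale. */
		if (!l->in_session || session != l->peer_session)
			return false;
		return !less(seqno, l->rcv_nxt_state);
	}
	return false;
}

int main(void)
{
	struct link_state l = { true, 7, 42 };

	printf("RESET, old session: %d\n", validate_msg(&l, RESET_MSG, 7, 0));
	printf("RESET, new session: %d\n", validate_msg(&l, RESET_MSG, 8, 0));
	printf("STATE, stale seqno: %d\n", validate_msg(&l, STATE_MSG, 7, 41));
	printf("STATE, fresh seqno: %d\n", validate_msg(&l, STATE_MSG, 7, 42));
	return 0;
}
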
diff --git a/net/tipc/link.h b/net/tipc/link.h
index ec59348a81e8..7bc494a33fdf 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -110,6 +110,8 @@ char *tipc_link_name(struct tipc_link *l);
110char tipc_link_plane(struct tipc_link *l); 110char tipc_link_plane(struct tipc_link *l);
111int tipc_link_prio(struct tipc_link *l); 111int tipc_link_prio(struct tipc_link *l);
112int tipc_link_window(struct tipc_link *l); 112int tipc_link_window(struct tipc_link *l);
113void tipc_link_update_caps(struct tipc_link *l, u16 capabilities);
114bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr);
113unsigned long tipc_link_tolerance(struct tipc_link *l); 115unsigned long tipc_link_tolerance(struct tipc_link *l);
114void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, 116void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
115 struct sk_buff_head *xmitq); 117 struct sk_buff_head *xmitq);
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 5453e564da82..67f69389ec17 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -684,7 +684,8 @@ int tipc_nl_monitor_get_threshold(struct net *net)
684 return tn->mon_threshold; 684 return tn->mon_threshold;
685} 685}
686 686
687int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg) 687static int __tipc_nl_add_monitor_peer(struct tipc_peer *peer,
688 struct tipc_nl_msg *msg)
688{ 689{
689 struct tipc_mon_domain *dom = peer->domain; 690 struct tipc_mon_domain *dom = peer->domain;
690 struct nlattr *attrs; 691 struct nlattr *attrs;
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index b6c45dccba3d..b61891054709 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -416,26 +416,31 @@ bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu)
416 */ 416 */
417bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) 417bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos)
418{ 418{
419 struct tipc_msg *msg; 419 struct tipc_msg *hdr, *ihdr;
420 int imsz, offset; 420 int imsz;
421 421
422 *iskb = NULL; 422 *iskb = NULL;
423 if (unlikely(skb_linearize(skb))) 423 if (unlikely(skb_linearize(skb)))
424 goto none; 424 goto none;
425 425
426 msg = buf_msg(skb); 426 hdr = buf_msg(skb);
427 offset = msg_hdr_sz(msg) + *pos; 427 if (unlikely(*pos > (msg_data_sz(hdr) - MIN_H_SIZE)))
428 if (unlikely(offset > (msg_size(msg) - MIN_H_SIZE)))
429 goto none; 428 goto none;
430 429
431 *iskb = skb_clone(skb, GFP_ATOMIC); 430 ihdr = (struct tipc_msg *)(msg_data(hdr) + *pos);
432 if (unlikely(!*iskb)) 431 imsz = msg_size(ihdr);
432
433 if ((*pos + imsz) > msg_data_sz(hdr))
433 goto none; 434 goto none;
434 skb_pull(*iskb, offset); 435
435 imsz = msg_size(buf_msg(*iskb)); 436 *iskb = tipc_buf_acquire(imsz, GFP_ATOMIC);
436 skb_trim(*iskb, imsz); 437 if (!*iskb)
438 goto none;
439
440 skb_copy_to_linear_data(*iskb, ihdr, imsz);
437 if (unlikely(!tipc_msg_validate(iskb))) 441 if (unlikely(!tipc_msg_validate(iskb)))
438 goto none; 442 goto none;
443
439 *pos += align(imsz); 444 *pos += align(imsz);
440 return true; 445 return true;
441none: 446none:
@@ -531,12 +536,6 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
531 msg_set_hdr_sz(hdr, BASIC_H_SIZE); 536 msg_set_hdr_sz(hdr, BASIC_H_SIZE);
532 } 537 }
533 538
534 if (skb_cloned(_skb) &&
535 pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
536 goto exit;
537
538 /* reassign after skb header modifications */
539 hdr = buf_msg(_skb);
540 /* Now reverse the concerned fields */ 539 /* Now reverse the concerned fields */
541 msg_set_errcode(hdr, err); 540 msg_set_errcode(hdr, err);
542 msg_set_non_seq(hdr, 0); 541 msg_set_non_seq(hdr, 0);
@@ -595,10 +594,6 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
595 if (!skb_cloned(skb)) 594 if (!skb_cloned(skb))
596 return true; 595 return true;
597 596
598 /* Unclone buffer in case it was bundled */
599 if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
600 return false;
601
602 return true; 597 return true;
603} 598}
604 599
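
tipc_msg_extract() above no longer clones the bundle skb and trims it: it bounds-checks the cursor against the bundle's data area, allocates a fresh buffer of the inner message's size, copies the inner message out, and advances the cursor by the aligned size. A toy version of that copy-out extraction over a flat byte buffer (the one-byte size header is a made-up layout, not the TIPC header format):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MIN_H_SIZE 1u
#define ALIGN4(x)  (((x) + 3u) & ~3u)

/* Extract one inner record starting at *pos: validate that a header fits,
 * validate that the full record fits, copy it into its own buffer, and
 * advance the cursor to the next 4-byte-aligned position.
 */
static uint8_t *extract(const uint8_t *data, size_t data_sz, size_t *pos,
			size_t *out_sz)
{
	uint8_t imsz;
	uint8_t *copy;

	if (*pos > data_sz - MIN_H_SIZE)
		return NULL;               /* no room left for a header */
	imsz = data[*pos];                 /* first byte = record size */
	if (imsz < MIN_H_SIZE || *pos + imsz > data_sz)
		return NULL;               /* truncated or bogus record */

	copy = malloc(imsz);
	if (!copy)
		return NULL;
	memcpy(copy, data + *pos, imsz);   /* copy out instead of cloning */
	*pos += ALIGN4(imsz);
	*out_sz = imsz;
	return copy;
}

int main(void)
{
	/* One 6-byte record: size byte plus five payload bytes, then padding. */
	const uint8_t bundle[8] = { 6, 'h', 'e', 'l', 'l', 'o', 0, 0 };
	size_t pos = 0, sz = 0;
	uint8_t *rec = extract(bundle, sizeof(bundle), &pos, &sz);

	if (rec) {
		printf("extracted %zu bytes, cursor now at %zu\n", sz, pos);
		free(rec);
	}
	return 0;
}
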
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index bebe88cae07b..88f027b502f6 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -735,7 +735,7 @@ int tipc_nametbl_init(struct net *net)
735 struct name_table *nt; 735 struct name_table *nt;
736 int i; 736 int i;
737 737
738 nt = kzalloc(sizeof(*nt), GFP_ATOMIC); 738 nt = kzalloc(sizeof(*nt), GFP_KERNEL);
739 if (!nt) 739 if (!nt)
740 return -ENOMEM; 740 return -ENOMEM;
741 741
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 4fbaa0464405..62199cf5a56c 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -121,12 +121,15 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr)
121 121
122void tipc_net_finalize(struct net *net, u32 addr) 122void tipc_net_finalize(struct net *net, u32 addr)
123{ 123{
124 tipc_set_node_addr(net, addr); 124 struct tipc_net *tn = tipc_net(net);
125 smp_mb(); 125
126 tipc_named_reinit(net); 126 if (!cmpxchg(&tn->node_addr, 0, addr)) {
127 tipc_sk_reinit(net); 127 tipc_set_node_addr(net, addr);
128 tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, 128 tipc_named_reinit(net);
129 TIPC_CLUSTER_SCOPE, 0, addr); 129 tipc_sk_reinit(net);
130 tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr,
131 TIPC_CLUSTER_SCOPE, 0, addr);
132 }
130} 133}
131 134
132void tipc_net_stop(struct net *net) 135void tipc_net_stop(struct net *net)
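
tipc_net_finalize() above now guards the whole finalization sequence with cmpxchg(&tn->node_addr, 0, addr), so the address is published and the tables reinitialized exactly once even if two paths race to finalize. The same one-shot pattern expressed with C11 atomics:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int node_addr;   /* 0 means "not finalized yet" */

static void net_finalize(unsigned int addr)
{
	unsigned int expected = 0;

	/* Compare-and-swap from 0 to addr: only one caller can win. */
	if (atomic_compare_exchange_strong(&node_addr, &expected, addr)) {
		/* Only the winner of the race publishes the address. */
		printf("finalized with address 0x%x\n", addr);
	}
}

int main(void)
{
	net_finalize(0x1001);   /* runs the one-time setup */
	net_finalize(0x2002);   /* silently loses: address already set */
	printf("node_addr = 0x%x\n", (unsigned int)node_addr);
	return 0;
}
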
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 6a44eb812baf..68014f1b6976 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -45,6 +45,7 @@
45#include "netlink.h" 45#include "netlink.h"
46 46
47#define INVALID_NODE_SIG 0x10000 47#define INVALID_NODE_SIG 0x10000
48#define NODE_CLEANUP_AFTER 300000
48 49
49/* Flags used to take different actions according to flag type 50/* Flags used to take different actions according to flag type
50 * TIPC_NOTIFY_NODE_DOWN: notify node is down 51 * TIPC_NOTIFY_NODE_DOWN: notify node is down
@@ -96,6 +97,7 @@ struct tipc_bclink_entry {
96 * @link_id: local and remote bearer ids of changing link, if any 97 * @link_id: local and remote bearer ids of changing link, if any
97 * @publ_list: list of publications 98 * @publ_list: list of publications
98 * @rcu: rcu struct for tipc_node 99 * @rcu: rcu struct for tipc_node
100 * @delete_at: indicates the time for deleting a down node
99 */ 101 */
100struct tipc_node { 102struct tipc_node {
101 u32 addr; 103 u32 addr;
@@ -121,6 +123,7 @@ struct tipc_node {
121 unsigned long keepalive_intv; 123 unsigned long keepalive_intv;
122 struct timer_list timer; 124 struct timer_list timer;
123 struct rcu_head rcu; 125 struct rcu_head rcu;
126 unsigned long delete_at;
124}; 127};
125 128
126/* Node FSM states and events: 129/* Node FSM states and events:
@@ -160,6 +163,7 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr);
160static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); 163static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id);
161static void tipc_node_put(struct tipc_node *node); 164static void tipc_node_put(struct tipc_node *node);
162static bool node_is_up(struct tipc_node *n); 165static bool node_is_up(struct tipc_node *n);
166static void tipc_node_delete_from_list(struct tipc_node *node);
163 167
164struct tipc_sock_conn { 168struct tipc_sock_conn {
165 u32 port; 169 u32 port;
@@ -359,13 +363,24 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
359{ 363{
360 struct tipc_net *tn = net_generic(net, tipc_net_id); 364 struct tipc_net *tn = net_generic(net, tipc_net_id);
361 struct tipc_node *n, *temp_node; 365 struct tipc_node *n, *temp_node;
366 struct tipc_link *l;
367 int bearer_id;
362 int i; 368 int i;
363 369
364 spin_lock_bh(&tn->node_list_lock); 370 spin_lock_bh(&tn->node_list_lock);
365 n = tipc_node_find(net, addr); 371 n = tipc_node_find(net, addr);
366 if (n) { 372 if (n) {
373 if (n->capabilities == capabilities)
374 goto exit;
367 /* Same node may come back with new capabilities */ 375 /* Same node may come back with new capabilities */
376 write_lock_bh(&n->lock);
368 n->capabilities = capabilities; 377 n->capabilities = capabilities;
378 for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
379 l = n->links[bearer_id].link;
380 if (l)
381 tipc_link_update_caps(l, capabilities);
382 }
383 write_unlock_bh(&n->lock);
369 goto exit; 384 goto exit;
370 } 385 }
371 n = kzalloc(sizeof(*n), GFP_ATOMIC); 386 n = kzalloc(sizeof(*n), GFP_ATOMIC);
@@ -390,6 +405,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
390 for (i = 0; i < MAX_BEARERS; i++) 405 for (i = 0; i < MAX_BEARERS; i++)
391 spin_lock_init(&n->links[i].lock); 406 spin_lock_init(&n->links[i].lock);
392 n->state = SELF_DOWN_PEER_LEAVING; 407 n->state = SELF_DOWN_PEER_LEAVING;
408 n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER);
393 n->signature = INVALID_NODE_SIG; 409 n->signature = INVALID_NODE_SIG;
394 n->active_links[0] = INVALID_BEARER_ID; 410 n->active_links[0] = INVALID_BEARER_ID;
395 n->active_links[1] = INVALID_BEARER_ID; 411 n->active_links[1] = INVALID_BEARER_ID;
@@ -433,11 +449,16 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
433 tipc_link_set_abort_limit(l, tol / n->keepalive_intv); 449 tipc_link_set_abort_limit(l, tol / n->keepalive_intv);
434} 450}
435 451
436static void tipc_node_delete(struct tipc_node *node) 452static void tipc_node_delete_from_list(struct tipc_node *node)
437{ 453{
438 list_del_rcu(&node->list); 454 list_del_rcu(&node->list);
439 hlist_del_rcu(&node->hash); 455 hlist_del_rcu(&node->hash);
440 tipc_node_put(node); 456 tipc_node_put(node);
457}
458
459static void tipc_node_delete(struct tipc_node *node)
460{
461 tipc_node_delete_from_list(node);
441 462
442 del_timer_sync(&node->timer); 463 del_timer_sync(&node->timer);
443 tipc_node_put(node); 464 tipc_node_put(node);
@@ -544,6 +565,42 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port)
544 tipc_node_put(node); 565 tipc_node_put(node);
545} 566}
546 567
568static void tipc_node_clear_links(struct tipc_node *node)
569{
570 int i;
571
572 for (i = 0; i < MAX_BEARERS; i++) {
573 struct tipc_link_entry *le = &node->links[i];
574
575 if (le->link) {
576 kfree(le->link);
577 le->link = NULL;
578 node->link_cnt--;
579 }
580 }
581}
582
583/* tipc_node_cleanup - delete nodes that do not
584 * have active links for NODE_CLEANUP_AFTER time
585 */
586static int tipc_node_cleanup(struct tipc_node *peer)
587{
588 struct tipc_net *tn = tipc_net(peer->net);
589 bool deleted = false;
590
591 spin_lock_bh(&tn->node_list_lock);
592 tipc_node_write_lock(peer);
593
594 if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) {
595 tipc_node_clear_links(peer);
596 tipc_node_delete_from_list(peer);
597 deleted = true;
598 }
599 tipc_node_write_unlock(peer);
600 spin_unlock_bh(&tn->node_list_lock);
601 return deleted;
602}
603
547/* tipc_node_timeout - handle expiration of node timer 604/* tipc_node_timeout - handle expiration of node timer
548 */ 605 */
549static void tipc_node_timeout(struct timer_list *t) 606static void tipc_node_timeout(struct timer_list *t)
@@ -551,21 +608,29 @@ static void tipc_node_timeout(struct timer_list *t)
551 struct tipc_node *n = from_timer(n, t, timer); 608 struct tipc_node *n = from_timer(n, t, timer);
552 struct tipc_link_entry *le; 609 struct tipc_link_entry *le;
553 struct sk_buff_head xmitq; 610 struct sk_buff_head xmitq;
611 int remains = n->link_cnt;
554 int bearer_id; 612 int bearer_id;
555 int rc = 0; 613 int rc = 0;
556 614
615 if (!node_is_up(n) && tipc_node_cleanup(n)) {
616			/* Remove the timer's reference to the node */
617 tipc_node_put(n);
618 return;
619 }
620
557 __skb_queue_head_init(&xmitq); 621 __skb_queue_head_init(&xmitq);
558 622
559 for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { 623 for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) {
560 tipc_node_read_lock(n); 624 tipc_node_read_lock(n);
561 le = &n->links[bearer_id]; 625 le = &n->links[bearer_id];
562 spin_lock_bh(&le->lock);
563 if (le->link) { 626 if (le->link) {
627 spin_lock_bh(&le->lock);
564 /* Link tolerance may change asynchronously: */ 628 /* Link tolerance may change asynchronously: */
565 tipc_node_calculate_timer(n, le->link); 629 tipc_node_calculate_timer(n, le->link);
566 rc = tipc_link_timeout(le->link, &xmitq); 630 rc = tipc_link_timeout(le->link, &xmitq);
631 spin_unlock_bh(&le->lock);
632 remains--;
567 } 633 }
568 spin_unlock_bh(&le->lock);
569 tipc_node_read_unlock(n); 634 tipc_node_read_unlock(n);
570 tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr); 635 tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr);
571 if (rc & TIPC_LINK_DOWN_EVT) 636 if (rc & TIPC_LINK_DOWN_EVT)
@@ -797,6 +862,7 @@ static u32 tipc_node_suggest_addr(struct net *net, u32 addr)
797} 862}
798 863
799/* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not 864/* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not
865 * Returns suggested address if any, otherwise 0
800 */ 866 */
801u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) 867u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
802{ 868{
@@ -819,12 +885,14 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
819 if (n) { 885 if (n) {
820 addr = n->addr; 886 addr = n->addr;
821 tipc_node_put(n); 887 tipc_node_put(n);
888 return addr;
822 } 889 }
823 /* Even this node may be in trial phase */ 890
891 /* Even this node may be in conflict */
824 if (tn->trial_addr == addr) 892 if (tn->trial_addr == addr)
825 return tipc_node_suggest_addr(net, addr); 893 return tipc_node_suggest_addr(net, addr);
826 894
827 return addr; 895 return 0;
828} 896}
829 897
830void tipc_node_check_dest(struct net *net, u32 addr, 898void tipc_node_check_dest(struct net *net, u32 addr,
@@ -1171,6 +1239,7 @@ static void node_lost_contact(struct tipc_node *n,
1171 uint i; 1239 uint i;
1172 1240
1173 pr_debug("Lost contact with %x\n", n->addr); 1241 pr_debug("Lost contact with %x\n", n->addr);
1242 n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER);
1174 1243
1175 /* Clean up broadcast state */ 1244 /* Clean up broadcast state */
1176 tipc_bcast_remove_peer(n->net, n->bc_entry.link); 1245 tipc_bcast_remove_peer(n->net, n->bc_entry.link);
@@ -1478,7 +1547,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
1478 * tipc_node_check_state - check and if necessary update node state 1547 * tipc_node_check_state - check and if necessary update node state
1479 * @skb: TIPC packet 1548 * @skb: TIPC packet
1480 * @bearer_id: identity of bearer delivering the packet 1549 * @bearer_id: identity of bearer delivering the packet
1481 * Returns true if state is ok, otherwise consumes buffer and returns false 1550 * Returns true if state and msg are ok, otherwise false
1482 */ 1551 */
1483static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, 1552static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
1484 int bearer_id, struct sk_buff_head *xmitq) 1553 int bearer_id, struct sk_buff_head *xmitq)
@@ -1512,6 +1581,9 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
1512 } 1581 }
1513 } 1582 }
1514 1583
1584 if (!tipc_link_validate_msg(l, hdr))
1585 return false;
1586
1515	/* Check and update node accessibility if applicable */ 1587	/* Check and update node accessibility if applicable */
1516 if (state == SELF_UP_PEER_COMING) { 1588 if (state == SELF_UP_PEER_COMING) {
1517 if (!tipc_link_is_up(l)) 1589 if (!tipc_link_is_up(l))
@@ -1740,7 +1812,6 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
1740 struct tipc_node *peer; 1812 struct tipc_node *peer;
1741 u32 addr; 1813 u32 addr;
1742 int err; 1814 int err;
1743 int i;
1744 1815
1745 /* We identify the peer by its net */ 1816 /* We identify the peer by its net */
1746 if (!info->attrs[TIPC_NLA_NET]) 1817 if (!info->attrs[TIPC_NLA_NET])
@@ -1775,15 +1846,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
1775 goto err_out; 1846 goto err_out;
1776 } 1847 }
1777 1848
1778 for (i = 0; i < MAX_BEARERS; i++) { 1849 tipc_node_clear_links(peer);
1779 struct tipc_link_entry *le = &peer->links[i];
1780
1781 if (le->link) {
1782 kfree(le->link);
1783 le->link = NULL;
1784 peer->link_cnt--;
1785 }
1786 }
1787 tipc_node_write_unlock(peer); 1850 tipc_node_write_unlock(peer);
1788 tipc_node_delete(peer); 1851 tipc_node_delete(peer);
1789 1852
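The node.c changes above stamp every peer with a delete_at deadline (jiffies + NODE_CLEANUP_AFTER) on creation and on lost contact, and let the node timer remove peers that stay down past that deadline via tipc_node_cleanup(). A rough userspace C sketch of the same deadline check follows, using CLOCK_MONOTONIC in place of jiffies; struct node, ms_from_now and node_cleanup are made-up names for illustration only.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define NODE_CLEANUP_AFTER_MS 300000	/* 5 minutes, as in the patch */

/* Hypothetical reduced node: only the fields the cleanup check needs. */
struct node {
	bool up;
	struct timespec delete_at;	/* stands in for the jiffies deadline */
};

static struct timespec ms_from_now(long ms)
{
	struct timespec t;

	clock_gettime(CLOCK_MONOTONIC, &t);
	t.tv_sec += ms / 1000;
	t.tv_nsec += (ms % 1000) * 1000000L;
	if (t.tv_nsec >= 1000000000L) {
		t.tv_sec++;
		t.tv_nsec -= 1000000000L;
	}
	return t;
}

/* Equivalent of time_after(jiffies, peer->delete_at). */
static bool deadline_passed(const struct timespec *d)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return now.tv_sec > d->tv_sec ||
	       (now.tv_sec == d->tv_sec && now.tv_nsec > d->tv_nsec);
}

/* Mirrors tipc_node_cleanup(): delete only if the node has stayed down
 * past its deadline; a node that comes back up is simply re-stamped.
 */
static bool node_cleanup(struct node *n)
{
	if (!n->up && deadline_passed(&n->delete_at)) {
		/* unlink and free the node here */
		return true;
	}
	return false;
}

int main(void)
{
	struct node n = { .up = false,
			  .delete_at = ms_from_now(NODE_CLEANUP_AFTER_MS) };

	printf("deleted now? %d\n", node_cleanup(&n));	/* 0: deadline not reached */
	return 0;
}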
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 846c8f240872..48b3298a248d 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -49,14 +49,16 @@ enum {
49 TIPC_BCAST_STATE_NACK = (1 << 2), 49 TIPC_BCAST_STATE_NACK = (1 << 2),
50 TIPC_BLOCK_FLOWCTL = (1 << 3), 50 TIPC_BLOCK_FLOWCTL = (1 << 3),
51 TIPC_BCAST_RCAST = (1 << 4), 51 TIPC_BCAST_RCAST = (1 << 4),
52 TIPC_NODE_ID128 = (1 << 5) 52 TIPC_NODE_ID128 = (1 << 5),
53 TIPC_LINK_PROTO_SEQNO = (1 << 6)
53}; 54};
54 55
55#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ 56#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
56 TIPC_BCAST_STATE_NACK | \ 57 TIPC_BCAST_STATE_NACK | \
57 TIPC_BCAST_RCAST | \ 58 TIPC_BCAST_RCAST | \
58 TIPC_BLOCK_FLOWCTL | \ 59 TIPC_BLOCK_FLOWCTL | \
59 TIPC_NODE_ID128) 60 TIPC_NODE_ID128 | \
61 TIPC_LINK_PROTO_SEQNO)
60#define INVALID_BEARER_ID -1 62#define INVALID_BEARER_ID -1
61 63
62void tipc_node_stop(struct net *net); 64void tipc_node_stop(struct net *net);
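node.h above adds TIPC_LINK_PROTO_SEQNO as a new capability bit and ORs it into TIPC_NODE_CAPABILITIES, the mask a node advertises to its peers. The short C sketch below shows the general per-bit capability handshake this relies on: a feature is only used when both ends advertise its bit. The CAP_* names and the LOCAL_CAPABILITIES macro are invented for the example.

#include <stdio.h>

/* Each feature is one bit; the local mask is the OR of everything
 * this build supports, in the spirit of TIPC_NODE_CAPABILITIES.
 */
enum {
	CAP_BCAST_SYNCH		= (1 << 0),
	CAP_NODE_ID128		= (1 << 5),
	CAP_LINK_PROTO_SEQNO	= (1 << 6),	/* the bit added by this patch */
};

#define LOCAL_CAPABILITIES (CAP_BCAST_SYNCH | CAP_NODE_ID128 | CAP_LINK_PROTO_SEQNO)

int main(void)
{
	unsigned int peer_caps = CAP_BCAST_SYNCH | CAP_NODE_ID128;

	/* A feature is usable only if both ends advertise its bit. */
	if (LOCAL_CAPABILITIES & peer_caps & CAP_LINK_PROTO_SEQNO)
		puts("peer supports link protocol sequence numbers");
	else
		puts("fall back: peer lacks CAP_LINK_PROTO_SEQNO");
	return 0;
}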
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 14a5d055717d..c1e93c9515bc 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -411,7 +411,6 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout)
411static int tipc_sk_create(struct net *net, struct socket *sock, 411static int tipc_sk_create(struct net *net, struct socket *sock,
412 int protocol, int kern) 412 int protocol, int kern)
413{ 413{
414 struct tipc_net *tn;
415 const struct proto_ops *ops; 414 const struct proto_ops *ops;
416 struct sock *sk; 415 struct sock *sk;
417 struct tipc_sock *tsk; 416 struct tipc_sock *tsk;
@@ -446,7 +445,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
446 INIT_LIST_HEAD(&tsk->publications); 445 INIT_LIST_HEAD(&tsk->publications);
447 INIT_LIST_HEAD(&tsk->cong_links); 446 INIT_LIST_HEAD(&tsk->cong_links);
448 msg = &tsk->phdr; 447 msg = &tsk->phdr;
449 tn = net_generic(sock_net(sk), tipc_net_id);
450 448
451 /* Finish initializing socket data structures */ 449 /* Finish initializing socket data structures */
452 sock->ops = ops; 450 sock->ops = ops;
@@ -692,9 +690,10 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
692} 690}
693 691
694/** 692/**
695 * tipc_poll - read pollmask 693 * tipc_poll - read and possibly block on pollmask
696 * @file: file structure associated with the socket 694 * @file: file structure associated with the socket
697 * @sock: socket for which to calculate the poll bits 695 * @sock: socket for which to calculate the poll bits
696 * @wait: ???
698 * 697 *
699 * Returns pollmask value 698 * Returns pollmask value
700 * 699 *
@@ -708,12 +707,15 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
708 * imply that the operation will succeed, merely that it should be performed 707 * imply that the operation will succeed, merely that it should be performed
709 * and will not block. 708 * and will not block.
710 */ 709 */
711static __poll_t tipc_poll_mask(struct socket *sock, __poll_t events) 710static __poll_t tipc_poll(struct file *file, struct socket *sock,
711 poll_table *wait)
712{ 712{
713 struct sock *sk = sock->sk; 713 struct sock *sk = sock->sk;
714 struct tipc_sock *tsk = tipc_sk(sk); 714 struct tipc_sock *tsk = tipc_sk(sk);
715 __poll_t revents = 0; 715 __poll_t revents = 0;
716 716
717 sock_poll_wait(file, wait);
718
717 if (sk->sk_shutdown & RCV_SHUTDOWN) 719 if (sk->sk_shutdown & RCV_SHUTDOWN)
718 revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 720 revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
719 if (sk->sk_shutdown == SHUTDOWN_MASK) 721 if (sk->sk_shutdown == SHUTDOWN_MASK)
@@ -1113,7 +1115,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
1113 u32 self = tipc_own_addr(net); 1115 u32 self = tipc_own_addr(net);
1114 u32 type, lower, upper, scope; 1116 u32 type, lower, upper, scope;
1115 struct sk_buff *skb, *_skb; 1117 struct sk_buff *skb, *_skb;
1116 u32 portid, oport, onode; 1118 u32 portid, onode;
1117 struct sk_buff_head tmpq; 1119 struct sk_buff_head tmpq;
1118 struct list_head dports; 1120 struct list_head dports;
1119 struct tipc_msg *hdr; 1121 struct tipc_msg *hdr;
@@ -1129,7 +1131,6 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
1129 user = msg_user(hdr); 1131 user = msg_user(hdr);
1130 mtyp = msg_type(hdr); 1132 mtyp = msg_type(hdr);
1131 hlen = skb_headroom(skb) + msg_hdr_sz(hdr); 1133 hlen = skb_headroom(skb) + msg_hdr_sz(hdr);
1132 oport = msg_origport(hdr);
1133 onode = msg_orignode(hdr); 1134 onode = msg_orignode(hdr);
1134 type = msg_nametype(hdr); 1135 type = msg_nametype(hdr);
1135 1136
@@ -3033,7 +3034,7 @@ static const struct proto_ops msg_ops = {
3033 .socketpair = tipc_socketpair, 3034 .socketpair = tipc_socketpair,
3034 .accept = sock_no_accept, 3035 .accept = sock_no_accept,
3035 .getname = tipc_getname, 3036 .getname = tipc_getname,
3036 .poll_mask = tipc_poll_mask, 3037 .poll = tipc_poll,
3037 .ioctl = tipc_ioctl, 3038 .ioctl = tipc_ioctl,
3038 .listen = sock_no_listen, 3039 .listen = sock_no_listen,
3039 .shutdown = tipc_shutdown, 3040 .shutdown = tipc_shutdown,
@@ -3054,7 +3055,7 @@ static const struct proto_ops packet_ops = {
3054 .socketpair = tipc_socketpair, 3055 .socketpair = tipc_socketpair,
3055 .accept = tipc_accept, 3056 .accept = tipc_accept,
3056 .getname = tipc_getname, 3057 .getname = tipc_getname,
3057 .poll_mask = tipc_poll_mask, 3058 .poll = tipc_poll,
3058 .ioctl = tipc_ioctl, 3059 .ioctl = tipc_ioctl,
3059 .listen = tipc_listen, 3060 .listen = tipc_listen,
3060 .shutdown = tipc_shutdown, 3061 .shutdown = tipc_shutdown,
@@ -3075,7 +3076,7 @@ static const struct proto_ops stream_ops = {
3075 .socketpair = tipc_socketpair, 3076 .socketpair = tipc_socketpair,
3076 .accept = tipc_accept, 3077 .accept = tipc_accept,
3077 .getname = tipc_getname, 3078 .getname = tipc_getname,
3078 .poll_mask = tipc_poll_mask, 3079 .poll = tipc_poll,
3079 .ioctl = tipc_ioctl, 3080 .ioctl = tipc_ioctl,
3080 .listen = tipc_listen, 3081 .listen = tipc_listen,
3081 .shutdown = tipc_shutdown, 3082 .shutdown = tipc_shutdown,
@@ -3316,6 +3317,11 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb,
3316 goto stat_msg_cancel; 3317 goto stat_msg_cancel;
3317 3318
3318 nla_nest_end(skb, stat); 3319 nla_nest_end(skb, stat);
3320
3321 if (tsk->group)
3322 if (tipc_group_fill_sock_diag(tsk->group, skb))
3323 goto stat_msg_cancel;
3324
3319 nla_nest_end(skb, attrs); 3325 nla_nest_end(skb, attrs);
3320 3326
3321 return 0; 3327 return 0;
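The socket.c hunks convert tipc_poll_mask() back to a classic ->poll handler that calls sock_poll_wait() and reports EPOLLRDHUP | EPOLLIN | EPOLLRDNORM when the receive side has been shut down. As a hedged userspace illustration of those event bits, the snippet below polls one end of a socketpair after the peer writes a byte and shuts down its write side; the exact revents reported can vary by kernel, so treat this only as a demonstration of the flags, not of TIPC internals.

#define _GNU_SOURCE	/* for POLLRDHUP */
#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int sv[2];
	struct pollfd pfd;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv))
		return 1;

	if (write(sv[1], "x", 1) != 1)	/* make sv[0] readable */
		return 1;
	shutdown(sv[1], SHUT_WR);	/* and signal remote shutdown */

	pfd.fd = sv[0];
	pfd.events = POLLIN | POLLRDNORM | POLLRDHUP;
	poll(&pfd, 1, 0);

	printf("revents: IN=%d RDNORM=%d RDHUP=%d\n",
	       !!(pfd.revents & POLLIN),
	       !!(pfd.revents & POLLRDNORM),
	       !!(pfd.revents & POLLRDHUP));
	close(sv[0]);
	close(sv[1]);
	return 0;
}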
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index a7a8f8e20ff3..292742e50bfa 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -52,9 +52,12 @@ static DEFINE_SPINLOCK(tls_device_lock);
52 52
53static void tls_device_free_ctx(struct tls_context *ctx) 53static void tls_device_free_ctx(struct tls_context *ctx)
54{ 54{
55 struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx); 55 if (ctx->tx_conf == TLS_HW)
56 kfree(tls_offload_ctx_tx(ctx));
57
58 if (ctx->rx_conf == TLS_HW)
59 kfree(tls_offload_ctx_rx(ctx));
56 60
57 kfree(offload_ctx);
58 kfree(ctx); 61 kfree(ctx);
59} 62}
60 63
@@ -71,10 +74,11 @@ static void tls_device_gc_task(struct work_struct *work)
71 list_for_each_entry_safe(ctx, tmp, &gc_list, list) { 74 list_for_each_entry_safe(ctx, tmp, &gc_list, list) {
72 struct net_device *netdev = ctx->netdev; 75 struct net_device *netdev = ctx->netdev;
73 76
74 if (netdev) { 77 if (netdev && ctx->tx_conf == TLS_HW) {
75 netdev->tlsdev_ops->tls_dev_del(netdev, ctx, 78 netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
76 TLS_OFFLOAD_CTX_DIR_TX); 79 TLS_OFFLOAD_CTX_DIR_TX);
77 dev_put(netdev); 80 dev_put(netdev);
81 ctx->netdev = NULL;
78 } 82 }
79 83
80 list_del(&ctx->list); 84 list_del(&ctx->list);
@@ -82,6 +86,22 @@ static void tls_device_gc_task(struct work_struct *work)
82 } 86 }
83} 87}
84 88
89static void tls_device_attach(struct tls_context *ctx, struct sock *sk,
90 struct net_device *netdev)
91{
92 if (sk->sk_destruct != tls_device_sk_destruct) {
93 refcount_set(&ctx->refcount, 1);
94 dev_hold(netdev);
95 ctx->netdev = netdev;
96 spin_lock_irq(&tls_device_lock);
97 list_add_tail(&ctx->list, &tls_device_list);
98 spin_unlock_irq(&tls_device_lock);
99
100 ctx->sk_destruct = sk->sk_destruct;
101 sk->sk_destruct = tls_device_sk_destruct;
102 }
103}
104
85static void tls_device_queue_ctx_destruction(struct tls_context *ctx) 105static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
86{ 106{
87 unsigned long flags; 107 unsigned long flags;
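tls_device_attach(), added in the hunk above, centralizes the refcount_set/dev_hold/list_add/sk_destruct swap and is made idempotent by checking whether sk_destruct was already replaced, so setting up TX offload and then RX offload on the same socket takes only one device reference and one list insertion. A toy C sketch of that attach-once shape follows; struct ctx and device_attach are illustrative stand-ins, not the TLS API.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical context mirroring tls_device_attach(). */
struct ctx {
	int refcount;
	int dev_refs;
	bool destruct_swapped;	/* stands in for the sk->sk_destruct check */
};

static void device_attach(struct ctx *c)
{
	if (!c->destruct_swapped) {
		c->refcount = 1;	/* refcount_set(&ctx->refcount, 1) */
		c->dev_refs++;		/* dev_hold(netdev) */
		/* list_add_tail(&ctx->list, &tls_device_list) would go here */
		c->destruct_swapped = true;
	}
}

int main(void)
{
	struct ctx c = { 0 };

	device_attach(&c);	/* TX offload set up */
	device_attach(&c);	/* RX offload set up later: no double hold */
	printf("dev_refs=%d refcount=%d\n", c.dev_refs, c.refcount);
	return 0;
}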
@@ -125,7 +145,7 @@ static void destroy_record(struct tls_record_info *record)
125 kfree(record); 145 kfree(record);
126} 146}
127 147
128static void delete_all_records(struct tls_offload_context *offload_ctx) 148static void delete_all_records(struct tls_offload_context_tx *offload_ctx)
129{ 149{
130 struct tls_record_info *info, *temp; 150 struct tls_record_info *info, *temp;
131 151
@@ -141,14 +161,14 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
141{ 161{
142 struct tls_context *tls_ctx = tls_get_ctx(sk); 162 struct tls_context *tls_ctx = tls_get_ctx(sk);
143 struct tls_record_info *info, *temp; 163 struct tls_record_info *info, *temp;
144 struct tls_offload_context *ctx; 164 struct tls_offload_context_tx *ctx;
145 u64 deleted_records = 0; 165 u64 deleted_records = 0;
146 unsigned long flags; 166 unsigned long flags;
147 167
148 if (!tls_ctx) 168 if (!tls_ctx)
149 return; 169 return;
150 170
151 ctx = tls_offload_ctx(tls_ctx); 171 ctx = tls_offload_ctx_tx(tls_ctx);
152 172
153 spin_lock_irqsave(&ctx->lock, flags); 173 spin_lock_irqsave(&ctx->lock, flags);
154 info = ctx->retransmit_hint; 174 info = ctx->retransmit_hint;
@@ -179,15 +199,17 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
179void tls_device_sk_destruct(struct sock *sk) 199void tls_device_sk_destruct(struct sock *sk)
180{ 200{
181 struct tls_context *tls_ctx = tls_get_ctx(sk); 201 struct tls_context *tls_ctx = tls_get_ctx(sk);
182 struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); 202 struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
183 203
184 if (ctx->open_record) 204 tls_ctx->sk_destruct(sk);
185 destroy_record(ctx->open_record);
186 205
187 delete_all_records(ctx); 206 if (tls_ctx->tx_conf == TLS_HW) {
188 crypto_free_aead(ctx->aead_send); 207 if (ctx->open_record)
189 ctx->sk_destruct(sk); 208 destroy_record(ctx->open_record);
190 clean_acked_data_disable(inet_csk(sk)); 209 delete_all_records(ctx);
210 crypto_free_aead(ctx->aead_send);
211 clean_acked_data_disable(inet_csk(sk));
212 }
191 213
192 if (refcount_dec_and_test(&tls_ctx->refcount)) 214 if (refcount_dec_and_test(&tls_ctx->refcount))
193 tls_device_queue_ctx_destruction(tls_ctx); 215 tls_device_queue_ctx_destruction(tls_ctx);
@@ -219,7 +241,7 @@ static void tls_append_frag(struct tls_record_info *record,
219 241
220static int tls_push_record(struct sock *sk, 242static int tls_push_record(struct sock *sk,
221 struct tls_context *ctx, 243 struct tls_context *ctx,
222 struct tls_offload_context *offload_ctx, 244 struct tls_offload_context_tx *offload_ctx,
223 struct tls_record_info *record, 245 struct tls_record_info *record,
224 struct page_frag *pfrag, 246 struct page_frag *pfrag,
225 int flags, 247 int flags,
@@ -264,7 +286,7 @@ static int tls_push_record(struct sock *sk,
264 return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); 286 return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
265} 287}
266 288
267static int tls_create_new_record(struct tls_offload_context *offload_ctx, 289static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
268 struct page_frag *pfrag, 290 struct page_frag *pfrag,
269 size_t prepend_size) 291 size_t prepend_size)
270{ 292{
@@ -290,7 +312,7 @@ static int tls_create_new_record(struct tls_offload_context *offload_ctx,
290} 312}
291 313
292static int tls_do_allocation(struct sock *sk, 314static int tls_do_allocation(struct sock *sk,
293 struct tls_offload_context *offload_ctx, 315 struct tls_offload_context_tx *offload_ctx,
294 struct page_frag *pfrag, 316 struct page_frag *pfrag,
295 size_t prepend_size) 317 size_t prepend_size)
296{ 318{
@@ -324,7 +346,7 @@ static int tls_push_data(struct sock *sk,
324 unsigned char record_type) 346 unsigned char record_type)
325{ 347{
326 struct tls_context *tls_ctx = tls_get_ctx(sk); 348 struct tls_context *tls_ctx = tls_get_ctx(sk);
327 struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); 349 struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
328 int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; 350 int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
329 int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); 351 int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
330 struct tls_record_info *record = ctx->open_record; 352 struct tls_record_info *record = ctx->open_record;
@@ -477,7 +499,7 @@ out:
477 return rc; 499 return rc;
478} 500}
479 501
480struct tls_record_info *tls_get_record(struct tls_offload_context *context, 502struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
481 u32 seq, u64 *p_record_sn) 503 u32 seq, u64 *p_record_sn)
482{ 504{
483 u64 record_sn = context->hint_record_sn; 505 u64 record_sn = context->hint_record_sn;
@@ -520,11 +542,123 @@ static int tls_device_push_pending_record(struct sock *sk, int flags)
520 return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); 542 return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA);
521} 543}
522 544
545void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
546{
547 struct tls_context *tls_ctx = tls_get_ctx(sk);
548 struct net_device *netdev = tls_ctx->netdev;
549 struct tls_offload_context_rx *rx_ctx;
550 u32 is_req_pending;
551 s64 resync_req;
552 u32 req_seq;
553
554 if (tls_ctx->rx_conf != TLS_HW)
555 return;
556
557 rx_ctx = tls_offload_ctx_rx(tls_ctx);
558 resync_req = atomic64_read(&rx_ctx->resync_req);
559 req_seq = ntohl(resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1);
560 is_req_pending = resync_req;
561
562 if (unlikely(is_req_pending) && req_seq == seq &&
563 atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
564 netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk,
565 seq + TLS_HEADER_SIZE - 1,
566 rcd_sn);
567}
568
569static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
570{
571 struct strp_msg *rxm = strp_msg(skb);
572 int err = 0, offset = rxm->offset, copy, nsg;
573 struct sk_buff *skb_iter, *unused;
574 struct scatterlist sg[1];
575 char *orig_buf, *buf;
576
577 orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE +
578 TLS_CIPHER_AES_GCM_128_IV_SIZE, sk->sk_allocation);
579 if (!orig_buf)
580 return -ENOMEM;
581 buf = orig_buf;
582
583 nsg = skb_cow_data(skb, 0, &unused);
584 if (unlikely(nsg < 0)) {
585 err = nsg;
586 goto free_buf;
587 }
588
589 sg_init_table(sg, 1);
590 sg_set_buf(&sg[0], buf,
591 rxm->full_len + TLS_HEADER_SIZE +
592 TLS_CIPHER_AES_GCM_128_IV_SIZE);
593 skb_copy_bits(skb, offset, buf,
594 TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE);
595
596 /* We are interested only in the decrypted data not the auth */
597 err = decrypt_skb(sk, skb, sg);
598 if (err != -EBADMSG)
599 goto free_buf;
600 else
601 err = 0;
602
603 copy = min_t(int, skb_pagelen(skb) - offset,
604 rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE);
605
606 if (skb->decrypted)
607 skb_store_bits(skb, offset, buf, copy);
608
609 offset += copy;
610 buf += copy;
611
612 skb_walk_frags(skb, skb_iter) {
613 copy = min_t(int, skb_iter->len,
614 rxm->full_len - offset + rxm->offset -
615 TLS_CIPHER_AES_GCM_128_TAG_SIZE);
616
617 if (skb_iter->decrypted)
618 skb_store_bits(skb_iter, offset, buf, copy);
619
620 offset += copy;
621 buf += copy;
622 }
623
624free_buf:
625 kfree(orig_buf);
626 return err;
627}
628
629int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
630{
631 struct tls_context *tls_ctx = tls_get_ctx(sk);
632 struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx);
633 int is_decrypted = skb->decrypted;
634 int is_encrypted = !is_decrypted;
635 struct sk_buff *skb_iter;
636
637 /* Skip if it is already decrypted */
638 if (ctx->sw.decrypted)
639 return 0;
640
641 /* Check if all the data is decrypted already */
642 skb_walk_frags(skb, skb_iter) {
643 is_decrypted &= skb_iter->decrypted;
644 is_encrypted &= !skb_iter->decrypted;
645 }
646
647 ctx->sw.decrypted |= is_decrypted;
648
649	/* Return immediately if the record is either entirely plaintext or
650	 * entirely ciphertext. Otherwise handle re-encryption of a partially
651	 * decrypted record.
652 */
653 return (is_encrypted || is_decrypted) ? 0 :
654 tls_device_reencrypt(sk, skb);
655}
656
523int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) 657int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
524{ 658{
525 u16 nonce_size, tag_size, iv_size, rec_seq_size; 659 u16 nonce_size, tag_size, iv_size, rec_seq_size;
526 struct tls_record_info *start_marker_record; 660 struct tls_record_info *start_marker_record;
527 struct tls_offload_context *offload_ctx; 661 struct tls_offload_context_tx *offload_ctx;
528 struct tls_crypto_info *crypto_info; 662 struct tls_crypto_info *crypto_info;
529 struct net_device *netdev; 663 struct net_device *netdev;
530 char *iv, *rec_seq; 664 char *iv, *rec_seq;
@@ -546,7 +680,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
546 goto out; 680 goto out;
547 } 681 }
548 682
549 offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL); 683 offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL);
550 if (!offload_ctx) { 684 if (!offload_ctx) {
551 rc = -ENOMEM; 685 rc = -ENOMEM;
552 goto free_marker_record; 686 goto free_marker_record;
@@ -582,12 +716,11 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
582 memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); 716 memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
583 717
584 ctx->tx.rec_seq_size = rec_seq_size; 718 ctx->tx.rec_seq_size = rec_seq_size;
585 ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); 719 ctx->tx.rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL);
586 if (!ctx->tx.rec_seq) { 720 if (!ctx->tx.rec_seq) {
587 rc = -ENOMEM; 721 rc = -ENOMEM;
588 goto free_iv; 722 goto free_iv;
589 } 723 }
590 memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size);
591 724
592 rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info); 725 rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info);
593 if (rc) 726 if (rc)
@@ -609,7 +742,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
609 742
610 clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); 743 clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked);
611 ctx->push_pending_record = tls_device_push_pending_record; 744 ctx->push_pending_record = tls_device_push_pending_record;
612 offload_ctx->sk_destruct = sk->sk_destruct;
613 745
614 /* TLS offload is greatly simplified if we don't send 746 /* TLS offload is greatly simplified if we don't send
615 * SKBs where only part of the payload needs to be encrypted. 747 * SKBs where only part of the payload needs to be encrypted.
@@ -619,8 +751,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
619 if (skb) 751 if (skb)
620 TCP_SKB_CB(skb)->eor = 1; 752 TCP_SKB_CB(skb)->eor = 1;
621 753
622 refcount_set(&ctx->refcount, 1);
623
624 /* We support starting offload on multiple sockets 754 /* We support starting offload on multiple sockets
625 * concurrently, so we only need a read lock here. 755 * concurrently, so we only need a read lock here.
626 * This lock must precede get_netdev_for_sock to prevent races between 756 * This lock must precede get_netdev_for_sock to prevent races between
@@ -655,19 +785,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
655 if (rc) 785 if (rc)
656 goto release_netdev; 786 goto release_netdev;
657 787
658 ctx->netdev = netdev; 788 tls_device_attach(ctx, sk, netdev);
659
660 spin_lock_irq(&tls_device_lock);
661 list_add_tail(&ctx->list, &tls_device_list);
662 spin_unlock_irq(&tls_device_lock);
663 789
664 sk->sk_validate_xmit_skb = tls_validate_xmit_skb;
665 /* following this assignment tls_is_sk_tx_device_offloaded 790 /* following this assignment tls_is_sk_tx_device_offloaded
666 * will return true and the context might be accessed 791 * will return true and the context might be accessed
667 * by the netdev's xmit function. 792 * by the netdev's xmit function.
668 */ 793 */
669 smp_store_release(&sk->sk_destruct, 794 smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb);
670 &tls_device_sk_destruct); 795 dev_put(netdev);
671 up_read(&device_offload_lock); 796 up_read(&device_offload_lock);
672 goto out; 797 goto out;
673 798
@@ -690,6 +815,105 @@ out:
690 return rc; 815 return rc;
691} 816}
692 817
818int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
819{
820 struct tls_offload_context_rx *context;
821 struct net_device *netdev;
822 int rc = 0;
823
824 /* We support starting offload on multiple sockets
825 * concurrently, so we only need a read lock here.
826 * This lock must precede get_netdev_for_sock to prevent races between
827 * NETDEV_DOWN and setsockopt.
828 */
829 down_read(&device_offload_lock);
830 netdev = get_netdev_for_sock(sk);
831 if (!netdev) {
832 pr_err_ratelimited("%s: netdev not found\n", __func__);
833 rc = -EINVAL;
834 goto release_lock;
835 }
836
837 if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
838 pr_err_ratelimited("%s: netdev %s with no TLS offload\n",
839 __func__, netdev->name);
840 rc = -ENOTSUPP;
841 goto release_netdev;
842 }
843
844 /* Avoid offloading if the device is down
845 * We don't want to offload new flows after
846 * the NETDEV_DOWN event
847 */
848 if (!(netdev->flags & IFF_UP)) {
849 rc = -EINVAL;
850 goto release_netdev;
851 }
852
853 context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL);
854 if (!context) {
855 rc = -ENOMEM;
856 goto release_netdev;
857 }
858
859 ctx->priv_ctx_rx = context;
860 rc = tls_set_sw_offload(sk, ctx, 0);
861 if (rc)
862 goto release_ctx;
863
864 rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX,
865 &ctx->crypto_recv,
866 tcp_sk(sk)->copied_seq);
867 if (rc) {
868 pr_err_ratelimited("%s: The netdev has refused to offload this socket\n",
869 __func__);
870 goto free_sw_resources;
871 }
872
873 tls_device_attach(ctx, sk, netdev);
874 goto release_netdev;
875
876free_sw_resources:
877 tls_sw_free_resources_rx(sk);
878release_ctx:
879 ctx->priv_ctx_rx = NULL;
880release_netdev:
881 dev_put(netdev);
882release_lock:
883 up_read(&device_offload_lock);
884 return rc;
885}
886
887void tls_device_offload_cleanup_rx(struct sock *sk)
888{
889 struct tls_context *tls_ctx = tls_get_ctx(sk);
890 struct net_device *netdev;
891
892 down_read(&device_offload_lock);
893 netdev = tls_ctx->netdev;
894 if (!netdev)
895 goto out;
896
897 if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
898 pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n",
899 __func__);
900 goto out;
901 }
902
903 netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx,
904 TLS_OFFLOAD_CTX_DIR_RX);
905
906 if (tls_ctx->tx_conf != TLS_HW) {
907 dev_put(netdev);
908 tls_ctx->netdev = NULL;
909 }
910out:
911 up_read(&device_offload_lock);
912 kfree(tls_ctx->rx.rec_seq);
913 kfree(tls_ctx->rx.iv);
914 tls_sw_release_resources_rx(sk);
915}
916
693static int tls_device_down(struct net_device *netdev) 917static int tls_device_down(struct net_device *netdev)
694{ 918{
695 struct tls_context *ctx, *tmp; 919 struct tls_context *ctx, *tmp;
@@ -710,8 +934,12 @@ static int tls_device_down(struct net_device *netdev)
710 spin_unlock_irqrestore(&tls_device_lock, flags); 934 spin_unlock_irqrestore(&tls_device_lock, flags);
711 935
712 list_for_each_entry_safe(ctx, tmp, &list, list) { 936 list_for_each_entry_safe(ctx, tmp, &list, list) {
713 netdev->tlsdev_ops->tls_dev_del(netdev, ctx, 937 if (ctx->tx_conf == TLS_HW)
714 TLS_OFFLOAD_CTX_DIR_TX); 938 netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
939 TLS_OFFLOAD_CTX_DIR_TX);
940 if (ctx->rx_conf == TLS_HW)
941 netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
942 TLS_OFFLOAD_CTX_DIR_RX);
715 ctx->netdev = NULL; 943 ctx->netdev = NULL;
716 dev_put(netdev); 944 dev_put(netdev);
717 list_del_init(&ctx->list); 945 list_del_init(&ctx->list);
@@ -732,12 +960,16 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event,
732{ 960{
733 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 961 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
734 962
735 if (!(dev->features & NETIF_F_HW_TLS_TX)) 963 if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX)))
736 return NOTIFY_DONE; 964 return NOTIFY_DONE;
737 965
738 switch (event) { 966 switch (event) {
739 case NETDEV_REGISTER: 967 case NETDEV_REGISTER:
740 case NETDEV_FEAT_CHANGE: 968 case NETDEV_FEAT_CHANGE:
969 if ((dev->features & NETIF_F_HW_TLS_RX) &&
970 !dev->tlsdev_ops->tls_dev_resync_rx)
971 return NOTIFY_BAD;
972
741 if (dev->tlsdev_ops && 973 if (dev->tlsdev_ops &&
742 dev->tlsdev_ops->tls_dev_add && 974 dev->tlsdev_ops->tls_dev_add &&
743 dev->tlsdev_ops->tls_dev_del) 975 dev->tlsdev_ops->tls_dev_del)
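handle_device_resync(), added in the tls_device.c diff above, treats rx_ctx->resync_req as a one-shot mailbox: the driver posts a request, and the receive path claims it with atomic64_try_cmpxchg so the device is resynchronized exactly once per request and only for the matching sequence number. Below is a small userspace C11 sketch of that consume-once pattern; post_resync/consume_resync and the bit layout are assumptions for illustration, not the driver API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical one-word "resync request" mailbox. */
static _Atomic uint64_t resync_req;

static void post_resync(uint32_t seq)
{
	/* low bit marks the request pending, upper bits carry the seq */
	atomic_store(&resync_req, ((uint64_t)seq << 32) | 1);
}

static bool consume_resync(uint32_t seq)
{
	uint64_t req = atomic_load(&resync_req);
	uint32_t req_seq = (uint32_t)(req >> 32);

	if (!req || req_seq != seq)
		return false;
	/* Only one caller wins the swap back to 0 and acts on the request. */
	return atomic_compare_exchange_strong(&resync_req, &req, 0);
}

int main(void)
{
	post_resync(1000);
	printf("wrong seq consumed: %d\n", consume_resync(999));  /* 0 */
	printf("right seq consumed: %d\n", consume_resync(1000)); /* 1 */
	printf("second attempt:     %d\n", consume_resync(1000)); /* 0 */
	return 0;
}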
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 748914abdb60..6102169239d1 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -42,7 +42,7 @@ static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk)
42 sg_set_page(sg, sg_page(src), 42 sg_set_page(sg, sg_page(src),
43 src->length - diff, walk->offset); 43 src->length - diff, walk->offset);
44 44
45 scatterwalk_crypto_chain(sg, sg_next(src), 0, 2); 45 scatterwalk_crypto_chain(sg, sg_next(src), 2);
46} 46}
47 47
48static int tls_enc_record(struct aead_request *aead_req, 48static int tls_enc_record(struct aead_request *aead_req,
@@ -214,7 +214,7 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln)
214 214
215static int fill_sg_in(struct scatterlist *sg_in, 215static int fill_sg_in(struct scatterlist *sg_in,
216 struct sk_buff *skb, 216 struct sk_buff *skb,
217 struct tls_offload_context *ctx, 217 struct tls_offload_context_tx *ctx,
218 u64 *rcd_sn, 218 u64 *rcd_sn,
219 s32 *sync_size, 219 s32 *sync_size,
220 int *resync_sgs) 220 int *resync_sgs)
@@ -299,7 +299,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
299 s32 sync_size, u64 rcd_sn) 299 s32 sync_size, u64 rcd_sn)
300{ 300{
301 int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); 301 int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
302 struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); 302 struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
303 int payload_len = skb->len - tcp_payload_offset; 303 int payload_len = skb->len - tcp_payload_offset;
304 void *buf, *iv, *aad, *dummy_buf; 304 void *buf, *iv, *aad, *dummy_buf;
305 struct aead_request *aead_req; 305 struct aead_request *aead_req;
@@ -361,7 +361,7 @@ static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb)
361{ 361{
362 int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); 362 int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
363 struct tls_context *tls_ctx = tls_get_ctx(sk); 363 struct tls_context *tls_ctx = tls_get_ctx(sk);
364 struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); 364 struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
365 int payload_len = skb->len - tcp_payload_offset; 365 int payload_len = skb->len - tcp_payload_offset;
366 struct scatterlist *sg_in, sg_out[3]; 366 struct scatterlist *sg_in, sg_out[3];
367 struct sk_buff *nskb = NULL; 367 struct sk_buff *nskb = NULL;
@@ -413,9 +413,10 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
413 413
414 return tls_sw_fallback(sk, skb); 414 return tls_sw_fallback(sk, skb);
415} 415}
416EXPORT_SYMBOL_GPL(tls_validate_xmit_skb);
416 417
417int tls_sw_fallback_init(struct sock *sk, 418int tls_sw_fallback_init(struct sock *sk,
418 struct tls_offload_context *offload_ctx, 419 struct tls_offload_context_tx *offload_ctx,
419 struct tls_crypto_info *crypto_info) 420 struct tls_crypto_info *crypto_info)
420{ 421{
421 const u8 *key; 422 const u8 *key;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index a127d61e8af9..180b6640e531 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -45,21 +45,13 @@
45MODULE_AUTHOR("Mellanox Technologies"); 45MODULE_AUTHOR("Mellanox Technologies");
46MODULE_DESCRIPTION("Transport Layer Security Support"); 46MODULE_DESCRIPTION("Transport Layer Security Support");
47MODULE_LICENSE("Dual BSD/GPL"); 47MODULE_LICENSE("Dual BSD/GPL");
48MODULE_ALIAS_TCP_ULP("tls");
48 49
49enum { 50enum {
50 TLSV4, 51 TLSV4,
51 TLSV6, 52 TLSV6,
52 TLS_NUM_PROTS, 53 TLS_NUM_PROTS,
53}; 54};
54enum {
55 TLS_BASE,
56 TLS_SW,
57#ifdef CONFIG_TLS_DEVICE
58 TLS_HW,
59#endif
60 TLS_HW_RECORD,
61 TLS_NUM_CONFIG,
62};
63 55
64static struct proto *saved_tcpv6_prot; 56static struct proto *saved_tcpv6_prot;
65static DEFINE_MUTEX(tcpv6_prot_mutex); 57static DEFINE_MUTEX(tcpv6_prot_mutex);
@@ -221,9 +213,14 @@ static void tls_write_space(struct sock *sk)
221{ 213{
222 struct tls_context *ctx = tls_get_ctx(sk); 214 struct tls_context *ctx = tls_get_ctx(sk);
223 215
224	/* We are already sending pages, ignore notification */ 216	/* If in_tcp_sendpages, call the lower protocol write space handler
225	if (ctx->in_tcp_sendpages) 217	 * to ensure we wake up any waiting operations there. For example
218	 * if do_tcp_sendpages were to call sk_wait_event.
219 */
220 if (ctx->in_tcp_sendpages) {
221 ctx->sk_write_space(sk);
226 return; 222 return;
223 }
227 224
228 if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) { 225 if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) {
229 gfp_t sk_allocation = sk->sk_allocation; 226 gfp_t sk_allocation = sk->sk_allocation;
@@ -290,7 +287,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
290 } 287 }
291 288
292#ifdef CONFIG_TLS_DEVICE 289#ifdef CONFIG_TLS_DEVICE
293 if (ctx->tx_conf != TLS_HW) { 290 if (ctx->rx_conf == TLS_HW)
291 tls_device_offload_cleanup_rx(sk);
292
293 if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW) {
294#else 294#else
295 { 295 {
296#endif 296#endif
@@ -470,8 +470,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
470 conf = TLS_SW; 470 conf = TLS_SW;
471 } 471 }
472 } else { 472 } else {
473 rc = tls_set_sw_offload(sk, ctx, 0); 473#ifdef CONFIG_TLS_DEVICE
474 conf = TLS_SW; 474 rc = tls_set_device_offload_rx(sk, ctx);
475 conf = TLS_HW;
476 if (rc) {
477#else
478 {
479#endif
480 rc = tls_set_sw_offload(sk, ctx, 0);
481 conf = TLS_SW;
482 }
475 } 483 }
476 484
477 if (rc) 485 if (rc)
@@ -629,6 +637,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
629 prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; 637 prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW];
630 prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; 638 prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg;
631 prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; 639 prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage;
640
641 prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW];
642
643 prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW];
644
645 prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW];
632#endif 646#endif
633 647
634 prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; 648 prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
@@ -712,7 +726,7 @@ static int __init tls_register(void)
712 build_protos(tls_prots[TLSV4], &tcp_prot); 726 build_protos(tls_prots[TLSV4], &tcp_prot);
713 727
714 tls_sw_proto_ops = inet_stream_ops; 728 tls_sw_proto_ops = inet_stream_ops;
715 tls_sw_proto_ops.poll_mask = tls_sw_poll_mask; 729 tls_sw_proto_ops.poll = tls_sw_poll;
716 tls_sw_proto_ops.splice_read = tls_sw_splice_read; 730 tls_sw_proto_ops.splice_read = tls_sw_splice_read;
717 731
718#ifdef CONFIG_TLS_DEVICE 732#ifdef CONFIG_TLS_DEVICE
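tls_main.c above makes do_tls_setsockopt_conf() try tls_set_device_offload_rx() first and fall back to tls_set_sw_offload() when the device path fails, recording TLS_HW or TLS_SW accordingly (and compiling the fallback unconditionally when CONFIG_TLS_DEVICE is off). A minimal C sketch of that try-hardware-then-software control flow follows; set_device_offload_rx and set_sw_offload are stand-ins that merely simulate the outcome, not the kernel functions.

#include <errno.h>
#include <stdio.h>

enum tls_conf { TLS_BASE, TLS_SW, TLS_HW, TLS_HW_RECORD };

/* Hypothetical stand-ins: the "device" path reports that no
 * offload-capable NIC is present, the SW path always succeeds.
 */
static int set_device_offload_rx(void) { return -EINVAL; }
static int set_sw_offload(void)        { return 0; }

/* Prefer the HW RX path; fall back to pure software if it fails. */
static int configure_rx(enum tls_conf *conf)
{
	int rc = set_device_offload_rx();

	*conf = TLS_HW;
	if (rc) {
		rc = set_sw_offload();
		*conf = TLS_SW;
	}
	return rc;
}

int main(void)
{
	enum tls_conf conf;
	int rc = configure_rx(&conf);

	printf("rc=%d conf=%s\n", rc, conf == TLS_HW ? "TLS_HW" : "TLS_SW");
	return 0;
}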
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index f127fac88acf..52fbe727d7c1 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -48,21 +48,11 @@ static int tls_do_decryption(struct sock *sk,
48 struct scatterlist *sgout, 48 struct scatterlist *sgout,
49 char *iv_recv, 49 char *iv_recv,
50 size_t data_len, 50 size_t data_len,
51 struct sk_buff *skb, 51 struct aead_request *aead_req)
52 gfp_t flags)
53{ 52{
54 struct tls_context *tls_ctx = tls_get_ctx(sk); 53 struct tls_context *tls_ctx = tls_get_ctx(sk);
55 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 54 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
56 struct strp_msg *rxm = strp_msg(skb);
57 struct aead_request *aead_req;
58
59 int ret; 55 int ret;
60 unsigned int req_size = sizeof(struct aead_request) +
61 crypto_aead_reqsize(ctx->aead_recv);
62
63 aead_req = kzalloc(req_size, flags);
64 if (!aead_req)
65 return -ENOMEM;
66 56
67 aead_request_set_tfm(aead_req, ctx->aead_recv); 57 aead_request_set_tfm(aead_req, ctx->aead_recv);
68 aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); 58 aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
@@ -73,20 +63,6 @@ static int tls_do_decryption(struct sock *sk,
73 crypto_req_done, &ctx->async_wait); 63 crypto_req_done, &ctx->async_wait);
74 64
75 ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); 65 ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait);
76
77 if (ret < 0)
78 goto out;
79
80 rxm->offset += tls_ctx->rx.prepend_size;
81 rxm->full_len -= tls_ctx->rx.overhead_size;
82 tls_advance_record_sn(sk, &tls_ctx->rx);
83
84 ctx->decrypted = true;
85
86 ctx->saved_data_ready(sk);
87
88out:
89 kfree(aead_req);
90 return ret; 66 return ret;
91} 67}
92 68
@@ -224,8 +200,7 @@ static int tls_push_record(struct sock *sk, int flags,
224 struct aead_request *req; 200 struct aead_request *req;
225 int rc; 201 int rc;
226 202
227 req = kzalloc(sizeof(struct aead_request) + 203 req = aead_request_alloc(ctx->aead_send, sk->sk_allocation);
228 crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation);
229 if (!req) 204 if (!req)
230 return -ENOMEM; 205 return -ENOMEM;
231 206
@@ -267,7 +242,7 @@ static int tls_push_record(struct sock *sk, int flags,
267 242
268 tls_advance_record_sn(sk, &tls_ctx->tx); 243 tls_advance_record_sn(sk, &tls_ctx->tx);
269out_req: 244out_req:
270 kfree(req); 245 aead_request_free(req);
271 return rc; 246 return rc;
272} 247}
273 248
@@ -328,7 +303,12 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
328 } 303 }
329 } 304 }
330 305
306 /* Mark the end in the last sg entry if newly added */
307 if (num_elem > *pages_used)
308 sg_mark_end(&to[num_elem - 1]);
331out: 309out:
310 if (rc)
311 iov_iter_revert(from, size - *size_used);
332 *size_used = size; 312 *size_used = size;
333 *pages_used = num_elem; 313 *pages_used = num_elem;
334 314
@@ -377,6 +357,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
377 int record_room; 357 int record_room;
378 bool full_record; 358 bool full_record;
379 int orig_size; 359 int orig_size;
360 bool is_kvec = msg->msg_iter.type & ITER_KVEC;
380 361
381 if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) 362 if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
382 return -ENOTSUPP; 363 return -ENOTSUPP;
@@ -425,8 +406,7 @@ alloc_encrypted:
425 try_to_copy -= required_size - ctx->sg_encrypted_size; 406 try_to_copy -= required_size - ctx->sg_encrypted_size;
426 full_record = true; 407 full_record = true;
427 } 408 }
428 409 if (!is_kvec && (full_record || eor)) {
429 if (full_record || eor) {
430 ret = zerocopy_from_iter(sk, &msg->msg_iter, 410 ret = zerocopy_from_iter(sk, &msg->msg_iter,
431 try_to_copy, &ctx->sg_plaintext_num_elem, 411 try_to_copy, &ctx->sg_plaintext_num_elem,
432 &ctx->sg_plaintext_size, 412 &ctx->sg_plaintext_size,
@@ -438,15 +418,11 @@ alloc_encrypted:
438 418
439 copied += try_to_copy; 419 copied += try_to_copy;
440 ret = tls_push_record(sk, msg->msg_flags, record_type); 420 ret = tls_push_record(sk, msg->msg_flags, record_type);
441 if (!ret) 421 if (ret)
442 continue;
443 if (ret == -EAGAIN)
444 goto send_end; 422 goto send_end;
423 continue;
445 424
446 copied -= try_to_copy;
447fallback_to_reg_send: 425fallback_to_reg_send:
448 iov_iter_revert(&msg->msg_iter,
449 ctx->sg_plaintext_size - orig_size);
450 trim_sg(sk, ctx->sg_plaintext_data, 426 trim_sg(sk, ctx->sg_plaintext_data,
451 &ctx->sg_plaintext_num_elem, 427 &ctx->sg_plaintext_num_elem,
452 &ctx->sg_plaintext_size, 428 &ctx->sg_plaintext_size,
@@ -646,6 +622,9 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
646 return NULL; 622 return NULL;
647 } 623 }
648 624
625 if (sk->sk_shutdown & RCV_SHUTDOWN)
626 return NULL;
627
649 if (sock_flag(sk, SOCK_DONE)) 628 if (sock_flag(sk, SOCK_DONE))
650 return NULL; 629 return NULL;
651 630
@@ -670,52 +649,167 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
670 return skb; 649 return skb;
671} 650}
672 651
673static int decrypt_skb(struct sock *sk, struct sk_buff *skb, 652/* This function decrypts the input skb into either out_iov or in out_sg
674 struct scatterlist *sgout) 653 * or in skb buffers itself. The input parameter 'zc' indicates if
654 * zero-copy mode needs to be tried or not. With zero-copy mode, either
655 * out_iov or out_sg must be non-NULL. In case both out_iov and out_sg are
656 * NULL, then the decryption happens inside skb buffers itself, i.e.
657 * zero-copy gets disabled and 'zc' is updated.
658 */
659
660static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
661 struct iov_iter *out_iov,
662 struct scatterlist *out_sg,
663 int *chunk, bool *zc)
675{ 664{
676 struct tls_context *tls_ctx = tls_get_ctx(sk); 665 struct tls_context *tls_ctx = tls_get_ctx(sk);
677 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 666 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
678 char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE];
679 struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2];
680 struct scatterlist *sgin = &sgin_arr[0];
681 struct strp_msg *rxm = strp_msg(skb); 667 struct strp_msg *rxm = strp_msg(skb);
682 int ret, nsg = ARRAY_SIZE(sgin_arr); 668 int n_sgin, n_sgout, nsg, mem_size, aead_size, err, pages = 0;
669 struct aead_request *aead_req;
683 struct sk_buff *unused; 670 struct sk_buff *unused;
671 u8 *aad, *iv, *mem = NULL;
672 struct scatterlist *sgin = NULL;
673 struct scatterlist *sgout = NULL;
674 const int data_len = rxm->full_len - tls_ctx->rx.overhead_size;
675
676 if (*zc && (out_iov || out_sg)) {
677 if (out_iov)
678 n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1;
679 else
680 n_sgout = sg_nents(out_sg);
681 } else {
682 n_sgout = 0;
683 *zc = false;
684 }
685
686 n_sgin = skb_cow_data(skb, 0, &unused);
687 if (n_sgin < 1)
688 return -EBADMSG;
689
690 /* Increment to accommodate AAD */
691 n_sgin = n_sgin + 1;
692
693 nsg = n_sgin + n_sgout;
694
695 aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv);
696 mem_size = aead_size + (nsg * sizeof(struct scatterlist));
697 mem_size = mem_size + TLS_AAD_SPACE_SIZE;
698 mem_size = mem_size + crypto_aead_ivsize(ctx->aead_recv);
699
700 /* Allocate a single block of memory which contains
701 * aead_req || sgin[] || sgout[] || aad || iv.
702 * This order achieves correct alignment for aead_req, sgin, sgout.
703 */
704 mem = kmalloc(mem_size, sk->sk_allocation);
705 if (!mem)
706 return -ENOMEM;
707
708 /* Segment the allocated memory */
709 aead_req = (struct aead_request *)mem;
710 sgin = (struct scatterlist *)(mem + aead_size);
711 sgout = sgin + n_sgin;
712 aad = (u8 *)(sgout + n_sgout);
713 iv = aad + TLS_AAD_SPACE_SIZE;
684 714
685 ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, 715 /* Prepare IV */
716 err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE,
686 iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, 717 iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
687 tls_ctx->rx.iv_size); 718 tls_ctx->rx.iv_size);
688 if (ret < 0) 719 if (err < 0) {
689 return ret; 720 kfree(mem);
690 721 return err;
691 memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
692 if (!sgout) {
693 nsg = skb_cow_data(skb, 0, &unused) + 1;
694 sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation);
695 sgout = sgin;
696 } 722 }
723 memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
697 724
698 sg_init_table(sgin, nsg); 725 /* Prepare AAD */
699 sg_set_buf(&sgin[0], ctx->rx_aad_ciphertext, TLS_AAD_SPACE_SIZE); 726 tls_make_aad(aad, rxm->full_len - tls_ctx->rx.overhead_size,
727 tls_ctx->rx.rec_seq, tls_ctx->rx.rec_seq_size,
728 ctx->control);
700 729
701 nsg = skb_to_sgvec(skb, &sgin[1], 730 /* Prepare sgin */
731 sg_init_table(sgin, n_sgin);
732 sg_set_buf(&sgin[0], aad, TLS_AAD_SPACE_SIZE);
733 err = skb_to_sgvec(skb, &sgin[1],
702 rxm->offset + tls_ctx->rx.prepend_size, 734 rxm->offset + tls_ctx->rx.prepend_size,
703 rxm->full_len - tls_ctx->rx.prepend_size); 735 rxm->full_len - tls_ctx->rx.prepend_size);
736 if (err < 0) {
737 kfree(mem);
738 return err;
739 }
704 740
705 tls_make_aad(ctx->rx_aad_ciphertext, 741 if (n_sgout) {
706 rxm->full_len - tls_ctx->rx.overhead_size, 742 if (out_iov) {
707 tls_ctx->rx.rec_seq, 743 sg_init_table(sgout, n_sgout);
708 tls_ctx->rx.rec_seq_size, 744 sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE);
709 ctx->control);
710 745
711 ret = tls_do_decryption(sk, sgin, sgout, iv, 746 *chunk = 0;
712 rxm->full_len - tls_ctx->rx.overhead_size, 747 err = zerocopy_from_iter(sk, out_iov, data_len, &pages,
713 skb, sk->sk_allocation); 748 chunk, &sgout[1],
749 (n_sgout - 1), false);
750 if (err < 0)
751 goto fallback_to_reg_recv;
752 } else if (out_sg) {
753 memcpy(sgout, out_sg, n_sgout * sizeof(*sgout));
754 } else {
755 goto fallback_to_reg_recv;
756 }
757 } else {
758fallback_to_reg_recv:
759 sgout = sgin;
760 pages = 0;
761 *chunk = 0;
762 *zc = false;
763 }
714 764
715 if (sgin != &sgin_arr[0]) 765 /* Prepare and submit AEAD request */
716 kfree(sgin); 766 err = tls_do_decryption(sk, sgin, sgout, iv, data_len, aead_req);
717 767
718 return ret; 768 /* Release the pages in case iov was mapped to pages */
769 for (; pages > 0; pages--)
770 put_page(sg_page(&sgout[pages]));
771
772 kfree(mem);
773 return err;
774}
775
776static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
777 struct iov_iter *dest, int *chunk, bool *zc)
778{
779 struct tls_context *tls_ctx = tls_get_ctx(sk);
780 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
781 struct strp_msg *rxm = strp_msg(skb);
782 int err = 0;
783
784#ifdef CONFIG_TLS_DEVICE
785 err = tls_device_decrypted(sk, skb);
786 if (err < 0)
787 return err;
788#endif
789 if (!ctx->decrypted) {
790 err = decrypt_internal(sk, skb, dest, NULL, chunk, zc);
791 if (err < 0)
792 return err;
793 } else {
794 *zc = false;
795 }
796
797 rxm->offset += tls_ctx->rx.prepend_size;
798 rxm->full_len -= tls_ctx->rx.overhead_size;
799 tls_advance_record_sn(sk, &tls_ctx->rx);
800 ctx->decrypted = true;
801 ctx->saved_data_ready(sk);
802
803 return err;
804}
805
806int decrypt_skb(struct sock *sk, struct sk_buff *skb,
807 struct scatterlist *sgout)
808{
809 bool zc = true;
810 int chunk;
811
812 return decrypt_internal(sk, skb, NULL, sgout, &chunk, &zc);
719} 813}
720 814
721static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, 815static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
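decrypt_internal(), introduced in the tls_sw.c hunk above, replaces several separate buffers with one kmalloc carved into aead_req || sgin[] || sgout[] || aad || iv, giving a single failure point and a single kfree while keeping the AEAD request and scatterlists properly ordered for alignment. The userspace C sketch below mimics that carving; the struct names and sizes are fabricated purely to show the layout arithmetic, not the real TLS structures.

#include <stdio.h>
#include <stdlib.h>

struct fake_req { char opaque[64]; };
struct fake_sg  { void *ptr; unsigned int len; };

#define AAD_SPACE 13
#define IV_SIZE   12

int main(void)
{
	int n_sgin = 3, n_sgout = 2;
	size_t sz = sizeof(struct fake_req) +
		    (size_t)(n_sgin + n_sgout) * sizeof(struct fake_sg) +
		    AAD_SPACE + IV_SIZE;
	char *mem = malloc(sz);
	struct fake_req *req;
	struct fake_sg *sgin, *sgout;
	char *aad, *iv;

	if (!mem)
		return 1;

	/* Segment the single block exactly like the patch does. */
	req   = (struct fake_req *)mem;
	sgin  = (struct fake_sg *)(mem + sizeof(*req));
	sgout = sgin + n_sgin;
	aad   = (char *)(sgout + n_sgout);
	iv    = aad + AAD_SPACE;

	printf("req=%p sgin=%p sgout=%p aad=%p iv=%p\n",
	       (void *)req, (void *)sgin, (void *)sgout, (void *)aad, (void *)iv);
	free(mem);	/* one allocation, one free */
	return 0;
}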
@@ -756,6 +850,7 @@ int tls_sw_recvmsg(struct sock *sk,
756 bool cmsg = false; 850 bool cmsg = false;
757 int target, err = 0; 851 int target, err = 0;
758 long timeo; 852 long timeo;
853 bool is_kvec = msg->msg_iter.type & ITER_KVEC;
759 854
760 flags |= nonblock; 855 flags |= nonblock;
761 856
@@ -793,43 +888,17 @@ int tls_sw_recvmsg(struct sock *sk,
793 } 888 }
794 889
795 if (!ctx->decrypted) { 890 if (!ctx->decrypted) {
796 int page_count; 891 int to_copy = rxm->full_len - tls_ctx->rx.overhead_size;
797 int to_copy;
798
799 page_count = iov_iter_npages(&msg->msg_iter,
800 MAX_SKB_FRAGS);
801 to_copy = rxm->full_len - tls_ctx->rx.overhead_size;
802 if (to_copy <= len && page_count < MAX_SKB_FRAGS &&
803 likely(!(flags & MSG_PEEK))) {
804 struct scatterlist sgin[MAX_SKB_FRAGS + 1];
805 int pages = 0;
806 892
893 if (!is_kvec && to_copy <= len &&
894 likely(!(flags & MSG_PEEK)))
807 zc = true; 895 zc = true;
808 sg_init_table(sgin, MAX_SKB_FRAGS + 1); 896
809 sg_set_buf(&sgin[0], ctx->rx_aad_plaintext, 897 err = decrypt_skb_update(sk, skb, &msg->msg_iter,
810 TLS_AAD_SPACE_SIZE); 898 &chunk, &zc);
811 899 if (err < 0) {
812 err = zerocopy_from_iter(sk, &msg->msg_iter, 900 tls_err_abort(sk, EBADMSG);
813 to_copy, &pages, 901 goto recv_end;
814 &chunk, &sgin[1],
815 MAX_SKB_FRAGS, false);
816 if (err < 0)
817 goto fallback_to_reg_recv;
818
819 err = decrypt_skb(sk, skb, sgin);
820 for (; pages > 0; pages--)
821 put_page(sg_page(&sgin[pages]));
822 if (err < 0) {
823 tls_err_abort(sk, EBADMSG);
824 goto recv_end;
825 }
826 } else {
827fallback_to_reg_recv:
828 err = decrypt_skb(sk, skb, NULL);
829 if (err < 0) {
830 tls_err_abort(sk, EBADMSG);
831 goto recv_end;
832 }
833 } 902 }
834 ctx->decrypted = true; 903 ctx->decrypted = true;
835 } 904 }
@@ -880,6 +949,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
880 int err = 0; 949 int err = 0;
881 long timeo; 950 long timeo;
882 int chunk; 951 int chunk;
952 bool zc = false;
883 953
884 lock_sock(sk); 954 lock_sock(sk);
885 955
@@ -896,7 +966,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
896 } 966 }
897 967
898 if (!ctx->decrypted) { 968 if (!ctx->decrypted) {
899 err = decrypt_skb(sk, skb, NULL); 969 err = decrypt_skb_update(sk, skb, NULL, &chunk, &zc);
900 970
901 if (err < 0) { 971 if (err < 0) {
902 tls_err_abort(sk, EBADMSG); 972 tls_err_abort(sk, EBADMSG);
@@ -919,29 +989,30 @@ splice_read_end:
919 return copied ? : err; 989 return copied ? : err;
920} 990}
921 991
922__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events) 992unsigned int tls_sw_poll(struct file *file, struct socket *sock,
993 struct poll_table_struct *wait)
923{ 994{
995 unsigned int ret;
924 struct sock *sk = sock->sk; 996 struct sock *sk = sock->sk;
925 struct tls_context *tls_ctx = tls_get_ctx(sk); 997 struct tls_context *tls_ctx = tls_get_ctx(sk);
926 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 998 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
927 __poll_t mask;
928 999
929 /* Grab EPOLLOUT and EPOLLHUP from the underlying socket */ 1000 /* Grab POLLOUT and POLLHUP from the underlying socket */
930 mask = ctx->sk_poll_mask(sock, events); 1001 ret = ctx->sk_poll(file, sock, wait);
931 1002
932 /* Clear EPOLLIN bits, and set based on recv_pkt */ 1003 /* Clear POLLIN bits, and set based on recv_pkt */
933 mask &= ~(EPOLLIN | EPOLLRDNORM); 1004 ret &= ~(POLLIN | POLLRDNORM);
934 if (ctx->recv_pkt) 1005 if (ctx->recv_pkt)
935 mask |= EPOLLIN | EPOLLRDNORM; 1006 ret |= POLLIN | POLLRDNORM;
936 1007
937 return mask; 1008 return ret;
938} 1009}
939 1010
940static int tls_read_size(struct strparser *strp, struct sk_buff *skb) 1011static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
941{ 1012{
942 struct tls_context *tls_ctx = tls_get_ctx(strp->sk); 1013 struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
943 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 1014 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
944 char header[tls_ctx->rx.prepend_size]; 1015 char header[TLS_HEADER_SIZE + MAX_IV_SIZE];
945 struct strp_msg *rxm = strp_msg(skb); 1016 struct strp_msg *rxm = strp_msg(skb);
946 size_t cipher_overhead; 1017 size_t cipher_overhead;
947 size_t data_len = 0; 1018 size_t data_len = 0;
@@ -951,6 +1022,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
951 if (rxm->offset + tls_ctx->rx.prepend_size > skb->len) 1022 if (rxm->offset + tls_ctx->rx.prepend_size > skb->len)
952 return 0; 1023 return 0;
953 1024
1025 /* Sanity-check size of on-stack buffer. */
1026 if (WARN_ON(tls_ctx->rx.prepend_size > sizeof(header))) {
1027 ret = -EINVAL;
1028 goto read_failure;
1029 }
1030
954 /* Linearize header to local buffer */ 1031 /* Linearize header to local buffer */
955 ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size); 1032 ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size);
956 1033
@@ -978,6 +1055,10 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
978 goto read_failure; 1055 goto read_failure;
979 } 1056 }
980 1057
1058#ifdef CONFIG_TLS_DEVICE
1059 handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset,
1060 *(u64*)tls_ctx->rx.rec_seq);
1061#endif
981 return data_len + TLS_HEADER_SIZE; 1062 return data_len + TLS_HEADER_SIZE;
982 1063
983read_failure: 1064read_failure:
@@ -990,16 +1071,13 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
990{ 1071{
991 struct tls_context *tls_ctx = tls_get_ctx(strp->sk); 1072 struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
992 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 1073 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
993 struct strp_msg *rxm;
994
995 rxm = strp_msg(skb);
996 1074
997 ctx->decrypted = false; 1075 ctx->decrypted = false;
998 1076
999 ctx->recv_pkt = skb; 1077 ctx->recv_pkt = skb;
1000 strp_pause(strp); 1078 strp_pause(strp);
1001 1079
1002 strp->sk->sk_state_change(strp->sk); 1080 ctx->saved_data_ready(strp->sk);
1003} 1081}
1004 1082
1005static void tls_data_ready(struct sock *sk) 1083static void tls_data_ready(struct sock *sk)
@@ -1015,23 +1093,20 @@ void tls_sw_free_resources_tx(struct sock *sk)
1015 struct tls_context *tls_ctx = tls_get_ctx(sk); 1093 struct tls_context *tls_ctx = tls_get_ctx(sk);
1016 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); 1094 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
1017 1095
1018 if (ctx->aead_send) 1096 crypto_free_aead(ctx->aead_send);
1019 crypto_free_aead(ctx->aead_send);
1020 tls_free_both_sg(sk); 1097 tls_free_both_sg(sk);
1021 1098
1022 kfree(ctx); 1099 kfree(ctx);
1023} 1100}
1024 1101
1025void tls_sw_free_resources_rx(struct sock *sk) 1102void tls_sw_release_resources_rx(struct sock *sk)
1026{ 1103{
1027 struct tls_context *tls_ctx = tls_get_ctx(sk); 1104 struct tls_context *tls_ctx = tls_get_ctx(sk);
1028 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); 1105 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
1029 1106
1030 if (ctx->aead_recv) { 1107 if (ctx->aead_recv) {
1031 if (ctx->recv_pkt) { 1108 kfree_skb(ctx->recv_pkt);
1032 kfree_skb(ctx->recv_pkt); 1109 ctx->recv_pkt = NULL;
1033 ctx->recv_pkt = NULL;
1034 }
1035 crypto_free_aead(ctx->aead_recv); 1110 crypto_free_aead(ctx->aead_recv);
1036 strp_stop(&ctx->strp); 1111 strp_stop(&ctx->strp);
1037 write_lock_bh(&sk->sk_callback_lock); 1112 write_lock_bh(&sk->sk_callback_lock);
@@ -1041,6 +1116,14 @@ void tls_sw_free_resources_rx(struct sock *sk)
1041 strp_done(&ctx->strp); 1116 strp_done(&ctx->strp);
1042 lock_sock(sk); 1117 lock_sock(sk);
1043 } 1118 }
1119}
1120
1121void tls_sw_free_resources_rx(struct sock *sk)
1122{
1123 struct tls_context *tls_ctx = tls_get_ctx(sk);
1124 struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
1125
1126 tls_sw_release_resources_rx(sk);
1044 1127
1045 kfree(ctx); 1128 kfree(ctx);
1046} 1129}
@@ -1065,28 +1148,38 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1065 } 1148 }
1066 1149
1067 if (tx) { 1150 if (tx) {
1068 sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); 1151 if (!ctx->priv_ctx_tx) {
1069 if (!sw_ctx_tx) { 1152 sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
1070 rc = -ENOMEM; 1153 if (!sw_ctx_tx) {
1071 goto out; 1154 rc = -ENOMEM;
1155 goto out;
1156 }
1157 ctx->priv_ctx_tx = sw_ctx_tx;
1158 } else {
1159 sw_ctx_tx =
1160 (struct tls_sw_context_tx *)ctx->priv_ctx_tx;
1072 } 1161 }
1073 crypto_init_wait(&sw_ctx_tx->async_wait);
1074 ctx->priv_ctx_tx = sw_ctx_tx;
1075 } else { 1162 } else {
1076 sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); 1163 if (!ctx->priv_ctx_rx) {
1077 if (!sw_ctx_rx) { 1164 sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
1078 rc = -ENOMEM; 1165 if (!sw_ctx_rx) {
1079 goto out; 1166 rc = -ENOMEM;
1167 goto out;
1168 }
1169 ctx->priv_ctx_rx = sw_ctx_rx;
1170 } else {
1171 sw_ctx_rx =
1172 (struct tls_sw_context_rx *)ctx->priv_ctx_rx;
1080 } 1173 }
1081 crypto_init_wait(&sw_ctx_rx->async_wait);
1082 ctx->priv_ctx_rx = sw_ctx_rx;
1083 } 1174 }
1084 1175
1085 if (tx) { 1176 if (tx) {
1177 crypto_init_wait(&sw_ctx_tx->async_wait);
1086 crypto_info = &ctx->crypto_send; 1178 crypto_info = &ctx->crypto_send;
1087 cctx = &ctx->tx; 1179 cctx = &ctx->tx;
1088 aead = &sw_ctx_tx->aead_send; 1180 aead = &sw_ctx_tx->aead_send;
1089 } else { 1181 } else {
1182 crypto_init_wait(&sw_ctx_rx->async_wait);
1090 crypto_info = &ctx->crypto_recv; 1183 crypto_info = &ctx->crypto_recv;
1091 cctx = &ctx->rx; 1184 cctx = &ctx->rx;
1092 aead = &sw_ctx_rx->aead_recv; 1185 aead = &sw_ctx_rx->aead_recv;
@@ -1111,7 +1204,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1111 } 1204 }
1112 1205
1113 /* Sanity-check the IV size for stack allocations. */ 1206 /* Sanity-check the IV size for stack allocations. */
1114 if (iv_size > MAX_IV_SIZE) { 1207 if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) {
1115 rc = -EINVAL; 1208 rc = -EINVAL;
1116 goto free_priv; 1209 goto free_priv;
1117 } 1210 }
@@ -1129,12 +1222,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1129 memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); 1222 memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
1130 memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); 1223 memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
1131 cctx->rec_seq_size = rec_seq_size; 1224 cctx->rec_seq_size = rec_seq_size;
1132 cctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); 1225 cctx->rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL);
1133 if (!cctx->rec_seq) { 1226 if (!cctx->rec_seq) {
1134 rc = -ENOMEM; 1227 rc = -ENOMEM;
1135 goto free_iv; 1228 goto free_iv;
1136 } 1229 }
1137 memcpy(cctx->rec_seq, rec_seq, rec_seq_size);
1138 1230
1139 if (sw_ctx_tx) { 1231 if (sw_ctx_tx) {
1140 sg_init_table(sw_ctx_tx->sg_encrypted_data, 1232 sg_init_table(sw_ctx_tx->sg_encrypted_data,
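
The kmemdup() conversion above folds the allocation and the copy into one call, leaving a single failure point instead of an allocation whose follow-up memcpy() can drift away in later edits. A plain-C sketch of the idiom; memdup here is a stand-in helper, not a library function:

#include <stdlib.h>
#include <string.h>

static void *memdup(const void *src, size_t len)
{
        void *p = malloc(len);

        if (p)
                memcpy(p, src, len);   /* copy only when the allocation succeeded */
        return p;
}

int main(void)
{
        const char seq[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        char *copy = memdup(seq, sizeof(seq));

        free(copy);
        return 0;
}
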
@@ -1191,7 +1283,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
1191 sk->sk_data_ready = tls_data_ready; 1283 sk->sk_data_ready = tls_data_ready;
1192 write_unlock_bh(&sk->sk_callback_lock); 1284 write_unlock_bh(&sk->sk_callback_lock);
1193 1285
1194 sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask; 1286 sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
1195 1287
1196 strp_check_rcv(&sw_ctx_rx->strp); 1288 strp_check_rcv(&sw_ctx_rx->strp);
1197 } 1289 }
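
As part of the tree-wide move back from ->poll_mask() to the classic ->poll() visible throughout this merge, tls_sw_poll() again takes the file and poll_table and starts from the underlying TCP socket's result, then corrects the readability bits, since only the TLS layer knows whether a decoded record is actually queued. The bit manipulation can be shown with the userspace poll constants; fixup_poll_mask() and have_record are illustrative names:

#define _XOPEN_SOURCE 700
#include <poll.h>
#include <stdbool.h>
#include <stdio.h>

static int fixup_poll_mask(int lower_mask, bool have_record)
{
        int mask = lower_mask;

        mask &= ~(POLLIN | POLLRDNORM);        /* the transport's view is stale */
        if (have_record)
                mask |= POLLIN | POLLRDNORM;   /* a record is ready to deliver */
        return mask;
}

int main(void)
{
        int lower = POLLOUT | POLLIN;          /* TCP says "readable" */

        printf("%d\n", fixup_poll_mask(lower, false) & POLLIN);  /* 0: must wait */
        printf("%d\n", fixup_poll_mask(lower, true) & POLLIN);   /* nonzero */
        return 0;
}
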
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 95b02a71fd47..d1edfa3cad61 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -430,7 +430,12 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
430 430
431 connected = unix_dgram_peer_wake_connect(sk, other); 431 connected = unix_dgram_peer_wake_connect(sk, other);
432 432
433 if (unix_recvq_full(other)) 433 /* If other is SOCK_DEAD, we want to make sure we signal
434 * POLLOUT, such that a subsequent write() can get a
435 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 436 * to other and it's full, we will hang waiting for POLLOUT.
437 */
438 if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
434 return 1; 439 return 1;
435 440
436 if (connected) 441 if (connected)
@@ -638,8 +643,9 @@ static int unix_stream_connect(struct socket *, struct sockaddr *,
638static int unix_socketpair(struct socket *, struct socket *); 643static int unix_socketpair(struct socket *, struct socket *);
639static int unix_accept(struct socket *, struct socket *, int, bool); 644static int unix_accept(struct socket *, struct socket *, int, bool);
640static int unix_getname(struct socket *, struct sockaddr *, int); 645static int unix_getname(struct socket *, struct sockaddr *, int);
641static __poll_t unix_poll_mask(struct socket *, __poll_t); 646static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
642static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t); 647static __poll_t unix_dgram_poll(struct file *, struct socket *,
648 poll_table *);
643static int unix_ioctl(struct socket *, unsigned int, unsigned long); 649static int unix_ioctl(struct socket *, unsigned int, unsigned long);
644static int unix_shutdown(struct socket *, int); 650static int unix_shutdown(struct socket *, int);
645static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); 651static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -680,7 +686,7 @@ static const struct proto_ops unix_stream_ops = {
680 .socketpair = unix_socketpair, 686 .socketpair = unix_socketpair,
681 .accept = unix_accept, 687 .accept = unix_accept,
682 .getname = unix_getname, 688 .getname = unix_getname,
683 .poll_mask = unix_poll_mask, 689 .poll = unix_poll,
684 .ioctl = unix_ioctl, 690 .ioctl = unix_ioctl,
685 .listen = unix_listen, 691 .listen = unix_listen,
686 .shutdown = unix_shutdown, 692 .shutdown = unix_shutdown,
@@ -703,7 +709,7 @@ static const struct proto_ops unix_dgram_ops = {
703 .socketpair = unix_socketpair, 709 .socketpair = unix_socketpair,
704 .accept = sock_no_accept, 710 .accept = sock_no_accept,
705 .getname = unix_getname, 711 .getname = unix_getname,
706 .poll_mask = unix_dgram_poll_mask, 712 .poll = unix_dgram_poll,
707 .ioctl = unix_ioctl, 713 .ioctl = unix_ioctl,
708 .listen = sock_no_listen, 714 .listen = sock_no_listen,
709 .shutdown = unix_shutdown, 715 .shutdown = unix_shutdown,
@@ -725,7 +731,7 @@ static const struct proto_ops unix_seqpacket_ops = {
725 .socketpair = unix_socketpair, 731 .socketpair = unix_socketpair,
726 .accept = unix_accept, 732 .accept = unix_accept,
727 .getname = unix_getname, 733 .getname = unix_getname,
728 .poll_mask = unix_dgram_poll_mask, 734 .poll = unix_dgram_poll,
729 .ioctl = unix_ioctl, 735 .ioctl = unix_ioctl,
730 .listen = unix_listen, 736 .listen = unix_listen,
731 .shutdown = unix_shutdown, 737 .shutdown = unix_shutdown,
@@ -2629,10 +2635,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2629 return err; 2635 return err;
2630} 2636}
2631 2637
2632static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) 2638static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2633{ 2639{
2634 struct sock *sk = sock->sk; 2640 struct sock *sk = sock->sk;
2635 __poll_t mask = 0; 2641 __poll_t mask;
2642
2643 sock_poll_wait(file, wait);
2644 mask = 0;
2636 2645
2637 /* exceptional events? */ 2646 /* exceptional events? */
2638 if (sk->sk_err) 2647 if (sk->sk_err)
@@ -2661,11 +2670,15 @@ static __poll_t unix_poll_mask(struct socket *sock, __poll_t events)
2661 return mask; 2670 return mask;
2662} 2671}
2663 2672
2664static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) 2673static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
2674 poll_table *wait)
2665{ 2675{
2666 struct sock *sk = sock->sk, *other; 2676 struct sock *sk = sock->sk, *other;
2667 int writable; 2677 unsigned int writable;
2668 __poll_t mask = 0; 2678 __poll_t mask;
2679
2680 sock_poll_wait(file, wait);
2681 mask = 0;
2669 2682
2670 /* exceptional events? */ 2683 /* exceptional events? */
2671 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) 2684 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
@@ -2691,7 +2704,7 @@ static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events)
2691 } 2704 }
2692 2705
2693 /* No write status requested, avoid expensive OUT tests. */ 2706 /* No write status requested, avoid expensive OUT tests. */
2694 if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 2707 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
2695 return mask; 2708 return mask;
2696 2709
2697 writable = unix_writable(sk); 2710 writable = unix_writable(sk);
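
unix_dgram_poll() keeps its existing optimisation: the comparatively expensive peer-writability probe only runs when the caller actually asked for an OUT bit, and the hunk above simply reads the requested events from the poll_table via poll_requested_events() instead of a separate argument. A plain-C sketch of that short-circuit, with expensive_writable_check() as a stand-in for the peer-queue probe:

#define _XOPEN_SOURCE 700
#include <poll.h>
#include <stdio.h>

static int expensive_writable_check(void)
{
        return 1;                              /* stand-in for the real probe */
}

static int dgram_poll_mask(int requested, int mask)
{
        if (!(requested & (POLLOUT | POLLWRNORM | POLLWRBAND)))
                return mask;                   /* caller never asked about writes */
        if (expensive_writable_check())
                mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
        return mask;
}

int main(void)
{
        printf("%#x\n", (unsigned)dgram_poll_mask(POLLIN, 0));   /* probe skipped */
        printf("%#x\n", (unsigned)dgram_poll_mask(POLLOUT, 0));
        return 0;
}
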
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index bb5d5fa68c35..ab27a2872935 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -451,14 +451,14 @@ static int vsock_send_shutdown(struct sock *sk, int mode)
451 return transport->shutdown(vsock_sk(sk), mode); 451 return transport->shutdown(vsock_sk(sk), mode);
452} 452}
453 453
454void vsock_pending_work(struct work_struct *work) 454static void vsock_pending_work(struct work_struct *work)
455{ 455{
456 struct sock *sk; 456 struct sock *sk;
457 struct sock *listener; 457 struct sock *listener;
458 struct vsock_sock *vsk; 458 struct vsock_sock *vsk;
459 bool cleanup; 459 bool cleanup;
460 460
461 vsk = container_of(work, struct vsock_sock, dwork.work); 461 vsk = container_of(work, struct vsock_sock, pending_work.work);
462 sk = sk_vsock(vsk); 462 sk = sk_vsock(vsk);
463 listener = vsk->listener; 463 listener = vsk->listener;
464 cleanup = true; 464 cleanup = true;
@@ -498,7 +498,6 @@ out:
498 sock_put(sk); 498 sock_put(sk);
499 sock_put(listener); 499 sock_put(listener);
500} 500}
501EXPORT_SYMBOL_GPL(vsock_pending_work);
502 501
503/**** SOCKET OPERATIONS ****/ 502/**** SOCKET OPERATIONS ****/
504 503
@@ -597,6 +596,8 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
597 return retval; 596 return retval;
598} 597}
599 598
599static void vsock_connect_timeout(struct work_struct *work);
600
600struct sock *__vsock_create(struct net *net, 601struct sock *__vsock_create(struct net *net,
601 struct socket *sock, 602 struct socket *sock,
602 struct sock *parent, 603 struct sock *parent,
@@ -638,6 +639,8 @@ struct sock *__vsock_create(struct net *net,
638 vsk->sent_request = false; 639 vsk->sent_request = false;
639 vsk->ignore_connecting_rst = false; 640 vsk->ignore_connecting_rst = false;
640 vsk->peer_shutdown = 0; 641 vsk->peer_shutdown = 0;
642 INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout);
643 INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work);
641 644
642 psk = parent ? vsock_sk(parent) : NULL; 645 psk = parent ? vsock_sk(parent) : NULL;
643 if (parent) { 646 if (parent) {
@@ -850,11 +853,18 @@ static int vsock_shutdown(struct socket *sock, int mode)
850 return err; 853 return err;
851} 854}
852 855
853static __poll_t vsock_poll_mask(struct socket *sock, __poll_t events) 856static __poll_t vsock_poll(struct file *file, struct socket *sock,
857 poll_table *wait)
854{ 858{
855 struct sock *sk = sock->sk; 859 struct sock *sk;
856 struct vsock_sock *vsk = vsock_sk(sk); 860 __poll_t mask;
857 __poll_t mask = 0; 861 struct vsock_sock *vsk;
862
863 sk = sock->sk;
864 vsk = vsock_sk(sk);
865
866 poll_wait(file, sk_sleep(sk), wait);
867 mask = 0;
858 868
859 if (sk->sk_err) 869 if (sk->sk_err)
860 /* Signify that there has been an error on this socket. */ 870 /* Signify that there has been an error on this socket. */
@@ -1084,7 +1094,7 @@ static const struct proto_ops vsock_dgram_ops = {
1084 .socketpair = sock_no_socketpair, 1094 .socketpair = sock_no_socketpair,
1085 .accept = sock_no_accept, 1095 .accept = sock_no_accept,
1086 .getname = vsock_getname, 1096 .getname = vsock_getname,
1087 .poll_mask = vsock_poll_mask, 1097 .poll = vsock_poll,
1088 .ioctl = sock_no_ioctl, 1098 .ioctl = sock_no_ioctl,
1089 .listen = sock_no_listen, 1099 .listen = sock_no_listen,
1090 .shutdown = vsock_shutdown, 1100 .shutdown = vsock_shutdown,
@@ -1110,7 +1120,7 @@ static void vsock_connect_timeout(struct work_struct *work)
1110 struct vsock_sock *vsk; 1120 struct vsock_sock *vsk;
1111 int cancel = 0; 1121 int cancel = 0;
1112 1122
1113 vsk = container_of(work, struct vsock_sock, dwork.work); 1123 vsk = container_of(work, struct vsock_sock, connect_work.work);
1114 sk = sk_vsock(vsk); 1124 sk = sk_vsock(vsk);
1115 1125
1116 lock_sock(sk); 1126 lock_sock(sk);
@@ -1214,9 +1224,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
1214 * timeout fires. 1224 * timeout fires.
1215 */ 1225 */
1216 sock_hold(sk); 1226 sock_hold(sk);
1217 INIT_DELAYED_WORK(&vsk->dwork, 1227 schedule_delayed_work(&vsk->connect_work, timeout);
1218 vsock_connect_timeout);
1219 schedule_delayed_work(&vsk->dwork, timeout);
1220 1228
1221 /* Skip ahead to preserve error code set above. */ 1229 /* Skip ahead to preserve error code set above. */
1222 goto out_wait; 1230 goto out_wait;
@@ -1842,7 +1850,7 @@ static const struct proto_ops vsock_stream_ops = {
1842 .socketpair = sock_no_socketpair, 1850 .socketpair = sock_no_socketpair,
1843 .accept = vsock_accept, 1851 .accept = vsock_accept,
1844 .getname = vsock_getname, 1852 .getname = vsock_getname,
1845 .poll_mask = vsock_poll_mask, 1853 .poll = vsock_poll,
1846 .ioctl = sock_no_ioctl, 1854 .ioctl = sock_no_ioctl,
1847 .listen = vsock_listen, 1855 .listen = vsock_listen,
1848 .shutdown = vsock_shutdown, 1856 .shutdown = vsock_shutdown,
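
The vsock changes split the single shared vsk->dwork into dedicated connect_work and pending_work members and initialise both once in __vsock_create(), so later call sites only ever schedule them; re-running INIT_DELAYED_WORK() on a member that may already be queued, as the old code did at connect and listen time, is what this avoids. A kernel-style sketch of the pattern (struct my_sock and the handlers are illustrative, and this is a sketch rather than a buildable module):

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct my_sock {
        struct delayed_work connect_work;
        struct delayed_work pending_work;
};

static void my_connect_timeout(struct work_struct *work)
{
        struct my_sock *s = container_of(work, struct my_sock,
                                         connect_work.work);
        /* handle the connect timeout for s */
        (void)s;
}

static void my_pending_work(struct work_struct *work)
{
        struct my_sock *s = container_of(work, struct my_sock,
                                         pending_work.work);
        /* handle the pending-connection timeout for s */
        (void)s;
}

static void my_sock_init(struct my_sock *s)
{
        /* initialise once, at object creation */
        INIT_DELAYED_WORK(&s->connect_work, my_connect_timeout);
        INIT_DELAYED_WORK(&s->pending_work, my_pending_work);
        /* callers later do schedule_delayed_work(&s->connect_work, timeout); */
}
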
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 8e03bd3f3668..5d3cce9e8744 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -201,7 +201,7 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
201 return -ENODEV; 201 return -ENODEV;
202 } 202 }
203 203
204 if (le32_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) 204 if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid)
205 return virtio_transport_send_pkt_loopback(vsock, pkt); 205 return virtio_transport_send_pkt_loopback(vsock, pkt);
206 206
207 if (pkt->reply) 207 if (pkt->reply)
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index a7a73ffe675b..cb332adb84cd 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1094,8 +1094,7 @@ static int vmci_transport_recv_listen(struct sock *sk,
1094 vpending->listener = sk; 1094 vpending->listener = sk;
1095 sock_hold(sk); 1095 sock_hold(sk);
1096 sock_hold(pending); 1096 sock_hold(pending);
1097 INIT_DELAYED_WORK(&vpending->dwork, vsock_pending_work); 1097 schedule_delayed_work(&vpending->pending_work, HZ);
1098 schedule_delayed_work(&vpending->dwork, HZ);
1099 1098
1100out: 1099out:
1101 return err; 1100 return err;
diff --git a/net/wimax/Makefile b/net/wimax/Makefile
index eb2db0d3b880..c2a71ae487ac 100644
--- a/net/wimax/Makefile
+++ b/net/wimax/Makefile
@@ -11,5 +11,3 @@ wimax-y := \
11 stack.o 11 stack.o
12 12
13wimax-$(CONFIG_DEBUG_FS) += debugfs.o 13wimax-$(CONFIG_DEBUG_FS) += debugfs.o
14
15
diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c
index 6c9bedb7431e..24514840746e 100644
--- a/net/wimax/debugfs.c
+++ b/net/wimax/debugfs.c
@@ -76,5 +76,3 @@ void wimax_debugfs_rm(struct wimax_dev *wimax_dev)
76{ 76{
77 debugfs_remove_recursive(wimax_dev->debugfs_dentry); 77 debugfs_remove_recursive(wimax_dev->debugfs_dentry);
78} 78}
79
80
diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c
index 54aa146930bd..101b2fa3f32e 100644
--- a/net/wimax/op-msg.c
+++ b/net/wimax/op-msg.c
@@ -404,4 +404,3 @@ error_no_wimax_dev:
404 d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); 404 d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
405 return result; 405 return result;
406} 406}
407
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 5db731512014..a6307813b6d5 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -486,7 +486,8 @@ int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev)
486 d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev); 486 d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev);
487 487
488 /* Do the RFKILL setup before locking, as RFKILL will call 488 /* Do the RFKILL setup before locking, as RFKILL will call
489 * into our functions. */ 489 * into our functions.
490 */
490 wimax_dev->net_dev = net_dev; 491 wimax_dev->net_dev = net_dev;
491 result = wimax_rfkill_add(wimax_dev); 492 result = wimax_rfkill_add(wimax_dev);
492 if (result < 0) 493 if (result < 0)
@@ -629,4 +630,3 @@ module_exit(wimax_subsys_exit);
629MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>"); 630MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>");
630MODULE_DESCRIPTION("Linux WiMAX stack"); 631MODULE_DESCRIPTION("Linux WiMAX stack");
631MODULE_LICENSE("GPL"); 632MODULE_LICENSE("GPL");
632
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 48e8097339ab..a88551f3bc43 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> 4 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
5 * Copyright 2013-2014 Intel Mobile Communications GmbH 5 * Copyright 2013-2014 Intel Mobile Communications GmbH
6 * Copyright 2015 Intel Deutschland GmbH 6 * Copyright 2015-2017 Intel Deutschland GmbH
7 */ 7 */
8 8
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -744,6 +744,8 @@ int wiphy_register(struct wiphy *wiphy)
744 744
745 /* sanity check supported bands/channels */ 745 /* sanity check supported bands/channels */
746 for (band = 0; band < NUM_NL80211_BANDS; band++) { 746 for (band = 0; band < NUM_NL80211_BANDS; band++) {
747 u16 types = 0;
748
747 sband = wiphy->bands[band]; 749 sband = wiphy->bands[band];
748 if (!sband) 750 if (!sband)
749 continue; 751 continue;
@@ -788,6 +790,23 @@ int wiphy_register(struct wiphy *wiphy)
788 sband->channels[i].band = band; 790 sband->channels[i].band = band;
789 } 791 }
790 792
793 for (i = 0; i < sband->n_iftype_data; i++) {
794 const struct ieee80211_sband_iftype_data *iftd;
795
796 iftd = &sband->iftype_data[i];
797
798 if (WARN_ON(!iftd->types_mask))
799 return -EINVAL;
800 if (WARN_ON(types & iftd->types_mask))
801 return -EINVAL;
802
803 /* at least one piece of information must be present */
804 if (WARN_ON(!iftd->he_cap.has_he))
805 return -EINVAL;
806
807 types |= iftd->types_mask;
808 }
809
791 have_band = true; 810 have_band = true;
792 } 811 }
793 812
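
wiphy_register() now sanity-checks the per-band iftype_data array: every entry must carry a non-empty types_mask, no interface type may appear in two entries, and each entry must at least advertise HE capability. The first two conditions are a plain bitmask-overlap check, sketched below in userspace C with made-up mask values:

#include <stdint.h>
#include <stdio.h>

static int check_masks(const uint16_t *masks, int n)
{
        uint16_t seen = 0;
        int i;

        for (i = 0; i < n; i++) {
                if (!masks[i])                 /* empty entry: invalid */
                        return -1;
                if (seen & masks[i])           /* overlaps an earlier entry */
                        return -1;
                seen |= masks[i];
        }
        return 0;
}

int main(void)
{
        uint16_t ok[]  = { 0x0004, 0x0008 };   /* disjoint interface types */
        uint16_t bad[] = { 0x000c, 0x0004 };   /* 0x0004 listed twice */

        printf("%d %d\n", check_masks(ok, 2), check_masks(bad, 2));
        return 0;
}
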
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 63eb1b5fdd04..7f52ef569320 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -76,7 +76,7 @@ struct cfg80211_registered_device {
76 struct cfg80211_scan_request *scan_req; /* protected by RTNL */ 76 struct cfg80211_scan_request *scan_req; /* protected by RTNL */
77 struct sk_buff *scan_msg; 77 struct sk_buff *scan_msg;
78 struct list_head sched_scan_req_list; 78 struct list_head sched_scan_req_list;
79 unsigned long suspend_at; 79 time64_t suspend_at;
80 struct work_struct scan_done_wk; 80 struct work_struct scan_done_wk;
81 81
82 struct genl_info *cur_cmd_info; 82 struct genl_info *cur_cmd_info;
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index ba0a1f398ce5..e6bce1f130c9 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -65,9 +65,9 @@ struct lib80211_tkip_data {
65 int key_idx; 65 int key_idx;
66 66
67 struct crypto_skcipher *rx_tfm_arc4; 67 struct crypto_skcipher *rx_tfm_arc4;
68 struct crypto_ahash *rx_tfm_michael; 68 struct crypto_shash *rx_tfm_michael;
69 struct crypto_skcipher *tx_tfm_arc4; 69 struct crypto_skcipher *tx_tfm_arc4;
70 struct crypto_ahash *tx_tfm_michael; 70 struct crypto_shash *tx_tfm_michael;
71 71
72 /* scratch buffers for virt_to_page() (crypto API) */ 72 /* scratch buffers for virt_to_page() (crypto API) */
73 u8 rx_hdr[16], tx_hdr[16]; 73 u8 rx_hdr[16], tx_hdr[16];
@@ -106,8 +106,7 @@ static void *lib80211_tkip_init(int key_idx)
106 goto fail; 106 goto fail;
107 } 107 }
108 108
109 priv->tx_tfm_michael = crypto_alloc_ahash("michael_mic", 0, 109 priv->tx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0);
110 CRYPTO_ALG_ASYNC);
111 if (IS_ERR(priv->tx_tfm_michael)) { 110 if (IS_ERR(priv->tx_tfm_michael)) {
112 priv->tx_tfm_michael = NULL; 111 priv->tx_tfm_michael = NULL;
113 goto fail; 112 goto fail;
@@ -120,8 +119,7 @@ static void *lib80211_tkip_init(int key_idx)
120 goto fail; 119 goto fail;
121 } 120 }
122 121
123 priv->rx_tfm_michael = crypto_alloc_ahash("michael_mic", 0, 122 priv->rx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0);
124 CRYPTO_ALG_ASYNC);
125 if (IS_ERR(priv->rx_tfm_michael)) { 123 if (IS_ERR(priv->rx_tfm_michael)) {
126 priv->rx_tfm_michael = NULL; 124 priv->rx_tfm_michael = NULL;
127 goto fail; 125 goto fail;
@@ -131,9 +129,9 @@ static void *lib80211_tkip_init(int key_idx)
131 129
132 fail: 130 fail:
133 if (priv) { 131 if (priv) {
134 crypto_free_ahash(priv->tx_tfm_michael); 132 crypto_free_shash(priv->tx_tfm_michael);
135 crypto_free_skcipher(priv->tx_tfm_arc4); 133 crypto_free_skcipher(priv->tx_tfm_arc4);
136 crypto_free_ahash(priv->rx_tfm_michael); 134 crypto_free_shash(priv->rx_tfm_michael);
137 crypto_free_skcipher(priv->rx_tfm_arc4); 135 crypto_free_skcipher(priv->rx_tfm_arc4);
138 kfree(priv); 136 kfree(priv);
139 } 137 }
@@ -145,9 +143,9 @@ static void lib80211_tkip_deinit(void *priv)
145{ 143{
146 struct lib80211_tkip_data *_priv = priv; 144 struct lib80211_tkip_data *_priv = priv;
147 if (_priv) { 145 if (_priv) {
148 crypto_free_ahash(_priv->tx_tfm_michael); 146 crypto_free_shash(_priv->tx_tfm_michael);
149 crypto_free_skcipher(_priv->tx_tfm_arc4); 147 crypto_free_skcipher(_priv->tx_tfm_arc4);
150 crypto_free_ahash(_priv->rx_tfm_michael); 148 crypto_free_shash(_priv->rx_tfm_michael);
151 crypto_free_skcipher(_priv->rx_tfm_arc4); 149 crypto_free_skcipher(_priv->rx_tfm_arc4);
152 } 150 }
153 kfree(priv); 151 kfree(priv);
@@ -510,29 +508,36 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
510 return keyidx; 508 return keyidx;
511} 509}
512 510
513static int michael_mic(struct crypto_ahash *tfm_michael, u8 * key, u8 * hdr, 511static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr,
514 u8 * data, size_t data_len, u8 * mic) 512 u8 *data, size_t data_len, u8 *mic)
515{ 513{
516 AHASH_REQUEST_ON_STACK(req, tfm_michael); 514 SHASH_DESC_ON_STACK(desc, tfm_michael);
517 struct scatterlist sg[2];
518 int err; 515 int err;
519 516
520 if (tfm_michael == NULL) { 517 if (tfm_michael == NULL) {
521 pr_warn("%s(): tfm_michael == NULL\n", __func__); 518 pr_warn("%s(): tfm_michael == NULL\n", __func__);
522 return -1; 519 return -1;
523 } 520 }
524 sg_init_table(sg, 2);
525 sg_set_buf(&sg[0], hdr, 16);
526 sg_set_buf(&sg[1], data, data_len);
527 521
528 if (crypto_ahash_setkey(tfm_michael, key, 8)) 522 desc->tfm = tfm_michael;
523 desc->flags = 0;
524
525 if (crypto_shash_setkey(tfm_michael, key, 8))
529 return -1; 526 return -1;
530 527
531 ahash_request_set_tfm(req, tfm_michael); 528 err = crypto_shash_init(desc);
532 ahash_request_set_callback(req, 0, NULL, NULL); 529 if (err)
533 ahash_request_set_crypt(req, sg, mic, data_len + 16); 530 goto out;
534 err = crypto_ahash_digest(req); 531 err = crypto_shash_update(desc, hdr, 16);
535 ahash_request_zero(req); 532 if (err)
533 goto out;
534 err = crypto_shash_update(desc, data, data_len);
535 if (err)
536 goto out;
537 err = crypto_shash_final(desc, mic);
538
539out:
540 shash_desc_zero(desc);
536 return err; 541 return err;
537} 542}
538 543
@@ -654,9 +659,9 @@ static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv)
654{ 659{
655 struct lib80211_tkip_data *tkey = priv; 660 struct lib80211_tkip_data *tkey = priv;
656 int keyidx; 661 int keyidx;
657 struct crypto_ahash *tfm = tkey->tx_tfm_michael; 662 struct crypto_shash *tfm = tkey->tx_tfm_michael;
658 struct crypto_skcipher *tfm2 = tkey->tx_tfm_arc4; 663 struct crypto_skcipher *tfm2 = tkey->tx_tfm_arc4;
659 struct crypto_ahash *tfm3 = tkey->rx_tfm_michael; 664 struct crypto_shash *tfm3 = tkey->rx_tfm_michael;
660 struct crypto_skcipher *tfm4 = tkey->rx_tfm_arc4; 665 struct crypto_skcipher *tfm4 = tkey->rx_tfm_arc4;
661 666
662 keyidx = tkey->key_idx; 667 keyidx = tkey->key_idx;
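
Michael MIC runs over a 16-byte pseudo-header plus the payload, both already linear in memory, so the synchronous shash interface with an on-stack descriptor fits better than ahash: no scatterlist setup, no request allocation, no asynchronous completion to wait for. The init/update/update/final flow the new michael_mic() uses can be mimicked in plain C; the toy checksum below only stands in for the streaming shape, not for the real algorithm:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct toy_hash { uint32_t state; };

static void toy_init(struct toy_hash *h)
{
        h->state = 0;
}

static void toy_update(struct toy_hash *h, const uint8_t *p, size_t n)
{
        while (n--)
                h->state = h->state * 31 + *p++;
}

static uint32_t toy_final(struct toy_hash *h)
{
        return h->state;
}

int main(void)
{
        uint8_t hdr[16] = { 1, 2, 3 };
        uint8_t data[8] = { 9, 9 };
        struct toy_hash h;

        toy_init(&h);
        toy_update(&h, hdr, sizeof(hdr));      /* pseudo-header first */
        toy_update(&h, data, sizeof(data));    /* then the payload, no copy needed */
        printf("%08x\n", (unsigned)toy_final(&h));
        return 0;
}
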
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c7bbe5f0aae8..5fb9b7dd9831 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -428,6 +428,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
428 [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 }, 428 [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
429 [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 }, 429 [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
430 [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, 430 [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
431 [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY,
432 .len = NL80211_HE_MAX_CAPABILITY_LEN },
431}; 433};
432 434
433/* policy for the key attributes */ 435/* policy for the key attributes */
@@ -1324,6 +1326,34 @@ static int nl80211_send_coalesce(struct sk_buff *msg,
1324 return 0; 1326 return 0;
1325} 1327}
1326 1328
1329static int
1330nl80211_send_iftype_data(struct sk_buff *msg,
1331 const struct ieee80211_sband_iftype_data *iftdata)
1332{
1333 const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap;
1334
1335 if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES,
1336 iftdata->types_mask))
1337 return -ENOBUFS;
1338
1339 if (he_cap->has_he) {
1340 if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC,
1341 sizeof(he_cap->he_cap_elem.mac_cap_info),
1342 he_cap->he_cap_elem.mac_cap_info) ||
1343 nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY,
1344 sizeof(he_cap->he_cap_elem.phy_cap_info),
1345 he_cap->he_cap_elem.phy_cap_info) ||
1346 nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET,
1347 sizeof(he_cap->he_mcs_nss_supp),
1348 &he_cap->he_mcs_nss_supp) ||
1349 nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE,
1350 sizeof(he_cap->ppe_thres), he_cap->ppe_thres))
1351 return -ENOBUFS;
1352 }
1353
1354 return 0;
1355}
1356
1327static int nl80211_send_band_rateinfo(struct sk_buff *msg, 1357static int nl80211_send_band_rateinfo(struct sk_buff *msg,
1328 struct ieee80211_supported_band *sband) 1358 struct ieee80211_supported_band *sband)
1329{ 1359{
@@ -1353,6 +1383,32 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg,
1353 sband->vht_cap.cap))) 1383 sband->vht_cap.cap)))
1354 return -ENOBUFS; 1384 return -ENOBUFS;
1355 1385
1386 if (sband->n_iftype_data) {
1387 struct nlattr *nl_iftype_data =
1388 nla_nest_start(msg, NL80211_BAND_ATTR_IFTYPE_DATA);
1389 int err;
1390
1391 if (!nl_iftype_data)
1392 return -ENOBUFS;
1393
1394 for (i = 0; i < sband->n_iftype_data; i++) {
1395 struct nlattr *iftdata;
1396
1397 iftdata = nla_nest_start(msg, i + 1);
1398 if (!iftdata)
1399 return -ENOBUFS;
1400
1401 err = nl80211_send_iftype_data(msg,
1402 &sband->iftype_data[i]);
1403 if (err)
1404 return err;
1405
1406 nla_nest_end(msg, iftdata);
1407 }
1408
1409 nla_nest_end(msg, nl_iftype_data);
1410 }
1411
1356 /* add bitrates */ 1412 /* add bitrates */
1357 nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES); 1413 nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES);
1358 if (!nl_rates) 1414 if (!nl_rates)
@@ -2757,7 +2813,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
2757 nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, wdev_address(wdev)) || 2813 nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, wdev_address(wdev)) ||
2758 nla_put_u32(msg, NL80211_ATTR_GENERATION, 2814 nla_put_u32(msg, NL80211_ATTR_GENERATION,
2759 rdev->devlist_generation ^ 2815 rdev->devlist_generation ^
2760 (cfg80211_rdev_list_generation << 2))) 2816 (cfg80211_rdev_list_generation << 2)) ||
2817 nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr))
2761 goto nla_put_failure; 2818 goto nla_put_failure;
2762 2819
2763 if (rdev->ops->get_channel) { 2820 if (rdev->ops->get_channel) {
@@ -4409,6 +4466,7 @@ static int parse_station_flags(struct genl_info *info,
4409 params->sta_flags_mask = BIT(NL80211_STA_FLAG_AUTHENTICATED) | 4466 params->sta_flags_mask = BIT(NL80211_STA_FLAG_AUTHENTICATED) |
4410 BIT(NL80211_STA_FLAG_MFP) | 4467 BIT(NL80211_STA_FLAG_MFP) |
4411 BIT(NL80211_STA_FLAG_AUTHORIZED); 4468 BIT(NL80211_STA_FLAG_AUTHORIZED);
4469 break;
4412 default: 4470 default:
4413 return -EINVAL; 4471 return -EINVAL;
4414 } 4472 }
@@ -4471,6 +4529,9 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
4471 case RATE_INFO_BW_160: 4529 case RATE_INFO_BW_160:
4472 rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH; 4530 rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH;
4473 break; 4531 break;
4532 case RATE_INFO_BW_HE_RU:
4533 rate_flg = 0;
4534 WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS));
4474 } 4535 }
4475 4536
4476 if (rate_flg && nla_put_flag(msg, rate_flg)) 4537 if (rate_flg && nla_put_flag(msg, rate_flg))
@@ -4490,6 +4551,19 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
4490 if (info->flags & RATE_INFO_FLAGS_SHORT_GI && 4551 if (info->flags & RATE_INFO_FLAGS_SHORT_GI &&
4491 nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI)) 4552 nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI))
4492 return false; 4553 return false;
4554 } else if (info->flags & RATE_INFO_FLAGS_HE_MCS) {
4555 if (nla_put_u8(msg, NL80211_RATE_INFO_HE_MCS, info->mcs))
4556 return false;
4557 if (nla_put_u8(msg, NL80211_RATE_INFO_HE_NSS, info->nss))
4558 return false;
4559 if (nla_put_u8(msg, NL80211_RATE_INFO_HE_GI, info->he_gi))
4560 return false;
4561 if (nla_put_u8(msg, NL80211_RATE_INFO_HE_DCM, info->he_dcm))
4562 return false;
4563 if (info->bw == RATE_INFO_BW_HE_RU &&
4564 nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC,
4565 info->he_ru_alloc))
4566 return false;
4493 } 4567 }
4494 4568
4495 nla_nest_end(msg, rate); 4569 nla_nest_end(msg, rate);
@@ -4546,13 +4620,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
4546 4620
4547#define PUT_SINFO(attr, memb, type) do { \ 4621#define PUT_SINFO(attr, memb, type) do { \
4548 BUILD_BUG_ON(sizeof(type) == sizeof(u64)); \ 4622 BUILD_BUG_ON(sizeof(type) == sizeof(u64)); \
4549 if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \ 4623 if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \
4550 nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr, \ 4624 nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr, \
4551 sinfo->memb)) \ 4625 sinfo->memb)) \
4552 goto nla_put_failure; \ 4626 goto nla_put_failure; \
4553 } while (0) 4627 } while (0)
4554#define PUT_SINFO_U64(attr, memb) do { \ 4628#define PUT_SINFO_U64(attr, memb) do { \
4555 if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \ 4629 if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \
4556 nla_put_u64_64bit(msg, NL80211_STA_INFO_ ## attr, \ 4630 nla_put_u64_64bit(msg, NL80211_STA_INFO_ ## attr, \
4557 sinfo->memb, NL80211_STA_INFO_PAD)) \ 4631 sinfo->memb, NL80211_STA_INFO_PAD)) \
4558 goto nla_put_failure; \ 4632 goto nla_put_failure; \
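
The BIT() to BIT_ULL() conversion matters because sinfo->filled is a 64-bit bitmap here (note the 1ULL shifts in the old PUT_SINFO macros): BIT() expands to a 1UL shift, which is only 32 bits wide on 32-bit kernels, so flags at bit 32 and above would be truncated or undefined. A plain-C illustration; BIT32 and BIT_ULL below are local macros mirroring the kernel definitions:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define BIT32(n)   (UINT32_C(1) << (n))   /* what BIT() degrades to on 32-bit */
#define BIT_ULL(n) (UINT64_C(1) << (n))   /* always a 64-bit shift */

int main(void)
{
        uint64_t filled = UINT64_C(1) << 33;   /* some station flag above bit 31 */

        /* BIT32(33) would be undefined behaviour; BIT_ULL(33) is well defined. */
        printf("set:  %d\n", (filled & BIT_ULL(33)) != 0);
        printf("mask: %" PRIx64 "\n", BIT_ULL(33));
        return 0;
}
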
@@ -4561,14 +4635,14 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
4561 PUT_SINFO(CONNECTED_TIME, connected_time, u32); 4635 PUT_SINFO(CONNECTED_TIME, connected_time, u32);
4562 PUT_SINFO(INACTIVE_TIME, inactive_time, u32); 4636 PUT_SINFO(INACTIVE_TIME, inactive_time, u32);
4563 4637
4564 if (sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES) | 4638 if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES) |
4565 BIT(NL80211_STA_INFO_RX_BYTES64)) && 4639 BIT_ULL(NL80211_STA_INFO_RX_BYTES64)) &&
4566 nla_put_u32(msg, NL80211_STA_INFO_RX_BYTES, 4640 nla_put_u32(msg, NL80211_STA_INFO_RX_BYTES,
4567 (u32)sinfo->rx_bytes)) 4641 (u32)sinfo->rx_bytes))
4568 goto nla_put_failure; 4642 goto nla_put_failure;
4569 4643
4570 if (sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES) | 4644 if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES) |
4571 BIT(NL80211_STA_INFO_TX_BYTES64)) && 4645 BIT_ULL(NL80211_STA_INFO_TX_BYTES64)) &&
4572 nla_put_u32(msg, NL80211_STA_INFO_TX_BYTES, 4646 nla_put_u32(msg, NL80211_STA_INFO_TX_BYTES,
4573 (u32)sinfo->tx_bytes)) 4647 (u32)sinfo->tx_bytes))
4574 goto nla_put_failure; 4648 goto nla_put_failure;
@@ -4588,24 +4662,24 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
4588 default: 4662 default:
4589 break; 4663 break;
4590 } 4664 }
4591 if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL)) { 4665 if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL)) {
4592 if (!nl80211_put_signal(msg, sinfo->chains, 4666 if (!nl80211_put_signal(msg, sinfo->chains,
4593 sinfo->chain_signal, 4667 sinfo->chain_signal,
4594 NL80211_STA_INFO_CHAIN_SIGNAL)) 4668 NL80211_STA_INFO_CHAIN_SIGNAL))
4595 goto nla_put_failure; 4669 goto nla_put_failure;
4596 } 4670 }
4597 if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) { 4671 if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) {
4598 if (!nl80211_put_signal(msg, sinfo->chains, 4672 if (!nl80211_put_signal(msg, sinfo->chains,
4599 sinfo->chain_signal_avg, 4673 sinfo->chain_signal_avg,
4600 NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) 4674 NL80211_STA_INFO_CHAIN_SIGNAL_AVG))
4601 goto nla_put_failure; 4675 goto nla_put_failure;
4602 } 4676 }
4603 if (sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE)) { 4677 if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) {
4604 if (!nl80211_put_sta_rate(msg, &sinfo->txrate, 4678 if (!nl80211_put_sta_rate(msg, &sinfo->txrate,
4605 NL80211_STA_INFO_TX_BITRATE)) 4679 NL80211_STA_INFO_TX_BITRATE))
4606 goto nla_put_failure; 4680 goto nla_put_failure;
4607 } 4681 }
4608 if (sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE)) { 4682 if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) {
4609 if (!nl80211_put_sta_rate(msg, &sinfo->rxrate, 4683 if (!nl80211_put_sta_rate(msg, &sinfo->rxrate,
4610 NL80211_STA_INFO_RX_BITRATE)) 4684 NL80211_STA_INFO_RX_BITRATE))
4611 goto nla_put_failure; 4685 goto nla_put_failure;
@@ -4621,7 +4695,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
4621 PUT_SINFO(PEER_PM, peer_pm, u32); 4695 PUT_SINFO(PEER_PM, peer_pm, u32);
4622 PUT_SINFO(NONPEER_PM, nonpeer_pm, u32); 4696 PUT_SINFO(NONPEER_PM, nonpeer_pm, u32);
4623 4697
4624 if (sinfo->filled & BIT(NL80211_STA_INFO_BSS_PARAM)) { 4698 if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) {
4625 bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM); 4699 bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM);
4626 if (!bss_param) 4700 if (!bss_param)
4627 goto nla_put_failure; 4701 goto nla_put_failure;
@@ -4640,7 +4714,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
4640 4714
4641 nla_nest_end(msg, bss_param); 4715 nla_nest_end(msg, bss_param);
4642 } 4716 }
4643 if ((sinfo->filled & BIT(NL80211_STA_INFO_STA_FLAGS)) && 4717 if ((sinfo->filled & BIT_ULL(NL80211_STA_INFO_STA_FLAGS)) &&
4644 nla_put(msg, NL80211_STA_INFO_STA_FLAGS, 4718 nla_put(msg, NL80211_STA_INFO_STA_FLAGS,
4645 sizeof(struct nl80211_sta_flag_update), 4719 sizeof(struct nl80211_sta_flag_update),
4646 &sinfo->sta_flags)) 4720 &sinfo->sta_flags))
@@ -4886,7 +4960,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
4886 return -EINVAL; 4960 return -EINVAL;
4887 if (params->supported_rates) 4961 if (params->supported_rates)
4888 return -EINVAL; 4962 return -EINVAL;
4889 if (params->ext_capab || params->ht_capa || params->vht_capa) 4963 if (params->ext_capab || params->ht_capa || params->vht_capa ||
4964 params->he_capa)
4890 return -EINVAL; 4965 return -EINVAL;
4891 } 4966 }
4892 4967
@@ -5092,6 +5167,15 @@ static int nl80211_set_station_tdls(struct genl_info *info,
5092 if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) 5167 if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
5093 params->vht_capa = 5168 params->vht_capa =
5094 nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); 5169 nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
5170 if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
5171 params->he_capa =
5172 nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
5173 params->he_capa_len =
5174 nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
5175
5176 if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
5177 return -EINVAL;
5178 }
5095 5179
5096 err = nl80211_parse_sta_channel_info(info, params); 5180 err = nl80211_parse_sta_channel_info(info, params);
5097 if (err) 5181 if (err)
@@ -5319,6 +5403,17 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
5319 params.vht_capa = 5403 params.vht_capa =
5320 nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); 5404 nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
5321 5405
5406 if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
5407 params.he_capa =
5408 nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
5409 params.he_capa_len =
5410 nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
5411
5412 /* max len is validated in nla policy */
5413 if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
5414 return -EINVAL;
5415 }
5416
5322 if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { 5417 if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
5323 params.opmode_notif_used = true; 5418 params.opmode_notif_used = true;
5324 params.opmode_notif = 5419 params.opmode_notif =
@@ -5351,6 +5446,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
5351 if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) { 5446 if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) {
5352 params.ht_capa = NULL; 5447 params.ht_capa = NULL;
5353 params.vht_capa = NULL; 5448 params.vht_capa = NULL;
5449
5450 /* HE requires WME */
5451 if (params.he_capa_len)
5452 return -EINVAL;
5354 } 5453 }
5355 5454
5356 /* When you run into this, adjust the code below for the new flag */ 5455 /* When you run into this, adjust the code below for the new flag */
@@ -6231,7 +6330,7 @@ do { \
6231 nl80211_check_s32); 6330 nl80211_check_s32);
6232 /* 6331 /*
6233 * Check HT operation mode based on 6332 * Check HT operation mode based on
6234 * IEEE 802.11 2012 8.4.2.59 HT Operation element. 6333 * IEEE 802.11-2016 9.4.2.57 HT Operation element.
6235 */ 6334 */
6236 if (tb[NL80211_MESHCONF_HT_OPMODE]) { 6335 if (tb[NL80211_MESHCONF_HT_OPMODE]) {
6237 ht_opmode = nla_get_u16(tb[NL80211_MESHCONF_HT_OPMODE]); 6336 ht_opmode = nla_get_u16(tb[NL80211_MESHCONF_HT_OPMODE]);
@@ -6241,22 +6340,9 @@ do { \
6241 IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)) 6340 IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT))
6242 return -EINVAL; 6341 return -EINVAL;
6243 6342
6244 if ((ht_opmode & IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT) && 6343 /* NON_HT_STA bit is reserved, but some programs set it */
6245 (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)) 6344 ht_opmode &= ~IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT;
6246 return -EINVAL;
6247 6345
6248 switch (ht_opmode & IEEE80211_HT_OP_MODE_PROTECTION) {
6249 case IEEE80211_HT_OP_MODE_PROTECTION_NONE:
6250 case IEEE80211_HT_OP_MODE_PROTECTION_20MHZ:
6251 if (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)
6252 return -EINVAL;
6253 break;
6254 case IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER:
6255 case IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED:
6256 if (!(ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT))
6257 return -EINVAL;
6258 break;
6259 }
6260 cfg->ht_opmode = ht_opmode; 6346 cfg->ht_opmode = ht_opmode;
6261 mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1)); 6347 mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1));
6262 } 6348 }
@@ -6861,6 +6947,16 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev)
6861 return regulatory_pre_cac_allowed(wdev->wiphy); 6947 return regulatory_pre_cac_allowed(wdev->wiphy);
6862} 6948}
6863 6949
6950static bool nl80211_check_scan_feat(struct wiphy *wiphy, u32 flags, u32 flag,
6951 enum nl80211_ext_feature_index feat)
6952{
6953 if (!(flags & flag))
6954 return true;
6955 if (wiphy_ext_feature_isset(wiphy, feat))
6956 return true;
6957 return false;
6958}
6959
6864static int 6960static int
6865nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, 6961nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
6866 void *request, struct nlattr **attrs, 6962 void *request, struct nlattr **attrs,
@@ -6895,15 +6991,33 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
6895 6991
6896 if (((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && 6992 if (((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) &&
6897 !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) || 6993 !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) ||
6898 ((*flags & NL80211_SCAN_FLAG_LOW_SPAN) && 6994 !nl80211_check_scan_feat(wiphy, *flags,
6899 !wiphy_ext_feature_isset(wiphy, 6995 NL80211_SCAN_FLAG_LOW_SPAN,
6900 NL80211_EXT_FEATURE_LOW_SPAN_SCAN)) || 6996 NL80211_EXT_FEATURE_LOW_SPAN_SCAN) ||
6901 ((*flags & NL80211_SCAN_FLAG_LOW_POWER) && 6997 !nl80211_check_scan_feat(wiphy, *flags,
6902 !wiphy_ext_feature_isset(wiphy, 6998 NL80211_SCAN_FLAG_LOW_POWER,
6903 NL80211_EXT_FEATURE_LOW_POWER_SCAN)) || 6999 NL80211_EXT_FEATURE_LOW_POWER_SCAN) ||
6904 ((*flags & NL80211_SCAN_FLAG_HIGH_ACCURACY) && 7000 !nl80211_check_scan_feat(wiphy, *flags,
6905 !wiphy_ext_feature_isset(wiphy, 7001 NL80211_SCAN_FLAG_HIGH_ACCURACY,
6906 NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN))) 7002 NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN) ||
7003 !nl80211_check_scan_feat(wiphy, *flags,
7004 NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME,
7005 NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME) ||
7006 !nl80211_check_scan_feat(wiphy, *flags,
7007 NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP,
7008 NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP) ||
7009 !nl80211_check_scan_feat(wiphy, *flags,
7010 NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION,
7011 NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) ||
7012 !nl80211_check_scan_feat(wiphy, *flags,
7013 NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE,
7014 NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE) ||
7015 !nl80211_check_scan_feat(wiphy, *flags,
7016 NL80211_SCAN_FLAG_RANDOM_SN,
7017 NL80211_EXT_FEATURE_SCAN_RANDOM_SN) ||
7018 !nl80211_check_scan_feat(wiphy, *flags,
7019 NL80211_SCAN_FLAG_MIN_PREQ_CONTENT,
7020 NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT))
6907 return -EOPNOTSUPP; 7021 return -EOPNOTSUPP;
6908 7022
6909 if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { 7023 if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) {
@@ -6918,26 +7032,6 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
6918 return err; 7032 return err;
6919 } 7033 }
6920 7034
6921 if ((*flags & NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME) &&
6922 !wiphy_ext_feature_isset(wiphy,
6923 NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME))
6924 return -EOPNOTSUPP;
6925
6926 if ((*flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP) &&
6927 !wiphy_ext_feature_isset(wiphy,
6928 NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP))
6929 return -EOPNOTSUPP;
6930
6931 if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) &&
6932 !wiphy_ext_feature_isset(wiphy,
6933 NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION))
6934 return -EOPNOTSUPP;
6935
6936 if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE) &&
6937 !wiphy_ext_feature_isset(wiphy,
6938 NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE))
6939 return -EOPNOTSUPP;
6940
6941 return 0; 7035 return 0;
6942} 7036}
6943 7037
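
The per-flag checks removed above are now funnelled through the nl80211_check_scan_feat() helper added earlier, which answers one question: if this scan flag was requested, does the wiphy advertise the matching extended feature? The kernel calls the helper once per flag inside a single condition; the same idea can be written table-driven in plain C, with made-up flag and feature values:

#include <stdint.h>
#include <stdio.h>

struct flag_feat { uint32_t flag; uint32_t feature; };

static int check_flags(uint32_t req_flags, uint32_t features,
                       const struct flag_feat *tbl, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                if (!(req_flags & tbl[i].flag))
                        continue;              /* flag not requested: nothing to prove */
                if (!(features & tbl[i].feature))
                        return -1;             /* -EOPNOTSUPP in the kernel */
        }
        return 0;
}

int main(void)
{
        const struct flag_feat tbl[] = { { 0x1, 0x10 }, { 0x2, 0x20 } };

        printf("%d\n", check_flags(0x3, 0x30, tbl, 2));   /* all supported: 0 */
        printf("%d\n", check_flags(0x2, 0x10, tbl, 2));   /* feature missing: -1 */
        return 0;
}
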
@@ -10160,7 +10254,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
10160 if (err) 10254 if (err)
10161 return err; 10255 return err;
10162 10256
10163 if (sinfo.filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG)) 10257 if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
10164 wdev->cqm_config->last_rssi_event_value = 10258 wdev->cqm_config->last_rssi_event_value =
10165 (s8) sinfo.rx_beacon_signal_avg; 10259 (s8) sinfo.rx_beacon_signal_avg;
10166 } 10260 }
@@ -10962,9 +11056,12 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
10962 rem) { 11056 rem) {
10963 u8 *mask_pat; 11057 u8 *mask_pat;
10964 11058
10965 nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, 11059 err = nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
10966 nl80211_packet_pattern_policy, 11060 nl80211_packet_pattern_policy,
10967 info->extack); 11061 info->extack);
11062 if (err)
11063 goto error;
11064
10968 err = -EINVAL; 11065 err = -EINVAL;
10969 if (!pat_tb[NL80211_PKTPAT_MASK] || 11066 if (!pat_tb[NL80211_PKTPAT_MASK] ||
10970 !pat_tb[NL80211_PKTPAT_PATTERN]) 11067 !pat_tb[NL80211_PKTPAT_PATTERN])
@@ -11213,8 +11310,11 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
11213 rem) { 11310 rem) {
11214 u8 *mask_pat; 11311 u8 *mask_pat;
11215 11312
11216 nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, 11313 err = nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
11217 nl80211_packet_pattern_policy, NULL); 11314 nl80211_packet_pattern_policy, NULL);
11315 if (err)
11316 return err;
11317
11218 if (!pat_tb[NL80211_PKTPAT_MASK] || 11318 if (!pat_tb[NL80211_PKTPAT_MASK] ||
11219 !pat_tb[NL80211_PKTPAT_PATTERN]) 11319 !pat_tb[NL80211_PKTPAT_PATTERN])
11220 return -EINVAL; 11320 return -EINVAL;
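
Both nla_parse_nested() hunks in this file fix the same bug class: the parser's return value was ignored, so a malformed nested attribute could leave pat_tb stale or uninitialised and still be consumed. The general shape of the fix, reduced to plain C (parse_pattern() and its input format are illustrative only):

#include <stdio.h>
#include <string.h>

#define EINVAL 22

static int parse_pattern(const char *input, const char **mask, const char **pat)
{
        /* On failure the outputs are deliberately left untouched. */
        if (!input || !strchr(input, ':'))
                return -EINVAL;
        *mask = input;
        *pat = strchr(input, ':') + 1;
        return 0;
}

static int handle(const char *input)
{
        const char *mask, *pat;
        int err = parse_pattern(input, &mask, &pat);

        if (err)
                return err;                    /* never look at mask/pat on failure */
        printf("mask=%.*s pat=%s\n", (int)(pat - input - 1), mask, pat);
        return 0;
}

int main(void)
{
        handle("ff00:4500");
        printf("%d\n", handle("garbage"));
        return 0;
}
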
@@ -14930,20 +15030,24 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
14930EXPORT_SYMBOL(cfg80211_mgmt_tx_status); 15030EXPORT_SYMBOL(cfg80211_mgmt_tx_status);
14931 15031
14932static int __nl80211_rx_control_port(struct net_device *dev, 15032static int __nl80211_rx_control_port(struct net_device *dev,
14933 const u8 *buf, size_t len, 15033 struct sk_buff *skb,
14934 const u8 *addr, u16 proto,
14935 bool unencrypted, gfp_t gfp) 15034 bool unencrypted, gfp_t gfp)
14936{ 15035{
14937 struct wireless_dev *wdev = dev->ieee80211_ptr; 15036 struct wireless_dev *wdev = dev->ieee80211_ptr;
14938 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); 15037 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
15038 struct ethhdr *ehdr = eth_hdr(skb);
15039 const u8 *addr = ehdr->h_source;
15040 u16 proto = be16_to_cpu(skb->protocol);
14939 struct sk_buff *msg; 15041 struct sk_buff *msg;
14940 void *hdr; 15042 void *hdr;
15043 struct nlattr *frame;
15044
14941 u32 nlportid = READ_ONCE(wdev->conn_owner_nlportid); 15045 u32 nlportid = READ_ONCE(wdev->conn_owner_nlportid);
14942 15046
14943 if (!nlportid) 15047 if (!nlportid)
14944 return -ENOENT; 15048 return -ENOENT;
14945 15049
14946 msg = nlmsg_new(100 + len, gfp); 15050 msg = nlmsg_new(100 + skb->len, gfp);
14947 if (!msg) 15051 if (!msg)
14948 return -ENOMEM; 15052 return -ENOMEM;
14949 15053
@@ -14957,13 +15061,17 @@ static int __nl80211_rx_control_port(struct net_device *dev,
14957 nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || 15061 nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
14958 nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), 15062 nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
14959 NL80211_ATTR_PAD) || 15063 NL80211_ATTR_PAD) ||
14960 nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
14961 nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || 15064 nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
14962 nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) || 15065 nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) ||
14963 (unencrypted && nla_put_flag(msg, 15066 (unencrypted && nla_put_flag(msg,
14964 NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT))) 15067 NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT)))
14965 goto nla_put_failure; 15068 goto nla_put_failure;
14966 15069
15070 frame = nla_reserve(msg, NL80211_ATTR_FRAME, skb->len);
15071 if (!frame)
15072 goto nla_put_failure;
15073
15074 skb_copy_bits(skb, 0, nla_data(frame), skb->len);
14967 genlmsg_end(msg, hdr); 15075 genlmsg_end(msg, hdr);
14968 15076
14969 return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid); 15077 return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid);
@@ -14974,14 +15082,12 @@ static int __nl80211_rx_control_port(struct net_device *dev,
14974} 15082}
14975 15083
14976bool cfg80211_rx_control_port(struct net_device *dev, 15084bool cfg80211_rx_control_port(struct net_device *dev,
14977 const u8 *buf, size_t len, 15085 struct sk_buff *skb, bool unencrypted)
14978 const u8 *addr, u16 proto, bool unencrypted)
14979{ 15086{
14980 int ret; 15087 int ret;
14981 15088
14982 trace_cfg80211_rx_control_port(dev, buf, len, addr, proto, unencrypted); 15089 trace_cfg80211_rx_control_port(dev, skb, unencrypted);
14983 ret = __nl80211_rx_control_port(dev, buf, len, addr, proto, 15090 ret = __nl80211_rx_control_port(dev, skb, unencrypted, GFP_ATOMIC);
14984 unencrypted, GFP_ATOMIC);
14985 trace_cfg80211_return_bool(ret == 0); 15091 trace_cfg80211_return_bool(ret == 0);
14986 return ret == 0; 15092 return ret == 0;
14987} 15093}
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index bbe6298e4bb9..4fc66a117b7d 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2240,7 +2240,9 @@ static void wiphy_update_regulatory(struct wiphy *wiphy,
2240 * as some drivers used this to restore its orig_* reg domain. 2240 * as some drivers used this to restore its orig_* reg domain.
2241 */ 2241 */
2242 if (initiator == NL80211_REGDOM_SET_BY_CORE && 2242 if (initiator == NL80211_REGDOM_SET_BY_CORE &&
2243 wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) 2243 wiphy->regulatory_flags & REGULATORY_CUSTOM_REG &&
2244 !(wiphy->regulatory_flags &
2245 REGULATORY_WIPHY_SELF_MANAGED))
2244 reg_call_notifier(wiphy, lr); 2246 reg_call_notifier(wiphy, lr);
2245 return; 2247 return;
2246 } 2248 }
@@ -2787,26 +2789,6 @@ static void notify_self_managed_wiphys(struct regulatory_request *request)
2787 } 2789 }
2788} 2790}
2789 2791
2790static bool reg_only_self_managed_wiphys(void)
2791{
2792 struct cfg80211_registered_device *rdev;
2793 struct wiphy *wiphy;
2794 bool self_managed_found = false;
2795
2796 ASSERT_RTNL();
2797
2798 list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
2799 wiphy = &rdev->wiphy;
2800 if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
2801 self_managed_found = true;
2802 else
2803 return false;
2804 }
2805
2806 /* make sure at least one self-managed wiphy exists */
2807 return self_managed_found;
2808}
2809
2810/* 2792/*
2811 * Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_* 2793 * Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_*
2812 * Regulatory hints come on a first come first serve basis and we 2794 * Regulatory hints come on a first come first serve basis and we
@@ -2839,10 +2821,6 @@ static void reg_process_pending_hints(void)
2839 spin_unlock(&reg_requests_lock); 2821 spin_unlock(&reg_requests_lock);
2840 2822
2841 notify_self_managed_wiphys(reg_request); 2823 notify_self_managed_wiphys(reg_request);
2842 if (reg_only_self_managed_wiphys()) {
2843 reg_free_request(reg_request);
2844 return;
2845 }
2846 2824
2847 reg_process_hint(reg_request); 2825 reg_process_hint(reg_request);
2848 2826
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 570a2b67ca10..6ab32f6a1961 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -102,7 +102,7 @@ static int wiphy_suspend(struct device *dev)
102 struct cfg80211_registered_device *rdev = dev_to_rdev(dev); 102 struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
103 int ret = 0; 103 int ret = 0;
104 104
105 rdev->suspend_at = get_seconds(); 105 rdev->suspend_at = ktime_get_boottime_seconds();
106 106
107 rtnl_lock(); 107 rtnl_lock();
108 if (rdev->wiphy.registered) { 108 if (rdev->wiphy.registered) {
@@ -130,7 +130,7 @@ static int wiphy_resume(struct device *dev)
130 int ret = 0; 130 int ret = 0;
131 131
132 /* Age scan results with time spent in suspend */ 132 /* Age scan results with time spent in suspend */
133 cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at); 133 cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at);
134 134
135 rtnl_lock(); 135 rtnl_lock();
136 if (rdev->wiphy.registered && rdev->ops->resume) 136 if (rdev->wiphy.registered && rdev->ops->resume)
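
suspend_at switches from get_seconds() to ktime_get_boottime_seconds() and the field becomes time64_t: get_seconds() reads wall-clock time, which can be stepped by settimeofday/NTP and overflows a 32-bit unsigned long in 2038, while the boottime clock is monotonic, keeps counting across suspend, and returns a 64-bit value, which is exactly the interval the scan-result aging wants. A userspace analogue using CLOCK_BOOTTIME (on older glibc, link with -lrt):

#define _GNU_SOURCE
#include <stdio.h>
#include <time.h>

static long long boottime_seconds(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_BOOTTIME, &ts);    /* advances during suspend too */
        return (long long)ts.tv_sec;
}

int main(void)
{
        long long suspend_at = boottime_seconds();
        /* ... suspend/resume would happen here ... */
        long long slept = boottime_seconds() - suspend_at;

        printf("aged scan results by %lld s\n", slept);
        return 0;
}
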
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 2b417a2fe63f..7c73510b161f 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2627,23 +2627,25 @@ TRACE_EVENT(cfg80211_mgmt_tx_status,
2627); 2627);
2628 2628
2629TRACE_EVENT(cfg80211_rx_control_port, 2629TRACE_EVENT(cfg80211_rx_control_port,
2630 TP_PROTO(struct net_device *netdev, const u8 *buf, size_t len, 2630 TP_PROTO(struct net_device *netdev, struct sk_buff *skb,
2631 const u8 *addr, u16 proto, bool unencrypted), 2631 bool unencrypted),
2632 TP_ARGS(netdev, buf, len, addr, proto, unencrypted), 2632 TP_ARGS(netdev, skb, unencrypted),
2633 TP_STRUCT__entry( 2633 TP_STRUCT__entry(
2634 NETDEV_ENTRY 2634 NETDEV_ENTRY
2635 MAC_ENTRY(addr) 2635 __field(int, len)
2636 MAC_ENTRY(from)
2636 __field(u16, proto) 2637 __field(u16, proto)
2637 __field(bool, unencrypted) 2638 __field(bool, unencrypted)
2638 ), 2639 ),
2639 TP_fast_assign( 2640 TP_fast_assign(
2640 NETDEV_ASSIGN; 2641 NETDEV_ASSIGN;
2641 MAC_ASSIGN(addr, addr); 2642 __entry->len = skb->len;
2642 __entry->proto = proto; 2643 MAC_ASSIGN(from, eth_hdr(skb)->h_source);
2644 __entry->proto = be16_to_cpu(skb->protocol);
2643 __entry->unencrypted = unencrypted; 2645 __entry->unencrypted = unencrypted;
2644 ), 2646 ),
2645 TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT " proto: 0x%x, unencrypted: %s", 2647 TP_printk(NETDEV_PR_FMT ", len=%d, " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s",
2646 NETDEV_PR_ARG, MAC_PR_ARG(addr), 2648 NETDEV_PR_ARG, __entry->len, MAC_PR_ARG(from),
2647 __entry->proto, BOOL_TO_STR(__entry->unencrypted)) 2649 __entry->proto, BOOL_TO_STR(__entry->unencrypted))
2648); 2650);
2649 2651
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 3c654cd7ba56..e0825a019e9f 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -4,6 +4,7 @@
4 * 4 *
5 * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> 5 * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net>
6 * Copyright 2013-2014 Intel Mobile Communications GmbH 6 * Copyright 2013-2014 Intel Mobile Communications GmbH
7 * Copyright 2017 Intel Deutschland GmbH
7 */ 8 */
8#include <linux/export.h> 9#include <linux/export.h>
9#include <linux/bitops.h> 10#include <linux/bitops.h>
@@ -1142,6 +1143,85 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate)
1142 return 0; 1143 return 0;
1143} 1144}
1144 1145
1146static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
1147{
1148#define SCALE 2048
1149 u16 mcs_divisors[12] = {
1150 34133, /* 16.666666... */
1151 17067, /* 8.333333... */
1152 11378, /* 5.555555... */
1153 8533, /* 4.166666... */
1154 5689, /* 2.777777... */
1155 4267, /* 2.083333... */
1156 3923, /* 1.851851... */
1157 3413, /* 1.666666... */
1158 2844, /* 1.388888... */
1159 2560, /* 1.250000... */
1160 2276, /* 1.111111... */
1161 2048, /* 1.000000... */
1162 };
1163 u32 rates_160M[3] = { 960777777, 907400000, 816666666 };
1164 u32 rates_969[3] = { 480388888, 453700000, 408333333 };
1165 u32 rates_484[3] = { 229411111, 216666666, 195000000 };
1166 u32 rates_242[3] = { 114711111, 108333333, 97500000 };
1167 u32 rates_106[3] = { 40000000, 37777777, 34000000 };
1168 u32 rates_52[3] = { 18820000, 17777777, 16000000 };
1169 u32 rates_26[3] = { 9411111, 8888888, 8000000 };
1170 u64 tmp;
1171 u32 result;
1172
1173 if (WARN_ON_ONCE(rate->mcs > 11))
1174 return 0;
1175
1176 if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2))
1177 return 0;
1178 if (WARN_ON_ONCE(rate->he_ru_alloc >
1179 NL80211_RATE_INFO_HE_RU_ALLOC_2x996))
1180 return 0;
1181 if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8))
1182 return 0;
1183
1184 if (rate->bw == RATE_INFO_BW_160)
1185 result = rates_160M[rate->he_gi];
1186 else if (rate->bw == RATE_INFO_BW_80 ||
1187 (rate->bw == RATE_INFO_BW_HE_RU &&
1188 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996))
1189 result = rates_969[rate->he_gi];
1190 else if (rate->bw == RATE_INFO_BW_40 ||
1191 (rate->bw == RATE_INFO_BW_HE_RU &&
1192 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484))
1193 result = rates_484[rate->he_gi];
1194 else if (rate->bw == RATE_INFO_BW_20 ||
1195 (rate->bw == RATE_INFO_BW_HE_RU &&
1196 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242))
1197 result = rates_242[rate->he_gi];
1198 else if (rate->bw == RATE_INFO_BW_HE_RU &&
1199 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106)
1200 result = rates_106[rate->he_gi];
1201 else if (rate->bw == RATE_INFO_BW_HE_RU &&
1202 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52)
1203 result = rates_52[rate->he_gi];
1204 else if (rate->bw == RATE_INFO_BW_HE_RU &&
1205 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26)
1206 result = rates_26[rate->he_gi];
1207 else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n",
1208 rate->bw, rate->he_ru_alloc))
1209 return 0;
1210
1211 /* now scale to the appropriate MCS */
1212 tmp = result;
1213 tmp *= SCALE;
1214 do_div(tmp, mcs_divisors[rate->mcs]);
1215 result = tmp;
1216
1217 /* and take NSS, DCM into account */
1218 result = (result * rate->nss) / 8;
1219 if (rate->he_dcm)
1220 result /= 2;
1221
1222 return result;
1223}
1224
1145u32 cfg80211_calculate_bitrate(struct rate_info *rate) 1225u32 cfg80211_calculate_bitrate(struct rate_info *rate)
1146{ 1226{
1147 if (rate->flags & RATE_INFO_FLAGS_MCS) 1227 if (rate->flags & RATE_INFO_FLAGS_MCS)
@@ -1150,6 +1230,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate)
1150 return cfg80211_calculate_bitrate_60g(rate); 1230 return cfg80211_calculate_bitrate_60g(rate);
1151 if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) 1231 if (rate->flags & RATE_INFO_FLAGS_VHT_MCS)
1152 return cfg80211_calculate_bitrate_vht(rate); 1232 return cfg80211_calculate_bitrate_vht(rate);
1233 if (rate->flags & RATE_INFO_FLAGS_HE_MCS)
1234 return cfg80211_calculate_bitrate_he(rate);
1153 1235
1154 return rate->legacy; 1236 return rate->legacy;
1155} 1237}
@@ -1791,8 +1873,9 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
1791 1873
1792int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) 1874int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp)
1793{ 1875{
1794 sinfo->pertid = kcalloc(sizeof(*(sinfo->pertid)), 1876 sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1,
1795 IEEE80211_NUM_TIDS + 1, gfp); 1877 sizeof(*(sinfo->pertid)),
1878 gfp);
1796 if (!sinfo->pertid) 1879 if (!sinfo->pertid)
1797 return -ENOMEM; 1880 return -ENOMEM;
1798 1881
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 05186a47878f..167f7025ac98 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -1278,7 +1278,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
1278 if (err) 1278 if (err)
1279 return err; 1279 return err;
1280 1280
1281 if (!(sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))) 1281 if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)))
1282 return -EOPNOTSUPP; 1282 return -EOPNOTSUPP;
1283 1283
1284 rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); 1284 rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate);
@@ -1320,7 +1320,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
1320 1320
1321 switch (rdev->wiphy.signal_type) { 1321 switch (rdev->wiphy.signal_type) {
1322 case CFG80211_SIGNAL_TYPE_MBM: 1322 case CFG80211_SIGNAL_TYPE_MBM:
1323 if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) { 1323 if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) {
1324 int sig = sinfo.signal; 1324 int sig = sinfo.signal;
1325 wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; 1325 wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED;
1326 wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; 1326 wstats.qual.updated |= IW_QUAL_QUAL_UPDATED;
@@ -1334,7 +1334,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
1334 break; 1334 break;
1335 } 1335 }
1336 case CFG80211_SIGNAL_TYPE_UNSPEC: 1336 case CFG80211_SIGNAL_TYPE_UNSPEC:
1337 if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) { 1337 if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) {
1338 wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; 1338 wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED;
1339 wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; 1339 wstats.qual.updated |= IW_QUAL_QUAL_UPDATED;
1340 wstats.qual.level = sinfo.signal; 1340 wstats.qual.level = sinfo.signal;
@@ -1347,9 +1347,9 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
1347 } 1347 }
1348 1348
1349 wstats.qual.updated |= IW_QUAL_NOISE_INVALID; 1349 wstats.qual.updated |= IW_QUAL_NOISE_INVALID;
1350 if (sinfo.filled & BIT(NL80211_STA_INFO_RX_DROP_MISC)) 1350 if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC))
1351 wstats.discard.misc = sinfo.rx_dropped_misc; 1351 wstats.discard.misc = sinfo.rx_dropped_misc;
1352 if (sinfo.filled & BIT(NL80211_STA_INFO_TX_FAILED)) 1352 if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))
1353 wstats.discard.retries = sinfo.tx_failed; 1353 wstats.discard.retries = sinfo.tx_failed;
1354 1354
1355 return &wstats; 1355 return &wstats;
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
index e2fa133f9fba..59fcb41fc5e6 100644
--- a/net/x25/Kconfig
+++ b/net/x25/Kconfig
@@ -31,5 +31,3 @@ config X25
31 31
32 To compile this driver as a module, choose M here: the module 32 To compile this driver as a module, choose M here: the module
33 will be called x25. If unsure, say N. 33 will be called x25. If unsure, say N.
34
35
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index f93365ae0fdd..d49aa79b7997 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1750,7 +1750,7 @@ static const struct proto_ops x25_proto_ops = {
1750 .socketpair = sock_no_socketpair, 1750 .socketpair = sock_no_socketpair,
1751 .accept = x25_accept, 1751 .accept = x25_accept,
1752 .getname = x25_getname, 1752 .getname = x25_getname,
1753 .poll_mask = datagram_poll_mask, 1753 .poll = datagram_poll,
1754 .ioctl = x25_ioctl, 1754 .ioctl = x25_ioctl,
1755#ifdef CONFIG_COMPAT 1755#ifdef CONFIG_COMPAT
1756 .compat_ioctl = compat_x25_ioctl, 1756 .compat_ioctl = compat_x25_ioctl,
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 9c214ec681ac..743103786652 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -381,4 +381,3 @@ void x25_check_rbuf(struct sock *sk)
381 x25_stop_timer(sk); 381 x25_stop_timer(sk);
382 } 382 }
383} 383}
384
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index f47abb46c587..bfe2dbea480b 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -11,6 +11,8 @@
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/bpf.h> 12#include <linux/bpf.h>
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/netdevice.h>
15#include <linux/rtnetlink.h>
14 16
15#include "xdp_umem.h" 17#include "xdp_umem.h"
16#include "xsk_queue.h" 18#include "xsk_queue.h"
@@ -40,6 +42,21 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
40 } 42 }
41} 43}
42 44
45int xdp_umem_query(struct net_device *dev, u16 queue_id)
46{
47 struct netdev_bpf bpf;
48
49 ASSERT_RTNL();
50
51 memset(&bpf, 0, sizeof(bpf));
52 bpf.command = XDP_QUERY_XSK_UMEM;
53 bpf.xsk.queue_id = queue_id;
54
55 if (!dev->netdev_ops->ndo_bpf)
56 return 0;
57 return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem;
58}
59
43int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, 60int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
44 u32 queue_id, u16 flags) 61 u32 queue_id, u16 flags)
45{ 62{
@@ -56,41 +73,36 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
56 if (force_copy) 73 if (force_copy)
57 return 0; 74 return 0;
58 75
59 dev_hold(dev); 76 if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit)
60 77 return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */
61 if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) {
62 bpf.command = XDP_QUERY_XSK_UMEM;
63 78
64 rtnl_lock(); 79 bpf.command = XDP_QUERY_XSK_UMEM;
65 err = dev->netdev_ops->ndo_bpf(dev, &bpf);
66 rtnl_unlock();
67 80
68 if (err) { 81 rtnl_lock();
69 dev_put(dev); 82 err = xdp_umem_query(dev, queue_id);
70 return force_zc ? -ENOTSUPP : 0; 83 if (err) {
71 } 84 err = err < 0 ? -EOPNOTSUPP : -EBUSY;
72 85 goto err_rtnl_unlock;
73 bpf.command = XDP_SETUP_XSK_UMEM; 86 }
74 bpf.xsk.umem = umem;
75 bpf.xsk.queue_id = queue_id;
76 87
77 rtnl_lock(); 88 bpf.command = XDP_SETUP_XSK_UMEM;
78 err = dev->netdev_ops->ndo_bpf(dev, &bpf); 89 bpf.xsk.umem = umem;
79 rtnl_unlock(); 90 bpf.xsk.queue_id = queue_id;
80 91
81 if (err) { 92 err = dev->netdev_ops->ndo_bpf(dev, &bpf);
82 dev_put(dev); 93 if (err)
83 return force_zc ? err : 0; /* fail or fallback */ 94 goto err_rtnl_unlock;
84 } 95 rtnl_unlock();
85 96
86 umem->dev = dev; 97 dev_hold(dev);
87 umem->queue_id = queue_id; 98 umem->dev = dev;
88 umem->zc = true; 99 umem->queue_id = queue_id;
89 return 0; 100 umem->zc = true;
90 } 101 return 0;
91 102
92 dev_put(dev); 103err_rtnl_unlock:
93 return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ 104 rtnl_unlock();
105 return force_zc ? err : 0; /* fail or fallback */
94} 106}
95 107
96static void xdp_umem_clear_dev(struct xdp_umem *umem) 108static void xdp_umem_clear_dev(struct xdp_umem *umem)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 3b3410ada097..4e937cd7c17d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -84,10 +84,8 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
84{ 84{
85 int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); 85 int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
86 86
87 if (err) { 87 if (err)
88 xdp_return_buff(xdp);
89 xs->rx_dropped++; 88 xs->rx_dropped++;
90 }
91 89
92 return err; 90 return err;
93} 91}
@@ -199,8 +197,11 @@ static void xsk_destruct_skb(struct sk_buff *skb)
199{ 197{
200 u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; 198 u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
201 struct xdp_sock *xs = xdp_sk(skb->sk); 199 struct xdp_sock *xs = xdp_sk(skb->sk);
200 unsigned long flags;
202 201
202 spin_lock_irqsave(&xs->tx_completion_lock, flags);
203 WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); 203 WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
204 spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
204 205
205 sock_wfree(skb); 206 sock_wfree(skb);
206} 207}
@@ -215,9 +216,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
215 struct sk_buff *skb; 216 struct sk_buff *skb;
216 int err = 0; 217 int err = 0;
217 218
218 if (unlikely(!xs->tx))
219 return -ENOBUFS;
220
221 mutex_lock(&xs->mutex); 219 mutex_lock(&xs->mutex);
222 220
223 while (xskq_peek_desc(xs->tx, &desc)) { 221 while (xskq_peek_desc(xs->tx, &desc)) {
@@ -230,22 +228,13 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
230 goto out; 228 goto out;
231 } 229 }
232 230
233 if (xskq_reserve_addr(xs->umem->cq)) { 231 if (xskq_reserve_addr(xs->umem->cq))
234 err = -EAGAIN;
235 goto out; 232 goto out;
236 }
237 233
238 len = desc.len; 234 if (xs->queue_id >= xs->dev->real_num_tx_queues)
239 if (unlikely(len > xs->dev->mtu)) {
240 err = -EMSGSIZE;
241 goto out; 235 goto out;
242 }
243
244 if (xs->queue_id >= xs->dev->real_num_tx_queues) {
245 err = -ENXIO;
246 goto out;
247 }
248 236
237 len = desc.len;
249 skb = sock_alloc_send_skb(sk, len, 1, &err); 238 skb = sock_alloc_send_skb(sk, len, 1, &err);
250 if (unlikely(!skb)) { 239 if (unlikely(!skb)) {
251 err = -EAGAIN; 240 err = -EAGAIN;
@@ -268,15 +257,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
268 skb->destructor = xsk_destruct_skb; 257 skb->destructor = xsk_destruct_skb;
269 258
270 err = dev_direct_xmit(skb, xs->queue_id); 259 err = dev_direct_xmit(skb, xs->queue_id);
260 xskq_discard_desc(xs->tx);
271 /* Ignore NET_XMIT_CN as packet might have been sent */ 261 /* Ignore NET_XMIT_CN as packet might have been sent */
272 if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { 262 if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
273 err = -EAGAIN; 263 /* SKB completed but not sent */
274 /* SKB consumed by dev_direct_xmit() */ 264 err = -EBUSY;
275 goto out; 265 goto out;
276 } 266 }
277 267
278 sent_frame = true; 268 sent_frame = true;
279 xskq_discard_desc(xs->tx);
280 } 269 }
281 270
282out: 271out:
@@ -297,15 +286,18 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
297 return -ENXIO; 286 return -ENXIO;
298 if (unlikely(!(xs->dev->flags & IFF_UP))) 287 if (unlikely(!(xs->dev->flags & IFF_UP)))
299 return -ENETDOWN; 288 return -ENETDOWN;
289 if (unlikely(!xs->tx))
290 return -ENOBUFS;
300 if (need_wait) 291 if (need_wait)
301 return -EOPNOTSUPP; 292 return -EOPNOTSUPP;
302 293
303 return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); 294 return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
304} 295}
305 296
306static __poll_t xsk_poll_mask(struct socket *sock, __poll_t events) 297static unsigned int xsk_poll(struct file *file, struct socket *sock,
298 struct poll_table_struct *wait)
307{ 299{
308 __poll_t mask = datagram_poll_mask(sock, events); 300 unsigned int mask = datagram_poll(file, sock, wait);
309 struct sock *sk = sock->sk; 301 struct sock *sk = sock->sk;
310 struct xdp_sock *xs = xdp_sk(sk); 302 struct xdp_sock *xs = xdp_sk(sk);
311 303
@@ -696,7 +688,7 @@ static const struct proto_ops xsk_proto_ops = {
696 .socketpair = sock_no_socketpair, 688 .socketpair = sock_no_socketpair,
697 .accept = sock_no_accept, 689 .accept = sock_no_accept,
698 .getname = sock_no_getname, 690 .getname = sock_no_getname,
699 .poll_mask = xsk_poll_mask, 691 .poll = xsk_poll,
700 .ioctl = sock_no_ioctl, 692 .ioctl = sock_no_ioctl,
701 .listen = sock_no_listen, 693 .listen = sock_no_listen,
702 .shutdown = sock_no_shutdown, 694 .shutdown = sock_no_shutdown,
@@ -754,6 +746,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
754 746
755 xs = xdp_sk(sk); 747 xs = xdp_sk(sk);
756 mutex_init(&xs->mutex); 748 mutex_init(&xs->mutex);
749 spin_lock_init(&xs->tx_completion_lock);
757 750
758 local_bh_disable(); 751 local_bh_disable();
759 sock_prot_inuse_add(net, &xsk_proto, 1); 752 sock_prot_inuse_add(net, &xsk_proto, 1);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index ef6a6f0ec949..8a64b150be54 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -62,14 +62,9 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
62 return (entries > dcnt) ? dcnt : entries; 62 return (entries > dcnt) ? dcnt : entries;
63} 63}
64 64
65static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer)
66{
67 return q->nentries - (producer - q->cons_tail);
68}
69
70static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) 65static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
71{ 66{
72 u32 free_entries = xskq_nb_free_lazy(q, producer); 67 u32 free_entries = q->nentries - (producer - q->cons_tail);
73 68
74 if (free_entries >= dcnt) 69 if (free_entries >= dcnt)
75 return free_entries; 70 return free_entries;
@@ -129,7 +124,7 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
129{ 124{
130 struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; 125 struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
131 126
132 if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0) 127 if (xskq_nb_free(q, q->prod_tail, 1) == 0)
133 return -ENOSPC; 128 return -ENOSPC;
134 129
135 ring->desc[q->prod_tail++ & q->ring_mask] = addr; 130 ring->desc[q->prod_tail++ & q->ring_mask] = addr;
@@ -255,7 +250,7 @@ static inline bool xskq_full_desc(struct xsk_queue *q)
255 250
256static inline bool xskq_empty_desc(struct xsk_queue *q) 251static inline bool xskq_empty_desc(struct xsk_queue *q)
257{ 252{
258 return xskq_nb_free(q, q->prod_tail, 1) == q->nentries; 253 return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries;
259} 254}
260 255
261void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); 256void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 286ed25c1a69..4a9ee2d83158 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -25,6 +25,14 @@ config XFRM_USER
25 25
26 If unsure, say Y. 26 If unsure, say Y.
27 27
28config XFRM_INTERFACE
29 tristate "Transformation virtual interface"
30 depends on XFRM && IPV6
31 ---help---
32 This provides a virtual interface to route IPsec traffic.
33
34 If unsure, say N.
35
28config XFRM_SUB_POLICY 36config XFRM_SUB_POLICY
29 bool "Transformation sub policy support" 37 bool "Transformation sub policy support"
30 depends on XFRM 38 depends on XFRM
@@ -87,4 +95,3 @@ config NET_KEY_MIGRATE
87 <draft-sugimoto-mip6-pfkey-migrate>. 95 <draft-sugimoto-mip6-pfkey-migrate>.
88 96
89 If unsure, say N. 97 If unsure, say N.
90
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 0bd2465a8c5a..fbc4552d17b8 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
10obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o 10obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
11obj-$(CONFIG_XFRM_USER) += xfrm_user.o 11obj-$(CONFIG_XFRM_USER) += xfrm_user.o
12obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o 12obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
13obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 175941e15a6e..5611b7521020 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -56,7 +56,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
56 if (skb_is_gso(skb)) { 56 if (skb_is_gso(skb)) {
57 struct net_device *dev = skb->dev; 57 struct net_device *dev = skb->dev;
58 58
59 if (unlikely(!x->xso.offload_handle || (x->xso.dev != dev))) { 59 if (unlikely(x->xso.dev != dev)) {
60 struct sk_buff *segs; 60 struct sk_buff *segs;
61 61
62 /* Packet got rerouted, fixup features and segment it. */ 62 /* Packet got rerouted, fixup features and segment it. */
@@ -162,7 +162,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
162 } 162 }
163 163
164 dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, 164 dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr,
165 x->props.family, x->props.output_mark); 165 x->props.family,
166 xfrm_smark_get(0, x));
166 if (IS_ERR(dst)) 167 if (IS_ERR(dst))
167 return 0; 168 return 0;
168 169
@@ -210,8 +211,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
210 if (!x->type_offload || x->encap) 211 if (!x->type_offload || x->encap)
211 return false; 212 return false;
212 213
213 if ((!dev || (x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev))) && 214 if ((!dev || (dev == xfrm_dst_path(dst)->dev)) &&
214 (!xdst->child->xfrm && x->type->get_mtu)) { 215 (!xdst->child->xfrm && x->type->get_mtu)) {
215 mtu = x->type->get_mtu(x, xdst->child_mtu_cached); 216 mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
216 217
217 if (skb->len <= mtu) 218 if (skb->len <= mtu)
@@ -306,12 +307,6 @@ static int xfrm_dev_register(struct net_device *dev)
306 return xfrm_api_check(dev); 307 return xfrm_api_check(dev);
307} 308}
308 309
309static int xfrm_dev_unregister(struct net_device *dev)
310{
311 xfrm_policy_cache_flush();
312 return NOTIFY_DONE;
313}
314
315static int xfrm_dev_feat_change(struct net_device *dev) 310static int xfrm_dev_feat_change(struct net_device *dev)
316{ 311{
317 return xfrm_api_check(dev); 312 return xfrm_api_check(dev);
@@ -322,7 +317,6 @@ static int xfrm_dev_down(struct net_device *dev)
322 if (dev->features & NETIF_F_HW_ESP) 317 if (dev->features & NETIF_F_HW_ESP)
323 xfrm_dev_state_flush(dev_net(dev), dev, true); 318 xfrm_dev_state_flush(dev_net(dev), dev, true);
324 319
325 xfrm_policy_cache_flush();
326 return NOTIFY_DONE; 320 return NOTIFY_DONE;
327} 321}
328 322
@@ -334,9 +328,6 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
334 case NETDEV_REGISTER: 328 case NETDEV_REGISTER:
335 return xfrm_dev_register(dev); 329 return xfrm_dev_register(dev);
336 330
337 case NETDEV_UNREGISTER:
338 return xfrm_dev_unregister(dev);
339
340 case NETDEV_FEAT_CHANGE: 331 case NETDEV_FEAT_CHANGE:
341 return xfrm_dev_feat_change(dev); 332 return xfrm_dev_feat_change(dev);
342 333
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 352abca2605f..b89c9c7f8c5c 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -320,6 +320,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
320 320
321 seq = 0; 321 seq = 0;
322 if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { 322 if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) {
323 secpath_reset(skb);
323 XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); 324 XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
324 goto drop; 325 goto drop;
325 } 326 }
@@ -328,17 +329,21 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
328 XFRM_SPI_SKB_CB(skb)->daddroff); 329 XFRM_SPI_SKB_CB(skb)->daddroff);
329 do { 330 do {
330 if (skb->sp->len == XFRM_MAX_DEPTH) { 331 if (skb->sp->len == XFRM_MAX_DEPTH) {
332 secpath_reset(skb);
331 XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); 333 XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
332 goto drop; 334 goto drop;
333 } 335 }
334 336
335 x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family); 337 x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family);
336 if (x == NULL) { 338 if (x == NULL) {
339 secpath_reset(skb);
337 XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); 340 XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
338 xfrm_audit_state_notfound(skb, family, spi, seq); 341 xfrm_audit_state_notfound(skb, family, spi, seq);
339 goto drop; 342 goto drop;
340 } 343 }
341 344
345 skb->mark = xfrm_smark_get(skb->mark, x);
346
342 skb->sp->xvec[skb->sp->len++] = x; 347 skb->sp->xvec[skb->sp->len++] = x;
343 348
344lock: 349lock:
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
new file mode 100644
index 000000000000..31acc6f33d98
--- /dev/null
+++ b/net/xfrm/xfrm_interface.c
@@ -0,0 +1,975 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * XFRM virtual interface
4 *
5 * Copyright (C) 2018 secunet Security Networks AG
6 *
7 * Author:
8 * Steffen Klassert <steffen.klassert@secunet.com>
9 */
10
11#include <linux/module.h>
12#include <linux/capability.h>
13#include <linux/errno.h>
14#include <linux/types.h>
15#include <linux/sockios.h>
16#include <linux/icmp.h>
17#include <linux/if.h>
18#include <linux/in.h>
19#include <linux/ip.h>
20#include <linux/net.h>
21#include <linux/in6.h>
22#include <linux/netdevice.h>
23#include <linux/if_link.h>
24#include <linux/if_arp.h>
25#include <linux/icmpv6.h>
26#include <linux/init.h>
27#include <linux/route.h>
28#include <linux/rtnetlink.h>
29#include <linux/netfilter_ipv6.h>
30#include <linux/slab.h>
31#include <linux/hash.h>
32
33#include <linux/uaccess.h>
34#include <linux/atomic.h>
35
36#include <net/icmp.h>
37#include <net/ip.h>
38#include <net/ipv6.h>
39#include <net/ip6_route.h>
40#include <net/addrconf.h>
41#include <net/xfrm.h>
42#include <net/net_namespace.h>
43#include <net/netns/generic.h>
44#include <linux/etherdevice.h>
45
46static int xfrmi_dev_init(struct net_device *dev);
47static void xfrmi_dev_setup(struct net_device *dev);
48static struct rtnl_link_ops xfrmi_link_ops __read_mostly;
49static unsigned int xfrmi_net_id __read_mostly;
50
51struct xfrmi_net {
52 /* lists for storing interfaces in use */
53 struct xfrm_if __rcu *xfrmi[1];
54};
55
56#define for_each_xfrmi_rcu(start, xi) \
57 for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next))
58
59static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x)
60{
61 struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
62 struct xfrm_if *xi;
63
64 for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) {
65 if (x->if_id == xi->p.if_id &&
66 (xi->dev->flags & IFF_UP))
67 return xi;
68 }
69
70 return NULL;
71}
72
73static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb)
74{
75 struct xfrmi_net *xfrmn;
76 int ifindex;
77 struct xfrm_if *xi;
78
79 if (!skb->dev)
80 return NULL;
81
82 xfrmn = net_generic(dev_net(skb->dev), xfrmi_net_id);
83 ifindex = skb->dev->ifindex;
84
85 for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) {
86 if (ifindex == xi->dev->ifindex &&
87 (xi->dev->flags & IFF_UP))
88 return xi;
89 }
90
91 return NULL;
92}
93
94static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
95{
96 struct xfrm_if __rcu **xip = &xfrmn->xfrmi[0];
97
 98	rcu_assign_pointer(xi->next, rtnl_dereference(*xip));
99 rcu_assign_pointer(*xip, xi);
100}
101
102static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
103{
104 struct xfrm_if __rcu **xip;
105 struct xfrm_if *iter;
106
107 for (xip = &xfrmn->xfrmi[0];
108 (iter = rtnl_dereference(*xip)) != NULL;
109 xip = &iter->next) {
110 if (xi == iter) {
111 rcu_assign_pointer(*xip, xi->next);
112 break;
113 }
114 }
115}
116
117static void xfrmi_dev_free(struct net_device *dev)
118{
119 free_percpu(dev->tstats);
120}
121
122static int xfrmi_create2(struct net_device *dev)
123{
124 struct xfrm_if *xi = netdev_priv(dev);
125 struct net *net = dev_net(dev);
126 struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
127 int err;
128
129 dev->rtnl_link_ops = &xfrmi_link_ops;
130 err = register_netdevice(dev);
131 if (err < 0)
132 goto out;
133
134 strcpy(xi->p.name, dev->name);
135
136 dev_hold(dev);
137 xfrmi_link(xfrmn, xi);
138
139 return 0;
140
141out:
142 return err;
143}
144
145static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p)
146{
147 struct net_device *dev;
148 struct xfrm_if *xi;
149 char name[IFNAMSIZ];
150 int err;
151
152 if (p->name[0]) {
153 strlcpy(name, p->name, IFNAMSIZ);
154 } else {
155 err = -EINVAL;
156 goto failed;
157 }
158
159 dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup);
160 if (!dev) {
161 err = -EAGAIN;
162 goto failed;
163 }
164
165 dev_net_set(dev, net);
166
167 xi = netdev_priv(dev);
168 xi->p = *p;
169 xi->net = net;
170 xi->dev = dev;
171 xi->phydev = dev_get_by_index(net, p->link);
172 if (!xi->phydev) {
173 err = -ENODEV;
174 goto failed_free;
175 }
176
177 err = xfrmi_create2(dev);
178 if (err < 0)
179 goto failed_dev_put;
180
181 return xi;
182
183failed_dev_put:
184 dev_put(xi->phydev);
185failed_free:
186 free_netdev(dev);
187failed:
188 return ERR_PTR(err);
189}
190
191static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p,
192 int create)
193{
194 struct xfrm_if __rcu **xip;
195 struct xfrm_if *xi;
196 struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
197
198 for (xip = &xfrmn->xfrmi[0];
199 (xi = rtnl_dereference(*xip)) != NULL;
200 xip = &xi->next) {
201 if (xi->p.if_id == p->if_id) {
202 if (create)
203 return ERR_PTR(-EEXIST);
204
205 return xi;
206 }
207 }
208 if (!create)
209 return ERR_PTR(-ENODEV);
210 return xfrmi_create(net, p);
211}
212
213static void xfrmi_dev_uninit(struct net_device *dev)
214{
215 struct xfrm_if *xi = netdev_priv(dev);
216 struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id);
217
218 xfrmi_unlink(xfrmn, xi);
219 dev_put(xi->phydev);
220 dev_put(dev);
221}
222
223static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
224{
225 skb->tstamp = 0;
226 skb->pkt_type = PACKET_HOST;
227 skb->skb_iif = 0;
228 skb->ignore_df = 0;
229 skb_dst_drop(skb);
230 nf_reset(skb);
231 nf_reset_trace(skb);
232
233 if (!xnet)
234 return;
235
236 ipvs_reset(skb);
237 secpath_reset(skb);
238 skb_orphan(skb);
239 skb->mark = 0;
240}
241
242static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
243{
244 struct pcpu_sw_netstats *tstats;
245 struct xfrm_mode *inner_mode;
246 struct net_device *dev;
247 struct xfrm_state *x;
248 struct xfrm_if *xi;
249 bool xnet;
250
251 if (err && !skb->sp)
252 return 0;
253
254 x = xfrm_input_state(skb);
255
256 xi = xfrmi_lookup(xs_net(x), x);
257 if (!xi)
258 return 1;
259
260 dev = xi->dev;
261 skb->dev = dev;
262
263 if (err) {
264 dev->stats.rx_errors++;
265 dev->stats.rx_dropped++;
266
267 return 0;
268 }
269
270 xnet = !net_eq(xi->net, dev_net(skb->dev));
271
272 if (xnet) {
273 inner_mode = x->inner_mode;
274
275 if (x->sel.family == AF_UNSPEC) {
276 inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
277 if (inner_mode == NULL) {
278 XFRM_INC_STATS(dev_net(skb->dev),
279 LINUX_MIB_XFRMINSTATEMODEERROR);
280 return -EINVAL;
281 }
282 }
283
284 if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb,
285 inner_mode->afinfo->family))
286 return -EPERM;
287 }
288
289 xfrmi_scrub_packet(skb, xnet);
290
291 tstats = this_cpu_ptr(dev->tstats);
292
293 u64_stats_update_begin(&tstats->syncp);
294 tstats->rx_packets++;
295 tstats->rx_bytes += skb->len;
296 u64_stats_update_end(&tstats->syncp);
297
298 return 0;
299}
300
301static int
302xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
303{
304 struct xfrm_if *xi = netdev_priv(dev);
305 struct net_device_stats *stats = &xi->dev->stats;
306 struct dst_entry *dst = skb_dst(skb);
307 unsigned int length = skb->len;
308 struct net_device *tdev;
309 struct xfrm_state *x;
310 int err = -1;
311 int mtu;
312
313 if (!dst)
314 goto tx_err_link_failure;
315
316 dst_hold(dst);
317 dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id);
318 if (IS_ERR(dst)) {
319 err = PTR_ERR(dst);
320 dst = NULL;
321 goto tx_err_link_failure;
322 }
323
324 x = dst->xfrm;
325 if (!x)
326 goto tx_err_link_failure;
327
328 if (x->if_id != xi->p.if_id)
329 goto tx_err_link_failure;
330
331 tdev = dst->dev;
332
333 if (tdev == dev) {
334 stats->collisions++;
335 net_warn_ratelimited("%s: Local routing loop detected!\n",
336 xi->p.name);
337 goto tx_err_dst_release;
338 }
339
340 mtu = dst_mtu(dst);
341 if (!skb->ignore_df && skb->len > mtu) {
342 skb_dst_update_pmtu(skb, mtu);
343
344 if (skb->protocol == htons(ETH_P_IPV6)) {
345 if (mtu < IPV6_MIN_MTU)
346 mtu = IPV6_MIN_MTU;
347
348 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
349 } else {
350 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
351 htonl(mtu));
352 }
353
354 dst_release(dst);
355 return -EMSGSIZE;
356 }
357
358 xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev)));
359 skb_dst_set(skb, dst);
360 skb->dev = tdev;
361
362 err = dst_output(xi->net, skb->sk, skb);
363 if (net_xmit_eval(err) == 0) {
364 struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
365
366 u64_stats_update_begin(&tstats->syncp);
367 tstats->tx_bytes += length;
368 tstats->tx_packets++;
369 u64_stats_update_end(&tstats->syncp);
370 } else {
371 stats->tx_errors++;
372 stats->tx_aborted_errors++;
373 }
374
375 return 0;
376tx_err_link_failure:
377 stats->tx_carrier_errors++;
378 dst_link_failure(skb);
379tx_err_dst_release:
380 dst_release(dst);
381 return err;
382}
383
384static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev)
385{
386 struct xfrm_if *xi = netdev_priv(dev);
387 struct net_device_stats *stats = &xi->dev->stats;
388 struct flowi fl;
389 int ret;
390
391 memset(&fl, 0, sizeof(fl));
392
393 switch (skb->protocol) {
394 case htons(ETH_P_IPV6):
395 xfrm_decode_session(skb, &fl, AF_INET6);
396 memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
397 break;
398 case htons(ETH_P_IP):
399 xfrm_decode_session(skb, &fl, AF_INET);
400 memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
401 break;
402 default:
403 goto tx_err;
404 }
405
406 fl.flowi_oif = xi->phydev->ifindex;
407
408 ret = xfrmi_xmit2(skb, dev, &fl);
409 if (ret < 0)
410 goto tx_err;
411
412 return NETDEV_TX_OK;
413
414tx_err:
415 stats->tx_errors++;
416 stats->tx_dropped++;
417 kfree_skb(skb);
418 return NETDEV_TX_OK;
419}
420
421static int xfrmi4_err(struct sk_buff *skb, u32 info)
422{
423 const struct iphdr *iph = (const struct iphdr *)skb->data;
424 struct net *net = dev_net(skb->dev);
425 int protocol = iph->protocol;
426 struct ip_comp_hdr *ipch;
427 struct ip_esp_hdr *esph;
428	struct ip_auth_hdr *ah;
429 struct xfrm_state *x;
430 struct xfrm_if *xi;
431 __be32 spi;
432
433 switch (protocol) {
434 case IPPROTO_ESP:
435 esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
436 spi = esph->spi;
437 break;
438 case IPPROTO_AH:
439 ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
440 spi = ah->spi;
441 break;
442 case IPPROTO_COMP:
443 ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
444 spi = htonl(ntohs(ipch->cpi));
445 break;
446 default:
447 return 0;
448 }
449
450 switch (icmp_hdr(skb)->type) {
451 case ICMP_DEST_UNREACH:
452 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
453 return 0;
454 case ICMP_REDIRECT:
455 break;
456 default:
457 return 0;
458 }
459
460 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
461 spi, protocol, AF_INET);
462 if (!x)
463 return 0;
464
465 xi = xfrmi_lookup(net, x);
466 if (!xi) {
467 xfrm_state_put(x);
468 return -1;
469 }
470
471 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
472 ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
473 else
474 ipv4_redirect(skb, net, 0, 0, protocol, 0);
475 xfrm_state_put(x);
476
477 return 0;
478}
479
480static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
481 u8 type, u8 code, int offset, __be32 info)
482{
483 const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
484 struct net *net = dev_net(skb->dev);
485 int protocol = iph->nexthdr;
486 struct ip_comp_hdr *ipch;
487 struct ip_esp_hdr *esph;
488 struct ip_auth_hdr *ah;
489 struct xfrm_state *x;
490 struct xfrm_if *xi;
491 __be32 spi;
492
493 switch (protocol) {
494 case IPPROTO_ESP:
495 esph = (struct ip_esp_hdr *)(skb->data + offset);
496 spi = esph->spi;
497 break;
498 case IPPROTO_AH:
499 ah = (struct ip_auth_hdr *)(skb->data + offset);
500 spi = ah->spi;
501 break;
502 case IPPROTO_COMP:
503 ipch = (struct ip_comp_hdr *)(skb->data + offset);
504 spi = htonl(ntohs(ipch->cpi));
505 break;
506 default:
507 return 0;
508 }
509
510 if (type != ICMPV6_PKT_TOOBIG &&
511 type != NDISC_REDIRECT)
512 return 0;
513
514 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
515 spi, protocol, AF_INET6);
516 if (!x)
517 return 0;
518
519 xi = xfrmi_lookup(net, x);
520 if (!xi) {
521 xfrm_state_put(x);
522 return -1;
523 }
524
525 if (type == NDISC_REDIRECT)
526 ip6_redirect(skb, net, skb->dev->ifindex, 0,
527 sock_net_uid(net, NULL));
528 else
529 ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
530 xfrm_state_put(x);
531
532 return 0;
533}
534
535static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p)
536{
537 if (xi->p.link != p->link)
538 return -EINVAL;
539
540 xi->p.if_id = p->if_id;
541
542 return 0;
543}
544
545static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p)
546{
547 struct net *net = dev_net(xi->dev);
548 struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
549 int err;
550
551 xfrmi_unlink(xfrmn, xi);
552 synchronize_net();
553 err = xfrmi_change(xi, p);
554 xfrmi_link(xfrmn, xi);
555 netdev_state_change(xi->dev);
556 return err;
557}
558
559static void xfrmi_get_stats64(struct net_device *dev,
560 struct rtnl_link_stats64 *s)
561{
562 int cpu;
563
564 if (!dev->tstats)
565 return;
566
567 for_each_possible_cpu(cpu) {
568 struct pcpu_sw_netstats *stats;
569 struct pcpu_sw_netstats tmp;
570 int start;
571
572 stats = per_cpu_ptr(dev->tstats, cpu);
573 do {
574 start = u64_stats_fetch_begin_irq(&stats->syncp);
575 tmp.rx_packets = stats->rx_packets;
576 tmp.rx_bytes = stats->rx_bytes;
577 tmp.tx_packets = stats->tx_packets;
578 tmp.tx_bytes = stats->tx_bytes;
579 } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
580
581 s->rx_packets += tmp.rx_packets;
582 s->rx_bytes += tmp.rx_bytes;
583 s->tx_packets += tmp.tx_packets;
584 s->tx_bytes += tmp.tx_bytes;
585 }
586
587 s->rx_dropped = dev->stats.rx_dropped;
588 s->tx_dropped = dev->stats.tx_dropped;
589}
590
591static int xfrmi_get_iflink(const struct net_device *dev)
592{
593 struct xfrm_if *xi = netdev_priv(dev);
594
595 return xi->phydev->ifindex;
596}
597
598
599static const struct net_device_ops xfrmi_netdev_ops = {
600 .ndo_init = xfrmi_dev_init,
601 .ndo_uninit = xfrmi_dev_uninit,
602 .ndo_start_xmit = xfrmi_xmit,
603 .ndo_get_stats64 = xfrmi_get_stats64,
604 .ndo_get_iflink = xfrmi_get_iflink,
605};
606
607static void xfrmi_dev_setup(struct net_device *dev)
608{
609 dev->netdev_ops = &xfrmi_netdev_ops;
610 dev->type = ARPHRD_NONE;
611 dev->hard_header_len = ETH_HLEN;
612 dev->min_header_len = ETH_HLEN;
613 dev->mtu = ETH_DATA_LEN;
614 dev->min_mtu = ETH_MIN_MTU;
615 dev->max_mtu = ETH_DATA_LEN;
616 dev->addr_len = ETH_ALEN;
617 dev->flags = IFF_NOARP;
618 dev->needs_free_netdev = true;
619 dev->priv_destructor = xfrmi_dev_free;
620 netif_keep_dst(dev);
621}
622
623static int xfrmi_dev_init(struct net_device *dev)
624{
625 struct xfrm_if *xi = netdev_priv(dev);
626 struct net_device *phydev = xi->phydev;
627 int err;
628
629 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
630 if (!dev->tstats)
631 return -ENOMEM;
632
633 err = gro_cells_init(&xi->gro_cells, dev);
634 if (err) {
635 free_percpu(dev->tstats);
636 return err;
637 }
638
639 dev->features |= NETIF_F_LLTX;
640
641 dev->needed_headroom = phydev->needed_headroom;
642 dev->needed_tailroom = phydev->needed_tailroom;
643
644 if (is_zero_ether_addr(dev->dev_addr))
645 eth_hw_addr_inherit(dev, phydev);
646 if (is_zero_ether_addr(dev->broadcast))
647 memcpy(dev->broadcast, phydev->broadcast, dev->addr_len);
648
649 return 0;
650}
651
652static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[],
653 struct netlink_ext_ack *extack)
654{
655 return 0;
656}
657
658static void xfrmi_netlink_parms(struct nlattr *data[],
659 struct xfrm_if_parms *parms)
660{
661 memset(parms, 0, sizeof(*parms));
662
663 if (!data)
664 return;
665
666 if (data[IFLA_XFRM_LINK])
667 parms->link = nla_get_u32(data[IFLA_XFRM_LINK]);
668
669 if (data[IFLA_XFRM_IF_ID])
670 parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]);
671}
672
673static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
674 struct nlattr *tb[], struct nlattr *data[],
675 struct netlink_ext_ack *extack)
676{
677 struct net *net = dev_net(dev);
678 struct xfrm_if_parms *p;
679 struct xfrm_if *xi;
680
681 xi = netdev_priv(dev);
682 p = &xi->p;
683
684 xfrmi_netlink_parms(data, p);
685
686 if (!tb[IFLA_IFNAME])
687 return -EINVAL;
688
689 nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ);
690
691 xi = xfrmi_locate(net, p, 1);
692 return PTR_ERR_OR_ZERO(xi);
693}
694
695static void xfrmi_dellink(struct net_device *dev, struct list_head *head)
696{
697 unregister_netdevice_queue(dev, head);
698}
699
700static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
701 struct nlattr *data[],
702 struct netlink_ext_ack *extack)
703{
704 struct xfrm_if *xi = netdev_priv(dev);
705 struct net *net = dev_net(dev);
706
707 xfrmi_netlink_parms(data, &xi->p);
708
709 xi = xfrmi_locate(net, &xi->p, 0);
710
711 if (IS_ERR_OR_NULL(xi)) {
712 xi = netdev_priv(dev);
713 } else {
714 if (xi->dev != dev)
715 return -EEXIST;
716 }
717
718 return xfrmi_update(xi, &xi->p);
719}
720
721static size_t xfrmi_get_size(const struct net_device *dev)
722{
723 return
724 /* IFLA_XFRM_LINK */
725 nla_total_size(4) +
726 /* IFLA_XFRM_IF_ID */
727 nla_total_size(4) +
728 0;
729}
730
731static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev)
732{
733 struct xfrm_if *xi = netdev_priv(dev);
734 struct xfrm_if_parms *parm = &xi->p;
735
736 if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) ||
737 nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id))
738 goto nla_put_failure;
739 return 0;
740
741nla_put_failure:
742 return -EMSGSIZE;
743}
744
745struct net *xfrmi_get_link_net(const struct net_device *dev)
746{
747 struct xfrm_if *xi = netdev_priv(dev);
748
749 return dev_net(xi->phydev);
750}
751
752static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = {
753 [IFLA_XFRM_LINK] = { .type = NLA_U32 },
754 [IFLA_XFRM_IF_ID] = { .type = NLA_U32 },
755};
756
757static struct rtnl_link_ops xfrmi_link_ops __read_mostly = {
758 .kind = "xfrm",
759 .maxtype = IFLA_XFRM_MAX,
760 .policy = xfrmi_policy,
761 .priv_size = sizeof(struct xfrm_if),
762 .setup = xfrmi_dev_setup,
763 .validate = xfrmi_validate,
764 .newlink = xfrmi_newlink,
765 .dellink = xfrmi_dellink,
766 .changelink = xfrmi_changelink,
767 .get_size = xfrmi_get_size,
768 .fill_info = xfrmi_fill_info,
769 .get_link_net = xfrmi_get_link_net,
770};
771
772static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn)
773{
774 struct xfrm_if *xi;
775 LIST_HEAD(list);
776
777 xi = rtnl_dereference(xfrmn->xfrmi[0]);
778 if (!xi)
779 return;
780
781 unregister_netdevice_queue(xi->dev, &list);
782 unregister_netdevice_many(&list);
783}
784
785static int __net_init xfrmi_init_net(struct net *net)
786{
787 return 0;
788}
789
790static void __net_exit xfrmi_exit_net(struct net *net)
791{
792 struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
793
794 rtnl_lock();
795 xfrmi_destroy_interfaces(xfrmn);
796 rtnl_unlock();
797}
798
799static struct pernet_operations xfrmi_net_ops = {
800 .init = xfrmi_init_net,
801 .exit = xfrmi_exit_net,
802 .id = &xfrmi_net_id,
803 .size = sizeof(struct xfrmi_net),
804};
805
806static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = {
807 .handler = xfrm6_rcv,
808 .cb_handler = xfrmi_rcv_cb,
809 .err_handler = xfrmi6_err,
810 .priority = 10,
811};
812
813static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = {
814 .handler = xfrm6_rcv,
815 .cb_handler = xfrmi_rcv_cb,
816 .err_handler = xfrmi6_err,
817 .priority = 10,
818};
819
820static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = {
821 .handler = xfrm6_rcv,
822 .cb_handler = xfrmi_rcv_cb,
823 .err_handler = xfrmi6_err,
824 .priority = 10,
825};
826
827static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = {
828 .handler = xfrm4_rcv,
829 .input_handler = xfrm_input,
830 .cb_handler = xfrmi_rcv_cb,
831 .err_handler = xfrmi4_err,
832 .priority = 10,
833};
834
835static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = {
836 .handler = xfrm4_rcv,
837 .input_handler = xfrm_input,
838 .cb_handler = xfrmi_rcv_cb,
839 .err_handler = xfrmi4_err,
840 .priority = 10,
841};
842
843static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = {
844 .handler = xfrm4_rcv,
845 .input_handler = xfrm_input,
846 .cb_handler = xfrmi_rcv_cb,
847 .err_handler = xfrmi4_err,
848 .priority = 10,
849};
850
851static int __init xfrmi4_init(void)
852{
853 int err;
854
855 err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP);
856 if (err < 0)
857 goto xfrm_proto_esp_failed;
858 err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH);
859 if (err < 0)
860 goto xfrm_proto_ah_failed;
861 err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
862 if (err < 0)
863 goto xfrm_proto_comp_failed;
864
865 return 0;
866
867xfrm_proto_comp_failed:
868 xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
869xfrm_proto_ah_failed:
870 xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
871xfrm_proto_esp_failed:
872 return err;
873}
874
875static void xfrmi4_fini(void)
876{
877 xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
878 xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
879 xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
880}
881
882static int __init xfrmi6_init(void)
883{
884 int err;
885
886 err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP);
887 if (err < 0)
888 goto xfrm_proto_esp_failed;
889 err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH);
890 if (err < 0)
891 goto xfrm_proto_ah_failed;
892 err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
893 if (err < 0)
894 goto xfrm_proto_comp_failed;
895
896 return 0;
897
898xfrm_proto_comp_failed:
899 xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
900xfrm_proto_ah_failed:
901 xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
902xfrm_proto_esp_failed:
903 return err;
904}
905
906static void xfrmi6_fini(void)
907{
908 xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
909 xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
910 xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
911}
912
913static const struct xfrm_if_cb xfrm_if_cb = {
914 .decode_session = xfrmi_decode_session,
915};
916
917static int __init xfrmi_init(void)
918{
919 const char *msg;
920 int err;
921
922 pr_info("IPsec XFRM device driver\n");
923
924 msg = "tunnel device";
925 err = register_pernet_device(&xfrmi_net_ops);
926 if (err < 0)
927 goto pernet_dev_failed;
928
929 msg = "xfrm4 protocols";
930 err = xfrmi4_init();
931 if (err < 0)
932 goto xfrmi4_failed;
933
934 msg = "xfrm6 protocols";
935 err = xfrmi6_init();
936 if (err < 0)
937 goto xfrmi6_failed;
938
939
940 msg = "netlink interface";
941 err = rtnl_link_register(&xfrmi_link_ops);
942 if (err < 0)
943 goto rtnl_link_failed;
944
945 xfrm_if_register_cb(&xfrm_if_cb);
946
947 return err;
948
949rtnl_link_failed:
950 xfrmi6_fini();
951xfrmi6_failed:
952 xfrmi4_fini();
953xfrmi4_failed:
954 unregister_pernet_device(&xfrmi_net_ops);
955pernet_dev_failed:
956 pr_err("xfrmi init: failed to register %s\n", msg);
957 return err;
958}
959
960static void __exit xfrmi_fini(void)
961{
962 xfrm_if_unregister_cb();
963 rtnl_link_unregister(&xfrmi_link_ops);
964 xfrmi4_fini();
965 xfrmi6_fini();
966 unregister_pernet_device(&xfrmi_net_ops);
967}
968
969module_init(xfrmi_init);
970module_exit(xfrmi_fini);
971MODULE_LICENSE("GPL");
972MODULE_ALIAS_RTNL_LINK("xfrm");
973MODULE_ALIAS_NETDEV("xfrm0");
974MODULE_AUTHOR("Steffen Klassert");
975MODULE_DESCRIPTION("XFRM virtual interface");
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 89b178a78dc7..45ba07ab3e4f 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -66,8 +66,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
66 goto error_nolock; 66 goto error_nolock;
67 } 67 }
68 68
69 if (x->props.output_mark) 69 skb->mark = xfrm_smark_get(skb->mark, x);
70 skb->mark = x->props.output_mark;
71 70
72 err = x->outer_mode->output(x, skb); 71 err = x->outer_mode->output(x, skb);
73 if (err) { 72 if (err) {
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 5f48251c1319..3110c3fbee20 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,8 +45,9 @@ struct xfrm_flo {
45 u8 flags; 45 u8 flags;
46}; 46};
47 47
48static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst); 48static DEFINE_SPINLOCK(xfrm_if_cb_lock);
49static struct work_struct *xfrm_pcpu_work __read_mostly; 49static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
50
50static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); 51static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
51static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] 52static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
52 __read_mostly; 53 __read_mostly;
@@ -119,6 +120,12 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa
119 return afinfo; 120 return afinfo;
120} 121}
121 122
123/* Called with rcu_read_lock(). */
124static const struct xfrm_if_cb *xfrm_if_get_cb(void)
125{
126 return rcu_dereference(xfrm_if_cb);
127}
128
122struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, 129struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
123 const xfrm_address_t *saddr, 130 const xfrm_address_t *saddr,
124 const xfrm_address_t *daddr, 131 const xfrm_address_t *daddr,
@@ -182,8 +189,8 @@ static inline unsigned long make_jiffies(long secs)
182static void xfrm_policy_timer(struct timer_list *t) 189static void xfrm_policy_timer(struct timer_list *t)
183{ 190{
184 struct xfrm_policy *xp = from_timer(xp, t, timer); 191 struct xfrm_policy *xp = from_timer(xp, t, timer);
185 unsigned long now = get_seconds(); 192 time64_t now = ktime_get_real_seconds();
186 long next = LONG_MAX; 193 time64_t next = TIME64_MAX;
187 int warn = 0; 194 int warn = 0;
188 int dir; 195 int dir;
189 196
@@ -195,7 +202,7 @@ static void xfrm_policy_timer(struct timer_list *t)
195 dir = xfrm_policy_id2dir(xp->index); 202 dir = xfrm_policy_id2dir(xp->index);
196 203
197 if (xp->lft.hard_add_expires_seconds) { 204 if (xp->lft.hard_add_expires_seconds) {
198 long tmo = xp->lft.hard_add_expires_seconds + 205 time64_t tmo = xp->lft.hard_add_expires_seconds +
199 xp->curlft.add_time - now; 206 xp->curlft.add_time - now;
200 if (tmo <= 0) 207 if (tmo <= 0)
201 goto expired; 208 goto expired;
@@ -203,7 +210,7 @@ static void xfrm_policy_timer(struct timer_list *t)
203 next = tmo; 210 next = tmo;
204 } 211 }
205 if (xp->lft.hard_use_expires_seconds) { 212 if (xp->lft.hard_use_expires_seconds) {
206 long tmo = xp->lft.hard_use_expires_seconds + 213 time64_t tmo = xp->lft.hard_use_expires_seconds +
207 (xp->curlft.use_time ? : xp->curlft.add_time) - now; 214 (xp->curlft.use_time ? : xp->curlft.add_time) - now;
208 if (tmo <= 0) 215 if (tmo <= 0)
209 goto expired; 216 goto expired;
@@ -211,7 +218,7 @@ static void xfrm_policy_timer(struct timer_list *t)
211 next = tmo; 218 next = tmo;
212 } 219 }
213 if (xp->lft.soft_add_expires_seconds) { 220 if (xp->lft.soft_add_expires_seconds) {
214 long tmo = xp->lft.soft_add_expires_seconds + 221 time64_t tmo = xp->lft.soft_add_expires_seconds +
215 xp->curlft.add_time - now; 222 xp->curlft.add_time - now;
216 if (tmo <= 0) { 223 if (tmo <= 0) {
217 warn = 1; 224 warn = 1;
@@ -221,7 +228,7 @@ static void xfrm_policy_timer(struct timer_list *t)
221 next = tmo; 228 next = tmo;
222 } 229 }
223 if (xp->lft.soft_use_expires_seconds) { 230 if (xp->lft.soft_use_expires_seconds) {
224 long tmo = xp->lft.soft_use_expires_seconds + 231 time64_t tmo = xp->lft.soft_use_expires_seconds +
225 (xp->curlft.use_time ? : xp->curlft.add_time) - now; 232 (xp->curlft.use_time ? : xp->curlft.add_time) - now;
226 if (tmo <= 0) { 233 if (tmo <= 0) {
227 warn = 1; 234 warn = 1;
@@ -233,7 +240,7 @@ static void xfrm_policy_timer(struct timer_list *t)
233 240
234 if (warn) 241 if (warn)
235 km_policy_expired(xp, dir, 0, 0); 242 km_policy_expired(xp, dir, 0, 0);
236 if (next != LONG_MAX && 243 if (next != TIME64_MAX &&
237 !mod_timer(&xp->timer, jiffies + make_jiffies(next))) 244 !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
238 xfrm_pol_hold(xp); 245 xfrm_pol_hold(xp);
239 246
@@ -747,6 +754,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
747 newpos = NULL; 754 newpos = NULL;
748 hlist_for_each_entry(pol, chain, bydst) { 755 hlist_for_each_entry(pol, chain, bydst) {
749 if (pol->type == policy->type && 756 if (pol->type == policy->type &&
757 pol->if_id == policy->if_id &&
750 !selector_cmp(&pol->selector, &policy->selector) && 758 !selector_cmp(&pol->selector, &policy->selector) &&
 		    xfrm_policy_mark_match(policy, pol) &&
 		    xfrm_sec_ctx_match(pol->security, policy->security) &&
@@ -783,7 +791,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	}
 	policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
 	hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
-	policy->curlft.add_time = get_seconds();
+	policy->curlft.add_time = ktime_get_real_seconds();
 	policy->curlft.use_time = 0;
 	if (!mod_timer(&policy->timer, jiffies + HZ))
 		xfrm_pol_hold(policy);
@@ -798,8 +806,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 }
 EXPORT_SYMBOL(xfrm_policy_insert);
 
-struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
-					  int dir, struct xfrm_selector *sel,
+struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
+					  u8 type, int dir,
+					  struct xfrm_selector *sel,
 					  struct xfrm_sec_ctx *ctx, int delete,
 					  int *err)
 {
@@ -812,6 +821,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	ret = NULL;
 	hlist_for_each_entry(pol, chain, bydst) {
 		if (pol->type == type &&
+		    pol->if_id == if_id &&
 		    (mark & pol->mark.m) == pol->mark.v &&
 		    !selector_cmp(sel, &pol->selector) &&
 		    xfrm_sec_ctx_match(ctx, pol->security)) {
@@ -837,8 +847,9 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
 
-struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
-				     int dir, u32 id, int delete, int *err)
+struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id,
+				     u8 type, int dir, u32 id, int delete,
+				     int *err)
 {
 	struct xfrm_policy *pol, *ret;
 	struct hlist_head *chain;
@@ -853,6 +864,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	ret = NULL;
 	hlist_for_each_entry(pol, chain, byidx) {
 		if (pol->type == type && pol->index == id &&
+		    pol->if_id == if_id &&
 		    (mark & pol->mark.m) == pol->mark.v) {
 			xfrm_pol_hold(pol);
 			if (delete) {
@@ -1056,13 +1068,14 @@ EXPORT_SYMBOL(xfrm_policy_walk_done);
  */
 static int xfrm_policy_match(const struct xfrm_policy *pol,
 			     const struct flowi *fl,
-			     u8 type, u16 family, int dir)
+			     u8 type, u16 family, int dir, u32 if_id)
 {
 	const struct xfrm_selector *sel = &pol->selector;
 	int ret = -ESRCH;
 	bool match;
 
 	if (pol->family != family ||
+	    pol->if_id != if_id ||
 	    (fl->flowi_mark & pol->mark.m) != pol->mark.v ||
 	    pol->type != type)
 		return ret;
@@ -1077,7 +1090,8 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
 
 static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 						     const struct flowi *fl,
-						     u16 family, u8 dir)
+						     u16 family, u8 dir,
+						     u32 if_id)
 {
 	int err;
 	struct xfrm_policy *pol, *ret;
@@ -1101,7 +1115,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 	priority = ~0U;
 	ret = NULL;
 	hlist_for_each_entry_rcu(pol, chain, bydst) {
-		err = xfrm_policy_match(pol, fl, type, family, dir);
+		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
 		if (err) {
 			if (err == -ESRCH)
 				continue;
@@ -1120,7 +1134,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 		if ((pol->priority >= priority) && ret)
 			break;
 
-		err = xfrm_policy_match(pol, fl, type, family, dir);
+		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
 		if (err) {
 			if (err == -ESRCH)
 				continue;
@@ -1145,21 +1159,25 @@ fail:
 	return ret;
 }
 
-static struct xfrm_policy *
-xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
+static struct xfrm_policy *xfrm_policy_lookup(struct net *net,
+					      const struct flowi *fl,
+					      u16 family, u8 dir, u32 if_id)
 {
 #ifdef CONFIG_XFRM_SUB_POLICY
 	struct xfrm_policy *pol;
 
-	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
+	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family,
+					dir, if_id);
 	if (pol != NULL)
 		return pol;
 #endif
-	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
+	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family,
+					 dir, if_id);
 }
 
 static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
-						 const struct flowi *fl, u16 family)
+						 const struct flowi *fl,
+						 u16 family, u32 if_id)
 {
 	struct xfrm_policy *pol;
 
@@ -1177,7 +1195,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 
 		match = xfrm_selector_match(&pol->selector, fl, family);
 		if (match) {
-			if ((sk->sk_mark & pol->mark.m) != pol->mark.v) {
+			if ((sk->sk_mark & pol->mark.m) != pol->mark.v ||
+			    pol->if_id != if_id) {
 				pol = NULL;
 				goto out;
 			}
@@ -1268,7 +1287,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 	old_pol = rcu_dereference_protected(sk->sk_policy[dir],
 				lockdep_is_held(&net->xfrm.xfrm_policy_lock));
 	if (pol) {
-		pol->curlft.add_time = get_seconds();
+		pol->curlft.add_time = ktime_get_real_seconds();
 		pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
 		xfrm_sk_policy_link(pol, dir);
 	}
@@ -1305,6 +1324,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
 		newp->lft = old->lft;
 		newp->curlft = old->curlft;
 		newp->mark = old->mark;
+		newp->if_id = old->if_id;
 		newp->action = old->action;
 		newp->flags = old->flags;
 		newp->xfrm_nr = old->xfrm_nr;
@@ -1390,7 +1410,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
 			}
 		}
 
-		x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);
+		x = xfrm_state_find(remote, local, fl, tmpl, policy, &error,
+				    family, policy->if_id);
 
 		if (x && x->km.state == XFRM_STATE_VALID) {
 			xfrm[nx++] = x;
@@ -1607,10 +1628,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		dst_copy_metrics(dst1, dst);
 
 		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
+			__u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
+
 			family = xfrm[i]->props.family;
 			dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
-					      &saddr, &daddr, family,
-					      xfrm[i]->props.output_mark);
+					      &saddr, &daddr, family, mark);
 			err = PTR_ERR(dst);
 			if (IS_ERR(dst))
 				goto put_states;
@@ -1692,7 +1714,8 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family,
 		pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
 						    XFRM_POLICY_TYPE_MAIN,
 						    fl, family,
-						    XFRM_POLICY_OUT);
+						    XFRM_POLICY_OUT,
+						    pols[0]->if_id);
 		if (pols[1]) {
 			if (IS_ERR(pols[1])) {
 				xfrm_pols_put(pols, *num_pols);
@@ -1714,108 +1737,6 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family,
 
 }
 
-static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old)
-{
-	this_cpu_write(xfrm_last_dst, xdst);
-	if (old)
-		dst_release(&old->u.dst);
-}
-
-static void __xfrm_pcpu_work_fn(void)
-{
-	struct xfrm_dst *old;
-
-	old = this_cpu_read(xfrm_last_dst);
-	if (old && !xfrm_bundle_ok(old))
-		xfrm_last_dst_update(NULL, old);
-}
-
-static void xfrm_pcpu_work_fn(struct work_struct *work)
-{
-	local_bh_disable();
-	rcu_read_lock();
-	__xfrm_pcpu_work_fn();
-	rcu_read_unlock();
-	local_bh_enable();
-}
-
-void xfrm_policy_cache_flush(void)
-{
-	struct xfrm_dst *old;
-	bool found = false;
-	int cpu;
-
-	might_sleep();
-
-	local_bh_disable();
-	rcu_read_lock();
-	for_each_possible_cpu(cpu) {
-		old = per_cpu(xfrm_last_dst, cpu);
-		if (old && !xfrm_bundle_ok(old)) {
-			if (smp_processor_id() == cpu) {
-				__xfrm_pcpu_work_fn();
-				continue;
-			}
-			found = true;
-			break;
-		}
-	}
-
-	rcu_read_unlock();
-	local_bh_enable();
-
-	if (!found)
-		return;
-
-	get_online_cpus();
-
-	for_each_possible_cpu(cpu) {
-		bool bundle_release;
-
-		rcu_read_lock();
-		old = per_cpu(xfrm_last_dst, cpu);
-		bundle_release = old && !xfrm_bundle_ok(old);
-		rcu_read_unlock();
-
-		if (!bundle_release)
-			continue;
-
-		if (cpu_online(cpu)) {
-			schedule_work_on(cpu, &xfrm_pcpu_work[cpu]);
-			continue;
-		}
-
-		rcu_read_lock();
-		old = per_cpu(xfrm_last_dst, cpu);
-		if (old && !xfrm_bundle_ok(old)) {
-			per_cpu(xfrm_last_dst, cpu) = NULL;
-			dst_release(&old->u.dst);
-		}
-		rcu_read_unlock();
-	}
-
-	put_online_cpus();
-}
-
-static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst,
-				struct xfrm_state * const xfrm[],
-				int num)
-{
-	const struct dst_entry *dst = &xdst->u.dst;
-	int i;
-
-	if (xdst->num_xfrms != num)
-		return false;
-
-	for (i = 0; i < num; i++) {
-		if (!dst || dst->xfrm != xfrm[i])
-			return false;
-		dst = xfrm_dst_child(dst);
-	}
-
-	return xfrm_bundle_ok(xdst);
-}
-
 static struct xfrm_dst *
 xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 			       const struct flowi *fl, u16 family,
@@ -1824,34 +1745,21 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 	struct net *net = xp_net(pols[0]);
 	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
 	struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
-	struct xfrm_dst *xdst, *old;
+	struct xfrm_dst *xdst;
 	struct dst_entry *dst;
 	int err;
 
 	/* Try to instantiate a bundle */
 	err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
 	if (err <= 0) {
-		if (err != 0 && err != -EAGAIN)
+		if (err == 0)
+			return NULL;
+
+		if (err != -EAGAIN)
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
 		return ERR_PTR(err);
 	}
 
-	xdst = this_cpu_read(xfrm_last_dst);
-	if (xdst &&
-	    xdst->u.dst.dev == dst_orig->dev &&
-	    xdst->num_pols == num_pols &&
-	    memcmp(xdst->pols, pols,
-		   sizeof(struct xfrm_policy *) * num_pols) == 0 &&
-	    xfrm_xdst_can_reuse(xdst, xfrm, err)) {
-		dst_hold(&xdst->u.dst);
-		xfrm_pols_put(pols, num_pols);
-		while (err > 0)
-			xfrm_state_put(xfrm[--err]);
-		return xdst;
-	}
-
-	old = xdst;
-
 	dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig);
 	if (IS_ERR(dst)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
@@ -1864,9 +1772,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
 	xdst->policy_genid = atomic_read(&pols[0]->genid);
 
-	atomic_set(&xdst->u.dst.__refcnt, 2);
-	xfrm_last_dst_update(xdst, old);
-
 	return xdst;
 }
 
@@ -2047,8 +1952,10 @@ free_dst:
 	goto out;
 }
 
-static struct xfrm_dst *
-xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo)
+static struct xfrm_dst *xfrm_bundle_lookup(struct net *net,
+					   const struct flowi *fl,
+					   u16 family, u8 dir,
+					   struct xfrm_flo *xflo, u32 if_id)
 {
 	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
 	int num_pols = 0, num_xfrms = 0, err;
@@ -2057,7 +1964,7 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
 	/* Resolve policies to use if we couldn't get them from
 	 * previous cache entry */
 	num_pols = 1;
-	pols[0] = xfrm_policy_lookup(net, fl, family, dir);
+	pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id);
 	err = xfrm_expand_policies(fl, family, pols,
 				   &num_pols, &num_xfrms);
 	if (err < 0)
@@ -2067,13 +1974,15 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
 	if (num_xfrms <= 0)
 		goto make_dummy_bundle;
 
-	local_bh_disable();
 	xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
 					      xflo->dst_orig);
-	local_bh_enable();
-
 	if (IS_ERR(xdst)) {
 		err = PTR_ERR(xdst);
+		if (err == -EREMOTE) {
+			xfrm_pols_put(pols, num_pols);
+			return NULL;
+		}
+
 		if (err != -EAGAIN)
 			goto error;
 		goto make_dummy_bundle;
@@ -2123,14 +2032,19 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family,
 	return ret;
 }
 
-/* Main function: finds/creates a bundle for given flow.
+/* Finds/creates a bundle for given flow and if_id
  *
  * At the moment we eat a raw IP route. Mostly to speed up lookups
  * on interfaces with disabled IPsec.
+ *
+ * xfrm_lookup uses an if_id of 0 by default, and is provided for
+ * compatibility
  */
-struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
-			      const struct flowi *fl,
-			      const struct sock *sk, int flags)
+struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
+					struct dst_entry *dst_orig,
+					const struct flowi *fl,
+					const struct sock *sk,
+					int flags, u32 if_id)
 {
 	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
 	struct xfrm_dst *xdst;
@@ -2146,7 +2060,8 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
 	sk = sk_const_to_full_sk(sk);
 	if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
 		num_pols = 1;
-		pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family);
+		pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family,
+						if_id);
 		err = xfrm_expand_policies(fl, family, pols,
 					   &num_pols, &num_xfrms);
 		if (err < 0)
@@ -2158,15 +2073,16 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
 			goto no_transform;
 		}
 
-		local_bh_disable();
 		xdst = xfrm_resolve_and_create_bundle(
 				pols, num_pols, fl,
 				family, dst_orig);
-		local_bh_enable();
 
 		if (IS_ERR(xdst)) {
 			xfrm_pols_put(pols, num_pols);
 			err = PTR_ERR(xdst);
+			if (err == -EREMOTE)
+				goto nopol;
+
 			goto dropdst;
 		} else if (xdst == NULL) {
 			num_xfrms = 0;
@@ -2189,7 +2105,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
 	    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 		goto nopol;
 
-	xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo);
+	xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id);
 	if (xdst == NULL)
 		goto nopol;
 	if (IS_ERR(xdst)) {
@@ -2234,7 +2150,7 @@ no_transform:
 	}
 
 	for (i = 0; i < num_pols; i++)
-		pols[i]->curlft.use_time = get_seconds();
+		pols[i]->curlft.use_time = ktime_get_real_seconds();
 
 	if (num_xfrms < 0) {
 		/* Prohibit the flow */
@@ -2270,6 +2186,19 @@ dropdst:
 	xfrm_pols_put(pols, drop_pols);
 	return ERR_PTR(err);
 }
+EXPORT_SYMBOL(xfrm_lookup_with_ifid);
+
+/* Main function: finds/creates a bundle for given flow.
+ *
+ * At the moment we eat a raw IP route. Mostly to speed up lookups
+ * on interfaces with disabled IPsec.
+ */
+struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
+			      const struct flowi *fl, const struct sock *sk,
+			      int flags)
+{
+	return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0);
+}
 EXPORT_SYMBOL(xfrm_lookup);
 
 /* Callers of xfrm_lookup_route() must ensure a call to dst_output().
@@ -2286,6 +2215,9 @@ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
 	if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE)
 		return make_blackhole(net, dst_orig->ops->family, dst_orig);
 
+	if (IS_ERR(dst))
+		dst_release(dst_orig);
+
 	return dst;
 }
 EXPORT_SYMBOL(xfrm_lookup_route);
@@ -2365,6 +2297,7 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
 		return -EAFNOSUPPORT;
 
 	afinfo->decode_session(skb, fl, reverse);
+
 	err = security_xfrm_decode_session(skb, &fl->flowi_secid);
 	rcu_read_unlock();
 	return err;
@@ -2395,6 +2328,19 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	int reverse;
 	struct flowi fl;
 	int xerr_idx = -1;
+	const struct xfrm_if_cb *ifcb;
+	struct xfrm_if *xi;
+	u32 if_id = 0;
+
+	rcu_read_lock();
+	ifcb = xfrm_if_get_cb();
+
+	if (ifcb) {
+		xi = ifcb->decode_session(skb);
+		if (xi)
+			if_id = xi->p.if_id;
+	}
+	rcu_read_unlock();
 
 	reverse = dir & ~XFRM_POLICY_MASK;
 	dir &= XFRM_POLICY_MASK;
@@ -2422,7 +2368,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	pol = NULL;
 	sk = sk_to_full_sk(sk);
 	if (sk && sk->sk_policy[dir]) {
-		pol = xfrm_sk_policy_lookup(sk, dir, &fl, family);
+		pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id);
 		if (IS_ERR(pol)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
 			return 0;
@@ -2430,7 +2376,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	}
 
 	if (!pol)
-		pol = xfrm_policy_lookup(net, &fl, family, dir);
+		pol = xfrm_policy_lookup(net, &fl, family, dir, if_id);
 
 	if (IS_ERR(pol)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
@@ -2446,7 +2392,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		return 1;
 	}
 
-	pol->curlft.use_time = get_seconds();
+	pol->curlft.use_time = ktime_get_real_seconds();
 
 	pols[0] = pol;
 	npols++;
@@ -2454,13 +2400,13 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
 		pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
 						    &fl, family,
-						    XFRM_POLICY_IN);
+						    XFRM_POLICY_IN, if_id);
 		if (pols[1]) {
 			if (IS_ERR(pols[1])) {
 				XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
 				return 0;
 			}
-			pols[1]->curlft.use_time = get_seconds();
+			pols[1]->curlft.use_time = ktime_get_real_seconds();
 			npols++;
 		}
 	}
@@ -2819,6 +2765,21 @@ void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
 
+void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb)
+{
+	spin_lock(&xfrm_if_cb_lock);
+	rcu_assign_pointer(xfrm_if_cb, ifcb);
+	spin_unlock(&xfrm_if_cb_lock);
+}
+EXPORT_SYMBOL(xfrm_if_register_cb);
+
+void xfrm_if_unregister_cb(void)
+{
+	RCU_INIT_POINTER(xfrm_if_cb, NULL);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(xfrm_if_unregister_cb);
+
 #ifdef CONFIG_XFRM_STATISTICS
 static int __net_init xfrm_statistics_init(struct net *net)
 {
@@ -2986,19 +2947,13 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
 
 void __init xfrm_init(void)
 {
-	int i;
-
-	xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
-				       GFP_KERNEL);
-	BUG_ON(!xfrm_pcpu_work);
-
-	for (i = 0; i < NR_CPUS; i++)
-		INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn);
-
 	register_pernet_subsys(&xfrm_net_ops);
 	xfrm_dev_init();
 	seqcount_init(&xfrm_policy_hash_generation);
 	xfrm_input_init();
+
+	RCU_INIT_POINTER(xfrm_if_cb, NULL);
+	synchronize_rcu();
 }
 
 #ifdef CONFIG_AUDITSYSCALL
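The xfrm_policy.c hunks above key every policy lookup on a new if_id value and add the xfrm_if_register_cb()/xfrm_if_unregister_cb() pair that __xfrm_policy_check() uses to derive that if_id from the receiving skb. A minimal sketch of how a driver might hook this callback, assuming the declarations end up in <net/xfrm.h>; the my_-prefixed names are illustrative only and not part of the patch:

/* Hypothetical driver-side hookup; my_xfrm_if_lookup() is assumed to map
 * a net_device to its struct xfrm_if, or return NULL for other devices
 * so that the policy check falls back to if_id 0.
 */
#include <net/xfrm.h>

static struct xfrm_if *my_decode_session(struct sk_buff *skb)
{
	return my_xfrm_if_lookup(skb->dev);
}

static const struct xfrm_if_cb my_xfrm_if_cb = {
	.decode_session = my_decode_session,
};

static int __init my_init(void)
{
	xfrm_if_register_cb(&my_xfrm_if_cb);
	return 0;
}

static void __exit my_exit(void)
{
	xfrm_if_unregister_cb();
}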
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 8308281f3253..b669262682c9 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -475,8 +475,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
 {
 	struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
 	struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
-	unsigned long now = get_seconds();
-	long next = LONG_MAX;
+	time64_t now = ktime_get_real_seconds();
+	time64_t next = TIME64_MAX;
 	int warn = 0;
 	int err = 0;
 
@@ -537,7 +537,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
 	if (warn)
 		km_state_expired(x, 0, 0);
 resched:
-	if (next != LONG_MAX) {
+	if (next != TIME64_MAX) {
 		tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
 	}
 
@@ -577,7 +577,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 	tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
 				CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
 	timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0);
-	x->curlft.add_time = get_seconds();
+	x->curlft.add_time = ktime_get_real_seconds();
 	x->lft.soft_byte_limit = XFRM_INF;
 	x->lft.soft_packet_limit = XFRM_INF;
 	x->lft.hard_byte_limit = XFRM_INF;
@@ -735,10 +735,9 @@ restart:
 	}
 out:
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
-	if (cnt) {
+	if (cnt)
 		err = 0;
-		xfrm_policy_cache_flush();
-	}
+
 	return err;
 }
 EXPORT_SYMBOL(xfrm_state_flush);
@@ -931,7 +930,7 @@ struct xfrm_state *
 xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 		const struct flowi *fl, struct xfrm_tmpl *tmpl,
 		struct xfrm_policy *pol, int *err,
-		unsigned short family)
+		unsigned short family, u32 if_id)
 {
 	static xfrm_address_t saddr_wildcard = { };
 	struct net *net = xp_net(pol);
@@ -955,6 +954,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 		if (x->props.family == encap_family &&
 		    x->props.reqid == tmpl->reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
+		    x->if_id == if_id &&
 		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
 		    xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
 		    tmpl->mode == x->props.mode &&
@@ -971,6 +971,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 		if (x->props.family == encap_family &&
 		    x->props.reqid == tmpl->reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
+		    x->if_id == if_id &&
 		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
 		    xfrm_addr_equal(&x->id.daddr, daddr, encap_family) &&
 		    tmpl->mode == x->props.mode &&
@@ -1010,6 +1011,7 @@ found:
 		 * to current session. */
 		xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
 		memcpy(&x->mark, &pol->mark, sizeof(x->mark));
+		x->if_id = if_id;
 
 		error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid);
 		if (error) {
@@ -1067,7 +1069,7 @@ out:
 }
 
 struct xfrm_state *
-xfrm_stateonly_find(struct net *net, u32 mark,
+xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
 		    xfrm_address_t *daddr, xfrm_address_t *saddr,
 		    unsigned short family, u8 mode, u8 proto, u32 reqid)
 {
@@ -1080,6 +1082,7 @@ xfrm_stateonly_find(struct net *net, u32 mark,
 		if (x->props.family == family &&
 		    x->props.reqid == reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
+		    x->if_id == if_id &&
 		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
 		    xfrm_state_addr_check(x, daddr, saddr, family) &&
 		    mode == x->props.mode &&
@@ -1160,11 +1163,13 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
 	struct xfrm_state *x;
 	unsigned int h;
 	u32 mark = xnew->mark.v & xnew->mark.m;
+	u32 if_id = xnew->if_id;
 
 	h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family);
 	hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
 		if (x->props.family == family &&
 		    x->props.reqid == reqid &&
+		    x->if_id == if_id &&
 		    (mark & x->mark.m) == x->mark.v &&
 		    xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) &&
 		    xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family))
@@ -1187,7 +1192,7 @@ EXPORT_SYMBOL(xfrm_state_insert);
 static struct xfrm_state *__find_acq_core(struct net *net,
 					  const struct xfrm_mark *m,
 					  unsigned short family, u8 mode,
-					  u32 reqid, u8 proto,
+					  u32 reqid, u32 if_id, u8 proto,
 					  const xfrm_address_t *daddr,
 					  const xfrm_address_t *saddr,
 					  int create)
@@ -1242,6 +1247,7 @@ static struct xfrm_state *__find_acq_core(struct net *net,
 		x->props.family = family;
 		x->props.mode = mode;
 		x->props.reqid = reqid;
+		x->if_id = if_id;
 		x->mark.v = m->v;
 		x->mark.m = m->m;
 		x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
@@ -1296,7 +1302,7 @@ int xfrm_state_add(struct xfrm_state *x)
 
 	if (use_spi && !x1)
 		x1 = __find_acq_core(net, &x->mark, family, x->props.mode,
-				     x->props.reqid, x->id.proto,
+				     x->props.reqid, x->if_id, x->id.proto,
 				     &x->id.daddr, &x->props.saddr, 0);
 
 	__xfrm_state_bump_genids(x);
@@ -1395,6 +1401,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
 	x->props.flags = orig->props.flags;
 	x->props.extra_flags = orig->props.extra_flags;
 
+	x->if_id = orig->if_id;
 	x->tfcpad = orig->tfcpad;
 	x->replay_maxdiff = orig->replay_maxdiff;
 	x->replay_maxage = orig->replay_maxage;
@@ -1554,6 +1561,19 @@ out:
 	if (x1->curlft.use_time)
 		xfrm_state_check_expire(x1);
 
+	if (x->props.smark.m || x->props.smark.v || x->if_id) {
+		spin_lock_bh(&net->xfrm.xfrm_state_lock);
+
+		if (x->props.smark.m || x->props.smark.v)
+			x1->props.smark = x->props.smark;
+
+		if (x->if_id)
+			x1->if_id = x->if_id;
+
+		__xfrm_state_bump_genids(x1);
+		spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+	}
+
 	err = 0;
 	x->km.state = XFRM_STATE_DEAD;
 	__xfrm_state_put(x);
@@ -1571,7 +1591,7 @@ EXPORT_SYMBOL(xfrm_state_update);
 int xfrm_state_check_expire(struct xfrm_state *x)
 {
 	if (!x->curlft.use_time)
-		x->curlft.use_time = get_seconds();
+		x->curlft.use_time = ktime_get_real_seconds();
 
 	if (x->curlft.bytes >= x->lft.hard_byte_limit ||
 	    x->curlft.packets >= x->lft.hard_packet_limit) {
@@ -1619,13 +1639,13 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
 
 struct xfrm_state *
 xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
-	      u8 proto, const xfrm_address_t *daddr,
+	      u32 if_id, u8 proto, const xfrm_address_t *daddr,
 	      const xfrm_address_t *saddr, int create, unsigned short family)
 {
 	struct xfrm_state *x;
 
 	spin_lock_bh(&net->xfrm.xfrm_state_lock);
-	x = __find_acq_core(net, mark, family, mode, reqid, proto, daddr, saddr, create);
+	x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create);
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 
 	return x;
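In the xfrm_state.c hunks above, every state lookup now requires x->if_id == if_id alongside the existing masked-mark test, and lifetime bookkeeping moves from get_seconds() to the 64-bit ktime_get_real_seconds(). A compact restatement of the combined match predicate, for reference only; the helper name is illustrative and not part of the patch:

/* Illustrative only: an entry matches when its if_id equals the flow's
 * if_id and the flow mark, masked by the entry's mask, equals its value.
 */
static inline bool example_id_mark_match(u32 flow_mark, u32 flow_if_id,
					 const struct xfrm_mark *m,
					 u32 entry_if_id)
{
	return entry_if_id == flow_if_id && (flow_mark & m->m) == m->v;
}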
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 080035f056d9..4791aa8b8185 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -527,6 +527,19 @@ static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs,
 		x->replay_maxdiff = nla_get_u32(rt);
 }
 
+static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m)
+{
+	if (attrs[XFRMA_SET_MARK]) {
+		m->v = nla_get_u32(attrs[XFRMA_SET_MARK]);
+		if (attrs[XFRMA_SET_MARK_MASK])
+			m->m = nla_get_u32(attrs[XFRMA_SET_MARK_MASK]);
+		else
+			m->m = 0xffffffff;
+	} else {
+		m->v = m->m = 0;
+	}
+}
+
 static struct xfrm_state *xfrm_state_construct(struct net *net,
 					       struct xfrm_usersa_info *p,
 					       struct nlattr **attrs,
@@ -579,8 +592,10 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 
 	xfrm_mark_get(attrs, &x->mark);
 
-	if (attrs[XFRMA_OUTPUT_MARK])
-		x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]);
+	xfrm_smark_init(attrs, &x->props.smark);
+
+	if (attrs[XFRMA_IF_ID])
+		x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
 
 	err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]);
 	if (err)
@@ -824,6 +839,18 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
 	return 0;
 }
 
+static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m)
+{
+	int ret = 0;
+
+	if (m->v | m->m) {
+		ret = nla_put_u32(skb, XFRMA_SET_MARK, m->v);
+		if (!ret)
+			ret = nla_put_u32(skb, XFRMA_SET_MARK_MASK, m->m);
+	}
+	return ret;
+}
+
 /* Don't change this without updating xfrm_sa_len! */
 static int copy_to_user_state_extra(struct xfrm_state *x,
 				    struct xfrm_usersa_info *p,
@@ -887,6 +914,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 	ret = xfrm_mark_put(skb, &x->mark);
 	if (ret)
 		goto out;
+
+	ret = xfrm_smark_put(skb, &x->props.smark);
+	if (ret)
+		goto out;
+
 	if (x->replay_esn)
 		ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL,
 			      xfrm_replay_state_esn_len(x->replay_esn),
@@ -900,8 +932,8 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 	ret = copy_user_offload(&x->xso, skb);
 	if (ret)
 		goto out;
-	if (x->props.output_mark) {
-		ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark);
+	if (x->if_id) {
+		ret = nla_put_u32(skb, XFRMA_IF_ID, x->if_id);
 		if (ret)
 			goto out;
 	}
@@ -1025,10 +1057,12 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb,
 {
 	struct sock *nlsk = rcu_dereference(net->xfrm.nlsk);
 
-	if (nlsk)
-		return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC);
-	else
-		return -1;
+	if (!nlsk) {
+		kfree_skb(skb);
+		return -EPIPE;
+	}
+
+	return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC);
 }
 
 static inline unsigned int xfrm_spdinfo_msgsize(void)
@@ -1253,6 +1287,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err;
 	u32 mark;
 	struct xfrm_mark m;
+	u32 if_id = 0;
 
 	p = nlmsg_data(nlh);
 	err = verify_spi_info(p->info.id.proto, p->min, p->max);
@@ -1265,6 +1300,10 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
 	x = NULL;
 
 	mark = xfrm_mark_get(attrs, &m);
+
+	if (attrs[XFRMA_IF_ID])
+		if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
 	if (p->info.seq) {
 		x = xfrm_find_acq_byseq(net, mark, p->info.seq);
 		if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) {
@@ -1275,7 +1314,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (!x)
 		x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid,
-				  p->info.id.proto, daddr,
+				  if_id, p->info.id.proto, daddr,
 				  &p->info.saddr, 1,
 				  family);
 	err = -ENOENT;
@@ -1563,6 +1602,9 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us
 
 	xfrm_mark_get(attrs, &xp->mark);
 
+	if (attrs[XFRMA_IF_ID])
+		xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
 	return xp;
  error:
 	*errp = err;
@@ -1671,9 +1713,11 @@ static inline unsigned int userpolicy_type_attrsize(void)
 #ifdef CONFIG_XFRM_SUB_POLICY
 static int copy_to_user_policy_type(u8 type, struct sk_buff *skb)
 {
-	struct xfrm_userpolicy_type upt = {
-		.type = type,
-	};
+	struct xfrm_userpolicy_type upt;
+
+	/* Sadly there are two holes in struct xfrm_userpolicy_type */
+	memset(&upt, 0, sizeof(upt));
+	upt.type = type;
 
 	return nla_put(skb, XFRMA_POLICY_TYPE, sizeof(upt), &upt);
 }
@@ -1708,6 +1752,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
 	err = copy_to_user_policy_type(xp->type, skb);
 	if (!err)
 		err = xfrm_mark_put(skb, &xp->mark);
+	if (!err)
+		err = xfrm_if_id_put(skb, xp->if_id);
 	if (err) {
 		nlmsg_cancel(skb, nlh);
 		return err;
@@ -1789,6 +1835,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int delete;
 	struct xfrm_mark m;
 	u32 mark = xfrm_mark_get(attrs, &m);
+	u32 if_id = 0;
 
 	p = nlmsg_data(nlh);
 	delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;
@@ -1801,8 +1848,11 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		return err;
 
+	if (attrs[XFRMA_IF_ID])
+		if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
 	if (p->index)
-		xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, delete, &err);
+		xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, delete, &err);
 	else {
 		struct nlattr *rt = attrs[XFRMA_SEC_CTX];
 		struct xfrm_sec_ctx *ctx;
@@ -1819,7 +1869,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 			if (err)
 				return err;
 		}
-		xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, &p->sel,
+		xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir, &p->sel,
 					   ctx, delete, &err);
 		security_xfrm_policy_free(ctx);
 	}
@@ -1942,6 +1992,10 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct
 	if (err)
 		goto out_cancel;
 
+	err = xfrm_if_id_put(skb, x->if_id);
+	if (err)
+		goto out_cancel;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -2084,6 +2138,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err = -ENOENT;
 	struct xfrm_mark m;
 	u32 mark = xfrm_mark_get(attrs, &m);
+	u32 if_id = 0;
 
 	err = copy_from_user_policy_type(&type, attrs);
 	if (err)
@@ -2093,8 +2148,11 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		return err;
 
+	if (attrs[XFRMA_IF_ID])
+		if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
 	if (p->index)
-		xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err);
+		xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, 0, &err);
 	else {
 		struct nlattr *rt = attrs[XFRMA_SEC_CTX];
 		struct xfrm_sec_ctx *ctx;
@@ -2111,7 +2169,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 			if (err)
 				return err;
 		}
-		xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir,
+		xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir,
 					   &p->sel, ctx, 0, &err);
 		security_xfrm_policy_free(ctx);
 	}
@@ -2493,7 +2551,9 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_PROTO]		= { .type = NLA_U8 },
 	[XFRMA_ADDRESS_FILTER]	= { .len = sizeof(struct xfrm_address_filter) },
 	[XFRMA_OFFLOAD_DEV]	= { .len = sizeof(struct xfrm_user_offload) },
-	[XFRMA_OUTPUT_MARK]	= { .type = NLA_U32 },
+	[XFRMA_SET_MARK]	= { .type = NLA_U32 },
+	[XFRMA_SET_MARK_MASK]	= { .type = NLA_U32 },
+	[XFRMA_IF_ID]		= { .type = NLA_U32 },
 };
 
 static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
@@ -2625,6 +2685,10 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct
 	if (err)
 		return err;
 
+	err = xfrm_if_id_put(skb, x->if_id);
+	if (err)
+		return err;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 }
@@ -2719,8 +2783,12 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
 		l += nla_total_size(sizeof(x->props.extra_flags));
 	if (x->xso.dev)
 		 l += nla_total_size(sizeof(x->xso));
-	if (x->props.output_mark)
-		l += nla_total_size(sizeof(x->props.output_mark));
+	if (x->props.smark.v | x->props.smark.m) {
+		l += nla_total_size(sizeof(x->props.smark.v));
+		l += nla_total_size(sizeof(x->props.smark.m));
+	}
+	if (x->if_id)
+		l += nla_total_size(sizeof(x->if_id));
 
 	/* Must count x->lastused as it may become non-zero behind our back. */
 	l += nla_total_size_64bit(sizeof(u64));
@@ -2850,6 +2918,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
 	err = copy_to_user_policy_type(xp->type, skb);
 	if (!err)
 		err = xfrm_mark_put(skb, &xp->mark);
+	if (!err)
+		err = xfrm_if_id_put(skb, xp->if_id);
 	if (err) {
 		nlmsg_cancel(skb, nlh);
 		return err;
@@ -2966,6 +3036,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
 	err = copy_to_user_policy_type(xp->type, skb);
 	if (!err)
 		err = xfrm_mark_put(skb, &xp->mark);
+	if (!err)
+		err = xfrm_if_id_put(skb, xp->if_id);
 	if (err) {
 		nlmsg_cancel(skb, nlh);
 		return err;
@@ -3047,6 +3119,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e
 	err = copy_to_user_policy_type(xp->type, skb);
 	if (!err)
 		err = xfrm_mark_put(skb, &xp->mark);
+	if (!err)
+		err = xfrm_if_id_put(skb, xp->if_id);
 	if (err)
 		goto out_free_skb;
 
@@ -3280,4 +3354,3 @@ module_init(xfrm_user_init);
 module_exit(xfrm_user_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);
-
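The xfrm_user.c hunks replace XFRMA_OUTPUT_MARK with the XFRMA_SET_MARK/XFRMA_SET_MARK_MASK pair and add XFRMA_IF_ID, all NLA_U32. A sketch of how a receiver could parse the three attributes, mirroring xfrm_smark_init() above; the function name is illustrative, and the mask defaults to 0xffffffff when only the value is supplied:

/* Illustrative only; mirrors the parsing rules introduced above. */
static void example_parse_ids(struct nlattr **attrs, struct xfrm_mark *smark,
			      u32 *if_id)
{
	if (attrs[XFRMA_SET_MARK]) {
		smark->v = nla_get_u32(attrs[XFRMA_SET_MARK]);
		smark->m = attrs[XFRMA_SET_MARK_MASK] ?
			   nla_get_u32(attrs[XFRMA_SET_MARK_MASK]) : 0xffffffff;
	} else {
		smark->v = smark->m = 0;
	}

	*if_id = attrs[XFRMA_IF_ID] ? nla_get_u32(attrs[XFRMA_IF_ID]) : 0;
}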