author	Ingo Molnar <mingo@kernel.org>	2016-10-16 05:31:39 -0400
committer	Ingo Molnar <mingo@kernel.org>	2016-10-16 05:31:39 -0400
commit	1d33369db25eb7f37b7a8bd22d736888b4501a9c (patch)
tree	116d764339be1bca928870151decbedc53a9e1d1 /net
parent	23446cb66c073b827779e5eb3dec301623299b32 (diff)
parent	1001354ca34179f3db924eb66672442a173147dc (diff)
Merge tag 'v4.9-rc1' into x86/urgent, to pick up updates
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/ndisc.c2
-rw-r--r--net/9p/trans_rdma.c2
-rw-r--r--net/Kconfig1
-rw-r--r--net/Makefile1
-rw-r--r--net/appletalk/ddp.c2
-rw-r--r--net/atm/lec.c12
-rw-r--r--net/atm/mpc.c2
-rw-r--r--net/batman-adv/Kconfig15
-rw-r--r--net/batman-adv/Makefile4
-rw-r--r--net/batman-adv/bat_algo.c70
-rw-r--r--net/batman-adv/bat_algo.h3
-rw-r--r--net/batman-adv/bat_iv_ogm.c837
-rw-r--r--net/batman-adv/bat_v.c734
-rw-r--r--net/batman-adv/bat_v_ogm.c5
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c348
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h17
-rw-r--r--net/batman-adv/debugfs.c18
-rw-r--r--net/batman-adv/debugfs.h4
-rw-r--r--net/batman-adv/distributed-arp-table.c4
-rw-r--r--net/batman-adv/gateway_client.c285
-rw-r--r--net/batman-adv/gateway_client.h7
-rw-r--r--net/batman-adv/gateway_common.c5
-rw-r--r--net/batman-adv/hard-interface.c84
-rw-r--r--net/batman-adv/icmp_socket.h18
-rw-r--r--net/batman-adv/main.c19
-rw-r--r--net/batman-adv/main.h2
-rw-r--r--net/batman-adv/multicast.c4
-rw-r--r--net/batman-adv/netlink.c221
-rw-r--r--net/batman-adv/netlink.h6
-rw-r--r--net/batman-adv/network-coding.c11
-rw-r--r--net/batman-adv/originator.c172
-rw-r--r--net/batman-adv/originator.h4
-rw-r--r--net/batman-adv/packet.h36
-rw-r--r--net/batman-adv/routing.c43
-rw-r--r--net/batman-adv/send.c136
-rw-r--r--net/batman-adv/send.h6
-rw-r--r--net/batman-adv/soft-interface.c51
-rw-r--r--net/batman-adv/sysfs.c183
-rw-r--r--net/batman-adv/translation-table.c556
-rw-r--r--net/batman-adv/translation-table.h7
-rw-r--r--net/batman-adv/tvlv.c9
-rw-r--r--net/batman-adv/types.h69
-rw-r--r--net/bluetooth/af_bluetooth.c15
-rw-r--r--net/bluetooth/hci_core.c1
-rw-r--r--net/bluetooth/hci_request.c95
-rw-r--r--net/bluetooth/hci_request.h28
-rw-r--r--net/bluetooth/hci_sock.c396
-rw-r--r--net/bluetooth/leds.c27
-rw-r--r--net/bluetooth/leds.h10
-rw-r--r--net/bluetooth/mgmt.c332
-rw-r--r--net/bluetooth/mgmt_util.c66
-rw-r--r--net/bluetooth/smp.c5
-rw-r--r--net/bridge/Makefile2
-rw-r--r--net/bridge/br.c6
-rw-r--r--net/bridge/br_device.c8
-rw-r--r--net/bridge/br_fdb.c23
-rw-r--r--net/bridge/br_forward.c10
-rw-r--r--net/bridge/br_if.c12
-rw-r--r--net/bridge/br_input.c42
-rw-r--r--net/bridge/br_netfilter_hooks.c53
-rw-r--r--net/bridge/br_netfilter_ipv6.c12
-rw-r--r--net/bridge/br_netlink.c132
-rw-r--r--net/bridge/br_private.h46
-rw-r--r--net/bridge/br_stp_if.c43
-rw-r--r--net/bridge/br_switchdev.c57
-rw-r--r--net/bridge/br_sysfs_if.c2
-rw-r--r--net/bridge/netfilter/ebt_log.c2
-rw-r--r--net/bridge/netfilter/ebt_redirect.c2
-rw-r--r--net/bridge/netfilter/ebtables.c2
-rw-r--r--net/bridge/netfilter/nf_log_bridge.c3
-rw-r--r--net/bridge/netfilter/nf_tables_bridge.c92
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c44
-rw-r--r--net/ceph/Makefile1
-rw-r--r--net/ceph/auth.c13
-rw-r--r--net/ceph/auth_none.c2
-rw-r--r--net/ceph/ceph_common.c13
-rw-r--r--net/ceph/ceph_strings.c1
-rw-r--r--net/ceph/cls_lock_client.c325
-rw-r--r--net/ceph/crush/mapper.c17
-rw-r--r--net/ceph/mon_client.c82
-rw-r--r--net/ceph/osd_client.c169
-rw-r--r--net/core/dev.c163
-rw-r--r--net/core/drop_monitor.c2
-rw-r--r--net/core/filter.c472
-rw-r--r--net/core/flow_dissector.c164
-rw-r--r--net/core/lwtunnel.c35
-rw-r--r--net/core/neighbour.c3
-rw-r--r--net/core/net_namespace.c88
-rw-r--r--net/core/pktgen.c21
-rw-r--r--net/core/rtnetlink.c307
-rw-r--r--net/core/skbuff.c150
-rw-r--r--net/core/sock.c32
-rw-r--r--net/core/stream.c1
-rw-r--r--net/dsa/Kconfig3
-rw-r--r--net/dsa/Makefile1
-rw-r--r--net/dsa/dsa.c89
-rw-r--r--net/dsa/dsa2.c26
-rw-r--r--net/dsa/dsa_priv.h2
-rw-r--r--net/dsa/slave.c222
-rw-r--r--net/dsa/tag_qca.c138
-rw-r--r--net/ipv4/Kconfig18
-rw-r--r--net/ipv4/Makefile3
-rw-r--r--net/ipv4/af_inet.c37
-rw-r--r--net/ipv4/fib_frontend.c36
-rw-r--r--net/ipv4/fib_rules.c15
-rw-r--r--net/ipv4/fib_semantics.c3
-rw-r--r--net/ipv4/fib_trie.c176
-rw-r--r--net/ipv4/fou.c2
-rw-r--r--net/ipv4/gre_offload.c6
-rw-r--r--net/ipv4/igmp.c10
-rw-r--r--net/ipv4/inet_diag.c107
-rw-r--r--net/ipv4/ip_gre.c23
-rw-r--r--net/ipv4/ip_output.c21
-rw-r--r--net/ipv4/ip_sockglue.c7
-rw-r--r--net/ipv4/ip_tunnel.c76
-rw-r--r--net/ipv4/ip_tunnel_core.c2
-rw-r--r--net/ipv4/ipconfig.c71
-rw-r--r--net/ipv4/ipip.c35
-rw-r--r--net/ipv4/netfilter/Kconfig11
-rw-r--r--net/ipv4/netfilter/Makefile5
-rw-r--r--net/ipv4/netfilter/ip_tables.c2
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c72
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c492
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c41
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c10
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c7
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c13
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c13
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c7
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c5
-rw-r--r--net/ipv4/ping.c15
-rw-r--r--net/ipv4/proc.c103
-rw-r--r--net/ipv4/raw.c6
-rw-r--r--net/ipv4/route.c25
-rw-r--r--net/ipv4/tcp.c89
-rw-r--r--net/ipv4/tcp_bbr.c896
-rw-r--r--net/ipv4/tcp_cdg.c12
-rw-r--r--net/ipv4/tcp_cong.c2
-rw-r--r--net/ipv4/tcp_input.c531
-rw-r--r--net/ipv4/tcp_ipv4.c41
-rw-r--r--net/ipv4/tcp_metrics.c2
-rw-r--r--net/ipv4/tcp_minisocks.c6
-rw-r--r--net/ipv4/tcp_offload.c13
-rw-r--r--net/ipv4/tcp_output.c102
-rw-r--r--net/ipv4/tcp_rate.c186
-rw-r--r--net/ipv4/tcp_timer.c4
-rw-r--r--net/ipv4/udp.c23
-rw-r--r--net/ipv4/udp_diag.c89
-rw-r--r--net/ipv4/udp_offload.c6
-rw-r--r--net/ipv4/udplite.c1
-rw-r--r--net/ipv4/xfrm4_policy.c2
-rw-r--r--net/ipv6/addrconf.c98
-rw-r--r--net/ipv6/af_inet6.c2
-rw-r--r--net/ipv6/fib6_rules.c3
-rw-r--r--net/ipv6/ila/ila_common.c1
-rw-r--r--net/ipv6/ila/ila_lwt.c2
-rw-r--r--net/ipv6/ila/ila_xlat.c2
-rw-r--r--net/ipv6/ip6_fib.c6
-rw-r--r--net/ipv6/ip6_gre.c14
-rw-r--r--net/ipv6/ip6_offload.c5
-rw-r--r--net/ipv6/ip6_output.c27
-rw-r--r--net/ipv6/ip6_tunnel.c188
-rw-r--r--net/ipv6/ip6_vti.c10
-rw-r--r--net/ipv6/mcast.c10
-rw-r--r--net/ipv6/ndisc.c11
-rw-r--r--net/ipv6/netfilter/ip6_tables.c2
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c2
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c2
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c21
-rw-r--r--net/ipv6/netfilter/nf_tables_ipv6.c9
-rw-r--r--net/ipv6/netfilter/nft_chain_route_ipv6.c4
-rw-r--r--net/ipv6/output_core.c7
-rw-r--r--net/ipv6/proc.c30
-rw-r--r--net/ipv6/raw.c7
-rw-r--r--net/ipv6/route.c41
-rw-r--r--net/ipv6/sit.c12
-rw-r--r--net/ipv6/tcp_ipv6.c47
-rw-r--r--net/ipv6/udp.c13
-rw-r--r--net/ipv6/udp_impl.h2
-rw-r--r--net/ipv6/udplite.c1
-rw-r--r--net/ipv6/xfrm6_policy.c2
-rw-r--r--net/irda/af_irda.c3
-rw-r--r--net/kcm/Kconfig1
-rw-r--r--net/kcm/kcmproc.c58
-rw-r--r--net/kcm/kcmsock.c499
-rw-r--r--net/l2tp/l2tp_core.h2
-rw-r--r--net/l2tp/l2tp_eth.c6
-rw-r--r--net/l2tp/l2tp_netlink.c2
-rw-r--r--net/l2tp/l2tp_ppp.c24
-rw-r--r--net/l3mdev/l3mdev.c105
-rw-r--r--net/llc/af_llc.c4
-rw-r--r--net/mac80211/agg-rx.c11
-rw-r--r--net/mac80211/cfg.c243
-rw-r--r--net/mac80211/chan.c6
-rw-r--r--net/mac80211/debugfs.c160
-rw-r--r--net/mac80211/debugfs_netdev.c49
-rw-r--r--net/mac80211/debugfs_sta.c56
-rw-r--r--net/mac80211/driver-ops.c17
-rw-r--r--net/mac80211/driver-ops.h109
-rw-r--r--net/mac80211/ieee80211_i.h39
-rw-r--r--net/mac80211/iface.c49
-rw-r--r--net/mac80211/main.c11
-rw-r--r--net/mac80211/mesh_hwmp.c27
-rw-r--r--net/mac80211/mesh_sync.c12
-rw-r--r--net/mac80211/mlme.c12
-rw-r--r--net/mac80211/offchannel.c4
-rw-r--r--net/mac80211/pm.c3
-rw-r--r--net/mac80211/rx.c83
-rw-r--r--net/mac80211/scan.c2
-rw-r--r--net/mac80211/sta_info.c92
-rw-r--r--net/mac80211/sta_info.h24
-rw-r--r--net/mac80211/status.c15
-rw-r--r--net/mac80211/trace.h159
-rw-r--r--net/mac80211/tx.c469
-rw-r--r--net/mac80211/util.c64
-rw-r--r--net/mac802154/iface.c1
-rw-r--r--net/mac802154/rx.c9
-rw-r--r--net/mpls/af_mpls.c5
-rw-r--r--net/mpls/internal.h10
-rw-r--r--net/mpls/mpls_gso.c40
-rw-r--r--net/mpls/mpls_iptunnel.c13
-rw-r--r--net/ncsi/internal.h22
-rw-r--r--net/ncsi/ncsi-aen.c37
-rw-r--r--net/ncsi/ncsi-cmd.c2
-rw-r--r--net/ncsi/ncsi-manage.c198
-rw-r--r--net/ncsi/ncsi-rsp.c4
-rw-r--r--net/netfilter/Kconfig22
-rw-r--r--net/netfilter/Makefile10
-rw-r--r--net/netfilter/core.c172
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c7
-rw-r--r--net/netfilter/nf_conntrack_core.c245
-rw-r--r--net/netfilter/nf_conntrack_ecache.c22
-rw-r--r--net/netfilter/nf_conntrack_ftp.c17
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c2
-rw-r--r--net/netfilter/nf_conntrack_helper.c17
-rw-r--r--net/netfilter/nf_conntrack_netlink.c50
-rw-r--r--net/netfilter/nf_conntrack_pptp.c3
-rw-r--r--net/netfilter/nf_conntrack_proto.c81
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c3
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c39
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c14
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c89
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c131
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c53
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c3
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c20
-rw-r--r--net/netfilter/nf_conntrack_sip.c10
-rw-r--r--net/netfilter/nf_conntrack_standalone.c16
-rw-r--r--net/netfilter/nf_internals.h10
-rw-r--r--net/netfilter/nf_log.c14
-rw-r--r--net/netfilter/nf_log_common.c4
-rw-r--r--net/netfilter/nf_nat_core.c6
-rw-r--r--net/netfilter/nf_queue.c18
-rw-r--r--net/netfilter/nf_tables_api.c228
-rw-r--r--net/netfilter/nf_tables_core.c16
-rw-r--r--net/netfilter/nf_tables_inet.c5
-rw-r--r--net/netfilter/nf_tables_netdev.c101
-rw-r--r--net/netfilter/nf_tables_trace.c20
-rw-r--r--net/netfilter/nfnetlink_cthelper.c2
-rw-r--r--net/netfilter/nfnetlink_log.c8
-rw-r--r--net/netfilter/nfnetlink_queue.c19
-rw-r--r--net/netfilter/nft_bitwise.c8
-rw-r--r--net/netfilter/nft_byteorder.c15
-rw-r--r--net/netfilter/nft_cmp.c3
-rw-r--r--net/netfilter/nft_ct.c21
-rw-r--r--net/netfilter/nft_dynset.c20
-rw-r--r--net/netfilter/nft_exthdr.c12
-rw-r--r--net/netfilter/nft_hash.c424
-rw-r--r--net/netfilter/nft_immediate.c4
-rw-r--r--net/netfilter/nft_limit.c4
-rw-r--r--net/netfilter/nft_log.c9
-rw-r--r--net/netfilter/nft_lookup.c2
-rw-r--r--net/netfilter/nft_meta.c2
-rw-r--r--net/netfilter/nft_numgen.c212
-rw-r--r--net/netfilter/nft_payload.c4
-rw-r--r--net/netfilter/nft_queue.c113
-rw-r--r--net/netfilter/nft_quota.c121
-rw-r--r--net/netfilter/nft_range.c138
-rw-r--r--net/netfilter/nft_set_hash.c404
-rw-r--r--net/netfilter/nft_set_rbtree.c (renamed from net/netfilter/nft_rbtree.c)12
-rw-r--r--net/netfilter/xt_RATEEST.c6
-rw-r--r--net/netfilter/xt_TCPMSS.c12
-rw-r--r--net/netfilter/xt_TEE.c8
-rw-r--r--net/netfilter/xt_connlimit.c8
-rw-r--r--net/netfilter/xt_conntrack.c4
-rw-r--r--net/netfilter/xt_hashlimit.c340
-rw-r--r--net/netfilter/xt_helper.c4
-rw-r--r--net/netfilter/xt_physdev.c4
-rw-r--r--net/netfilter/xt_recent.c7
-rw-r--r--net/netfilter/xt_sctp.c2
-rw-r--r--net/netlink/af_netlink.c7
-rw-r--r--net/netlink/diag.c102
-rw-r--r--net/netlink/genetlink.c4
-rw-r--r--net/openvswitch/actions.c79
-rw-r--r--net/openvswitch/conntrack.c2
-rw-r--r--net/openvswitch/datapath.c25
-rw-r--r--net/openvswitch/flow.c118
-rw-r--r--net/openvswitch/flow.h12
-rw-r--r--net/openvswitch/flow_netlink.c316
-rw-r--r--net/openvswitch/flow_netlink.h3
-rw-r--r--net/openvswitch/flow_table.c25
-rw-r--r--net/openvswitch/vport-internal_dev.c2
-rw-r--r--net/openvswitch/vport.c8
-rw-r--r--net/packet/af_packet.c1
-rw-r--r--net/rds/ib.c2
-rw-r--r--net/rds/ib.h1
-rw-r--r--net/rds/rds.h1
-rw-r--r--net/rxrpc/Kconfig14
-rw-r--r--net/rxrpc/Makefile1
-rw-r--r--net/rxrpc/af_rxrpc.c175
-rw-r--r--net/rxrpc/ar-internal.h840
-rw-r--r--net/rxrpc/call_accept.c717
-rw-r--r--net/rxrpc/call_event.c1447
-rw-r--r--net/rxrpc/call_object.c797
-rw-r--r--net/rxrpc/conn_client.c993
-rw-r--r--net/rxrpc/conn_event.c271
-rw-r--r--net/rxrpc/conn_object.c204
-rw-r--r--net/rxrpc/conn_service.c117
-rw-r--r--net/rxrpc/input.c1433
-rw-r--r--net/rxrpc/insecure.c26
-rw-r--r--net/rxrpc/local_event.c19
-rw-r--r--net/rxrpc/local_object.c51
-rw-r--r--net/rxrpc/misc.c194
-rw-r--r--net/rxrpc/output.c956
-rw-r--r--net/rxrpc/peer_event.c103
-rw-r--r--net/rxrpc/peer_object.c199
-rw-r--r--net/rxrpc/proc.c72
-rw-r--r--net/rxrpc/recvmsg.c870
-rw-r--r--net/rxrpc/rxkad.c209
-rw-r--r--net/rxrpc/security.c18
-rw-r--r--net/rxrpc/sendmsg.c610
-rw-r--r--net/rxrpc/skbuff.c174
-rw-r--r--net/rxrpc/sysctl.c45
-rw-r--r--net/rxrpc/utils.c2
-rw-r--r--net/sched/Kconfig27
-rw-r--r--net/sched/Makefile3
-rw-r--r--net/sched/act_api.c55
-rw-r--r--net/sched/act_bpf.c5
-rw-r--r--net/sched/act_csum.c36
-rw-r--r--net/sched/act_gact.c3
-rw-r--r--net/sched/act_ife.c26
-rw-r--r--net/sched/act_meta_skbtcindex.c79
-rw-r--r--net/sched/act_mirred.c11
-rw-r--r--net/sched/act_police.c12
-rw-r--r--net/sched/act_skbmod.c301
-rw-r--r--net/sched/act_tunnel_key.c342
-rw-r--r--net/sched/act_vlan.c51
-rw-r--r--net/sched/cls_api.c32
-rw-r--r--net/sched/cls_basic.c12
-rw-r--r--net/sched/cls_bpf.c153
-rw-r--r--net/sched/cls_cgroup.c13
-rw-r--r--net/sched/cls_flow.c53
-rw-r--r--net/sched/cls_flower.c232
-rw-r--r--net/sched/cls_fw.c28
-rw-r--r--net/sched/cls_route.c24
-rw-r--r--net/sched/cls_rsvp.h17
-rw-r--r--net/sched/cls_tcindex.c102
-rw-r--r--net/sched/cls_u32.c51
-rw-r--r--net/sched/sch_api.c84
-rw-r--r--net/sched/sch_codel.c4
-rw-r--r--net/sched/sch_fifo.c4
-rw-r--r--net/sched/sch_fq.c71
-rw-r--r--net/sched/sch_generic.c36
-rw-r--r--net/sched/sch_hfsc.c51
-rw-r--r--net/sched/sch_htb.c24
-rw-r--r--net/sched/sch_mq.c2
-rw-r--r--net/sched/sch_mqprio.c2
-rw-r--r--net/sched/sch_netem.c20
-rw-r--r--net/sched/sch_pie.c4
-rw-r--r--net/sctp/associola.c2
-rw-r--r--net/sctp/auth.c2
-rw-r--r--net/sctp/chunk.c26
-rw-r--r--net/sctp/input.c8
-rw-r--r--net/sctp/inqueue.c2
-rw-r--r--net/sctp/output.c62
-rw-r--r--net/sctp/outqueue.c99
-rw-r--r--net/sctp/proc.c10
-rw-r--r--net/sctp/sctp_diag.c20
-rw-r--r--net/sctp/sm_make_chunk.c28
-rw-r--r--net/sctp/sm_sideeffect.c25
-rw-r--r--net/sctp/sm_statefuns.c6
-rw-r--r--net/sctp/socket.c8
-rw-r--r--net/sctp/transport.c4
-rw-r--r--net/sctp/ulpevent.c4
-rw-r--r--net/sctp/ulpqueue.c3
-rw-r--r--net/socket.c61
-rw-r--r--net/strparser/Kconfig4
-rw-r--r--net/strparser/Makefile1
-rw-r--r--net/strparser/strparser.c510
-rw-r--r--net/sunrpc/auth.c2
-rw-r--r--net/sunrpc/auth_generic.c13
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c7
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c2
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c2
-rw-r--r--net/sunrpc/auth_unix.c13
-rw-r--r--net/sunrpc/backchannel_rqst.c8
-rw-r--r--net/sunrpc/cache.c5
-rw-r--r--net/sunrpc/clnt.c132
-rw-r--r--net/sunrpc/rpc_pipe.c2
-rw-r--r--net/sunrpc/sched.c35
-rw-r--r--net/sunrpc/svc.c17
-rw-r--r--net/sunrpc/svcauth_unix.c6
-rw-r--r--net/sunrpc/xdr.c11
-rw-r--r--net/sunrpc/xprt.c2
-rw-r--r--net/sunrpc/xprtmultipath.c24
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c53
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c7
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c28
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c323
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c21
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c2
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c82
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c62
-rw-r--r--net/sunrpc/xprtrdma/transport.c202
-rw-r--r--net/sunrpc/xprtrdma/verbs.c239
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h108
-rw-r--r--net/sunrpc/xprtsock.c34
-rw-r--r--net/switchdev/switchdev.c278
-rw-r--r--net/sysctl_net.c33
-rw-r--r--net/tipc/bcast.c8
-rw-r--r--net/tipc/bcast.h4
-rw-r--r--net/tipc/bearer.c130
-rw-r--r--net/tipc/bearer.h2
-rw-r--r--net/tipc/link.c149
-rw-r--r--net/tipc/link.h6
-rw-r--r--net/tipc/msg.h10
-rw-r--r--net/tipc/net.h2
-rw-r--r--net/tipc/netlink.c18
-rw-r--r--net/tipc/node.c95
-rw-r--r--net/tipc/node.h12
-rw-r--r--net/tipc/udp_media.c524
-rw-r--r--net/tipc/udp_media.h46
-rw-r--r--net/unix/af_unix.c17
-rw-r--r--net/wireless/chan.c2
-rw-r--r--net/wireless/core.c43
-rw-r--r--net/wireless/core.h9
-rw-r--r--net/wireless/ibss.c14
-rw-r--r--net/wireless/mlme.c3
-rw-r--r--net/wireless/nl80211.c1355
-rw-r--r--net/wireless/nl80211.h3
-rw-r--r--net/wireless/rdev-ops.h58
-rw-r--r--net/wireless/scan.c58
-rw-r--r--net/wireless/sme.c9
-rw-r--r--net/wireless/sysfs.c2
-rw-r--r--net/wireless/trace.h90
-rw-r--r--net/wireless/util.c43
-rw-r--r--net/wireless/wext-compat.c21
-rw-r--r--net/wireless/wext-sme.c5
-rw-r--r--net/x25/af_x25.c4
-rw-r--r--net/xfrm/xfrm_algo.c2
-rw-r--r--net/xfrm/xfrm_policy.c145
-rw-r--r--net/xfrm/xfrm_proc.c10
-rw-r--r--net/xfrm/xfrm_replay.c6
-rw-r--r--net/xfrm/xfrm_state.c125
-rw-r--r--net/xfrm/xfrm_sysctl.c4
455 files changed, 25263 insertions, 11991 deletions
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c
index 86450b7e2899..941df2fa4448 100644
--- a/net/6lowpan/ndisc.c
+++ b/net/6lowpan/ndisc.c
@@ -101,8 +101,6 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
 		ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short);
 		if (!lowpan_802154_is_valid_src_short_addr(neigh->short_addr))
 			neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
-	} else {
-		neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
 	}
 	write_unlock_bh(&n->lock);
 }
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 1852e383afd6..553ed4ecb6a0 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -680,7 +680,7 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
 		goto error;
 
 	/* Create the Protection Domain */
-	rdma->pd = ib_alloc_pd(rdma->cm_id->device);
+	rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0);
 	if (IS_ERR(rdma->pd))
 		goto error;
 
diff --git a/net/Kconfig b/net/Kconfig
index c2cdbce629bd..7b6cd340b72b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -369,6 +369,7 @@ source "net/irda/Kconfig"
 source "net/bluetooth/Kconfig"
 source "net/rxrpc/Kconfig"
 source "net/kcm/Kconfig"
+source "net/strparser/Kconfig"
 
 config FIB_RULES
 	bool
diff --git a/net/Makefile b/net/Makefile
index 9bd20bb86cc6..4cafaa2b4667 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_BT) += bluetooth/
 obj-$(CONFIG_SUNRPC) += sunrpc/
 obj-$(CONFIG_AF_RXRPC) += rxrpc/
 obj-$(CONFIG_AF_KCM) += kcm/
+obj-$(CONFIG_STREAM_PARSER) += strparser/
 obj-$(CONFIG_ATM) += atm/
 obj-$(CONFIG_L2TP) += l2tp/
 obj-$(CONFIG_DECNET) += decnet/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index f066781be3c8..10d2bdce686e 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1278,7 +1278,7 @@ out:
 	return err;
 }
 
-#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE)
+#if IS_ENABLED(CONFIG_IPDDP)
 static __inline__ int is_ip_over_ddp(struct sk_buff *skb)
 {
 	return skb->data[12] == 22;
diff --git a/net/atm/lec.c b/net/atm/lec.c
index e574a7e9db6f..5d2693826afb 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -31,7 +31,7 @@
 #include <linux/atmlec.h>
 
 /* Proxy LEC knows about bridging */
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
 #include "../bridge/br_private.h"
 
 static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 };
@@ -121,7 +121,7 @@ static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 /* Device structures */
 static struct net_device *dev_lec[MAX_LEC_ITF];
 
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
 static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
 {
 	char *buff;
@@ -155,7 +155,7 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
 		sk->sk_data_ready(sk);
 	}
 }
-#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
+#endif /* IS_ENABLED(CONFIG_BRIDGE) */
 
 /*
  * Open/initialize the netdevice. This is called (in the current kernel)
@@ -222,7 +222,7 @@ static netdev_tx_t lec_start_xmit(struct sk_buff *skb,
 	pr_debug("skbuff head:%lx data:%lx tail:%lx end:%lx\n",
 		 (long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb),
 		 (long)skb_end_pointer(skb));
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
 	if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0)
 		lec_handle_bridge(skb, dev);
 #endif
@@ -426,7 +426,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
 			(unsigned short)(0xffff & mesg->content.normal.flag);
 		break;
 	case l_should_bridge:
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
 	{
 		pr_debug("%s: bridge zeppelin asks about %pM\n",
 			 dev->name, mesg->content.proxy.mac_addr);
@@ -452,7 +452,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
 			sk->sk_data_ready(sk);
 		}
 	}
-#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
+#endif /* IS_ENABLED(CONFIG_BRIDGE) */
 		break;
 	default:
 		pr_info("%s: Unknown message type %d\n", dev->name, mesg->type);
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 0e982222d425..3b3b1a292ec8 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -1007,7 +1007,7 @@ static int mpoa_event_listener(struct notifier_block *mpoa_notifier,
 	if (!net_eq(dev_net(dev), &init_net))
 		return NOTIFY_DONE;
 
-	if (dev->name == NULL || strncmp(dev->name, "lec", 3))
+	if (strncmp(dev->name, "lec", 3))
 		return NOTIFY_DONE; /* we are only interested in lec:s */
 
 	switch (event) {
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index 833bb145ba3c..f20742cbae6d 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -73,10 +73,21 @@ config BATMAN_ADV_MCAST
 	  reduce the air overhead while improving the reliability of
 	  multicast messages.
 
-config BATMAN_ADV_DEBUG
-	bool "B.A.T.M.A.N. debugging"
+config BATMAN_ADV_DEBUGFS
+	bool "batman-adv debugfs entries"
 	depends on BATMAN_ADV
 	depends on DEBUG_FS
+	default y
+	help
+	  Enable this to export routing related debug tables via debugfs.
+	  The information for each soft-interface and used hard-interface can be
+	  found under batman_adv/
+
+	  If unsure, say Y.
+
+config BATMAN_ADV_DEBUG
+	bool "B.A.T.M.A.N. debugging"
+	depends on BATMAN_ADV_DEBUGFS
 	help
 	  This is an option for use by developers; most people should
 	  say N here. This enables compilation of support for
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index a83fc6c58d19..f724d3c98a81 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -24,14 +24,14 @@ batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o
 batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o
 batman-adv-y += bitarray.o
 batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o
-batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o
+batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += debugfs.o
 batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o
 batman-adv-y += fragmentation.o
 batman-adv-y += gateway_client.o
 batman-adv-y += gateway_common.o
 batman-adv-y += hard-interface.o
 batman-adv-y += hash.o
-batman-adv-y += icmp_socket.o
+batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += icmp_socket.o
 batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o
 batman-adv-y += main.o
 batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 81dbbf569bd4..623d04302aa2 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -20,12 +20,18 @@
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/list.h> 21#include <linux/list.h>
22#include <linux/moduleparam.h> 22#include <linux/moduleparam.h>
23#include <linux/netlink.h>
23#include <linux/printk.h> 24#include <linux/printk.h>
24#include <linux/seq_file.h> 25#include <linux/seq_file.h>
26#include <linux/skbuff.h>
25#include <linux/stddef.h> 27#include <linux/stddef.h>
26#include <linux/string.h> 28#include <linux/string.h>
29#include <net/genetlink.h>
30#include <net/netlink.h>
31#include <uapi/linux/batman_adv.h>
27 32
28#include "bat_algo.h" 33#include "bat_algo.h"
34#include "netlink.h"
29 35
30char batadv_routing_algo[20] = "BATMAN_IV"; 36char batadv_routing_algo[20] = "BATMAN_IV";
31static struct hlist_head batadv_algo_list; 37static struct hlist_head batadv_algo_list;
@@ -95,6 +101,7 @@ int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
95 return 0; 101 return 0;
96} 102}
97 103
104#ifdef CONFIG_BATMAN_ADV_DEBUGFS
98int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) 105int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
99{ 106{
100 struct batadv_algo_ops *bat_algo_ops; 107 struct batadv_algo_ops *bat_algo_ops;
@@ -107,6 +114,7 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
107 114
108 return 0; 115 return 0;
109} 116}
117#endif
110 118
111static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) 119static int batadv_param_set_ra(const char *val, const struct kernel_param *kp)
112{ 120{
@@ -138,3 +146,65 @@ static struct kparam_string batadv_param_string_ra = {
138 146
139module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, 147module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra,
140 0644); 148 0644);
149
150/**
151 * batadv_algo_dump_entry - fill in information about one supported routing
152 * algorithm
153 * @msg: netlink message to be sent back
154 * @portid: Port to reply to
155 * @seq: Sequence number of message
156 * @bat_algo_ops: Algorithm to be dumped
157 *
158 * Return: Error number, or 0 on success
159 */
160static int batadv_algo_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
161 struct batadv_algo_ops *bat_algo_ops)
162{
163 void *hdr;
164
165 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
166 NLM_F_MULTI, BATADV_CMD_GET_ROUTING_ALGOS);
167 if (!hdr)
168 return -EMSGSIZE;
169
170 if (nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_algo_ops->name))
171 goto nla_put_failure;
172
173 genlmsg_end(msg, hdr);
174 return 0;
175
176 nla_put_failure:
177 genlmsg_cancel(msg, hdr);
178 return -EMSGSIZE;
179}
180
181/**
182 * batadv_algo_dump - fill in information about supported routing
183 * algorithms
184 * @msg: netlink message to be sent back
185 * @cb: Parameters to the netlink request
186 *
187 * Return: Length of reply message.
188 */
189int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb)
190{
191 int portid = NETLINK_CB(cb->skb).portid;
192 struct batadv_algo_ops *bat_algo_ops;
193 int skip = cb->args[0];
194 int i = 0;
195
196 hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) {
197 if (i++ < skip)
198 continue;
199
200 if (batadv_algo_dump_entry(msg, portid, cb->nlh->nlmsg_seq,
201 bat_algo_ops)) {
202 i--;
203 break;
204 }
205 }
206
207 cb->args[0] = i;
208
209 return msg->len;
210}
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 860d773dd8fa..3b5b69cdd12b 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -22,7 +22,9 @@
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24 24
25struct netlink_callback;
25struct seq_file; 26struct seq_file;
27struct sk_buff;
26 28
27extern char batadv_routing_algo[]; 29extern char batadv_routing_algo[];
28extern struct list_head batadv_hardif_list; 30extern struct list_head batadv_hardif_list;
@@ -31,5 +33,6 @@ void batadv_algo_init(void);
31int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops); 33int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops);
32int batadv_algo_select(struct batadv_priv *bat_priv, char *name); 34int batadv_algo_select(struct batadv_priv *bat_priv, char *name);
33int batadv_algo_seq_print_text(struct seq_file *seq, void *offset); 35int batadv_algo_seq_print_text(struct seq_file *seq, void *offset);
36int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb);
34 37
35#endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */ 38#endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 19b0abd6c640..e2d18d0b1f06 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -35,6 +35,7 @@
35#include <linux/list.h> 35#include <linux/list.h>
36#include <linux/lockdep.h> 36#include <linux/lockdep.h>
37#include <linux/netdevice.h> 37#include <linux/netdevice.h>
38#include <linux/netlink.h>
38#include <linux/pkt_sched.h> 39#include <linux/pkt_sched.h>
39#include <linux/printk.h> 40#include <linux/printk.h>
40#include <linux/random.h> 41#include <linux/random.h>
@@ -48,12 +49,17 @@
48#include <linux/string.h> 49#include <linux/string.h>
49#include <linux/types.h> 50#include <linux/types.h>
50#include <linux/workqueue.h> 51#include <linux/workqueue.h>
52#include <net/genetlink.h>
53#include <net/netlink.h>
54#include <uapi/linux/batman_adv.h>
51 55
52#include "bat_algo.h" 56#include "bat_algo.h"
53#include "bitarray.h" 57#include "bitarray.h"
58#include "gateway_client.h"
54#include "hard-interface.h" 59#include "hard-interface.h"
55#include "hash.h" 60#include "hash.h"
56#include "log.h" 61#include "log.h"
62#include "netlink.h"
57#include "network-coding.h" 63#include "network-coding.h"
58#include "originator.h" 64#include "originator.h"
59#include "packet.h" 65#include "packet.h"
@@ -318,17 +324,18 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
318 if (!orig_node->bat_iv.bcast_own_sum) 324 if (!orig_node->bat_iv.bcast_own_sum)
319 goto free_orig_node; 325 goto free_orig_node;
320 326
327 kref_get(&orig_node->refcount);
321 hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, 328 hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
322 batadv_choose_orig, orig_node, 329 batadv_choose_orig, orig_node,
323 &orig_node->hash_entry); 330 &orig_node->hash_entry);
324 if (hash_added != 0) 331 if (hash_added != 0)
325 goto free_orig_node; 332 goto free_orig_node_hash;
326 333
327 return orig_node; 334 return orig_node;
328 335
329free_orig_node: 336free_orig_node_hash:
330 /* free twice, as batadv_orig_node_new sets refcount to 2 */
331 batadv_orig_node_put(orig_node); 337 batadv_orig_node_put(orig_node);
338free_orig_node:
332 batadv_orig_node_put(orig_node); 339 batadv_orig_node_put(orig_node);
333 340
334 return NULL; 341 return NULL;
@@ -528,36 +535,25 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
528static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) 535static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet)
529{ 536{
530 struct net_device *soft_iface; 537 struct net_device *soft_iface;
531 struct batadv_priv *bat_priv;
532 struct batadv_hard_iface *primary_if = NULL;
533 538
534 if (!forw_packet->if_incoming) { 539 if (!forw_packet->if_incoming) {
535 pr_err("Error - can't forward packet: incoming iface not specified\n"); 540 pr_err("Error - can't forward packet: incoming iface not specified\n");
536 goto out; 541 return;
537 } 542 }
538 543
539 soft_iface = forw_packet->if_incoming->soft_iface; 544 soft_iface = forw_packet->if_incoming->soft_iface;
540 bat_priv = netdev_priv(soft_iface);
541 545
542 if (WARN_ON(!forw_packet->if_outgoing)) 546 if (WARN_ON(!forw_packet->if_outgoing))
543 goto out; 547 return;
544 548
545 if (WARN_ON(forw_packet->if_outgoing->soft_iface != soft_iface)) 549 if (WARN_ON(forw_packet->if_outgoing->soft_iface != soft_iface))
546 goto out; 550 return;
547 551
548 if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) 552 if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE)
549 goto out; 553 return;
550
551 primary_if = batadv_primary_if_get_selected(bat_priv);
552 if (!primary_if)
553 goto out;
554 554
555 /* only for one specific outgoing interface */ 555 /* only for one specific outgoing interface */
556 batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing); 556 batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing);
557
558out:
559 if (primary_if)
560 batadv_hardif_put(primary_if);
561} 557}
562 558
563/** 559/**
@@ -685,19 +681,12 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
685 struct batadv_forw_packet *forw_packet_aggr; 681 struct batadv_forw_packet *forw_packet_aggr;
686 unsigned char *skb_buff; 682 unsigned char *skb_buff;
687 unsigned int skb_size; 683 unsigned int skb_size;
684 atomic_t *queue_left = own_packet ? NULL : &bat_priv->batman_queue_left;
688 685
689 /* own packet should always be scheduled */ 686 forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing,
690 if (!own_packet) { 687 queue_left, bat_priv);
691 if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) {
692 batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
693 "batman packet queue full\n");
694 return;
695 }
696 }
697
698 forw_packet_aggr = kmalloc(sizeof(*forw_packet_aggr), GFP_ATOMIC);
699 if (!forw_packet_aggr) 688 if (!forw_packet_aggr)
700 goto out_nomem; 689 return;
701 690
702 if (atomic_read(&bat_priv->aggregated_ogms) && 691 if (atomic_read(&bat_priv->aggregated_ogms) &&
703 packet_len < BATADV_MAX_AGGREGATION_BYTES) 692 packet_len < BATADV_MAX_AGGREGATION_BYTES)
@@ -708,8 +697,11 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
708 skb_size += ETH_HLEN; 697 skb_size += ETH_HLEN;
709 698
710 forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size); 699 forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size);
711 if (!forw_packet_aggr->skb) 700 if (!forw_packet_aggr->skb) {
712 goto out_free_forw_packet; 701 batadv_forw_packet_free(forw_packet_aggr);
702 return;
703 }
704
713 forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; 705 forw_packet_aggr->skb->priority = TC_PRIO_CONTROL;
714 skb_reserve(forw_packet_aggr->skb, ETH_HLEN); 706 skb_reserve(forw_packet_aggr->skb, ETH_HLEN);
715 707
@@ -717,12 +709,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
717 forw_packet_aggr->packet_len = packet_len; 709 forw_packet_aggr->packet_len = packet_len;
718 memcpy(skb_buff, packet_buff, packet_len); 710 memcpy(skb_buff, packet_buff, packet_len);
719 711
720 kref_get(&if_incoming->refcount);
721 kref_get(&if_outgoing->refcount);
722 forw_packet_aggr->own = own_packet; 712 forw_packet_aggr->own = own_packet;
723 forw_packet_aggr->if_incoming = if_incoming;
724 forw_packet_aggr->if_outgoing = if_outgoing;
725 forw_packet_aggr->num_packets = 0;
726 forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS; 713 forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS;
727 forw_packet_aggr->send_time = send_time; 714 forw_packet_aggr->send_time = send_time;
728 715
@@ -741,13 +728,6 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
741 queue_delayed_work(batadv_event_workqueue, 728 queue_delayed_work(batadv_event_workqueue,
742 &forw_packet_aggr->delayed_work, 729 &forw_packet_aggr->delayed_work,
743 send_time - jiffies); 730 send_time - jiffies);
744
745 return;
746out_free_forw_packet:
747 kfree(forw_packet_aggr);
748out_nomem:
749 if (!own_packet)
750 atomic_inc(&bat_priv->batman_queue_left);
751} 731}
752 732
753/* aggregate a new packet into the existing ogm packet */ 733/* aggregate a new packet into the existing ogm packet */
@@ -1830,10 +1810,6 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
1830 batadv_iv_ogm_schedule(forw_packet->if_incoming); 1810 batadv_iv_ogm_schedule(forw_packet->if_incoming);
1831 1811
1832out: 1812out:
1833 /* don't count own packet */
1834 if (!forw_packet->own)
1835 atomic_inc(&bat_priv->batman_queue_left);
1836
1837 batadv_forw_packet_free(forw_packet); 1813 batadv_forw_packet_free(forw_packet);
1838} 1814}
1839 1815
@@ -1879,6 +1855,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
1879 return NET_RX_SUCCESS; 1855 return NET_RX_SUCCESS;
1880} 1856}
1881 1857
1858#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1882/** 1859/**
1883 * batadv_iv_ogm_orig_print_neigh - print neighbors for the originator table 1860 * batadv_iv_ogm_orig_print_neigh - print neighbors for the originator table
1884 * @orig_node: the orig_node for which the neighbors are printed 1861 * @orig_node: the orig_node for which the neighbors are printed
@@ -1976,8 +1953,239 @@ next:
1976 if (batman_count == 0) 1953 if (batman_count == 0)
1977 seq_puts(seq, "No batman nodes in range ...\n"); 1954 seq_puts(seq, "No batman nodes in range ...\n");
1978} 1955}
1956#endif
1957
1958/**
1959 * batadv_iv_ogm_neigh_get_tq_avg - Get the TQ average for a neighbour on a
1960 * given outgoing interface.
1961 * @neigh_node: Neighbour of interest
1962 * @if_outgoing: Outgoing interface of interest
1963 * @tq_avg: Pointer of where to store the TQ average
1964 *
1965 * Return: False if no average TQ available, otherwise true.
1966 */
1967static bool
1968batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node,
1969 struct batadv_hard_iface *if_outgoing,
1970 u8 *tq_avg)
1971{
1972 struct batadv_neigh_ifinfo *n_ifinfo;
1973
1974 n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
1975 if (!n_ifinfo)
1976 return false;
1977
1978 *tq_avg = n_ifinfo->bat_iv.tq_avg;
1979 batadv_neigh_ifinfo_put(n_ifinfo);
1980
1981 return true;
1982}
1983
1984/**
1985 * batadv_iv_ogm_orig_dump_subentry - Dump an originator subentry into a
1986 * message
1987 * @msg: Netlink message to dump into
1988 * @portid: Port making netlink request
1989 * @seq: Sequence number of netlink message
1990 * @bat_priv: The bat priv with all the soft interface information
1991 * @if_outgoing: Limit dump to entries with this outgoing interface
1992 * @orig_node: Originator to dump
1993 * @neigh_node: Single hops neighbour
1994 * @best: Is the best originator
1995 *
1996 * Return: Error code, or 0 on success
1997 */
1998static int
1999batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
2000 struct batadv_priv *bat_priv,
2001 struct batadv_hard_iface *if_outgoing,
2002 struct batadv_orig_node *orig_node,
2003 struct batadv_neigh_node *neigh_node,
2004 bool best)
2005{
2006 void *hdr;
2007 u8 tq_avg;
2008 unsigned int last_seen_msecs;
2009
2010 last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen);
2011
2012 if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node, if_outgoing, &tq_avg))
2013 return 0;
2014
2015 if (if_outgoing != BATADV_IF_DEFAULT &&
2016 if_outgoing != neigh_node->if_incoming)
2017 return 0;
2018
2019 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
2020 NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS);
2021 if (!hdr)
2022 return -ENOBUFS;
2023
2024 if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
2025 orig_node->orig) ||
2026 nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
2027 neigh_node->addr) ||
2028 nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
2029 neigh_node->if_incoming->net_dev->ifindex) ||
2030 nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) ||
2031 nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
2032 last_seen_msecs))
2033 goto nla_put_failure;
2034
2035 if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
2036 goto nla_put_failure;
2037
2038 genlmsg_end(msg, hdr);
2039 return 0;
2040
2041 nla_put_failure:
2042 genlmsg_cancel(msg, hdr);
2043 return -EMSGSIZE;
2044}
2045
2046/**
2047 * batadv_iv_ogm_orig_dump_entry - Dump an originator entry into a message
2048 * @msg: Netlink message to dump into
2049 * @portid: Port making netlink request
2050 * @seq: Sequence number of netlink message
2051 * @bat_priv: The bat priv with all the soft interface information
2052 * @if_outgoing: Limit dump to entries with this outgoing interface
2053 * @orig_node: Originator to dump
2054 * @sub_s: Number of sub entries to skip
2055 *
2056 * This function assumes the caller holds rcu_read_lock().
2057 *
2058 * Return: Error code, or 0 on success
2059 */
2060static int
2061batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
2062 struct batadv_priv *bat_priv,
2063 struct batadv_hard_iface *if_outgoing,
2064 struct batadv_orig_node *orig_node, int *sub_s)
2065{
2066 struct batadv_neigh_node *neigh_node_best;
2067 struct batadv_neigh_node *neigh_node;
2068 int sub = 0;
2069 bool best;
2070 u8 tq_avg_best;
2071
2072 neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing);
2073 if (!neigh_node_best)
2074 goto out;
2075
2076 if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node_best, if_outgoing,
2077 &tq_avg_best))
2078 goto out;
2079
2080 if (tq_avg_best == 0)
2081 goto out;
2082
2083 hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
2084 if (sub++ < *sub_s)
2085 continue;
2086
2087 best = (neigh_node == neigh_node_best);
2088
2089 if (batadv_iv_ogm_orig_dump_subentry(msg, portid, seq,
2090 bat_priv, if_outgoing,
2091 orig_node, neigh_node,
2092 best)) {
2093 batadv_neigh_node_put(neigh_node_best);
2094
2095 *sub_s = sub - 1;
2096 return -EMSGSIZE;
2097 }
2098 }
2099
2100 out:
2101 if (neigh_node_best)
2102 batadv_neigh_node_put(neigh_node_best);
2103
2104 *sub_s = 0;
2105 return 0;
2106}
1979 2107
1980/** 2108/**
2109 * batadv_iv_ogm_orig_dump_bucket - Dump an originator bucket into a
2110 * message
2111 * @msg: Netlink message to dump into
2112 * @portid: Port making netlink request
2113 * @seq: Sequence number of netlink message
2114 * @bat_priv: The bat priv with all the soft interface information
2115 * @if_outgoing: Limit dump to entries with this outgoing interface
2116 * @head: Bucket to be dumped
2117 * @idx_s: Number of entries to be skipped
2118 * @sub: Number of sub entries to be skipped
2119 *
2120 * Return: Error code, or 0 on success
2121 */
2122static int
2123batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
2124 struct batadv_priv *bat_priv,
2125 struct batadv_hard_iface *if_outgoing,
2126 struct hlist_head *head, int *idx_s, int *sub)
2127{
2128 struct batadv_orig_node *orig_node;
2129 int idx = 0;
2130
2131 rcu_read_lock();
2132 hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
2133 if (idx++ < *idx_s)
2134 continue;
2135
2136 if (batadv_iv_ogm_orig_dump_entry(msg, portid, seq, bat_priv,
2137 if_outgoing, orig_node,
2138 sub)) {
2139 rcu_read_unlock();
2140 *idx_s = idx - 1;
2141 return -EMSGSIZE;
2142 }
2143 }
2144 rcu_read_unlock();
2145
2146 *idx_s = 0;
2147 *sub = 0;
2148 return 0;
2149}
2150
2151/**
2152 * batadv_iv_ogm_orig_dump - Dump the originators into a message
2153 * @msg: Netlink message to dump into
2154 * @cb: Control block containing additional options
2155 * @bat_priv: The bat priv with all the soft interface information
2156 * @if_outgoing: Limit dump to entries with this outgoing interface
2157 */
2158static void
2159batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
2160 struct batadv_priv *bat_priv,
2161 struct batadv_hard_iface *if_outgoing)
2162{
2163 struct batadv_hashtable *hash = bat_priv->orig_hash;
2164 struct hlist_head *head;
2165 int bucket = cb->args[0];
2166 int idx = cb->args[1];
2167 int sub = cb->args[2];
2168 int portid = NETLINK_CB(cb->skb).portid;
2169
2170 while (bucket < hash->size) {
2171 head = &hash->table[bucket];
2172
2173 if (batadv_iv_ogm_orig_dump_bucket(msg, portid,
2174 cb->nlh->nlmsg_seq,
2175 bat_priv, if_outgoing, head,
2176 &idx, &sub))
2177 break;
2178
2179 bucket++;
2180 }
2181
2182 cb->args[0] = bucket;
2183 cb->args[1] = idx;
2184 cb->args[2] = sub;
2185}
2186
2187#ifdef CONFIG_BATMAN_ADV_DEBUGFS
2188/**
1981 * batadv_iv_hardif_neigh_print - print a single hop neighbour node 2189 * batadv_iv_hardif_neigh_print - print a single hop neighbour node
1982 * @seq: neighbour table seq_file struct 2190 * @seq: neighbour table seq_file struct
1983 * @hardif_neigh: hardif neighbour information 2191 * @hardif_neigh: hardif neighbour information
@@ -2027,37 +2235,43 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv,
2027 if (batman_count == 0) 2235 if (batman_count == 0)
2028 seq_puts(seq, "No batman nodes in range ...\n"); 2236 seq_puts(seq, "No batman nodes in range ...\n");
2029} 2237}
2238#endif
2030 2239
2031/** 2240/**
2032 * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors 2241 * batadv_iv_ogm_neigh_diff - calculate tq difference of two neighbors
2033 * @neigh1: the first neighbor object of the comparison 2242 * @neigh1: the first neighbor object of the comparison
2034 * @if_outgoing1: outgoing interface for the first neighbor 2243 * @if_outgoing1: outgoing interface for the first neighbor
2035 * @neigh2: the second neighbor object of the comparison 2244 * @neigh2: the second neighbor object of the comparison
2036 * @if_outgoing2: outgoing interface for the second neighbor 2245 * @if_outgoing2: outgoing interface for the second neighbor
2246 * @diff: pointer to integer receiving the calculated difference
2037 * 2247 *
2038 * Return: a value less, equal to or greater than 0 if the metric via neigh1 is 2248 * The content of *@diff is only valid when this function returns true.
2039 * lower, the same as or higher than the metric via neigh2 2249 * It is less, equal to or greater than 0 if the metric via neigh1 is lower,
2250 * the same as or higher than the metric via neigh2
2251 *
2252 * Return: true when the difference could be calculated, false otherwise
2040 */ 2253 */
2041static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, 2254static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1,
2042 struct batadv_hard_iface *if_outgoing1, 2255 struct batadv_hard_iface *if_outgoing1,
2043 struct batadv_neigh_node *neigh2, 2256 struct batadv_neigh_node *neigh2,
2044 struct batadv_hard_iface *if_outgoing2) 2257 struct batadv_hard_iface *if_outgoing2,
2258 int *diff)
2045{ 2259{
2046 struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; 2260 struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo;
2047 u8 tq1, tq2; 2261 u8 tq1, tq2;
2048 int diff; 2262 bool ret = true;
2049 2263
2050 neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); 2264 neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
2051 neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); 2265 neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
2052 2266
2053 if (!neigh1_ifinfo || !neigh2_ifinfo) { 2267 if (!neigh1_ifinfo || !neigh2_ifinfo) {
2054 diff = 0; 2268 ret = false;
2055 goto out; 2269 goto out;
2056 } 2270 }
2057 2271
2058 tq1 = neigh1_ifinfo->bat_iv.tq_avg; 2272 tq1 = neigh1_ifinfo->bat_iv.tq_avg;
2059 tq2 = neigh2_ifinfo->bat_iv.tq_avg; 2273 tq2 = neigh2_ifinfo->bat_iv.tq_avg;
2060 diff = tq1 - tq2; 2274 *diff = (int)tq1 - (int)tq2;
2061 2275
2062out: 2276out:
2063 if (neigh1_ifinfo) 2277 if (neigh1_ifinfo)
@@ -2065,6 +2279,162 @@ out:
2065 if (neigh2_ifinfo) 2279 if (neigh2_ifinfo)
2066 batadv_neigh_ifinfo_put(neigh2_ifinfo); 2280 batadv_neigh_ifinfo_put(neigh2_ifinfo);
2067 2281
2282 return ret;
2283}
2284
2285/**
2286 * batadv_iv_ogm_neigh_dump_neigh - Dump a neighbour into a netlink message
2287 * @msg: Netlink message to dump into
2288 * @portid: Port making netlink request
2289 * @seq: Sequence number of netlink message
2290 * @hardif_neigh: Neighbour to be dumped
2291 *
2292 * Return: Error code, or 0 on success
2293 */
2294static int
2295batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
2296 struct batadv_hardif_neigh_node *hardif_neigh)
2297{
2298 void *hdr;
2299 unsigned int last_seen_msecs;
2300
2301 last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen);
2302
2303 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
2304 NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS);
2305 if (!hdr)
2306 return -ENOBUFS;
2307
2308 if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
2309 hardif_neigh->addr) ||
2310 nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
2311 hardif_neigh->if_incoming->net_dev->ifindex) ||
2312 nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
2313 last_seen_msecs))
2314 goto nla_put_failure;
2315
2316 genlmsg_end(msg, hdr);
2317 return 0;
2318
2319 nla_put_failure:
2320 genlmsg_cancel(msg, hdr);
2321 return -EMSGSIZE;
2322}
2323
2324/**
2325 * batadv_iv_ogm_neigh_dump_hardif - Dump the neighbours of a hard interface
2326 * into a message
2327 * @msg: Netlink message to dump into
2328 * @portid: Port making netlink request
2329 * @seq: Sequence number of netlink message
2330 * @bat_priv: The bat priv with all the soft interface information
2331 * @hard_iface: Hard interface to dump the neighbours for
2332 * @idx_s: Number of entries to skip
2333 *
2334 * This function assumes the caller holds rcu_read_lock().
2335 *
2336 * Return: Error code, or 0 on success
2337 */
2338static int
2339batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
2340 struct batadv_priv *bat_priv,
2341 struct batadv_hard_iface *hard_iface,
2342 int *idx_s)
2343{
2344 struct batadv_hardif_neigh_node *hardif_neigh;
2345 int idx = 0;
2346
2347 hlist_for_each_entry_rcu(hardif_neigh,
2348 &hard_iface->neigh_list, list) {
2349 if (idx++ < *idx_s)
2350 continue;
2351
2352 if (batadv_iv_ogm_neigh_dump_neigh(msg, portid, seq,
2353 hardif_neigh)) {
2354 *idx_s = idx - 1;
2355 return -EMSGSIZE;
2356 }
2357 }
2358
2359 *idx_s = 0;
2360 return 0;
2361}
2362
2363/**
2364 * batadv_iv_ogm_neigh_dump - Dump the neighbours into a message
2365 * @msg: Netlink message to dump into
2366 * @cb: Control block containing additional options
2367 * @bat_priv: The bat priv with all the soft interface information
2368 * @single_hardif: Limit dump to this hard interfaace
2369 */
2370static void
2371batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
2372 struct batadv_priv *bat_priv,
2373 struct batadv_hard_iface *single_hardif)
2374{
2375 struct batadv_hard_iface *hard_iface;
2376 int i_hardif = 0;
2377 int i_hardif_s = cb->args[0];
2378 int idx = cb->args[1];
2379 int portid = NETLINK_CB(cb->skb).portid;
2380
2381 rcu_read_lock();
2382 if (single_hardif) {
2383 if (i_hardif_s == 0) {
2384 if (batadv_iv_ogm_neigh_dump_hardif(msg, portid,
2385 cb->nlh->nlmsg_seq,
2386 bat_priv,
2387 single_hardif,
2388 &idx) == 0)
2389 i_hardif++;
2390 }
2391 } else {
2392 list_for_each_entry_rcu(hard_iface, &batadv_hardif_list,
2393 list) {
2394 if (hard_iface->soft_iface != bat_priv->soft_iface)
2395 continue;
2396
2397 if (i_hardif++ < i_hardif_s)
2398 continue;
2399
2400 if (batadv_iv_ogm_neigh_dump_hardif(msg, portid,
2401 cb->nlh->nlmsg_seq,
2402 bat_priv,
2403 hard_iface, &idx)) {
2404 i_hardif--;
2405 break;
2406 }
2407 }
2408 }
2409 rcu_read_unlock();
2410
2411 cb->args[0] = i_hardif;
2412 cb->args[1] = idx;
2413}
2414
2415/**
2416 * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors
2417 * @neigh1: the first neighbor object of the comparison
2418 * @if_outgoing1: outgoing interface for the first neighbor
2419 * @neigh2: the second neighbor object of the comparison
2420 * @if_outgoing2: outgoing interface for the second neighbor
2421 *
2422 * Return: a value less, equal to or greater than 0 if the metric via neigh1 is
2423 * lower, the same as or higher than the metric via neigh2
2424 */
2425static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1,
2426 struct batadv_hard_iface *if_outgoing1,
2427 struct batadv_neigh_node *neigh2,
2428 struct batadv_hard_iface *if_outgoing2)
2429{
2430 bool ret;
2431 int diff;
2432
2433 ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2,
2434 if_outgoing2, &diff);
2435 if (!ret)
2436 return 0;
2437
2068 return diff; 2438 return diff;
2069} 2439}
2070 2440
@@ -2085,36 +2455,341 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1,
2085 struct batadv_neigh_node *neigh2, 2455 struct batadv_neigh_node *neigh2,
2086 struct batadv_hard_iface *if_outgoing2) 2456 struct batadv_hard_iface *if_outgoing2)
2087{ 2457{
2088 struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo;
2089 u8 tq1, tq2;
2090 bool ret; 2458 bool ret;
2459 int diff;
2091 2460
2092 neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); 2461 ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2,
2093 neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); 2462 if_outgoing2, &diff);
2463 if (!ret)
2464 return false;
2094 2465
2095 /* we can't say that the metric is better */ 2466 ret = diff > -BATADV_TQ_SIMILARITY_THRESHOLD;
2096 if (!neigh1_ifinfo || !neigh2_ifinfo) { 2467 return ret;
2097 ret = false; 2468}
2469
2470static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface)
2471{
2472 /* begin scheduling originator messages on that interface */
2473 batadv_iv_ogm_schedule(hard_iface);
2474}
2475
2476static struct batadv_gw_node *
2477batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
2478{
2479 struct batadv_neigh_node *router;
2480 struct batadv_neigh_ifinfo *router_ifinfo;
2481 struct batadv_gw_node *gw_node, *curr_gw = NULL;
2482 u64 max_gw_factor = 0;
2483 u64 tmp_gw_factor = 0;
2484 u8 max_tq = 0;
2485 u8 tq_avg;
2486 struct batadv_orig_node *orig_node;
2487
2488 rcu_read_lock();
2489 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
2490 orig_node = gw_node->orig_node;
2491 router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
2492 if (!router)
2493 continue;
2494
2495 router_ifinfo = batadv_neigh_ifinfo_get(router,
2496 BATADV_IF_DEFAULT);
2497 if (!router_ifinfo)
2498 goto next;
2499
2500 if (!kref_get_unless_zero(&gw_node->refcount))
2501 goto next;
2502
2503 tq_avg = router_ifinfo->bat_iv.tq_avg;
2504
2505 switch (atomic_read(&bat_priv->gw.sel_class)) {
2506 case 1: /* fast connection */
2507 tmp_gw_factor = tq_avg * tq_avg;
2508 tmp_gw_factor *= gw_node->bandwidth_down;
2509 tmp_gw_factor *= 100 * 100;
2510 tmp_gw_factor >>= 18;
2511
2512 if ((tmp_gw_factor > max_gw_factor) ||
2513 ((tmp_gw_factor == max_gw_factor) &&
2514 (tq_avg > max_tq))) {
2515 if (curr_gw)
2516 batadv_gw_node_put(curr_gw);
2517 curr_gw = gw_node;
2518 kref_get(&curr_gw->refcount);
2519 }
2520 break;
2521
2522 default: /* 2: stable connection (use best statistic)
2523 * 3: fast-switch (use best statistic but change as
2524 * soon as a better gateway appears)
2525 * XX: late-switch (use best statistic but change as
2526 * soon as a better gateway appears which has
2527 * $routing_class more tq points)
2528 */
2529 if (tq_avg > max_tq) {
2530 if (curr_gw)
2531 batadv_gw_node_put(curr_gw);
2532 curr_gw = gw_node;
2533 kref_get(&curr_gw->refcount);
2534 }
2535 break;
2536 }
2537
2538 if (tq_avg > max_tq)
2539 max_tq = tq_avg;
2540
2541 if (tmp_gw_factor > max_gw_factor)
2542 max_gw_factor = tmp_gw_factor;
2543
2544 batadv_gw_node_put(gw_node);
2545
2546next:
2547 batadv_neigh_node_put(router);
2548 if (router_ifinfo)
2549 batadv_neigh_ifinfo_put(router_ifinfo);
2550 }
2551 rcu_read_unlock();
2552
2553 return curr_gw;
2554}
2555
2556static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv,
2557 struct batadv_orig_node *curr_gw_orig,
2558 struct batadv_orig_node *orig_node)
2559{
2560 struct batadv_neigh_ifinfo *router_orig_ifinfo = NULL;
2561 struct batadv_neigh_ifinfo *router_gw_ifinfo = NULL;
2562 struct batadv_neigh_node *router_gw = NULL;
2563 struct batadv_neigh_node *router_orig = NULL;
2564 u8 gw_tq_avg, orig_tq_avg;
2565 bool ret = false;
2566
2567 /* dynamic re-election is performed only on fast or late switch */
2568 if (atomic_read(&bat_priv->gw.sel_class) <= 2)
2569 return false;
2570
2571 router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT);
2572 if (!router_gw) {
2573 ret = true;
2098 goto out; 2574 goto out;
2099 } 2575 }
2100 2576
2101 tq1 = neigh1_ifinfo->bat_iv.tq_avg; 2577 router_gw_ifinfo = batadv_neigh_ifinfo_get(router_gw,
2102 tq2 = neigh2_ifinfo->bat_iv.tq_avg; 2578 BATADV_IF_DEFAULT);
2103 ret = (tq1 - tq2) > -BATADV_TQ_SIMILARITY_THRESHOLD; 2579 if (!router_gw_ifinfo) {
2580 ret = true;
2581 goto out;
2582 }
2583
2584 router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
2585 if (!router_orig)
2586 goto out;
2587
2588 router_orig_ifinfo = batadv_neigh_ifinfo_get(router_orig,
2589 BATADV_IF_DEFAULT);
2590 if (!router_orig_ifinfo)
2591 goto out;
2592
2593 gw_tq_avg = router_gw_ifinfo->bat_iv.tq_avg;
2594 orig_tq_avg = router_orig_ifinfo->bat_iv.tq_avg;
2595
2596 /* the TQ value has to be better */
2597 if (orig_tq_avg < gw_tq_avg)
2598 goto out;
2104 2599
2600 /* if the routing class is greater than 3 the value tells us how much
2601 * greater the TQ value of the new gateway must be
2602 */
2603 if ((atomic_read(&bat_priv->gw.sel_class) > 3) &&
2604 (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class)))
2605 goto out;
2606
2607 batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
2608 "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n",
2609 gw_tq_avg, orig_tq_avg);
2610
2611 ret = true;
2105out: 2612out:
2106 if (neigh1_ifinfo) 2613 if (router_gw_ifinfo)
2107 batadv_neigh_ifinfo_put(neigh1_ifinfo); 2614 batadv_neigh_ifinfo_put(router_gw_ifinfo);
2108 if (neigh2_ifinfo) 2615 if (router_orig_ifinfo)
2109 batadv_neigh_ifinfo_put(neigh2_ifinfo); 2616 batadv_neigh_ifinfo_put(router_orig_ifinfo);
2617 if (router_gw)
2618 batadv_neigh_node_put(router_gw);
2619 if (router_orig)
2620 batadv_neigh_node_put(router_orig);
2110 2621
2111 return ret; 2622 return ret;
2112} 2623}
2113 2624
2114static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) 2625#ifdef CONFIG_BATMAN_ADV_DEBUGFS
2626/* fails if orig_node has no router */
2627static int batadv_iv_gw_write_buffer_text(struct batadv_priv *bat_priv,
2628 struct seq_file *seq,
2629 const struct batadv_gw_node *gw_node)
2115{ 2630{
2116 /* begin scheduling originator messages on that interface */ 2631 struct batadv_gw_node *curr_gw;
2117 batadv_iv_ogm_schedule(hard_iface); 2632 struct batadv_neigh_node *router;
2633 struct batadv_neigh_ifinfo *router_ifinfo = NULL;
2634 int ret = -1;
2635
2636 router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
2637 if (!router)
2638 goto out;
2639
2640 router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
2641 if (!router_ifinfo)
2642 goto out;
2643
2644 curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
2645
2646 seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n",
2647 (curr_gw == gw_node ? "=>" : " "),
2648 gw_node->orig_node->orig,
2649 router_ifinfo->bat_iv.tq_avg, router->addr,
2650 router->if_incoming->net_dev->name,
2651 gw_node->bandwidth_down / 10,
2652 gw_node->bandwidth_down % 10,
2653 gw_node->bandwidth_up / 10,
2654 gw_node->bandwidth_up % 10);
2655 ret = seq_has_overflowed(seq) ? -1 : 0;
2656
2657 if (curr_gw)
2658 batadv_gw_node_put(curr_gw);
2659out:
2660 if (router_ifinfo)
2661 batadv_neigh_ifinfo_put(router_ifinfo);
2662 if (router)
2663 batadv_neigh_node_put(router);
2664 return ret;
2665}
2666
2667static void batadv_iv_gw_print(struct batadv_priv *bat_priv,
2668 struct seq_file *seq)
2669{
2670 struct batadv_gw_node *gw_node;
2671 int gw_count = 0;
2672
2673 seq_puts(seq,
2674 " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
2675
2676 rcu_read_lock();
2677 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
2678 /* fails if orig_node has no router */
2679 if (batadv_iv_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
2680 continue;
2681
2682 gw_count++;
2683 }
2684 rcu_read_unlock();
2685
2686 if (gw_count == 0)
2687 seq_puts(seq, "No gateways in range ...\n");
2688}
2689#endif
2690
2691/**
2692 * batadv_iv_gw_dump_entry - Dump a gateway into a message
2693 * @msg: Netlink message to dump into
2694 * @portid: Port making netlink request
2695 * @seq: Sequence number of netlink message
2696 * @bat_priv: The bat priv with all the soft interface information
2697 * @gw_node: Gateway to be dumped
2698 *
2699 * Return: Error code, or 0 on success
2700 */
2701static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
2702 struct batadv_priv *bat_priv,
2703 struct batadv_gw_node *gw_node)
2704{
2705 struct batadv_neigh_ifinfo *router_ifinfo = NULL;
2706 struct batadv_neigh_node *router;
2707 struct batadv_gw_node *curr_gw;
2708 int ret = -EINVAL;
2709 void *hdr;
2710
2711 router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
2712 if (!router)
2713 goto out;
2714
2715 router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
2716 if (!router_ifinfo)
2717 goto out;
2718
2719 curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
2720
2721 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
2722 NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS);
2723 if (!hdr) {
2724 ret = -ENOBUFS;
2725 goto out;
2726 }
2727
2728 ret = -EMSGSIZE;
2729
2730 if (curr_gw == gw_node)
2731 if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) {
2732 genlmsg_cancel(msg, hdr);
2733 goto out;
2734 }
2735
2736 if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
2737 gw_node->orig_node->orig) ||
2738 nla_put_u8(msg, BATADV_ATTR_TQ, router_ifinfo->bat_iv.tq_avg) ||
2739 nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN,
2740 router->addr) ||
2741 nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
2742 router->if_incoming->net_dev->name) ||
2743 nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN,
2744 gw_node->bandwidth_down) ||
2745 nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP,
2746 gw_node->bandwidth_up)) {
2747 genlmsg_cancel(msg, hdr);
2748 goto out;
2749 }
2750
2751 genlmsg_end(msg, hdr);
2752 ret = 0;
2753
2754out:
2755 if (router_ifinfo)
2756 batadv_neigh_ifinfo_put(router_ifinfo);
2757 if (router)
2758 batadv_neigh_node_put(router);
2759 return ret;
2760}
2761
2762/**
2763 * batadv_iv_gw_dump - Dump gateways into a message
2764 * @msg: Netlink message to dump into
2765 * @cb: Control block containing additional options
2766 * @bat_priv: The bat priv with all the soft interface information
2767 */
2768static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
2769 struct batadv_priv *bat_priv)
2770{
2771 int portid = NETLINK_CB(cb->skb).portid;
2772 struct batadv_gw_node *gw_node;
2773 int idx_skip = cb->args[0];
2774 int idx = 0;
2775
2776 rcu_read_lock();
2777 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
2778 if (idx++ < idx_skip)
2779 continue;
2780
2781 if (batadv_iv_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq,
2782 bat_priv, gw_node)) {
2783 idx_skip = idx - 1;
2784 goto unlock;
2785 }
2786 }
2787
2788 idx_skip = idx;
2789unlock:
2790 rcu_read_unlock();
2791
2792 cb->args[0] = idx_skip;
2118} 2793}
2119 2794
2120static struct batadv_algo_ops batadv_batman_iv __read_mostly = { 2795static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
@@ -2129,14 +2804,28 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
2129 .neigh = { 2804 .neigh = {
2130 .cmp = batadv_iv_ogm_neigh_cmp, 2805 .cmp = batadv_iv_ogm_neigh_cmp,
2131 .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, 2806 .is_similar_or_better = batadv_iv_ogm_neigh_is_sob,
2807#ifdef CONFIG_BATMAN_ADV_DEBUGFS
2132 .print = batadv_iv_neigh_print, 2808 .print = batadv_iv_neigh_print,
2809#endif
2810 .dump = batadv_iv_ogm_neigh_dump,
2133 }, 2811 },
2134 .orig = { 2812 .orig = {
2813#ifdef CONFIG_BATMAN_ADV_DEBUGFS
2135 .print = batadv_iv_ogm_orig_print, 2814 .print = batadv_iv_ogm_orig_print,
2815#endif
2816 .dump = batadv_iv_ogm_orig_dump,
2136 .free = batadv_iv_ogm_orig_free, 2817 .free = batadv_iv_ogm_orig_free,
2137 .add_if = batadv_iv_ogm_orig_add_if, 2818 .add_if = batadv_iv_ogm_orig_add_if,
2138 .del_if = batadv_iv_ogm_orig_del_if, 2819 .del_if = batadv_iv_ogm_orig_del_if,
2139 }, 2820 },
2821 .gw = {
2822 .get_best_gw_node = batadv_iv_gw_get_best_gw_node,
2823 .is_eligible = batadv_iv_gw_is_eligible,
2824#ifdef CONFIG_BATMAN_ADV_DEBUGFS
2825 .print = batadv_iv_gw_print,
2826#endif
2827 .dump = batadv_iv_gw_dump,
2828 },
2140}; 2829};
2141 2830
2142int __init batadv_iv_init(void) 2831int __init batadv_iv_init(void)
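For context on the metric added above: with gateway selection class 1 ("fast connection"), batadv_iv_gw_get_best_gw_node() ranks each gateway by tq_avg squared, multiplied by the advertised downlink bandwidth and by 100 * 100, then shifted right by 18 bits; equal factors are broken by the higher tq_avg. Below is a minimal standalone userspace sketch of that ranking. The gateway names, sample values and helper struct are invented for illustration and are not part of the patch.

/* Illustration of the sel_class 1 gateway factor:
 * factor = tq_avg^2 * bandwidth_down * 100 * 100 >> 18
 * (bandwidth_down in 100 kbit/s units, as batman-adv stores it).
 */
#include <stdint.h>
#include <stdio.h>

struct gw_sample {
        const char *name;   /* hypothetical label */
        uint8_t tq_avg;     /* 0..255 link quality toward the gateway */
        uint32_t bw_down;   /* advertised downlink, 100 kbit/s units */
};

static uint64_t gw_factor(const struct gw_sample *gw)
{
        uint64_t f = (uint64_t)gw->tq_avg * gw->tq_avg;

        f *= gw->bw_down;
        f *= 100 * 100;
        return f >> 18;
}

int main(void)
{
        struct gw_sample gws[] = {
                { "gw-a", 240, 100 },   /* strong link, 10 Mbit/s downlink */
                { "gw-b", 180, 500 },   /* weaker link, 50 Mbit/s downlink */
        };
        const struct gw_sample *best = NULL;
        uint64_t best_factor = 0;
        unsigned int i;

        for (i = 0; i < sizeof(gws) / sizeof(gws[0]); i++) {
                uint64_t f = gw_factor(&gws[i]);

                printf("%s: factor %llu\n", gws[i].name,
                       (unsigned long long)f);
                /* the kernel additionally breaks equal factors by the
                 * higher tq_avg; omitted here for brevity
                 */
                if (!best || f > best_factor) {
                        best = &gws[i];
                        best_factor = f;
                }
        }
        printf("selected: %s\n", best->name);
        return 0;
}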
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 0366cbf5e444..e79f6f01182e 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -21,24 +21,38 @@
21#include <linux/atomic.h> 21#include <linux/atomic.h>
22#include <linux/bug.h> 22#include <linux/bug.h>
23#include <linux/cache.h> 23#include <linux/cache.h>
24#include <linux/errno.h>
25#include <linux/if_ether.h>
24#include <linux/init.h> 26#include <linux/init.h>
25#include <linux/jiffies.h> 27#include <linux/jiffies.h>
28#include <linux/kernel.h>
29#include <linux/kref.h>
26#include <linux/netdevice.h> 30#include <linux/netdevice.h>
31#include <linux/netlink.h>
27#include <linux/rculist.h> 32#include <linux/rculist.h>
28#include <linux/rcupdate.h> 33#include <linux/rcupdate.h>
29#include <linux/seq_file.h> 34#include <linux/seq_file.h>
30#include <linux/stddef.h> 35#include <linux/stddef.h>
31#include <linux/types.h> 36#include <linux/types.h>
32#include <linux/workqueue.h> 37#include <linux/workqueue.h>
38#include <net/genetlink.h>
39#include <net/netlink.h>
40#include <uapi/linux/batman_adv.h>
33 41
34#include "bat_algo.h" 42#include "bat_algo.h"
35#include "bat_v_elp.h" 43#include "bat_v_elp.h"
36#include "bat_v_ogm.h" 44#include "bat_v_ogm.h"
45#include "gateway_client.h"
46#include "gateway_common.h"
37#include "hard-interface.h" 47#include "hard-interface.h"
38#include "hash.h" 48#include "hash.h"
49#include "log.h"
50#include "netlink.h"
39#include "originator.h" 51#include "originator.h"
40#include "packet.h" 52#include "packet.h"
41 53
54struct sk_buff;
55
42static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) 56static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface)
43{ 57{
44 struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); 58 struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
@@ -115,6 +129,7 @@ batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
115 batadv_v_elp_throughput_metric_update); 129 batadv_v_elp_throughput_metric_update);
116} 130}
117 131
132#ifdef CONFIG_BATMAN_ADV_DEBUGFS
118/** 133/**
119 * batadv_v_orig_print_neigh - print neighbors for the originator table 134 * batadv_v_orig_print_neigh - print neighbors for the originator table
120 * @orig_node: the orig_node for which the neighbors are printed 135 * @orig_node: the orig_node for which the neighbors are printed
@@ -198,8 +213,142 @@ static void batadv_v_neigh_print(struct batadv_priv *bat_priv,
198 if (batman_count == 0) 213 if (batman_count == 0)
199 seq_puts(seq, "No batman nodes in range ...\n"); 214 seq_puts(seq, "No batman nodes in range ...\n");
200} 215}
216#endif
217
218/**
219 * batadv_v_neigh_dump_neigh - Dump a neighbour into a message
220 * @msg: Netlink message to dump into
221 * @portid: Port making netlink request
222 * @seq: Sequence number of netlink message
223 * @hardif_neigh: Neighbour to dump
224 *
225 * Return: Error code, or 0 on success
226 */
227static int
228batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
229 struct batadv_hardif_neigh_node *hardif_neigh)
230{
231 void *hdr;
232 unsigned int last_seen_msecs;
233 u32 throughput;
234
235 last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen);
236 throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput);
237 throughput = throughput * 100;
238
239 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
240 BATADV_CMD_GET_NEIGHBORS);
241 if (!hdr)
242 return -ENOBUFS;
243
244 if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
245 hardif_neigh->addr) ||
246 nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
247 hardif_neigh->if_incoming->net_dev->ifindex) ||
248 nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
249 last_seen_msecs) ||
250 nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput))
251 goto nla_put_failure;
252
253 genlmsg_end(msg, hdr);
254 return 0;
255
256 nla_put_failure:
257 genlmsg_cancel(msg, hdr);
258 return -EMSGSIZE;
259}
201 260
202/** 261/**
262 * batadv_v_neigh_dump_hardif - Dump the neighbours of a hard interface into
263 * a message
264 * @msg: Netlink message to dump into
265 * @portid: Port making netlink request
266 * @seq: Sequence number of netlink message
267 * @bat_priv: The bat priv with all the soft interface information
268 * @hard_iface: The hard interface to be dumped
269 * @idx_s: Entries to be skipped
270 *
271 * This function assumes the caller holds rcu_read_lock().
272 *
273 * Return: Error code, or 0 on success
274 */
275static int
276batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
277 struct batadv_priv *bat_priv,
278 struct batadv_hard_iface *hard_iface,
279 int *idx_s)
280{
281 struct batadv_hardif_neigh_node *hardif_neigh;
282 int idx = 0;
283
284 hlist_for_each_entry_rcu(hardif_neigh,
285 &hard_iface->neigh_list, list) {
286 if (idx++ < *idx_s)
287 continue;
288
289 if (batadv_v_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) {
290 *idx_s = idx - 1;
291 return -EMSGSIZE;
292 }
293 }
294
295 *idx_s = 0;
296 return 0;
297}
298
299/**
300 * batadv_v_neigh_dump - Dump the neighbours of a hard interface into a
301 * message
302 * @msg: Netlink message to dump into
303 * @cb: Control block containing additional options
304 * @bat_priv: The bat priv with all the soft interface information
305 * @single_hardif: Limit dumping to this hard interface
306 */
307static void
308batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
309 struct batadv_priv *bat_priv,
310 struct batadv_hard_iface *single_hardif)
311{
312 struct batadv_hard_iface *hard_iface;
313 int i_hardif = 0;
314 int i_hardif_s = cb->args[0];
315 int idx = cb->args[1];
316 int portid = NETLINK_CB(cb->skb).portid;
317
318 rcu_read_lock();
319 if (single_hardif) {
320 if (i_hardif_s == 0) {
321 if (batadv_v_neigh_dump_hardif(msg, portid,
322 cb->nlh->nlmsg_seq,
323 bat_priv, single_hardif,
324 &idx) == 0)
325 i_hardif++;
326 }
327 } else {
328 list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
329 if (hard_iface->soft_iface != bat_priv->soft_iface)
330 continue;
331
332 if (i_hardif++ < i_hardif_s)
333 continue;
334
335 if (batadv_v_neigh_dump_hardif(msg, portid,
336 cb->nlh->nlmsg_seq,
337 bat_priv, hard_iface,
338 &idx)) {
339 i_hardif--;
340 break;
341 }
342 }
343 }
344 rcu_read_unlock();
345
346 cb->args[0] = i_hardif;
347 cb->args[1] = idx;
348}
349
350#ifdef CONFIG_BATMAN_ADV_DEBUGFS
351/**
203 * batadv_v_orig_print - print the originator table 352 * batadv_v_orig_print - print the originator table
204 * @bat_priv: the bat priv with all the soft interface information 353 * @bat_priv: the bat priv with all the soft interface information
205 * @seq: debugfs table seq_file struct 354 * @seq: debugfs table seq_file struct
@@ -265,6 +414,205 @@ next:
265 if (batman_count == 0) 414 if (batman_count == 0)
266 seq_puts(seq, "No batman nodes in range ...\n"); 415 seq_puts(seq, "No batman nodes in range ...\n");
267} 416}
417#endif
418
419/**
420 * batadv_v_orig_dump_subentry - Dump an originator subentry into a
421 * message
422 * @msg: Netlink message to dump into
423 * @portid: Port making netlink request
424 * @seq: Sequence number of netlink message
425 * @bat_priv: The bat priv with all the soft interface information
426 * @if_outgoing: Limit dump to entries with this outgoing interface
427 * @orig_node: Originator to dump
 428 * @neigh_node: Single-hop neighbour
429 * @best: Is the best originator
430 *
431 * Return: Error code, or 0 on success
432 */
433static int
434batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
435 struct batadv_priv *bat_priv,
436 struct batadv_hard_iface *if_outgoing,
437 struct batadv_orig_node *orig_node,
438 struct batadv_neigh_node *neigh_node,
439 bool best)
440{
441 struct batadv_neigh_ifinfo *n_ifinfo;
442 unsigned int last_seen_msecs;
443 u32 throughput;
444 void *hdr;
445
446 n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
447 if (!n_ifinfo)
448 return 0;
449
450 throughput = n_ifinfo->bat_v.throughput * 100;
451
452 batadv_neigh_ifinfo_put(n_ifinfo);
453
454 last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen);
455
456 if (if_outgoing != BATADV_IF_DEFAULT &&
457 if_outgoing != neigh_node->if_incoming)
458 return 0;
459
460 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
461 BATADV_CMD_GET_ORIGINATORS);
462 if (!hdr)
463 return -ENOBUFS;
464
465 if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) ||
466 nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
467 neigh_node->addr) ||
468 nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
469 neigh_node->if_incoming->net_dev->ifindex) ||
470 nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) ||
471 nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
472 last_seen_msecs))
473 goto nla_put_failure;
474
475 if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
476 goto nla_put_failure;
477
478 genlmsg_end(msg, hdr);
479 return 0;
480
481 nla_put_failure:
482 genlmsg_cancel(msg, hdr);
483 return -EMSGSIZE;
484}
485
486/**
487 * batadv_v_orig_dump_entry - Dump an originator entry into a message
488 * @msg: Netlink message to dump into
489 * @portid: Port making netlink request
490 * @seq: Sequence number of netlink message
491 * @bat_priv: The bat priv with all the soft interface information
492 * @if_outgoing: Limit dump to entries with this outgoing interface
493 * @orig_node: Originator to dump
494 * @sub_s: Number of sub entries to skip
495 *
496 * This function assumes the caller holds rcu_read_lock().
497 *
498 * Return: Error code, or 0 on success
499 */
500static int
501batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
502 struct batadv_priv *bat_priv,
503 struct batadv_hard_iface *if_outgoing,
504 struct batadv_orig_node *orig_node, int *sub_s)
505{
506 struct batadv_neigh_node *neigh_node_best;
507 struct batadv_neigh_node *neigh_node;
508 int sub = 0;
509 bool best;
510
511 neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing);
512 if (!neigh_node_best)
513 goto out;
514
515 hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
516 if (sub++ < *sub_s)
517 continue;
518
519 best = (neigh_node == neigh_node_best);
520
521 if (batadv_v_orig_dump_subentry(msg, portid, seq, bat_priv,
522 if_outgoing, orig_node,
523 neigh_node, best)) {
524 batadv_neigh_node_put(neigh_node_best);
525
526 *sub_s = sub - 1;
527 return -EMSGSIZE;
528 }
529 }
530
531 out:
532 if (neigh_node_best)
533 batadv_neigh_node_put(neigh_node_best);
534
535 *sub_s = 0;
536 return 0;
537}
538
539/**
540 * batadv_v_orig_dump_bucket - Dump an originator bucket into a
541 * message
542 * @msg: Netlink message to dump into
543 * @portid: Port making netlink request
544 * @seq: Sequence number of netlink message
545 * @bat_priv: The bat priv with all the soft interface information
546 * @if_outgoing: Limit dump to entries with this outgoing interface
547 * @head: Bucket to be dumped
548 * @idx_s: Number of entries to be skipped
549 * @sub: Number of sub entries to be skipped
550 *
551 * Return: Error code, or 0 on success
552 */
553static int
554batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
555 struct batadv_priv *bat_priv,
556 struct batadv_hard_iface *if_outgoing,
557 struct hlist_head *head, int *idx_s, int *sub)
558{
559 struct batadv_orig_node *orig_node;
560 int idx = 0;
561
562 rcu_read_lock();
563 hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
564 if (idx++ < *idx_s)
565 continue;
566
567 if (batadv_v_orig_dump_entry(msg, portid, seq, bat_priv,
568 if_outgoing, orig_node, sub)) {
569 rcu_read_unlock();
570 *idx_s = idx - 1;
571 return -EMSGSIZE;
572 }
573 }
574 rcu_read_unlock();
575
576 *idx_s = 0;
577 *sub = 0;
578 return 0;
579}
580
581/**
582 * batadv_v_orig_dump - Dump the originators into a message
583 * @msg: Netlink message to dump into
584 * @cb: Control block containing additional options
585 * @bat_priv: The bat priv with all the soft interface information
586 * @if_outgoing: Limit dump to entries with this outgoing interface
587 */
588static void
589batadv_v_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
590 struct batadv_priv *bat_priv,
591 struct batadv_hard_iface *if_outgoing)
592{
593 struct batadv_hashtable *hash = bat_priv->orig_hash;
594 struct hlist_head *head;
595 int bucket = cb->args[0];
596 int idx = cb->args[1];
597 int sub = cb->args[2];
598 int portid = NETLINK_CB(cb->skb).portid;
599
600 while (bucket < hash->size) {
601 head = &hash->table[bucket];
602
603 if (batadv_v_orig_dump_bucket(msg, portid,
604 cb->nlh->nlmsg_seq,
605 bat_priv, if_outgoing, head, &idx,
606 &sub))
607 break;
608
609 bucket++;
610 }
611
612 cb->args[0] = bucket;
613 cb->args[1] = idx;
614 cb->args[2] = sub;
615}
268 616
269static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, 617static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1,
270 struct batadv_hard_iface *if_outgoing1, 618 struct batadv_hard_iface *if_outgoing1,
@@ -320,6 +668,365 @@ err_ifinfo1:
320 return ret; 668 return ret;
321} 669}
322 670
671static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv,
672 char *buff, size_t count)
673{
674 u32 old_class, class;
675
676 if (!batadv_parse_throughput(bat_priv->soft_iface, buff,
677 "B.A.T.M.A.N. V GW selection class",
678 &class))
679 return -EINVAL;
680
681 old_class = atomic_read(&bat_priv->gw.sel_class);
682 atomic_set(&bat_priv->gw.sel_class, class);
683
684 if (old_class != class)
685 batadv_gw_reselect(bat_priv);
686
687 return count;
688}
689
690static ssize_t batadv_v_show_sel_class(struct batadv_priv *bat_priv, char *buff)
691{
692 u32 class = atomic_read(&bat_priv->gw.sel_class);
693
694 return sprintf(buff, "%u.%u MBit\n", class / 10, class % 10);
695}
696
697/**
698 * batadv_v_gw_throughput_get - retrieve the GW-bandwidth for a given GW
699 * @gw_node: the GW to retrieve the metric for
700 * @bw: the pointer where the metric will be stored. The metric is computed as
701 * the minimum between the GW advertised throughput and the path throughput to
702 * it in the mesh
703 *
704 * Return: 0 on success, -1 on failure
705 */
706static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw)
707{
708 struct batadv_neigh_ifinfo *router_ifinfo = NULL;
709 struct batadv_orig_node *orig_node;
710 struct batadv_neigh_node *router;
711 int ret = -1;
712
713 orig_node = gw_node->orig_node;
714 router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
715 if (!router)
716 goto out;
717
718 router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
719 if (!router_ifinfo)
720 goto out;
721
722 /* the GW metric is computed as the minimum between the path throughput
723 * to reach the GW itself and the advertised bandwidth.
724 * This gives us an approximation of the effective throughput that the
725 * client can expect via this particular GW node
726 */
727 *bw = router_ifinfo->bat_v.throughput;
728 *bw = min_t(u32, *bw, gw_node->bandwidth_down);
729
730 ret = 0;
731out:
732 if (router)
733 batadv_neigh_node_put(router);
734 if (router_ifinfo)
735 batadv_neigh_ifinfo_put(router_ifinfo);
736
737 return ret;
738}
739
740/**
741 * batadv_v_gw_get_best_gw_node - retrieve the best GW node
742 * @bat_priv: the bat priv with all the soft interface information
743 *
744 * Return: the GW node having the best GW-metric, NULL if no GW is known
745 */
746static struct batadv_gw_node *
747batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv)
748{
749 struct batadv_gw_node *gw_node, *curr_gw = NULL;
750 u32 max_bw = 0, bw;
751
752 rcu_read_lock();
753 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
754 if (!kref_get_unless_zero(&gw_node->refcount))
755 continue;
756
757 if (batadv_v_gw_throughput_get(gw_node, &bw) < 0)
758 goto next;
759
760 if (curr_gw && (bw <= max_bw))
761 goto next;
762
763 if (curr_gw)
764 batadv_gw_node_put(curr_gw);
765
766 curr_gw = gw_node;
767 kref_get(&curr_gw->refcount);
768 max_bw = bw;
769
770next:
771 batadv_gw_node_put(gw_node);
772 }
773 rcu_read_unlock();
774
775 return curr_gw;
776}
777
778/**
 779 * batadv_v_gw_is_eligible - check if an originator would be selected as GW
780 * @bat_priv: the bat priv with all the soft interface information
781 * @curr_gw_orig: originator representing the currently selected GW
782 * @orig_node: the originator representing the new candidate
783 *
784 * Return: true if orig_node can be selected as current GW, false otherwise
785 */
786static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv,
787 struct batadv_orig_node *curr_gw_orig,
788 struct batadv_orig_node *orig_node)
789{
790 struct batadv_gw_node *curr_gw = NULL, *orig_gw = NULL;
791 u32 gw_throughput, orig_throughput, threshold;
792 bool ret = false;
793
794 threshold = atomic_read(&bat_priv->gw.sel_class);
795
796 curr_gw = batadv_gw_node_get(bat_priv, curr_gw_orig);
797 if (!curr_gw) {
798 ret = true;
799 goto out;
800 }
801
802 if (batadv_v_gw_throughput_get(curr_gw, &gw_throughput) < 0) {
803 ret = true;
804 goto out;
805 }
806
807 orig_gw = batadv_gw_node_get(bat_priv, orig_node);
 808 	if (!orig_gw)
809 goto out;
810
811 if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0)
812 goto out;
813
814 if (orig_throughput < gw_throughput)
815 goto out;
816
817 if ((orig_throughput - gw_throughput) < threshold)
818 goto out;
819
820 batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
821 "Restarting gateway selection: better gateway found (throughput curr: %u, throughput new: %u)\n",
822 gw_throughput, orig_throughput);
823
824 ret = true;
825out:
826 if (curr_gw)
827 batadv_gw_node_put(curr_gw);
828 if (orig_gw)
829 batadv_gw_node_put(orig_gw);
830
831 return ret;
832}
833
834#ifdef CONFIG_BATMAN_ADV_DEBUGFS
835/* fails if orig_node has no router */
836static int batadv_v_gw_write_buffer_text(struct batadv_priv *bat_priv,
837 struct seq_file *seq,
838 const struct batadv_gw_node *gw_node)
839{
840 struct batadv_gw_node *curr_gw;
841 struct batadv_neigh_node *router;
842 struct batadv_neigh_ifinfo *router_ifinfo = NULL;
843 int ret = -1;
844
845 router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
846 if (!router)
847 goto out;
848
849 router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
850 if (!router_ifinfo)
851 goto out;
852
853 curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
854
855 seq_printf(seq, "%s %pM (%9u.%1u) %pM [%10s]: %u.%u/%u.%u MBit\n",
856 (curr_gw == gw_node ? "=>" : " "),
857 gw_node->orig_node->orig,
858 router_ifinfo->bat_v.throughput / 10,
859 router_ifinfo->bat_v.throughput % 10, router->addr,
860 router->if_incoming->net_dev->name,
861 gw_node->bandwidth_down / 10,
862 gw_node->bandwidth_down % 10,
863 gw_node->bandwidth_up / 10,
864 gw_node->bandwidth_up % 10);
865 ret = seq_has_overflowed(seq) ? -1 : 0;
866
867 if (curr_gw)
868 batadv_gw_node_put(curr_gw);
869out:
870 if (router_ifinfo)
871 batadv_neigh_ifinfo_put(router_ifinfo);
872 if (router)
873 batadv_neigh_node_put(router);
874 return ret;
875}
876
877/**
878 * batadv_v_gw_print - print the gateway list
879 * @bat_priv: the bat priv with all the soft interface information
880 * @seq: gateway table seq_file struct
881 */
882static void batadv_v_gw_print(struct batadv_priv *bat_priv,
883 struct seq_file *seq)
884{
885 struct batadv_gw_node *gw_node;
886 int gw_count = 0;
887
888 seq_puts(seq,
889 " Gateway ( throughput) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
890
891 rcu_read_lock();
892 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
893 /* fails if orig_node has no router */
894 if (batadv_v_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
895 continue;
896
897 gw_count++;
898 }
899 rcu_read_unlock();
900
901 if (gw_count == 0)
902 seq_puts(seq, "No gateways in range ...\n");
903}
904#endif
905
906/**
907 * batadv_v_gw_dump_entry - Dump a gateway into a message
908 * @msg: Netlink message to dump into
909 * @portid: Port making netlink request
910 * @seq: Sequence number of netlink message
911 * @bat_priv: The bat priv with all the soft interface information
912 * @gw_node: Gateway to be dumped
913 *
914 * Return: Error code, or 0 on success
915 */
916static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
917 struct batadv_priv *bat_priv,
918 struct batadv_gw_node *gw_node)
919{
920 struct batadv_neigh_ifinfo *router_ifinfo = NULL;
921 struct batadv_neigh_node *router;
922 struct batadv_gw_node *curr_gw;
923 int ret = -EINVAL;
924 void *hdr;
925
926 router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
927 if (!router)
928 goto out;
929
930 router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
931 if (!router_ifinfo)
932 goto out;
933
934 curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
935
936 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
937 NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS);
938 if (!hdr) {
939 ret = -ENOBUFS;
940 goto out;
941 }
942
943 ret = -EMSGSIZE;
944
945 if (curr_gw == gw_node) {
946 if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) {
947 genlmsg_cancel(msg, hdr);
948 goto out;
949 }
950 }
951
952 if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
953 gw_node->orig_node->orig)) {
954 genlmsg_cancel(msg, hdr);
955 goto out;
956 }
957
958 if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT,
959 router_ifinfo->bat_v.throughput)) {
960 genlmsg_cancel(msg, hdr);
961 goto out;
962 }
963
964 if (nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr)) {
965 genlmsg_cancel(msg, hdr);
966 goto out;
967 }
968
969 if (nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
970 router->if_incoming->net_dev->name)) {
971 genlmsg_cancel(msg, hdr);
972 goto out;
973 }
974
975 if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN,
976 gw_node->bandwidth_down)) {
977 genlmsg_cancel(msg, hdr);
978 goto out;
979 }
980
981 if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) {
982 genlmsg_cancel(msg, hdr);
983 goto out;
984 }
985
986 genlmsg_end(msg, hdr);
987 ret = 0;
988
989out:
990 if (router_ifinfo)
991 batadv_neigh_ifinfo_put(router_ifinfo);
992 if (router)
993 batadv_neigh_node_put(router);
994 return ret;
995}
996
997/**
998 * batadv_v_gw_dump - Dump gateways into a message
999 * @msg: Netlink message to dump into
1000 * @cb: Control block containing additional options
1001 * @bat_priv: The bat priv with all the soft interface information
1002 */
1003static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
1004 struct batadv_priv *bat_priv)
1005{
1006 int portid = NETLINK_CB(cb->skb).portid;
1007 struct batadv_gw_node *gw_node;
1008 int idx_skip = cb->args[0];
1009 int idx = 0;
1010
1011 rcu_read_lock();
1012 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
1013 if (idx++ < idx_skip)
1014 continue;
1015
1016 if (batadv_v_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq,
1017 bat_priv, gw_node)) {
1018 idx_skip = idx - 1;
1019 goto unlock;
1020 }
1021 }
1022
1023 idx_skip = idx;
1024unlock:
1025 rcu_read_unlock();
1026
1027 cb->args[0] = idx_skip;
1028}
1029
323static struct batadv_algo_ops batadv_batman_v __read_mostly = { 1030static struct batadv_algo_ops batadv_batman_v __read_mostly = {
324 .name = "BATMAN_V", 1031 .name = "BATMAN_V",
325 .iface = { 1032 .iface = {
@@ -333,10 +1040,26 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
333 .hardif_init = batadv_v_hardif_neigh_init, 1040 .hardif_init = batadv_v_hardif_neigh_init,
334 .cmp = batadv_v_neigh_cmp, 1041 .cmp = batadv_v_neigh_cmp,
335 .is_similar_or_better = batadv_v_neigh_is_sob, 1042 .is_similar_or_better = batadv_v_neigh_is_sob,
1043#ifdef CONFIG_BATMAN_ADV_DEBUGFS
336 .print = batadv_v_neigh_print, 1044 .print = batadv_v_neigh_print,
1045#endif
1046 .dump = batadv_v_neigh_dump,
337 }, 1047 },
338 .orig = { 1048 .orig = {
1049#ifdef CONFIG_BATMAN_ADV_DEBUGFS
339 .print = batadv_v_orig_print, 1050 .print = batadv_v_orig_print,
1051#endif
1052 .dump = batadv_v_orig_dump,
1053 },
1054 .gw = {
1055 .store_sel_class = batadv_v_store_sel_class,
1056 .show_sel_class = batadv_v_show_sel_class,
1057 .get_best_gw_node = batadv_v_gw_get_best_gw_node,
1058 .is_eligible = batadv_v_gw_is_eligible,
1059#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1060 .print = batadv_v_gw_print,
1061#endif
1062 .dump = batadv_v_gw_dump,
340 }, 1063 },
341}; 1064};
342 1065
@@ -363,7 +1086,16 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface)
363 */ 1086 */
364int batadv_v_mesh_init(struct batadv_priv *bat_priv) 1087int batadv_v_mesh_init(struct batadv_priv *bat_priv)
365{ 1088{
366 return batadv_v_ogm_init(bat_priv); 1089 int ret = 0;
1090
1091 ret = batadv_v_ogm_init(bat_priv);
1092 if (ret < 0)
1093 return ret;
1094
1095 /* set default throughput difference threshold to 5Mbps */
1096 atomic_set(&bat_priv->gw.sel_class, 50);
1097
1098 return 0;
367} 1099}
368 1100
369/** 1101/**
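For reference, the B.A.T.M.A.N. V gateway handling added above computes a gateway's effective bandwidth as the minimum of the mesh path throughput toward it and the bandwidth it advertises, and a currently selected gateway is only dropped when a candidate beats that value by at least the sel_class threshold (50, i.e. 5 Mbit/s, by default per batadv_v_mesh_init()). A small userspace sketch of the arithmetic follows; the sample values are invented for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* throughput values in 100 kbit/s units, as batman-adv stores them */
static uint32_t gw_effective_bw(uint32_t path_throughput, uint32_t advertised_down)
{
        return path_throughput < advertised_down ?
               path_throughput : advertised_down;
}

static bool gw_is_eligible(uint32_t curr_bw, uint32_t cand_bw, uint32_t threshold)
{
        /* the candidate must beat the current gateway by at least the
         * configured threshold before a reselection is triggered
         */
        return cand_bw >= curr_bw && (cand_bw - curr_bw) >= threshold;
}

int main(void)
{
        /* 20 Mbit/s path toward a gateway advertising 100 Mbit/s */
        uint32_t curr = gw_effective_bw(200, 1000);
        /* 30 Mbit/s path toward a gateway advertising 25 Mbit/s */
        uint32_t cand = gw_effective_bw(300, 250);

        printf("current %u.%u MBit, candidate %u.%u MBit, switch: %s\n",
               curr / 10, curr % 10, cand / 10, cand % 10,
               gw_is_eligible(curr, cand, 50) ? "yes" : "no");
        return 0;
}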
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 6fbba4eb0617..1aeeadca620c 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -73,13 +73,12 @@ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
73 if (!orig_node) 73 if (!orig_node)
74 return NULL; 74 return NULL;
75 75
76 kref_get(&orig_node->refcount);
76 hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, 77 hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
77 batadv_choose_orig, orig_node, 78 batadv_choose_orig, orig_node,
78 &orig_node->hash_entry); 79 &orig_node->hash_entry);
79 if (hash_added != 0) { 80 if (hash_added != 0) {
80 /* orig_node->refcounter is initialised to 2 by 81 /* remove refcnt for newly created orig_node and hash entry */
81 * batadv_orig_node_new()
82 */
83 batadv_orig_node_put(orig_node); 82 batadv_orig_node_put(orig_node);
84 batadv_orig_node_put(orig_node); 83 batadv_orig_node_put(orig_node);
85 orig_node = NULL; 84 orig_node = NULL;
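Several hunks in this series (here, and in distributed-arp-table.c and bridge_loop_avoidance.c further down) converge on the same reference-count convention: kref_init() leaves exactly one reference for the caller, an extra kref_get() is taken right before the object is handed to the hash table so the hash owns its own reference, and a failed insertion is unwound with two puts. A simplified userspace sketch of that pattern; the struct, helpers and failure flag are stand-ins, not the kernel API.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        int refcount;   /* stand-in for struct kref */
};

static struct obj *obj_new(void)
{
        struct obj *o = calloc(1, sizeof(*o));

        if (o)
                o->refcount = 1;        /* kref_init(): caller's reference */
        return o;
}

static void obj_get(struct obj *o)
{
        o->refcount++;                  /* kref_get() */
}

static void obj_put(struct obj *o)
{
        if (--o->refcount == 0) {       /* kref_put() with release callback */
                printf("released\n");
                free(o);
        }
}

/* pretend the insertion can fail, e.g. because an equal entry exists */
static int hash_add(struct obj *o, bool fail)
{
        (void)o;
        return fail ? -1 : 0;
}

int main(void)
{
        struct obj *o = obj_new();

        if (!o)
                return 1;

        obj_get(o);                     /* reference owned by the hash */
        if (hash_add(o, true)) {
                /* drop the hash reference and the caller reference, like
                 * the two batadv_orig_node_put() calls above
                 */
                obj_put(o);
                obj_put(o);
                return 1;
        }

        obj_put(o);                     /* caller done with the object */
        return 0;
}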
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index ad2ffe16d29f..e7f690b571ea 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -35,6 +35,7 @@
35#include <linux/list.h> 35#include <linux/list.h>
36#include <linux/lockdep.h> 36#include <linux/lockdep.h>
37#include <linux/netdevice.h> 37#include <linux/netdevice.h>
38#include <linux/netlink.h>
38#include <linux/rculist.h> 39#include <linux/rculist.h>
39#include <linux/rcupdate.h> 40#include <linux/rcupdate.h>
40#include <linux/seq_file.h> 41#include <linux/seq_file.h>
@@ -45,12 +46,18 @@
45#include <linux/string.h> 46#include <linux/string.h>
46#include <linux/workqueue.h> 47#include <linux/workqueue.h>
47#include <net/arp.h> 48#include <net/arp.h>
49#include <net/genetlink.h>
50#include <net/netlink.h>
51#include <net/sock.h>
52#include <uapi/linux/batman_adv.h>
48 53
49#include "hard-interface.h" 54#include "hard-interface.h"
50#include "hash.h" 55#include "hash.h"
51#include "log.h" 56#include "log.h"
57#include "netlink.h"
52#include "originator.h" 58#include "originator.h"
53#include "packet.h" 59#include "packet.h"
60#include "soft-interface.h"
54#include "sysfs.h" 61#include "sysfs.h"
55#include "translation-table.h" 62#include "translation-table.h"
56 63
@@ -519,11 +526,9 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
519 atomic_set(&entry->wait_periods, 0); 526 atomic_set(&entry->wait_periods, 0);
520 ether_addr_copy(entry->orig, orig); 527 ether_addr_copy(entry->orig, orig);
521 INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report); 528 INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report);
522
523 /* one for the hash, one for returning */
524 kref_init(&entry->refcount); 529 kref_init(&entry->refcount);
525 kref_get(&entry->refcount);
526 530
531 kref_get(&entry->refcount);
527 hash_added = batadv_hash_add(bat_priv->bla.backbone_hash, 532 hash_added = batadv_hash_add(bat_priv->bla.backbone_hash,
528 batadv_compare_backbone_gw, 533 batadv_compare_backbone_gw,
529 batadv_choose_backbone_gw, entry, 534 batadv_choose_backbone_gw, entry,
@@ -711,12 +716,13 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
711 claim->lasttime = jiffies; 716 claim->lasttime = jiffies;
712 kref_get(&backbone_gw->refcount); 717 kref_get(&backbone_gw->refcount);
713 claim->backbone_gw = backbone_gw; 718 claim->backbone_gw = backbone_gw;
714
715 kref_init(&claim->refcount); 719 kref_init(&claim->refcount);
716 kref_get(&claim->refcount); 720
717 batadv_dbg(BATADV_DBG_BLA, bat_priv, 721 batadv_dbg(BATADV_DBG_BLA, bat_priv,
718 "bla_add_claim(): adding new entry %pM, vid %d to hash ...\n", 722 "bla_add_claim(): adding new entry %pM, vid %d to hash ...\n",
719 mac, BATADV_PRINT_VID(vid)); 723 mac, BATADV_PRINT_VID(vid));
724
725 kref_get(&claim->refcount);
720 hash_added = batadv_hash_add(bat_priv->bla.claim_hash, 726 hash_added = batadv_hash_add(bat_priv->bla.claim_hash,
721 batadv_compare_claim, 727 batadv_compare_claim,
722 batadv_choose_claim, claim, 728 batadv_choose_claim, claim,
@@ -1148,7 +1154,7 @@ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv,
1148 1154
1149 /* Let the loopdetect frames on the mesh in any case. */ 1155 /* Let the loopdetect frames on the mesh in any case. */
1150 if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT) 1156 if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT)
1151 return 0; 1157 return false;
1152 1158
1153 /* check if it is a claim frame. */ 1159 /* check if it is a claim frame. */
1154 ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst, 1160 ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst,
@@ -1990,6 +1996,7 @@ out:
1990 return ret; 1996 return ret;
1991} 1997}
1992 1998
1999#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1993/** 2000/**
1994 * batadv_bla_claim_table_seq_print_text - print the claim table in a seq file 2001 * batadv_bla_claim_table_seq_print_text - print the claim table in a seq file
1995 * @seq: seq file to print on 2002 * @seq: seq file to print on
@@ -2050,8 +2057,172 @@ out:
2050 batadv_hardif_put(primary_if); 2057 batadv_hardif_put(primary_if);
2051 return 0; 2058 return 0;
2052} 2059}
2060#endif
2061
2062/**
2063 * batadv_bla_claim_dump_entry - dump one entry of the claim table
2064 * to a netlink socket
2065 * @msg: buffer for the message
2066 * @portid: netlink port
2067 * @seq: Sequence number of netlink message
2068 * @primary_if: primary interface
2069 * @claim: entry to dump
2070 *
2071 * Return: 0 or error code.
2072 */
2073static int
2074batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
2075 struct batadv_hard_iface *primary_if,
2076 struct batadv_bla_claim *claim)
2077{
2078 u8 *primary_addr = primary_if->net_dev->dev_addr;
2079 u16 backbone_crc;
2080 bool is_own;
2081 void *hdr;
2082 int ret = -EINVAL;
2083
2084 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
2085 NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM);
2086 if (!hdr) {
2087 ret = -ENOBUFS;
2088 goto out;
2089 }
2090
2091 is_own = batadv_compare_eth(claim->backbone_gw->orig,
2092 primary_addr);
2093
2094 spin_lock_bh(&claim->backbone_gw->crc_lock);
2095 backbone_crc = claim->backbone_gw->crc;
2096 spin_unlock_bh(&claim->backbone_gw->crc_lock);
2097
2098 if (is_own)
2099 if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) {
2100 genlmsg_cancel(msg, hdr);
2101 goto out;
2102 }
2103
2104 if (nla_put(msg, BATADV_ATTR_BLA_ADDRESS, ETH_ALEN, claim->addr) ||
2105 nla_put_u16(msg, BATADV_ATTR_BLA_VID, claim->vid) ||
2106 nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN,
2107 claim->backbone_gw->orig) ||
2108 nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
2109 backbone_crc)) {
2110 genlmsg_cancel(msg, hdr);
2111 goto out;
2112 }
2113
2114 genlmsg_end(msg, hdr);
2115 ret = 0;
2116
2117out:
2118 return ret;
2119}
2120
2121/**
2122 * batadv_bla_claim_dump_bucket - dump one bucket of the claim table
2123 * to a netlink socket
2124 * @msg: buffer for the message
2125 * @portid: netlink port
2126 * @seq: Sequence number of netlink message
2127 * @primary_if: primary interface
2128 * @head: bucket to dump
2129 * @idx_skip: How many entries to skip
2130 *
2131 * Return: always 0.
2132 */
2133static int
2134batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
2135 struct batadv_hard_iface *primary_if,
2136 struct hlist_head *head, int *idx_skip)
2137{
2138 struct batadv_bla_claim *claim;
2139 int idx = 0;
2140
2141 rcu_read_lock();
2142 hlist_for_each_entry_rcu(claim, head, hash_entry) {
2143 if (idx++ < *idx_skip)
2144 continue;
2145 if (batadv_bla_claim_dump_entry(msg, portid, seq,
2146 primary_if, claim)) {
2147 *idx_skip = idx - 1;
2148 goto unlock;
2149 }
2150 }
2151
2152 *idx_skip = idx;
2153unlock:
2154 rcu_read_unlock();
2155 return 0;
2156}
2053 2157
2054/** 2158/**
2159 * batadv_bla_claim_dump - dump claim table to a netlink socket
2160 * @msg: buffer for the message
2161 * @cb: callback structure containing arguments
2162 *
2163 * Return: message length.
2164 */
2165int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb)
2166{
2167 struct batadv_hard_iface *primary_if = NULL;
2168 int portid = NETLINK_CB(cb->skb).portid;
2169 struct net *net = sock_net(cb->skb->sk);
2170 struct net_device *soft_iface;
2171 struct batadv_hashtable *hash;
2172 struct batadv_priv *bat_priv;
2173 int bucket = cb->args[0];
2174 struct hlist_head *head;
2175 int idx = cb->args[1];
2176 int ifindex;
2177 int ret = 0;
2178
2179 ifindex = batadv_netlink_get_ifindex(cb->nlh,
2180 BATADV_ATTR_MESH_IFINDEX);
2181 if (!ifindex)
2182 return -EINVAL;
2183
2184 soft_iface = dev_get_by_index(net, ifindex);
2185 if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
2186 ret = -ENODEV;
2187 goto out;
2188 }
2189
2190 bat_priv = netdev_priv(soft_iface);
2191 hash = bat_priv->bla.claim_hash;
2192
2193 primary_if = batadv_primary_if_get_selected(bat_priv);
2194 if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
2195 ret = -ENOENT;
2196 goto out;
2197 }
2198
2199 while (bucket < hash->size) {
2200 head = &hash->table[bucket];
2201
2202 if (batadv_bla_claim_dump_bucket(msg, portid,
2203 cb->nlh->nlmsg_seq,
2204 primary_if, head, &idx))
2205 break;
2206 bucket++;
2207 }
2208
2209 cb->args[0] = bucket;
2210 cb->args[1] = idx;
2211
2212 ret = msg->len;
2213
2214out:
2215 if (primary_if)
2216 batadv_hardif_put(primary_if);
2217
2218 if (soft_iface)
2219 dev_put(soft_iface);
2220
2221 return ret;
2222}
2223
2224#ifdef CONFIG_BATMAN_ADV_DEBUGFS
2225/**
2055 * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq 2226 * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq
2056 * file 2227 * file
2057 * @seq: seq file to print on 2228 * @seq: seq file to print on
@@ -2114,3 +2285,168 @@ out:
2114 batadv_hardif_put(primary_if); 2285 batadv_hardif_put(primary_if);
2115 return 0; 2286 return 0;
2116} 2287}
2288#endif
2289
2290/**
2291 * batadv_bla_backbone_dump_entry - dump one entry of the backbone table
2292 * to a netlink socket
2293 * @msg: buffer for the message
2294 * @portid: netlink port
2295 * @seq: Sequence number of netlink message
2296 * @primary_if: primary interface
2297 * @backbone_gw: entry to dump
2298 *
2299 * Return: 0 or error code.
2300 */
2301static int
2302batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
2303 struct batadv_hard_iface *primary_if,
2304 struct batadv_bla_backbone_gw *backbone_gw)
2305{
2306 u8 *primary_addr = primary_if->net_dev->dev_addr;
2307 u16 backbone_crc;
2308 bool is_own;
2309 int msecs;
2310 void *hdr;
2311 int ret = -EINVAL;
2312
2313 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
2314 NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE);
2315 if (!hdr) {
2316 ret = -ENOBUFS;
2317 goto out;
2318 }
2319
2320 is_own = batadv_compare_eth(backbone_gw->orig, primary_addr);
2321
2322 spin_lock_bh(&backbone_gw->crc_lock);
2323 backbone_crc = backbone_gw->crc;
2324 spin_unlock_bh(&backbone_gw->crc_lock);
2325
2326 msecs = jiffies_to_msecs(jiffies - backbone_gw->lasttime);
2327
2328 if (is_own)
2329 if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) {
2330 genlmsg_cancel(msg, hdr);
2331 goto out;
2332 }
2333
2334 if (nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN,
2335 backbone_gw->orig) ||
2336 nla_put_u16(msg, BATADV_ATTR_BLA_VID, backbone_gw->vid) ||
2337 nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
2338 backbone_crc) ||
2339 nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) {
2340 genlmsg_cancel(msg, hdr);
2341 goto out;
2342 }
2343
2344 genlmsg_end(msg, hdr);
2345 ret = 0;
2346
2347out:
2348 return ret;
2349}
2350
2351/**
2352 * batadv_bla_backbone_dump_bucket - dump one bucket of the backbone table
2353 * to a netlink socket
2354 * @msg: buffer for the message
2355 * @portid: netlink port
2356 * @seq: Sequence number of netlink message
2357 * @primary_if: primary interface
2358 * @head: bucket to dump
2359 * @idx_skip: How many entries to skip
2360 *
2361 * Return: always 0.
2362 */
2363static int
2364batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
2365 struct batadv_hard_iface *primary_if,
2366 struct hlist_head *head, int *idx_skip)
2367{
2368 struct batadv_bla_backbone_gw *backbone_gw;
2369 int idx = 0;
2370
2371 rcu_read_lock();
2372 hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
2373 if (idx++ < *idx_skip)
2374 continue;
2375 if (batadv_bla_backbone_dump_entry(msg, portid, seq,
2376 primary_if, backbone_gw)) {
2377 *idx_skip = idx - 1;
2378 goto unlock;
2379 }
2380 }
2381
2382 *idx_skip = idx;
2383unlock:
2384 rcu_read_unlock();
2385 return 0;
2386}
2387
2388/**
2389 * batadv_bla_backbone_dump - dump backbone table to a netlink socket
2390 * @msg: buffer for the message
2391 * @cb: callback structure containing arguments
2392 *
2393 * Return: message length.
2394 */
2395int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb)
2396{
2397 struct batadv_hard_iface *primary_if = NULL;
2398 int portid = NETLINK_CB(cb->skb).portid;
2399 struct net *net = sock_net(cb->skb->sk);
2400 struct net_device *soft_iface;
2401 struct batadv_hashtable *hash;
2402 struct batadv_priv *bat_priv;
2403 int bucket = cb->args[0];
2404 struct hlist_head *head;
2405 int idx = cb->args[1];
2406 int ifindex;
2407 int ret = 0;
2408
2409 ifindex = batadv_netlink_get_ifindex(cb->nlh,
2410 BATADV_ATTR_MESH_IFINDEX);
2411 if (!ifindex)
2412 return -EINVAL;
2413
2414 soft_iface = dev_get_by_index(net, ifindex);
2415 if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
2416 ret = -ENODEV;
2417 goto out;
2418 }
2419
2420 bat_priv = netdev_priv(soft_iface);
2421 hash = bat_priv->bla.backbone_hash;
2422
2423 primary_if = batadv_primary_if_get_selected(bat_priv);
2424 if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
2425 ret = -ENOENT;
2426 goto out;
2427 }
2428
2429 while (bucket < hash->size) {
2430 head = &hash->table[bucket];
2431
2432 if (batadv_bla_backbone_dump_bucket(msg, portid,
2433 cb->nlh->nlmsg_seq,
2434 primary_if, head, &idx))
2435 break;
2436 bucket++;
2437 }
2438
2439 cb->args[0] = bucket;
2440 cb->args[1] = idx;
2441
2442 ret = msg->len;
2443
2444out:
2445 if (primary_if)
2446 batadv_hardif_put(primary_if);
2447
2448 if (soft_iface)
2449 dev_put(soft_iface);
2450
2451 return ret;
2452}
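The claim and backbone dump callbacks above follow the same resumable-iteration pattern as the other netlink dumps in this series: the netlink core invokes the dump function repeatedly, each pass fills one bounded message, and cb->args[] stores the bucket plus the in-bucket index of the first entry that did not fit so the next pass resumes there. A self-contained userspace sketch of that control flow; the bucket count, capacity and printf stand in for the real hash walk and message building.

#include <stdio.h>

#define BUCKETS         3
#define PER_BUCKET      4
#define MSG_CAPACITY    5       /* entries that fit into one "message" */

static int dump_bucket(int bucket, int *idx_skip, int *used)
{
        int idx;

        for (idx = 0; idx < PER_BUCKET; idx++) {
                if (idx < *idx_skip)
                        continue;
                if (*used == MSG_CAPACITY) {
                        *idx_skip = idx;        /* resume here next pass */
                        return -1;
                }
                printf("  entry %d.%d\n", bucket, idx);
                (*used)++;
        }

        *idx_skip = 0;
        return 0;
}

int main(void)
{
        int args[2] = { 0, 0 };         /* like cb->args[0] and cb->args[1] */
        int pass = 0;

        while (args[0] < BUCKETS) {
                int used = 0;

                printf("message %d:\n", ++pass);
                while (args[0] < BUCKETS) {
                        if (dump_bucket(args[0], &args[1], &used))
                                break;  /* message full, continue later */
                        args[0]++;
                }
        }
        return 0;
}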
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 0f01daeb359e..1ae93e46fb98 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -23,6 +23,7 @@
23#include <linux/types.h> 23#include <linux/types.h>
24 24
25struct net_device; 25struct net_device;
26struct netlink_callback;
26struct seq_file; 27struct seq_file;
27struct sk_buff; 28struct sk_buff;
28 29
@@ -35,8 +36,10 @@ bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
35 struct batadv_orig_node *orig_node, 36 struct batadv_orig_node *orig_node,
36 int hdr_size); 37 int hdr_size);
37int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset); 38int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset);
39int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb);
38int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, 40int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
39 void *offset); 41 void *offset);
42int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb);
40bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, 43bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
41 unsigned short vid); 44 unsigned short vid);
42bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, 45bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
@@ -47,7 +50,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
47void batadv_bla_status_update(struct net_device *net_dev); 50void batadv_bla_status_update(struct net_device *net_dev);
48int batadv_bla_init(struct batadv_priv *bat_priv); 51int batadv_bla_init(struct batadv_priv *bat_priv);
49void batadv_bla_free(struct batadv_priv *bat_priv); 52void batadv_bla_free(struct batadv_priv *bat_priv);
50 53int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb);
51#define BATADV_BLA_CRC_INIT 0 54#define BATADV_BLA_CRC_INIT 0
52#else /* ifdef CONFIG_BATMAN_ADV_BLA */ 55#else /* ifdef CONFIG_BATMAN_ADV_BLA */
53 56
@@ -112,6 +115,18 @@ static inline void batadv_bla_free(struct batadv_priv *bat_priv)
112{ 115{
113} 116}
114 117
118static inline int batadv_bla_claim_dump(struct sk_buff *msg,
119 struct netlink_callback *cb)
120{
121 return -EOPNOTSUPP;
122}
123
124static inline int batadv_bla_backbone_dump(struct sk_buff *msg,
125 struct netlink_callback *cb)
126{
127 return -EOPNOTSUPP;
128}
129
115#endif /* ifdef CONFIG_BATMAN_ADV_BLA */ 130#endif /* ifdef CONFIG_BATMAN_ADV_BLA */
116 131
117#endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */ 132#endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */
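The header above keeps batman-adv's usual pattern for optional features: when CONFIG_BATMAN_ADV_BLA is disabled, callers still compile and link against static inline stubs that return -EOPNOTSUPP, so no call site needs its own #ifdef. A minimal userspace illustration of that idiom; the FEATURE_ENABLED macro and feature_dump() name are invented for this example.

#include <errno.h>
#include <stdio.h>

/* #define FEATURE_ENABLED 1 */

#ifdef FEATURE_ENABLED
static inline int feature_dump(void)
{
        /* a real implementation would build and send the dump here */
        return 0;
}
#else
static inline int feature_dump(void)
{
        return -EOPNOTSUPP;     /* callers need no #ifdef of their own */
}
#endif

int main(void)
{
        int ret = feature_dump();

        if (ret == -EOPNOTSUPP)
                printf("feature compiled out\n");
        return 0;
}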
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 1d68b6e63b96..b4ffba7dd583 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -31,6 +31,7 @@
31#include <linux/stddef.h> 31#include <linux/stddef.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/sysfs.h> 33#include <linux/sysfs.h>
34#include <net/net_namespace.h>
34 35
35#include "bat_algo.h" 36#include "bat_algo.h"
36#include "bridge_loop_avoidance.h" 37#include "bridge_loop_avoidance.h"
@@ -305,12 +306,16 @@ void batadv_debugfs_destroy(void)
305 */ 306 */
306int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) 307int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
307{ 308{
309 struct net *net = dev_net(hard_iface->net_dev);
308 struct batadv_debuginfo **bat_debug; 310 struct batadv_debuginfo **bat_debug;
309 struct dentry *file; 311 struct dentry *file;
310 312
311 if (!batadv_debugfs) 313 if (!batadv_debugfs)
312 goto out; 314 goto out;
313 315
316 if (net != &init_net)
317 return 0;
318
314 hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name, 319 hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name,
315 batadv_debugfs); 320 batadv_debugfs);
316 if (!hard_iface->debug_dir) 321 if (!hard_iface->debug_dir)
@@ -341,6 +346,11 @@ out:
341 */ 346 */
342void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) 347void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
343{ 348{
349 struct net *net = dev_net(hard_iface->net_dev);
350
351 if (net != &init_net)
352 return;
353
344 if (batadv_debugfs) { 354 if (batadv_debugfs) {
345 debugfs_remove_recursive(hard_iface->debug_dir); 355 debugfs_remove_recursive(hard_iface->debug_dir);
346 hard_iface->debug_dir = NULL; 356 hard_iface->debug_dir = NULL;
@@ -351,11 +361,15 @@ int batadv_debugfs_add_meshif(struct net_device *dev)
351{ 361{
352 struct batadv_priv *bat_priv = netdev_priv(dev); 362 struct batadv_priv *bat_priv = netdev_priv(dev);
353 struct batadv_debuginfo **bat_debug; 363 struct batadv_debuginfo **bat_debug;
364 struct net *net = dev_net(dev);
354 struct dentry *file; 365 struct dentry *file;
355 366
356 if (!batadv_debugfs) 367 if (!batadv_debugfs)
357 goto out; 368 goto out;
358 369
370 if (net != &init_net)
371 return 0;
372
359 bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs); 373 bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs);
360 if (!bat_priv->debug_dir) 374 if (!bat_priv->debug_dir)
361 goto out; 375 goto out;
@@ -392,6 +406,10 @@ out:
392void batadv_debugfs_del_meshif(struct net_device *dev) 406void batadv_debugfs_del_meshif(struct net_device *dev)
393{ 407{
394 struct batadv_priv *bat_priv = netdev_priv(dev); 408 struct batadv_priv *bat_priv = netdev_priv(dev);
409 struct net *net = dev_net(dev);
410
411 if (net != &init_net)
412 return;
395 413
396 batadv_debug_log_cleanup(bat_priv); 414 batadv_debug_log_cleanup(bat_priv);
397 415
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 1ab4e2e63afc..e49121ee55f6 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -20,13 +20,11 @@
20 20
21#include "main.h" 21#include "main.h"
22 22
23#include <linux/kconfig.h>
24
25struct net_device; 23struct net_device;
26 24
27#define BATADV_DEBUGFS_SUBDIR "batman_adv" 25#define BATADV_DEBUGFS_SUBDIR "batman_adv"
28 26
29#if IS_ENABLED(CONFIG_DEBUG_FS) 27#if IS_ENABLED(CONFIG_BATMAN_ADV_DEBUGFS)
30 28
31void batadv_debugfs_init(void); 29void batadv_debugfs_init(void);
32void batadv_debugfs_destroy(void); 30void batadv_debugfs_destroy(void);
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index b1cc8bfe11ac..e257efdc5d03 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -343,8 +343,8 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
343 ether_addr_copy(dat_entry->mac_addr, mac_addr); 343 ether_addr_copy(dat_entry->mac_addr, mac_addr);
344 dat_entry->last_update = jiffies; 344 dat_entry->last_update = jiffies;
345 kref_init(&dat_entry->refcount); 345 kref_init(&dat_entry->refcount);
346 kref_get(&dat_entry->refcount);
347 346
347 kref_get(&dat_entry->refcount);
348 hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat, 348 hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat,
349 batadv_hash_dat, dat_entry, 349 batadv_hash_dat, dat_entry,
350 &dat_entry->hash_entry); 350 &dat_entry->hash_entry);
@@ -795,6 +795,7 @@ void batadv_dat_free(struct batadv_priv *bat_priv)
795 batadv_dat_hash_free(bat_priv); 795 batadv_dat_hash_free(bat_priv);
796} 796}
797 797
798#ifdef CONFIG_BATMAN_ADV_DEBUGFS
798/** 799/**
799 * batadv_dat_cache_seq_print_text - print the local DAT hash table 800 * batadv_dat_cache_seq_print_text - print the local DAT hash table
800 * @seq: seq file to print on 801 * @seq: seq file to print on
@@ -846,6 +847,7 @@ out:
846 batadv_hardif_put(primary_if); 847 batadv_hardif_put(primary_if);
847 return 0; 848 return 0;
848} 849}
850#endif
849 851
850/** 852/**
851 * batadv_arp_get_type - parse an ARP packet and get the type 853
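The refcount hunk above does not change behaviour; it only moves the kref_get() so it sits directly in front of the hash insertion. kref_init() already leaves the object with one reference (the creator's), and taking the second reference immediately before publishing the entry makes it obvious that this reference belongs to the hash table. A compact sketch of the pattern, with example_* as placeholder names and the hash insertion elided:

#include <linux/kref.h>
#include <linux/slab.h>

struct example_entry {
	struct kref refcount;
	/* payload ... */
};

static void example_release(struct kref *ref)
{
	kfree(container_of(ref, struct example_entry, refcount));
}

static struct example_entry *example_create_and_publish(void)
{
	struct example_entry *e = kzalloc(sizeof(*e), GFP_ATOMIC);

	if (!e)
		return NULL;

	kref_init(&e->refcount);	/* reference #1: held by the creator */

	kref_get(&e->refcount);		/* reference #2: owned by the table, */
	/* example_hash_add(e); */	/* taken right before publication    */

	return e;	/* drop later with kref_put(&e->refcount, example_release) */
}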
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 63a805d3f96e..de055d64debe 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -20,6 +20,7 @@
20 20
21#include <linux/atomic.h> 21#include <linux/atomic.h>
22#include <linux/byteorder/generic.h> 22#include <linux/byteorder/generic.h>
23#include <linux/errno.h>
23#include <linux/etherdevice.h> 24#include <linux/etherdevice.h>
24#include <linux/fs.h> 25#include <linux/fs.h>
25#include <linux/if_ether.h> 26#include <linux/if_ether.h>
@@ -31,6 +32,7 @@
31#include <linux/kref.h> 32#include <linux/kref.h>
32#include <linux/list.h> 33#include <linux/list.h>
33#include <linux/netdevice.h> 34#include <linux/netdevice.h>
35#include <linux/netlink.h>
34#include <linux/rculist.h> 36#include <linux/rculist.h>
35#include <linux/rcupdate.h> 37#include <linux/rcupdate.h>
36#include <linux/seq_file.h> 38#include <linux/seq_file.h>
@@ -39,13 +41,17 @@
39#include <linux/spinlock.h> 41#include <linux/spinlock.h>
40#include <linux/stddef.h> 42#include <linux/stddef.h>
41#include <linux/udp.h> 43#include <linux/udp.h>
44#include <net/sock.h>
45#include <uapi/linux/batman_adv.h>
42 46
43#include "gateway_common.h" 47#include "gateway_common.h"
44#include "hard-interface.h" 48#include "hard-interface.h"
45#include "log.h" 49#include "log.h"
50#include "netlink.h"
46#include "originator.h" 51#include "originator.h"
47#include "packet.h" 52#include "packet.h"
48#include "routing.h" 53#include "routing.h"
54#include "soft-interface.h"
49#include "sysfs.h" 55#include "sysfs.h"
50#include "translation-table.h" 56#include "translation-table.h"
51 57
@@ -80,12 +86,12 @@ static void batadv_gw_node_release(struct kref *ref)
80 * batadv_gw_node_put - decrement the gw_node refcounter and possibly release it 86 * batadv_gw_node_put - decrement the gw_node refcounter and possibly release it
81 * @gw_node: gateway node to free 87 * @gw_node: gateway node to free
82 */ 88 */
83static void batadv_gw_node_put(struct batadv_gw_node *gw_node) 89void batadv_gw_node_put(struct batadv_gw_node *gw_node)
84{ 90{
85 kref_put(&gw_node->refcount, batadv_gw_node_release); 91 kref_put(&gw_node->refcount, batadv_gw_node_release);
86} 92}
87 93
88static struct batadv_gw_node * 94struct batadv_gw_node *
89batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv) 95batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv)
90{ 96{
91 struct batadv_gw_node *gw_node; 97 struct batadv_gw_node *gw_node;
@@ -164,86 +170,6 @@ void batadv_gw_reselect(struct batadv_priv *bat_priv)
164 atomic_set(&bat_priv->gw.reselect, 1); 170 atomic_set(&bat_priv->gw.reselect, 1);
165} 171}
166 172
167static struct batadv_gw_node *
168batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
169{
170 struct batadv_neigh_node *router;
171 struct batadv_neigh_ifinfo *router_ifinfo;
172 struct batadv_gw_node *gw_node, *curr_gw = NULL;
173 u64 max_gw_factor = 0;
174 u64 tmp_gw_factor = 0;
175 u8 max_tq = 0;
176 u8 tq_avg;
177 struct batadv_orig_node *orig_node;
178
179 rcu_read_lock();
180 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
181 orig_node = gw_node->orig_node;
182 router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
183 if (!router)
184 continue;
185
186 router_ifinfo = batadv_neigh_ifinfo_get(router,
187 BATADV_IF_DEFAULT);
188 if (!router_ifinfo)
189 goto next;
190
191 if (!kref_get_unless_zero(&gw_node->refcount))
192 goto next;
193
194 tq_avg = router_ifinfo->bat_iv.tq_avg;
195
196 switch (atomic_read(&bat_priv->gw.sel_class)) {
197 case 1: /* fast connection */
198 tmp_gw_factor = tq_avg * tq_avg;
199 tmp_gw_factor *= gw_node->bandwidth_down;
200 tmp_gw_factor *= 100 * 100;
201 tmp_gw_factor >>= 18;
202
203 if ((tmp_gw_factor > max_gw_factor) ||
204 ((tmp_gw_factor == max_gw_factor) &&
205 (tq_avg > max_tq))) {
206 if (curr_gw)
207 batadv_gw_node_put(curr_gw);
208 curr_gw = gw_node;
209 kref_get(&curr_gw->refcount);
210 }
211 break;
212
213 default: /* 2: stable connection (use best statistic)
214 * 3: fast-switch (use best statistic but change as
215 * soon as a better gateway appears)
216 * XX: late-switch (use best statistic but change as
217 * soon as a better gateway appears which has
218 * $routing_class more tq points)
219 */
220 if (tq_avg > max_tq) {
221 if (curr_gw)
222 batadv_gw_node_put(curr_gw);
223 curr_gw = gw_node;
224 kref_get(&curr_gw->refcount);
225 }
226 break;
227 }
228
229 if (tq_avg > max_tq)
230 max_tq = tq_avg;
231
232 if (tmp_gw_factor > max_gw_factor)
233 max_gw_factor = tmp_gw_factor;
234
235 batadv_gw_node_put(gw_node);
236
237next:
238 batadv_neigh_node_put(router);
239 if (router_ifinfo)
240 batadv_neigh_ifinfo_put(router_ifinfo);
241 }
242 rcu_read_unlock();
243
244 return curr_gw;
245}
246
247/** 173/**
248 * batadv_gw_check_client_stop - check if client mode has been switched off 174 * batadv_gw_check_client_stop - check if client mode has been switched off
249 * @bat_priv: the bat priv with all the soft interface information 175 * @bat_priv: the bat priv with all the soft interface information
@@ -287,12 +213,19 @@ void batadv_gw_election(struct batadv_priv *bat_priv)
287 if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) 213 if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT)
288 goto out; 214 goto out;
289 215
216 if (!bat_priv->algo_ops->gw.get_best_gw_node)
217 goto out;
218
290 curr_gw = batadv_gw_get_selected_gw_node(bat_priv); 219 curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
291 220
292 if (!batadv_atomic_dec_not_zero(&bat_priv->gw.reselect) && curr_gw) 221 if (!batadv_atomic_dec_not_zero(&bat_priv->gw.reselect) && curr_gw)
293 goto out; 222 goto out;
294 223
295 next_gw = batadv_gw_get_best_gw_node(bat_priv); 224 /* if gw.reselect is set to 1 it means that a previous call to
225 * gw.is_eligible() said that we have a new best GW, therefore it can
226 * now be picked from the list and selected
227 */
228 next_gw = bat_priv->algo_ops->gw.get_best_gw_node(bat_priv);
296 229
297 if (curr_gw == next_gw) 230 if (curr_gw == next_gw)
298 goto out; 231 goto out;
@@ -360,70 +293,31 @@ out:
360void batadv_gw_check_election(struct batadv_priv *bat_priv, 293void batadv_gw_check_election(struct batadv_priv *bat_priv,
361 struct batadv_orig_node *orig_node) 294 struct batadv_orig_node *orig_node)
362{ 295{
363 struct batadv_neigh_ifinfo *router_orig_tq = NULL;
364 struct batadv_neigh_ifinfo *router_gw_tq = NULL;
365 struct batadv_orig_node *curr_gw_orig; 296 struct batadv_orig_node *curr_gw_orig;
366 struct batadv_neigh_node *router_gw = NULL; 297
367 struct batadv_neigh_node *router_orig = NULL; 298 /* abort immediately if the routing algorithm does not support gateway
368 u8 gw_tq_avg, orig_tq_avg; 299 * election
300 */
301 if (!bat_priv->algo_ops->gw.is_eligible)
302 return;
369 303
370 curr_gw_orig = batadv_gw_get_selected_orig(bat_priv); 304 curr_gw_orig = batadv_gw_get_selected_orig(bat_priv);
371 if (!curr_gw_orig) 305 if (!curr_gw_orig)
372 goto reselect; 306 goto reselect;
373 307
374 router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT);
375 if (!router_gw)
376 goto reselect;
377
378 router_gw_tq = batadv_neigh_ifinfo_get(router_gw,
379 BATADV_IF_DEFAULT);
380 if (!router_gw_tq)
381 goto reselect;
382
383 /* this node already is the gateway */ 308 /* this node already is the gateway */
384 if (curr_gw_orig == orig_node) 309 if (curr_gw_orig == orig_node)
385 goto out; 310 goto out;
386 311
387 router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); 312 if (!bat_priv->algo_ops->gw.is_eligible(bat_priv, curr_gw_orig,
388 if (!router_orig) 313 orig_node))
389 goto out; 314 goto out;
390 315
391 router_orig_tq = batadv_neigh_ifinfo_get(router_orig,
392 BATADV_IF_DEFAULT);
393 if (!router_orig_tq)
394 goto out;
395
396 gw_tq_avg = router_gw_tq->bat_iv.tq_avg;
397 orig_tq_avg = router_orig_tq->bat_iv.tq_avg;
398
399 /* the TQ value has to be better */
400 if (orig_tq_avg < gw_tq_avg)
401 goto out;
402
403 /* if the routing class is greater than 3 the value tells us how much
404 * greater the TQ value of the new gateway must be
405 */
406 if ((atomic_read(&bat_priv->gw.sel_class) > 3) &&
407 (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class)))
408 goto out;
409
410 batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
411 "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n",
412 gw_tq_avg, orig_tq_avg);
413
414reselect: 316reselect:
415 batadv_gw_reselect(bat_priv); 317 batadv_gw_reselect(bat_priv);
416out: 318out:
417 if (curr_gw_orig) 319 if (curr_gw_orig)
418 batadv_orig_node_put(curr_gw_orig); 320 batadv_orig_node_put(curr_gw_orig);
419 if (router_gw)
420 batadv_neigh_node_put(router_gw);
421 if (router_orig)
422 batadv_neigh_node_put(router_orig);
423 if (router_gw_tq)
424 batadv_neigh_ifinfo_put(router_gw_tq);
425 if (router_orig_tq)
426 batadv_neigh_ifinfo_put(router_orig_tq);
427} 321}
428 322
429/** 323/**
@@ -445,14 +339,15 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
445 if (!gw_node) 339 if (!gw_node)
446 return; 340 return;
447 341
448 kref_get(&orig_node->refcount); 342 kref_init(&gw_node->refcount);
449 INIT_HLIST_NODE(&gw_node->list); 343 INIT_HLIST_NODE(&gw_node->list);
344 kref_get(&orig_node->refcount);
450 gw_node->orig_node = orig_node; 345 gw_node->orig_node = orig_node;
451 gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); 346 gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
452 gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); 347 gw_node->bandwidth_up = ntohl(gateway->bandwidth_up);
453 kref_init(&gw_node->refcount);
454 348
455 spin_lock_bh(&bat_priv->gw.list_lock); 349 spin_lock_bh(&bat_priv->gw.list_lock);
350 kref_get(&gw_node->refcount);
456 hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list); 351 hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list);
457 spin_unlock_bh(&bat_priv->gw.list_lock); 352 spin_unlock_bh(&bat_priv->gw.list_lock);
458 353
@@ -463,6 +358,9 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
463 ntohl(gateway->bandwidth_down) % 10, 358 ntohl(gateway->bandwidth_down) % 10,
464 ntohl(gateway->bandwidth_up) / 10, 359 ntohl(gateway->bandwidth_up) / 10,
465 ntohl(gateway->bandwidth_up) % 10); 360 ntohl(gateway->bandwidth_up) % 10);
361
362 /* don't return reference to new gw_node */
363 batadv_gw_node_put(gw_node);
466} 364}
467 365
468/** 366/**
@@ -472,9 +370,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
472 * 370 *
473 * Return: gateway node if found or NULL otherwise. 371 * Return: gateway node if found or NULL otherwise.
474 */ 372 */
475static struct batadv_gw_node * 373struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv,
476batadv_gw_node_get(struct batadv_priv *bat_priv, 374 struct batadv_orig_node *orig_node)
477 struct batadv_orig_node *orig_node)
478{ 375{
479 struct batadv_gw_node *gw_node_tmp, *gw_node = NULL; 376 struct batadv_gw_node *gw_node_tmp, *gw_node = NULL;
480 377
@@ -585,81 +482,87 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
585 spin_unlock_bh(&bat_priv->gw.list_lock); 482 spin_unlock_bh(&bat_priv->gw.list_lock);
586} 483}
587 484
588/* fails if orig_node has no router */ 485#ifdef CONFIG_BATMAN_ADV_DEBUGFS
589static int batadv_write_buffer_text(struct batadv_priv *bat_priv, 486int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
590 struct seq_file *seq,
591 const struct batadv_gw_node *gw_node)
592{ 487{
593 struct batadv_gw_node *curr_gw; 488 struct net_device *net_dev = (struct net_device *)seq->private;
594 struct batadv_neigh_node *router; 489 struct batadv_priv *bat_priv = netdev_priv(net_dev);
595 struct batadv_neigh_ifinfo *router_ifinfo = NULL; 490 struct batadv_hard_iface *primary_if;
596 int ret = -1;
597 491
598 router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); 492 primary_if = batadv_seq_print_text_primary_if_get(seq);
599 if (!router) 493 if (!primary_if)
600 goto out; 494 return 0;
601 495
602 router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); 496 seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
603 if (!router_ifinfo) 497 BATADV_SOURCE_VERSION, primary_if->net_dev->name,
604 goto out; 498 primary_if->net_dev->dev_addr, net_dev->name,
499 bat_priv->algo_ops->name);
605 500
606 curr_gw = batadv_gw_get_selected_gw_node(bat_priv); 501 batadv_hardif_put(primary_if);
607 502
608 seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n", 503 if (!bat_priv->algo_ops->gw.print) {
609 (curr_gw == gw_node ? "=>" : " "), 504 seq_puts(seq,
610 gw_node->orig_node->orig, 505 "No printing function for this routing protocol\n");
611 router_ifinfo->bat_iv.tq_avg, router->addr, 506 return 0;
612 router->if_incoming->net_dev->name, 507 }
613 gw_node->bandwidth_down / 10,
614 gw_node->bandwidth_down % 10,
615 gw_node->bandwidth_up / 10,
616 gw_node->bandwidth_up % 10);
617 ret = seq_has_overflowed(seq) ? -1 : 0;
618 508
619 if (curr_gw) 509 bat_priv->algo_ops->gw.print(bat_priv, seq);
620 batadv_gw_node_put(curr_gw); 510
621out: 511 return 0;
622 if (router_ifinfo)
623 batadv_neigh_ifinfo_put(router_ifinfo);
624 if (router)
625 batadv_neigh_node_put(router);
626 return ret;
627} 512}
513#endif
628 514
629int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) 515/**
516 * batadv_gw_dump - Dump gateways into a message
517 * @msg: Netlink message to dump into
518 * @cb: Control block containing additional options
519 *
520 * Return: Error code, or length of message
521 */
522int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb)
630{ 523{
631 struct net_device *net_dev = (struct net_device *)seq->private; 524 struct batadv_hard_iface *primary_if = NULL;
632 struct batadv_priv *bat_priv = netdev_priv(net_dev); 525 struct net *net = sock_net(cb->skb->sk);
633 struct batadv_hard_iface *primary_if; 526 struct net_device *soft_iface;
634 struct batadv_gw_node *gw_node; 527 struct batadv_priv *bat_priv;
635 int gw_count = 0; 528 int ifindex;
636 529 int ret;
637 primary_if = batadv_seq_print_text_primary_if_get(seq); 530
638 if (!primary_if) 531 ifindex = batadv_netlink_get_ifindex(cb->nlh,
532 BATADV_ATTR_MESH_IFINDEX);
533 if (!ifindex)
534 return -EINVAL;
535
536 soft_iface = dev_get_by_index(net, ifindex);
537 if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
538 ret = -ENODEV;
639 goto out; 539 goto out;
540 }
640 541
641 seq_printf(seq, 542 bat_priv = netdev_priv(soft_iface);
642 " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth ... [B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s)]\n",
643 BATADV_SOURCE_VERSION, primary_if->net_dev->name,
644 primary_if->net_dev->dev_addr, net_dev->name);
645 543
646 rcu_read_lock(); 544 primary_if = batadv_primary_if_get_selected(bat_priv);
647 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { 545 if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
648 /* fails if orig_node has no router */ 546 ret = -ENOENT;
649 if (batadv_write_buffer_text(bat_priv, seq, gw_node) < 0) 547 goto out;
650 continue; 548 }
651 549
652 gw_count++; 550 if (!bat_priv->algo_ops->gw.dump) {
551 ret = -EOPNOTSUPP;
552 goto out;
653 } 553 }
654 rcu_read_unlock();
655 554
656 if (gw_count == 0) 555 bat_priv->algo_ops->gw.dump(msg, cb, bat_priv);
657 seq_puts(seq, "No gateways in range ...\n"); 556
557 ret = msg->len;
658 558
659out: 559out:
660 if (primary_if) 560 if (primary_if)
661 batadv_hardif_put(primary_if); 561 batadv_hardif_put(primary_if);
662 return 0; 562 if (soft_iface)
563 dev_put(soft_iface);
564
565 return ret;
663} 566}
664 567
665/** 568/**
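The batadv_gw_get_best_gw_node() removed above scored each gateway in "fast connection" mode as tq_avg squared times bandwidth_down times 100 * 100, shifted right by 18 bits; that selection now lives behind bat_priv->algo_ops->gw.get_best_gw_node(). A standalone user-space model of the removed metric makes the arithmetic easier to see (illustrative values only; bandwidth is in the 0.1 Mbit/s units suggested by the /10 and %10 printing above):

#include <stdint.h>
#include <stdio.h>

/* model of the removed "fast connection" gateway factor */
static uint64_t gw_factor(uint8_t tq_avg, uint32_t bandwidth_down)
{
	uint64_t factor = (uint64_t)tq_avg * tq_avg;	/* link quality squared */

	factor *= bandwidth_down;	/* advertised downlink, 0.1 Mbit/s units */
	factor *= 100 * 100;
	return factor >> 18;		/* scale back into a comparable range */
}

int main(void)
{
	/* perfect link (TQ 255) to a 20 Mbit/s gateway vs. a weaker
	 * link (TQ 200) to a 50 Mbit/s gateway
	 */
	printf("%llu\n", (unsigned long long)gw_factor(255, 200));
	printf("%llu\n", (unsigned long long)gw_factor(200, 500));
	return 0;
}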
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 582dd8c413c8..859166d03561 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -23,6 +23,7 @@
23#include <linux/types.h> 23#include <linux/types.h>
24 24
25struct batadv_tvlv_gateway_data; 25struct batadv_tvlv_gateway_data;
26struct netlink_callback;
26struct seq_file; 27struct seq_file;
27struct sk_buff; 28struct sk_buff;
28 29
@@ -39,10 +40,16 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
39void batadv_gw_node_delete(struct batadv_priv *bat_priv, 40void batadv_gw_node_delete(struct batadv_priv *bat_priv,
40 struct batadv_orig_node *orig_node); 41 struct batadv_orig_node *orig_node);
41void batadv_gw_node_free(struct batadv_priv *bat_priv); 42void batadv_gw_node_free(struct batadv_priv *bat_priv);
43void batadv_gw_node_put(struct batadv_gw_node *gw_node);
44struct batadv_gw_node *
45batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv);
42int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); 46int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset);
47int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb);
43bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); 48bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb);
44enum batadv_dhcp_recipient 49enum batadv_dhcp_recipient
45batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, 50batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
46 u8 *chaddr); 51 u8 *chaddr);
52struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv,
53 struct batadv_orig_node *orig_node);
47 54
48#endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ 55#endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index d7bc6a87bcc9..21184810d89f 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -241,10 +241,9 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
241 241
242 batadv_gw_node_update(bat_priv, orig, &gateway); 242 batadv_gw_node_update(bat_priv, orig, &gateway);
243 243
244 /* restart gateway selection if fast or late switching was enabled */ 244 /* restart gateway selection */
245 if ((gateway.bandwidth_down != 0) && 245 if ((gateway.bandwidth_down != 0) &&
246 (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT) && 246 (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT))
247 (atomic_read(&bat_priv->gw.sel_class) > 2))
248 batadv_gw_check_election(bat_priv, orig); 247 batadv_gw_check_election(bat_priv, orig);
249} 248}
250 249
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 1f9080840566..08ce36147c4c 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -35,7 +35,8 @@
35#include <linux/rtnetlink.h> 35#include <linux/rtnetlink.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/spinlock.h> 37#include <linux/spinlock.h>
38#include <linux/workqueue.h> 38#include <net/net_namespace.h>
39#include <net/rtnetlink.h>
39 40
40#include "bat_v.h" 41#include "bat_v.h"
41#include "bridge_loop_avoidance.h" 42#include "bridge_loop_avoidance.h"
@@ -85,25 +86,55 @@ out:
85} 86}
86 87
87/** 88/**
 89 * batadv_getlink_net - return link net namespace (or use fallback)
90 * @netdev: net_device to check
 91 * @fallback_net: value to return in case get_link_net is not available for @netdev
92 *
93 * Return: result of rtnl_link_ops->get_link_net or @fallback_net
94 */
95static const struct net *batadv_getlink_net(const struct net_device *netdev,
96 const struct net *fallback_net)
97{
98 if (!netdev->rtnl_link_ops)
99 return fallback_net;
100
101 if (!netdev->rtnl_link_ops->get_link_net)
102 return fallback_net;
103
104 return netdev->rtnl_link_ops->get_link_net(netdev);
105}
106
107/**
 88 * batadv_mutual_parents - check if two devices are each other's parent 108 * batadv_mutual_parents - check if two devices are each other's parent
89 * @dev1: 1st net_device 109 * @dev1: 1st net dev
 90 * @dev2: 2nd net_device 110 * @net1: 1st device's netns
111 * @dev2: 2nd net dev
 112 * @net2: 2nd device's netns
91 * 113 *
92 * veth devices come in pairs and each is the parent of the other! 114 * veth devices come in pairs and each is the parent of the other!
93 * 115 *
94 * Return: true if the devices are each others parent, otherwise false 116 * Return: true if the devices are each others parent, otherwise false
95 */ 117 */
96static bool batadv_mutual_parents(const struct net_device *dev1, 118static bool batadv_mutual_parents(const struct net_device *dev1,
97 const struct net_device *dev2) 119 const struct net *net1,
120 const struct net_device *dev2,
121 const struct net *net2)
98{ 122{
99 int dev1_parent_iflink = dev_get_iflink(dev1); 123 int dev1_parent_iflink = dev_get_iflink(dev1);
100 int dev2_parent_iflink = dev_get_iflink(dev2); 124 int dev2_parent_iflink = dev_get_iflink(dev2);
125 const struct net *dev1_parent_net;
126 const struct net *dev2_parent_net;
127
128 dev1_parent_net = batadv_getlink_net(dev1, net1);
129 dev2_parent_net = batadv_getlink_net(dev2, net2);
101 130
102 if (!dev1_parent_iflink || !dev2_parent_iflink) 131 if (!dev1_parent_iflink || !dev2_parent_iflink)
103 return false; 132 return false;
104 133
105 return (dev1_parent_iflink == dev2->ifindex) && 134 return (dev1_parent_iflink == dev2->ifindex) &&
106 (dev2_parent_iflink == dev1->ifindex); 135 (dev2_parent_iflink == dev1->ifindex) &&
136 net_eq(dev1_parent_net, net2) &&
137 net_eq(dev2_parent_net, net1);
107} 138}
108 139
109/** 140/**
@@ -121,8 +152,9 @@ static bool batadv_mutual_parents(const struct net_device *dev1,
121 */ 152 */
122static bool batadv_is_on_batman_iface(const struct net_device *net_dev) 153static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
123{ 154{
124 struct net_device *parent_dev;
125 struct net *net = dev_net(net_dev); 155 struct net *net = dev_net(net_dev);
156 struct net_device *parent_dev;
157 const struct net *parent_net;
126 bool ret; 158 bool ret;
127 159
128 /* check if this is a batman-adv mesh interface */ 160 /* check if this is a batman-adv mesh interface */
@@ -134,13 +166,16 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
134 dev_get_iflink(net_dev) == net_dev->ifindex) 166 dev_get_iflink(net_dev) == net_dev->ifindex)
135 return false; 167 return false;
136 168
169 parent_net = batadv_getlink_net(net_dev, net);
170
137 /* recurse over the parent device */ 171 /* recurse over the parent device */
138 parent_dev = __dev_get_by_index(net, dev_get_iflink(net_dev)); 172 parent_dev = __dev_get_by_index((struct net *)parent_net,
173 dev_get_iflink(net_dev));
139 /* if we got a NULL parent_dev there is something broken.. */ 174 /* if we got a NULL parent_dev there is something broken.. */
140 if (WARN(!parent_dev, "Cannot find parent device")) 175 if (WARN(!parent_dev, "Cannot find parent device"))
141 return false; 176 return false;
142 177
143 if (batadv_mutual_parents(net_dev, parent_dev)) 178 if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net))
144 return false; 179 return false;
145 180
146 ret = batadv_is_on_batman_iface(parent_dev); 181 ret = batadv_is_on_batman_iface(parent_dev);
@@ -625,25 +660,6 @@ out:
625 batadv_hardif_put(primary_if); 660 batadv_hardif_put(primary_if);
626} 661}
627 662
628/**
629 * batadv_hardif_remove_interface_finish - cleans up the remains of a hardif
630 * @work: work queue item
631 *
632 * Free the parts of the hard interface which can not be removed under
633 * rtnl lock (to prevent deadlock situations).
634 */
635static void batadv_hardif_remove_interface_finish(struct work_struct *work)
636{
637 struct batadv_hard_iface *hard_iface;
638
639 hard_iface = container_of(work, struct batadv_hard_iface,
640 cleanup_work);
641
642 batadv_debugfs_del_hardif(hard_iface);
643 batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
644 batadv_hardif_put(hard_iface);
645}
646
647static struct batadv_hard_iface * 663static struct batadv_hard_iface *
648batadv_hardif_add_interface(struct net_device *net_dev) 664batadv_hardif_add_interface(struct net_device *net_dev)
649{ 665{
@@ -676,10 +692,9 @@ batadv_hardif_add_interface(struct net_device *net_dev)
676 692
677 INIT_LIST_HEAD(&hard_iface->list); 693 INIT_LIST_HEAD(&hard_iface->list);
678 INIT_HLIST_HEAD(&hard_iface->neigh_list); 694 INIT_HLIST_HEAD(&hard_iface->neigh_list);
679 INIT_WORK(&hard_iface->cleanup_work,
680 batadv_hardif_remove_interface_finish);
681 695
682 spin_lock_init(&hard_iface->neigh_list_lock); 696 spin_lock_init(&hard_iface->neigh_list_lock);
697 kref_init(&hard_iface->refcount);
683 698
684 hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT; 699 hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT;
685 if (batadv_is_wifi_netdev(net_dev)) 700 if (batadv_is_wifi_netdev(net_dev))
@@ -687,11 +702,8 @@ batadv_hardif_add_interface(struct net_device *net_dev)
687 702
688 batadv_v_hardif_init(hard_iface); 703 batadv_v_hardif_init(hard_iface);
689 704
690 /* extra reference for return */
691 kref_init(&hard_iface->refcount);
692 kref_get(&hard_iface->refcount);
693
694 batadv_check_known_mac_addr(hard_iface->net_dev); 705 batadv_check_known_mac_addr(hard_iface->net_dev);
706 kref_get(&hard_iface->refcount);
695 list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); 707 list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list);
696 708
697 return hard_iface; 709 return hard_iface;
@@ -713,13 +725,15 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface)
713 /* first deactivate interface */ 725 /* first deactivate interface */
714 if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) 726 if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
715 batadv_hardif_disable_interface(hard_iface, 727 batadv_hardif_disable_interface(hard_iface,
716 BATADV_IF_CLEANUP_AUTO); 728 BATADV_IF_CLEANUP_KEEP);
717 729
718 if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) 730 if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
719 return; 731 return;
720 732
721 hard_iface->if_status = BATADV_IF_TO_BE_REMOVED; 733 hard_iface->if_status = BATADV_IF_TO_BE_REMOVED;
722 queue_work(batadv_event_workqueue, &hard_iface->cleanup_work); 734 batadv_debugfs_del_hardif(hard_iface);
735 batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
736 batadv_hardif_put(hard_iface);
723} 737}
724 738
725void batadv_hardif_remove_interfaces(void) 739void batadv_hardif_remove_interfaces(void)
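The new batadv_getlink_net() above exists because dev_get_iflink() returns an ifindex that is only meaningful inside the namespace the parent actually lives in; for a veth pair whose peer sits in another netns, looking that index up in the local namespace can hit an unrelated device. A minimal sketch of the lookup-with-fallback idea, with example_link_net() as a placeholder wrapper around the same rtnl_link_ops hook:

#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/rtnetlink.h>

/* namespace the device's parent link lives in, falling back to the
 * device's own namespace when the driver has no get_link_net()
 */
static const struct net *example_link_net(const struct net_device *dev)
{
	if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net)
		return dev->rtnl_link_ops->get_link_net(dev);

	return dev_net(dev);
}

/* a parent lookup then has to use that namespace, e.g.
 *   parent = __dev_get_by_index((struct net *)example_link_net(dev),
 *                               dev_get_iflink(dev));
 */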
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index 618d5de06f20..e44a7da51431 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -26,9 +26,25 @@ struct batadv_icmp_header;
26 26
27#define BATADV_ICMP_SOCKET "socket" 27#define BATADV_ICMP_SOCKET "socket"
28 28
29void batadv_socket_init(void);
30int batadv_socket_setup(struct batadv_priv *bat_priv); 29int batadv_socket_setup(struct batadv_priv *bat_priv);
30
31#ifdef CONFIG_BATMAN_ADV_DEBUGFS
32
33void batadv_socket_init(void);
31void batadv_socket_receive_packet(struct batadv_icmp_header *icmph, 34void batadv_socket_receive_packet(struct batadv_icmp_header *icmph,
32 size_t icmp_len); 35 size_t icmp_len);
33 36
37#else
38
39static inline void batadv_socket_init(void)
40{
41}
42
43static inline void
44batadv_socket_receive_packet(struct batadv_icmp_header *icmph, size_t icmp_len)
45{
46}
47
48#endif
49
34#endif /* _NET_BATMAN_ADV_ICMP_SOCKET_H_ */ 50#endif /* _NET_BATMAN_ADV_ICMP_SOCKET_H_ */
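With the debugfs-only entry points guarded as above, call sites such as batadv_init() can keep calling batadv_socket_init() unconditionally: when CONFIG_BATMAN_ADV_DEBUGFS is off, the empty static inline stubs compile to nothing. A generic sketch of the stub pattern (placeholder option and function names):

#include <linux/types.h>

#ifdef CONFIG_EXAMPLE_FEATURE

void example_feature_init(void);
void example_feature_rx(const void *buf, size_t len);

#else

static inline void example_feature_init(void)
{
}

static inline void
example_feature_rx(const void *buf, size_t len)
{
}

#endif

/* callers never need an #ifdef of their own:
 *   example_feature_init();
 */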
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index fe4c5e29f96b..2c017ab47557 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -82,6 +82,12 @@ static void batadv_recv_handler_init(void);
82 82
83static int __init batadv_init(void) 83static int __init batadv_init(void)
84{ 84{
85 int ret;
86
87 ret = batadv_tt_cache_init();
88 if (ret < 0)
89 return ret;
90
85 INIT_LIST_HEAD(&batadv_hardif_list); 91 INIT_LIST_HEAD(&batadv_hardif_list);
86 batadv_algo_init(); 92 batadv_algo_init();
87 93
@@ -93,9 +99,8 @@ static int __init batadv_init(void)
93 batadv_tp_meter_init(); 99 batadv_tp_meter_init();
94 100
95 batadv_event_workqueue = create_singlethread_workqueue("bat_events"); 101 batadv_event_workqueue = create_singlethread_workqueue("bat_events");
96
97 if (!batadv_event_workqueue) 102 if (!batadv_event_workqueue)
98 return -ENOMEM; 103 goto err_create_wq;
99 104
100 batadv_socket_init(); 105 batadv_socket_init();
101 batadv_debugfs_init(); 106 batadv_debugfs_init();
@@ -108,6 +113,11 @@ static int __init batadv_init(void)
108 BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION); 113 BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION);
109 114
110 return 0; 115 return 0;
116
117err_create_wq:
118 batadv_tt_cache_destroy();
119
120 return -ENOMEM;
111} 121}
112 122
113static void __exit batadv_exit(void) 123static void __exit batadv_exit(void)
@@ -123,6 +133,8 @@ static void __exit batadv_exit(void)
123 batadv_event_workqueue = NULL; 133 batadv_event_workqueue = NULL;
124 134
125 rcu_barrier(); 135 rcu_barrier();
136
137 batadv_tt_cache_destroy();
126} 138}
127 139
128int batadv_mesh_init(struct net_device *soft_iface) 140int batadv_mesh_init(struct net_device *soft_iface)
@@ -270,6 +282,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
270 return is_my_mac; 282 return is_my_mac;
271} 283}
272 284
285#ifdef CONFIG_BATMAN_ADV_DEBUGFS
273/** 286/**
274 * batadv_seq_print_text_primary_if_get - called from debugfs table printing 287 * batadv_seq_print_text_primary_if_get - called from debugfs table printing
275 * function that requires the primary interface 288 * function that requires the primary interface
@@ -305,6 +318,7 @@ batadv_seq_print_text_primary_if_get(struct seq_file *seq)
305out: 318out:
306 return primary_if; 319 return primary_if;
307} 320}
321#endif
308 322
309/** 323/**
310 * batadv_max_header_len - calculate maximum encapsulation overhead for a 324 * batadv_max_header_len - calculate maximum encapsulation overhead for a
@@ -638,3 +652,4 @@ MODULE_AUTHOR(BATADV_DRIVER_AUTHOR);
638MODULE_DESCRIPTION(BATADV_DRIVER_DESC); 652MODULE_DESCRIPTION(BATADV_DRIVER_DESC);
639MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE); 653MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE);
640MODULE_VERSION(BATADV_SOURCE_VERSION); 654MODULE_VERSION(BATADV_SOURCE_VERSION);
655MODULE_ALIAS_RTNL_LINK("batadv");
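batadv_init() now has something to undo: if the event workqueue cannot be created, the translation-table cache allocated a few lines earlier must be torn down again, hence the new err_create_wq label. A self-contained sketch of the same unwind-in-reverse-order idiom, with example_* placeholders standing in for the batman-adv names:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int example_cache_init(void) { return 0; }	/* placeholder */
static void example_cache_destroy(void) { }		/* placeholder */

static int example_module_init(void)
{
	int ret;

	ret = example_cache_init();
	if (ret < 0)
		return ret;		/* nothing to undo yet */

	example_wq = create_singlethread_workqueue("example_events");
	if (!example_wq)
		goto err_create_wq;

	return 0;

err_create_wq:
	example_cache_destroy();	/* undo setup in reverse order */
	return -ENOMEM;
}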
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 06a860845434..09af21e27639 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -24,7 +24,7 @@
24#define BATADV_DRIVER_DEVICE "batman-adv" 24#define BATADV_DRIVER_DEVICE "batman-adv"
25 25
26#ifndef BATADV_SOURCE_VERSION 26#ifndef BATADV_SOURCE_VERSION
27#define BATADV_SOURCE_VERSION "2016.3" 27#define BATADV_SOURCE_VERSION "2016.4"
28#endif 28#endif
29 29
30/* B.A.T.M.A.N. parameters */ 30/* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index cc915073a753..13661f43386f 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -528,7 +528,7 @@ update:
528 } 528 }
529 529
530 return !(mcast_data.flags & 530 return !(mcast_data.flags &
531 (BATADV_MCAST_WANT_ALL_IPV4 + BATADV_MCAST_WANT_ALL_IPV6)); 531 (BATADV_MCAST_WANT_ALL_IPV4 | BATADV_MCAST_WANT_ALL_IPV6));
532} 532}
533 533
534/** 534/**
@@ -1134,6 +1134,7 @@ void batadv_mcast_init(struct batadv_priv *bat_priv)
1134 BATADV_TVLV_HANDLER_OGM_CIFNOTFND); 1134 BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
1135} 1135}
1136 1136
1137#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1137/** 1138/**
1138 * batadv_mcast_flags_print_header - print own mcast flags to debugfs table 1139 * batadv_mcast_flags_print_header - print own mcast flags to debugfs table
1139 * @bat_priv: the bat priv with all the soft interface information 1140 * @bat_priv: the bat priv with all the soft interface information
@@ -1234,6 +1235,7 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
1234 1235
1235 return 0; 1236 return 0;
1236} 1237}
1238#endif
1237 1239
1238/** 1240/**
1239 * batadv_mcast_free - free the multicast optimizations structures 1241 * batadv_mcast_free - free the multicast optimizations structures
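The single-character fix above (| instead of +) does not change the computed value here, because the two want-all flags are distinct single bits, but the bitwise form is the one that stays correct if a flag is ever contributed twice. Illustration with placeholder values:

#define EXAMPLE_WANT_ALL_IPV4	0x01
#define EXAMPLE_WANT_ALL_IPV6	0x02

/* (0x01 | 0x02) == (0x01 + 0x02) == 0x03, but
 * (0x01 | 0x01) == 0x01 while (0x01 + 0x01) == 0x02 tests the wrong bit
 */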
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 231f8eaf075b..64cb6acbe0a6 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -18,6 +18,8 @@
18#include "netlink.h" 18#include "netlink.h"
19#include "main.h" 19#include "main.h"
20 20
21#include <linux/atomic.h>
22#include <linux/byteorder/generic.h>
21#include <linux/errno.h> 23#include <linux/errno.h>
22#include <linux/fs.h> 24#include <linux/fs.h>
23#include <linux/genetlink.h> 25#include <linux/genetlink.h>
@@ -26,24 +28,33 @@
26#include <linux/netdevice.h> 28#include <linux/netdevice.h>
27#include <linux/netlink.h> 29#include <linux/netlink.h>
28#include <linux/printk.h> 30#include <linux/printk.h>
31#include <linux/rculist.h>
32#include <linux/rcupdate.h>
33#include <linux/skbuff.h>
29#include <linux/stddef.h> 34#include <linux/stddef.h>
30#include <linux/types.h> 35#include <linux/types.h>
31#include <net/genetlink.h> 36#include <net/genetlink.h>
32#include <net/netlink.h> 37#include <net/netlink.h>
38#include <net/sock.h>
33#include <uapi/linux/batman_adv.h> 39#include <uapi/linux/batman_adv.h>
34 40
41#include "bat_algo.h"
42#include "bridge_loop_avoidance.h"
43#include "gateway_client.h"
35#include "hard-interface.h" 44#include "hard-interface.h"
45#include "originator.h"
46#include "packet.h"
36#include "soft-interface.h" 47#include "soft-interface.h"
37#include "tp_meter.h" 48#include "tp_meter.h"
49#include "translation-table.h"
38 50
39struct sk_buff; 51struct genl_family batadv_netlink_family = {
40
41static struct genl_family batadv_netlink_family = {
42 .id = GENL_ID_GENERATE, 52 .id = GENL_ID_GENERATE,
43 .hdrsize = 0, 53 .hdrsize = 0,
44 .name = BATADV_NL_NAME, 54 .name = BATADV_NL_NAME,
45 .version = 1, 55 .version = 1,
46 .maxattr = BATADV_ATTR_MAX, 56 .maxattr = BATADV_ATTR_MAX,
57 .netnsok = true,
47}; 58};
48 59
49/* multicast groups */ 60/* multicast groups */
@@ -51,11 +62,11 @@ enum batadv_netlink_multicast_groups {
51 BATADV_NL_MCGRP_TPMETER, 62 BATADV_NL_MCGRP_TPMETER,
52}; 63};
53 64
54static struct genl_multicast_group batadv_netlink_mcgrps[] = { 65static const struct genl_multicast_group batadv_netlink_mcgrps[] = {
55 [BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER }, 66 [BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER },
56}; 67};
57 68
58static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { 69static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
59 [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, 70 [BATADV_ATTR_VERSION] = { .type = NLA_STRING },
60 [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, 71 [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING },
61 [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, 72 [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 },
@@ -69,9 +80,44 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
69 [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, 80 [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 },
70 [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, 81 [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 },
71 [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, 82 [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 },
83 [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG },
84 [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN },
85 [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 },
86 [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 },
87 [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 },
88 [BATADV_ATTR_TT_VID] = { .type = NLA_U16 },
89 [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 },
90 [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG },
91 [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 },
92 [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN },
93 [BATADV_ATTR_TQ] = { .type = NLA_U8 },
94 [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 },
95 [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 },
96 [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 },
97 [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN },
98 [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG },
99 [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN },
100 [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 },
101 [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN },
102 [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 },
72}; 103};
73 104
74/** 105/**
106 * batadv_netlink_get_ifindex - Extract an interface index from a message
107 * @nlh: Message header
108 * @attrtype: Attribute which holds an interface index
109 *
110 * Return: interface index, or 0.
111 */
112int
113batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
114{
115 struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype);
116
117 return attr ? nla_get_u32(attr) : 0;
118}
119
120/**
75 * batadv_netlink_mesh_info_put - fill in generic information about mesh 121 * batadv_netlink_mesh_info_put - fill in generic information about mesh
76 * interface 122 * interface
77 * @msg: netlink message to be sent back 123 * @msg: netlink message to be sent back
@@ -93,8 +139,16 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface)
93 nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) || 139 nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) ||
94 nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) || 140 nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) ||
95 nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN, 141 nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN,
96 soft_iface->dev_addr)) 142 soft_iface->dev_addr) ||
143 nla_put_u8(msg, BATADV_ATTR_TT_TTVN,
144 (u8)atomic_read(&bat_priv->tt.vn)))
145 goto out;
146
147#ifdef CONFIG_BATMAN_ADV_BLA
148 if (nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
149 ntohs(bat_priv->bla.claim_dest.group)))
97 goto out; 150 goto out;
151#endif
98 152
99 primary_if = batadv_primary_if_get_selected(bat_priv); 153 primary_if = batadv_primary_if_get_selected(bat_priv);
100 if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { 154 if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) {
@@ -380,6 +434,106 @@ out:
380 return ret; 434 return ret;
381} 435}
382 436
437/**
438 * batadv_netlink_dump_hardif_entry - Dump one hard interface into a message
439 * @msg: Netlink message to dump into
440 * @portid: Port making netlink request
441 * @seq: Sequence number of netlink message
442 * @hard_iface: Hard interface to dump
443 *
444 * Return: error code, or 0 on success
445 */
446static int
447batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq,
448 struct batadv_hard_iface *hard_iface)
449{
450 struct net_device *net_dev = hard_iface->net_dev;
451 void *hdr;
452
453 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
454 BATADV_CMD_GET_HARDIFS);
455 if (!hdr)
456 return -EMSGSIZE;
457
458 if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
459 net_dev->ifindex) ||
460 nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
461 net_dev->name) ||
462 nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN,
463 net_dev->dev_addr))
464 goto nla_put_failure;
465
466 if (hard_iface->if_status == BATADV_IF_ACTIVE) {
467 if (nla_put_flag(msg, BATADV_ATTR_ACTIVE))
468 goto nla_put_failure;
469 }
470
471 genlmsg_end(msg, hdr);
472 return 0;
473
474 nla_put_failure:
475 genlmsg_cancel(msg, hdr);
476 return -EMSGSIZE;
477}
478
479/**
 480 * batadv_netlink_dump_hardifs - Dump all hard interfaces into a message
481 * @msg: Netlink message to dump into
482 * @cb: Parameters from query
483 *
484 * Return: error code, or length of reply message on success
485 */
486static int
487batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb)
488{
489 struct net *net = sock_net(cb->skb->sk);
490 struct net_device *soft_iface;
491 struct batadv_hard_iface *hard_iface;
492 int ifindex;
493 int portid = NETLINK_CB(cb->skb).portid;
494 int seq = cb->nlh->nlmsg_seq;
495 int skip = cb->args[0];
496 int i = 0;
497
498 ifindex = batadv_netlink_get_ifindex(cb->nlh,
499 BATADV_ATTR_MESH_IFINDEX);
500 if (!ifindex)
501 return -EINVAL;
502
503 soft_iface = dev_get_by_index(net, ifindex);
504 if (!soft_iface)
505 return -ENODEV;
506
507 if (!batadv_softif_is_valid(soft_iface)) {
508 dev_put(soft_iface);
509 return -ENODEV;
510 }
511
512 rcu_read_lock();
513
514 list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
515 if (hard_iface->soft_iface != soft_iface)
516 continue;
517
518 if (i++ < skip)
519 continue;
520
521 if (batadv_netlink_dump_hardif_entry(msg, portid, seq,
522 hard_iface)) {
523 i--;
524 break;
525 }
526 }
527
528 rcu_read_unlock();
529
530 dev_put(soft_iface);
531
532 cb->args[0] = i;
533
534 return msg->len;
535}
536
383static struct genl_ops batadv_netlink_ops[] = { 537static struct genl_ops batadv_netlink_ops[] = {
384 { 538 {
385 .cmd = BATADV_CMD_GET_MESH_INFO, 539 .cmd = BATADV_CMD_GET_MESH_INFO,
@@ -399,6 +553,61 @@ static struct genl_ops batadv_netlink_ops[] = {
399 .policy = batadv_netlink_policy, 553 .policy = batadv_netlink_policy,
400 .doit = batadv_netlink_tp_meter_cancel, 554 .doit = batadv_netlink_tp_meter_cancel,
401 }, 555 },
556 {
557 .cmd = BATADV_CMD_GET_ROUTING_ALGOS,
558 .flags = GENL_ADMIN_PERM,
559 .policy = batadv_netlink_policy,
560 .dumpit = batadv_algo_dump,
561 },
562 {
563 .cmd = BATADV_CMD_GET_HARDIFS,
564 .flags = GENL_ADMIN_PERM,
565 .policy = batadv_netlink_policy,
566 .dumpit = batadv_netlink_dump_hardifs,
567 },
568 {
569 .cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL,
570 .flags = GENL_ADMIN_PERM,
571 .policy = batadv_netlink_policy,
572 .dumpit = batadv_tt_local_dump,
573 },
574 {
575 .cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL,
576 .flags = GENL_ADMIN_PERM,
577 .policy = batadv_netlink_policy,
578 .dumpit = batadv_tt_global_dump,
579 },
580 {
581 .cmd = BATADV_CMD_GET_ORIGINATORS,
582 .flags = GENL_ADMIN_PERM,
583 .policy = batadv_netlink_policy,
584 .dumpit = batadv_orig_dump,
585 },
586 {
587 .cmd = BATADV_CMD_GET_NEIGHBORS,
588 .flags = GENL_ADMIN_PERM,
589 .policy = batadv_netlink_policy,
590 .dumpit = batadv_hardif_neigh_dump,
591 },
592 {
593 .cmd = BATADV_CMD_GET_GATEWAYS,
594 .flags = GENL_ADMIN_PERM,
595 .policy = batadv_netlink_policy,
596 .dumpit = batadv_gw_dump,
597 },
598 {
599 .cmd = BATADV_CMD_GET_BLA_CLAIM,
600 .flags = GENL_ADMIN_PERM,
601 .policy = batadv_netlink_policy,
602 .dumpit = batadv_bla_claim_dump,
603 },
604 {
605 .cmd = BATADV_CMD_GET_BLA_BACKBONE,
606 .flags = GENL_ADMIN_PERM,
607 .policy = batadv_netlink_policy,
608 .dumpit = batadv_bla_backbone_dump,
609 },
610
402}; 611};
403 612
404/** 613/**
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index 945653ab58c6..52eb16281aba 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -21,12 +21,18 @@
21#include "main.h" 21#include "main.h"
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24#include <net/genetlink.h>
25
26struct nlmsghdr;
24 27
25void batadv_netlink_register(void); 28void batadv_netlink_register(void);
26void batadv_netlink_unregister(void); 29void batadv_netlink_unregister(void);
30int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype);
27 31
28int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, 32int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst,
29 u8 result, u32 test_time, u64 total_bytes, 33 u8 result, u32 test_time, u64 total_bytes,
30 u32 cookie); 34 u32 cookie);
31 35
36extern struct genl_family batadv_netlink_family;
37
32#endif /* _NET_BATMAN_ADV_NETLINK_H_ */ 38#endif /* _NET_BATMAN_ADV_NETLINK_H_ */
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 293ef4ffd4e1..e3baf697a35c 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -856,14 +856,12 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
856 if (!nc_node) 856 if (!nc_node)
857 return NULL; 857 return NULL;
858 858
859 kref_get(&orig_neigh_node->refcount);
860
861 /* Initialize nc_node */ 859 /* Initialize nc_node */
862 INIT_LIST_HEAD(&nc_node->list); 860 INIT_LIST_HEAD(&nc_node->list);
861 kref_init(&nc_node->refcount);
863 ether_addr_copy(nc_node->addr, orig_node->orig); 862 ether_addr_copy(nc_node->addr, orig_node->orig);
863 kref_get(&orig_neigh_node->refcount);
864 nc_node->orig_node = orig_neigh_node; 864 nc_node->orig_node = orig_neigh_node;
865 kref_init(&nc_node->refcount);
866 kref_get(&nc_node->refcount);
867 865
868 /* Select ingoing or outgoing coding node */ 866 /* Select ingoing or outgoing coding node */
869 if (in_coding) { 867 if (in_coding) {
@@ -879,6 +877,7 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
879 877
880 /* Add nc_node to orig_node */ 878 /* Add nc_node to orig_node */
881 spin_lock_bh(lock); 879 spin_lock_bh(lock);
880 kref_get(&nc_node->refcount);
882 list_add_tail_rcu(&nc_node->list, list); 881 list_add_tail_rcu(&nc_node->list, list);
883 spin_unlock_bh(lock); 882 spin_unlock_bh(lock);
884 883
@@ -979,7 +978,6 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
979 INIT_LIST_HEAD(&nc_path->packet_list); 978 INIT_LIST_HEAD(&nc_path->packet_list);
980 spin_lock_init(&nc_path->packet_list_lock); 979 spin_lock_init(&nc_path->packet_list_lock);
981 kref_init(&nc_path->refcount); 980 kref_init(&nc_path->refcount);
982 kref_get(&nc_path->refcount);
983 nc_path->last_valid = jiffies; 981 nc_path->last_valid = jiffies;
984 ether_addr_copy(nc_path->next_hop, dst); 982 ether_addr_copy(nc_path->next_hop, dst);
985 ether_addr_copy(nc_path->prev_hop, src); 983 ether_addr_copy(nc_path->prev_hop, src);
@@ -989,6 +987,7 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
989 nc_path->next_hop); 987 nc_path->next_hop);
990 988
991 /* Add nc_path to hash table */ 989 /* Add nc_path to hash table */
990 kref_get(&nc_path->refcount);
992 hash_added = batadv_hash_add(hash, batadv_nc_hash_compare, 991 hash_added = batadv_hash_add(hash, batadv_nc_hash_compare,
993 batadv_nc_hash_choose, &nc_path_key, 992 batadv_nc_hash_choose, &nc_path_key,
994 &nc_path->hash_entry); 993 &nc_path->hash_entry);
@@ -1882,6 +1881,7 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
1882 batadv_hash_destroy(bat_priv->nc.decoding_hash); 1881 batadv_hash_destroy(bat_priv->nc.decoding_hash);
1883} 1882}
1884 1883
1884#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1885/** 1885/**
1886 * batadv_nc_nodes_seq_print_text - print the nc node information 1886 * batadv_nc_nodes_seq_print_text - print the nc node information
1887 * @seq: seq file to print on 1887 * @seq: seq file to print on
@@ -1981,3 +1981,4 @@ int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
1981out: 1981out:
1982 return -ENOMEM; 1982 return -ENOMEM;
1983} 1983}
1984#endif
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 3940b5d24421..5f3bfc41aeb1 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -28,11 +28,15 @@
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/lockdep.h> 29#include <linux/lockdep.h>
30#include <linux/netdevice.h> 30#include <linux/netdevice.h>
31#include <linux/netlink.h>
31#include <linux/rculist.h> 32#include <linux/rculist.h>
32#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/skbuff.h>
33#include <linux/slab.h> 35#include <linux/slab.h>
34#include <linux/spinlock.h> 36#include <linux/spinlock.h>
35#include <linux/workqueue.h> 37#include <linux/workqueue.h>
38#include <net/sock.h>
39#include <uapi/linux/batman_adv.h>
36 40
37#include "bat_algo.h" 41#include "bat_algo.h"
38#include "distributed-arp-table.h" 42#include "distributed-arp-table.h"
@@ -42,8 +46,10 @@
42#include "hash.h" 46#include "hash.h"
43#include "log.h" 47#include "log.h"
44#include "multicast.h" 48#include "multicast.h"
49#include "netlink.h"
45#include "network-coding.h" 50#include "network-coding.h"
46#include "routing.h" 51#include "routing.h"
52#include "soft-interface.h"
47#include "translation-table.h" 53#include "translation-table.h"
48 54
49/* hash class keys */ 55/* hash class keys */
@@ -127,9 +133,9 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
127 goto out; 133 goto out;
128 134
129 kref_init(&vlan->refcount); 135 kref_init(&vlan->refcount);
130 kref_get(&vlan->refcount);
131 vlan->vid = vid; 136 vlan->vid = vid;
132 137
138 kref_get(&vlan->refcount);
133 hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list); 139 hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list);
134 140
135out: 141out:
@@ -380,6 +386,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
380 orig_ifinfo->if_outgoing = if_outgoing; 386 orig_ifinfo->if_outgoing = if_outgoing;
381 INIT_HLIST_NODE(&orig_ifinfo->list); 387 INIT_HLIST_NODE(&orig_ifinfo->list);
382 kref_init(&orig_ifinfo->refcount); 388 kref_init(&orig_ifinfo->refcount);
389
383 kref_get(&orig_ifinfo->refcount); 390 kref_get(&orig_ifinfo->refcount);
384 hlist_add_head_rcu(&orig_ifinfo->list, 391 hlist_add_head_rcu(&orig_ifinfo->list,
385 &orig_node->ifinfo_list); 392 &orig_node->ifinfo_list);
@@ -453,9 +460,9 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
453 460
454 INIT_HLIST_NODE(&neigh_ifinfo->list); 461 INIT_HLIST_NODE(&neigh_ifinfo->list);
455 kref_init(&neigh_ifinfo->refcount); 462 kref_init(&neigh_ifinfo->refcount);
456 kref_get(&neigh_ifinfo->refcount);
457 neigh_ifinfo->if_outgoing = if_outgoing; 463 neigh_ifinfo->if_outgoing = if_outgoing;
458 464
465 kref_get(&neigh_ifinfo->refcount);
459 hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list); 466 hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list);
460 467
461out: 468out:
@@ -647,8 +654,8 @@ batadv_neigh_node_create(struct batadv_orig_node *orig_node,
647 654
648 /* extra reference for return */ 655 /* extra reference for return */
649 kref_init(&neigh_node->refcount); 656 kref_init(&neigh_node->refcount);
650 kref_get(&neigh_node->refcount);
651 657
658 kref_get(&neigh_node->refcount);
652 hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); 659 hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
653 660
654 batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, 661 batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv,
@@ -686,6 +693,7 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
686 return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr); 693 return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr);
687} 694}
688 695
696#ifdef CONFIG_BATMAN_ADV_DEBUGFS
689/** 697/**
690 * batadv_hardif_neigh_seq_print_text - print the single hop neighbour list 698 * batadv_hardif_neigh_seq_print_text - print the single hop neighbour list
691 * @seq: neighbour table seq_file struct 699 * @seq: neighbour table seq_file struct
@@ -719,6 +727,84 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
719 bat_priv->algo_ops->neigh.print(bat_priv, seq); 727 bat_priv->algo_ops->neigh.print(bat_priv, seq);
720 return 0; 728 return 0;
721} 729}
730#endif
731
732/**
733 * batadv_hardif_neigh_dump - Dump to netlink the neighbor infos for a specific
734 * outgoing interface
735 * @msg: message to dump into
736 * @cb: parameters for the dump
737 *
738 * Return: 0 or error value
739 */
740int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb)
741{
742 struct net *net = sock_net(cb->skb->sk);
743 struct net_device *soft_iface;
744 struct net_device *hard_iface = NULL;
745 struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT;
746 struct batadv_priv *bat_priv;
747 struct batadv_hard_iface *primary_if = NULL;
748 int ret;
749 int ifindex, hard_ifindex;
750
751 ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
752 if (!ifindex)
753 return -EINVAL;
754
755 soft_iface = dev_get_by_index(net, ifindex);
756 if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
757 ret = -ENODEV;
758 goto out;
759 }
760
761 bat_priv = netdev_priv(soft_iface);
762
763 primary_if = batadv_primary_if_get_selected(bat_priv);
764 if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
765 ret = -ENOENT;
766 goto out;
767 }
768
769 hard_ifindex = batadv_netlink_get_ifindex(cb->nlh,
770 BATADV_ATTR_HARD_IFINDEX);
771 if (hard_ifindex) {
772 hard_iface = dev_get_by_index(net, hard_ifindex);
773 if (hard_iface)
774 hardif = batadv_hardif_get_by_netdev(hard_iface);
775
776 if (!hardif) {
777 ret = -ENODEV;
778 goto out;
779 }
780
781 if (hardif->soft_iface != soft_iface) {
782 ret = -ENOENT;
783 goto out;
784 }
785 }
786
787 if (!bat_priv->algo_ops->neigh.dump) {
788 ret = -EOPNOTSUPP;
789 goto out;
790 }
791
792 bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hardif);
793
794 ret = msg->len;
795
796 out:
797 if (hardif)
798 batadv_hardif_put(hardif);
799 if (hard_iface)
800 dev_put(hard_iface);
801 if (primary_if)
802 batadv_hardif_put(primary_if);
803 if (soft_iface)
804 dev_put(soft_iface);
805
806 return ret;
807}
722 808
723/** 809/**
724 * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for 810 * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for
@@ -905,7 +991,6 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
905 991
906 /* extra reference for return */ 992 /* extra reference for return */
907 kref_init(&orig_node->refcount); 993 kref_init(&orig_node->refcount);
908 kref_get(&orig_node->refcount);
909 994
910 orig_node->bat_priv = bat_priv; 995 orig_node->bat_priv = bat_priv;
911 ether_addr_copy(orig_node->orig, addr); 996 ether_addr_copy(orig_node->orig, addr);
@@ -1256,6 +1341,7 @@ void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
1256 _batadv_purge_orig(bat_priv); 1341 _batadv_purge_orig(bat_priv);
1257} 1342}
1258 1343
1344#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1259int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) 1345int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
1260{ 1346{
1261 struct net_device *net_dev = (struct net_device *)seq->private; 1347 struct net_device *net_dev = (struct net_device *)seq->private;
@@ -1329,6 +1415,84 @@ out:
1329 batadv_hardif_put(hard_iface); 1415 batadv_hardif_put(hard_iface);
1330 return 0; 1416 return 0;
1331} 1417}
1418#endif
1419
1420/**
1421 * batadv_orig_dump - Dump to netlink the originator infos for a specific
1422 * outgoing interface
1423 * @msg: message to dump into
1424 * @cb: parameters for the dump
1425 *
1426 * Return: 0 or error value
1427 */
1428int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb)
1429{
1430 struct net *net = sock_net(cb->skb->sk);
1431 struct net_device *soft_iface;
1432 struct net_device *hard_iface = NULL;
1433 struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT;
1434 struct batadv_priv *bat_priv;
1435 struct batadv_hard_iface *primary_if = NULL;
1436 int ret;
1437 int ifindex, hard_ifindex;
1438
1439 ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
1440 if (!ifindex)
1441 return -EINVAL;
1442
1443 soft_iface = dev_get_by_index(net, ifindex);
1444 if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
1445 ret = -ENODEV;
1446 goto out;
1447 }
1448
1449 bat_priv = netdev_priv(soft_iface);
1450
1451 primary_if = batadv_primary_if_get_selected(bat_priv);
1452 if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
1453 ret = -ENOENT;
1454 goto out;
1455 }
1456
1457 hard_ifindex = batadv_netlink_get_ifindex(cb->nlh,
1458 BATADV_ATTR_HARD_IFINDEX);
1459 if (hard_ifindex) {
1460 hard_iface = dev_get_by_index(net, hard_ifindex);
1461 if (hard_iface)
1462 hardif = batadv_hardif_get_by_netdev(hard_iface);
1463
1464 if (!hardif) {
1465 ret = -ENODEV;
1466 goto out;
1467 }
1468
1469 if (hardif->soft_iface != soft_iface) {
1470 ret = -ENOENT;
1471 goto out;
1472 }
1473 }
1474
1475 if (!bat_priv->algo_ops->orig.dump) {
1476 ret = -EOPNOTSUPP;
1477 goto out;
1478 }
1479
1480 bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hardif);
1481
1482 ret = msg->len;
1483
1484 out:
1485 if (hardif)
1486 batadv_hardif_put(hardif);
1487 if (hard_iface)
1488 dev_put(hard_iface);
1489 if (primary_if)
1490 batadv_hardif_put(primary_if);
1491 if (soft_iface)
1492 dev_put(soft_iface);
1493
1494 return ret;
1495}
1332 1496
1333int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, 1497int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
1334 int max_if_num) 1498 int max_if_num)
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 566306bf05dc..ebc56183f358 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -31,7 +31,9 @@
31 31
32#include "hash.h" 32#include "hash.h"
33 33
34struct netlink_callback;
34struct seq_file; 35struct seq_file;
36struct sk_buff;
35 37
36bool batadv_compare_orig(const struct hlist_node *node, const void *data2); 38bool batadv_compare_orig(const struct hlist_node *node, const void *data2);
37int batadv_originator_init(struct batadv_priv *bat_priv); 39int batadv_originator_init(struct batadv_priv *bat_priv);
@@ -61,6 +63,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
61 struct batadv_hard_iface *if_outgoing); 63 struct batadv_hard_iface *if_outgoing);
62void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo); 64void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo);
63 65
66int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb);
64int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset); 67int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset);
65 68
66struct batadv_orig_ifinfo * 69struct batadv_orig_ifinfo *
@@ -72,6 +75,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
72void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); 75void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo);
73 76
74int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); 77int batadv_orig_seq_print_text(struct seq_file *seq, void *offset);
78int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb);
75int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); 79int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset);
76int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, 80int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
77 int max_if_num); 81 int max_if_num);
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 6b011ff64dd8..6afc0b86950e 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -129,42 +129,6 @@ enum batadv_tt_data_flags {
129}; 129};
130 130
131/** 131/**
132 * enum batadv_tt_client_flags - TT client specific flags
133 * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table
134 * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new
135 * update telling its new real location has not been received/sent yet
136 * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface.
137 * This information is used by the "AP Isolation" feature
138 * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This
139 * information is used by the Extended Isolation feature
140 * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table
141 * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has
142 * not been announced yet
143 * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept
144 * in the table for one more originator interval for consistency purposes
145 * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of
146 * the network but no node has announced it yet
147 *
148 * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire.
149 * Bits from 8 to 15 are called _local flags_ because they are used for local
150 * computations only.
151 *
152 * Bits from 4 to 7 - a subset of remote flags - are ensured to be in sync with
153 * the other nodes in the network. To achieve this goal these flags are included
154 * in the TT CRC computation.
155 */
156enum batadv_tt_client_flags {
157 BATADV_TT_CLIENT_DEL = BIT(0),
158 BATADV_TT_CLIENT_ROAM = BIT(1),
159 BATADV_TT_CLIENT_WIFI = BIT(4),
160 BATADV_TT_CLIENT_ISOLA = BIT(5),
161 BATADV_TT_CLIENT_NOPURGE = BIT(8),
162 BATADV_TT_CLIENT_NEW = BIT(9),
163 BATADV_TT_CLIENT_PENDING = BIT(10),
164 BATADV_TT_CLIENT_TEMP = BIT(11),
165};
166
167/**
168 * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field 132 * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field
169 * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not 133 * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not
170 */ 134 */
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 3d199478c405..7e8dc648b95a 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -74,11 +74,23 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
74 if (!orig_ifinfo) 74 if (!orig_ifinfo)
75 return; 75 return;
76 76
77 rcu_read_lock(); 77 spin_lock_bh(&orig_node->neigh_list_lock);
78 curr_router = rcu_dereference(orig_ifinfo->router); 78 /* curr_router used earlier may not be the current orig_ifinfo->router
79 if (curr_router && !kref_get_unless_zero(&curr_router->refcount)) 79 * anymore because it was dereferenced outside of the neigh_list_lock
80 curr_router = NULL; 80 * protected region. After the new best neighbor has replaced the current
81 rcu_read_unlock(); 81 * best neighbor the reference counter needs to decrease. Consequently,
82 * the code needs to ensure the curr_router variable contains a pointer
83 * to the replaced best neighbor.
84 */
85 curr_router = rcu_dereference_protected(orig_ifinfo->router, true);
86
87 /* increase refcount of new best neighbor */
88 if (neigh_node)
89 kref_get(&neigh_node->refcount);
90
91 rcu_assign_pointer(orig_ifinfo->router, neigh_node);
92 spin_unlock_bh(&orig_node->neigh_list_lock);
93 batadv_orig_ifinfo_put(orig_ifinfo);
82 94
83 /* route deleted */ 95 /* route deleted */
84 if ((curr_router) && (!neigh_node)) { 96 if ((curr_router) && (!neigh_node)) {
@@ -100,27 +112,6 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
100 curr_router->addr); 112 curr_router->addr);
101 } 113 }
102 114
103 if (curr_router)
104 batadv_neigh_node_put(curr_router);
105
106 spin_lock_bh(&orig_node->neigh_list_lock);
107 /* curr_router used earlier may not be the current orig_ifinfo->router
108 * anymore because it was dereferenced outside of the neigh_list_lock
109 * protected region. After the new best neighbor has replaced the current
110 * best neighbor the reference counter needs to decrease. Consequently,
111 * the code needs to ensure the curr_router variable contains a pointer
112 * to the replaced best neighbor.
113 */
114 curr_router = rcu_dereference_protected(orig_ifinfo->router, true);
115
116 /* increase refcount of new best neighbor */
117 if (neigh_node)
118 kref_get(&neigh_node->refcount);
119
120 rcu_assign_pointer(orig_ifinfo->router, neigh_node);
121 spin_unlock_bh(&orig_node->neigh_list_lock);
122 batadv_orig_ifinfo_put(orig_ifinfo);
123
124 /* decrease refcount of previous best neighbor */ 115 /* decrease refcount of previous best neighbor */
125 if (curr_router) 116 if (curr_router)
126 batadv_neigh_node_put(curr_router); 117 batadv_neigh_node_put(curr_router);
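The rewritten hunk turns the router swap into one writer-side critical section: the old router is read with rcu_dereference_protected() and the replacement published with rcu_assign_pointer() while neigh_list_lock is held, so curr_router is guaranteed to be the pointer that actually got replaced and its reference can be dropped afterwards. The same writer pattern in miniature, with placeholder types rather than batman-adv structures:

#include <linux/kref.h>
#include <linux/lockdep.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct peer {
        struct kref refcount;
};

struct holder {
        struct peer __rcu *best;        /* read under RCU, written under lock */
        spinlock_t lock;
};

/* Returns the previous best peer; the caller drops that reference later. */
static struct peer *holder_replace_best(struct holder *h, struct peer *new_best)
{
        struct peer *old;

        if (new_best)
                kref_get(&new_best->refcount);  /* reference now owned by h->best */

        spin_lock_bh(&h->lock);
        old = rcu_dereference_protected(h->best, lockdep_is_held(&h->lock));
        rcu_assign_pointer(h->best, new_best);
        spin_unlock_bh(&h->lock);

        return old;
}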
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 6191159484df..8d4e1f578574 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -315,8 +315,7 @@ out:
315 * 315 *
316 * Wrap the given skb into a batman-adv unicast or unicast-4addr header 316 * Wrap the given skb into a batman-adv unicast or unicast-4addr header
317 * depending on whether BATADV_UNICAST or BATADV_UNICAST_4ADDR was supplied 317 * depending on whether BATADV_UNICAST or BATADV_UNICAST_4ADDR was supplied
318 * as packet_type. Then send this frame to the given orig_node and release a 318 * as packet_type. Then send this frame to the given orig_node.
319 * reference to this orig_node.
320 * 319 *
321 * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. 320 * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
322 */ 321 */
@@ -370,8 +369,6 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
370 ret = NET_XMIT_SUCCESS; 369 ret = NET_XMIT_SUCCESS;
371 370
372out: 371out:
373 if (orig_node)
374 batadv_orig_node_put(orig_node);
375 if (ret == NET_XMIT_DROP) 372 if (ret == NET_XMIT_DROP)
376 kfree_skb(skb); 373 kfree_skb(skb);
377 return ret; 374 return ret;
@@ -403,6 +400,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
403 struct ethhdr *ethhdr = (struct ethhdr *)skb->data; 400 struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
404 struct batadv_orig_node *orig_node; 401 struct batadv_orig_node *orig_node;
405 u8 *src, *dst; 402 u8 *src, *dst;
403 int ret;
406 404
407 src = ethhdr->h_source; 405 src = ethhdr->h_source;
408 dst = ethhdr->h_dest; 406 dst = ethhdr->h_dest;
@@ -414,8 +412,13 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
414 } 412 }
415 orig_node = batadv_transtable_search(bat_priv, src, dst, vid); 413 orig_node = batadv_transtable_search(bat_priv, src, dst, vid);
416 414
417 return batadv_send_skb_unicast(bat_priv, skb, packet_type, 415 ret = batadv_send_skb_unicast(bat_priv, skb, packet_type,
418 packet_subtype, orig_node, vid); 416 packet_subtype, orig_node, vid);
417
418 if (orig_node)
419 batadv_orig_node_put(orig_node);
420
421 return ret;
419} 422}
420 423
421/** 424/**
@@ -433,12 +436,25 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
433 unsigned short vid) 436 unsigned short vid)
434{ 437{
435 struct batadv_orig_node *orig_node; 438 struct batadv_orig_node *orig_node;
439 int ret;
436 440
437 orig_node = batadv_gw_get_selected_orig(bat_priv); 441 orig_node = batadv_gw_get_selected_orig(bat_priv);
438 return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, 442 ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR,
439 BATADV_P_DATA, orig_node, vid); 443 BATADV_P_DATA, orig_node, vid);
444
445 if (orig_node)
446 batadv_orig_node_put(orig_node);
447
448 return ret;
440} 449}
441 450
451/**
452 * batadv_forw_packet_free - free a forwarding packet
453 * @forw_packet: The packet to free
454 *
455 * This frees a forwarding packet and releases any resources it might
456 * have claimed.
457 */
442void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) 458void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet)
443{ 459{
444 kfree_skb(forw_packet->skb); 460 kfree_skb(forw_packet->skb);
@@ -446,9 +462,73 @@ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet)
446 batadv_hardif_put(forw_packet->if_incoming); 462 batadv_hardif_put(forw_packet->if_incoming);
447 if (forw_packet->if_outgoing) 463 if (forw_packet->if_outgoing)
448 batadv_hardif_put(forw_packet->if_outgoing); 464 batadv_hardif_put(forw_packet->if_outgoing);
465 if (forw_packet->queue_left)
466 atomic_inc(forw_packet->queue_left);
449 kfree(forw_packet); 467 kfree(forw_packet);
450} 468}
451 469
470/**
471 * batadv_forw_packet_alloc - allocate a forwarding packet
472 * @if_incoming: The (optional) if_incoming to be grabbed
473 * @if_outgoing: The (optional) if_outgoing to be grabbed
474 * @queue_left: The (optional) queue counter to decrease
475 * @bat_priv: The bat_priv for the mesh of this forw_packet
476 *
477 * Allocates a forwarding packet and tries to get a reference to the
478 * (optional) if_incoming, if_outgoing and queue_left. If queue_left
479 * is NULL then bat_priv is optional, too.
480 *
481 * Return: An allocated forwarding packet on success, NULL otherwise.
482 */
483struct batadv_forw_packet *
484batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming,
485 struct batadv_hard_iface *if_outgoing,
486 atomic_t *queue_left,
487 struct batadv_priv *bat_priv)
488{
489 struct batadv_forw_packet *forw_packet;
490 const char *qname;
491
492 if (queue_left && !batadv_atomic_dec_not_zero(queue_left)) {
493 qname = "unknown";
494
495 if (queue_left == &bat_priv->bcast_queue_left)
496 qname = "bcast";
497
498 if (queue_left == &bat_priv->batman_queue_left)
499 qname = "batman";
500
501 batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
502 "%s queue is full\n", qname);
503
504 return NULL;
505 }
506
507 forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC);
508 if (!forw_packet)
509 goto err;
510
511 if (if_incoming)
512 kref_get(&if_incoming->refcount);
513
514 if (if_outgoing)
515 kref_get(&if_outgoing->refcount);
516
517 forw_packet->skb = NULL;
518 forw_packet->queue_left = queue_left;
519 forw_packet->if_incoming = if_incoming;
520 forw_packet->if_outgoing = if_outgoing;
521 forw_packet->num_packets = 0;
522
523 return forw_packet;
524
525err:
526 if (queue_left)
527 atomic_inc(queue_left);
528
529 return NULL;
530}
531
452static void 532static void
453_batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, 533_batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
454 struct batadv_forw_packet *forw_packet, 534 struct batadv_forw_packet *forw_packet,
@@ -487,24 +567,20 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
487 struct batadv_bcast_packet *bcast_packet; 567 struct batadv_bcast_packet *bcast_packet;
488 struct sk_buff *newskb; 568 struct sk_buff *newskb;
489 569
490 if (!batadv_atomic_dec_not_zero(&bat_priv->bcast_queue_left)) {
491 batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
492 "bcast packet queue full\n");
493 goto out;
494 }
495
496 primary_if = batadv_primary_if_get_selected(bat_priv); 570 primary_if = batadv_primary_if_get_selected(bat_priv);
497 if (!primary_if) 571 if (!primary_if)
498 goto out_and_inc; 572 goto err;
499
500 forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC);
501 573
574 forw_packet = batadv_forw_packet_alloc(primary_if, NULL,
575 &bat_priv->bcast_queue_left,
576 bat_priv);
577 batadv_hardif_put(primary_if);
502 if (!forw_packet) 578 if (!forw_packet)
503 goto out_and_inc; 579 goto err;
504 580
505 newskb = skb_copy(skb, GFP_ATOMIC); 581 newskb = skb_copy(skb, GFP_ATOMIC);
506 if (!newskb) 582 if (!newskb)
507 goto packet_free; 583 goto err_packet_free;
508 584
509 /* as we have a copy now, it is safe to decrease the TTL */ 585 /* as we have a copy now, it is safe to decrease the TTL */
510 bcast_packet = (struct batadv_bcast_packet *)newskb->data; 586 bcast_packet = (struct batadv_bcast_packet *)newskb->data;
@@ -513,11 +589,6 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
513 skb_reset_mac_header(newskb); 589 skb_reset_mac_header(newskb);
514 590
515 forw_packet->skb = newskb; 591 forw_packet->skb = newskb;
516 forw_packet->if_incoming = primary_if;
517 forw_packet->if_outgoing = NULL;
518
519 /* how often did we send the bcast packet ? */
520 forw_packet->num_packets = 0;
521 592
522 INIT_DELAYED_WORK(&forw_packet->delayed_work, 593 INIT_DELAYED_WORK(&forw_packet->delayed_work,
523 batadv_send_outstanding_bcast_packet); 594 batadv_send_outstanding_bcast_packet);
@@ -525,13 +596,9 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
525 _batadv_add_bcast_packet_to_list(bat_priv, forw_packet, delay); 596 _batadv_add_bcast_packet_to_list(bat_priv, forw_packet, delay);
526 return NETDEV_TX_OK; 597 return NETDEV_TX_OK;
527 598
528packet_free: 599err_packet_free:
529 kfree(forw_packet); 600 batadv_forw_packet_free(forw_packet);
530out_and_inc: 601err:
531 atomic_inc(&bat_priv->bcast_queue_left);
532out:
533 if (primary_if)
534 batadv_hardif_put(primary_if);
535 return NETDEV_TX_BUSY; 602 return NETDEV_TX_BUSY;
536} 603}
537 604
@@ -592,7 +659,6 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
592 659
593out: 660out:
594 batadv_forw_packet_free(forw_packet); 661 batadv_forw_packet_free(forw_packet);
595 atomic_inc(&bat_priv->bcast_queue_left);
596} 662}
597 663
598void 664void
@@ -633,9 +699,6 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
633 699
634 if (pending) { 700 if (pending) {
635 hlist_del(&forw_packet->list); 701 hlist_del(&forw_packet->list);
636 if (!forw_packet->own)
637 atomic_inc(&bat_priv->bcast_queue_left);
638
639 batadv_forw_packet_free(forw_packet); 702 batadv_forw_packet_free(forw_packet);
640 } 703 }
641 } 704 }
@@ -663,9 +726,6 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
663 726
664 if (pending) { 727 if (pending) {
665 hlist_del(&forw_packet->list); 728 hlist_del(&forw_packet->list);
666 if (!forw_packet->own)
667 atomic_inc(&bat_priv->batman_queue_left);
668
669 batadv_forw_packet_free(forw_packet); 729 batadv_forw_packet_free(forw_packet);
670 } 730 }
671 } 731 }
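batadv_forw_packet_alloc() now owns the queue accounting: it reserves a slot with an atomic decrement that refuses to go below zero, and batadv_forw_packet_free() gives the slot back, which is why the manual bcast_queue_left/batman_queue_left bookkeeping disappears from the bcast and purge paths above. The counting pattern in isolation, with placeholder names rather than batman-adv code:

#include <linux/atomic.h>
#include <linux/slab.h>

struct job {
        atomic_t *queue_left;   /* slot counter this job holds, may be NULL */
};

static struct job *job_alloc(atomic_t *queue_left)
{
        struct job *job;

        /* reserve a slot: decrement only while the counter is above zero */
        if (queue_left && !atomic_add_unless(queue_left, -1, 0))
                return NULL;

        job = kzalloc(sizeof(*job), GFP_ATOMIC);
        if (!job) {
                if (queue_left)
                        atomic_inc(queue_left); /* hand the slot back */
                return NULL;
        }

        job->queue_left = queue_left;
        return job;
}

static void job_free(struct job *job)
{
        if (job->queue_left)
                atomic_inc(job->queue_left);    /* release the slot exactly once */
        kfree(job);
}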
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 7cecb7563b45..999f78683d9e 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -28,6 +28,12 @@
28struct sk_buff; 28struct sk_buff;
29 29
30void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet); 30void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet);
31struct batadv_forw_packet *
32batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming,
33 struct batadv_hard_iface *if_outgoing,
34 atomic_t *queue_left,
35 struct batadv_priv *bat_priv);
36
31int batadv_send_skb_to_orig(struct sk_buff *skb, 37int batadv_send_skb_to_orig(struct sk_buff *skb,
32 struct batadv_orig_node *orig_node, 38 struct batadv_orig_node *orig_node,
33 struct batadv_hard_iface *recv_if); 39 struct batadv_hard_iface *recv_if);
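With the allocator exported here, a caller only has to pair batadv_forw_packet_alloc() with batadv_forw_packet_free(); the queue slot and the interface references travel with the packet. A hedged usage sketch modelled on the bcast path in send.c above; the function name and the abbreviated error handling are illustrative, and the usual batman-adv includes are assumed:

static int queue_my_bcast(struct batadv_priv *bat_priv, struct sk_buff *skb)
{
        struct batadv_hard_iface *primary_if;
        struct batadv_forw_packet *forw_packet;
        struct sk_buff *newskb;

        primary_if = batadv_primary_if_get_selected(bat_priv);
        if (!primary_if)
                return NETDEV_TX_BUSY;

        /* reserves a bcast queue slot and grabs a primary_if reference */
        forw_packet = batadv_forw_packet_alloc(primary_if, NULL,
                                               &bat_priv->bcast_queue_left,
                                               bat_priv);
        batadv_hardif_put(primary_if);
        if (!forw_packet)
                return NETDEV_TX_BUSY;

        newskb = skb_copy(skb, GFP_ATOMIC);
        if (!newskb) {
                /* releases the queue slot and the interface reference again */
                batadv_forw_packet_free(forw_packet);
                return NETDEV_TX_BUSY;
        }

        forw_packet->skb = newskb;
        /* ... init forw_packet->delayed_work and queue it as send.c does ... */

        return NETDEV_TX_OK;
}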
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 7527c0652dd5..49e16b6e0ba3 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -39,6 +39,7 @@
39#include <linux/random.h> 39#include <linux/random.h>
40#include <linux/rculist.h> 40#include <linux/rculist.h>
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/rtnetlink.h>
42#include <linux/skbuff.h> 43#include <linux/skbuff.h>
43#include <linux/slab.h> 44#include <linux/slab.h>
44#include <linux/socket.h> 45#include <linux/socket.h>
@@ -46,7 +47,6 @@
46#include <linux/stddef.h> 47#include <linux/stddef.h>
47#include <linux/string.h> 48#include <linux/string.h>
48#include <linux/types.h> 49#include <linux/types.h>
49#include <linux/workqueue.h>
50 50
51#include "bat_algo.h" 51#include "bat_algo.h"
52#include "bridge_loop_avoidance.h" 52#include "bridge_loop_avoidance.h"
@@ -57,6 +57,7 @@
57#include "hard-interface.h" 57#include "hard-interface.h"
58#include "multicast.h" 58#include "multicast.h"
59#include "network-coding.h" 59#include "network-coding.h"
60#include "originator.h"
60#include "packet.h" 61#include "packet.h"
61#include "send.h" 62#include "send.h"
62#include "sysfs.h" 63#include "sysfs.h"
@@ -377,6 +378,8 @@ dropped:
377dropped_freed: 378dropped_freed:
378 batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); 379 batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED);
379end: 380end:
381 if (mcast_single_orig)
382 batadv_orig_node_put(mcast_single_orig);
380 if (primary_if) 383 if (primary_if)
381 batadv_hardif_put(primary_if); 384 batadv_hardif_put(primary_if);
382 return NETDEV_TX_OK; 385 return NETDEV_TX_OK;
@@ -591,6 +594,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
591 } 594 }
592 595
593 spin_lock_bh(&bat_priv->softif_vlan_list_lock); 596 spin_lock_bh(&bat_priv->softif_vlan_list_lock);
597 kref_get(&vlan->refcount);
594 hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); 598 hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list);
595 spin_unlock_bh(&bat_priv->softif_vlan_list_lock); 599 spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
596 600
@@ -601,6 +605,9 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
601 bat_priv->soft_iface->dev_addr, vid, 605 bat_priv->soft_iface->dev_addr, vid,
602 BATADV_NULL_IFINDEX, BATADV_NO_MARK); 606 BATADV_NULL_IFINDEX, BATADV_NO_MARK);
603 607
608 /* don't return reference to new softif_vlan */
609 batadv_softif_vlan_put(vlan);
610
604 return 0; 611 return 0;
605} 612}
606 613
@@ -747,34 +754,6 @@ static void batadv_set_lockdep_class(struct net_device *dev)
747} 754}
748 755
749/** 756/**
750 * batadv_softif_destroy_finish - cleans up the remains of a softif
751 * @work: work queue item
752 *
753 * Free the parts of the soft interface which can not be removed under
754 * rtnl lock (to prevent deadlock situations).
755 */
756static void batadv_softif_destroy_finish(struct work_struct *work)
757{
758 struct batadv_softif_vlan *vlan;
759 struct batadv_priv *bat_priv;
760 struct net_device *soft_iface;
761
762 bat_priv = container_of(work, struct batadv_priv,
763 cleanup_work);
764 soft_iface = bat_priv->soft_iface;
765
766 /* destroy the "untagged" VLAN */
767 vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
768 if (vlan) {
769 batadv_softif_destroy_vlan(bat_priv, vlan);
770 batadv_softif_vlan_put(vlan);
771 }
772
773 batadv_sysfs_del_meshif(soft_iface);
774 unregister_netdev(soft_iface);
775}
776
777/**
778 * batadv_softif_init_late - late stage initialization of soft interface 757 * batadv_softif_init_late - late stage initialization of soft interface
779 * @dev: registered network device to modify 758 * @dev: registered network device to modify
780 * 759 *
@@ -791,7 +770,6 @@ static int batadv_softif_init_late(struct net_device *dev)
791 770
792 bat_priv = netdev_priv(dev); 771 bat_priv = netdev_priv(dev);
793 bat_priv->soft_iface = dev; 772 bat_priv->soft_iface = dev;
794 INIT_WORK(&bat_priv->cleanup_work, batadv_softif_destroy_finish);
795 773
796 /* batadv_interface_stats() needs to be available as soon as 774 /* batadv_interface_stats() needs to be available as soon as
797 * register_netdevice() has been called 775 * register_netdevice() has been called
@@ -1028,8 +1006,19 @@ struct net_device *batadv_softif_create(struct net *net, const char *name)
1028void batadv_softif_destroy_sysfs(struct net_device *soft_iface) 1006void batadv_softif_destroy_sysfs(struct net_device *soft_iface)
1029{ 1007{
1030 struct batadv_priv *bat_priv = netdev_priv(soft_iface); 1008 struct batadv_priv *bat_priv = netdev_priv(soft_iface);
1009 struct batadv_softif_vlan *vlan;
1031 1010
1032 queue_work(batadv_event_workqueue, &bat_priv->cleanup_work); 1011 ASSERT_RTNL();
1012
1013 /* destroy the "untagged" VLAN */
1014 vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
1015 if (vlan) {
1016 batadv_softif_destroy_vlan(bat_priv, vlan);
1017 batadv_softif_vlan_put(vlan);
1018 }
1019
1020 batadv_sysfs_del_meshif(soft_iface);
1021 unregister_netdevice(soft_iface);
1033} 1022}
1034 1023
1035/** 1024/**
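The destroy path now assumes the caller already holds RTNL, as rtnetlink's ->dellink path does, so the deferred cleanup_work and unregister_netdev() are replaced by ASSERT_RTNL() plus unregister_netdevice(), which expects the lock to be held. The convention in miniature, with placeholder function names:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Caller must hold RTNL; suitable for .dellink or any rtnl_lock() section. */
static void my_iface_destroy(struct net_device *dev)
{
        ASSERT_RTNL();

        /* ... tear down per-device state that needs RTNL here ... */

        unregister_netdevice(dev);      /* RTNL already held by the caller */
}

/* Wrapper for callers that do not hold RTNL themselves. */
static void my_iface_destroy_external(struct net_device *dev)
{
        rtnl_lock();
        my_iface_destroy(dev);
        rtnl_unlock();
}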
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index fe9ca94ddee2..02d96f224c60 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -37,6 +37,7 @@
37#include <linux/stddef.h> 37#include <linux/stddef.h>
38#include <linux/string.h> 38#include <linux/string.h>
39#include <linux/stringify.h> 39#include <linux/stringify.h>
40#include <linux/workqueue.h>
40 41
41#include "bridge_loop_avoidance.h" 42#include "bridge_loop_avoidance.h"
42#include "distributed-arp-table.h" 43#include "distributed-arp-table.h"
@@ -428,6 +429,13 @@ static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr,
428 struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); 429 struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
429 int bytes_written; 430 int bytes_written;
430 431
432 /* GW mode is not available if the routing algorithm in use does not
433 * implement the GW API
434 */
435 if (!bat_priv->algo_ops->gw.get_best_gw_node ||
436 !bat_priv->algo_ops->gw.is_eligible)
437 return -ENOENT;
438
431 switch (atomic_read(&bat_priv->gw.mode)) { 439 switch (atomic_read(&bat_priv->gw.mode)) {
432 case BATADV_GW_MODE_CLIENT: 440 case BATADV_GW_MODE_CLIENT:
433 bytes_written = sprintf(buff, "%s\n", 441 bytes_written = sprintf(buff, "%s\n",
@@ -455,6 +463,13 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj,
455 char *curr_gw_mode_str; 463 char *curr_gw_mode_str;
456 int gw_mode_tmp = -1; 464 int gw_mode_tmp = -1;
457 465
466 /* toggling GW mode is allowed only if the routing algorithm in use
467 * provides the GW API
468 */
469 if (!bat_priv->algo_ops->gw.get_best_gw_node ||
470 !bat_priv->algo_ops->gw.is_eligible)
471 return -EINVAL;
472
458 if (buff[count - 1] == '\n') 473 if (buff[count - 1] == '\n')
459 buff[count - 1] = '\0'; 474 buff[count - 1] = '\0';
460 475
@@ -514,6 +529,50 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj,
514 return count; 529 return count;
515} 530}
516 531
532static ssize_t batadv_show_gw_sel_class(struct kobject *kobj,
533 struct attribute *attr, char *buff)
534{
535 struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
536
537 /* GW selection class is not available if the routing algorithm in use
538 * does not implement the GW API
539 */
540 if (!bat_priv->algo_ops->gw.get_best_gw_node ||
541 !bat_priv->algo_ops->gw.is_eligible)
542 return -ENOENT;
543
544 if (bat_priv->algo_ops->gw.show_sel_class)
545 return bat_priv->algo_ops->gw.show_sel_class(bat_priv, buff);
546
547 return sprintf(buff, "%i\n", atomic_read(&bat_priv->gw.sel_class));
548}
549
550static ssize_t batadv_store_gw_sel_class(struct kobject *kobj,
551 struct attribute *attr, char *buff,
552 size_t count)
553{
554 struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
555
556 /* setting the GW selection class is allowed only if the routing
557 * algorithm in use implements the GW API
558 */
559 if (!bat_priv->algo_ops->gw.get_best_gw_node ||
560 !bat_priv->algo_ops->gw.is_eligible)
561 return -EINVAL;
562
563 if (buff[count - 1] == '\n')
564 buff[count - 1] = '\0';
565
566 if (bat_priv->algo_ops->gw.store_sel_class)
567 return bat_priv->algo_ops->gw.store_sel_class(bat_priv, buff,
568 count);
569
570 return __batadv_store_uint_attr(buff, count, 1, BATADV_TQ_MAX_VALUE,
571 batadv_post_gw_reselect, attr,
572 &bat_priv->gw.sel_class,
573 bat_priv->soft_iface);
574}
575
517static ssize_t batadv_show_gw_bwidth(struct kobject *kobj, 576static ssize_t batadv_show_gw_bwidth(struct kobject *kobj,
518 struct attribute *attr, char *buff) 577 struct attribute *attr, char *buff)
519{ 578{
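gw_sel_class is now routed through the routing algorithm whenever it provides show/store callbacks, and only falls back to the plain integer attribute otherwise. A hypothetical callback pair matching the signatures implied by the calls above; the names and the semantics are assumptions, not taken from this diff:

static ssize_t my_algo_gw_show_sel_class(struct batadv_priv *bat_priv,
                                         char *buff)
{
        /* present the stored class in algorithm-specific units */
        return sprintf(buff, "%i\n", atomic_read(&bat_priv->gw.sel_class));
}

static ssize_t my_algo_gw_store_sel_class(struct batadv_priv *bat_priv,
                                          char *buff, size_t count)
{
        unsigned int class;

        if (kstrtouint(buff, 10, &class) || !class)
                return -EINVAL;

        atomic_set(&bat_priv->gw.sel_class, class);
        /* a real implementation would also trigger a gateway re-election */

        return count;
}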
@@ -625,8 +684,8 @@ BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR,
625 2 * BATADV_JITTER, INT_MAX, NULL); 684 2 * BATADV_JITTER, INT_MAX, NULL);
626BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0, 685BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0,
627 BATADV_TQ_MAX_VALUE, NULL); 686 BATADV_TQ_MAX_VALUE, NULL);
628BATADV_ATTR_SIF_UINT(gw_sel_class, gw.sel_class, S_IRUGO | S_IWUSR, 1, 687static BATADV_ATTR(gw_sel_class, S_IRUGO | S_IWUSR, batadv_show_gw_sel_class,
629 BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect); 688 batadv_store_gw_sel_class);
630static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth, 689static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth,
631 batadv_store_gw_bwidth); 690 batadv_store_gw_bwidth);
632#ifdef CONFIG_BATMAN_ADV_MCAST 691#ifdef CONFIG_BATMAN_ADV_MCAST
@@ -712,6 +771,8 @@ rem_attr:
712 for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) 771 for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr)
713 sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr)); 772 sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr));
714 773
774 kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE);
775 kobject_del(bat_priv->mesh_obj);
715 kobject_put(bat_priv->mesh_obj); 776 kobject_put(bat_priv->mesh_obj);
716 bat_priv->mesh_obj = NULL; 777 bat_priv->mesh_obj = NULL;
717out: 778out:
@@ -726,6 +787,8 @@ void batadv_sysfs_del_meshif(struct net_device *dev)
726 for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) 787 for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr)
727 sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr)); 788 sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr));
728 789
790 kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE);
791 kobject_del(bat_priv->mesh_obj);
729 kobject_put(bat_priv->mesh_obj); 792 kobject_put(bat_priv->mesh_obj);
730 bat_priv->mesh_obj = NULL; 793 bat_priv->mesh_obj = NULL;
731} 794}
@@ -781,6 +844,10 @@ rem_attr:
781 for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) 844 for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr)
782 sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr)); 845 sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr));
783 846
847 if (vlan->kobj != bat_priv->mesh_obj) {
848 kobject_uevent(vlan->kobj, KOBJ_REMOVE);
849 kobject_del(vlan->kobj);
850 }
784 kobject_put(vlan->kobj); 851 kobject_put(vlan->kobj);
785 vlan->kobj = NULL; 852 vlan->kobj = NULL;
786out: 853out:
@@ -800,6 +867,10 @@ void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv,
800 for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) 867 for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr)
801 sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr)); 868 sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr));
802 869
870 if (vlan->kobj != bat_priv->mesh_obj) {
871 kobject_uevent(vlan->kobj, KOBJ_REMOVE);
872 kobject_del(vlan->kobj);
873 }
803 kobject_put(vlan->kobj); 874 kobject_put(vlan->kobj);
804 vlan->kobj = NULL; 875 vlan->kobj = NULL;
805} 876}
@@ -828,31 +899,31 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj,
828 return length; 899 return length;
829} 900}
830 901
831static ssize_t batadv_store_mesh_iface(struct kobject *kobj, 902/**
832 struct attribute *attr, char *buff, 903 * batadv_store_mesh_iface_finish - store new hardif mesh_iface state
833 size_t count) 904 * @net_dev: netdevice to add/remove to/from batman-adv soft-interface
905 * @ifname: name of soft-interface to modify
906 *
907 * Changes the parts of the hard+soft interface which can not be modified under
908 * sysfs lock (to prevent deadlock situations).
909 *
910 * Return: 0 on success, < 0 on failure
911 */
912static int batadv_store_mesh_iface_finish(struct net_device *net_dev,
913 char ifname[IFNAMSIZ])
834{ 914{
835 struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
836 struct net *net = dev_net(net_dev); 915 struct net *net = dev_net(net_dev);
837 struct batadv_hard_iface *hard_iface; 916 struct batadv_hard_iface *hard_iface;
838 int status_tmp = -1; 917 int status_tmp;
839 int ret = count; 918 int ret = 0;
919
920 ASSERT_RTNL();
840 921
841 hard_iface = batadv_hardif_get_by_netdev(net_dev); 922 hard_iface = batadv_hardif_get_by_netdev(net_dev);
842 if (!hard_iface) 923 if (!hard_iface)
843 return count; 924 return 0;
844
845 if (buff[count - 1] == '\n')
846 buff[count - 1] = '\0';
847
848 if (strlen(buff) >= IFNAMSIZ) {
849 pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n",
850 buff);
851 batadv_hardif_put(hard_iface);
852 return -EINVAL;
853 }
854 925
855 if (strncmp(buff, "none", 4) == 0) 926 if (strncmp(ifname, "none", 4) == 0)
856 status_tmp = BATADV_IF_NOT_IN_USE; 927 status_tmp = BATADV_IF_NOT_IN_USE;
857 else 928 else
858 status_tmp = BATADV_IF_I_WANT_YOU; 929 status_tmp = BATADV_IF_I_WANT_YOU;
@@ -861,15 +932,13 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
861 goto out; 932 goto out;
862 933
863 if ((hard_iface->soft_iface) && 934 if ((hard_iface->soft_iface) &&
864 (strncmp(hard_iface->soft_iface->name, buff, IFNAMSIZ) == 0)) 935 (strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0))
865 goto out; 936 goto out;
866 937
867 rtnl_lock();
868
869 if (status_tmp == BATADV_IF_NOT_IN_USE) { 938 if (status_tmp == BATADV_IF_NOT_IN_USE) {
870 batadv_hardif_disable_interface(hard_iface, 939 batadv_hardif_disable_interface(hard_iface,
871 BATADV_IF_CLEANUP_AUTO); 940 BATADV_IF_CLEANUP_AUTO);
872 goto unlock; 941 goto out;
873 } 942 }
874 943
875 /* if the interface already is in use */ 944 /* if the interface already is in use */
@@ -877,15 +946,71 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
877 batadv_hardif_disable_interface(hard_iface, 946 batadv_hardif_disable_interface(hard_iface,
878 BATADV_IF_CLEANUP_AUTO); 947 BATADV_IF_CLEANUP_AUTO);
879 948
880 ret = batadv_hardif_enable_interface(hard_iface, net, buff); 949 ret = batadv_hardif_enable_interface(hard_iface, net, ifname);
881
882unlock:
883 rtnl_unlock();
884out: 950out:
885 batadv_hardif_put(hard_iface); 951 batadv_hardif_put(hard_iface);
886 return ret; 952 return ret;
887} 953}
888 954
955/**
956 * batadv_store_mesh_iface_work - store new hardif mesh_iface state
957 * @work: work queue item
958 *
959 * Changes the parts of the hard+soft interface which can not be modified under
960 * sysfs lock (to prevent deadlock situations).
961 */
962static void batadv_store_mesh_iface_work(struct work_struct *work)
963{
964 struct batadv_store_mesh_work *store_work;
965 int ret;
966
967 store_work = container_of(work, struct batadv_store_mesh_work, work);
968
969 rtnl_lock();
970 ret = batadv_store_mesh_iface_finish(store_work->net_dev,
971 store_work->soft_iface_name);
972 rtnl_unlock();
973
974 if (ret < 0)
975 pr_err("Failed to store new mesh_iface state %s for %s: %d\n",
976 store_work->soft_iface_name, store_work->net_dev->name,
977 ret);
978
979 dev_put(store_work->net_dev);
980 kfree(store_work);
981}
982
983static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
984 struct attribute *attr, char *buff,
985 size_t count)
986{
987 struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
988 struct batadv_store_mesh_work *store_work;
989
990 if (buff[count - 1] == '\n')
991 buff[count - 1] = '\0';
992
993 if (strlen(buff) >= IFNAMSIZ) {
994 pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n",
995 buff);
996 return -EINVAL;
997 }
998
999 store_work = kmalloc(sizeof(*store_work), GFP_KERNEL);
1000 if (!store_work)
1001 return -ENOMEM;
1002
1003 dev_hold(net_dev);
1004 INIT_WORK(&store_work->work, batadv_store_mesh_iface_work);
1005 store_work->net_dev = net_dev;
1006 strlcpy(store_work->soft_iface_name, buff,
1007 sizeof(store_work->soft_iface_name));
1008
1009 queue_work(batadv_event_workqueue, &store_work->work);
1010
1011 return count;
1012}
1013
889static ssize_t batadv_show_iface_status(struct kobject *kobj, 1014static ssize_t batadv_show_iface_status(struct kobject *kobj,
890 struct attribute *attr, char *buff) 1015 struct attribute *attr, char *buff)
891{ 1016{
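Writing mesh_iface now only validates the name, copies it into a small work item and queues it; the actual enable/disable runs later from the worker, where rtnl_lock() can be taken without deadlocking against the sysfs locks the store handler is called under. The pattern reduced to a generic sketch with placeholder names:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct iface_store_work {
        struct work_struct work;
        struct net_device *net_dev;
        char ifname[IFNAMSIZ];
};

static void iface_store_worker(struct work_struct *work)
{
        struct iface_store_work *sw;

        sw = container_of(work, struct iface_store_work, work);

        rtnl_lock();
        /* ... apply sw->ifname to sw->net_dev here ... */
        rtnl_unlock();

        dev_put(sw->net_dev);
        kfree(sw);
}

static int iface_store_defer(struct net_device *net_dev, const char *ifname)
{
        struct iface_store_work *sw = kmalloc(sizeof(*sw), GFP_KERNEL);

        if (!sw)
                return -ENOMEM;

        dev_hold(net_dev);      /* keep the device alive until the worker runs */
        INIT_WORK(&sw->work, iface_store_worker);
        sw->net_dev = net_dev;
        strlcpy(sw->ifname, ifname, sizeof(sw->ifname));

        schedule_work(&sw->work);

        return 0;
}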
@@ -1048,6 +1173,8 @@ out:
1048 1173
1049void batadv_sysfs_del_hardif(struct kobject **hardif_obj) 1174void batadv_sysfs_del_hardif(struct kobject **hardif_obj)
1050{ 1175{
1176 kobject_uevent(*hardif_obj, KOBJ_REMOVE);
1177 kobject_del(*hardif_obj);
1051 kobject_put(*hardif_obj); 1178 kobject_put(*hardif_obj);
1052 *hardif_obj = NULL; 1179 *hardif_obj = NULL;
1053} 1180}
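kobject_put() only drops a reference, so if anything else still pins the kobject the sysfs directory would linger and no remove uevent would be sent; the added kobject_uevent(KOBJ_REMOVE) and kobject_del() make the removal visible immediately, independent of the final reference. The sequence in isolation:

#include <linux/kobject.h>

static void my_sysfs_teardown(struct kobject **kobj)
{
        /* announce the removal and unlink from sysfs right away ... */
        kobject_uevent(*kobj, KOBJ_REMOVE);
        kobject_del(*kobj);

        /* ... then drop this reference; the object itself is freed only
         * when the last holder lets go
         */
        kobject_put(*kobj);
        *kobj = NULL;
}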
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 7e6df7a4964a..7f663092f6de 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -22,12 +22,14 @@
22#include <linux/bitops.h> 22#include <linux/bitops.h>
23#include <linux/bug.h> 23#include <linux/bug.h>
24#include <linux/byteorder/generic.h> 24#include <linux/byteorder/generic.h>
25#include <linux/cache.h>
25#include <linux/compiler.h> 26#include <linux/compiler.h>
26#include <linux/crc32c.h> 27#include <linux/crc32c.h>
27#include <linux/errno.h> 28#include <linux/errno.h>
28#include <linux/etherdevice.h> 29#include <linux/etherdevice.h>
29#include <linux/fs.h> 30#include <linux/fs.h>
30#include <linux/if_ether.h> 31#include <linux/if_ether.h>
32#include <linux/init.h>
31#include <linux/jhash.h> 33#include <linux/jhash.h>
32#include <linux/jiffies.h> 34#include <linux/jiffies.h>
33#include <linux/kernel.h> 35#include <linux/kernel.h>
@@ -35,25 +37,39 @@
35#include <linux/list.h> 37#include <linux/list.h>
36#include <linux/lockdep.h> 38#include <linux/lockdep.h>
37#include <linux/netdevice.h> 39#include <linux/netdevice.h>
40#include <linux/netlink.h>
38#include <linux/rculist.h> 41#include <linux/rculist.h>
39#include <linux/rcupdate.h> 42#include <linux/rcupdate.h>
40#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/skbuff.h>
41#include <linux/slab.h> 45#include <linux/slab.h>
42#include <linux/spinlock.h> 46#include <linux/spinlock.h>
43#include <linux/stddef.h> 47#include <linux/stddef.h>
44#include <linux/string.h> 48#include <linux/string.h>
45#include <linux/workqueue.h> 49#include <linux/workqueue.h>
50#include <net/genetlink.h>
51#include <net/netlink.h>
52#include <net/sock.h>
53#include <uapi/linux/batman_adv.h>
46 54
47#include "bridge_loop_avoidance.h" 55#include "bridge_loop_avoidance.h"
48#include "hard-interface.h" 56#include "hard-interface.h"
49#include "hash.h" 57#include "hash.h"
50#include "log.h" 58#include "log.h"
51#include "multicast.h" 59#include "multicast.h"
60#include "netlink.h"
52#include "originator.h" 61#include "originator.h"
53#include "packet.h" 62#include "packet.h"
54#include "soft-interface.h" 63#include "soft-interface.h"
55#include "tvlv.h" 64#include "tvlv.h"
56 65
66static struct kmem_cache *batadv_tl_cache __read_mostly;
67static struct kmem_cache *batadv_tg_cache __read_mostly;
68static struct kmem_cache *batadv_tt_orig_cache __read_mostly;
69static struct kmem_cache *batadv_tt_change_cache __read_mostly;
70static struct kmem_cache *batadv_tt_req_cache __read_mostly;
71static struct kmem_cache *batadv_tt_roam_cache __read_mostly;
72
57/* hash class keys */ 73/* hash class keys */
58static struct lock_class_key batadv_tt_local_hash_lock_class_key; 74static struct lock_class_key batadv_tt_local_hash_lock_class_key;
59static struct lock_class_key batadv_tt_global_hash_lock_class_key; 75static struct lock_class_key batadv_tt_global_hash_lock_class_key;
@@ -205,6 +221,20 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
205} 221}
206 222
207/** 223/**
224 * batadv_tt_local_entry_free_rcu - free the tt_local_entry
225 * @rcu: rcu pointer of the tt_local_entry
226 */
227static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu)
228{
229 struct batadv_tt_local_entry *tt_local_entry;
230
231 tt_local_entry = container_of(rcu, struct batadv_tt_local_entry,
232 common.rcu);
233
234 kmem_cache_free(batadv_tl_cache, tt_local_entry);
235}
236
237/**
208 * batadv_tt_local_entry_release - release tt_local_entry from lists and queue 238 * batadv_tt_local_entry_release - release tt_local_entry from lists and queue
209 * for free after rcu grace period 239 * for free after rcu grace period
210 * @ref: kref pointer of the tt_local_entry 240 * @ref: kref pointer of the tt_local_entry
@@ -218,7 +248,7 @@ static void batadv_tt_local_entry_release(struct kref *ref)
218 248
219 batadv_softif_vlan_put(tt_local_entry->vlan); 249 batadv_softif_vlan_put(tt_local_entry->vlan);
220 250
221 kfree_rcu(tt_local_entry, common.rcu); 251 call_rcu(&tt_local_entry->common.rcu, batadv_tt_local_entry_free_rcu);
222} 252}
223 253
224/** 254/**
@@ -234,6 +264,20 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry)
234} 264}
235 265
236/** 266/**
267 * batadv_tt_global_entry_free_rcu - free the tt_global_entry
268 * @rcu: rcu pointer of the tt_global_entry
269 */
270static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu)
271{
272 struct batadv_tt_global_entry *tt_global_entry;
273
274 tt_global_entry = container_of(rcu, struct batadv_tt_global_entry,
275 common.rcu);
276
277 kmem_cache_free(batadv_tg_cache, tt_global_entry);
278}
279
280/**
237 * batadv_tt_global_entry_release - release tt_global_entry from lists and queue 281 * batadv_tt_global_entry_release - release tt_global_entry from lists and queue
238 * for free after rcu grace period 282 * for free after rcu grace period
239 * @ref: kref pointer of the tt_global_entry 283 * @ref: kref pointer of the tt_global_entry
@@ -246,7 +290,8 @@ static void batadv_tt_global_entry_release(struct kref *ref)
246 common.refcount); 290 common.refcount);
247 291
248 batadv_tt_global_del_orig_list(tt_global_entry); 292 batadv_tt_global_del_orig_list(tt_global_entry);
249 kfree_rcu(tt_global_entry, common.rcu); 293
294 call_rcu(&tt_global_entry->common.rcu, batadv_tt_global_entry_free_rcu);
250} 295}
251 296
252/** 297/**
@@ -384,6 +429,19 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node,
384} 429}
385 430
386/** 431/**
432 * batadv_tt_orig_list_entry_free_rcu - free the orig_entry
433 * @rcu: rcu pointer of the orig_entry
434 */
435static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu)
436{
437 struct batadv_tt_orig_list_entry *orig_entry;
438
439 orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu);
440
441 kmem_cache_free(batadv_tt_orig_cache, orig_entry);
442}
443
444/**
387 * batadv_tt_orig_list_entry_release - release tt orig entry from lists and 445 * batadv_tt_orig_list_entry_release - release tt orig entry from lists and
388 * queue for free after rcu grace period 446 * queue for free after rcu grace period
389 * @ref: kref pointer of the tt orig entry 447 * @ref: kref pointer of the tt orig entry
@@ -396,7 +454,7 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref)
396 refcount); 454 refcount);
397 455
398 batadv_orig_node_put(orig_entry->orig_node); 456 batadv_orig_node_put(orig_entry->orig_node);
399 kfree_rcu(orig_entry, rcu); 457 call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu);
400} 458}
401 459
402/** 460/**
@@ -426,7 +484,7 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv,
426 bool event_removed = false; 484 bool event_removed = false;
427 bool del_op_requested, del_op_entry; 485 bool del_op_requested, del_op_entry;
428 486
429 tt_change_node = kmalloc(sizeof(*tt_change_node), GFP_ATOMIC); 487 tt_change_node = kmem_cache_alloc(batadv_tt_change_cache, GFP_ATOMIC);
430 if (!tt_change_node) 488 if (!tt_change_node)
431 return; 489 return;
432 490
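Once these objects come out of dedicated slab caches, kfree_rcu() is no longer suitable because it always ends in kfree(); hence the explicit call_rcu() callbacks above that free into the matching kmem_cache. A generic sketch of the pair, including cache setup of the kind this series presumably does elsewhere in the file; all names here are placeholders:

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct entry {
        struct rcu_head rcu;
        /* ... payload ... */
};

static struct kmem_cache *entry_cache __read_mostly;

static int __init entry_cache_init(void)
{
        entry_cache = kmem_cache_create("entry_cache", sizeof(struct entry),
                                        0, SLAB_HWCACHE_ALIGN, NULL);
        return entry_cache ? 0 : -ENOMEM;
}

static void entry_free_rcu(struct rcu_head *rcu)
{
        struct entry *entry = container_of(rcu, struct entry, rcu);

        kmem_cache_free(entry_cache, entry);
}

static void entry_release(struct entry *entry)
{
        /* defer the actual free until after an RCU grace period */
        call_rcu(&entry->rcu, entry_free_rcu);
}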
@@ -467,8 +525,8 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv,
467 continue; 525 continue;
468del: 526del:
469 list_del(&entry->list); 527 list_del(&entry->list);
470 kfree(entry); 528 kmem_cache_free(batadv_tt_change_cache, entry);
471 kfree(tt_change_node); 529 kmem_cache_free(batadv_tt_change_cache, tt_change_node);
472 event_removed = true; 530 event_removed = true;
473 goto unlock; 531 goto unlock;
474 } 532 }
@@ -646,7 +704,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
646 goto out; 704 goto out;
647 } 705 }
648 706
649 tt_local = kmalloc(sizeof(*tt_local), GFP_ATOMIC); 707 tt_local = kmem_cache_alloc(batadv_tl_cache, GFP_ATOMIC);
650 if (!tt_local) 708 if (!tt_local)
651 goto out; 709 goto out;
652 710
@@ -656,7 +714,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
656 net_ratelimited_function(batadv_info, soft_iface, 714 net_ratelimited_function(batadv_info, soft_iface,
657 "adding TT local entry %pM to non-existent VLAN %d\n", 715 "adding TT local entry %pM to non-existent VLAN %d\n",
658 addr, BATADV_PRINT_VID(vid)); 716 addr, BATADV_PRINT_VID(vid));
659 kfree(tt_local); 717 kmem_cache_free(batadv_tl_cache, tt_local);
660 tt_local = NULL; 718 tt_local = NULL;
661 goto out; 719 goto out;
662 } 720 }
@@ -676,7 +734,6 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
676 if (batadv_is_wifi_netdev(in_dev)) 734 if (batadv_is_wifi_netdev(in_dev))
677 tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; 735 tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
678 kref_init(&tt_local->common.refcount); 736 kref_init(&tt_local->common.refcount);
679 kref_get(&tt_local->common.refcount);
680 tt_local->last_seen = jiffies; 737 tt_local->last_seen = jiffies;
681 tt_local->common.added_at = tt_local->last_seen; 738 tt_local->common.added_at = tt_local->last_seen;
682 tt_local->vlan = vlan; 739 tt_local->vlan = vlan;
@@ -688,6 +745,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
688 is_multicast_ether_addr(addr)) 745 is_multicast_ether_addr(addr))
689 tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE; 746 tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE;
690 747
748 kref_get(&tt_local->common.refcount);
691 hash_added = batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt, 749 hash_added = batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt,
692 batadv_choose_tt, &tt_local->common, 750 batadv_choose_tt, &tt_local->common,
693 &tt_local->common.hash_entry); 751 &tt_local->common.hash_entry);
@@ -959,7 +1017,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
959 tt_diff_entries_count++; 1017 tt_diff_entries_count++;
960 } 1018 }
961 list_del(&entry->list); 1019 list_del(&entry->list);
962 kfree(entry); 1020 kmem_cache_free(batadv_tt_change_cache, entry);
963 } 1021 }
964 spin_unlock_bh(&bat_priv->tt.changes_list_lock); 1022 spin_unlock_bh(&bat_priv->tt.changes_list_lock);
965 1023
@@ -989,6 +1047,7 @@ container_register:
989 kfree(tt_data); 1047 kfree(tt_data);
990} 1048}
991 1049
1050#ifdef CONFIG_BATMAN_ADV_DEBUGFS
992int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) 1051int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
993{ 1052{
994 struct net_device *net_dev = (struct net_device *)seq->private; 1053 struct net_device *net_dev = (struct net_device *)seq->private;
@@ -1056,6 +1115,165 @@ out:
1056 batadv_hardif_put(primary_if); 1115 batadv_hardif_put(primary_if);
1057 return 0; 1116 return 0;
1058} 1117}
1118#endif
1119
1120/**
1121 * batadv_tt_local_dump_entry - Dump one TT local entry into a message
1122 * @msg: Netlink message to dump into
1123 * @portid: Port making netlink request
1124 * @seq: Sequence number of netlink message
1125 * @bat_priv: The bat priv with all the soft interface information
1126 * @common: tt local & tt global common data
1127 *
1128 * Return: Error code, or 0 on success
1129 */
1130static int
1131batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
1132 struct batadv_priv *bat_priv,
1133 struct batadv_tt_common_entry *common)
1134{
1135 void *hdr;
1136 struct batadv_softif_vlan *vlan;
1137 struct batadv_tt_local_entry *local;
1138 unsigned int last_seen_msecs;
1139 u32 crc;
1140
1141 local = container_of(common, struct batadv_tt_local_entry, common);
1142 last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen);
1143
1144 vlan = batadv_softif_vlan_get(bat_priv, common->vid);
1145 if (!vlan)
1146 return 0;
1147
1148 crc = vlan->tt.crc;
1149
1150 batadv_softif_vlan_put(vlan);
1151
1152 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
1153 NLM_F_MULTI,
1154 BATADV_CMD_GET_TRANSTABLE_LOCAL);
1155 if (!hdr)
1156 return -ENOBUFS;
1157
1158 if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) ||
1159 nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) ||
1160 nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) ||
1161 nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags))
1162 goto nla_put_failure;
1163
1164 if (!(common->flags & BATADV_TT_CLIENT_NOPURGE) &&
1165 nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs))
1166 goto nla_put_failure;
1167
1168 genlmsg_end(msg, hdr);
1169 return 0;
1170
1171 nla_put_failure:
1172 genlmsg_cancel(msg, hdr);
1173 return -EMSGSIZE;
1174}
1175
1176/**
1177 * batadv_tt_local_dump_bucket - Dump one TT local bucket into a message
1178 * @msg: Netlink message to dump into
1179 * @portid: Port making netlink request
1180 * @seq: Sequence number of netlink message
1181 * @bat_priv: The bat priv with all the soft interface information
1182 * @head: Pointer to the list containing the local tt entries
1183 * @idx_s: Number of entries to skip
1184 *
1185 * Return: Error code, or 0 on success
1186 */
1187static int
1188batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
1189 struct batadv_priv *bat_priv,
1190 struct hlist_head *head, int *idx_s)
1191{
1192 struct batadv_tt_common_entry *common;
1193 int idx = 0;
1194
1195 rcu_read_lock();
1196 hlist_for_each_entry_rcu(common, head, hash_entry) {
1197 if (idx++ < *idx_s)
1198 continue;
1199
1200 if (batadv_tt_local_dump_entry(msg, portid, seq, bat_priv,
1201 common)) {
1202 rcu_read_unlock();
1203 *idx_s = idx - 1;
1204 return -EMSGSIZE;
1205 }
1206 }
1207 rcu_read_unlock();
1208
1209 *idx_s = 0;
1210 return 0;
1211}
1212
1213/**
1214 * batadv_tt_local_dump - Dump TT local entries into a message
1215 * @msg: Netlink message to dump into
1216 * @cb: Parameters from query
1217 *
1218 * Return: Error code, or length of message on success
1219 */
1220int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb)
1221{
1222 struct net *net = sock_net(cb->skb->sk);
1223 struct net_device *soft_iface;
1224 struct batadv_priv *bat_priv;
1225 struct batadv_hard_iface *primary_if = NULL;
1226 struct batadv_hashtable *hash;
1227 struct hlist_head *head;
1228 int ret;
1229 int ifindex;
1230 int bucket = cb->args[0];
1231 int idx = cb->args[1];
1232 int portid = NETLINK_CB(cb->skb).portid;
1233
1234 ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
1235 if (!ifindex)
1236 return -EINVAL;
1237
1238 soft_iface = dev_get_by_index(net, ifindex);
1239 if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
1240 ret = -ENODEV;
1241 goto out;
1242 }
1243
1244 bat_priv = netdev_priv(soft_iface);
1245
1246 primary_if = batadv_primary_if_get_selected(bat_priv);
1247 if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
1248 ret = -ENOENT;
1249 goto out;
1250 }
1251
1252 hash = bat_priv->tt.local_hash;
1253
1254 while (bucket < hash->size) {
1255 head = &hash->table[bucket];
1256
1257 if (batadv_tt_local_dump_bucket(msg, portid, cb->nlh->nlmsg_seq,
1258 bat_priv, head, &idx))
1259 break;
1260
1261 bucket++;
1262 }
1263
1264 ret = msg->len;
1265
1266 out:
1267 if (primary_if)
1268 batadv_hardif_put(primary_if);
1269 if (soft_iface)
1270 dev_put(soft_iface);
1271
1272 cb->args[0] = bucket;
1273 cb->args[1] = idx;
1274
1275 return ret;
1276}
1059 1277
1060static void 1278static void
1061batadv_tt_local_set_pending(struct batadv_priv *bat_priv, 1279batadv_tt_local_set_pending(struct batadv_priv *bat_priv,
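These dump handlers are meant to be driven from userspace over generic netlink, the way batctl does it. A minimal sketch of such a query with libnl-3, dumping the local translation table of bat0; the "batadv" family name, the genl version number and the near-absent error handling are assumptions, not guaranteed by this diff:

#include <net/if.h>
#include <stdio.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/batman_adv.h>

static int print_tt_entry(struct nl_msg *msg, void *arg)
{
        struct nlattr *attrs[BATADV_ATTR_MAX + 1];
        const unsigned char *addr;

        if (genlmsg_parse(nlmsg_hdr(msg), 0, attrs, BATADV_ATTR_MAX, NULL) < 0)
                return NL_SKIP;

        if (!attrs[BATADV_ATTR_TT_ADDRESS] || !attrs[BATADV_ATTR_TT_VID])
                return NL_SKIP;

        addr = nla_data(attrs[BATADV_ATTR_TT_ADDRESS]);
        printf("%02x:%02x:%02x:%02x:%02x:%02x vid %d\n",
               addr[0], addr[1], addr[2], addr[3], addr[4], addr[5],
               nla_get_u16(attrs[BATADV_ATTR_TT_VID]));

        return NL_OK;
}

int main(void)
{
        struct nl_sock *sk = nl_socket_alloc();
        struct nl_msg *msg;
        int family;

        genl_connect(sk);
        family = genl_ctrl_resolve(sk, "batadv");       /* assumed family name */

        nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, print_tt_entry, NULL);

        msg = nlmsg_alloc();
        genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, NLM_F_DUMP,
                    BATADV_CMD_GET_TRANSTABLE_LOCAL, 1);
        nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, if_nametoindex("bat0"));

        nl_send_auto(sk, msg);
        nl_recvmsgs_default(sk);

        nlmsg_free(msg);
        nl_socket_free(sk);

        return 0;
}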
@@ -1259,7 +1477,7 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv)
1259 list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, 1477 list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
1260 list) { 1478 list) {
1261 list_del(&entry->list); 1479 list_del(&entry->list);
1262 kfree(entry); 1480 kmem_cache_free(batadv_tt_change_cache, entry);
1263 } 1481 }
1264 1482
1265 atomic_set(&bat_priv->tt.local_changes, 0); 1483 atomic_set(&bat_priv->tt.local_changes, 0);
@@ -1341,7 +1559,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
1341 goto out; 1559 goto out;
1342 } 1560 }
1343 1561
1344 orig_entry = kzalloc(sizeof(*orig_entry), GFP_ATOMIC); 1562 orig_entry = kmem_cache_zalloc(batadv_tt_orig_cache, GFP_ATOMIC);
1345 if (!orig_entry) 1563 if (!orig_entry)
1346 goto out; 1564 goto out;
1347 1565
@@ -1351,9 +1569,9 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
1351 orig_entry->orig_node = orig_node; 1569 orig_entry->orig_node = orig_node;
1352 orig_entry->ttvn = ttvn; 1570 orig_entry->ttvn = ttvn;
1353 kref_init(&orig_entry->refcount); 1571 kref_init(&orig_entry->refcount);
1354 kref_get(&orig_entry->refcount);
1355 1572
1356 spin_lock_bh(&tt_global->list_lock); 1573 spin_lock_bh(&tt_global->list_lock);
1574 kref_get(&orig_entry->refcount);
1357 hlist_add_head_rcu(&orig_entry->list, 1575 hlist_add_head_rcu(&orig_entry->list,
1358 &tt_global->orig_list); 1576 &tt_global->orig_list);
1359 spin_unlock_bh(&tt_global->list_lock); 1577 spin_unlock_bh(&tt_global->list_lock);
@@ -1411,7 +1629,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
1411 goto out; 1629 goto out;
1412 1630
1413 if (!tt_global_entry) { 1631 if (!tt_global_entry) {
1414 tt_global_entry = kzalloc(sizeof(*tt_global_entry), GFP_ATOMIC); 1632 tt_global_entry = kmem_cache_zalloc(batadv_tg_cache,
1633 GFP_ATOMIC);
1415 if (!tt_global_entry) 1634 if (!tt_global_entry)
1416 goto out; 1635 goto out;
1417 1636
@@ -1428,13 +1647,13 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
1428 if (flags & BATADV_TT_CLIENT_ROAM) 1647 if (flags & BATADV_TT_CLIENT_ROAM)
1429 tt_global_entry->roam_at = jiffies; 1648 tt_global_entry->roam_at = jiffies;
1430 kref_init(&common->refcount); 1649 kref_init(&common->refcount);
1431 kref_get(&common->refcount);
1432 common->added_at = jiffies; 1650 common->added_at = jiffies;
1433 1651
1434 INIT_HLIST_HEAD(&tt_global_entry->orig_list); 1652 INIT_HLIST_HEAD(&tt_global_entry->orig_list);
1435 atomic_set(&tt_global_entry->orig_list_count, 0); 1653 atomic_set(&tt_global_entry->orig_list_count, 0);
1436 spin_lock_init(&tt_global_entry->list_lock); 1654 spin_lock_init(&tt_global_entry->list_lock);
1437 1655
1656 kref_get(&common->refcount);
1438 hash_added = batadv_hash_add(bat_priv->tt.global_hash, 1657 hash_added = batadv_hash_add(bat_priv->tt.global_hash,
1439 batadv_compare_tt, 1658 batadv_compare_tt,
1440 batadv_choose_tt, common, 1659 batadv_choose_tt, common,
@@ -1579,6 +1798,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv,
1579 return best_entry; 1798 return best_entry;
1580} 1799}
1581 1800
1801#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1582/** 1802/**
1583 * batadv_tt_global_print_entry - print all orig nodes who announce the address 1803 * batadv_tt_global_print_entry - print all orig nodes who announce the address
1584 * for this global entry 1804 * for this global entry
@@ -1702,6 +1922,219 @@ out:
1702 batadv_hardif_put(primary_if); 1922 batadv_hardif_put(primary_if);
1703 return 0; 1923 return 0;
1704} 1924}
1925#endif
1926
1927/**
1928 * batadv_tt_global_dump_subentry - Dump all TT local entries into a message
1929 * @msg: Netlink message to dump into
1930 * @portid: Port making netlink request
1931 * @seq: Sequence number of netlink message
1932 * @common: tt local & tt global common data
1933 * @orig: Originator node announcing a non-mesh client
1934 * @best: Is the best originator for the TT entry
1935 *
1936 * Return: Error code, or 0 on success
1937 */
1938static int
1939batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
1940 struct batadv_tt_common_entry *common,
1941 struct batadv_tt_orig_list_entry *orig,
1942 bool best)
1943{
1944 void *hdr;
1945 struct batadv_orig_node_vlan *vlan;
1946 u8 last_ttvn;
1947 u32 crc;
1948
1949 vlan = batadv_orig_node_vlan_get(orig->orig_node,
1950 common->vid);
1951 if (!vlan)
1952 return 0;
1953
1954 crc = vlan->tt.crc;
1955
1956 batadv_orig_node_vlan_put(vlan);
1957
1958 hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
1959 NLM_F_MULTI,
1960 BATADV_CMD_GET_TRANSTABLE_GLOBAL);
1961 if (!hdr)
1962 return -ENOBUFS;
1963
1964 last_ttvn = atomic_read(&orig->orig_node->last_ttvn);
1965
1966 if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) ||
1967 nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
1968 orig->orig_node->orig) ||
1969 nla_put_u8(msg, BATADV_ATTR_TT_TTVN, orig->ttvn) ||
1970 nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) ||
1971 nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) ||
1972 nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) ||
1973 nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags))
1974 goto nla_put_failure;
1975
1976 if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
1977 goto nla_put_failure;
1978
1979 genlmsg_end(msg, hdr);
1980 return 0;
1981
1982 nla_put_failure:
1983 genlmsg_cancel(msg, hdr);
1984 return -EMSGSIZE;
1985}
1986
1987/**
1988 * batadv_tt_global_dump_entry - Dump one TT global entry into a message
1989 * @msg: Netlink message to dump into
1990 * @portid: Port making netlink request
1991 * @seq: Sequence number of netlink message
1992 * @bat_priv: The bat priv with all the soft interface information
1993 * @common: tt local & tt global common data
1994 * @sub_s: Number of entries to skip
1995 *
1996 * This function assumes the caller holds rcu_read_lock().
1997 *
1998 * Return: Error code, or 0 on success
1999 */
2000static int
2001batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
2002 struct batadv_priv *bat_priv,
2003 struct batadv_tt_common_entry *common, int *sub_s)
2004{
2005 struct batadv_tt_orig_list_entry *orig_entry, *best_entry;
2006 struct batadv_tt_global_entry *global;
2007 struct hlist_head *head;
2008 int sub = 0;
2009 bool best;
2010
2011 global = container_of(common, struct batadv_tt_global_entry, common);
2012 best_entry = batadv_transtable_best_orig(bat_priv, global);
2013 head = &global->orig_list;
2014
2015 hlist_for_each_entry_rcu(orig_entry, head, list) {
2016 if (sub++ < *sub_s)
2017 continue;
2018
2019 best = (orig_entry == best_entry);
2020
2021 if (batadv_tt_global_dump_subentry(msg, portid, seq, common,
2022 orig_entry, best)) {
2023 *sub_s = sub - 1;
2024 return -EMSGSIZE;
2025 }
2026 }
2027
2028 *sub_s = 0;
2029 return 0;
2030}
2031
2032/**
2033 * batadv_tt_global_dump_bucket - Dump one TT global bucket into a message
2034 * @msg: Netlink message to dump into
2035 * @portid: Port making netlink request
2036 * @seq: Sequence number of netlink message
2037 * @bat_priv: The bat priv with all the soft interface information
2038 * @head: Pointer to the list containing the global tt entries
2039 * @idx_s: Number of entries to skip
2040 * @sub: Number of sub-entries to skip within the resumed entry
2041 *
2042 * Return: Error code, or 0 on success
2043 */
2044static int
2045batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
2046 struct batadv_priv *bat_priv,
2047 struct hlist_head *head, int *idx_s, int *sub)
2048{
2049 struct batadv_tt_common_entry *common;
2050 int idx = 0;
2051
2052 rcu_read_lock();
2053 hlist_for_each_entry_rcu(common, head, hash_entry) {
2054 if (idx++ < *idx_s)
2055 continue;
2056
2057 if (batadv_tt_global_dump_entry(msg, portid, seq, bat_priv,
2058 common, sub)) {
2059 rcu_read_unlock();
2060 *idx_s = idx - 1;
2061 return -EMSGSIZE;
2062 }
2063 }
2064 rcu_read_unlock();
2065
2066 *idx_s = 0;
2067 *sub = 0;
2068 return 0;
2069}
2070
2071/**
2072 * batadv_tt_global_dump - Dump TT global entries into a message
2073 * @msg: Netlink message to dump into
2074 * @cb: Parameters from query
2075 *
2076 * Return: Error code, or length of message on success
2077 */
2078int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb)
2079{
2080 struct net *net = sock_net(cb->skb->sk);
2081 struct net_device *soft_iface;
2082 struct batadv_priv *bat_priv;
2083 struct batadv_hard_iface *primary_if = NULL;
2084 struct batadv_hashtable *hash;
2085 struct hlist_head *head;
2086 int ret;
2087 int ifindex;
2088 int bucket = cb->args[0];
2089 int idx = cb->args[1];
2090 int sub = cb->args[2];
2091 int portid = NETLINK_CB(cb->skb).portid;
2092
2093 ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
2094 if (!ifindex)
2095 return -EINVAL;
2096
2097 soft_iface = dev_get_by_index(net, ifindex);
2098 if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
2099 ret = -ENODEV;
2100 goto out;
2101 }
2102
2103 bat_priv = netdev_priv(soft_iface);
2104
2105 primary_if = batadv_primary_if_get_selected(bat_priv);
2106 if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
2107 ret = -ENOENT;
2108 goto out;
2109 }
2110
2111 hash = bat_priv->tt.global_hash;
2112
2113 while (bucket < hash->size) {
2114 head = &hash->table[bucket];
2115
2116 if (batadv_tt_global_dump_bucket(msg, portid,
2117 cb->nlh->nlmsg_seq, bat_priv,
2118 head, &idx, &sub))
2119 break;
2120
2121 bucket++;
2122 }
2123
2124 ret = msg->len;
2125
2126 out:
2127 if (primary_if)
2128 batadv_hardif_put(primary_if);
2129 if (soft_iface)
2130 dev_put(soft_iface);
2131
2132 cb->args[0] = bucket;
2133 cb->args[1] = idx;
2134 cb->args[2] = sub;
2135
2136 return ret;
2137}
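All of the per-message state of this dump lives in cb->args[]: whenever a bucket, entry or sub-entry dump runs out of room (-EMSGSIZE), the current bucket, entry index and sub-entry index are written back so the next invocation of the callback resumes exactly where the previous netlink message stopped. Below is a minimal hedged sketch of that resume pattern with a single counter; my_item, my_item_list, my_item_put() and MY_ATTR_VALUE are invented placeholders, not batman-adv symbols.

/* Sketch only: the resume logic mirrors the cb->args handling above. */
#include <linux/list.h>
#include <net/netlink.h>

#define MY_ATTR_VALUE 1

struct my_item {
	struct list_head list;
	u32 value;
};

static LIST_HEAD(my_item_list);

static int my_item_put(struct sk_buff *msg, struct my_item *item)
{
	/* a real genetlink dump would also genlmsg_put() a header per entry */
	return nla_put_u32(msg, MY_ATTR_VALUE, item->value);
}

static int my_table_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
	int start = cb->args[0];	/* resume point saved by the last call */
	struct my_item *item;
	int idx = 0;

	list_for_each_entry(item, &my_item_list, list) {
		if (idx++ < start)
			continue;	/* already sent in an earlier message */

		if (my_item_put(msg, item)) {	/* no room left in this skb */
			idx--;			/* retry this item next time */
			break;
		}
	}

	cb->args[0] = idx;	/* netlink core keeps calling until len is 0 */
	return msg->len;
}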
1705 2138
1706/** 2139/**
1707 * _batadv_tt_global_del_orig_entry - remove and free an orig_entry 2140 * _batadv_tt_global_del_orig_entry - remove and free an orig_entry
@@ -2280,7 +2713,7 @@ static void batadv_tt_req_node_release(struct kref *ref)
2280 2713
2281 tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount); 2714 tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount);
2282 2715
2283 kfree(tt_req_node); 2716 kmem_cache_free(batadv_tt_req_cache, tt_req_node);
2284} 2717}
2285 2718
2286/** 2719/**
@@ -2367,7 +2800,7 @@ batadv_tt_req_node_new(struct batadv_priv *bat_priv,
2367 goto unlock; 2800 goto unlock;
2368 } 2801 }
2369 2802
2370 tt_req_node = kmalloc(sizeof(*tt_req_node), GFP_ATOMIC); 2803 tt_req_node = kmem_cache_alloc(batadv_tt_req_cache, GFP_ATOMIC);
2371 if (!tt_req_node) 2804 if (!tt_req_node)
2372 goto unlock; 2805 goto unlock;
2373 2806
@@ -3104,7 +3537,7 @@ static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv)
3104 3537
3105 list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) { 3538 list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) {
3106 list_del(&node->list); 3539 list_del(&node->list);
3107 kfree(node); 3540 kmem_cache_free(batadv_tt_roam_cache, node);
3108 } 3541 }
3109 3542
3110 spin_unlock_bh(&bat_priv->tt.roam_list_lock); 3543 spin_unlock_bh(&bat_priv->tt.roam_list_lock);
@@ -3121,7 +3554,7 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv)
3121 continue; 3554 continue;
3122 3555
3123 list_del(&node->list); 3556 list_del(&node->list);
3124 kfree(node); 3557 kmem_cache_free(batadv_tt_roam_cache, node);
3125 } 3558 }
3126 spin_unlock_bh(&bat_priv->tt.roam_list_lock); 3559 spin_unlock_bh(&bat_priv->tt.roam_list_lock);
3127} 3560}
@@ -3162,7 +3595,8 @@ static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client)
3162 } 3595 }
3163 3596
3164 if (!ret) { 3597 if (!ret) {
3165 tt_roam_node = kmalloc(sizeof(*tt_roam_node), GFP_ATOMIC); 3598 tt_roam_node = kmem_cache_alloc(batadv_tt_roam_cache,
3599 GFP_ATOMIC);
3166 if (!tt_roam_node) 3600 if (!tt_roam_node)
3167 goto unlock; 3601 goto unlock;
3168 3602
@@ -3865,3 +4299,85 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
3865 4299
3866 return ret; 4300 return ret;
3867} 4301}
4302
4303/**
4304 * batadv_tt_cache_init - Initialize tt memory object cache
4305 *
4306 * Return: 0 on success or negative error number in case of failure.
4307 */
4308int __init batadv_tt_cache_init(void)
4309{
4310 size_t tl_size = sizeof(struct batadv_tt_local_entry);
4311 size_t tg_size = sizeof(struct batadv_tt_global_entry);
4312 size_t tt_orig_size = sizeof(struct batadv_tt_orig_list_entry);
4313 size_t tt_change_size = sizeof(struct batadv_tt_change_node);
4314 size_t tt_req_size = sizeof(struct batadv_tt_req_node);
4315 size_t tt_roam_size = sizeof(struct batadv_tt_roam_node);
4316
4317 batadv_tl_cache = kmem_cache_create("batadv_tl_cache", tl_size, 0,
4318 SLAB_HWCACHE_ALIGN, NULL);
4319 if (!batadv_tl_cache)
4320 return -ENOMEM;
4321
4322 batadv_tg_cache = kmem_cache_create("batadv_tg_cache", tg_size, 0,
4323 SLAB_HWCACHE_ALIGN, NULL);
4324 if (!batadv_tg_cache)
4325 goto err_tt_tl_destroy;
4326
4327 batadv_tt_orig_cache = kmem_cache_create("batadv_tt_orig_cache",
4328 tt_orig_size, 0,
4329 SLAB_HWCACHE_ALIGN, NULL);
4330 if (!batadv_tt_orig_cache)
4331 goto err_tt_tg_destroy;
4332
4333 batadv_tt_change_cache = kmem_cache_create("batadv_tt_change_cache",
4334 tt_change_size, 0,
4335 SLAB_HWCACHE_ALIGN, NULL);
4336 if (!batadv_tt_change_cache)
4337 goto err_tt_orig_destroy;
4338
4339 batadv_tt_req_cache = kmem_cache_create("batadv_tt_req_cache",
4340 tt_req_size, 0,
4341 SLAB_HWCACHE_ALIGN, NULL);
4342 if (!batadv_tt_req_cache)
4343 goto err_tt_change_destroy;
4344
4345 batadv_tt_roam_cache = kmem_cache_create("batadv_tt_roam_cache",
4346 tt_roam_size, 0,
4347 SLAB_HWCACHE_ALIGN, NULL);
4348 if (!batadv_tt_roam_cache)
4349 goto err_tt_req_destroy;
4350
4351 return 0;
4352
4353err_tt_req_destroy:
4354 kmem_cache_destroy(batadv_tt_req_cache);
4355 batadv_tt_req_cache = NULL;
4356err_tt_change_destroy:
4357 kmem_cache_destroy(batadv_tt_change_cache);
4358 batadv_tt_change_cache = NULL;
4359err_tt_orig_destroy:
4360 kmem_cache_destroy(batadv_tt_orig_cache);
4361 batadv_tt_orig_cache = NULL;
4362err_tt_tg_destroy:
4363 kmem_cache_destroy(batadv_tg_cache);
4364 batadv_tg_cache = NULL;
4365err_tt_tl_destroy:
4366 kmem_cache_destroy(batadv_tl_cache);
4367 batadv_tl_cache = NULL;
4368
4369 return -ENOMEM;
4370}
4371
4372/**
4373 * batadv_tt_cache_destroy - Destroy tt memory object cache
4374 */
4375void batadv_tt_cache_destroy(void)
4376{
4377 kmem_cache_destroy(batadv_tl_cache);
4378 kmem_cache_destroy(batadv_tg_cache);
4379 kmem_cache_destroy(batadv_tt_orig_cache);
4380 kmem_cache_destroy(batadv_tt_change_cache);
4381 kmem_cache_destroy(batadv_tt_req_cache);
4382 kmem_cache_destroy(batadv_tt_roam_cache);
4383}
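batadv_tt_cache_init() unwinds in strict reverse order on failure, so a caller only has to check its return value; batadv_tt_cache_destroy() can then be called unconditionally on shutdown because kmem_cache_destroy() ignores NULL pointers. A hedged sketch of how a module entry point would pair the two (the example_* names are placeholders, not the real batman-adv init path):

/* Illustrative only: shows the expected init/exit pairing. */
static int __init example_module_init(void)
{
	int ret;

	ret = batadv_tt_cache_init();
	if (ret < 0)
		return ret;	/* partially created caches already destroyed */

	/* ... register genl family, debugfs, routing algorithms ... */

	return 0;
}

static void __exit example_module_exit(void)
{
	/* safe even if a cache pointer is still NULL */
	batadv_tt_cache_destroy();
}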
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 7c7e2c006bfe..783fdba84db2 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -22,8 +22,10 @@
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24 24
25struct netlink_callback;
25struct net_device; 26struct net_device;
26struct seq_file; 27struct seq_file;
28struct sk_buff;
27 29
28int batadv_tt_init(struct batadv_priv *bat_priv); 30int batadv_tt_init(struct batadv_priv *bat_priv);
29bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, 31bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
@@ -33,6 +35,8 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv,
33 const char *message, bool roaming); 35 const char *message, bool roaming);
34int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset); 36int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset);
35int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset); 37int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset);
38int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb);
39int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb);
36void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, 40void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
37 struct batadv_orig_node *orig_node, 41 struct batadv_orig_node *orig_node,
38 s32 match_vid, const char *message); 42 s32 match_vid, const char *message);
@@ -59,4 +63,7 @@ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv,
59bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, 63bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
60 const u8 *addr, unsigned short vid); 64 const u8 *addr, unsigned short vid);
61 65
66int batadv_tt_cache_init(void);
67void batadv_tt_cache_destroy(void);
68
62#endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */ 69#endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 3d1cf0fb112d..77654f055f24 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -257,8 +257,13 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
257 spin_lock_bh(&bat_priv->tvlv.container_list_lock); 257 spin_lock_bh(&bat_priv->tvlv.container_list_lock);
258 tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); 258 tvlv_old = batadv_tvlv_container_get(bat_priv, type, version);
259 batadv_tvlv_container_remove(bat_priv, tvlv_old); 259 batadv_tvlv_container_remove(bat_priv, tvlv_old);
260
261 kref_get(&tvlv_new->refcount);
260 hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); 262 hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list);
261 spin_unlock_bh(&bat_priv->tvlv.container_list_lock); 263 spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
264
265 /* don't return reference to new tvlv_container */
266 batadv_tvlv_container_put(tvlv_new);
262} 267}
263 268
264/** 269/**
@@ -542,8 +547,12 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
542 INIT_HLIST_NODE(&tvlv_handler->list); 547 INIT_HLIST_NODE(&tvlv_handler->list);
543 548
544 spin_lock_bh(&bat_priv->tvlv.handler_list_lock); 549 spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
550 kref_get(&tvlv_handler->refcount);
545 hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); 551 hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list);
546 spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); 552 spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
553
554 /* don't return reference to new tvlv_handler */
555 batadv_tvlv_handler_put(tvlv_handler);
547} 556}
548 557
549/** 558/**
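Both tvlv.c hunks apply the same ownership rule: the list takes its own kref before the object is added, and the reference the registering caller created is dropped right afterwards, so the entry's lifetime is tied to list membership rather than to the caller. A generic hedged sketch of that idiom with invented my_obj names:

#include <linux/kref.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_obj {
	struct hlist_node list;
	struct kref refcount;
};

static HLIST_HEAD(my_obj_list);
static DEFINE_SPINLOCK(my_obj_list_lock);

static void my_obj_release(struct kref *ref)
{
	kfree(container_of(ref, struct my_obj, refcount));
}

static void my_obj_put(struct my_obj *obj)
{
	kref_put(&obj->refcount, my_obj_release);
}

/* Register @obj: the list takes its own reference, the caller keeps none. */
static void my_obj_register(struct my_obj *obj)
{
	spin_lock_bh(&my_obj_list_lock);
	kref_get(&obj->refcount);		/* reference owned by the list */
	hlist_add_head(&obj->list, &my_obj_list);
	spin_unlock_bh(&my_obj_list_lock);

	my_obj_put(obj);			/* drop the caller's reference */
}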
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index a64522c3b45d..b3dd1a381aad 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -28,6 +28,7 @@
28#include <linux/if_ether.h> 28#include <linux/if_ether.h>
29#include <linux/kref.h> 29#include <linux/kref.h>
30#include <linux/netdevice.h> 30#include <linux/netdevice.h>
31#include <linux/netlink.h>
31#include <linux/sched.h> /* for linux/wait.h */ 32#include <linux/sched.h> /* for linux/wait.h */
32#include <linux/spinlock.h> 33#include <linux/spinlock.h>
33#include <linux/types.h> 34#include <linux/types.h>
@@ -132,7 +133,6 @@ struct batadv_hard_iface_bat_v {
132 * @rcu: struct used for freeing in an RCU-safe manner 133 * @rcu: struct used for freeing in an RCU-safe manner
133 * @bat_iv: per hard-interface B.A.T.M.A.N. IV data 134 * @bat_iv: per hard-interface B.A.T.M.A.N. IV data
134 * @bat_v: per hard-interface B.A.T.M.A.N. V data 135 * @bat_v: per hard-interface B.A.T.M.A.N. V data
135 * @cleanup_work: work queue callback item for hard-interface deinit
136 * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs 136 * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
137 * @neigh_list: list of unique single hop neighbors via this interface 137 * @neigh_list: list of unique single hop neighbors via this interface
138 * @neigh_list_lock: lock protecting neigh_list 138 * @neigh_list_lock: lock protecting neigh_list
@@ -152,7 +152,6 @@ struct batadv_hard_iface {
152#ifdef CONFIG_BATMAN_ADV_BATMAN_V 152#ifdef CONFIG_BATMAN_ADV_BATMAN_V
153 struct batadv_hard_iface_bat_v bat_v; 153 struct batadv_hard_iface_bat_v bat_v;
154#endif 154#endif
155 struct work_struct cleanup_work;
156 struct dentry *debug_dir; 155 struct dentry *debug_dir;
157 struct hlist_head neigh_list; 156 struct hlist_head neigh_list;
158 /* neigh_list_lock protects: neigh_list */ 157 /* neigh_list_lock protects: neigh_list */
@@ -1015,7 +1014,6 @@ struct batadv_priv_bat_v {
1015 * @forw_bcast_list_lock: lock protecting forw_bcast_list 1014 * @forw_bcast_list_lock: lock protecting forw_bcast_list
1016 * @tp_list_lock: spinlock protecting @tp_list 1015 * @tp_list_lock: spinlock protecting @tp_list
1017 * @orig_work: work queue callback item for orig node purging 1016 * @orig_work: work queue callback item for orig node purging
1018 * @cleanup_work: work queue callback item for soft-interface deinit
1019 * @primary_if: one of the hard-interfaces assigned to this mesh interface 1017 * @primary_if: one of the hard-interfaces assigned to this mesh interface
1020 * becomes the primary interface 1018 * becomes the primary interface
1021 * @algo_ops: routing algorithm used by this mesh interface 1019 * @algo_ops: routing algorithm used by this mesh interface
@@ -1074,7 +1072,6 @@ struct batadv_priv {
1074 spinlock_t tp_list_lock; /* protects tp_list */ 1072 spinlock_t tp_list_lock; /* protects tp_list */
1075 atomic_t tp_num; 1073 atomic_t tp_num;
1076 struct delayed_work orig_work; 1074 struct delayed_work orig_work;
1077 struct work_struct cleanup_work;
1078 struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */ 1075 struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */
1079 struct batadv_algo_ops *algo_ops; 1076 struct batadv_algo_ops *algo_ops;
1080 struct hlist_head softif_vlan_list; 1077 struct hlist_head softif_vlan_list;
@@ -1379,6 +1376,7 @@ struct batadv_skb_cb {
1379 * locally generated packet 1376 * locally generated packet
1380 * @if_outgoing: packet where the packet should be sent to, or NULL if 1377 * @if_outgoing: packet where the packet should be sent to, or NULL if
1381 * unspecified 1378 * unspecified
1379 * @queue_left: The queue (counter) this packet was applied to
1382 */ 1380 */
1383struct batadv_forw_packet { 1381struct batadv_forw_packet {
1384 struct hlist_node list; 1382 struct hlist_node list;
@@ -1391,11 +1389,13 @@ struct batadv_forw_packet {
1391 struct delayed_work delayed_work; 1389 struct delayed_work delayed_work;
1392 struct batadv_hard_iface *if_incoming; 1390 struct batadv_hard_iface *if_incoming;
1393 struct batadv_hard_iface *if_outgoing; 1391 struct batadv_hard_iface *if_outgoing;
1392 atomic_t *queue_left;
1394}; 1393};
1395 1394
1396/** 1395/**
1397 * struct batadv_algo_iface_ops - mesh algorithm callbacks (interface specific) 1396 * struct batadv_algo_iface_ops - mesh algorithm callbacks (interface specific)
1398 * @activate: start routing mechanisms when hard-interface is brought up 1397 * @activate: start routing mechanisms when hard-interface is brought up
1398 * (optional)
1399 * @enable: init routing info when hard-interface is enabled 1399 * @enable: init routing info when hard-interface is enabled
1400 * @disable: de-init routing info when hard-interface is disabled 1400 * @disable: de-init routing info when hard-interface is disabled
1401 * @update_mac: (re-)init mac addresses of the protocol information 1401 * @update_mac: (re-)init mac addresses of the protocol information
@@ -1413,11 +1413,13 @@ struct batadv_algo_iface_ops {
1413/** 1413/**
1414 * struct batadv_algo_neigh_ops - mesh algorithm callbacks (neighbour specific) 1414 * struct batadv_algo_neigh_ops - mesh algorithm callbacks (neighbour specific)
1415 * @hardif_init: called on creation of single hop entry 1415 * @hardif_init: called on creation of single hop entry
1416 * (optional)
1416 * @cmp: compare the metrics of two neighbors for their respective outgoing 1417 * @cmp: compare the metrics of two neighbors for their respective outgoing
1417 * interfaces 1418 * interfaces
1418 * @is_similar_or_better: check if neigh1 is equally similar or better than 1419 * @is_similar_or_better: check if neigh1 is equally similar or better than
1419 * neigh2 for their respective outgoing interface from the metric perspective 1420 * neigh2 for their respective outgoing interface from the metric perspective
1420 * @print: print the single hop neighbor list (optional) 1421 * @print: print the single hop neighbor list (optional)
1422 * @dump: dump neighbors to a netlink socket (optional)
1421 */ 1423 */
1422struct batadv_algo_neigh_ops { 1424struct batadv_algo_neigh_ops {
1423 void (*hardif_init)(struct batadv_hardif_neigh_node *neigh); 1425 void (*hardif_init)(struct batadv_hardif_neigh_node *neigh);
@@ -1429,26 +1431,64 @@ struct batadv_algo_neigh_ops {
1429 struct batadv_hard_iface *if_outgoing1, 1431 struct batadv_hard_iface *if_outgoing1,
1430 struct batadv_neigh_node *neigh2, 1432 struct batadv_neigh_node *neigh2,
1431 struct batadv_hard_iface *if_outgoing2); 1433 struct batadv_hard_iface *if_outgoing2);
1434#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1432 void (*print)(struct batadv_priv *priv, struct seq_file *seq); 1435 void (*print)(struct batadv_priv *priv, struct seq_file *seq);
1436#endif
1437 void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
1438 struct batadv_priv *priv,
1439 struct batadv_hard_iface *hard_iface);
1433}; 1440};
1434 1441
1435/** 1442/**
1436 * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific) 1443 * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific)
1437 * @free: free the resources allocated by the routing algorithm for an orig_node 1444 * @free: free the resources allocated by the routing algorithm for an orig_node
1438 * object 1445 * object (optional)
1439 * @add_if: ask the routing algorithm to apply the needed changes to the 1446 * @add_if: ask the routing algorithm to apply the needed changes to the
1440 * orig_node due to a new hard-interface being added into the mesh 1447 * orig_node due to a new hard-interface being added into the mesh (optional)
1441 * @del_if: ask the routing algorithm to apply the needed changes to the 1448 * @del_if: ask the routing algorithm to apply the needed changes to the
1442 * orig_node due to a hard-interface being removed from the mesh 1449 * orig_node due to a hard-interface being removed from the mesh (optional)
1443 * @print: print the originator table (optional) 1450 * @print: print the originator table (optional)
1451 * @dump: dump originators to a netlink socket (optional)
1444 */ 1452 */
1445struct batadv_algo_orig_ops { 1453struct batadv_algo_orig_ops {
1446 void (*free)(struct batadv_orig_node *orig_node); 1454 void (*free)(struct batadv_orig_node *orig_node);
1447 int (*add_if)(struct batadv_orig_node *orig_node, int max_if_num); 1455 int (*add_if)(struct batadv_orig_node *orig_node, int max_if_num);
1448 int (*del_if)(struct batadv_orig_node *orig_node, int max_if_num, 1456 int (*del_if)(struct batadv_orig_node *orig_node, int max_if_num,
1449 int del_if_num); 1457 int del_if_num);
1458#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1450 void (*print)(struct batadv_priv *priv, struct seq_file *seq, 1459 void (*print)(struct batadv_priv *priv, struct seq_file *seq,
1451 struct batadv_hard_iface *hard_iface); 1460 struct batadv_hard_iface *hard_iface);
1461#endif
1462 void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
1463 struct batadv_priv *priv,
1464 struct batadv_hard_iface *hard_iface);
1465};
1466
1467/**
1468 * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific)
1469 * @store_sel_class: parses and stores a new GW selection class (optional)
1470 * @show_sel_class: prints the current GW selection class (optional)
1471 * @get_best_gw_node: select the best GW from the list of available nodes
1472 * (optional)
1473 * @is_eligible: check if a newly discovered GW is a potential candidate for
1474 * the election as best GW (optional)
1475 * @print: print the gateway table (optional)
1476 * @dump: dump gateways to a netlink socket (optional)
1477 */
1478struct batadv_algo_gw_ops {
1479 ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff,
1480 size_t count);
1481 ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff);
1482 struct batadv_gw_node *(*get_best_gw_node)
1483 (struct batadv_priv *bat_priv);
1484 bool (*is_eligible)(struct batadv_priv *bat_priv,
1485 struct batadv_orig_node *curr_gw_orig,
1486 struct batadv_orig_node *orig_node);
1487#ifdef CONFIG_BATMAN_ADV_DEBUGFS
1488 void (*print)(struct batadv_priv *bat_priv, struct seq_file *seq);
1489#endif
1490 void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
1491 struct batadv_priv *priv);
1452}; 1492};
1453 1493
1454/** 1494/**
@@ -1458,6 +1498,7 @@ struct batadv_algo_orig_ops {
1458 * @iface: callbacks related to interface handling 1498 * @iface: callbacks related to interface handling
1459 * @neigh: callbacks related to neighbors handling 1499 * @neigh: callbacks related to neighbors handling
1460 * @orig: callbacks related to originators handling 1500 * @orig: callbacks related to originators handling
1501 * @gw: callbacks related to GW mode
1461 */ 1502 */
1462struct batadv_algo_ops { 1503struct batadv_algo_ops {
1463 struct hlist_node list; 1504 struct hlist_node list;
@@ -1465,6 +1506,7 @@ struct batadv_algo_ops {
1465 struct batadv_algo_iface_ops iface; 1506 struct batadv_algo_iface_ops iface;
1466 struct batadv_algo_neigh_ops neigh; 1507 struct batadv_algo_neigh_ops neigh;
1467 struct batadv_algo_orig_ops orig; 1508 struct batadv_algo_orig_ops orig;
1509 struct batadv_algo_gw_ops gw;
1468}; 1510};
1469 1511
1470/** 1512/**
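Taken together, the #ifdef'd print hooks and the new dump hooks mean a routing algorithm exposes its tables through debugfs only when CONFIG_BATMAN_ADV_DEBUGFS is set, while the netlink dumps are always available. A hedged sketch of wiring the optional dump callbacks into the extended ops structure; the example_* callbacks are invented, and a real algorithm also fills the mandatory iface/neigh ops before registering.

/* Sketch only: stub bodies, signatures taken from the declarations above. */
static void example_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
			      struct batadv_priv *priv,
			      struct batadv_hard_iface *hard_iface)
{
	/* walk the originator table and emit one netlink message per entry */
}

static void example_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
			    struct batadv_priv *priv)
{
	/* emit the known gateways */
}

static struct batadv_algo_ops example_algo_ops = {
	.name = "EXAMPLE",
	.orig = {
		.dump = example_orig_dump,	/* optional */
	},
	.gw = {
		.dump = example_gw_dump,	/* optional */
	},
};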
@@ -1564,4 +1606,17 @@ enum batadv_tvlv_handler_flags {
1564 BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2), 1606 BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2),
1565}; 1607};
1566 1608
1609/**
1610 * struct batadv_store_mesh_work - Work queue item to detach an interface
1611 * add/del request from the sysfs locks
1612 * @net_dev: netdevice to add/remove to/from batman-adv soft-interface
1613 * @soft_iface_name: name of soft-interface to modify
1614 * @work: work queue item
1615 */
1616struct batadv_store_mesh_work {
1617 struct net_device *net_dev;
1618 char soft_iface_name[IFNAMSIZ];
1619 struct work_struct work;
1620};
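The struct carries everything the deferred worker needs, so a sysfs store handler can hand the actual interface change to a workqueue and return without holding the sysfs locks. A hedged sketch of the expected producer/consumer pair; the worker body, workqueue choice and example_* names are placeholders, not the actual sysfs.c code.

#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/workqueue.h>

static void example_mesh_worker(struct work_struct *work)
{
	struct batadv_store_mesh_work *store_work;

	store_work = container_of(work, struct batadv_store_mesh_work, work);

	/* ... attach/detach store_work->net_dev to/from the soft interface
	 * named store_work->soft_iface_name, outside the sysfs locks ...
	 */

	dev_put(store_work->net_dev);
	kfree(store_work);
}

static int example_defer_store(struct net_device *net_dev, const char *name)
{
	struct batadv_store_mesh_work *store_work;

	store_work = kmalloc(sizeof(*store_work), GFP_KERNEL);
	if (!store_work)
		return -ENOMEM;

	dev_hold(net_dev);
	store_work->net_dev = net_dev;
	strlcpy(store_work->soft_iface_name, name,
		sizeof(store_work->soft_iface_name));

	INIT_WORK(&store_work->work, example_mesh_worker);
	queue_work(system_wq, &store_work->work);

	return 0;
}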
1621
1567#endif /* _NET_BATMAN_ADV_TYPES_H_ */ 1622#endif /* _NET_BATMAN_ADV_TYPES_H_ */
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 0b5f729d08d2..1aff2da9bc74 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -26,11 +26,13 @@
26 26
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/debugfs.h> 28#include <linux/debugfs.h>
29#include <linux/stringify.h>
29#include <asm/ioctls.h> 30#include <asm/ioctls.h>
30 31
31#include <net/bluetooth/bluetooth.h> 32#include <net/bluetooth/bluetooth.h>
32#include <linux/proc_fs.h> 33#include <linux/proc_fs.h>
33 34
35#include "leds.h"
34#include "selftest.h" 36#include "selftest.h"
35 37
36/* Bluetooth sockets */ 38/* Bluetooth sockets */
@@ -712,13 +714,16 @@ static struct net_proto_family bt_sock_family_ops = {
712struct dentry *bt_debugfs; 714struct dentry *bt_debugfs;
713EXPORT_SYMBOL_GPL(bt_debugfs); 715EXPORT_SYMBOL_GPL(bt_debugfs);
714 716
717#define VERSION __stringify(BT_SUBSYS_VERSION) "." \
718 __stringify(BT_SUBSYS_REVISION)
719
715static int __init bt_init(void) 720static int __init bt_init(void)
716{ 721{
717 int err; 722 int err;
718 723
719 sock_skb_cb_check_size(sizeof(struct bt_skb_cb)); 724 sock_skb_cb_check_size(sizeof(struct bt_skb_cb));
720 725
721 BT_INFO("Core ver %s", BT_SUBSYS_VERSION); 726 BT_INFO("Core ver %s", VERSION);
722 727
723 err = bt_selftest(); 728 err = bt_selftest();
724 if (err < 0) 729 if (err < 0)
@@ -726,6 +731,8 @@ static int __init bt_init(void)
726 731
727 bt_debugfs = debugfs_create_dir("bluetooth", NULL); 732 bt_debugfs = debugfs_create_dir("bluetooth", NULL);
728 733
734 bt_leds_init();
735
729 err = bt_sysfs_init(); 736 err = bt_sysfs_init();
730 if (err < 0) 737 if (err < 0)
731 return err; 738 return err;
@@ -785,6 +792,8 @@ static void __exit bt_exit(void)
785 792
786 bt_sysfs_cleanup(); 793 bt_sysfs_cleanup();
787 794
795 bt_leds_cleanup();
796
788 debugfs_remove_recursive(bt_debugfs); 797 debugfs_remove_recursive(bt_debugfs);
789} 798}
790 799
@@ -792,7 +801,7 @@ subsys_initcall(bt_init);
792module_exit(bt_exit); 801module_exit(bt_exit);
793 802
794MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); 803MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
795MODULE_DESCRIPTION("Bluetooth Core ver " BT_SUBSYS_VERSION); 804MODULE_DESCRIPTION("Bluetooth Core ver " VERSION);
796MODULE_VERSION(BT_SUBSYS_VERSION); 805MODULE_VERSION(VERSION);
797MODULE_LICENSE("GPL"); 806MODULE_LICENSE("GPL");
798MODULE_ALIAS_NETPROTO(PF_BLUETOOTH); 807MODULE_ALIAS_NETPROTO(PF_BLUETOOTH);
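The VERSION macro relies on __stringify() turning the two numeric defines into string literals that the preprocessor then concatenates, which is why linux/stringify.h is now included and MODULE_DESCRIPTION/MODULE_VERSION can carry a major.minor string. A tiny hedged sketch with example numbers, not the real BT_SUBSYS values:

#include <linux/stringify.h>

#define EXAMPLE_VERSION		2	/* stand-in for BT_SUBSYS_VERSION */
#define EXAMPLE_REVISION	22	/* stand-in for BT_SUBSYS_REVISION */

/* Adjacent string literals are concatenated: this expands to "2.22" */
#define EXAMPLE_VERSION_STR \
	__stringify(EXAMPLE_VERSION) "." __stringify(EXAMPLE_REVISION)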
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ddf8432fe8fb..3ac89e9ace71 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1562,6 +1562,7 @@ int hci_dev_do_close(struct hci_dev *hdev)
1562 auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); 1562 auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
1563 1563
1564 if (!auto_off && hdev->dev_type == HCI_PRIMARY && 1564 if (!auto_off && hdev->dev_type == HCI_PRIMARY &&
1565 !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
1565 hci_dev_test_flag(hdev, HCI_MGMT)) 1566 hci_dev_test_flag(hdev, HCI_MGMT))
1566 __mgmt_power_off(hdev); 1567 __mgmt_power_off(hdev);
1567 1568
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index b0e23dfc5c34..e2288421fe6b 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -21,8 +21,6 @@
21 SOFTWARE IS DISCLAIMED. 21 SOFTWARE IS DISCLAIMED.
22*/ 22*/
23 23
24#include <asm/unaligned.h>
25
26#include <net/bluetooth/bluetooth.h> 24#include <net/bluetooth/bluetooth.h>
27#include <net/bluetooth/hci_core.h> 25#include <net/bluetooth/hci_core.h>
28#include <net/bluetooth/mgmt.h> 26#include <net/bluetooth/mgmt.h>
@@ -971,48 +969,88 @@ void __hci_req_enable_advertising(struct hci_request *req)
971 hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); 969 hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
972} 970}
973 971
974static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) 972static u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
975{ 973{
976 u8 ad_len = 0; 974 size_t complete_len;
977 size_t name_len; 975 size_t short_len;
976 int max_len;
978 977
979 name_len = strlen(hdev->dev_name); 978 max_len = HCI_MAX_AD_LENGTH - ad_len - 2;
980 if (name_len > 0) { 979 complete_len = strlen(hdev->dev_name);
981 size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2; 980 short_len = strlen(hdev->short_name);
982 981
983 if (name_len > max_len) { 982 /* no space left for name */
984 name_len = max_len; 983 if (max_len < 1)
985 ptr[1] = EIR_NAME_SHORT; 984 return ad_len;
986 } else
987 ptr[1] = EIR_NAME_COMPLETE;
988 985
989 ptr[0] = name_len + 1; 986 /* no name set */
987 if (!complete_len)
988 return ad_len;
990 989
991 memcpy(ptr + 2, hdev->dev_name, name_len); 990 /* complete name fits and is eq to max short name len or smaller */
991 if (complete_len <= max_len &&
992 complete_len <= HCI_MAX_SHORT_NAME_LENGTH) {
993 return eir_append_data(ptr, ad_len, EIR_NAME_COMPLETE,
994 hdev->dev_name, complete_len);
995 }
992 996
993 ad_len += (name_len + 2); 997 /* short name set and fits */
994 ptr += (name_len + 2); 998 if (short_len && short_len <= max_len) {
999 return eir_append_data(ptr, ad_len, EIR_NAME_SHORT,
1000 hdev->short_name, short_len);
1001 }
1002
1003 /* no short name set so shorten complete name */
1004 if (!short_len) {
1005 return eir_append_data(ptr, ad_len, EIR_NAME_SHORT,
1006 hdev->dev_name, max_len);
995 } 1007 }
996 1008
997 return ad_len; 1009 return ad_len;
998} 1010}
999 1011
1012static u8 append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
1013{
1014 return eir_append_le16(ptr, ad_len, EIR_APPEARANCE, hdev->appearance);
1015}
1016
1017static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
1018{
1019 u8 scan_rsp_len = 0;
1020
1021 if (hdev->appearance) {
1022 scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len);
1023 }
1024
1025 return append_local_name(hdev, ptr, scan_rsp_len);
1026}
1027
1000static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, 1028static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance,
1001 u8 *ptr) 1029 u8 *ptr)
1002{ 1030{
1003 struct adv_info *adv_instance; 1031 struct adv_info *adv_instance;
1032 u32 instance_flags;
1033 u8 scan_rsp_len = 0;
1004 1034
1005 adv_instance = hci_find_adv_instance(hdev, instance); 1035 adv_instance = hci_find_adv_instance(hdev, instance);
1006 if (!adv_instance) 1036 if (!adv_instance)
1007 return 0; 1037 return 0;
1008 1038
1009 /* TODO: Set the appropriate entries based on advertising instance flags 1039 instance_flags = adv_instance->flags;
1010 * here once flags other than 0 are supported. 1040
1011 */ 1041 if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) {
1012 memcpy(ptr, adv_instance->scan_rsp_data, 1042 scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len);
1043 }
1044
1045 memcpy(&ptr[scan_rsp_len], adv_instance->scan_rsp_data,
1013 adv_instance->scan_rsp_len); 1046 adv_instance->scan_rsp_len);
1014 1047
1015 return adv_instance->scan_rsp_len; 1048 scan_rsp_len += adv_instance->scan_rsp_len;
1049
1050 if (instance_flags & MGMT_ADV_FLAG_LOCAL_NAME)
1051 scan_rsp_len = append_local_name(hdev, ptr, scan_rsp_len);
1052
1053 return scan_rsp_len;
1016} 1054}
1017 1055
1018void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) 1056void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance)
@@ -1194,7 +1232,7 @@ static void adv_timeout_expire(struct work_struct *work)
1194 1232
1195 hci_req_init(&req, hdev); 1233 hci_req_init(&req, hdev);
1196 1234
1197 hci_req_clear_adv_instance(hdev, &req, instance, false); 1235 hci_req_clear_adv_instance(hdev, NULL, &req, instance, false);
1198 1236
1199 if (list_empty(&hdev->adv_instances)) 1237 if (list_empty(&hdev->adv_instances))
1200 __hci_req_disable_advertising(&req); 1238 __hci_req_disable_advertising(&req);
@@ -1284,8 +1322,9 @@ static void cancel_adv_timeout(struct hci_dev *hdev)
1284 * setting. 1322 * setting.
1285 * - force == false: Only instances that have a timeout will be removed. 1323 * - force == false: Only instances that have a timeout will be removed.
1286 */ 1324 */
1287void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, 1325void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
1288 u8 instance, bool force) 1326 struct hci_request *req, u8 instance,
1327 bool force)
1289{ 1328{
1290 struct adv_info *adv_instance, *n, *next_instance = NULL; 1329 struct adv_info *adv_instance, *n, *next_instance = NULL;
1291 int err; 1330 int err;
@@ -1311,7 +1350,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
1311 rem_inst = adv_instance->instance; 1350 rem_inst = adv_instance->instance;
1312 err = hci_remove_adv_instance(hdev, rem_inst); 1351 err = hci_remove_adv_instance(hdev, rem_inst);
1313 if (!err) 1352 if (!err)
1314 mgmt_advertising_removed(NULL, hdev, rem_inst); 1353 mgmt_advertising_removed(sk, hdev, rem_inst);
1315 } 1354 }
1316 } else { 1355 } else {
1317 adv_instance = hci_find_adv_instance(hdev, instance); 1356 adv_instance = hci_find_adv_instance(hdev, instance);
@@ -1325,7 +1364,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
1325 1364
1326 err = hci_remove_adv_instance(hdev, instance); 1365 err = hci_remove_adv_instance(hdev, instance);
1327 if (!err) 1366 if (!err)
1328 mgmt_advertising_removed(NULL, hdev, instance); 1367 mgmt_advertising_removed(sk, hdev, instance);
1329 } 1368 }
1330 } 1369 }
1331 1370
@@ -1716,7 +1755,7 @@ void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
1716 * function. To be safe hard-code one of the 1755 * function. To be safe hard-code one of the
1717 * values that's suitable for SCO. 1756 * values that's suitable for SCO.
1718 */ 1757 */
1719 rej.reason = HCI_ERROR_REMOTE_LOW_RESOURCES; 1758 rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES;
1720 1759
1721 hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ, 1760 hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ,
1722 sizeof(rej), &rej); 1761 sizeof(rej), &rej);
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index b2d044bdc732..6b06629245a8 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -20,6 +20,8 @@
20 SOFTWARE IS DISCLAIMED. 20 SOFTWARE IS DISCLAIMED.
21*/ 21*/
22 22
23#include <asm/unaligned.h>
24
23#define hci_req_sync_lock(hdev) mutex_lock(&hdev->req_lock) 25#define hci_req_sync_lock(hdev) mutex_lock(&hdev->req_lock)
24#define hci_req_sync_unlock(hdev) mutex_unlock(&hdev->req_lock) 26#define hci_req_sync_unlock(hdev) mutex_unlock(&hdev->req_lock)
25 27
@@ -73,8 +75,9 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance);
73 75
74int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, 76int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
75 bool force); 77 bool force);
76void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, 78void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
77 u8 instance, bool force); 79 struct hci_request *req, u8 instance,
80 bool force);
78 81
79void __hci_req_update_class(struct hci_request *req); 82void __hci_req_update_class(struct hci_request *req);
80 83
@@ -102,3 +105,24 @@ static inline void hci_update_background_scan(struct hci_dev *hdev)
102 105
103void hci_request_setup(struct hci_dev *hdev); 106void hci_request_setup(struct hci_dev *hdev);
104void hci_request_cancel_all(struct hci_dev *hdev); 107void hci_request_cancel_all(struct hci_dev *hdev);
108
109static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type,
110 u8 *data, u8 data_len)
111{
112 eir[eir_len++] = sizeof(type) + data_len;
113 eir[eir_len++] = type;
114 memcpy(&eir[eir_len], data, data_len);
115 eir_len += data_len;
116
117 return eir_len;
118}
119
120static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data)
121{
122 eir[eir_len++] = sizeof(type) + sizeof(data);
123 eir[eir_len++] = type;
124 put_unaligned_le16(data, &eir[eir_len]);
125 eir_len += sizeof(data);
126
127 return eir_len;
128}
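Each helper emits one EIR/AD structure: a length octet that covers the type byte plus the payload, then the type, then the data. A hedged sketch of building a small scan response buffer with them; the appearance value and device name are made up, and the EIR_* constants are the ones used in hci_request.c above.

/* Illustrative only; buf must be at least HCI_MAX_AD_LENGTH bytes. */
static u16 example_build_scan_rsp(u8 *buf)
{
	u8 name[] = "example";		/* made-up device name */
	u16 len = 0;

	/* appearance 0x1234: emits 03 19 34 12 (len, EIR_APPEARANCE, LE16) */
	len = eir_append_le16(buf, len, EIR_APPEARANCE, 0x1234);

	/* complete local name: emits 08 09 'e' 'x' 'a' 'm' 'p' 'l' 'e' */
	len = eir_append_data(buf, len, EIR_NAME_COMPLETE, name,
			      sizeof(name) - 1);

	return len;	/* 13 bytes written in total */
}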
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 96f04b7b9556..48f9471e7c85 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -26,6 +26,7 @@
26 26
27#include <linux/export.h> 27#include <linux/export.h>
28#include <linux/utsname.h> 28#include <linux/utsname.h>
29#include <linux/sched.h>
29#include <asm/unaligned.h> 30#include <asm/unaligned.h>
30 31
31#include <net/bluetooth/bluetooth.h> 32#include <net/bluetooth/bluetooth.h>
@@ -38,6 +39,8 @@
38static LIST_HEAD(mgmt_chan_list); 39static LIST_HEAD(mgmt_chan_list);
39static DEFINE_MUTEX(mgmt_chan_list_lock); 40static DEFINE_MUTEX(mgmt_chan_list_lock);
40 41
42static DEFINE_IDA(sock_cookie_ida);
43
41static atomic_t monitor_promisc = ATOMIC_INIT(0); 44static atomic_t monitor_promisc = ATOMIC_INIT(0);
42 45
43/* ----- HCI socket interface ----- */ 46/* ----- HCI socket interface ----- */
@@ -52,6 +55,8 @@ struct hci_pinfo {
52 __u32 cmsg_mask; 55 __u32 cmsg_mask;
53 unsigned short channel; 56 unsigned short channel;
54 unsigned long flags; 57 unsigned long flags;
58 __u32 cookie;
59 char comm[TASK_COMM_LEN];
55}; 60};
56 61
57void hci_sock_set_flag(struct sock *sk, int nr) 62void hci_sock_set_flag(struct sock *sk, int nr)
@@ -74,6 +79,38 @@ unsigned short hci_sock_get_channel(struct sock *sk)
74 return hci_pi(sk)->channel; 79 return hci_pi(sk)->channel;
75} 80}
76 81
82u32 hci_sock_get_cookie(struct sock *sk)
83{
84 return hci_pi(sk)->cookie;
85}
86
87static bool hci_sock_gen_cookie(struct sock *sk)
88{
89 int id = hci_pi(sk)->cookie;
90
91 if (!id) {
92 id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL);
93 if (id < 0)
94 id = 0xffffffff;
95
96 hci_pi(sk)->cookie = id;
97 get_task_comm(hci_pi(sk)->comm, current);
98 return true;
99 }
100
101 return false;
102}
103
104static void hci_sock_free_cookie(struct sock *sk)
105{
106 int id = hci_pi(sk)->cookie;
107
108 if (id) {
109 hci_pi(sk)->cookie = 0xffffffff;
110 ida_simple_remove(&sock_cookie_ida, id);
111 }
112}
113
77static inline int hci_test_bit(int nr, const void *addr) 114static inline int hci_test_bit(int nr, const void *addr)
78{ 115{
79 return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); 116 return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31));
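The cookie is simply a non-zero IDA identifier that is created at most once per socket (0 means no cookie yet) and released again on close. A generic hedged sketch of that allocate-once idiom with placeholder names:

#include <linux/idr.h>

static DEFINE_IDA(example_cookie_ida);

/* Returns true only on the call that actually assigned the cookie. */
static bool example_gen_cookie(u32 *cookie)
{
	int id;

	if (*cookie)
		return false;		/* already assigned earlier */

	id = ida_simple_get(&example_cookie_ida, 1, 0, GFP_KERNEL);
	if (id < 0)
		id = 0xffffffff;	/* reserved marker on allocation failure */

	*cookie = id;
	return true;
}

static void example_free_cookie(u32 *cookie)
{
	if (*cookie && *cookie != 0xffffffff)
		ida_simple_remove(&example_cookie_ida, *cookie);

	*cookie = 0;
}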
@@ -305,6 +342,60 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
305 kfree_skb(skb_copy); 342 kfree_skb(skb_copy);
306} 343}
307 344
345void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
346 void *data, u16 data_len, ktime_t tstamp,
347 int flag, struct sock *skip_sk)
348{
349 struct sock *sk;
350 __le16 index;
351
352 if (hdev)
353 index = cpu_to_le16(hdev->id);
354 else
355 index = cpu_to_le16(MGMT_INDEX_NONE);
356
357 read_lock(&hci_sk_list.lock);
358
359 sk_for_each(sk, &hci_sk_list.head) {
360 struct hci_mon_hdr *hdr;
361 struct sk_buff *skb;
362
363 if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL)
364 continue;
365
366 /* Ignore socket without the flag set */
367 if (!hci_sock_test_flag(sk, flag))
368 continue;
369
370 /* Skip the original socket */
371 if (sk == skip_sk)
372 continue;
373
374 skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC);
375 if (!skb)
376 continue;
377
378 put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
379 put_unaligned_le16(event, skb_put(skb, 2));
380
381 if (data)
382 memcpy(skb_put(skb, data_len), data, data_len);
383
384 skb->tstamp = tstamp;
385
386 hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
387 hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT);
388 hdr->index = index;
389 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
390
391 hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
392 HCI_SOCK_TRUSTED, NULL);
393 kfree_skb(skb);
394 }
395
396 read_unlock(&hci_sk_list.lock);
397}
398
308static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) 399static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
309{ 400{
310 struct hci_mon_hdr *hdr; 401 struct hci_mon_hdr *hdr;
@@ -384,6 +475,129 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
384 return skb; 475 return skb;
385} 476}
386 477
478static struct sk_buff *create_monitor_ctrl_open(struct sock *sk)
479{
480 struct hci_mon_hdr *hdr;
481 struct sk_buff *skb;
482 u16 format;
483 u8 ver[3];
484 u32 flags;
485
486 /* No message needed when cookie is not present */
487 if (!hci_pi(sk)->cookie)
488 return NULL;
489
490 switch (hci_pi(sk)->channel) {
491 case HCI_CHANNEL_RAW:
492 format = 0x0000;
493 ver[0] = BT_SUBSYS_VERSION;
494 put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1);
495 break;
496 case HCI_CHANNEL_USER:
497 format = 0x0001;
498 ver[0] = BT_SUBSYS_VERSION;
499 put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1);
500 break;
501 case HCI_CHANNEL_CONTROL:
502 format = 0x0002;
503 mgmt_fill_version_info(ver);
504 break;
505 default:
506 /* No message for unsupported format */
507 return NULL;
508 }
509
510	skb = bt_skb_alloc(14 + TASK_COMM_LEN, GFP_ATOMIC);
511 if (!skb)
512 return NULL;
513
514 flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0;
515
516 put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
517 put_unaligned_le16(format, skb_put(skb, 2));
518 memcpy(skb_put(skb, sizeof(ver)), ver, sizeof(ver));
519 put_unaligned_le32(flags, skb_put(skb, 4));
520 *skb_put(skb, 1) = TASK_COMM_LEN;
521 memcpy(skb_put(skb, TASK_COMM_LEN), hci_pi(sk)->comm, TASK_COMM_LEN);
522
523 __net_timestamp(skb);
524
525 hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
526 hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN);
527 if (hci_pi(sk)->hdev)
528 hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id);
529 else
530 hdr->index = cpu_to_le16(HCI_DEV_NONE);
531 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
532
533 return skb;
534}
535
536static struct sk_buff *create_monitor_ctrl_close(struct sock *sk)
537{
538 struct hci_mon_hdr *hdr;
539 struct sk_buff *skb;
540
541 /* No message needed when cookie is not present */
542 if (!hci_pi(sk)->cookie)
543 return NULL;
544
545 switch (hci_pi(sk)->channel) {
546 case HCI_CHANNEL_RAW:
547 case HCI_CHANNEL_USER:
548 case HCI_CHANNEL_CONTROL:
549 break;
550 default:
551 /* No message for unsupported format */
552 return NULL;
553 }
554
555 skb = bt_skb_alloc(4, GFP_ATOMIC);
556 if (!skb)
557 return NULL;
558
559 put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
560
561 __net_timestamp(skb);
562
563 hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
564 hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE);
565 if (hci_pi(sk)->hdev)
566 hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id);
567 else
568 hdr->index = cpu_to_le16(HCI_DEV_NONE);
569 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
570
571 return skb;
572}
573
574static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index,
575 u16 opcode, u16 len,
576 const void *buf)
577{
578 struct hci_mon_hdr *hdr;
579 struct sk_buff *skb;
580
581 skb = bt_skb_alloc(6 + len, GFP_ATOMIC);
582 if (!skb)
583 return NULL;
584
585 put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
586 put_unaligned_le16(opcode, skb_put(skb, 2));
587
588 if (buf)
589 memcpy(skb_put(skb, len), buf, len);
590
591 __net_timestamp(skb);
592
593 hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
594 hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND);
595 hdr->index = cpu_to_le16(index);
596 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
597
598 return skb;
599}
600
387static void __printf(2, 3) 601static void __printf(2, 3)
388send_monitor_note(struct sock *sk, const char *fmt, ...) 602send_monitor_note(struct sock *sk, const char *fmt, ...)
389{ 603{
@@ -458,6 +672,26 @@ static void send_monitor_replay(struct sock *sk)
458 read_unlock(&hci_dev_list_lock); 672 read_unlock(&hci_dev_list_lock);
459} 673}
460 674
675static void send_monitor_control_replay(struct sock *mon_sk)
676{
677 struct sock *sk;
678
679 read_lock(&hci_sk_list.lock);
680
681 sk_for_each(sk, &hci_sk_list.head) {
682 struct sk_buff *skb;
683
684 skb = create_monitor_ctrl_open(sk);
685 if (!skb)
686 continue;
687
688 if (sock_queue_rcv_skb(mon_sk, skb))
689 kfree_skb(skb);
690 }
691
692 read_unlock(&hci_sk_list.lock);
693}
694
461/* Generate internal stack event */ 695/* Generate internal stack event */
462static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) 696static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
463{ 697{
@@ -585,6 +819,7 @@ static int hci_sock_release(struct socket *sock)
585{ 819{
586 struct sock *sk = sock->sk; 820 struct sock *sk = sock->sk;
587 struct hci_dev *hdev; 821 struct hci_dev *hdev;
822 struct sk_buff *skb;
588 823
589 BT_DBG("sock %p sk %p", sock, sk); 824 BT_DBG("sock %p sk %p", sock, sk);
590 825
@@ -593,8 +828,24 @@ static int hci_sock_release(struct socket *sock)
593 828
594 hdev = hci_pi(sk)->hdev; 829 hdev = hci_pi(sk)->hdev;
595 830
596 if (hci_pi(sk)->channel == HCI_CHANNEL_MONITOR) 831 switch (hci_pi(sk)->channel) {
832 case HCI_CHANNEL_MONITOR:
597 atomic_dec(&monitor_promisc); 833 atomic_dec(&monitor_promisc);
834 break;
835 case HCI_CHANNEL_RAW:
836 case HCI_CHANNEL_USER:
837 case HCI_CHANNEL_CONTROL:
838 /* Send event to monitor */
839 skb = create_monitor_ctrl_close(sk);
840 if (skb) {
841 hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
842 HCI_SOCK_TRUSTED, NULL);
843 kfree_skb(skb);
844 }
845
846 hci_sock_free_cookie(sk);
847 break;
848 }
598 849
599 bt_sock_unlink(&hci_sk_list, sk); 850 bt_sock_unlink(&hci_sk_list, sk);
600 851
@@ -721,6 +972,27 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd,
721 goto done; 972 goto done;
722 } 973 }
723 974
975 /* When calling an ioctl on an unbound raw socket, then ensure
976 * that the monitor gets informed. Ensure that the resulting event
977	 * is only sent once by checking if the cookie exists or not. The
978	 * socket cookie will only ever be generated once for the lifetime
979 * of a given socket.
980 */
981 if (hci_sock_gen_cookie(sk)) {
982 struct sk_buff *skb;
983
984 if (capable(CAP_NET_ADMIN))
985 hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
986
987 /* Send event to monitor */
988 skb = create_monitor_ctrl_open(sk);
989 if (skb) {
990 hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
991 HCI_SOCK_TRUSTED, NULL);
992 kfree_skb(skb);
993 }
994 }
995
724 release_sock(sk); 996 release_sock(sk);
725 997
726 switch (cmd) { 998 switch (cmd) {
@@ -784,6 +1056,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
784 struct sockaddr_hci haddr; 1056 struct sockaddr_hci haddr;
785 struct sock *sk = sock->sk; 1057 struct sock *sk = sock->sk;
786 struct hci_dev *hdev = NULL; 1058 struct hci_dev *hdev = NULL;
1059 struct sk_buff *skb;
787 int len, err = 0; 1060 int len, err = 0;
788 1061
789 BT_DBG("sock %p sk %p", sock, sk); 1062 BT_DBG("sock %p sk %p", sock, sk);
@@ -822,7 +1095,35 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
822 atomic_inc(&hdev->promisc); 1095 atomic_inc(&hdev->promisc);
823 } 1096 }
824 1097
1098 hci_pi(sk)->channel = haddr.hci_channel;
1099
1100 if (!hci_sock_gen_cookie(sk)) {
1101 /* In the case when a cookie has already been assigned,
1102	 * then an ioctl has already been issued against
1103	 * an unbound socket and with that triggered an open
1104	 * notification. Send a close notification first to
1105	 * allow the state transition to BT_BOUND.
1106 */
1107 skb = create_monitor_ctrl_close(sk);
1108 if (skb) {
1109 hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
1110 HCI_SOCK_TRUSTED, NULL);
1111 kfree_skb(skb);
1112 }
1113 }
1114
1115 if (capable(CAP_NET_ADMIN))
1116 hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
1117
825 hci_pi(sk)->hdev = hdev; 1118 hci_pi(sk)->hdev = hdev;
1119
1120 /* Send event to monitor */
1121 skb = create_monitor_ctrl_open(sk);
1122 if (skb) {
1123 hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
1124 HCI_SOCK_TRUSTED, NULL);
1125 kfree_skb(skb);
1126 }
826 break; 1127 break;
827 1128
828 case HCI_CHANNEL_USER: 1129 case HCI_CHANNEL_USER:
@@ -884,9 +1185,38 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
884 } 1185 }
885 } 1186 }
886 1187
887 atomic_inc(&hdev->promisc); 1188 hci_pi(sk)->channel = haddr.hci_channel;
1189
1190 if (!hci_sock_gen_cookie(sk)) {
1191 /* In the case when a cookie has already been assigned,
1192 * this socket will transition from a raw socket into
1193	 * a user channel socket. For a clean transition, send
1194 * the close notification first.
1195 */
1196 skb = create_monitor_ctrl_close(sk);
1197 if (skb) {
1198 hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
1199 HCI_SOCK_TRUSTED, NULL);
1200 kfree_skb(skb);
1201 }
1202 }
1203
1204 /* The user channel is restricted to CAP_NET_ADMIN
1205 * capabilities and with that implicitly trusted.
1206 */
1207 hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
888 1208
889 hci_pi(sk)->hdev = hdev; 1209 hci_pi(sk)->hdev = hdev;
1210
1211 /* Send event to monitor */
1212 skb = create_monitor_ctrl_open(sk);
1213 if (skb) {
1214 hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
1215 HCI_SOCK_TRUSTED, NULL);
1216 kfree_skb(skb);
1217 }
1218
1219 atomic_inc(&hdev->promisc);
890 break; 1220 break;
891 1221
892 case HCI_CHANNEL_MONITOR: 1222 case HCI_CHANNEL_MONITOR:
@@ -900,6 +1230,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
900 goto done; 1230 goto done;
901 } 1231 }
902 1232
1233 hci_pi(sk)->channel = haddr.hci_channel;
1234
903 /* The monitor interface is restricted to CAP_NET_RAW 1235 /* The monitor interface is restricted to CAP_NET_RAW
904 * capabilities and with that implicitly trusted. 1236 * capabilities and with that implicitly trusted.
905 */ 1237 */
@@ -908,9 +1240,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
908 send_monitor_note(sk, "Linux version %s (%s)", 1240 send_monitor_note(sk, "Linux version %s (%s)",
909 init_utsname()->release, 1241 init_utsname()->release,
910 init_utsname()->machine); 1242 init_utsname()->machine);
911 send_monitor_note(sk, "Bluetooth subsystem version %s", 1243 send_monitor_note(sk, "Bluetooth subsystem version %u.%u",
912 BT_SUBSYS_VERSION); 1244 BT_SUBSYS_VERSION, BT_SUBSYS_REVISION);
913 send_monitor_replay(sk); 1245 send_monitor_replay(sk);
1246 send_monitor_control_replay(sk);
914 1247
915 atomic_inc(&monitor_promisc); 1248 atomic_inc(&monitor_promisc);
916 break; 1249 break;
@@ -925,6 +1258,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
925 err = -EPERM; 1258 err = -EPERM;
926 goto done; 1259 goto done;
927 } 1260 }
1261
1262 hci_pi(sk)->channel = haddr.hci_channel;
928 break; 1263 break;
929 1264
930 default: 1265 default:
@@ -946,6 +1281,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
946 if (capable(CAP_NET_ADMIN)) 1281 if (capable(CAP_NET_ADMIN))
947 hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); 1282 hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
948 1283
1284 hci_pi(sk)->channel = haddr.hci_channel;
1285
949 /* At the moment the index and unconfigured index events 1286 /* At the moment the index and unconfigured index events
950 * are enabled unconditionally. Setting them on each 1287 * are enabled unconditionally. Setting them on each
951 * socket when binding keeps this functionality. They 1288 * socket when binding keeps this functionality. They
@@ -956,16 +1293,40 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
956 * received by untrusted users. Example for such events 1293 * received by untrusted users. Example for such events
957 * are changes to settings, class of device, name etc. 1294 * are changes to settings, class of device, name etc.
958 */ 1295 */
959 if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { 1296 if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) {
1297 if (!hci_sock_gen_cookie(sk)) {
1298 /* In the case when a cookie has already been
1299	 * assigned, this socket will transition from
1300	 * a raw socket into a control socket. To
1301	 * allow for a clean transition, send the
1302 * close notification first.
1303 */
1304 skb = create_monitor_ctrl_close(sk);
1305 if (skb) {
1306 hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
1307 HCI_SOCK_TRUSTED, NULL);
1308 kfree_skb(skb);
1309 }
1310 }
1311
1312 /* Send event to monitor */
1313 skb = create_monitor_ctrl_open(sk);
1314 if (skb) {
1315 hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
1316 HCI_SOCK_TRUSTED, NULL);
1317 kfree_skb(skb);
1318 }
1319
960 hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); 1320 hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS);
961 hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); 1321 hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS);
962 hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); 1322 hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS);
1323 hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS);
1324 hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS);
1325 hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS);
963 } 1326 }
964 break; 1327 break;
965 } 1328 }
966 1329
967
968 hci_pi(sk)->channel = haddr.hci_channel;
969 sk->sk_state = BT_BOUND; 1330 sk->sk_state = BT_BOUND;
970 1331
971done: 1332done:
@@ -1133,6 +1494,19 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk,
1133 goto done; 1494 goto done;
1134 } 1495 }
1135 1496
1497 if (chan->channel == HCI_CHANNEL_CONTROL) {
1498 struct sk_buff *skb;
1499
1500 /* Send event to monitor */
1501 skb = create_monitor_ctrl_command(sk, index, opcode, len,
1502 buf + sizeof(*hdr));
1503 if (skb) {
1504 hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
1505 HCI_SOCK_TRUSTED, NULL);
1506 kfree_skb(skb);
1507 }
1508 }
1509
1136 if (opcode >= chan->handler_count || 1510 if (opcode >= chan->handler_count ||
1137 chan->handlers[opcode].func == NULL) { 1511 chan->handlers[opcode].func == NULL) {
1138 BT_DBG("Unknown op %u", opcode); 1512 BT_DBG("Unknown op %u", opcode);
@@ -1440,6 +1814,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
1440 1814
1441 BT_DBG("sk %p, opt %d", sk, optname); 1815 BT_DBG("sk %p, opt %d", sk, optname);
1442 1816
1817 if (level != SOL_HCI)
1818 return -ENOPROTOOPT;
1819
1443 lock_sock(sk); 1820 lock_sock(sk);
1444 1821
1445 if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { 1822 if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
@@ -1523,6 +1900,9 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname,
1523 1900
1524 BT_DBG("sk %p, opt %d", sk, optname); 1901 BT_DBG("sk %p, opt %d", sk, optname);
1525 1902
1903 if (level != SOL_HCI)
1904 return -ENOPROTOOPT;
1905
1526 if (get_user(len, optlen)) 1906 if (get_user(len, optlen))
1527 return -EFAULT; 1907 return -EFAULT;
1528 1908
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
index 8319c8440c89..cb670b5594eb 100644
--- a/net/bluetooth/leds.c
+++ b/net/bluetooth/leds.c
@@ -11,6 +11,8 @@
11 11
12#include "leds.h" 12#include "leds.h"
13 13
14DEFINE_LED_TRIGGER(bt_power_led_trigger);
15
14struct hci_basic_led_trigger { 16struct hci_basic_led_trigger {
15 struct led_trigger led_trigger; 17 struct led_trigger led_trigger;
16 struct hci_dev *hdev; 18 struct hci_dev *hdev;
@@ -24,6 +26,21 @@ void hci_leds_update_powered(struct hci_dev *hdev, bool enabled)
24 if (hdev->power_led) 26 if (hdev->power_led)
25 led_trigger_event(hdev->power_led, 27 led_trigger_event(hdev->power_led,
26 enabled ? LED_FULL : LED_OFF); 28 enabled ? LED_FULL : LED_OFF);
29
30 if (!enabled) {
31 struct hci_dev *d;
32
33 read_lock(&hci_dev_list_lock);
34
35 list_for_each_entry(d, &hci_dev_list, list) {
36 if (test_bit(HCI_UP, &d->flags))
37 enabled = true;
38 }
39
40 read_unlock(&hci_dev_list_lock);
41 }
42
43 led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF);
27} 44}
28 45
29static void power_activate(struct led_classdev *led_cdev) 46static void power_activate(struct led_classdev *led_cdev)
@@ -72,3 +89,13 @@ void hci_leds_init(struct hci_dev *hdev)
72 /* initialize power_led */ 89 /* initialize power_led */
73 hdev->power_led = led_allocate_basic(hdev, power_activate, "power"); 90 hdev->power_led = led_allocate_basic(hdev, power_activate, "power");
74} 91}
92
93void bt_leds_init(void)
94{
95 led_trigger_register_simple("bluetooth-power", &bt_power_led_trigger);
96}
97
98void bt_leds_cleanup(void)
99{
100 led_trigger_unregister_simple(bt_power_led_trigger);
101}
diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h
index a9c4d6ea01cf..08725a2fbd9b 100644
--- a/net/bluetooth/leds.h
+++ b/net/bluetooth/leds.h
@@ -7,10 +7,20 @@
7 */ 7 */
8 8
9#if IS_ENABLED(CONFIG_BT_LEDS) 9#if IS_ENABLED(CONFIG_BT_LEDS)
10
10void hci_leds_update_powered(struct hci_dev *hdev, bool enabled); 11void hci_leds_update_powered(struct hci_dev *hdev, bool enabled);
11void hci_leds_init(struct hci_dev *hdev); 12void hci_leds_init(struct hci_dev *hdev);
13
14void bt_leds_init(void);
15void bt_leds_cleanup(void);
16
12#else 17#else
18
13static inline void hci_leds_update_powered(struct hci_dev *hdev, 19static inline void hci_leds_update_powered(struct hci_dev *hdev,
14 bool enabled) {} 20 bool enabled) {}
15static inline void hci_leds_init(struct hci_dev *hdev) {} 21static inline void hci_leds_init(struct hci_dev *hdev) {}
22
23static inline void bt_leds_init(void) {}
24static inline void bt_leds_cleanup(void) {}
25
16#endif 26#endif
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 7639290b6de3..736038085feb 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -38,7 +38,7 @@
38#include "mgmt_util.h" 38#include "mgmt_util.h"
39 39
40#define MGMT_VERSION 1 40#define MGMT_VERSION 1
41#define MGMT_REVISION 13 41#define MGMT_REVISION 14
42 42
43static const u16 mgmt_commands[] = { 43static const u16 mgmt_commands[] = {
44 MGMT_OP_READ_INDEX_LIST, 44 MGMT_OP_READ_INDEX_LIST,
@@ -104,6 +104,8 @@ static const u16 mgmt_commands[] = {
104 MGMT_OP_REMOVE_ADVERTISING, 104 MGMT_OP_REMOVE_ADVERTISING,
105 MGMT_OP_GET_ADV_SIZE_INFO, 105 MGMT_OP_GET_ADV_SIZE_INFO,
106 MGMT_OP_START_LIMITED_DISCOVERY, 106 MGMT_OP_START_LIMITED_DISCOVERY,
107 MGMT_OP_READ_EXT_INFO,
108 MGMT_OP_SET_APPEARANCE,
107}; 109};
108 110
109static const u16 mgmt_events[] = { 111static const u16 mgmt_events[] = {
@@ -141,6 +143,7 @@ static const u16 mgmt_events[] = {
141 MGMT_EV_LOCAL_OOB_DATA_UPDATED, 143 MGMT_EV_LOCAL_OOB_DATA_UPDATED,
142 MGMT_EV_ADVERTISING_ADDED, 144 MGMT_EV_ADVERTISING_ADDED,
143 MGMT_EV_ADVERTISING_REMOVED, 145 MGMT_EV_ADVERTISING_REMOVED,
146 MGMT_EV_EXT_INFO_CHANGED,
144}; 147};
145 148
146static const u16 mgmt_untrusted_commands[] = { 149static const u16 mgmt_untrusted_commands[] = {
@@ -149,6 +152,7 @@ static const u16 mgmt_untrusted_commands[] = {
149 MGMT_OP_READ_UNCONF_INDEX_LIST, 152 MGMT_OP_READ_UNCONF_INDEX_LIST,
150 MGMT_OP_READ_CONFIG_INFO, 153 MGMT_OP_READ_CONFIG_INFO,
151 MGMT_OP_READ_EXT_INDEX_LIST, 154 MGMT_OP_READ_EXT_INDEX_LIST,
155 MGMT_OP_READ_EXT_INFO,
152}; 156};
153 157
154static const u16 mgmt_untrusted_events[] = { 158static const u16 mgmt_untrusted_events[] = {
@@ -162,6 +166,7 @@ static const u16 mgmt_untrusted_events[] = {
162 MGMT_EV_NEW_CONFIG_OPTIONS, 166 MGMT_EV_NEW_CONFIG_OPTIONS,
163 MGMT_EV_EXT_INDEX_ADDED, 167 MGMT_EV_EXT_INDEX_ADDED,
164 MGMT_EV_EXT_INDEX_REMOVED, 168 MGMT_EV_EXT_INDEX_REMOVED,
169 MGMT_EV_EXT_INFO_CHANGED,
165}; 170};
166 171
167#define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) 172#define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000)
@@ -256,13 +261,6 @@ static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data,
256 flag, skip_sk); 261 flag, skip_sk);
257} 262}
258 263
259static int mgmt_generic_event(u16 event, struct hci_dev *hdev, void *data,
260 u16 len, struct sock *skip_sk)
261{
262 return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len,
263 HCI_MGMT_GENERIC_EVENTS, skip_sk);
264}
265
266static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len, 264static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len,
267 struct sock *skip_sk) 265 struct sock *skip_sk)
268{ 266{
@@ -278,6 +276,14 @@ static u8 le_addr_type(u8 mgmt_addr_type)
278 return ADDR_LE_DEV_RANDOM; 276 return ADDR_LE_DEV_RANDOM;
279} 277}
280 278
279void mgmt_fill_version_info(void *ver)
280{
281 struct mgmt_rp_read_version *rp = ver;
282
283 rp->version = MGMT_VERSION;
284 rp->revision = cpu_to_le16(MGMT_REVISION);
285}
286
281static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, 287static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
282 u16 data_len) 288 u16 data_len)
283{ 289{
@@ -285,8 +291,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
285 291
286 BT_DBG("sock %p", sk); 292 BT_DBG("sock %p", sk);
287 293
288 rp.version = MGMT_VERSION; 294 mgmt_fill_version_info(&rp);
289 rp.revision = cpu_to_le16(MGMT_REVISION);
290 295
291 return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0, 296 return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0,
292 &rp, sizeof(rp)); 297 &rp, sizeof(rp));
@@ -572,8 +577,8 @@ static int new_options(struct hci_dev *hdev, struct sock *skip)
572{ 577{
573 __le32 options = get_missing_options(hdev); 578 __le32 options = get_missing_options(hdev);
574 579
575 return mgmt_generic_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, 580 return mgmt_limited_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options,
576 sizeof(options), skip); 581 sizeof(options), HCI_MGMT_OPTION_EVENTS, skip);
577} 582}
578 583
579static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) 584static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
@@ -862,6 +867,86 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev,
862 sizeof(rp)); 867 sizeof(rp));
863} 868}
864 869
870static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir)
871{
872 u16 eir_len = 0;
873 size_t name_len;
874
875 if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
876 eir_len = eir_append_data(eir, eir_len, EIR_CLASS_OF_DEV,
877 hdev->dev_class, 3);
878
879 if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
880 eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE,
881 hdev->appearance);
882
883 name_len = strlen(hdev->dev_name);
884 eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE,
885 hdev->dev_name, name_len);
886
887 name_len = strlen(hdev->short_name);
888 eir_len = eir_append_data(eir, eir_len, EIR_NAME_SHORT,
889 hdev->short_name, name_len);
890
891 return eir_len;
892}
893
894static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev,
895 void *data, u16 data_len)
896{
897 char buf[512];
898 struct mgmt_rp_read_ext_info *rp = (void *)buf;
899 u16 eir_len;
900
901 BT_DBG("sock %p %s", sk, hdev->name);
902
903 memset(&buf, 0, sizeof(buf));
904
905 hci_dev_lock(hdev);
906
907 bacpy(&rp->bdaddr, &hdev->bdaddr);
908
909 rp->version = hdev->hci_ver;
910 rp->manufacturer = cpu_to_le16(hdev->manufacturer);
911
912 rp->supported_settings = cpu_to_le32(get_supported_settings(hdev));
913 rp->current_settings = cpu_to_le32(get_current_settings(hdev));
914
915
916 eir_len = append_eir_data_to_buf(hdev, rp->eir);
917 rp->eir_len = cpu_to_le16(eir_len);
918
919 hci_dev_unlock(hdev);
920
921 /* If this command is called at least once, then the events
922 * for class of device and local name changes are disabled
923 * and only the new extended controller information event
924 * is used.
925 */
926 hci_sock_set_flag(sk, HCI_MGMT_EXT_INFO_EVENTS);
927 hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS);
928 hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS);
929
930 return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, rp,
931 sizeof(*rp) + eir_len);
932}
933
934static int ext_info_changed(struct hci_dev *hdev, struct sock *skip)
935{
936 char buf[512];
937 struct mgmt_ev_ext_info_changed *ev = (void *)buf;
938 u16 eir_len;
939
940 memset(buf, 0, sizeof(buf));
941
942 eir_len = append_eir_data_to_buf(hdev, ev->eir);
943 ev->eir_len = cpu_to_le16(eir_len);
944
945 return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, ev,
946 sizeof(*ev) + eir_len,
947 HCI_MGMT_EXT_INFO_EVENTS, skip);
948}
949
865static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) 950static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
866{ 951{
867 __le32 settings = cpu_to_le32(get_current_settings(hdev)); 952 __le32 settings = cpu_to_le32(get_current_settings(hdev));
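
append_eir_data_to_buf() packs class of device, appearance and the two name variants as standard EIR structures, i.e. [length][type][value] with length covering type plus value, and read_ext_controller_info() returns them behind the mgmt_rp_read_ext_info reply. A short sketch of how a management client might walk such a buffer; the sample bytes are made up and only the TLV framing is taken from the code above:

#include <stdint.h>
#include <stdio.h>

/* Walk EIR data: each entry is [len][type][len - 1 bytes of value]. */
static void dump_eir(const uint8_t *eir, uint16_t eir_len)
{
	uint16_t i = 0;

	while (i + 1 < eir_len) {
		uint8_t field_len = eir[i];

		if (field_len == 0 || i + 1 + field_len > eir_len)
			break;		/* padding or malformed entry */

		printf("type 0x%02x, %d value bytes\n",
		       eir[i + 1], field_len - 1);

		i += field_len + 1;
	}
}

int main(void)
{
	/* Example: class of device (type 0x0d, 3 bytes) followed by a
	 * complete local name (type 0x09, "hci0"). */
	const uint8_t eir[] = { 4, 0x0d, 0x0c, 0x02, 0x5a,
				5, 0x09, 'h', 'c', 'i', '0' };

	dump_eir(eir, sizeof(eir));
	return 0;
}
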
@@ -922,7 +1007,7 @@ static int clean_up_hci_state(struct hci_dev *hdev)
922 hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); 1007 hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
923 } 1008 }
924 1009
925 hci_req_clear_adv_instance(hdev, NULL, 0x00, false); 1010 hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, false);
926 1011
927 if (hci_dev_test_flag(hdev, HCI_LE_ADV)) 1012 if (hci_dev_test_flag(hdev, HCI_LE_ADV))
928 __hci_req_disable_advertising(&req); 1013 __hci_req_disable_advertising(&req);
@@ -1000,8 +1085,8 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip)
1000{ 1085{
1001 __le32 ev = cpu_to_le32(get_current_settings(hdev)); 1086 __le32 ev = cpu_to_le32(get_current_settings(hdev));
1002 1087
1003 return mgmt_generic_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, 1088 return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
1004 sizeof(ev), skip); 1089 sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip);
1005} 1090}
1006 1091
1007int mgmt_new_settings(struct hci_dev *hdev) 1092int mgmt_new_settings(struct hci_dev *hdev)
@@ -1690,7 +1775,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
1690 enabled = lmp_host_le_capable(hdev); 1775 enabled = lmp_host_le_capable(hdev);
1691 1776
1692 if (!val) 1777 if (!val)
1693 hci_req_clear_adv_instance(hdev, NULL, 0x00, true); 1778 hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, true);
1694 1779
1695 if (!hdev_is_powered(hdev) || val == enabled) { 1780 if (!hdev_is_powered(hdev) || val == enabled) {
1696 bool changed = false; 1781 bool changed = false;
@@ -2435,6 +2520,8 @@ static int send_pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev,
2435 if (!cmd) 2520 if (!cmd)
2436 return -ENOMEM; 2521 return -ENOMEM;
2437 2522
2523 cmd->cmd_complete = addr_cmd_complete;
2524
2438 err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY, 2525 err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY,
2439 sizeof(cp->addr.bdaddr), &cp->addr.bdaddr); 2526 sizeof(cp->addr.bdaddr), &cp->addr.bdaddr);
2440 if (err < 0) 2527 if (err < 0)
@@ -2513,8 +2600,8 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data,
2513 BT_DBG(""); 2600 BT_DBG("");
2514 2601
2515 if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY) 2602 if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY)
2516 return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, 2603 return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY,
2517 MGMT_STATUS_INVALID_PARAMS, NULL, 0); 2604 MGMT_STATUS_INVALID_PARAMS);
2518 2605
2519 hci_dev_lock(hdev); 2606 hci_dev_lock(hdev);
2520 2607
@@ -2932,6 +3019,35 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev,
2932 HCI_OP_USER_PASSKEY_NEG_REPLY, 0); 3019 HCI_OP_USER_PASSKEY_NEG_REPLY, 0);
2933} 3020}
2934 3021
3022static void adv_expire(struct hci_dev *hdev, u32 flags)
3023{
3024 struct adv_info *adv_instance;
3025 struct hci_request req;
3026 int err;
3027
3028 adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
3029 if (!adv_instance)
3030 return;
3031
3032 /* stop if current instance doesn't need to be changed */
3033 if (!(adv_instance->flags & flags))
3034 return;
3035
3036 cancel_adv_timeout(hdev);
3037
3038 adv_instance = hci_get_next_instance(hdev, adv_instance->instance);
3039 if (!adv_instance)
3040 return;
3041
3042 hci_req_init(&req, hdev);
3043 err = __hci_req_schedule_adv_instance(&req, adv_instance->instance,
3044 true);
3045 if (err)
3046 return;
3047
3048 hci_req_run(&req, NULL);
3049}
3050
2935static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) 3051static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode)
2936{ 3052{
2937 struct mgmt_cp_set_local_name *cp; 3053 struct mgmt_cp_set_local_name *cp;
@@ -2947,13 +3063,17 @@ static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode)
2947 3063
2948 cp = cmd->param; 3064 cp = cmd->param;
2949 3065
2950 if (status) 3066 if (status) {
2951 mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 3067 mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME,
2952 mgmt_status(status)); 3068 mgmt_status(status));
2953 else 3069 } else {
2954 mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, 3070 mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0,
2955 cp, sizeof(*cp)); 3071 cp, sizeof(*cp));
2956 3072
3073 if (hci_dev_test_flag(hdev, HCI_LE_ADV))
3074 adv_expire(hdev, MGMT_ADV_FLAG_LOCAL_NAME);
3075 }
3076
2957 mgmt_pending_remove(cmd); 3077 mgmt_pending_remove(cmd);
2958 3078
2959unlock: 3079unlock:
@@ -2993,8 +3113,9 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
2993 if (err < 0) 3113 if (err < 0)
2994 goto failed; 3114 goto failed;
2995 3115
2996 err = mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, 3116 err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data,
2997 data, len, sk); 3117 len, HCI_MGMT_LOCAL_NAME_EVENTS, sk);
3118 ext_info_changed(hdev, sk);
2998 3119
2999 goto failed; 3120 goto failed;
3000 } 3121 }
@@ -3017,7 +3138,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
3017 /* The name is stored in the scan response data and so 3138 /* The name is stored in the scan response data and so
 3018 * no need to update the advertising data here. 3139
3019 */ 3140 */
3020 if (lmp_le_capable(hdev)) 3141 if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING))
3021 __hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance); 3142 __hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance);
3022 3143
3023 err = hci_req_run(&req, set_name_complete); 3144 err = hci_req_run(&req, set_name_complete);
@@ -3029,6 +3150,40 @@ failed:
3029 return err; 3150 return err;
3030} 3151}
3031 3152
3153static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
3154 u16 len)
3155{
3156 struct mgmt_cp_set_appearance *cp = data;
3157 u16 apperance;
3158 int err;
3159
3160 BT_DBG("");
3161
3162 if (!lmp_le_capable(hdev))
3163 return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE,
3164 MGMT_STATUS_NOT_SUPPORTED);
3165
3166 apperance = le16_to_cpu(cp->appearance);
3167
3168 hci_dev_lock(hdev);
3169
3170 if (hdev->appearance != apperance) {
3171 hdev->appearance = apperance;
3172
3173 if (hci_dev_test_flag(hdev, HCI_LE_ADV))
3174 adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE);
3175
3176 ext_info_changed(hdev, sk);
3177 }
3178
3179 err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL,
3180 0);
3181
3182 hci_dev_unlock(hdev);
3183
3184 return err;
3185}
3186
3032static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, 3187static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
3033 u16 opcode, struct sk_buff *skb) 3188 u16 opcode, struct sk_buff *skb)
3034{ 3189{
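
set_appearance() stores a 16-bit little-endian GAP appearance value and, when advertising is active, expires the current instance via adv_expire() so the new value is picked up. A rough sketch of the command as it travels over the management socket; the struct and field names here are illustrative, only the sizes and byte order follow the handler above:

#include <stdint.h>

/* MGMT_OP_SET_APPEARANCE on the wire: the standard management header
 * (opcode, controller index, parameter length) followed by a single
 * 16-bit appearance value, all little endian. */
struct set_appearance_cmd {
	uint16_t opcode;	/* MGMT_OP_SET_APPEARANCE */
	uint16_t index;		/* controller index */
	uint16_t len;		/* 2 */
	uint16_t appearance;	/* e.g. 0x0080, Generic Computer */
} __attribute__((packed));
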
@@ -4869,7 +5024,7 @@ static int clock_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
4869 int err; 5024 int err;
4870 5025
4871 memset(&rp, 0, sizeof(rp)); 5026 memset(&rp, 0, sizeof(rp));
4872 memcpy(&rp.addr, &cmd->param, sizeof(rp.addr)); 5027 memcpy(&rp.addr, cmd->param, sizeof(rp.addr));
4873 5028
4874 if (status) 5029 if (status)
4875 goto complete; 5030 goto complete;
@@ -5501,17 +5656,6 @@ unlock:
5501 return err; 5656 return err;
5502} 5657}
5503 5658
5504static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data,
5505 u8 data_len)
5506{
5507 eir[eir_len++] = sizeof(type) + data_len;
5508 eir[eir_len++] = type;
5509 memcpy(&eir[eir_len], data, data_len);
5510 eir_len += data_len;
5511
5512 return eir_len;
5513}
5514
5515static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status, 5659static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status,
5516 u16 opcode, struct sk_buff *skb) 5660 u16 opcode, struct sk_buff *skb)
5517{ 5661{
@@ -5815,6 +5959,8 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
5815 flags |= MGMT_ADV_FLAG_DISCOV; 5959 flags |= MGMT_ADV_FLAG_DISCOV;
5816 flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; 5960 flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
5817 flags |= MGMT_ADV_FLAG_MANAGED_FLAGS; 5961 flags |= MGMT_ADV_FLAG_MANAGED_FLAGS;
5962 flags |= MGMT_ADV_FLAG_APPEARANCE;
5963 flags |= MGMT_ADV_FLAG_LOCAL_NAME;
5818 5964
5819 if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) 5965 if (hdev->adv_tx_power != HCI_TX_POWER_INVALID)
5820 flags |= MGMT_ADV_FLAG_TX_POWER; 5966 flags |= MGMT_ADV_FLAG_TX_POWER;
@@ -5871,28 +6017,59 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
5871 return err; 6017 return err;
5872} 6018}
5873 6019
5874static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, 6020static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data)
5875 u8 len, bool is_adv_data)
5876{ 6021{
5877 u8 max_len = HCI_MAX_AD_LENGTH; 6022 u8 max_len = HCI_MAX_AD_LENGTH;
5878 int i, cur_len;
5879 bool flags_managed = false;
5880 bool tx_power_managed = false;
5881 6023
5882 if (is_adv_data) { 6024 if (is_adv_data) {
5883 if (adv_flags & (MGMT_ADV_FLAG_DISCOV | 6025 if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
5884 MGMT_ADV_FLAG_LIMITED_DISCOV | 6026 MGMT_ADV_FLAG_LIMITED_DISCOV |
5885 MGMT_ADV_FLAG_MANAGED_FLAGS)) { 6027 MGMT_ADV_FLAG_MANAGED_FLAGS))
5886 flags_managed = true;
5887 max_len -= 3; 6028 max_len -= 3;
5888 }
5889 6029
5890 if (adv_flags & MGMT_ADV_FLAG_TX_POWER) { 6030 if (adv_flags & MGMT_ADV_FLAG_TX_POWER)
5891 tx_power_managed = true;
5892 max_len -= 3; 6031 max_len -= 3;
5893 } 6032 } else {
6033 /* at least 1 byte of name should fit in */
6034 if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME)
6035 max_len -= 3;
6036
6037 if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE))
6038 max_len -= 4;
5894 } 6039 }
5895 6040
6041 return max_len;
6042}
6043
6044static bool flags_managed(u32 adv_flags)
6045{
6046 return adv_flags & (MGMT_ADV_FLAG_DISCOV |
6047 MGMT_ADV_FLAG_LIMITED_DISCOV |
6048 MGMT_ADV_FLAG_MANAGED_FLAGS);
6049}
6050
6051static bool tx_power_managed(u32 adv_flags)
6052{
6053 return adv_flags & MGMT_ADV_FLAG_TX_POWER;
6054}
6055
6056static bool name_managed(u32 adv_flags)
6057{
6058 return adv_flags & MGMT_ADV_FLAG_LOCAL_NAME;
6059}
6060
6061static bool appearance_managed(u32 adv_flags)
6062{
6063 return adv_flags & MGMT_ADV_FLAG_APPEARANCE;
6064}
6065
6066static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data)
6067{
6068 int i, cur_len;
6069 u8 max_len;
6070
6071 max_len = tlv_data_max_len(adv_flags, is_adv_data);
6072
5896 if (len > max_len) 6073 if (len > max_len)
5897 return false; 6074 return false;
5898 6075
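
tlv_data_max_len() is the arithmetic behind both the validation below and MGMT_OP_GET_ADV_SIZE_INFO: starting from the 31-byte HCI_MAX_AD_LENGTH, a kernel-managed flags field costs 3 bytes (length, type, one flag byte), TX power costs 3 bytes, the appearance entry costs 4 bytes (length, type, 16-bit value), and the local-name case keeps 3 bytes free so at least one name byte still fits. A standalone re-statement with made-up flag bits (they are not the MGMT_ADV_FLAG_* values):

#include <stdint.h>
#include <stdio.h>

#define AD_MAX		31	/* HCI_MAX_AD_LENGTH */
#define F_MANAGED_FLAGS	0x01	/* illustrative bit values only */
#define F_TX_POWER	0x02
#define F_LOCAL_NAME	0x04
#define F_APPEARANCE	0x08

static int adv_data_max(unsigned int flags)
{
	int max = AD_MAX;

	if (flags & F_MANAGED_FLAGS)	/* len + type + 1 flag byte */
		max -= 3;
	if (flags & F_TX_POWER)		/* len + type + 1 byte */
		max -= 3;
	return max;
}

static int scan_rsp_max(unsigned int flags)
{
	int max = AD_MAX;

	if (flags & F_LOCAL_NAME)	/* keep room for len + type + >= 1 name byte */
		max -= 3;
	if (flags & F_APPEARANCE)	/* len + type + 2-byte value */
		max -= 4;
	return max;
}

int main(void)
{
	printf("adv data, managed flags + tx power: %d bytes\n",
	       adv_data_max(F_MANAGED_FLAGS | F_TX_POWER));	/* 31 - 3 - 3 = 25 */
	printf("scan rsp, name + appearance: %d bytes\n",
	       scan_rsp_max(F_LOCAL_NAME | F_APPEARANCE));	/* 31 - 3 - 4 = 24 */
	return 0;
}
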
@@ -5900,10 +6077,21 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
5900 for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) { 6077 for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) {
5901 cur_len = data[i]; 6078 cur_len = data[i];
5902 6079
5903 if (flags_managed && data[i + 1] == EIR_FLAGS) 6080 if (data[i + 1] == EIR_FLAGS &&
6081 (!is_adv_data || flags_managed(adv_flags)))
6082 return false;
6083
6084 if (data[i + 1] == EIR_TX_POWER && tx_power_managed(adv_flags))
5904 return false; 6085 return false;
5905 6086
5906 if (tx_power_managed && data[i + 1] == EIR_TX_POWER) 6087 if (data[i + 1] == EIR_NAME_COMPLETE && name_managed(adv_flags))
6088 return false;
6089
6090 if (data[i + 1] == EIR_NAME_SHORT && name_managed(adv_flags))
6091 return false;
6092
6093 if (data[i + 1] == EIR_APPEARANCE &&
6094 appearance_managed(adv_flags))
5907 return false; 6095 return false;
5908 6096
5909 /* If the current field length would exceed the total data 6097 /* If the current field length would exceed the total data
@@ -6027,8 +6215,8 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
6027 goto unlock; 6215 goto unlock;
6028 } 6216 }
6029 6217
6030 if (!tlv_data_is_valid(hdev, flags, cp->data, cp->adv_data_len, true) || 6218 if (!tlv_data_is_valid(flags, cp->data, cp->adv_data_len, true) ||
6031 !tlv_data_is_valid(hdev, flags, cp->data + cp->adv_data_len, 6219 !tlv_data_is_valid(flags, cp->data + cp->adv_data_len,
6032 cp->scan_rsp_len, false)) { 6220 cp->scan_rsp_len, false)) {
6033 err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, 6221 err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
6034 MGMT_STATUS_INVALID_PARAMS); 6222 MGMT_STATUS_INVALID_PARAMS);
@@ -6175,7 +6363,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
6175 6363
6176 hci_req_init(&req, hdev); 6364 hci_req_init(&req, hdev);
6177 6365
6178 hci_req_clear_adv_instance(hdev, &req, cp->instance, true); 6366 hci_req_clear_adv_instance(hdev, sk, &req, cp->instance, true);
6179 6367
6180 if (list_empty(&hdev->adv_instances)) 6368 if (list_empty(&hdev->adv_instances))
6181 __hci_req_disable_advertising(&req); 6369 __hci_req_disable_advertising(&req);
@@ -6211,23 +6399,6 @@ unlock:
6211 return err; 6399 return err;
6212} 6400}
6213 6401
6214static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data)
6215{
6216 u8 max_len = HCI_MAX_AD_LENGTH;
6217
6218 if (is_adv_data) {
6219 if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
6220 MGMT_ADV_FLAG_LIMITED_DISCOV |
6221 MGMT_ADV_FLAG_MANAGED_FLAGS))
6222 max_len -= 3;
6223
6224 if (adv_flags & MGMT_ADV_FLAG_TX_POWER)
6225 max_len -= 3;
6226 }
6227
6228 return max_len;
6229}
6230
6231static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, 6402static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev,
6232 void *data, u16 data_len) 6403 void *data, u16 data_len)
6233{ 6404{
@@ -6356,6 +6527,9 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
6356 { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE }, 6527 { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE },
6357 { get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE }, 6528 { get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE },
6358 { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, 6529 { start_limited_discovery, MGMT_START_DISCOVERY_SIZE },
6530 { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE,
6531 HCI_MGMT_UNTRUSTED },
6532 { set_appearance, MGMT_SET_APPEARANCE_SIZE },
6359}; 6533};
6360 6534
6361void mgmt_index_added(struct hci_dev *hdev) 6535void mgmt_index_added(struct hci_dev *hdev)
@@ -6494,9 +6668,12 @@ void __mgmt_power_off(struct hci_dev *hdev)
6494 6668
6495 mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); 6669 mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
6496 6670
6497 if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) 6671 if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) {
6498 mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, 6672 mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
6499 zero_cod, sizeof(zero_cod), NULL); 6673 zero_cod, sizeof(zero_cod),
6674 HCI_MGMT_DEV_CLASS_EVENTS, NULL);
6675 ext_info_changed(hdev, NULL);
6676 }
6500 6677
6501 new_settings(hdev, match.sk); 6678 new_settings(hdev, match.sk);
6502 6679
@@ -7092,9 +7269,11 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class,
7092 mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match); 7269 mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match);
7093 mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); 7270 mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match);
7094 7271
7095 if (!status) 7272 if (!status) {
7096 mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, 7273 mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class,
7097 dev_class, 3, NULL); 7274 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL);
7275 ext_info_changed(hdev, NULL);
7276 }
7098 7277
7099 if (match.sk) 7278 if (match.sk)
7100 sock_put(match.sk); 7279 sock_put(match.sk);
@@ -7123,8 +7302,9 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status)
7123 return; 7302 return;
7124 } 7303 }
7125 7304
7126 mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), 7305 mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev),
7127 cmd ? cmd->sk : NULL); 7306 HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL);
7307 ext_info_changed(hdev, cmd ? cmd->sk : NULL);
7128} 7308}
7129 7309
7130static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) 7310static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16])
diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c
index 8c30c7eb8bef..c933bd08c1fe 100644
--- a/net/bluetooth/mgmt_util.c
+++ b/net/bluetooth/mgmt_util.c
@@ -21,12 +21,41 @@
21 SOFTWARE IS DISCLAIMED. 21 SOFTWARE IS DISCLAIMED.
22*/ 22*/
23 23
24#include <asm/unaligned.h>
25
24#include <net/bluetooth/bluetooth.h> 26#include <net/bluetooth/bluetooth.h>
25#include <net/bluetooth/hci_core.h> 27#include <net/bluetooth/hci_core.h>
28#include <net/bluetooth/hci_mon.h>
26#include <net/bluetooth/mgmt.h> 29#include <net/bluetooth/mgmt.h>
27 30
28#include "mgmt_util.h" 31#include "mgmt_util.h"
29 32
33static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie,
34 u16 opcode, u16 len, void *buf)
35{
36 struct hci_mon_hdr *hdr;
37 struct sk_buff *skb;
38
39 skb = bt_skb_alloc(6 + len, GFP_ATOMIC);
40 if (!skb)
41 return NULL;
42
43 put_unaligned_le32(cookie, skb_put(skb, 4));
44 put_unaligned_le16(opcode, skb_put(skb, 2));
45
46 if (buf)
47 memcpy(skb_put(skb, len), buf, len);
48
49 __net_timestamp(skb);
50
51 hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
52 hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT);
53 hdr->index = index;
54 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
55
56 return skb;
57}
58
30int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, 59int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
31 void *data, u16 data_len, int flag, struct sock *skip_sk) 60 void *data, u16 data_len, int flag, struct sock *skip_sk)
32{ 61{
@@ -52,14 +81,18 @@ int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
52 __net_timestamp(skb); 81 __net_timestamp(skb);
53 82
54 hci_send_to_channel(channel, skb, flag, skip_sk); 83 hci_send_to_channel(channel, skb, flag, skip_sk);
55 kfree_skb(skb);
56 84
85 if (channel == HCI_CHANNEL_CONTROL)
86 hci_send_monitor_ctrl_event(hdev, event, data, data_len,
87 skb_get_ktime(skb), flag, skip_sk);
88
89 kfree_skb(skb);
57 return 0; 90 return 0;
58} 91}
59 92
60int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) 93int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
61{ 94{
62 struct sk_buff *skb; 95 struct sk_buff *skb, *mskb;
63 struct mgmt_hdr *hdr; 96 struct mgmt_hdr *hdr;
64 struct mgmt_ev_cmd_status *ev; 97 struct mgmt_ev_cmd_status *ev;
65 int err; 98 int err;
@@ -80,17 +113,30 @@ int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
80 ev->status = status; 113 ev->status = status;
81 ev->opcode = cpu_to_le16(cmd); 114 ev->opcode = cpu_to_le16(cmd);
82 115
116 mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk),
117 MGMT_EV_CMD_STATUS, sizeof(*ev), ev);
118 if (mskb)
119 skb->tstamp = mskb->tstamp;
120 else
121 __net_timestamp(skb);
122
83 err = sock_queue_rcv_skb(sk, skb); 123 err = sock_queue_rcv_skb(sk, skb);
84 if (err < 0) 124 if (err < 0)
85 kfree_skb(skb); 125 kfree_skb(skb);
86 126
127 if (mskb) {
128 hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb,
129 HCI_SOCK_TRUSTED, NULL);
130 kfree_skb(mskb);
131 }
132
87 return err; 133 return err;
88} 134}
89 135
90int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, 136int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
91 void *rp, size_t rp_len) 137 void *rp, size_t rp_len)
92{ 138{
93 struct sk_buff *skb; 139 struct sk_buff *skb, *mskb;
94 struct mgmt_hdr *hdr; 140 struct mgmt_hdr *hdr;
95 struct mgmt_ev_cmd_complete *ev; 141 struct mgmt_ev_cmd_complete *ev;
96 int err; 142 int err;
@@ -114,10 +160,24 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
114 if (rp) 160 if (rp)
115 memcpy(ev->data, rp, rp_len); 161 memcpy(ev->data, rp, rp_len);
116 162
163 mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk),
164 MGMT_EV_CMD_COMPLETE,
165 sizeof(*ev) + rp_len, ev);
166 if (mskb)
167 skb->tstamp = mskb->tstamp;
168 else
169 __net_timestamp(skb);
170
117 err = sock_queue_rcv_skb(sk, skb); 171 err = sock_queue_rcv_skb(sk, skb);
118 if (err < 0) 172 if (err < 0)
119 kfree_skb(skb); 173 kfree_skb(skb);
120 174
175 if (mskb) {
176 hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb,
177 HCI_SOCK_TRUSTED, NULL);
178 kfree_skb(mskb);
179 }
180
121 return err; 181 return err;
122} 182}
123 183
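
create_monitor_ctrl_event() mirrors every command status/complete onto the monitor channel: a 32-bit socket cookie and the 16-bit management opcode are prepended to the event body, and the whole frame sits behind the usual hci_mon_hdr with opcode HCI_MON_CTRL_EVENT. A sketch of the resulting layout as a btmon-style reader might model it; the struct name and packing attribute are assumptions, the field order and widths follow the code above:

#include <stdint.h>

/* One HCI_MON_CTRL_EVENT frame as assembled above (illustrative;
 * the kernel builds it field by field in the skb). */
struct mon_ctrl_event {
	/* struct hci_mon_hdr */
	uint16_t opcode;	/* HCI_MON_CTRL_EVENT, little endian */
	uint16_t index;		/* controller index, or unset for global events */
	uint16_t len;		/* number of bytes that follow */
	/* body prepended by create_monitor_ctrl_event() */
	uint32_t cookie;	/* per-socket cookie, little endian */
	uint16_t mgmt_opcode;	/* e.g. MGMT_EV_CMD_COMPLETE */
	uint8_t  data[];	/* the management event body */
} __attribute__((packed));
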
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 4c1a16a96ae5..43faf2aea2ab 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -3387,7 +3387,10 @@ int smp_register(struct hci_dev *hdev)
3387 if (!lmp_sc_capable(hdev)) { 3387 if (!lmp_sc_capable(hdev)) {
3388 debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs, 3388 debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs,
3389 hdev, &force_bredr_smp_fops); 3389 hdev, &force_bredr_smp_fops);
3390 return 0; 3390
3391 /* Flag can be already set here (due to power toggle) */
3392 if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
3393 return 0;
3391 } 3394 }
3392 3395
3393 if (WARN_ON(hdev->smp_bredr_data)) { 3396 if (WARN_ON(hdev->smp_bredr_data)) {
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index a1cda5d4718d..0aefc011b668 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -20,4 +20,6 @@ bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
20 20
21bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o 21bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o
22 22
23bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
24
23obj-$(CONFIG_NETFILTER) += netfilter/ 25obj-$(CONFIG_NETFILTER) += netfilter/
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 3addc05b9a16..889e5640455f 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -227,9 +227,11 @@ static int __init br_init(void)
227 br_fdb_test_addr_hook = br_fdb_test_addr; 227 br_fdb_test_addr_hook = br_fdb_test_addr;
228#endif 228#endif
229 229
230 pr_info("bridge: automatic filtering via arp/ip/ip6tables has been " 230#if IS_MODULE(CONFIG_BRIDGE_NETFILTER)
231 "deprecated. Update your scripts to load br_netfilter if you " 231 pr_info("bridge: filtering via arp/ip/ip6tables is no longer available "
232 "by default. Update your scripts to load br_netfilter if you "
232 "need this.\n"); 233 "need this.\n");
234#endif
233 235
234 return 0; 236 return 0;
235 237
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 09f26940aba5..89a687f3c0a3 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -62,10 +62,10 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
62 goto out; 62 goto out;
63 63
64 if (is_broadcast_ether_addr(dest)) { 64 if (is_broadcast_ether_addr(dest)) {
65 br_flood(br, skb, false, false, true); 65 br_flood(br, skb, BR_PKT_BROADCAST, false, true);
66 } else if (is_multicast_ether_addr(dest)) { 66 } else if (is_multicast_ether_addr(dest)) {
67 if (unlikely(netpoll_tx_running(dev))) { 67 if (unlikely(netpoll_tx_running(dev))) {
68 br_flood(br, skb, false, false, true); 68 br_flood(br, skb, BR_PKT_MULTICAST, false, true);
69 goto out; 69 goto out;
70 } 70 }
71 if (br_multicast_rcv(br, NULL, skb, vid)) { 71 if (br_multicast_rcv(br, NULL, skb, vid)) {
@@ -78,11 +78,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
78 br_multicast_querier_exists(br, eth_hdr(skb))) 78 br_multicast_querier_exists(br, eth_hdr(skb)))
79 br_multicast_flood(mdst, skb, false, true); 79 br_multicast_flood(mdst, skb, false, true);
80 else 80 else
81 br_flood(br, skb, false, false, true); 81 br_flood(br, skb, BR_PKT_MULTICAST, false, true);
82 } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) { 82 } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) {
83 br_forward(dst->dst, skb, false, true); 83 br_forward(dst->dst, skb, false, true);
84 } else { 84 } else {
85 br_flood(br, skb, true, false, true); 85 br_flood(br, skb, BR_PKT_UNICAST, false, true);
86 } 86 }
87out: 87out:
88 rcu_read_unlock(); 88 rcu_read_unlock();
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index cd620fab41b0..6b43c8c88f19 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -710,24 +710,27 @@ int br_fdb_dump(struct sk_buff *skb,
710 struct netlink_callback *cb, 710 struct netlink_callback *cb,
711 struct net_device *dev, 711 struct net_device *dev,
712 struct net_device *filter_dev, 712 struct net_device *filter_dev,
713 int idx) 713 int *idx)
714{ 714{
715 struct net_bridge *br = netdev_priv(dev); 715 struct net_bridge *br = netdev_priv(dev);
716 int err = 0;
716 int i; 717 int i;
717 718
718 if (!(dev->priv_flags & IFF_EBRIDGE)) 719 if (!(dev->priv_flags & IFF_EBRIDGE))
719 goto out; 720 goto out;
720 721
721 if (!filter_dev) 722 if (!filter_dev) {
722 idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); 723 err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
724 if (err < 0)
725 goto out;
726 }
723 727
724 for (i = 0; i < BR_HASH_SIZE; i++) { 728 for (i = 0; i < BR_HASH_SIZE; i++) {
725 struct net_bridge_fdb_entry *f; 729 struct net_bridge_fdb_entry *f;
726 730
727 hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { 731 hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
728 int err;
729 732
730 if (idx < cb->args[0]) 733 if (*idx < cb->args[2])
731 goto skip; 734 goto skip;
732 735
733 if (filter_dev && 736 if (filter_dev &&
@@ -750,17 +753,15 @@ int br_fdb_dump(struct sk_buff *skb,
750 cb->nlh->nlmsg_seq, 753 cb->nlh->nlmsg_seq,
751 RTM_NEWNEIGH, 754 RTM_NEWNEIGH,
752 NLM_F_MULTI); 755 NLM_F_MULTI);
753 if (err < 0) { 756 if (err < 0)
754 cb->args[1] = err; 757 goto out;
755 break;
756 }
757skip: 758skip:
758 ++idx; 759 *idx += 1;
759 } 760 }
760 } 761 }
761 762
762out: 763out:
763 return idx; 764 return err;
764} 765}
765 766
766/* Update (create or replace) forwarding database entry */ 767/* Update (create or replace) forwarding database entry */
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 63a83d8d7da3..7cb41aee4c82 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -29,7 +29,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
29 29
30 vg = nbp_vlan_group_rcu(p); 30 vg = nbp_vlan_group_rcu(p);
31 return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && 31 return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
32 br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING; 32 br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING &&
33 nbp_switchdev_allowed_egress(p, skb);
33} 34}
34 35
35int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) 36int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -175,7 +176,7 @@ out:
175 176
176/* called under rcu_read_lock */ 177/* called under rcu_read_lock */
177void br_flood(struct net_bridge *br, struct sk_buff *skb, 178void br_flood(struct net_bridge *br, struct sk_buff *skb,
178 bool unicast, bool local_rcv, bool local_orig) 179 enum br_pkt_type pkt_type, bool local_rcv, bool local_orig)
179{ 180{
180 u8 igmp_type = br_multicast_igmp_type(skb); 181 u8 igmp_type = br_multicast_igmp_type(skb);
181 struct net_bridge_port *prev = NULL; 182 struct net_bridge_port *prev = NULL;
@@ -183,7 +184,10 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
183 184
184 list_for_each_entry_rcu(p, &br->port_list, list) { 185 list_for_each_entry_rcu(p, &br->port_list, list) {
185 /* Do not flood unicast traffic to ports that turn it off */ 186 /* Do not flood unicast traffic to ports that turn it off */
186 if (unicast && !(p->flags & BR_FLOOD)) 187 if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
188 continue;
189 if (pkt_type == BR_PKT_MULTICAST &&
190 !(p->flags & BR_MCAST_FLOOD))
187 continue; 191 continue;
188 192
189 /* Do not flood to ports that enable proxy ARP */ 193 /* Do not flood to ports that enable proxy ARP */
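
br_flood() now receives an explicit packet classification instead of a single unicast flag, and multicast flooding becomes switchable per port through the new BR_MCAST_FLOOD flag, while broadcast is always flooded. A standalone sketch of the classification and the per-port decision; the constants and helpers are re-declared here for illustration and are not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum pkt_type { PKT_UNICAST, PKT_MULTICAST, PKT_BROADCAST };

#define PORT_FLOOD		0x01	/* flood unknown unicast */
#define PORT_MCAST_FLOOD	0x02	/* flood multicast (new in this series) */

static enum pkt_type classify(const uint8_t dest[6])
{
	static const uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };

	if (!memcmp(dest, bcast, 6))
		return PKT_BROADCAST;	/* broadcast is also multicast, so test it first */
	if (dest[0] & 0x01)
		return PKT_MULTICAST;	/* group bit set */
	return PKT_UNICAST;
}

static bool port_floods(enum pkt_type type, unsigned int port_flags)
{
	if (type == PKT_UNICAST && !(port_flags & PORT_FLOOD))
		return false;
	if (type == PKT_MULTICAST && !(port_flags & PORT_MCAST_FLOOD))
		return false;
	return true;			/* broadcast is always flooded */
}

int main(void)
{
	const uint8_t mcast[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };

	printf("multicast to a port with only unicast flooding: %d\n",
	       port_floods(classify(mcast), PORT_FLOOD));	/* 0 */
	return 0;
}
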
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f2fede05d32c..ed0dd3340084 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -362,7 +362,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
362 p->path_cost = port_cost(dev); 362 p->path_cost = port_cost(dev);
363 p->priority = 0x8000 >> BR_PORT_BITS; 363 p->priority = 0x8000 >> BR_PORT_BITS;
364 p->port_no = index; 364 p->port_no = index;
365 p->flags = BR_LEARNING | BR_FLOOD; 365 p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD;
366 br_init_port(p); 366 br_init_port(p);
367 br_set_state(p, BR_STATE_DISABLED); 367 br_set_state(p, BR_STATE_DISABLED);
368 br_stp_port_timer_init(p); 368 br_stp_port_timer_init(p);
@@ -545,6 +545,10 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
545 if (err) 545 if (err)
546 goto err5; 546 goto err5;
547 547
548 err = nbp_switchdev_mark_set(p);
549 if (err)
550 goto err6;
551
548 dev_disable_lro(dev); 552 dev_disable_lro(dev);
549 553
550 list_add_rcu(&p->list, &br->port_list); 554 list_add_rcu(&p->list, &br->port_list);
@@ -566,7 +570,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
566 err = nbp_vlan_init(p); 570 err = nbp_vlan_init(p);
567 if (err) { 571 if (err) {
568 netdev_err(dev, "failed to initialize vlan filtering on this port\n"); 572 netdev_err(dev, "failed to initialize vlan filtering on this port\n");
569 goto err6; 573 goto err7;
570 } 574 }
571 575
572 spin_lock_bh(&br->lock); 576 spin_lock_bh(&br->lock);
@@ -589,12 +593,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
589 593
590 return 0; 594 return 0;
591 595
592err6: 596err7:
593 list_del_rcu(&p->list); 597 list_del_rcu(&p->list);
594 br_fdb_delete_by_port(br, p, 0, 1); 598 br_fdb_delete_by_port(br, p, 0, 1);
595 nbp_update_port_count(br); 599 nbp_update_port_count(br);
600err6:
596 netdev_upper_dev_unlink(dev, br->dev); 601 netdev_upper_dev_unlink(dev, br->dev);
597
598err5: 602err5:
599 dev->priv_flags &= ~IFF_BRIDGE_PORT; 603 dev->priv_flags &= ~IFF_BRIDGE_PORT;
600 netdev_rx_handler_unregister(dev); 604 netdev_rx_handler_unregister(dev);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index abe11f085479..855b72fbe1da 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -128,11 +128,12 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
128/* note: already called with rcu_read_lock */ 128/* note: already called with rcu_read_lock */
129int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 129int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
130{ 130{
131 bool local_rcv = false, mcast_hit = false, unicast = true;
132 struct net_bridge_port *p = br_port_get_rcu(skb->dev); 131 struct net_bridge_port *p = br_port_get_rcu(skb->dev);
133 const unsigned char *dest = eth_hdr(skb)->h_dest; 132 const unsigned char *dest = eth_hdr(skb)->h_dest;
133 enum br_pkt_type pkt_type = BR_PKT_UNICAST;
134 struct net_bridge_fdb_entry *dst = NULL; 134 struct net_bridge_fdb_entry *dst = NULL;
135 struct net_bridge_mdb_entry *mdst; 135 struct net_bridge_mdb_entry *mdst;
136 bool local_rcv, mcast_hit = false;
136 struct net_bridge *br; 137 struct net_bridge *br;
137 u16 vid = 0; 138 u16 vid = 0;
138 139
@@ -142,29 +143,36 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
142 if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid)) 143 if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid))
143 goto out; 144 goto out;
144 145
146 nbp_switchdev_frame_mark(p, skb);
147
145 /* insert into forwarding database after filtering to avoid spoofing */ 148 /* insert into forwarding database after filtering to avoid spoofing */
146 br = p->br; 149 br = p->br;
147 if (p->flags & BR_LEARNING) 150 if (p->flags & BR_LEARNING)
148 br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); 151 br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
149 152
150 if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) && 153 local_rcv = !!(br->dev->flags & IFF_PROMISC);
151 br_multicast_rcv(br, p, skb, vid)) 154 if (is_multicast_ether_addr(dest)) {
152 goto drop; 155 /* by definition the broadcast is also a multicast address */
156 if (is_broadcast_ether_addr(dest)) {
157 pkt_type = BR_PKT_BROADCAST;
158 local_rcv = true;
159 } else {
160 pkt_type = BR_PKT_MULTICAST;
161 if (br_multicast_rcv(br, p, skb, vid))
162 goto drop;
163 }
164 }
153 165
154 if (p->state == BR_STATE_LEARNING) 166 if (p->state == BR_STATE_LEARNING)
155 goto drop; 167 goto drop;
156 168
157 BR_INPUT_SKB_CB(skb)->brdev = br->dev; 169 BR_INPUT_SKB_CB(skb)->brdev = br->dev;
158 170
159 local_rcv = !!(br->dev->flags & IFF_PROMISC);
160
161 if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) 171 if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP))
162 br_do_proxy_arp(skb, br, vid, p); 172 br_do_proxy_arp(skb, br, vid, p);
163 173
164 if (is_broadcast_ether_addr(dest)) { 174 switch (pkt_type) {
165 local_rcv = true; 175 case BR_PKT_MULTICAST:
166 unicast = false;
167 } else if (is_multicast_ether_addr(dest)) {
168 mdst = br_mdb_get(br, skb, vid); 176 mdst = br_mdb_get(br, skb, vid);
169 if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && 177 if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
170 br_multicast_querier_exists(br, eth_hdr(skb))) { 178 br_multicast_querier_exists(br, eth_hdr(skb))) {
@@ -178,18 +186,22 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
178 local_rcv = true; 186 local_rcv = true;
179 br->dev->stats.multicast++; 187 br->dev->stats.multicast++;
180 } 188 }
181 unicast = false; 189 break;
182 } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) { 190 case BR_PKT_UNICAST:
183 /* Do not forward the packet since it's local. */ 191 dst = __br_fdb_get(br, dest, vid);
184 return br_pass_frame_up(skb); 192 default:
193 break;
185 } 194 }
186 195
187 if (dst) { 196 if (dst) {
197 if (dst->is_local)
198 return br_pass_frame_up(skb);
199
188 dst->used = jiffies; 200 dst->used = jiffies;
189 br_forward(dst->dst, skb, local_rcv, false); 201 br_forward(dst->dst, skb, local_rcv, false);
190 } else { 202 } else {
191 if (!mcast_hit) 203 if (!mcast_hit)
192 br_flood(br, skb, unicast, local_rcv, false); 204 br_flood(br, skb, pkt_type, local_rcv, false);
193 else 205 else
194 br_multicast_flood(mdst, skb, local_rcv, false); 206 br_multicast_flood(mdst, skb, local_rcv, false);
195 } 207 }
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 77e7f69bf80d..2fe9345c1407 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -30,6 +30,7 @@
30#include <linux/netfilter_ipv6.h> 30#include <linux/netfilter_ipv6.h>
31#include <linux/netfilter_arp.h> 31#include <linux/netfilter_arp.h>
32#include <linux/in_route.h> 32#include <linux/in_route.h>
33#include <linux/rculist.h>
33#include <linux/inetdevice.h> 34#include <linux/inetdevice.h>
34 35
35#include <net/ip.h> 36#include <net/ip.h>
@@ -395,11 +396,10 @@ bridged_dnat:
395 skb->dev = nf_bridge->physindev; 396 skb->dev = nf_bridge->physindev;
396 nf_bridge_update_protocol(skb); 397 nf_bridge_update_protocol(skb);
397 nf_bridge_push_encap_header(skb); 398 nf_bridge_push_encap_header(skb);
398 NF_HOOK_THRESH(NFPROTO_BRIDGE, 399 br_nf_hook_thresh(NF_BR_PRE_ROUTING,
399 NF_BR_PRE_ROUTING, 400 net, sk, skb, skb->dev,
400 net, sk, skb, skb->dev, NULL, 401 NULL,
401 br_nf_pre_routing_finish_bridge, 402 br_nf_pre_routing_finish);
402 1);
403 return 0; 403 return 0;
404 } 404 }
405 ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); 405 ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
@@ -417,10 +417,8 @@ bridged_dnat:
417 skb->dev = nf_bridge->physindev; 417 skb->dev = nf_bridge->physindev;
418 nf_bridge_update_protocol(skb); 418 nf_bridge_update_protocol(skb);
419 nf_bridge_push_encap_header(skb); 419 nf_bridge_push_encap_header(skb);
420 NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, 420 br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
421 skb->dev, NULL, 421 br_handle_frame_finish);
422 br_handle_frame_finish, 1);
423
424 return 0; 422 return 0;
425} 423}
426 424
@@ -992,6 +990,43 @@ static struct notifier_block brnf_notifier __read_mostly = {
992 .notifier_call = brnf_device_event, 990 .notifier_call = brnf_device_event,
993}; 991};
994 992
993/* recursively invokes nf_hook_slow (again), skipping already-called
994 * hooks (< NF_BR_PRI_BRNF).
995 *
996 * Called with rcu read lock held.
997 */
998int br_nf_hook_thresh(unsigned int hook, struct net *net,
999 struct sock *sk, struct sk_buff *skb,
1000 struct net_device *indev,
1001 struct net_device *outdev,
1002 int (*okfn)(struct net *, struct sock *,
1003 struct sk_buff *))
1004{
1005 struct nf_hook_entry *elem;
1006 struct nf_hook_state state;
1007 int ret;
1008
1009 elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
1010
1011 while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF))
1012 elem = rcu_dereference(elem->next);
1013
1014 if (!elem)
1015 return okfn(net, sk, skb);
1016
1017 /* We may already have this, but read-locks nest anyway */
1018 rcu_read_lock();
1019 nf_hook_state_init(&state, elem, hook, NF_BR_PRI_BRNF + 1,
1020 NFPROTO_BRIDGE, indev, outdev, sk, net, okfn);
1021
1022 ret = nf_hook_slow(skb, &state);
1023 rcu_read_unlock();
1024 if (ret == 1)
1025 ret = okfn(net, sk, skb);
1026
1027 return ret;
1028}
1029
995#ifdef CONFIG_SYSCTL 1030#ifdef CONFIG_SYSCTL
996static 1031static
997int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, 1032int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 5e59a8457e7b..5989661c659f 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -187,10 +187,9 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
187 skb->dev = nf_bridge->physindev; 187 skb->dev = nf_bridge->physindev;
188 nf_bridge_update_protocol(skb); 188 nf_bridge_update_protocol(skb);
189 nf_bridge_push_encap_header(skb); 189 nf_bridge_push_encap_header(skb);
190 NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, 190 br_nf_hook_thresh(NF_BR_PRE_ROUTING,
191 net, sk, skb, skb->dev, NULL, 191 net, sk, skb, skb->dev, NULL,
192 br_nf_pre_routing_finish_bridge, 192 br_nf_pre_routing_finish_bridge);
193 1);
194 return 0; 193 return 0;
195 } 194 }
196 ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); 195 ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
@@ -207,9 +206,8 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
207 skb->dev = nf_bridge->physindev; 206 skb->dev = nf_bridge->physindev;
208 nf_bridge_update_protocol(skb); 207 nf_bridge_update_protocol(skb);
209 nf_bridge_push_encap_header(skb); 208 nf_bridge_push_encap_header(skb);
210 NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, 209 br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb,
211 skb->dev, NULL, 210 skb->dev, NULL, br_handle_frame_finish);
212 br_handle_frame_finish, 1);
213 211
214 return 0; 212 return 0;
215} 213}
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index f2a29e467e78..e99037c6f7b7 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -169,10 +169,15 @@ static int br_port_fill_attrs(struct sk_buff *skb,
169 nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) || 169 nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) ||
170 nla_put_u8(skb, IFLA_BRPORT_MODE, mode) || 170 nla_put_u8(skb, IFLA_BRPORT_MODE, mode) ||
171 nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) || 171 nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) ||
172 nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) || 172 nla_put_u8(skb, IFLA_BRPORT_PROTECT,
173 nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || 173 !!(p->flags & BR_ROOT_BLOCK)) ||
174 nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE,
175 !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
174 nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || 176 nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
175 nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || 177 nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD,
178 !!(p->flags & BR_FLOOD)) ||
179 nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD,
180 !!(p->flags & BR_MCAST_FLOOD)) ||
176 nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || 181 nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) ||
177 nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, 182 nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI,
178 !!(p->flags & BR_PROXYARP_WIFI)) || 183 !!(p->flags & BR_PROXYARP_WIFI)) ||
@@ -630,6 +635,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
630 br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); 635 br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
631 br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); 636 br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
632 br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); 637 br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
638 br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
633 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); 639 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
634 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); 640 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
635 641
@@ -1245,14 +1251,30 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
1245 return 0; 1251 return 0;
1246} 1252}
1247 1253
1248static size_t bridge_get_linkxstats_size(const struct net_device *dev) 1254static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
1249{ 1255{
1250 struct net_bridge *br = netdev_priv(dev); 1256 struct net_bridge_port *p = NULL;
1251 struct net_bridge_vlan_group *vg; 1257 struct net_bridge_vlan_group *vg;
1252 struct net_bridge_vlan *v; 1258 struct net_bridge_vlan *v;
1259 struct net_bridge *br;
1253 int numvls = 0; 1260 int numvls = 0;
1254 1261
1255 vg = br_vlan_group(br); 1262 switch (attr) {
1263 case IFLA_STATS_LINK_XSTATS:
1264 br = netdev_priv(dev);
1265 vg = br_vlan_group(br);
1266 break;
1267 case IFLA_STATS_LINK_XSTATS_SLAVE:
1268 p = br_port_get_rtnl(dev);
1269 if (!p)
1270 return 0;
1271 br = p->br;
1272 vg = nbp_vlan_group(p);
1273 break;
1274 default:
1275 return 0;
1276 }
1277
1256 if (vg) { 1278 if (vg) {
1257 /* we need to count all, even placeholder entries */ 1279 /* we need to count all, even placeholder entries */
1258 list_for_each_entry(v, &vg->vlan_list, vlist) 1280 list_for_each_entry(v, &vg->vlan_list, vlist)
@@ -1264,45 +1286,42 @@ static size_t bridge_get_linkxstats_size(const struct net_device *dev)
1264 nla_total_size(0); 1286 nla_total_size(0);
1265} 1287}
1266 1288
1267static size_t brport_get_linkxstats_size(const struct net_device *dev) 1289static int br_fill_linkxstats(struct sk_buff *skb,
1268{ 1290 const struct net_device *dev,
1269 return nla_total_size(sizeof(struct br_mcast_stats)) + 1291 int *prividx, int attr)
1270 nla_total_size(0);
1271}
1272
1273static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
1274{ 1292{
1275 size_t retsize = 0; 1293 struct nlattr *nla __maybe_unused;
1294 struct net_bridge_port *p = NULL;
1295 struct net_bridge_vlan_group *vg;
1296 struct net_bridge_vlan *v;
1297 struct net_bridge *br;
1298 struct nlattr *nest;
1299 int vl_idx = 0;
1276 1300
1277 switch (attr) { 1301 switch (attr) {
1278 case IFLA_STATS_LINK_XSTATS: 1302 case IFLA_STATS_LINK_XSTATS:
1279 retsize = bridge_get_linkxstats_size(dev); 1303 br = netdev_priv(dev);
1304 vg = br_vlan_group(br);
1280 break; 1305 break;
1281 case IFLA_STATS_LINK_XSTATS_SLAVE: 1306 case IFLA_STATS_LINK_XSTATS_SLAVE:
1282 retsize = brport_get_linkxstats_size(dev); 1307 p = br_port_get_rtnl(dev);
1308 if (!p)
1309 return 0;
1310 br = p->br;
1311 vg = nbp_vlan_group(p);
1283 break; 1312 break;
1313 default:
1314 return -EINVAL;
1284 } 1315 }
1285 1316
1286 return retsize;
1287}
1288
1289static int bridge_fill_linkxstats(struct sk_buff *skb,
1290 const struct net_device *dev,
1291 int *prividx)
1292{
1293 struct net_bridge *br = netdev_priv(dev);
1294 struct nlattr *nla __maybe_unused;
1295 struct net_bridge_vlan_group *vg;
1296 struct net_bridge_vlan *v;
1297 struct nlattr *nest;
1298 int vl_idx = 0;
1299
1300 nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); 1317 nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE);
1301 if (!nest) 1318 if (!nest)
1302 return -EMSGSIZE; 1319 return -EMSGSIZE;
1303 1320
1304 vg = br_vlan_group(br);
1305 if (vg) { 1321 if (vg) {
1322 u16 pvid;
1323
1324 pvid = br_get_pvid(vg);
1306 list_for_each_entry(v, &vg->vlan_list, vlist) { 1325 list_for_each_entry(v, &vg->vlan_list, vlist) {
1307 struct bridge_vlan_xstats vxi; 1326 struct bridge_vlan_xstats vxi;
1308 struct br_vlan_stats stats; 1327 struct br_vlan_stats stats;
@@ -1311,6 +1330,9 @@ static int bridge_fill_linkxstats(struct sk_buff *skb,
1311 continue; 1330 continue;
1312 memset(&vxi, 0, sizeof(vxi)); 1331 memset(&vxi, 0, sizeof(vxi));
1313 vxi.vid = v->vid; 1332 vxi.vid = v->vid;
1333 vxi.flags = v->flags;
1334 if (v->vid == pvid)
1335 vxi.flags |= BRIDGE_VLAN_INFO_PVID;
1314 br_vlan_get_stats(v, &stats); 1336 br_vlan_get_stats(v, &stats);
1315 vxi.rx_bytes = stats.rx_bytes; 1337 vxi.rx_bytes = stats.rx_bytes;
1316 vxi.rx_packets = stats.rx_packets; 1338 vxi.rx_packets = stats.rx_packets;
@@ -1329,7 +1351,7 @@ static int bridge_fill_linkxstats(struct sk_buff *skb,
1329 BRIDGE_XSTATS_PAD); 1351 BRIDGE_XSTATS_PAD);
1330 if (!nla) 1352 if (!nla)
1331 goto nla_put_failure; 1353 goto nla_put_failure;
1332 br_multicast_get_stats(br, NULL, nla_data(nla)); 1354 br_multicast_get_stats(br, p, nla_data(nla));
1333 } 1355 }
1334#endif 1356#endif
1335 nla_nest_end(skb, nest); 1357 nla_nest_end(skb, nest);
@@ -1344,52 +1366,6 @@ nla_put_failure:
1344 return -EMSGSIZE; 1366 return -EMSGSIZE;
1345} 1367}
1346 1368
1347static int brport_fill_linkxstats(struct sk_buff *skb,
1348 const struct net_device *dev,
1349 int *prividx)
1350{
1351 struct net_bridge_port *p = br_port_get_rtnl(dev);
1352 struct nlattr *nla __maybe_unused;
1353 struct nlattr *nest;
1354
1355 if (!p)
1356 return 0;
1357
1358 nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE);
1359 if (!nest)
1360 return -EMSGSIZE;
1361#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
1362 nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST,
1363 sizeof(struct br_mcast_stats),
1364 BRIDGE_XSTATS_PAD);
1365 if (!nla) {
1366 nla_nest_end(skb, nest);
1367 return -EMSGSIZE;
1368 }
1369 br_multicast_get_stats(p->br, p, nla_data(nla));
1370#endif
1371 nla_nest_end(skb, nest);
1372
1373 return 0;
1374}
1375
1376static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev,
1377 int *prividx, int attr)
1378{
1379 int ret = -EINVAL;
1380
1381 switch (attr) {
1382 case IFLA_STATS_LINK_XSTATS:
1383 ret = bridge_fill_linkxstats(skb, dev, prividx);
1384 break;
1385 case IFLA_STATS_LINK_XSTATS_SLAVE:
1386 ret = brport_fill_linkxstats(skb, dev, prividx);
1387 break;
1388 }
1389
1390 return ret;
1391}
1392
1393static struct rtnl_af_ops br_af_ops __read_mostly = { 1369static struct rtnl_af_ops br_af_ops __read_mostly = {
1394 .family = AF_BRIDGE, 1370 .family = AF_BRIDGE,
1395 .get_link_af_size = br_get_link_af_size_filtered, 1371 .get_link_af_size = br_get_link_af_size_filtered,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index aac2a6e6b008..1b63177e0ccd 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -251,6 +251,9 @@ struct net_bridge_port
251#ifdef CONFIG_BRIDGE_VLAN_FILTERING 251#ifdef CONFIG_BRIDGE_VLAN_FILTERING
252 struct net_bridge_vlan_group __rcu *vlgrp; 252 struct net_bridge_vlan_group __rcu *vlgrp;
253#endif 253#endif
254#ifdef CONFIG_NET_SWITCHDEV
255 int offload_fwd_mark;
256#endif
254}; 257};
255 258
256#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) 259#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
@@ -359,6 +362,11 @@ struct net_bridge
359 struct timer_list gc_timer; 362 struct timer_list gc_timer;
360 struct kobject *ifobj; 363 struct kobject *ifobj;
361 u32 auto_cnt; 364 u32 auto_cnt;
365
366#ifdef CONFIG_NET_SWITCHDEV
367 int offload_fwd_mark;
368#endif
369
362#ifdef CONFIG_BRIDGE_VLAN_FILTERING 370#ifdef CONFIG_BRIDGE_VLAN_FILTERING
363 struct net_bridge_vlan_group __rcu *vlgrp; 371 struct net_bridge_vlan_group __rcu *vlgrp;
364 u8 vlan_enabled; 372 u8 vlan_enabled;
@@ -381,6 +389,10 @@ struct br_input_skb_cb {
381#ifdef CONFIG_BRIDGE_VLAN_FILTERING 389#ifdef CONFIG_BRIDGE_VLAN_FILTERING
382 bool vlan_filtered; 390 bool vlan_filtered;
383#endif 391#endif
392
393#ifdef CONFIG_NET_SWITCHDEV
394 int offload_fwd_mark;
395#endif
384}; 396};
385 397
386#define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) 398#define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb)
@@ -496,7 +508,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
496int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, 508int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
497 const unsigned char *addr, u16 vid, u16 nlh_flags); 509 const unsigned char *addr, u16 vid, u16 nlh_flags);
498int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, 510int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
499 struct net_device *dev, struct net_device *fdev, int idx); 511 struct net_device *dev, struct net_device *fdev, int *idx);
500int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); 512int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
501void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); 513void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
502int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, 514int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
@@ -505,12 +517,17 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
505 const unsigned char *addr, u16 vid); 517 const unsigned char *addr, u16 vid);
506 518
507/* br_forward.c */ 519/* br_forward.c */
520enum br_pkt_type {
521 BR_PKT_UNICAST,
522 BR_PKT_MULTICAST,
523 BR_PKT_BROADCAST
524};
508int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb); 525int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
509void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, 526void br_forward(const struct net_bridge_port *to, struct sk_buff *skb,
510 bool local_rcv, bool local_orig); 527 bool local_rcv, bool local_orig);
511int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb); 528int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
512void br_flood(struct net_bridge *br, struct sk_buff *skb, 529void br_flood(struct net_bridge *br, struct sk_buff *skb,
513 bool unicast, bool local_rcv, bool local_orig); 530 enum br_pkt_type pkt_type, bool local_rcv, bool local_orig);
514 531
515/* br_if.c */ 532/* br_if.c */
516void br_port_carrier_check(struct net_bridge_port *p); 533void br_port_carrier_check(struct net_bridge_port *p);
@@ -1034,4 +1051,29 @@ static inline int br_sysfs_addbr(struct net_device *dev) { return 0; }
1034static inline void br_sysfs_delbr(struct net_device *dev) { return; } 1051static inline void br_sysfs_delbr(struct net_device *dev) { return; }
1035#endif /* CONFIG_SYSFS */ 1052#endif /* CONFIG_SYSFS */
1036 1053
1054/* br_switchdev.c */
1055#ifdef CONFIG_NET_SWITCHDEV
1056int nbp_switchdev_mark_set(struct net_bridge_port *p);
1057void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
1058 struct sk_buff *skb);
1059bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
1060 const struct sk_buff *skb);
1061#else
1062static inline int nbp_switchdev_mark_set(struct net_bridge_port *p)
1063{
1064 return 0;
1065}
1066
1067static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
1068 struct sk_buff *skb)
1069{
1070}
1071
1072static inline bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
1073 const struct sk_buff *skb)
1074{
1075 return true;
1076}
1077#endif /* CONFIG_NET_SWITCHDEV */
1078
1037#endif 1079#endif
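The br_flood() prototype change in br_private.h above (a bool unicast flag becomes enum br_pkt_type) lets the flood path tell broadcast apart from other multicast traffic, which the new per-port BR_MCAST_FLOOD knob needs. A minimal sketch of how a caller classifies a frame before flooding; br_flood_classified() is a hypothetical wrapper (in the tree the classification happens in br_handle_frame_finish(), not shown here), the address tests are the stock etherdevice.h helpers.

#include <linux/etherdevice.h>
#include "br_private.h"

static void br_flood_classified(struct net_bridge *br, struct sk_buff *skb,
				bool local_rcv)
{
	const unsigned char *dest = eth_hdr(skb)->h_dest;
	enum br_pkt_type pkt_type = BR_PKT_UNICAST;

	if (is_multicast_ether_addr(dest)) {
		/* broadcast is the all-ones special case of multicast */
		if (is_broadcast_ether_addr(dest))
			pkt_type = BR_PKT_BROADCAST;
		else
			pkt_type = BR_PKT_MULTICAST;
	}

	br_flood(br, skb, pkt_type, local_rcv, false);
}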
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 341caa0ca63a..d8ad73b38de2 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -134,17 +134,36 @@ void br_stp_disable_port(struct net_bridge_port *p)
134 br_become_root_bridge(br); 134 br_become_root_bridge(br);
135} 135}
136 136
137static void br_stp_start(struct net_bridge *br) 137static int br_stp_call_user(struct net_bridge *br, char *arg)
138{ 138{
139 int r; 139 char *argv[] = { BR_STP_PROG, br->dev->name, arg, NULL };
140 char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL };
141 char *envp[] = { NULL }; 140 char *envp[] = { NULL };
141 int rc;
142
143 /* call userspace STP and report program errors */
144 rc = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
145 if (rc > 0) {
146 if (rc & 0xff)
147 br_debug(br, BR_STP_PROG " received signal %d\n",
148 rc & 0x7f);
149 else
150 br_debug(br, BR_STP_PROG " exited with code %d\n",
151 (rc >> 8) & 0xff);
152 }
153
154 return rc;
155}
156
157static void br_stp_start(struct net_bridge *br)
158{
142 struct net_bridge_port *p; 159 struct net_bridge_port *p;
160 int err = -ENOENT;
143 161
144 if (net_eq(dev_net(br->dev), &init_net)) 162 if (net_eq(dev_net(br->dev), &init_net))
145 r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); 163 err = br_stp_call_user(br, "start");
146 else 164
147 r = -ENOENT; 165 if (err && err != -ENOENT)
166 br_err(br, "failed to start userspace STP (%d)\n", err);
148 167
149 spin_lock_bh(&br->lock); 168 spin_lock_bh(&br->lock);
150 169
@@ -153,9 +172,10 @@ static void br_stp_start(struct net_bridge *br)
153 else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY) 172 else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY)
154 __br_set_forward_delay(br, BR_MAX_FORWARD_DELAY); 173 __br_set_forward_delay(br, BR_MAX_FORWARD_DELAY);
155 174
156 if (r == 0) { 175 if (!err) {
157 br->stp_enabled = BR_USER_STP; 176 br->stp_enabled = BR_USER_STP;
158 br_debug(br, "userspace STP started\n"); 177 br_debug(br, "userspace STP started\n");
178
159 /* Stop hello and hold timers */ 179 /* Stop hello and hold timers */
160 del_timer(&br->hello_timer); 180 del_timer(&br->hello_timer);
161 list_for_each_entry(p, &br->port_list, list) 181 list_for_each_entry(p, &br->port_list, list)
@@ -173,14 +193,13 @@ static void br_stp_start(struct net_bridge *br)
173 193
174static void br_stp_stop(struct net_bridge *br) 194static void br_stp_stop(struct net_bridge *br)
175{ 195{
176 int r;
177 char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL };
178 char *envp[] = { NULL };
179 struct net_bridge_port *p; 196 struct net_bridge_port *p;
197 int err;
180 198
181 if (br->stp_enabled == BR_USER_STP) { 199 if (br->stp_enabled == BR_USER_STP) {
182 r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); 200 err = br_stp_call_user(br, "stop");
183 br_info(br, "userspace STP stopped, return code %d\n", r); 201 if (err)
202 br_err(br, "failed to stop userspace STP (%d)\n", err);
184 203
185 /* To start timers on any ports left in blocking */ 204 /* To start timers on any ports left in blocking */
186 mod_timer(&br->hello_timer, jiffies + br->hello_time); 205 mod_timer(&br->hello_timer, jiffies + br->hello_time);
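br_stp_call_user() runs BR_STP_PROG ("/sbin/bridge-stp") with the bridge name and "start"/"stop", and the rc > 0 branch splits call_usermodehelper()'s wait-style status into a signal (low 7 bits) and an exit code (next byte). A sketch of a minimal userspace stand-in for that program, assuming the usual bridge-stp convention that exit status 0 means "I will run STP for this bridge", which is what lets br_stp_start() switch to BR_USER_STP.

#include <stdio.h>
#include <string.h>

/* argv matches br_stp_call_user(): argv[1] = bridge name, argv[2] = arg */
int main(int argc, char **argv)
{
	if (argc != 3)
		return 1;

	if (strcmp(argv[2], "start") == 0)
		fprintf(stderr, "bridge-stp: taking over STP on %s\n", argv[1]);
	else if (strcmp(argv[2], "stop") == 0)
		fprintf(stderr, "bridge-stp: releasing STP on %s\n", argv[1]);
	else
		return 1;

	return 0;	/* non-zero here would make the kernel keep kernel STP */
}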
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
new file mode 100644
index 000000000000..f4097b900de1
--- /dev/null
+++ b/net/bridge/br_switchdev.c
@@ -0,0 +1,57 @@
1#include <linux/kernel.h>
2#include <linux/list.h>
3#include <linux/netdevice.h>
4#include <linux/rtnetlink.h>
5#include <linux/skbuff.h>
6#include <net/switchdev.h>
7
8#include "br_private.h"
9
10static int br_switchdev_mark_get(struct net_bridge *br, struct net_device *dev)
11{
12 struct net_bridge_port *p;
13
14 /* dev is yet to be added to the port list. */
15 list_for_each_entry(p, &br->port_list, list) {
16 if (switchdev_port_same_parent_id(dev, p->dev))
17 return p->offload_fwd_mark;
18 }
19
20 return ++br->offload_fwd_mark;
21}
22
23int nbp_switchdev_mark_set(struct net_bridge_port *p)
24{
25 struct switchdev_attr attr = {
26 .orig_dev = p->dev,
27 .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
28 };
29 int err;
30
31 ASSERT_RTNL();
32
33 err = switchdev_port_attr_get(p->dev, &attr);
34 if (err) {
35 if (err == -EOPNOTSUPP)
36 return 0;
37 return err;
38 }
39
40 p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev);
41
42 return 0;
43}
44
45void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
46 struct sk_buff *skb)
47{
48 if (skb->offload_fwd_mark && !WARN_ON_ONCE(!p->offload_fwd_mark))
49 BR_INPUT_SKB_CB(skb)->offload_fwd_mark = p->offload_fwd_mark;
50}
51
52bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
53 const struct sk_buff *skb)
54{
55 return !skb->offload_fwd_mark ||
56 BR_INPUT_SKB_CB(skb)->offload_fwd_mark != p->offload_fwd_mark;
57}
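br_switchdev.c gives every port that shares a switchdev parent the same offload_fwd_mark; on ingress nbp_switchdev_frame_mark() copies the port's mark into the skb control block, and nbp_switchdev_allowed_egress() then refuses software forwarding to ports the ASIC has already served. A sketch of how the egress check is consumed, assuming it is called from the bridge's deliver/flood path (the real caller, should_deliver() in br_forward.c, is not part of this hunk); br_port_should_flood() is a made-up name.

#include "br_private.h"

static bool br_port_should_flood(const struct net_bridge_port *p,
				 struct sk_buff *skb)
{
	/* the switch ASIC already forwarded this frame out of ports that
	 * carry the same offload_fwd_mark, so skip them in software */
	if (!nbp_switchdev_allowed_egress(p, skb))
		return false;

	return p->state == BR_STATE_FORWARDING;
}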
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 1e04d4d44273..8bd569695e76 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -171,6 +171,7 @@ BRPORT_ATTR_FLAG(learning, BR_LEARNING);
171BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); 171BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD);
172BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); 172BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP);
173BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); 173BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
174BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
174 175
175#ifdef CONFIG_BRIDGE_IGMP_SNOOPING 176#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
176static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) 177static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -216,6 +217,7 @@ static const struct brport_attribute *brport_attrs[] = {
216#endif 217#endif
217 &brport_attr_proxyarp, 218 &brport_attr_proxyarp,
218 &brport_attr_proxyarp_wifi, 219 &brport_attr_proxyarp_wifi,
220 &brport_attr_multicast_flood,
219 NULL 221 NULL
220}; 222};
221 223
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 152300d164ac..9a11086ba6ff 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -91,7 +91,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
91 if (loginfo->type == NF_LOG_TYPE_LOG) 91 if (loginfo->type == NF_LOG_TYPE_LOG)
92 bitmask = loginfo->u.log.logflags; 92 bitmask = loginfo->u.log.logflags;
93 else 93 else
94 bitmask = NF_LOG_MASK; 94 bitmask = NF_LOG_DEFAULT_MASK;
95 95
96 if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == 96 if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
97 htons(ETH_P_IP)) { 97 htons(ETH_P_IP)) {
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 203964997a51..2e7c4f974340 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -24,7 +24,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
24 return EBT_DROP; 24 return EBT_DROP;
25 25
26 if (par->hooknum != NF_BR_BROUTING) 26 if (par->hooknum != NF_BR_BROUTING)
27 /* rcu_read_lock()ed by nf_hook_slow */ 27 /* rcu_read_lock()ed by nf_hook_thresh */
28 ether_addr_copy(eth_hdr(skb)->h_dest, 28 ether_addr_copy(eth_hdr(skb)->h_dest,
29 br_port_get_rcu(par->in)->br->dev->dev_addr); 29 br_port_get_rcu(par->in)->br->dev->dev_addr);
30 else 30 else
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 0833c251aef7..f5c11bbe27db 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -146,7 +146,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
146 return 1; 146 return 1;
147 if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out))) 147 if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out)))
148 return 1; 148 return 1;
149 /* rcu_read_lock()ed by nf_hook_slow */ 149 /* rcu_read_lock()ed by nf_hook_thresh */
150 if (in && (p = br_port_get_rcu(in)) != NULL && 150 if (in && (p = br_port_get_rcu(in)) != NULL &&
151 NF_INVF(e, EBT_ILOGICALIN, 151 NF_INVF(e, EBT_ILOGICALIN,
152 ebt_dev_check(e->logical_in, p->br->dev))) 152 ebt_dev_check(e->logical_in, p->br->dev)))
diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c
index 5d9953a90929..1663df598545 100644
--- a/net/bridge/netfilter/nf_log_bridge.c
+++ b/net/bridge/netfilter/nf_log_bridge.c
@@ -50,8 +50,7 @@ static struct nf_logger nf_bridge_logger __read_mostly = {
50 50
51static int __net_init nf_log_bridge_net_init(struct net *net) 51static int __net_init nf_log_bridge_net_init(struct net *net)
52{ 52{
53 nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); 53 return nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger);
54 return 0;
55} 54}
56 55
57static void __net_exit nf_log_bridge_net_exit(struct net *net) 56static void __net_exit nf_log_bridge_net_exit(struct net *net)
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index a78c4e2826e5..97afdc0744e6 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -13,79 +13,11 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/netfilter_bridge.h> 14#include <linux/netfilter_bridge.h>
15#include <net/netfilter/nf_tables.h> 15#include <net/netfilter/nf_tables.h>
16#include <net/netfilter/nf_tables_bridge.h>
17#include <linux/ip.h> 16#include <linux/ip.h>
18#include <linux/ipv6.h> 17#include <linux/ipv6.h>
19#include <net/netfilter/nf_tables_ipv4.h> 18#include <net/netfilter/nf_tables_ipv4.h>
20#include <net/netfilter/nf_tables_ipv6.h> 19#include <net/netfilter/nf_tables_ipv6.h>
21 20
22int nft_bridge_iphdr_validate(struct sk_buff *skb)
23{
24 struct iphdr *iph;
25 u32 len;
26
27 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
28 return 0;
29
30 iph = ip_hdr(skb);
31 if (iph->ihl < 5 || iph->version != 4)
32 return 0;
33
34 len = ntohs(iph->tot_len);
35 if (skb->len < len)
36 return 0;
37 else if (len < (iph->ihl*4))
38 return 0;
39
40 if (!pskb_may_pull(skb, iph->ihl*4))
41 return 0;
42
43 return 1;
44}
45EXPORT_SYMBOL_GPL(nft_bridge_iphdr_validate);
46
47int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
48{
49 struct ipv6hdr *hdr;
50 u32 pkt_len;
51
52 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
53 return 0;
54
55 hdr = ipv6_hdr(skb);
56 if (hdr->version != 6)
57 return 0;
58
59 pkt_len = ntohs(hdr->payload_len);
60 if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
61 return 0;
62
63 return 1;
64}
65EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate);
66
67static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
68 struct sk_buff *skb,
69 const struct nf_hook_state *state)
70{
71 if (nft_bridge_iphdr_validate(skb))
72 nft_set_pktinfo_ipv4(pkt, skb, state);
73 else
74 nft_set_pktinfo(pkt, skb, state);
75}
76
77static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
78 struct sk_buff *skb,
79 const struct nf_hook_state *state)
80{
81#if IS_ENABLED(CONFIG_IPV6)
82 if (nft_bridge_ip6hdr_validate(skb) &&
83 nft_set_pktinfo_ipv6(pkt, skb, state) == 0)
84 return;
85#endif
86 nft_set_pktinfo(pkt, skb, state);
87}
88
89static unsigned int 21static unsigned int
90nft_do_chain_bridge(void *priv, 22nft_do_chain_bridge(void *priv,
91 struct sk_buff *skb, 23 struct sk_buff *skb,
@@ -95,13 +27,13 @@ nft_do_chain_bridge(void *priv,
95 27
96 switch (eth_hdr(skb)->h_proto) { 28 switch (eth_hdr(skb)->h_proto) {
97 case htons(ETH_P_IP): 29 case htons(ETH_P_IP):
98 nft_bridge_set_pktinfo_ipv4(&pkt, skb, state); 30 nft_set_pktinfo_ipv4_validate(&pkt, skb, state);
99 break; 31 break;
100 case htons(ETH_P_IPV6): 32 case htons(ETH_P_IPV6):
101 nft_bridge_set_pktinfo_ipv6(&pkt, skb, state); 33 nft_set_pktinfo_ipv6_validate(&pkt, skb, state);
102 break; 34 break;
103 default: 35 default:
104 nft_set_pktinfo(&pkt, skb, state); 36 nft_set_pktinfo_unspec(&pkt, skb, state);
105 break; 37 break;
106 } 38 }
107 39
@@ -207,12 +139,20 @@ static int __init nf_tables_bridge_init(void)
207 int ret; 139 int ret;
208 140
209 nf_register_afinfo(&nf_br_afinfo); 141 nf_register_afinfo(&nf_br_afinfo);
210 nft_register_chain_type(&filter_bridge); 142 ret = nft_register_chain_type(&filter_bridge);
143 if (ret < 0)
144 goto err1;
145
211 ret = register_pernet_subsys(&nf_tables_bridge_net_ops); 146 ret = register_pernet_subsys(&nf_tables_bridge_net_ops);
212 if (ret < 0) { 147 if (ret < 0)
213 nft_unregister_chain_type(&filter_bridge); 148 goto err2;
214 nf_unregister_afinfo(&nf_br_afinfo); 149
215 } 150 return ret;
151
152err2:
153 nft_unregister_chain_type(&filter_bridge);
154err1:
155 nf_unregister_afinfo(&nf_br_afinfo);
216 return ret; 156 return ret;
217} 157}
218 158
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 0b77ffbc27d6..4b3df6b0e3b9 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -14,7 +14,6 @@
14#include <linux/netfilter/nf_tables.h> 14#include <linux/netfilter/nf_tables.h>
15#include <net/netfilter/nf_tables.h> 15#include <net/netfilter/nf_tables.h>
16#include <net/netfilter/nft_reject.h> 16#include <net/netfilter/nft_reject.h>
17#include <net/netfilter/nf_tables_bridge.h>
18#include <net/netfilter/ipv4/nf_reject.h> 17#include <net/netfilter/ipv4/nf_reject.h>
19#include <net/netfilter/ipv6/nf_reject.h> 18#include <net/netfilter/ipv6/nf_reject.h>
20#include <linux/ip.h> 19#include <linux/ip.h>
@@ -37,6 +36,30 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
37 skb_pull(nskb, ETH_HLEN); 36 skb_pull(nskb, ETH_HLEN);
38} 37}
39 38
39static int nft_bridge_iphdr_validate(struct sk_buff *skb)
40{
41 struct iphdr *iph;
42 u32 len;
43
44 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
45 return 0;
46
47 iph = ip_hdr(skb);
48 if (iph->ihl < 5 || iph->version != 4)
49 return 0;
50
51 len = ntohs(iph->tot_len);
52 if (skb->len < len)
53 return 0;
54 else if (len < (iph->ihl*4))
55 return 0;
56
57 if (!pskb_may_pull(skb, iph->ihl*4))
58 return 0;
59
60 return 1;
61}
62
40/* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT) 63/* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT)
41 * or the bridge port (NF_BRIDGE PREROUTING). 64 * or the bridge port (NF_BRIDGE PREROUTING).
42 */ 65 */
@@ -143,6 +166,25 @@ static void nft_reject_br_send_v4_unreach(struct net *net,
143 br_forward(br_port_get_rcu(dev), nskb, false, true); 166 br_forward(br_port_get_rcu(dev), nskb, false, true);
144} 167}
145 168
169static int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
170{
171 struct ipv6hdr *hdr;
172 u32 pkt_len;
173
174 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
175 return 0;
176
177 hdr = ipv6_hdr(skb);
178 if (hdr->version != 6)
179 return 0;
180
181 pkt_len = ntohs(hdr->payload_len);
182 if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
183 return 0;
184
185 return 1;
186}
187
146static void nft_reject_br_send_v6_tcp_reset(struct net *net, 188static void nft_reject_br_send_v6_tcp_reset(struct net *net,
147 struct sk_buff *oldskb, 189 struct sk_buff *oldskb,
148 const struct net_device *dev, 190 const struct net_device *dev,
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index 84cbed630c4b..6a5180903e7b 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_CEPH_LIB) += libceph.o
5 5
6libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ 6libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
7 mon_client.o \ 7 mon_client.o \
8 cls_lock_client.o \
8 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ 9 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
9 debugfs.o \ 10 debugfs.o \
10 auth.o auth_none.o \ 11 auth.o auth_none.o \
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index 2bc5965fdd1e..c822b3ae1bd3 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -82,7 +82,10 @@ void ceph_auth_reset(struct ceph_auth_client *ac)
82 mutex_unlock(&ac->mutex); 82 mutex_unlock(&ac->mutex);
83} 83}
84 84
85int ceph_entity_name_encode(const char *name, void **p, void *end) 85/*
86 * EntityName, not to be confused with entity_name_t
87 */
88int ceph_auth_entity_name_encode(const char *name, void **p, void *end)
86{ 89{
87 int len = strlen(name); 90 int len = strlen(name);
88 91
@@ -111,7 +114,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
111 monhdr->session_mon = cpu_to_le16(-1); 114 monhdr->session_mon = cpu_to_le16(-1);
112 monhdr->session_mon_tid = 0; 115 monhdr->session_mon_tid = 0;
113 116
114 ceph_encode_32(&p, 0); /* no protocol, yet */ 117 ceph_encode_32(&p, CEPH_AUTH_UNKNOWN); /* no protocol, yet */
115 118
116 lenp = p; 119 lenp = p;
117 p += sizeof(u32); 120 p += sizeof(u32);
@@ -124,7 +127,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
124 for (i = 0; i < num; i++) 127 for (i = 0; i < num; i++)
125 ceph_encode_32(&p, supported_protocols[i]); 128 ceph_encode_32(&p, supported_protocols[i]);
126 129
127 ret = ceph_entity_name_encode(ac->name, &p, end); 130 ret = ceph_auth_entity_name_encode(ac->name, &p, end);
128 if (ret < 0) 131 if (ret < 0)
129 goto out; 132 goto out;
130 ceph_decode_need(&p, end, sizeof(u64), bad); 133 ceph_decode_need(&p, end, sizeof(u64), bad);
@@ -259,9 +262,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
259 int ret = 0; 262 int ret = 0;
260 263
261 mutex_lock(&ac->mutex); 264 mutex_lock(&ac->mutex);
262 if (!ac->protocol) 265 if (ac->ops->should_authenticate(ac))
263 ret = ceph_auth_build_hello(ac, msg_buf, msg_len);
264 else if (ac->ops->should_authenticate(ac))
265 ret = ceph_build_auth_request(ac, msg_buf, msg_len); 266 ret = ceph_build_auth_request(ac, msg_buf, msg_len);
266 mutex_unlock(&ac->mutex); 267 mutex_unlock(&ac->mutex);
267 return ret; 268 return ret;
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index 5f836f02ae36..df45e467c81f 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -46,7 +46,7 @@ static int ceph_auth_none_build_authorizer(struct ceph_auth_client *ac,
46 int ret; 46 int ret;
47 47
48 ceph_encode_8_safe(&p, end, 1, e_range); 48 ceph_encode_8_safe(&p, end, 1, e_range);
49 ret = ceph_entity_name_encode(ac->name, &p, end); 49 ret = ceph_auth_entity_name_encode(ac->name, &p, end);
50 if (ret < 0) 50 if (ret < 0)
51 return ret; 51 return ret;
52 52
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index bddfcf6f09c2..464e88599b9d 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -566,11 +566,17 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
566} 566}
567EXPORT_SYMBOL(ceph_print_client_options); 567EXPORT_SYMBOL(ceph_print_client_options);
568 568
569u64 ceph_client_id(struct ceph_client *client) 569struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client)
570{
571 return &client->msgr.inst.addr;
572}
573EXPORT_SYMBOL(ceph_client_addr);
574
575u64 ceph_client_gid(struct ceph_client *client)
570{ 576{
571 return client->monc.auth->global_id; 577 return client->monc.auth->global_id;
572} 578}
573EXPORT_SYMBOL(ceph_client_id); 579EXPORT_SYMBOL(ceph_client_gid);
574 580
575/* 581/*
576 * create a fresh client instance 582 * create a fresh client instance
@@ -685,7 +691,8 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
685 return client->auth_err; 691 return client->auth_err;
686 } 692 }
687 693
688 pr_info("client%llu fsid %pU\n", ceph_client_id(client), &client->fsid); 694 pr_info("client%llu fsid %pU\n", ceph_client_gid(client),
695 &client->fsid);
689 ceph_debugfs_client_init(client); 696 ceph_debugfs_client_init(client);
690 697
691 return 0; 698 return 0;
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 3773a4fa11e3..19b7d8aa915c 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -15,6 +15,7 @@ const char *ceph_entity_type_name(int type)
15 default: return "unknown"; 15 default: return "unknown";
16 } 16 }
17} 17}
18EXPORT_SYMBOL(ceph_entity_type_name);
18 19
19const char *ceph_osd_op_name(int op) 20const char *ceph_osd_op_name(int op)
20{ 21{
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
new file mode 100644
index 000000000000..50f040fdb2a9
--- /dev/null
+++ b/net/ceph/cls_lock_client.c
@@ -0,0 +1,325 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/types.h>
4#include <linux/slab.h>
5
6#include <linux/ceph/cls_lock_client.h>
7#include <linux/ceph/decode.h>
8
9/**
10 * ceph_cls_lock - grab rados lock for object
11 * @oid, @oloc: object to lock
12 * @lock_name: the name of the lock
13 * @type: lock type (CEPH_CLS_LOCK_EXCLUSIVE or CEPH_CLS_LOCK_SHARED)
14 * @cookie: user-defined identifier for this instance of the lock
15 * @tag: user-defined tag
16 * @desc: user-defined lock description
17 * @flags: lock flags
18 *
19 * All operations on the same lock should use the same tag.
20 */
21int ceph_cls_lock(struct ceph_osd_client *osdc,
22 struct ceph_object_id *oid,
23 struct ceph_object_locator *oloc,
24 char *lock_name, u8 type, char *cookie,
25 char *tag, char *desc, u8 flags)
26{
27 int lock_op_buf_size;
28 int name_len = strlen(lock_name);
29 int cookie_len = strlen(cookie);
30 int tag_len = strlen(tag);
31 int desc_len = strlen(desc);
32 void *p, *end;
33 struct page *lock_op_page;
34 struct timespec mtime;
35 int ret;
36
37 lock_op_buf_size = name_len + sizeof(__le32) +
38 cookie_len + sizeof(__le32) +
39 tag_len + sizeof(__le32) +
40 desc_len + sizeof(__le32) +
41 sizeof(struct ceph_timespec) +
42 /* flag and type */
43 sizeof(u8) + sizeof(u8) +
44 CEPH_ENCODING_START_BLK_LEN;
45 if (lock_op_buf_size > PAGE_SIZE)
46 return -E2BIG;
47
48 lock_op_page = alloc_page(GFP_NOIO);
49 if (!lock_op_page)
50 return -ENOMEM;
51
52 p = page_address(lock_op_page);
53 end = p + lock_op_buf_size;
54
55 /* encode cls_lock_lock_op struct */
56 ceph_start_encoding(&p, 1, 1,
57 lock_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
58 ceph_encode_string(&p, end, lock_name, name_len);
59 ceph_encode_8(&p, type);
60 ceph_encode_string(&p, end, cookie, cookie_len);
61 ceph_encode_string(&p, end, tag, tag_len);
62 ceph_encode_string(&p, end, desc, desc_len);
63 /* only support infinite duration */
64 memset(&mtime, 0, sizeof(mtime));
65 ceph_encode_timespec(p, &mtime);
66 p += sizeof(struct ceph_timespec);
67 ceph_encode_8(&p, flags);
68
69 dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
70 __func__, lock_name, type, cookie, tag, desc, flags);
71 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
72 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
73 lock_op_page, lock_op_buf_size, NULL, NULL);
74
75 dout("%s: status %d\n", __func__, ret);
76 __free_page(lock_op_page);
77 return ret;
78}
79EXPORT_SYMBOL(ceph_cls_lock);
80
81/**
82 * ceph_cls_unlock - release rados lock for object
83 * @oid, @oloc: object to lock
84 * @lock_name: the name of the lock
85 * @cookie: user-defined identifier for this instance of the lock
86 */
87int ceph_cls_unlock(struct ceph_osd_client *osdc,
88 struct ceph_object_id *oid,
89 struct ceph_object_locator *oloc,
90 char *lock_name, char *cookie)
91{
92 int unlock_op_buf_size;
93 int name_len = strlen(lock_name);
94 int cookie_len = strlen(cookie);
95 void *p, *end;
96 struct page *unlock_op_page;
97 int ret;
98
99 unlock_op_buf_size = name_len + sizeof(__le32) +
100 cookie_len + sizeof(__le32) +
101 CEPH_ENCODING_START_BLK_LEN;
102 if (unlock_op_buf_size > PAGE_SIZE)
103 return -E2BIG;
104
105 unlock_op_page = alloc_page(GFP_NOIO);
106 if (!unlock_op_page)
107 return -ENOMEM;
108
109 p = page_address(unlock_op_page);
110 end = p + unlock_op_buf_size;
111
112 /* encode cls_lock_unlock_op struct */
113 ceph_start_encoding(&p, 1, 1,
114 unlock_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
115 ceph_encode_string(&p, end, lock_name, name_len);
116 ceph_encode_string(&p, end, cookie, cookie_len);
117
118 dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
119 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
120 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
121 unlock_op_page, unlock_op_buf_size, NULL, NULL);
122
123 dout("%s: status %d\n", __func__, ret);
124 __free_page(unlock_op_page);
125 return ret;
126}
127EXPORT_SYMBOL(ceph_cls_unlock);
128
129/**
130 * ceph_cls_break_lock - release rados lock for object for specified client
131 * @oid, @oloc: object to lock
132 * @lock_name: the name of the lock
133 * @cookie: user-defined identifier for this instance of the lock
134 * @locker: current lock owner
135 */
136int ceph_cls_break_lock(struct ceph_osd_client *osdc,
137 struct ceph_object_id *oid,
138 struct ceph_object_locator *oloc,
139 char *lock_name, char *cookie,
140 struct ceph_entity_name *locker)
141{
142 int break_op_buf_size;
143 int name_len = strlen(lock_name);
144 int cookie_len = strlen(cookie);
145 struct page *break_op_page;
146 void *p, *end;
147 int ret;
148
149 break_op_buf_size = name_len + sizeof(__le32) +
150 cookie_len + sizeof(__le32) +
151 sizeof(u8) + sizeof(__le64) +
152 CEPH_ENCODING_START_BLK_LEN;
153 if (break_op_buf_size > PAGE_SIZE)
154 return -E2BIG;
155
156 break_op_page = alloc_page(GFP_NOIO);
157 if (!break_op_page)
158 return -ENOMEM;
159
160 p = page_address(break_op_page);
161 end = p + break_op_buf_size;
162
163 /* encode cls_lock_break_op struct */
164 ceph_start_encoding(&p, 1, 1,
165 break_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
166 ceph_encode_string(&p, end, lock_name, name_len);
167 ceph_encode_copy(&p, locker, sizeof(*locker));
168 ceph_encode_string(&p, end, cookie, cookie_len);
169
170 dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
171 cookie, ENTITY_NAME(*locker));
172 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
173 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
174 break_op_page, break_op_buf_size, NULL, NULL);
175
176 dout("%s: status %d\n", __func__, ret);
177 __free_page(break_op_page);
178 return ret;
179}
180EXPORT_SYMBOL(ceph_cls_break_lock);
181
182void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers)
183{
184 int i;
185
186 for (i = 0; i < num_lockers; i++)
187 kfree(lockers[i].id.cookie);
188 kfree(lockers);
189}
190EXPORT_SYMBOL(ceph_free_lockers);
191
192static int decode_locker(void **p, void *end, struct ceph_locker *locker)
193{
194 u8 struct_v;
195 u32 len;
196 char *s;
197 int ret;
198
199 ret = ceph_start_decoding(p, end, 1, "locker_id_t", &struct_v, &len);
200 if (ret)
201 return ret;
202
203 ceph_decode_copy(p, &locker->id.name, sizeof(locker->id.name));
204 s = ceph_extract_encoded_string(p, end, NULL, GFP_NOIO);
205 if (IS_ERR(s))
206 return PTR_ERR(s);
207
208 locker->id.cookie = s;
209
210 ret = ceph_start_decoding(p, end, 1, "locker_info_t", &struct_v, &len);
211 if (ret)
212 return ret;
213
214 *p += sizeof(struct ceph_timespec); /* skip expiration */
215 ceph_decode_copy(p, &locker->info.addr, sizeof(locker->info.addr));
216 ceph_decode_addr(&locker->info.addr);
217 len = ceph_decode_32(p);
218 *p += len; /* skip description */
219
220 dout("%s %s%llu cookie %s addr %s\n", __func__,
221 ENTITY_NAME(locker->id.name), locker->id.cookie,
222 ceph_pr_addr(&locker->info.addr.in_addr));
223 return 0;
224}
225
226static int decode_lockers(void **p, void *end, u8 *type, char **tag,
227 struct ceph_locker **lockers, u32 *num_lockers)
228{
229 u8 struct_v;
230 u32 struct_len;
231 char *s;
232 int i;
233 int ret;
234
235 ret = ceph_start_decoding(p, end, 1, "cls_lock_get_info_reply",
236 &struct_v, &struct_len);
237 if (ret)
238 return ret;
239
240 *num_lockers = ceph_decode_32(p);
241 *lockers = kcalloc(*num_lockers, sizeof(**lockers), GFP_NOIO);
242 if (!*lockers)
243 return -ENOMEM;
244
245 for (i = 0; i < *num_lockers; i++) {
246 ret = decode_locker(p, end, *lockers + i);
247 if (ret)
248 goto err_free_lockers;
249 }
250
251 *type = ceph_decode_8(p);
252 s = ceph_extract_encoded_string(p, end, NULL, GFP_NOIO);
253 if (IS_ERR(s)) {
254 ret = PTR_ERR(s);
255 goto err_free_lockers;
256 }
257
258 *tag = s;
259 return 0;
260
261err_free_lockers:
262 ceph_free_lockers(*lockers, *num_lockers);
263 return ret;
264}
265
266/*
267 * On success, the caller is responsible for:
268 *
269 * kfree(tag);
270 * ceph_free_lockers(lockers, num_lockers);
271 */
272int ceph_cls_lock_info(struct ceph_osd_client *osdc,
273 struct ceph_object_id *oid,
274 struct ceph_object_locator *oloc,
275 char *lock_name, u8 *type, char **tag,
276 struct ceph_locker **lockers, u32 *num_lockers)
277{
278 int get_info_op_buf_size;
279 int name_len = strlen(lock_name);
280 struct page *get_info_op_page, *reply_page;
281 size_t reply_len;
282 void *p, *end;
283 int ret;
284
285 get_info_op_buf_size = name_len + sizeof(__le32) +
286 CEPH_ENCODING_START_BLK_LEN;
287 if (get_info_op_buf_size > PAGE_SIZE)
288 return -E2BIG;
289
290 get_info_op_page = alloc_page(GFP_NOIO);
291 if (!get_info_op_page)
292 return -ENOMEM;
293
294 reply_page = alloc_page(GFP_NOIO);
295 if (!reply_page) {
296 __free_page(get_info_op_page);
297 return -ENOMEM;
298 }
299
300 p = page_address(get_info_op_page);
301 end = p + get_info_op_buf_size;
302
303 /* encode cls_lock_get_info_op struct */
304 ceph_start_encoding(&p, 1, 1,
305 get_info_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
306 ceph_encode_string(&p, end, lock_name, name_len);
307
308 dout("%s lock_name %s\n", __func__, lock_name);
309 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info",
310 CEPH_OSD_FLAG_READ, get_info_op_page,
311 get_info_op_buf_size, reply_page, &reply_len);
312
313 dout("%s: status %d\n", __func__, ret);
314 if (ret >= 0) {
315 p = page_address(reply_page);
316 end = p + reply_len;
317
318 ret = decode_lockers(&p, end, type, tag, lockers, num_lockers);
319 }
320
321 __free_page(get_info_op_page);
322 __free_page(reply_page);
323 return ret;
324}
325EXPORT_SYMBOL(ceph_cls_lock_info);
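cls_lock_client.c wraps the OSD "lock" object class: ceph_cls_lock()/ceph_cls_unlock() take and drop an advisory lock on an object, ceph_cls_break_lock() evicts a named locker, and ceph_cls_lock_info() lists the current holders. A hedged usage sketch follows, in the style of an rbd exclusive-lock client; the function name, the "rbd_lock" lock name and the cookie string are illustrative only.

#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/osd_client.h>

static int example_lock_header(struct ceph_osd_client *osdc,
			       struct ceph_object_id *oid,
			       struct ceph_object_locator *oloc)
{
	int ret;

	ret = ceph_cls_lock(osdc, oid, oloc, "rbd_lock",
			    CEPH_CLS_LOCK_EXCLUSIVE, "example cookie",
			    "", "", 0);
	if (ret)
		return ret;	/* typically -EBUSY if another client holds it */

	/* ... exclusive work on the object ... */

	return ceph_cls_unlock(osdc, oid, oloc, "rbd_lock", "example cookie");
}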
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 5fcfb98f309e..a421e905331a 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -245,7 +245,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
245/* compute 2^44*log2(input+1) */ 245/* compute 2^44*log2(input+1) */
246static __u64 crush_ln(unsigned int xin) 246static __u64 crush_ln(unsigned int xin)
247{ 247{
248 unsigned int x = xin, x1; 248 unsigned int x = xin;
249 int iexpon, index1, index2; 249 int iexpon, index1, index2;
250 __u64 RH, LH, LL, xl64, result; 250 __u64 RH, LH, LL, xl64, result;
251 251
@@ -253,9 +253,15 @@ static __u64 crush_ln(unsigned int xin)
253 253
254 /* normalize input */ 254 /* normalize input */
255 iexpon = 15; 255 iexpon = 15;
256 while (!(x & 0x18000)) { 256
257 x <<= 1; 257 /*
258 iexpon--; 258 * figure out number of bits we need to shift and
259 * do it in one step instead of iteratively
260 */
261 if (!(x & 0x18000)) {
262 int bits = __builtin_clz(x & 0x1FFFF) - 16;
263 x <<= bits;
264 iexpon = 15 - bits;
259 } 265 }
260 266
261 index1 = (x >> 8) << 1; 267 index1 = (x >> 8) << 1;
@@ -267,12 +273,11 @@ static __u64 crush_ln(unsigned int xin)
267 /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */ 273 /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
268 xl64 = (__s64)x * RH; 274 xl64 = (__s64)x * RH;
269 xl64 >>= 48; 275 xl64 >>= 48;
270 x1 = xl64;
271 276
272 result = iexpon; 277 result = iexpon;
273 result <<= (12 + 32); 278 result <<= (12 + 32);
274 279
275 index2 = x1 & 0xff; 280 index2 = xl64 & 0xff;
276 /* LL ~ 2^48*log2(1.0+index2/2^15) */ 281 /* LL ~ 2^48*log2(1.0+index2/2^15) */
277 LL = __LL_tbl[index2]; 282 LL = __LL_tbl[index2];
278 283
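The straw2 change above replaces the normalize-by-looping shift in crush_ln() with one shift computed from __builtin_clz() of the low 17 bits. For every value crush_ln() can actually see (xin is at most 0xffff and is incremented before use, so 1 <= x <= 0x10000) the two forms agree; a small self-contained userspace check, written for this note rather than taken from the kernel tree:

#include <assert.h>

static unsigned int normalize_loop(unsigned int x, int *iexpon)
{
	*iexpon = 15;
	while (!(x & 0x18000)) {
		x <<= 1;
		(*iexpon)--;
	}
	return x;
}

static unsigned int normalize_clz(unsigned int x, int *iexpon)
{
	*iexpon = 15;
	if (!(x & 0x18000)) {
		int bits = __builtin_clz(x & 0x1FFFF) - 16;

		x <<= bits;
		*iexpon = 15 - bits;
	}
	return x;
}

int main(void)
{
	/* exercise the whole input range crush_ln() is called with */
	for (unsigned int x = 1; x <= 0x10000; x++) {
		int e1, e2;

		assert(normalize_loop(x, &e1) == normalize_clz(x, &e2));
		assert(e1 == e2);
	}
	return 0;
}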
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index ef34a02719d7..a8effc8b7280 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -835,6 +835,83 @@ int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
835} 835}
836EXPORT_SYMBOL(ceph_monc_get_version_async); 836EXPORT_SYMBOL(ceph_monc_get_version_async);
837 837
838static void handle_command_ack(struct ceph_mon_client *monc,
839 struct ceph_msg *msg)
840{
841 struct ceph_mon_generic_request *req;
842 void *p = msg->front.iov_base;
843 void *const end = p + msg->front_alloc_len;
844 u64 tid = le64_to_cpu(msg->hdr.tid);
845
846 dout("%s msg %p tid %llu\n", __func__, msg, tid);
847
848 ceph_decode_need(&p, end, sizeof(struct ceph_mon_request_header) +
849 sizeof(u32), bad);
850 p += sizeof(struct ceph_mon_request_header);
851
852 mutex_lock(&monc->mutex);
853 req = lookup_generic_request(&monc->generic_request_tree, tid);
854 if (!req) {
855 mutex_unlock(&monc->mutex);
856 return;
857 }
858
859 req->result = ceph_decode_32(&p);
860 __finish_generic_request(req);
861 mutex_unlock(&monc->mutex);
862
863 complete_generic_request(req);
864 return;
865
866bad:
867 pr_err("corrupt mon_command ack, tid %llu\n", tid);
868 ceph_msg_dump(msg);
869}
870
871int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
872 struct ceph_entity_addr *client_addr)
873{
874 struct ceph_mon_generic_request *req;
875 struct ceph_mon_command *h;
876 int ret = -ENOMEM;
877 int len;
878
879 req = alloc_generic_request(monc, GFP_NOIO);
880 if (!req)
881 goto out;
882
883 req->request = ceph_msg_new(CEPH_MSG_MON_COMMAND, 256, GFP_NOIO, true);
884 if (!req->request)
885 goto out;
886
887 req->reply = ceph_msg_new(CEPH_MSG_MON_COMMAND_ACK, 512, GFP_NOIO,
888 true);
889 if (!req->reply)
890 goto out;
891
892 mutex_lock(&monc->mutex);
893 register_generic_request(req);
894 h = req->request->front.iov_base;
895 h->monhdr.have_version = 0;
896 h->monhdr.session_mon = cpu_to_le16(-1);
897 h->monhdr.session_mon_tid = 0;
898 h->fsid = monc->monmap->fsid;
899 h->num_strs = cpu_to_le32(1);
900 len = sprintf(h->str, "{ \"prefix\": \"osd blacklist\", \
901 \"blacklistop\": \"add\", \
902 \"addr\": \"%pISpc/%u\" }",
903 &client_addr->in_addr, le32_to_cpu(client_addr->nonce));
904 h->str_len = cpu_to_le32(len);
905 send_generic_request(monc, req);
906 mutex_unlock(&monc->mutex);
907
908 ret = wait_generic_request(req);
909out:
910 put_generic_request(req);
911 return ret;
912}
913EXPORT_SYMBOL(ceph_monc_blacklist_add);
914
838/* 915/*
839 * Resend pending generic requests. 916 * Resend pending generic requests.
840 */ 917 */
@@ -1139,6 +1216,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1139 handle_get_version_reply(monc, msg); 1216 handle_get_version_reply(monc, msg);
1140 break; 1217 break;
1141 1218
1219 case CEPH_MSG_MON_COMMAND_ACK:
1220 handle_command_ack(monc, msg);
1221 break;
1222
1142 case CEPH_MSG_MON_MAP: 1223 case CEPH_MSG_MON_MAP:
1143 ceph_monc_handle_map(monc, msg); 1224 ceph_monc_handle_map(monc, msg);
1144 break; 1225 break;
@@ -1178,6 +1259,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
1178 m = ceph_msg_get(monc->m_subscribe_ack); 1259 m = ceph_msg_get(monc->m_subscribe_ack);
1179 break; 1260 break;
1180 case CEPH_MSG_STATFS_REPLY: 1261 case CEPH_MSG_STATFS_REPLY:
1262 case CEPH_MSG_MON_COMMAND_ACK:
1181 return get_generic_reply(con, hdr, skip); 1263 return get_generic_reply(con, hdr, skip);
1182 case CEPH_MSG_AUTH_REPLY: 1264 case CEPH_MSG_AUTH_REPLY:
1183 m = ceph_msg_get(monc->m_auth_reply); 1265 m = ceph_msg_get(monc->m_auth_reply);
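ceph_monc_blacklist_add() sends a MON command ("osd blacklist add") for a given entity address and waits for the CEPH_MSG_MON_COMMAND_ACK handled just above. A hedged sketch of the intended caller pattern, with a hypothetical function name; the address to blacklist would typically come from the locker info returned by ceph_cls_lock_info(), so that a dead lock holder can be fenced before its lock is broken.

#include <linux/ceph/libceph.h>
#include <linux/ceph/mon_client.h>

static int example_fence_peer(struct ceph_client *client,
			      struct ceph_entity_addr *peer_addr)
{
	int ret;

	ret = ceph_monc_blacklist_add(&client->monc, peer_addr);
	if (ret)
		return ret;

	/* the peer can no longer reach the OSDs, so breaking its lock
	 * with ceph_cls_break_lock() is now safe */
	return 0;
}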
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index a97e7b506612..d9bf7a1d0a58 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -338,6 +338,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
338 ceph_osd_data_release(&op->notify.request_data); 338 ceph_osd_data_release(&op->notify.request_data);
339 ceph_osd_data_release(&op->notify.response_data); 339 ceph_osd_data_release(&op->notify.response_data);
340 break; 340 break;
341 case CEPH_OSD_OP_LIST_WATCHERS:
342 ceph_osd_data_release(&op->list_watchers.response_data);
343 break;
341 default: 344 default:
342 break; 345 break;
343 } 346 }
@@ -863,6 +866,8 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
863 case CEPH_OSD_OP_NOTIFY: 866 case CEPH_OSD_OP_NOTIFY:
864 dst->notify.cookie = cpu_to_le64(src->notify.cookie); 867 dst->notify.cookie = cpu_to_le64(src->notify.cookie);
865 break; 868 break;
869 case CEPH_OSD_OP_LIST_WATCHERS:
870 break;
866 case CEPH_OSD_OP_SETALLOCHINT: 871 case CEPH_OSD_OP_SETALLOCHINT:
867 dst->alloc_hint.expected_object_size = 872 dst->alloc_hint.expected_object_size =
868 cpu_to_le64(src->alloc_hint.expected_object_size); 873 cpu_to_le64(src->alloc_hint.expected_object_size);
@@ -1445,6 +1450,10 @@ static void setup_request_data(struct ceph_osd_request *req,
1445 ceph_osdc_msg_data_add(req->r_reply, 1450 ceph_osdc_msg_data_add(req->r_reply,
1446 &op->extent.osd_data); 1451 &op->extent.osd_data);
1447 break; 1452 break;
1453 case CEPH_OSD_OP_LIST_WATCHERS:
1454 ceph_osdc_msg_data_add(req->r_reply,
1455 &op->list_watchers.response_data);
1456 break;
1448 1457
1449 /* both */ 1458 /* both */
1450 case CEPH_OSD_OP_CALL: 1459 case CEPH_OSD_OP_CALL:
@@ -3891,12 +3900,121 @@ int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
3891 return ret; 3900 return ret;
3892} 3901}
3893 3902
3903static int decode_watcher(void **p, void *end, struct ceph_watch_item *item)
3904{
3905 u8 struct_v;
3906 u32 struct_len;
3907 int ret;
3908
3909 ret = ceph_start_decoding(p, end, 2, "watch_item_t",
3910 &struct_v, &struct_len);
3911 if (ret)
3912 return ret;
3913
3914 ceph_decode_copy(p, &item->name, sizeof(item->name));
3915 item->cookie = ceph_decode_64(p);
3916 *p += 4; /* skip timeout_seconds */
3917 if (struct_v >= 2) {
3918 ceph_decode_copy(p, &item->addr, sizeof(item->addr));
3919 ceph_decode_addr(&item->addr);
3920 }
3921
3922 dout("%s %s%llu cookie %llu addr %s\n", __func__,
3923 ENTITY_NAME(item->name), item->cookie,
3924 ceph_pr_addr(&item->addr.in_addr));
3925 return 0;
3926}
3927
3928static int decode_watchers(void **p, void *end,
3929 struct ceph_watch_item **watchers,
3930 u32 *num_watchers)
3931{
3932 u8 struct_v;
3933 u32 struct_len;
3934 int i;
3935 int ret;
3936
3937 ret = ceph_start_decoding(p, end, 1, "obj_list_watch_response_t",
3938 &struct_v, &struct_len);
3939 if (ret)
3940 return ret;
3941
3942 *num_watchers = ceph_decode_32(p);
3943 *watchers = kcalloc(*num_watchers, sizeof(**watchers), GFP_NOIO);
3944 if (!*watchers)
3945 return -ENOMEM;
3946
3947 for (i = 0; i < *num_watchers; i++) {
3948 ret = decode_watcher(p, end, *watchers + i);
3949 if (ret) {
3950 kfree(*watchers);
3951 return ret;
3952 }
3953 }
3954
3955 return 0;
3956}
3957
3958/*
3959 * On success, the caller is responsible for:
3960 *
3961 * kfree(watchers);
3962 */
3963int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
3964 struct ceph_object_id *oid,
3965 struct ceph_object_locator *oloc,
3966 struct ceph_watch_item **watchers,
3967 u32 *num_watchers)
3968{
3969 struct ceph_osd_request *req;
3970 struct page **pages;
3971 int ret;
3972
3973 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
3974 if (!req)
3975 return -ENOMEM;
3976
3977 ceph_oid_copy(&req->r_base_oid, oid);
3978 ceph_oloc_copy(&req->r_base_oloc, oloc);
3979 req->r_flags = CEPH_OSD_FLAG_READ;
3980
3981 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
3982 if (ret)
3983 goto out_put_req;
3984
3985 pages = ceph_alloc_page_vector(1, GFP_NOIO);
3986 if (IS_ERR(pages)) {
3987 ret = PTR_ERR(pages);
3988 goto out_put_req;
3989 }
3990
3991 osd_req_op_init(req, 0, CEPH_OSD_OP_LIST_WATCHERS, 0);
3992 ceph_osd_data_pages_init(osd_req_op_data(req, 0, list_watchers,
3993 response_data),
3994 pages, PAGE_SIZE, 0, false, true);
3995
3996 ceph_osdc_start_request(osdc, req, false);
3997 ret = ceph_osdc_wait_request(osdc, req);
3998 if (ret >= 0) {
3999 void *p = page_address(pages[0]);
4000 void *const end = p + req->r_ops[0].outdata_len;
4001
4002 ret = decode_watchers(&p, end, watchers, num_watchers);
4003 }
4004
4005out_put_req:
4006 ceph_osdc_put_request(req);
4007 return ret;
4008}
4009EXPORT_SYMBOL(ceph_osdc_list_watchers);
4010
3894/* 4011/*
3895 * Call all pending notify callbacks - for use after a watch is 4012 * Call all pending notify callbacks - for use after a watch is
3896 * unregistered, to make sure no more callbacks for it will be invoked 4013 * unregistered, to make sure no more callbacks for it will be invoked
3897 */ 4014 */
3898void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) 4015void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
3899{ 4016{
4017 dout("%s osdc %p\n", __func__, osdc);
3900 flush_workqueue(osdc->notify_wq); 4018 flush_workqueue(osdc->notify_wq);
3901} 4019}
3902EXPORT_SYMBOL(ceph_osdc_flush_notifies); 4020EXPORT_SYMBOL(ceph_osdc_flush_notifies);
@@ -3910,6 +4028,57 @@ void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
3910EXPORT_SYMBOL(ceph_osdc_maybe_request_map); 4028EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
3911 4029
3912/* 4030/*
4031 * Execute an OSD class method on an object.
4032 *
4033 * @flags: CEPH_OSD_FLAG_*
4034 * @resp_len: out param for reply length
4035 */
4036int ceph_osdc_call(struct ceph_osd_client *osdc,
4037 struct ceph_object_id *oid,
4038 struct ceph_object_locator *oloc,
4039 const char *class, const char *method,
4040 unsigned int flags,
4041 struct page *req_page, size_t req_len,
4042 struct page *resp_page, size_t *resp_len)
4043{
4044 struct ceph_osd_request *req;
4045 int ret;
4046
4047 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
4048 if (!req)
4049 return -ENOMEM;
4050
4051 ceph_oid_copy(&req->r_base_oid, oid);
4052 ceph_oloc_copy(&req->r_base_oloc, oloc);
4053 req->r_flags = flags;
4054
4055 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
4056 if (ret)
4057 goto out_put_req;
4058
4059 osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method);
4060 if (req_page)
4061 osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
4062 0, false, false);
4063 if (resp_page)
4064 osd_req_op_cls_response_data_pages(req, 0, &resp_page,
4065 PAGE_SIZE, 0, false, false);
4066
4067 ceph_osdc_start_request(osdc, req, false);
4068 ret = ceph_osdc_wait_request(osdc, req);
4069 if (ret >= 0) {
4070 ret = req->r_ops[0].rval;
4071 if (resp_page)
4072 *resp_len = req->r_ops[0].outdata_len;
4073 }
4074
4075out_put_req:
4076 ceph_osdc_put_request(req);
4077 return ret;
4078}
4079EXPORT_SYMBOL(ceph_osdc_call);
4080
4081/*
3913 * init, shutdown 4082 * init, shutdown
3914 */ 4083 */
3915int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) 4084int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
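ceph_osdc_call() executes an OSD class method with optional request/reply pages (it is what cls_lock_client.c builds on), and ceph_osdc_list_watchers() decodes the watch list for an object. A hedged sketch of a list-watchers caller, with a hypothetical function name; per the comment above its definition, the returned array must be kfree()d by the caller on success.

#include <linux/ceph/msgr.h>
#include <linux/ceph/osd_client.h>
#include <linux/printk.h>
#include <linux/slab.h>

static int example_print_watchers(struct ceph_osd_client *osdc,
				  struct ceph_object_id *oid,
				  struct ceph_object_locator *oloc)
{
	struct ceph_watch_item *watchers;
	u32 num_watchers;
	u32 i;
	int ret;

	ret = ceph_osdc_list_watchers(osdc, oid, oloc, &watchers,
				      &num_watchers);
	if (ret)
		return ret;

	for (i = 0; i < num_watchers; i++)
		pr_info("watcher %s%llu cookie %llu\n",
			ENTITY_NAME(watchers[i].name), watchers[i].cookie);

	kfree(watchers);
	return 0;
}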
diff --git a/net/core/dev.c b/net/core/dev.c
index ea6312057a71..4bc19a164ba5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3355,16 +3355,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3355 else 3355 else
3356 skb_dst_force(skb); 3356 skb_dst_force(skb);
3357 3357
3358#ifdef CONFIG_NET_SWITCHDEV
3359 /* Don't forward if offload device already forwarded */
3360 if (skb->offload_fwd_mark &&
3361 skb->offload_fwd_mark == dev->offload_fwd_mark) {
3362 consume_skb(skb);
3363 rc = NET_XMIT_SUCCESS;
3364 goto out;
3365 }
3366#endif
3367
3368 txq = netdev_pick_tx(dev, skb, accel_priv); 3358 txq = netdev_pick_tx(dev, skb, accel_priv);
3369 q = rcu_dereference_bh(txq->qdisc); 3359 q = rcu_dereference_bh(txq->qdisc);
3370 3360
@@ -3855,7 +3845,7 @@ int netif_rx_ni(struct sk_buff *skb)
3855} 3845}
3856EXPORT_SYMBOL(netif_rx_ni); 3846EXPORT_SYMBOL(netif_rx_ni);
3857 3847
3858static void net_tx_action(struct softirq_action *h) 3848static __latent_entropy void net_tx_action(struct softirq_action *h)
3859{ 3849{
3860 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3850 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3861 3851
@@ -3914,8 +3904,7 @@ static void net_tx_action(struct softirq_action *h)
3914 } 3904 }
3915} 3905}
3916 3906
3917#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ 3907#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3918 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3919/* This hook is defined here for ATM LANE */ 3908/* This hook is defined here for ATM LANE */
3920int (*br_fdb_test_addr_hook)(struct net_device *dev, 3909int (*br_fdb_test_addr_hook)(struct net_device *dev,
3921 unsigned char *addr) __read_mostly; 3910 unsigned char *addr) __read_mostly;
@@ -4066,12 +4055,17 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4066{ 4055{
4067#ifdef CONFIG_NETFILTER_INGRESS 4056#ifdef CONFIG_NETFILTER_INGRESS
4068 if (nf_hook_ingress_active(skb)) { 4057 if (nf_hook_ingress_active(skb)) {
4058 int ingress_retval;
4059
4069 if (*pt_prev) { 4060 if (*pt_prev) {
4070 *ret = deliver_skb(skb, *pt_prev, orig_dev); 4061 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4071 *pt_prev = NULL; 4062 *pt_prev = NULL;
4072 } 4063 }
4073 4064
4074 return nf_hook_ingress(skb); 4065 rcu_read_lock();
4066 ingress_retval = nf_hook_ingress(skb);
4067 rcu_read_unlock();
4068 return ingress_retval;
4075 } 4069 }
4076#endif /* CONFIG_NETFILTER_INGRESS */ 4070#endif /* CONFIG_NETFILTER_INGRESS */
4077 return 0; 4071 return 0;
@@ -4308,32 +4302,53 @@ int netif_receive_skb(struct sk_buff *skb)
4308} 4302}
4309EXPORT_SYMBOL(netif_receive_skb); 4303EXPORT_SYMBOL(netif_receive_skb);
4310 4304
4311/* Network device is going away, flush any packets still pending 4305DEFINE_PER_CPU(struct work_struct, flush_works);
4312 * Called with irqs disabled. 4306
4313 */ 4307/* Network device is going away, flush any packets still pending */
4314static void flush_backlog(void *arg) 4308static void flush_backlog(struct work_struct *work)
4315{ 4309{
4316 struct net_device *dev = arg;
4317 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4318 struct sk_buff *skb, *tmp; 4310 struct sk_buff *skb, *tmp;
4311 struct softnet_data *sd;
4312
4313 local_bh_disable();
4314 sd = this_cpu_ptr(&softnet_data);
4319 4315
4316 local_irq_disable();
4320 rps_lock(sd); 4317 rps_lock(sd);
4321 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 4318 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4322 if (skb->dev == dev) { 4319 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4323 __skb_unlink(skb, &sd->input_pkt_queue); 4320 __skb_unlink(skb, &sd->input_pkt_queue);
4324 kfree_skb(skb); 4321 kfree_skb(skb);
4325 input_queue_head_incr(sd); 4322 input_queue_head_incr(sd);
4326 } 4323 }
4327 } 4324 }
4328 rps_unlock(sd); 4325 rps_unlock(sd);
4326 local_irq_enable();
4329 4327
4330 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 4328 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4331 if (skb->dev == dev) { 4329 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4332 __skb_unlink(skb, &sd->process_queue); 4330 __skb_unlink(skb, &sd->process_queue);
4333 kfree_skb(skb); 4331 kfree_skb(skb);
4334 input_queue_head_incr(sd); 4332 input_queue_head_incr(sd);
4335 } 4333 }
4336 } 4334 }
4335 local_bh_enable();
4336}
4337
4338static void flush_all_backlogs(void)
4339{
4340 unsigned int cpu;
4341
4342 get_online_cpus();
4343
4344 for_each_online_cpu(cpu)
4345 queue_work_on(cpu, system_highpri_wq,
4346 per_cpu_ptr(&flush_works, cpu));
4347
4348 for_each_online_cpu(cpu)
4349 flush_work(per_cpu_ptr(&flush_works, cpu));
4350
4351 put_online_cpus();
4337} 4352}
4338 4353
4339static int napi_gro_complete(struct sk_buff *skb) 4354static int napi_gro_complete(struct sk_buff *skb)
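flush_backlog() is no longer run via IPI with IRQs disabled: flush_all_backlogs() now queues a per-cpu work item on every online CPU and waits for them, and each worker drops queued skbs whose device is unregistering. The per-cpu flush_works still need one-time INIT_WORK setup; a sketch of that setup, assuming it sits next to these definitions in net/core/dev.c (in the full patch it lands in net_dev_init(), which this excerpt does not show):

static void example_init_flush_works(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, cpu);

		INIT_WORK(flush, flush_backlog);
	}
}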
@@ -4821,8 +4836,9 @@ static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4821 4836
4822static int process_backlog(struct napi_struct *napi, int quota) 4837static int process_backlog(struct napi_struct *napi, int quota)
4823{ 4838{
4824 int work = 0;
4825 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4839 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4840 bool again = true;
4841 int work = 0;
4826 4842
4827 /* Check if we have pending ipi, its better to send them now, 4843 /* Check if we have pending ipi, its better to send them now,
4828 * not waiting net_rx_action() end. 4844 * not waiting net_rx_action() end.
@@ -4833,23 +4849,20 @@ static int process_backlog(struct napi_struct *napi, int quota)
4833 } 4849 }
4834 4850
4835 napi->weight = weight_p; 4851 napi->weight = weight_p;
4836 local_irq_disable(); 4852 while (again) {
4837 while (1) {
4838 struct sk_buff *skb; 4853 struct sk_buff *skb;
4839 4854
4840 while ((skb = __skb_dequeue(&sd->process_queue))) { 4855 while ((skb = __skb_dequeue(&sd->process_queue))) {
4841 rcu_read_lock(); 4856 rcu_read_lock();
4842 local_irq_enable();
4843 __netif_receive_skb(skb); 4857 __netif_receive_skb(skb);
4844 rcu_read_unlock(); 4858 rcu_read_unlock();
4845 local_irq_disable();
4846 input_queue_head_incr(sd); 4859 input_queue_head_incr(sd);
4847 if (++work >= quota) { 4860 if (++work >= quota)
4848 local_irq_enable();
4849 return work; 4861 return work;
4850 } 4862
4851 } 4863 }
4852 4864
4865 local_irq_disable();
4853 rps_lock(sd); 4866 rps_lock(sd);
4854 if (skb_queue_empty(&sd->input_pkt_queue)) { 4867 if (skb_queue_empty(&sd->input_pkt_queue)) {
4855 /* 4868 /*
@@ -4861,16 +4874,14 @@ static int process_backlog(struct napi_struct *napi, int quota)
4861 * and we dont need an smp_mb() memory barrier. 4874 * and we dont need an smp_mb() memory barrier.
4862 */ 4875 */
4863 napi->state = 0; 4876 napi->state = 0;
4864 rps_unlock(sd); 4877 again = false;
4865 4878 } else {
4866 break; 4879 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4880 &sd->process_queue);
4867 } 4881 }
4868
4869 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4870 &sd->process_queue);
4871 rps_unlock(sd); 4882 rps_unlock(sd);
4883 local_irq_enable();
4872 } 4884 }
4873 local_irq_enable();
4874 4885
4875 return work; 4886 return work;
4876} 4887}
@@ -5187,7 +5198,7 @@ out_unlock:
5187 return work; 5198 return work;
5188} 5199}
5189 5200
5190static void net_rx_action(struct softirq_action *h) 5201static __latent_entropy void net_rx_action(struct softirq_action *h)
5191{ 5202{
5192 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 5203 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5193 unsigned long time_limit = jiffies + 2; 5204 unsigned long time_limit = jiffies + 2;
@@ -5578,6 +5589,7 @@ static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5578 5589
5579static int __netdev_adjacent_dev_insert(struct net_device *dev, 5590static int __netdev_adjacent_dev_insert(struct net_device *dev,
5580 struct net_device *adj_dev, 5591 struct net_device *adj_dev,
5592 u16 ref_nr,
5581 struct list_head *dev_list, 5593 struct list_head *dev_list,
5582 void *private, bool master) 5594 void *private, bool master)
5583{ 5595{
@@ -5587,7 +5599,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
5587 adj = __netdev_find_adj(adj_dev, dev_list); 5599 adj = __netdev_find_adj(adj_dev, dev_list);
5588 5600
5589 if (adj) { 5601 if (adj) {
5590 adj->ref_nr++; 5602 adj->ref_nr += ref_nr;
5591 return 0; 5603 return 0;
5592 } 5604 }
5593 5605
@@ -5597,7 +5609,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
5597 5609
5598 adj->dev = adj_dev; 5610 adj->dev = adj_dev;
5599 adj->master = master; 5611 adj->master = master;
5600 adj->ref_nr = 1; 5612 adj->ref_nr = ref_nr;
5601 adj->private = private; 5613 adj->private = private;
5602 dev_hold(adj_dev); 5614 dev_hold(adj_dev);
5603 5615
@@ -5636,6 +5648,7 @@ free_adj:
5636 5648
5637static void __netdev_adjacent_dev_remove(struct net_device *dev, 5649static void __netdev_adjacent_dev_remove(struct net_device *dev,
5638 struct net_device *adj_dev, 5650 struct net_device *adj_dev,
5651 u16 ref_nr,
5639 struct list_head *dev_list) 5652 struct list_head *dev_list)
5640{ 5653{
5641 struct netdev_adjacent *adj; 5654 struct netdev_adjacent *adj;
@@ -5648,10 +5661,10 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
5648 BUG(); 5661 BUG();
5649 } 5662 }
5650 5663
5651 if (adj->ref_nr > 1) { 5664 if (adj->ref_nr > ref_nr) {
5652 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, 5665 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5653 adj->ref_nr-1); 5666 ref_nr, adj->ref_nr-ref_nr);
5654 adj->ref_nr--; 5667 adj->ref_nr -= ref_nr;
5655 return; 5668 return;
5656 } 5669 }
5657 5670
@@ -5670,21 +5683,22 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
5670 5683
5671static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 5684static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5672 struct net_device *upper_dev, 5685 struct net_device *upper_dev,
5686 u16 ref_nr,
5673 struct list_head *up_list, 5687 struct list_head *up_list,
5674 struct list_head *down_list, 5688 struct list_head *down_list,
5675 void *private, bool master) 5689 void *private, bool master)
5676{ 5690{
5677 int ret; 5691 int ret;
5678 5692
5679 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, 5693 ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5680 master); 5694 private, master);
5681 if (ret) 5695 if (ret)
5682 return ret; 5696 return ret;
5683 5697
5684 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, 5698 ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5685 false); 5699 private, false);
5686 if (ret) { 5700 if (ret) {
5687 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5701 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5688 return ret; 5702 return ret;
5689 } 5703 }
5690 5704
@@ -5692,9 +5706,10 @@ static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5692} 5706}
5693 5707
5694static int __netdev_adjacent_dev_link(struct net_device *dev, 5708static int __netdev_adjacent_dev_link(struct net_device *dev,
5695 struct net_device *upper_dev) 5709 struct net_device *upper_dev,
5710 u16 ref_nr)
5696{ 5711{
5697 return __netdev_adjacent_dev_link_lists(dev, upper_dev, 5712 return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5698 &dev->all_adj_list.upper, 5713 &dev->all_adj_list.upper,
5699 &upper_dev->all_adj_list.lower, 5714 &upper_dev->all_adj_list.lower,
5700 NULL, false); 5715 NULL, false);
@@ -5702,17 +5717,19 @@ static int __netdev_adjacent_dev_link(struct net_device *dev,
5702 5717
5703static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 5718static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5704 struct net_device *upper_dev, 5719 struct net_device *upper_dev,
5720 u16 ref_nr,
5705 struct list_head *up_list, 5721 struct list_head *up_list,
5706 struct list_head *down_list) 5722 struct list_head *down_list)
5707{ 5723{
5708 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5724 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5709 __netdev_adjacent_dev_remove(upper_dev, dev, down_list); 5725 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5710} 5726}
5711 5727
5712static void __netdev_adjacent_dev_unlink(struct net_device *dev, 5728static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5713 struct net_device *upper_dev) 5729 struct net_device *upper_dev,
5730 u16 ref_nr)
5714{ 5731{
5715 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5732 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5716 &dev->all_adj_list.upper, 5733 &dev->all_adj_list.upper,
5717 &upper_dev->all_adj_list.lower); 5734 &upper_dev->all_adj_list.lower);
5718} 5735}
@@ -5721,17 +5738,17 @@ static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5721 struct net_device *upper_dev, 5738 struct net_device *upper_dev,
5722 void *private, bool master) 5739 void *private, bool master)
5723{ 5740{
5724 int ret = __netdev_adjacent_dev_link(dev, upper_dev); 5741 int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5725 5742
5726 if (ret) 5743 if (ret)
5727 return ret; 5744 return ret;
5728 5745
5729 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 5746 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5730 &dev->adj_list.upper, 5747 &dev->adj_list.upper,
5731 &upper_dev->adj_list.lower, 5748 &upper_dev->adj_list.lower,
5732 private, master); 5749 private, master);
5733 if (ret) { 5750 if (ret) {
5734 __netdev_adjacent_dev_unlink(dev, upper_dev); 5751 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5735 return ret; 5752 return ret;
5736 } 5753 }
5737 5754
@@ -5741,8 +5758,8 @@ static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5741static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 5758static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5742 struct net_device *upper_dev) 5759 struct net_device *upper_dev)
5743{ 5760{
5744 __netdev_adjacent_dev_unlink(dev, upper_dev); 5761 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5745 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5762 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5746 &dev->adj_list.upper, 5763 &dev->adj_list.upper,
5747 &upper_dev->adj_list.lower); 5764 &upper_dev->adj_list.lower);
5748} 5765}
@@ -5795,7 +5812,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
5795 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5812 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5796 pr_debug("Interlinking %s with %s, non-neighbour\n", 5813 pr_debug("Interlinking %s with %s, non-neighbour\n",
5797 i->dev->name, j->dev->name); 5814 i->dev->name, j->dev->name);
5798 ret = __netdev_adjacent_dev_link(i->dev, j->dev); 5815 ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5799 if (ret) 5816 if (ret)
5800 goto rollback_mesh; 5817 goto rollback_mesh;
5801 } 5818 }
@@ -5805,7 +5822,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
5805 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5822 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5806 pr_debug("linking %s's upper device %s with %s\n", 5823 pr_debug("linking %s's upper device %s with %s\n",
5807 upper_dev->name, i->dev->name, dev->name); 5824 upper_dev->name, i->dev->name, dev->name);
5808 ret = __netdev_adjacent_dev_link(dev, i->dev); 5825 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5809 if (ret) 5826 if (ret)
5810 goto rollback_upper_mesh; 5827 goto rollback_upper_mesh;
5811 } 5828 }
@@ -5814,7 +5831,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
5814 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5831 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5815 pr_debug("linking %s's lower device %s with %s\n", dev->name, 5832 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5816 i->dev->name, upper_dev->name); 5833 i->dev->name, upper_dev->name);
5817 ret = __netdev_adjacent_dev_link(i->dev, upper_dev); 5834 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5818 if (ret) 5835 if (ret)
5819 goto rollback_lower_mesh; 5836 goto rollback_lower_mesh;
5820 } 5837 }
@@ -5832,7 +5849,7 @@ rollback_lower_mesh:
5832 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5849 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5833 if (i == to_i) 5850 if (i == to_i)
5834 break; 5851 break;
5835 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5852 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5836 } 5853 }
5837 5854
5838 i = NULL; 5855 i = NULL;
@@ -5842,7 +5859,7 @@ rollback_upper_mesh:
5842 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5859 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5843 if (i == to_i) 5860 if (i == to_i)
5844 break; 5861 break;
5845 __netdev_adjacent_dev_unlink(dev, i->dev); 5862 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5846 } 5863 }
5847 5864
5848 i = j = NULL; 5865 i = j = NULL;
@@ -5854,7 +5871,7 @@ rollback_mesh:
5854 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5871 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5855 if (i == to_i && j == to_j) 5872 if (i == to_i && j == to_j)
5856 break; 5873 break;
5857 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5874 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5858 } 5875 }
5859 if (i == to_i) 5876 if (i == to_i)
5860 break; 5877 break;
@@ -5934,16 +5951,16 @@ void netdev_upper_dev_unlink(struct net_device *dev,
5934 */ 5951 */
5935 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5952 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5936 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) 5953 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5937 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5954 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5938 5955
5939 /* remove also the devices itself from lower/upper device 5956 /* remove also the devices itself from lower/upper device
5940 * list 5957 * list
5941 */ 5958 */
5942 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5959 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5943 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5960 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5944 5961
5945 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) 5962 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5946 __netdev_adjacent_dev_unlink(dev, i->dev); 5963 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5947 5964
5948 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 5965 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5949 &changeupper_info.info); 5966 &changeupper_info.info);
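
With this change the adjacency entries carry a caller-supplied reference count instead of always 1, so linking and unlinking can account for several mesh paths in one call. In isolation, the new drop logic amounts to the sketch below (illustrative only; adj_put is not a function in the tree):

static bool adj_put(struct netdev_adjacent *adj, u16 ref_nr)
{
        if (adj->ref_nr > ref_nr) {
                adj->ref_nr -= ref_nr; /* still referenced via other paths */
                return false;          /* keep the entry */
        }
        return true;                   /* caller unlinks and frees it */
}
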
@@ -6723,8 +6740,8 @@ static void rollback_registered_many(struct list_head *head)
6723 unlist_netdevice(dev); 6740 unlist_netdevice(dev);
6724 6741
6725 dev->reg_state = NETREG_UNREGISTERING; 6742 dev->reg_state = NETREG_UNREGISTERING;
6726 on_each_cpu(flush_backlog, dev, 1);
6727 } 6743 }
6744 flush_all_backlogs();
6728 6745
6729 synchronize_net(); 6746 synchronize_net();
6730 6747
@@ -7641,6 +7658,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7641 INIT_LIST_HEAD(&dev->all_adj_list.lower); 7658 INIT_LIST_HEAD(&dev->all_adj_list.lower);
7642 INIT_LIST_HEAD(&dev->ptype_all); 7659 INIT_LIST_HEAD(&dev->ptype_all);
7643 INIT_LIST_HEAD(&dev->ptype_specific); 7660 INIT_LIST_HEAD(&dev->ptype_specific);
7661#ifdef CONFIG_NET_SCHED
7662 hash_init(dev->qdisc_hash);
7663#endif
7644 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; 7664 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7645 setup(dev); 7665 setup(dev);
7646 7666
@@ -8286,8 +8306,11 @@ static int __init net_dev_init(void)
8286 */ 8306 */
8287 8307
8288 for_each_possible_cpu(i) { 8308 for_each_possible_cpu(i) {
8309 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8289 struct softnet_data *sd = &per_cpu(softnet_data, i); 8310 struct softnet_data *sd = &per_cpu(softnet_data, i);
8290 8311
8312 INIT_WORK(flush, flush_backlog);
8313
8291 skb_queue_head_init(&sd->input_pkt_queue); 8314 skb_queue_head_init(&sd->input_pkt_queue);
8292 skb_queue_head_init(&sd->process_queue); 8315 skb_queue_head_init(&sd->process_queue);
8293 INIT_LIST_HEAD(&sd->poll_list); 8316 INIT_LIST_HEAD(&sd->poll_list);
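
The two dev.c hunks above replace the IPI-based on_each_cpu(flush_backlog, ...) call with per-CPU work items that are initialized at boot and flushed from process context during unregistration. A minimal sketch of how such a flush might be driven, assuming the per-CPU flush_works set up here (the actual flush_all_backlogs() lives elsewhere in dev.c and is not part of this excerpt):

static void flush_all_backlogs(void)
{
        unsigned int cpu;

        get_online_cpus();
        /* Queue one flush_backlog work item per online CPU ... */
        for_each_online_cpu(cpu)
                queue_work_on(cpu, system_highpri_wq,
                              per_cpu_ptr(&flush_works, cpu));
        /* ... then wait for all of them, sleeping as needed. */
        for_each_online_cpu(cpu)
                flush_work(per_cpu_ptr(&flush_works, cpu));
        put_online_cpus();
}

Note that flush_backlog() itself must now take a struct work_struct * argument rather than the void * cookie used by on_each_cpu().
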
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index d6b3b579560d..72cfb0c61125 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -105,7 +105,7 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
105 return skb; 105 return skb;
106} 106}
107 107
108static struct genl_multicast_group dropmon_mcgrps[] = { 108static const struct genl_multicast_group dropmon_mcgrps[] = {
109 { .name = "events", }, 109 { .name = "events", },
110}; 110};
111 111
diff --git a/net/core/filter.c b/net/core/filter.c
index cb06aceb512a..00351cdf7d0c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -94,14 +94,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
94} 94}
95EXPORT_SYMBOL(sk_filter_trim_cap); 95EXPORT_SYMBOL(sk_filter_trim_cap);
96 96
97static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) 97BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
98{ 98{
99 return skb_get_poff((struct sk_buff *)(unsigned long) ctx); 99 return skb_get_poff(skb);
100} 100}
101 101
102static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) 102BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
103{ 103{
104 struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
105 struct nlattr *nla; 104 struct nlattr *nla;
106 105
107 if (skb_is_nonlinear(skb)) 106 if (skb_is_nonlinear(skb))
@@ -120,9 +119,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
120 return 0; 119 return 0;
121} 120}
122 121
123static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) 122BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
124{ 123{
125 struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
126 struct nlattr *nla; 124 struct nlattr *nla;
127 125
128 if (skb_is_nonlinear(skb)) 126 if (skb_is_nonlinear(skb))
@@ -145,7 +143,7 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
145 return 0; 143 return 0;
146} 144}
147 145
148static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) 146BPF_CALL_0(__get_raw_cpu_id)
149{ 147{
150 return raw_smp_processor_id(); 148 return raw_smp_processor_id();
151} 149}
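
The net/core/filter.c hunks convert the BPF helpers from the old "five raw u64 registers" calling convention to the BPF_CALL_0()..BPF_CALL_5() macros, which generate the register-to-typed-argument casts once, in one place. A simplified rendition of the idea (the real macros live in include/linux/filter.h and are more general; the SIMPLE_ and demo_ names below are made up for illustration):

/* A typed inner function plus a wrapper with the generic u64 signature
 * that the BPF core expects.  The real macros support 0-5 arguments.
 */
#define SIMPLE_BPF_CALL_1(name, t1, a1)                                 \
        static u64 ____##name(t1 a1);                                   \
        static u64 name(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)         \
        {                                                               \
                return ____##name((t1)(unsigned long)r1);               \
        }                                                               \
        static u64 ____##name(t1 a1)

/* Usage mirrors the converted helpers above: */
SIMPLE_BPF_CALL_1(demo_get_pay_offset, struct sk_buff *, skb)
{
        return skb_get_poff(skb);
}
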
@@ -233,9 +231,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
233 case SKF_AD_OFF + SKF_AD_HATYPE: 231 case SKF_AD_OFF + SKF_AD_HATYPE:
234 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 232 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
235 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); 233 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
236 BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
237 234
238 *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), 235 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
239 BPF_REG_TMP, BPF_REG_CTX, 236 BPF_REG_TMP, BPF_REG_CTX,
240 offsetof(struct sk_buff, dev)); 237 offsetof(struct sk_buff, dev));
241 /* if (tmp != 0) goto pc + 1 */ 238 /* if (tmp != 0) goto pc + 1 */
@@ -1350,17 +1347,26 @@ struct bpf_scratchpad {
1350 1347
1351static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); 1348static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
1352 1349
1350static inline int __bpf_try_make_writable(struct sk_buff *skb,
1351 unsigned int write_len)
1352{
1353 return skb_ensure_writable(skb, write_len);
1354}
1355
1353static inline int bpf_try_make_writable(struct sk_buff *skb, 1356static inline int bpf_try_make_writable(struct sk_buff *skb,
1354 unsigned int write_len) 1357 unsigned int write_len)
1355{ 1358{
1356 int err; 1359 int err = __bpf_try_make_writable(skb, write_len);
1357 1360
1358 err = skb_ensure_writable(skb, write_len);
1359 bpf_compute_data_end(skb); 1361 bpf_compute_data_end(skb);
1360
1361 return err; 1362 return err;
1362} 1363}
1363 1364
1365static int bpf_try_make_head_writable(struct sk_buff *skb)
1366{
1367 return bpf_try_make_writable(skb, skb_headlen(skb));
1368}
1369
1364static inline void bpf_push_mac_rcsum(struct sk_buff *skb) 1370static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
1365{ 1371{
1366 if (skb_at_tc_ingress(skb)) 1372 if (skb_at_tc_ingress(skb))
@@ -1373,12 +1379,9 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
1373 skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); 1379 skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1374} 1380}
1375 1381
1376static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) 1382BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
1383 const void *, from, u32, len, u64, flags)
1377{ 1384{
1378 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1379 unsigned int offset = (unsigned int) r2;
1380 void *from = (void *) (long) r3;
1381 unsigned int len = (unsigned int) r4;
1382 void *ptr; 1385 void *ptr;
1383 1386
1384 if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) 1387 if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
@@ -1413,12 +1416,9 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
1413 .arg5_type = ARG_ANYTHING, 1416 .arg5_type = ARG_ANYTHING,
1414}; 1417};
1415 1418
1416static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1419BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
1420 void *, to, u32, len)
1417{ 1421{
1418 const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
1419 unsigned int offset = (unsigned int) r2;
1420 void *to = (void *)(unsigned long) r3;
1421 unsigned int len = (unsigned int) r4;
1422 void *ptr; 1422 void *ptr;
1423 1423
1424 if (unlikely(offset > 0xffff)) 1424 if (unlikely(offset > 0xffff))
@@ -1446,10 +1446,31 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1446 .arg4_type = ARG_CONST_STACK_SIZE, 1446 .arg4_type = ARG_CONST_STACK_SIZE,
1447}; 1447};
1448 1448
1449static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) 1449BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
1450{
1451 /* Idea is the following: should the needed direct read/write
1452 * test fail during runtime, we can pull in more data and redo
1453 * again, since implicitly, we invalidate previous checks here.
1454 *
1455 * Or, since we know how much we need to make read/writeable,
1456 * this can be done once at the program beginning for direct
1457 * access case. By this we overcome limitations of only current
1458 * headroom being accessible.
1459 */
1460 return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
1461}
1462
1463static const struct bpf_func_proto bpf_skb_pull_data_proto = {
1464 .func = bpf_skb_pull_data,
1465 .gpl_only = false,
1466 .ret_type = RET_INTEGER,
1467 .arg1_type = ARG_PTR_TO_CTX,
1468 .arg2_type = ARG_ANYTHING,
1469};
1470
1471BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
1472 u64, from, u64, to, u64, flags)
1450{ 1473{
1451 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1452 unsigned int offset = (unsigned int) r2;
1453 __sum16 *ptr; 1474 __sum16 *ptr;
1454 1475
1455 if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) 1476 if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
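
bpf_skb_pull_data(), added above, is meant to be called by tc programs before direct packet access so that non-linear or cloned skbs become readable and writable up to len bytes (or the whole linear head for len == 0). A hypothetical tc classifier using it might look like this; the helper declaration, section names and includes follow the samples/bpf conventions and are assumptions, not part of this patch:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>

static int (*bpf_skb_pull_data)(void *skb, __u32 len) =
        (void *) BPF_FUNC_skb_pull_data;

__attribute__((section("classifier"), used))
int cls_pull_then_parse(struct __sk_buff *skb)
{
        void *data, *data_end;

        /* Make the whole linear head readable/writable first; this also
         * invalidates any previously verified data/data_end bounds.
         */
        if (bpf_skb_pull_data(skb, 0))
                return TC_ACT_SHOT;

        data     = (void *)(long)skb->data;
        data_end = (void *)(long)skb->data_end;
        if (data + 14 > data_end)       /* not even an Ethernet header */
                return TC_ACT_OK;

        /* ... parse or rewrite headers here ... */
        return TC_ACT_OK;
}

char _license[] __attribute__((section("license"), used)) = "GPL";
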
@@ -1491,12 +1512,11 @@ static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
1491 .arg5_type = ARG_ANYTHING, 1512 .arg5_type = ARG_ANYTHING,
1492}; 1513};
1493 1514
1494static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) 1515BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
1516 u64, from, u64, to, u64, flags)
1495{ 1517{
1496 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1497 bool is_pseudo = flags & BPF_F_PSEUDO_HDR; 1518 bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
1498 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; 1519 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
1499 unsigned int offset = (unsigned int) r2;
1500 __sum16 *ptr; 1520 __sum16 *ptr;
1501 1521
1502 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | 1522 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
@@ -1544,12 +1564,11 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
1544 .arg5_type = ARG_ANYTHING, 1564 .arg5_type = ARG_ANYTHING,
1545}; 1565};
1546 1566
1547static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) 1567BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
1568 __be32 *, to, u32, to_size, __wsum, seed)
1548{ 1569{
1549 struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); 1570 struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
1550 u64 diff_size = from_size + to_size; 1571 u32 diff_size = from_size + to_size;
1551 __be32 *from = (__be32 *) (long) r1;
1552 __be32 *to = (__be32 *) (long) r3;
1553 int i, j = 0; 1572 int i, j = 0;
1554 1573
1555 /* This is quite flexible, some examples: 1574 /* This is quite flexible, some examples:
@@ -1575,6 +1594,7 @@ static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
1575static const struct bpf_func_proto bpf_csum_diff_proto = { 1594static const struct bpf_func_proto bpf_csum_diff_proto = {
1576 .func = bpf_csum_diff, 1595 .func = bpf_csum_diff,
1577 .gpl_only = false, 1596 .gpl_only = false,
1597 .pkt_access = true,
1578 .ret_type = RET_INTEGER, 1598 .ret_type = RET_INTEGER,
1579 .arg1_type = ARG_PTR_TO_STACK, 1599 .arg1_type = ARG_PTR_TO_STACK,
1580 .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, 1600 .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO,
@@ -1583,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
1583 .arg5_type = ARG_ANYTHING, 1603 .arg5_type = ARG_ANYTHING,
1584}; 1604};
1585 1605
1606BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
1607{
1608 /* The interface is to be used in combination with bpf_csum_diff()
1609 * for direct packet writes. csum rotation for alignment as well
1610 * as emulating csum_sub() can be done from the eBPF program.
1611 */
1612 if (skb->ip_summed == CHECKSUM_COMPLETE)
1613 return (skb->csum = csum_add(skb->csum, csum));
1614
1615 return -ENOTSUPP;
1616}
1617
1618static const struct bpf_func_proto bpf_csum_update_proto = {
1619 .func = bpf_csum_update,
1620 .gpl_only = false,
1621 .ret_type = RET_INTEGER,
1622 .arg1_type = ARG_PTR_TO_CTX,
1623 .arg2_type = ARG_ANYTHING,
1624};
1625
1586static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) 1626static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
1587{ 1627{
1588 return dev_forward_skb(dev, skb); 1628 return dev_forward_skb(dev, skb);
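
As the comment above notes, bpf_csum_update() is meant to be paired with bpf_csum_diff() when a program rewrites packet bytes directly on an skb carrying CHECKSUM_COMPLETE. A sketch of that fix-up from a tc program, assuming the usual samples/bpf scaffolding for the helper declarations:

static __s64 (*bpf_csum_diff)(__be32 *from, __u32 from_size,
                              __be32 *to, __u32 to_size, __wsum seed) =
        (void *) BPF_FUNC_csum_diff;
static int (*bpf_csum_update)(void *skb, __wsum csum) =
        (void *) BPF_FUNC_csum_update;

/* old_word/new_word: the 4 bytes before and after a direct rewrite. */
static inline int fixup_complete_csum(struct __sk_buff *skb,
                                      __be32 old_word, __be32 new_word)
{
        __s64 diff = bpf_csum_diff(&old_word, 4, &new_word, 4, 0);

        if (diff < 0)
                return diff;
        /* Returns -ENOTSUPP unless skb->ip_summed == CHECKSUM_COMPLETE. */
        return bpf_csum_update(skb, (__wsum)diff);
}
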
@@ -1607,10 +1647,11 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
1607 return ret; 1647 return ret;
1608} 1648}
1609 1649
1610static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) 1650BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
1611{ 1651{
1612 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1613 struct net_device *dev; 1652 struct net_device *dev;
1653 struct sk_buff *clone;
1654 int ret;
1614 1655
1615 if (unlikely(flags & ~(BPF_F_INGRESS))) 1656 if (unlikely(flags & ~(BPF_F_INGRESS)))
1616 return -EINVAL; 1657 return -EINVAL;
@@ -1619,14 +1660,25 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
1619 if (unlikely(!dev)) 1660 if (unlikely(!dev))
1620 return -EINVAL; 1661 return -EINVAL;
1621 1662
1622 skb = skb_clone(skb, GFP_ATOMIC); 1663 clone = skb_clone(skb, GFP_ATOMIC);
1623 if (unlikely(!skb)) 1664 if (unlikely(!clone))
1624 return -ENOMEM; 1665 return -ENOMEM;
1625 1666
1626 bpf_push_mac_rcsum(skb); 1667 /* For direct write, we need to keep the invariant that the skbs
1668 * we're dealing with need to be uncloned. Should uncloning fail
1669 * here, we need to free the just generated clone to unclone once
1670 * again.
1671 */
1672 ret = bpf_try_make_head_writable(skb);
1673 if (unlikely(ret)) {
1674 kfree_skb(clone);
1675 return -ENOMEM;
1676 }
1677
1678 bpf_push_mac_rcsum(clone);
1627 1679
1628 return flags & BPF_F_INGRESS ? 1680 return flags & BPF_F_INGRESS ?
1629 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); 1681 __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone);
1630} 1682}
1631 1683
1632static const struct bpf_func_proto bpf_clone_redirect_proto = { 1684static const struct bpf_func_proto bpf_clone_redirect_proto = {
@@ -1645,7 +1697,7 @@ struct redirect_info {
1645 1697
1646static DEFINE_PER_CPU(struct redirect_info, redirect_info); 1698static DEFINE_PER_CPU(struct redirect_info, redirect_info);
1647 1699
1648static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) 1700BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
1649{ 1701{
1650 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 1702 struct redirect_info *ri = this_cpu_ptr(&redirect_info);
1651 1703
@@ -1684,9 +1736,9 @@ static const struct bpf_func_proto bpf_redirect_proto = {
1684 .arg2_type = ARG_ANYTHING, 1736 .arg2_type = ARG_ANYTHING,
1685}; 1737};
1686 1738
1687static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1739BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
1688{ 1740{
1689 return task_get_classid((struct sk_buff *) (unsigned long) r1); 1741 return task_get_classid(skb);
1690} 1742}
1691 1743
1692static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { 1744static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
@@ -1696,9 +1748,9 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
1696 .arg1_type = ARG_PTR_TO_CTX, 1748 .arg1_type = ARG_PTR_TO_CTX,
1697}; 1749};
1698 1750
1699static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1751BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
1700{ 1752{
1701 return dst_tclassid((struct sk_buff *) (unsigned long) r1); 1753 return dst_tclassid(skb);
1702} 1754}
1703 1755
1704static const struct bpf_func_proto bpf_get_route_realm_proto = { 1756static const struct bpf_func_proto bpf_get_route_realm_proto = {
@@ -1708,14 +1760,14 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = {
1708 .arg1_type = ARG_PTR_TO_CTX, 1760 .arg1_type = ARG_PTR_TO_CTX,
1709}; 1761};
1710 1762
1711static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1763BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
1712{ 1764{
1713 /* If skb_clear_hash() was called due to mangling, we can 1765 /* If skb_clear_hash() was called due to mangling, we can
1714 * trigger SW recalculation here. Later access to hash 1766 * trigger SW recalculation here. Later access to hash
1715 * can then use the inline skb->hash via context directly 1767 * can then use the inline skb->hash via context directly
1716 * instead of calling this helper again. 1768 * instead of calling this helper again.
1717 */ 1769 */
1718 return skb_get_hash((struct sk_buff *) (unsigned long) r1); 1770 return skb_get_hash(skb);
1719} 1771}
1720 1772
1721static const struct bpf_func_proto bpf_get_hash_recalc_proto = { 1773static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
@@ -1725,10 +1777,25 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
1725 .arg1_type = ARG_PTR_TO_CTX, 1777 .arg1_type = ARG_PTR_TO_CTX,
1726}; 1778};
1727 1779
1728static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) 1780BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
1781{
1782 /* After all direct packet write, this can be used once for
1783 * triggering a lazy recalc on next skb_get_hash() invocation.
1784 */
1785 skb_clear_hash(skb);
1786 return 0;
1787}
1788
1789static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
1790 .func = bpf_set_hash_invalid,
1791 .gpl_only = false,
1792 .ret_type = RET_INTEGER,
1793 .arg1_type = ARG_PTR_TO_CTX,
1794};
1795
1796BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
1797 u16, vlan_tci)
1729{ 1798{
1730 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1731 __be16 vlan_proto = (__force __be16) r2;
1732 int ret; 1799 int ret;
1733 1800
1734 if (unlikely(vlan_proto != htons(ETH_P_8021Q) && 1801 if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
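
bpf_set_hash_invalid(), added above, is the counterpart to bpf_get_hash_recalc() for direct packet writes: after mangling headers once, a program clears the stored hash so the next skb_get_hash() recomputes it lazily. A tiny sketch, with the helper declared in samples/bpf style (an assumption, not part of this patch):

static int (*bpf_set_hash_invalid)(void *skb) =
        (void *) BPF_FUNC_set_hash_invalid;

static inline void after_header_rewrite(struct __sk_buff *skb)
{
        /* The stored hash no longer matches the rewritten headers;
         * clear it once and let skb_get_hash() recalculate on demand.
         */
        bpf_set_hash_invalid(skb);
}
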
@@ -1753,9 +1820,8 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = {
1753}; 1820};
1754EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); 1821EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
1755 1822
1756static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1823BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
1757{ 1824{
1758 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1759 int ret; 1825 int ret;
1760 1826
1761 bpf_push_mac_rcsum(skb); 1827 bpf_push_mac_rcsum(skb);
@@ -1930,10 +1996,9 @@ static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
1930 return -ENOTSUPP; 1996 return -ENOTSUPP;
1931} 1997}
1932 1998
1933static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) 1999BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
2000 u64, flags)
1934{ 2001{
1935 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1936 __be16 proto = (__force __be16) r2;
1937 int ret; 2002 int ret;
1938 2003
1939 if (unlikely(flags)) 2004 if (unlikely(flags))
@@ -1970,14 +2035,11 @@ static const struct bpf_func_proto bpf_skb_change_proto_proto = {
1970 .arg3_type = ARG_ANYTHING, 2035 .arg3_type = ARG_ANYTHING,
1971}; 2036};
1972 2037
1973static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 2038BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
1974{ 2039{
1975 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1976 u32 pkt_type = r2;
1977
1978 /* We only allow a restricted subset to be changed for now. */ 2040 /* We only allow a restricted subset to be changed for now. */
1979 if (unlikely(skb->pkt_type > PACKET_OTHERHOST || 2041 if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
1980 pkt_type > PACKET_OTHERHOST)) 2042 !skb_pkt_type_ok(pkt_type)))
1981 return -EINVAL; 2043 return -EINVAL;
1982 2044
1983 skb->pkt_type = pkt_type; 2045 skb->pkt_type = pkt_type;
@@ -1992,19 +2054,100 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = {
1992 .arg2_type = ARG_ANYTHING, 2054 .arg2_type = ARG_ANYTHING,
1993}; 2055};
1994 2056
2057static u32 __bpf_skb_min_len(const struct sk_buff *skb)
2058{
2059 u32 min_len = skb_network_offset(skb);
2060
2061 if (skb_transport_header_was_set(skb))
2062 min_len = skb_transport_offset(skb);
2063 if (skb->ip_summed == CHECKSUM_PARTIAL)
2064 min_len = skb_checksum_start_offset(skb) +
2065 skb->csum_offset + sizeof(__sum16);
2066 return min_len;
2067}
2068
2069static u32 __bpf_skb_max_len(const struct sk_buff *skb)
2070{
2071 return skb->dev->mtu + skb->dev->hard_header_len;
2072}
2073
2074static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
2075{
2076 unsigned int old_len = skb->len;
2077 int ret;
2078
2079 ret = __skb_grow_rcsum(skb, new_len);
2080 if (!ret)
2081 memset(skb->data + old_len, 0, new_len - old_len);
2082 return ret;
2083}
2084
2085static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
2086{
2087 return __skb_trim_rcsum(skb, new_len);
2088}
2089
2090BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
2091 u64, flags)
2092{
2093 u32 max_len = __bpf_skb_max_len(skb);
2094 u32 min_len = __bpf_skb_min_len(skb);
2095 int ret;
2096
2097 if (unlikely(flags || new_len > max_len || new_len < min_len))
2098 return -EINVAL;
2099 if (skb->encapsulation)
2100 return -ENOTSUPP;
2101
2102 /* The basic idea of this helper is that it's performing the
2103 * needed work to either grow or trim an skb, and eBPF program
2104 * rewrites the rest via helpers like bpf_skb_store_bytes(),
2105 * bpf_lX_csum_replace() and others rather than passing a raw
2106 * buffer here. This one is a slow path helper and intended
2107 * for replies with control messages.
2108 *
2109 * Like in bpf_skb_change_proto(), we want to keep this rather
2110 * minimal and without protocol specifics so that we are able
2111 * to separate concerns as in bpf_skb_store_bytes() should only
2112 * be the one responsible for writing buffers.
2113 *
2114 * It's really expected to be a slow path operation here for
2115 * control message replies, so we're implicitly linearizing,
2116 * uncloning and drop offloads from the skb by this.
2117 */
2118 ret = __bpf_try_make_writable(skb, skb->len);
2119 if (!ret) {
2120 if (new_len > skb->len)
2121 ret = bpf_skb_grow_rcsum(skb, new_len);
2122 else if (new_len < skb->len)
2123 ret = bpf_skb_trim_rcsum(skb, new_len);
2124 if (!ret && skb_is_gso(skb))
2125 skb_gso_reset(skb);
2126 }
2127
2128 bpf_compute_data_end(skb);
2129 return ret;
2130}
2131
2132static const struct bpf_func_proto bpf_skb_change_tail_proto = {
2133 .func = bpf_skb_change_tail,
2134 .gpl_only = false,
2135 .ret_type = RET_INTEGER,
2136 .arg1_type = ARG_PTR_TO_CTX,
2137 .arg2_type = ARG_ANYTHING,
2138 .arg3_type = ARG_ANYTHING,
2139};
2140
1995bool bpf_helper_changes_skb_data(void *func) 2141bool bpf_helper_changes_skb_data(void *func)
1996{ 2142{
1997 if (func == bpf_skb_vlan_push) 2143 if (func == bpf_skb_vlan_push ||
1998 return true; 2144 func == bpf_skb_vlan_pop ||
1999 if (func == bpf_skb_vlan_pop) 2145 func == bpf_skb_store_bytes ||
2000 return true; 2146 func == bpf_skb_change_proto ||
2001 if (func == bpf_skb_store_bytes) 2147 func == bpf_skb_change_tail ||
2002 return true; 2148 func == bpf_skb_pull_data ||
2003 if (func == bpf_skb_change_proto) 2149 func == bpf_l3_csum_replace ||
2004 return true; 2150 func == bpf_l4_csum_replace)
2005 if (func == bpf_l3_csum_replace)
2006 return true;
2007 if (func == bpf_l4_csum_replace)
2008 return true; 2151 return true;
2009 2152
2010 return false; 2153 return false;
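
A hypothetical use of the new bpf_skb_change_tail() from a tc program, trimming an outgoing control-message reply to a fixed length; the helper declaration follows samples/bpf conventions and is an assumption, and flags must currently be 0:

static int (*bpf_skb_change_tail)(void *skb, __u32 new_len, __u64 flags) =
        (void *) BPF_FUNC_skb_change_tail;

static inline int trim_reply(struct __sk_buff *skb, __u32 new_len)
{
        if (skb->len <= new_len)
                return 0;
        /* Fails with -EINVAL if new_len would cut into the network or
         * transport headers, or exceed MTU + hard_header_len; the helper
         * resets GSO state and recomputes the data_end pointer itself.
         */
        return bpf_skb_change_tail(skb, new_len, 0);
}
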
@@ -2023,13 +2166,10 @@ static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
2023 return 0; 2166 return 0;
2024} 2167}
2025 2168
2026static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, 2169BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
2027 u64 meta_size) 2170 u64, flags, void *, meta, u64, meta_size)
2028{ 2171{
2029 struct sk_buff *skb = (struct sk_buff *)(long) r1;
2030 struct bpf_map *map = (struct bpf_map *)(long) r2;
2031 u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; 2172 u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
2032 void *meta = (void *)(long) r4;
2033 2173
2034 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) 2174 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
2035 return -EINVAL; 2175 return -EINVAL;
@@ -2056,10 +2196,9 @@ static unsigned short bpf_tunnel_key_af(u64 flags)
2056 return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; 2196 return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
2057} 2197}
2058 2198
2059static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) 2199BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
2200 u32, size, u64, flags)
2060{ 2201{
2061 struct sk_buff *skb = (struct sk_buff *) (long) r1;
2062 struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
2063 const struct ip_tunnel_info *info = skb_tunnel_info(skb); 2202 const struct ip_tunnel_info *info = skb_tunnel_info(skb);
2064 u8 compat[sizeof(struct bpf_tunnel_key)]; 2203 u8 compat[sizeof(struct bpf_tunnel_key)];
2065 void *to_orig = to; 2204 void *to_orig = to;
@@ -2124,10 +2263,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
2124 .arg4_type = ARG_ANYTHING, 2263 .arg4_type = ARG_ANYTHING,
2125}; 2264};
2126 2265
2127static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) 2266BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
2128{ 2267{
2129 struct sk_buff *skb = (struct sk_buff *) (long) r1;
2130 u8 *to = (u8 *) (long) r2;
2131 const struct ip_tunnel_info *info = skb_tunnel_info(skb); 2268 const struct ip_tunnel_info *info = skb_tunnel_info(skb);
2132 int err; 2269 int err;
2133 2270
@@ -2162,10 +2299,9 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
2162 2299
2163static struct metadata_dst __percpu *md_dst; 2300static struct metadata_dst __percpu *md_dst;
2164 2301
2165static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) 2302BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
2303 const struct bpf_tunnel_key *, from, u32, size, u64, flags)
2166{ 2304{
2167 struct sk_buff *skb = (struct sk_buff *) (long) r1;
2168 struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
2169 struct metadata_dst *md = this_cpu_ptr(md_dst); 2305 struct metadata_dst *md = this_cpu_ptr(md_dst);
2170 u8 compat[sizeof(struct bpf_tunnel_key)]; 2306 u8 compat[sizeof(struct bpf_tunnel_key)];
2171 struct ip_tunnel_info *info; 2307 struct ip_tunnel_info *info;
@@ -2183,7 +2319,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
2183 */ 2319 */
2184 memcpy(compat, from, size); 2320 memcpy(compat, from, size);
2185 memset(compat + size, 0, sizeof(compat) - size); 2321 memset(compat + size, 0, sizeof(compat) - size);
2186 from = (struct bpf_tunnel_key *)compat; 2322 from = (const struct bpf_tunnel_key *) compat;
2187 break; 2323 break;
2188 default: 2324 default:
2189 return -EINVAL; 2325 return -EINVAL;
@@ -2233,10 +2369,9 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
2233 .arg4_type = ARG_ANYTHING, 2369 .arg4_type = ARG_ANYTHING,
2234}; 2370};
2235 2371
2236static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) 2372BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
2373 const u8 *, from, u32, size)
2237{ 2374{
2238 struct sk_buff *skb = (struct sk_buff *) (long) r1;
2239 u8 *from = (u8 *) (long) r2;
2240 struct ip_tunnel_info *info = skb_tunnel_info(skb); 2375 struct ip_tunnel_info *info = skb_tunnel_info(skb);
2241 const struct metadata_dst *md = this_cpu_ptr(md_dst); 2376 const struct metadata_dst *md = this_cpu_ptr(md_dst);
2242 2377
@@ -2282,28 +2417,24 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
2282 } 2417 }
2283} 2418}
2284 2419
2285#ifdef CONFIG_SOCK_CGROUP_DATA 2420BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
2286static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 2421 u32, idx)
2287{ 2422{
2288 struct sk_buff *skb = (struct sk_buff *)(long)r1;
2289 struct bpf_map *map = (struct bpf_map *)(long)r2;
2290 struct bpf_array *array = container_of(map, struct bpf_array, map); 2423 struct bpf_array *array = container_of(map, struct bpf_array, map);
2291 struct cgroup *cgrp; 2424 struct cgroup *cgrp;
2292 struct sock *sk; 2425 struct sock *sk;
2293 u32 i = (u32)r3;
2294 2426
2295 sk = skb->sk; 2427 sk = skb_to_full_sk(skb);
2296 if (!sk || !sk_fullsock(sk)) 2428 if (!sk || !sk_fullsock(sk))
2297 return -ENOENT; 2429 return -ENOENT;
2298 2430 if (unlikely(idx >= array->map.max_entries))
2299 if (unlikely(i >= array->map.max_entries))
2300 return -E2BIG; 2431 return -E2BIG;
2301 2432
2302 cgrp = READ_ONCE(array->ptrs[i]); 2433 cgrp = READ_ONCE(array->ptrs[idx]);
2303 if (unlikely(!cgrp)) 2434 if (unlikely(!cgrp))
2304 return -EAGAIN; 2435 return -EAGAIN;
2305 2436
2306 return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp); 2437 return sk_under_cgroup_hierarchy(sk, cgrp);
2307} 2438}
2308 2439
2309static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { 2440static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
@@ -2314,7 +2445,38 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
2314 .arg2_type = ARG_CONST_MAP_PTR, 2445 .arg2_type = ARG_CONST_MAP_PTR,
2315 .arg3_type = ARG_ANYTHING, 2446 .arg3_type = ARG_ANYTHING,
2316}; 2447};
2317#endif 2448
2449static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
2450 unsigned long off, unsigned long len)
2451{
2452 memcpy(dst_buff, src_buff + off, len);
2453 return 0;
2454}
2455
2456BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
2457 u64, flags, void *, meta, u64, meta_size)
2458{
2459 u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
2460
2461 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
2462 return -EINVAL;
2463 if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
2464 return -EFAULT;
2465
2466 return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size,
2467 bpf_xdp_copy);
2468}
2469
2470static const struct bpf_func_proto bpf_xdp_event_output_proto = {
2471 .func = bpf_xdp_event_output,
2472 .gpl_only = true,
2473 .ret_type = RET_INTEGER,
2474 .arg1_type = ARG_PTR_TO_CTX,
2475 .arg2_type = ARG_CONST_MAP_PTR,
2476 .arg3_type = ARG_ANYTHING,
2477 .arg4_type = ARG_PTR_TO_STACK,
2478 .arg5_type = ARG_CONST_STACK_SIZE,
2479};
2318 2480
2319static const struct bpf_func_proto * 2481static const struct bpf_func_proto *
2320sk_filter_func_proto(enum bpf_func_id func_id) 2482sk_filter_func_proto(enum bpf_func_id func_id)
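
With bpf_xdp_event_output() wired up, XDP programs gain the same perf-event output path that tc programs already had, including packet-payload appending via the upper 32 bits of the flags word. A hypothetical sampler, following the samples/bpf map and section conventions (the map definition layout and loader side are assumptions, not shown by this patch):

#include <linux/bpf.h>

struct bpf_map_def {
        unsigned int type;
        unsigned int key_size;
        unsigned int value_size;
        unsigned int max_entries;
};

struct bpf_map_def __attribute__((section("maps"), used)) xdp_events = {
        .type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size    = sizeof(int),
        .value_size  = sizeof(__u32),
        .max_entries = 64,
};

static int (*bpf_perf_event_output)(void *ctx, void *map, __u64 flags,
                                    void *data, __u64 size) =
        (void *) BPF_FUNC_perf_event_output;

__attribute__((section("xdp"), used))
int xdp_sample(struct xdp_md *ctx)
{
        __u16 pkt_len = ctx->data_end - ctx->data;
        /* Upper 32 bits of flags select how many packet bytes to append
         * behind the metadata; they must not exceed the packet length.
         */
        __u64 flags = BPF_F_CURRENT_CPU | ((__u64)64 << 32);

        if (pkt_len >= 64)
                bpf_perf_event_output(ctx, &xdp_events, flags,
                                      &pkt_len, sizeof(pkt_len));
        return XDP_PASS;
}

/* The helper is gpl_only, so the program must carry a GPL license. */
char _license[] __attribute__((section("license"), used)) = "GPL";
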
@@ -2350,8 +2512,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
2350 return &bpf_skb_store_bytes_proto; 2512 return &bpf_skb_store_bytes_proto;
2351 case BPF_FUNC_skb_load_bytes: 2513 case BPF_FUNC_skb_load_bytes:
2352 return &bpf_skb_load_bytes_proto; 2514 return &bpf_skb_load_bytes_proto;
2515 case BPF_FUNC_skb_pull_data:
2516 return &bpf_skb_pull_data_proto;
2353 case BPF_FUNC_csum_diff: 2517 case BPF_FUNC_csum_diff:
2354 return &bpf_csum_diff_proto; 2518 return &bpf_csum_diff_proto;
2519 case BPF_FUNC_csum_update:
2520 return &bpf_csum_update_proto;
2355 case BPF_FUNC_l3_csum_replace: 2521 case BPF_FUNC_l3_csum_replace:
2356 return &bpf_l3_csum_replace_proto; 2522 return &bpf_l3_csum_replace_proto;
2357 case BPF_FUNC_l4_csum_replace: 2523 case BPF_FUNC_l4_csum_replace:
@@ -2368,6 +2534,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
2368 return &bpf_skb_change_proto_proto; 2534 return &bpf_skb_change_proto_proto;
2369 case BPF_FUNC_skb_change_type: 2535 case BPF_FUNC_skb_change_type:
2370 return &bpf_skb_change_type_proto; 2536 return &bpf_skb_change_type_proto;
2537 case BPF_FUNC_skb_change_tail:
2538 return &bpf_skb_change_tail_proto;
2371 case BPF_FUNC_skb_get_tunnel_key: 2539 case BPF_FUNC_skb_get_tunnel_key:
2372 return &bpf_skb_get_tunnel_key_proto; 2540 return &bpf_skb_get_tunnel_key_proto;
2373 case BPF_FUNC_skb_set_tunnel_key: 2541 case BPF_FUNC_skb_set_tunnel_key:
@@ -2382,14 +2550,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
2382 return &bpf_get_route_realm_proto; 2550 return &bpf_get_route_realm_proto;
2383 case BPF_FUNC_get_hash_recalc: 2551 case BPF_FUNC_get_hash_recalc:
2384 return &bpf_get_hash_recalc_proto; 2552 return &bpf_get_hash_recalc_proto;
2553 case BPF_FUNC_set_hash_invalid:
2554 return &bpf_set_hash_invalid_proto;
2385 case BPF_FUNC_perf_event_output: 2555 case BPF_FUNC_perf_event_output:
2386 return &bpf_skb_event_output_proto; 2556 return &bpf_skb_event_output_proto;
2387 case BPF_FUNC_get_smp_processor_id: 2557 case BPF_FUNC_get_smp_processor_id:
2388 return &bpf_get_smp_processor_id_proto; 2558 return &bpf_get_smp_processor_id_proto;
2389#ifdef CONFIG_SOCK_CGROUP_DATA
2390 case BPF_FUNC_skb_under_cgroup: 2559 case BPF_FUNC_skb_under_cgroup:
2391 return &bpf_skb_under_cgroup_proto; 2560 return &bpf_skb_under_cgroup_proto;
2392#endif
2393 default: 2561 default:
2394 return sk_filter_func_proto(func_id); 2562 return sk_filter_func_proto(func_id);
2395 } 2563 }
@@ -2398,7 +2566,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
2398static const struct bpf_func_proto * 2566static const struct bpf_func_proto *
2399xdp_func_proto(enum bpf_func_id func_id) 2567xdp_func_proto(enum bpf_func_id func_id)
2400{ 2568{
2401 return sk_filter_func_proto(func_id); 2569 switch (func_id) {
2570 case BPF_FUNC_perf_event_output:
2571 return &bpf_xdp_event_output_proto;
2572 case BPF_FUNC_get_smp_processor_id:
2573 return &bpf_get_smp_processor_id_proto;
2574 default:
2575 return sk_filter_func_proto(func_id);
2576 }
2402} 2577}
2403 2578
2404static bool __is_valid_access(int off, int size, enum bpf_access_type type) 2579static bool __is_valid_access(int off, int size, enum bpf_access_type type)
@@ -2438,6 +2613,45 @@ static bool sk_filter_is_valid_access(int off, int size,
2438 return __is_valid_access(off, size, type); 2613 return __is_valid_access(off, size, type);
2439} 2614}
2440 2615
2616static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
2617 const struct bpf_prog *prog)
2618{
2619 struct bpf_insn *insn = insn_buf;
2620
2621 if (!direct_write)
2622 return 0;
2623
2624 /* if (!skb->cloned)
2625 * goto start;
2626 *
2627 * (Fast-path, otherwise approximation that we might be
2628 * a clone, do the rest in helper.)
2629 */
2630 *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
2631 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
2632 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
2633
2634 /* ret = bpf_skb_pull_data(skb, 0); */
2635 *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
2636 *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
2637 *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
2638 BPF_FUNC_skb_pull_data);
2639 /* if (!ret)
2640 * goto restore;
2641 * return TC_ACT_SHOT;
2642 */
2643 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
2644 *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
2645 *insn++ = BPF_EXIT_INSN();
2646
2647 /* restore: */
2648 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
2649 /* start: */
2650 *insn++ = prog->insnsi[0];
2651
2652 return insn - insn_buf;
2653}
2654
2441static bool tc_cls_act_is_valid_access(int off, int size, 2655static bool tc_cls_act_is_valid_access(int off, int size,
2442 enum bpf_access_type type, 2656 enum bpf_access_type type,
2443 enum bpf_reg_type *reg_type) 2657 enum bpf_reg_type *reg_type)
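
Reading the raw instruction sequence is easier with its C equivalent in mind. Roughly, the generated prologue behaves like the sketch below; this is an illustrative rendering only, since the real check reads the cloned bit through CLONED_OFFSET()/CLONED_MASK and the call goes through the BPF calling convention rather than a direct C call:

/* Returning 0 stands for "fall through into the program's first insn". */
static int tc_cls_act_prologue_in_c(struct sk_buff *skb)
{
        if (!skb->cloned)
                return 0;                      /* fast path, skb untouched */
        if (bpf_skb_pull_data(skb, 0))         /* unclone/pull the head */
                return TC_ACT_SHOT;            /* abort: drop the packet */
        return 0;                              /* ctx restored, program runs */
}
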
@@ -2475,7 +2689,7 @@ static bool __is_valid_xdp_access(int off, int size,
2475 return false; 2689 return false;
2476 if (off % size != 0) 2690 if (off % size != 0)
2477 return false; 2691 return false;
2478 if (size != 4) 2692 if (size != sizeof(__u32))
2479 return false; 2693 return false;
2480 2694
2481 return true; 2695 return true;
@@ -2506,10 +2720,10 @@ void bpf_warn_invalid_xdp_action(u32 act)
2506} 2720}
2507EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); 2721EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
2508 2722
2509static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, 2723static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2510 int src_reg, int ctx_off, 2724 int src_reg, int ctx_off,
2511 struct bpf_insn *insn_buf, 2725 struct bpf_insn *insn_buf,
2512 struct bpf_prog *prog) 2726 struct bpf_prog *prog)
2513{ 2727{
2514 struct bpf_insn *insn = insn_buf; 2728 struct bpf_insn *insn = insn_buf;
2515 2729
@@ -2556,7 +2770,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2556 case offsetof(struct __sk_buff, ifindex): 2770 case offsetof(struct __sk_buff, ifindex):
2557 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 2771 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
2558 2772
2559 *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), 2773 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
2560 dst_reg, src_reg, 2774 dst_reg, src_reg,
2561 offsetof(struct sk_buff, dev)); 2775 offsetof(struct sk_buff, dev));
2562 *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); 2776 *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
@@ -2597,7 +2811,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2597 dst_reg, src_reg, insn); 2811 dst_reg, src_reg, insn);
2598 2812
2599 case offsetof(struct __sk_buff, cb[0]) ... 2813 case offsetof(struct __sk_buff, cb[0]) ...
2600 offsetof(struct __sk_buff, cb[4]): 2814 offsetof(struct __sk_buff, cb[4]):
2601 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); 2815 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
2602 2816
2603 prog->cb_access = 1; 2817 prog->cb_access = 1;
@@ -2621,7 +2835,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2621 break; 2835 break;
2622 2836
2623 case offsetof(struct __sk_buff, data): 2837 case offsetof(struct __sk_buff, data):
2624 *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)), 2838 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
2625 dst_reg, src_reg, 2839 dst_reg, src_reg,
2626 offsetof(struct sk_buff, data)); 2840 offsetof(struct sk_buff, data));
2627 break; 2841 break;
@@ -2630,8 +2844,8 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2630 ctx_off -= offsetof(struct __sk_buff, data_end); 2844 ctx_off -= offsetof(struct __sk_buff, data_end);
2631 ctx_off += offsetof(struct sk_buff, cb); 2845 ctx_off += offsetof(struct sk_buff, cb);
2632 ctx_off += offsetof(struct bpf_skb_data_end, data_end); 2846 ctx_off += offsetof(struct bpf_skb_data_end, data_end);
2633 *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)), 2847 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
2634 dst_reg, src_reg, ctx_off); 2848 ctx_off);
2635 break; 2849 break;
2636 2850
2637 case offsetof(struct __sk_buff, tc_index): 2851 case offsetof(struct __sk_buff, tc_index):
@@ -2657,6 +2871,31 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2657 return insn - insn_buf; 2871 return insn - insn_buf;
2658} 2872}
2659 2873
2874static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2875 int src_reg, int ctx_off,
2876 struct bpf_insn *insn_buf,
2877 struct bpf_prog *prog)
2878{
2879 struct bpf_insn *insn = insn_buf;
2880
2881 switch (ctx_off) {
2882 case offsetof(struct __sk_buff, ifindex):
2883 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
2884
2885 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
2886 dst_reg, src_reg,
2887 offsetof(struct sk_buff, dev));
2888 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
2889 offsetof(struct net_device, ifindex));
2890 break;
2891 default:
2892 return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
2893 ctx_off, insn_buf, prog);
2894 }
2895
2896 return insn - insn_buf;
2897}
2898
2660static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, 2899static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2661 int src_reg, int ctx_off, 2900 int src_reg, int ctx_off,
2662 struct bpf_insn *insn_buf, 2901 struct bpf_insn *insn_buf,
@@ -2666,12 +2905,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2666 2905
2667 switch (ctx_off) { 2906 switch (ctx_off) {
2668 case offsetof(struct xdp_md, data): 2907 case offsetof(struct xdp_md, data):
2669 *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)), 2908 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
2670 dst_reg, src_reg, 2909 dst_reg, src_reg,
2671 offsetof(struct xdp_buff, data)); 2910 offsetof(struct xdp_buff, data));
2672 break; 2911 break;
2673 case offsetof(struct xdp_md, data_end): 2912 case offsetof(struct xdp_md, data_end):
2674 *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)), 2913 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
2675 dst_reg, src_reg, 2914 dst_reg, src_reg,
2676 offsetof(struct xdp_buff, data_end)); 2915 offsetof(struct xdp_buff, data_end));
2677 break; 2916 break;
@@ -2683,13 +2922,14 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2683static const struct bpf_verifier_ops sk_filter_ops = { 2922static const struct bpf_verifier_ops sk_filter_ops = {
2684 .get_func_proto = sk_filter_func_proto, 2923 .get_func_proto = sk_filter_func_proto,
2685 .is_valid_access = sk_filter_is_valid_access, 2924 .is_valid_access = sk_filter_is_valid_access,
2686 .convert_ctx_access = bpf_net_convert_ctx_access, 2925 .convert_ctx_access = sk_filter_convert_ctx_access,
2687}; 2926};
2688 2927
2689static const struct bpf_verifier_ops tc_cls_act_ops = { 2928static const struct bpf_verifier_ops tc_cls_act_ops = {
2690 .get_func_proto = tc_cls_act_func_proto, 2929 .get_func_proto = tc_cls_act_func_proto,
2691 .is_valid_access = tc_cls_act_is_valid_access, 2930 .is_valid_access = tc_cls_act_is_valid_access,
2692 .convert_ctx_access = bpf_net_convert_ctx_access, 2931 .convert_ctx_access = tc_cls_act_convert_ctx_access,
2932 .gen_prologue = tc_cls_act_prologue,
2693}; 2933};
2694 2934
2695static const struct bpf_verifier_ops xdp_ops = { 2935static const struct bpf_verifier_ops xdp_ops = {
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 52742a02814f..1a7b80f73376 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -6,6 +6,8 @@
6#include <linux/if_vlan.h> 6#include <linux/if_vlan.h>
7#include <net/ip.h> 7#include <net/ip.h>
8#include <net/ipv6.h> 8#include <net/ipv6.h>
9#include <net/gre.h>
10#include <net/pptp.h>
9#include <linux/igmp.h> 11#include <linux/igmp.h>
10#include <linux/icmp.h> 12#include <linux/icmp.h>
11#include <linux/sctp.h> 13#include <linux/sctp.h>
@@ -116,13 +118,16 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
116 struct flow_dissector_key_addrs *key_addrs; 118 struct flow_dissector_key_addrs *key_addrs;
117 struct flow_dissector_key_ports *key_ports; 119 struct flow_dissector_key_ports *key_ports;
118 struct flow_dissector_key_tags *key_tags; 120 struct flow_dissector_key_tags *key_tags;
121 struct flow_dissector_key_vlan *key_vlan;
119 struct flow_dissector_key_keyid *key_keyid; 122 struct flow_dissector_key_keyid *key_keyid;
123 bool skip_vlan = false;
120 u8 ip_proto = 0; 124 u8 ip_proto = 0;
121 bool ret = false; 125 bool ret = false;
122 126
123 if (!data) { 127 if (!data) {
124 data = skb->data; 128 data = skb->data;
125 proto = skb->protocol; 129 proto = skb_vlan_tag_present(skb) ?
130 skb->vlan_proto : skb->protocol;
126 nhoff = skb_network_offset(skb); 131 nhoff = skb_network_offset(skb);
127 hlen = skb_headlen(skb); 132 hlen = skb_headlen(skb);
128 } 133 }
@@ -241,23 +246,45 @@ ipv6:
241 case htons(ETH_P_8021AD): 246 case htons(ETH_P_8021AD):
242 case htons(ETH_P_8021Q): { 247 case htons(ETH_P_8021Q): {
243 const struct vlan_hdr *vlan; 248 const struct vlan_hdr *vlan;
244 struct vlan_hdr _vlan;
245 249
246 vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); 250 if (skb_vlan_tag_present(skb))
247 if (!vlan) 251 proto = skb->protocol;
248 goto out_bad; 252
253 if (!skb_vlan_tag_present(skb) ||
254 proto == cpu_to_be16(ETH_P_8021Q) ||
255 proto == cpu_to_be16(ETH_P_8021AD)) {
256 struct vlan_hdr _vlan;
249 257
258 vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
259 data, hlen, &_vlan);
260 if (!vlan)
261 goto out_bad;
262 proto = vlan->h_vlan_encapsulated_proto;
263 nhoff += sizeof(*vlan);
264 if (skip_vlan)
265 goto again;
266 }
267
268 skip_vlan = true;
250 if (dissector_uses_key(flow_dissector, 269 if (dissector_uses_key(flow_dissector,
251 FLOW_DISSECTOR_KEY_VLANID)) { 270 FLOW_DISSECTOR_KEY_VLAN)) {
252 key_tags = skb_flow_dissector_target(flow_dissector, 271 key_vlan = skb_flow_dissector_target(flow_dissector,
253 FLOW_DISSECTOR_KEY_VLANID, 272 FLOW_DISSECTOR_KEY_VLAN,
254 target_container); 273 target_container);
255 274
256 key_tags->vlan_id = skb_vlan_tag_get_id(skb); 275 if (skb_vlan_tag_present(skb)) {
276 key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
277 key_vlan->vlan_priority =
278 (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT);
279 } else {
280 key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
281 VLAN_VID_MASK;
282 key_vlan->vlan_priority =
283 (ntohs(vlan->h_vlan_TCI) &
284 VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
285 }
257 } 286 }
258 287
259 proto = vlan->h_vlan_encapsulated_proto;
260 nhoff += sizeof(*vlan);
261 goto again; 288 goto again;
262 } 289 }
263 case htons(ETH_P_PPP_SES): { 290 case htons(ETH_P_PPP_SES): {
@@ -338,32 +365,42 @@ mpls:
338ip_proto_again: 365ip_proto_again:
339 switch (ip_proto) { 366 switch (ip_proto) {
340 case IPPROTO_GRE: { 367 case IPPROTO_GRE: {
341 struct gre_hdr { 368 struct gre_base_hdr *hdr, _hdr;
342 __be16 flags; 369 u16 gre_ver;
343 __be16 proto; 370 int offset = 0;
344 } *hdr, _hdr;
345 371
346 hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); 372 hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
347 if (!hdr) 373 if (!hdr)
348 goto out_bad; 374 goto out_bad;
349 /* 375
350 * Only look inside GRE if version zero and no 376 /* Only look inside GRE without routing */
351 * routing 377 if (hdr->flags & GRE_ROUTING)
352 */
353 if (hdr->flags & (GRE_VERSION | GRE_ROUTING))
354 break; 378 break;
355 379
356 proto = hdr->proto; 380 /* Only look inside GRE for version 0 and 1 */
357 nhoff += 4; 381 gre_ver = ntohs(hdr->flags & GRE_VERSION);
382 if (gre_ver > 1)
383 break;
384
385 proto = hdr->protocol;
386 if (gre_ver) {
387 /* Version1 must be PPTP, and check the flags */
388 if (!(proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY)))
389 break;
390 }
391
392 offset += sizeof(struct gre_base_hdr);
393
358 if (hdr->flags & GRE_CSUM) 394 if (hdr->flags & GRE_CSUM)
359 nhoff += 4; 395 offset += sizeof(((struct gre_full_hdr *)0)->csum) +
396 sizeof(((struct gre_full_hdr *)0)->reserved1);
397
360 if (hdr->flags & GRE_KEY) { 398 if (hdr->flags & GRE_KEY) {
361 const __be32 *keyid; 399 const __be32 *keyid;
362 __be32 _keyid; 400 __be32 _keyid;
363 401
364 keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid), 402 keyid = __skb_header_pointer(skb, nhoff + offset, sizeof(_keyid),
365 data, hlen, &_keyid); 403 data, hlen, &_keyid);
366
367 if (!keyid) 404 if (!keyid)
368 goto out_bad; 405 goto out_bad;
369 406
@@ -372,32 +409,65 @@ ip_proto_again:
372 key_keyid = skb_flow_dissector_target(flow_dissector, 409 key_keyid = skb_flow_dissector_target(flow_dissector,
373 FLOW_DISSECTOR_KEY_GRE_KEYID, 410 FLOW_DISSECTOR_KEY_GRE_KEYID,
374 target_container); 411 target_container);
375 key_keyid->keyid = *keyid; 412 if (gre_ver == 0)
413 key_keyid->keyid = *keyid;
414 else
415 key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
376 } 416 }
377 nhoff += 4; 417 offset += sizeof(((struct gre_full_hdr *)0)->key);
378 } 418 }
419
379 if (hdr->flags & GRE_SEQ) 420 if (hdr->flags & GRE_SEQ)
380 nhoff += 4; 421 offset += sizeof(((struct pptp_gre_header *)0)->seq);
381 if (proto == htons(ETH_P_TEB)) { 422
382 const struct ethhdr *eth; 423 if (gre_ver == 0) {
383 struct ethhdr _eth; 424 if (proto == htons(ETH_P_TEB)) {
384 425 const struct ethhdr *eth;
385 eth = __skb_header_pointer(skb, nhoff, 426 struct ethhdr _eth;
386 sizeof(_eth), 427
387 data, hlen, &_eth); 428 eth = __skb_header_pointer(skb, nhoff + offset,
388 if (!eth) 429 sizeof(_eth),
430 data, hlen, &_eth);
431 if (!eth)
432 goto out_bad;
433 proto = eth->h_proto;
434 offset += sizeof(*eth);
435
436 /* Cap headers that we access via pointers at the
437 * end of the Ethernet header as our maximum alignment
438 * at that point is only 2 bytes.
439 */
440 if (NET_IP_ALIGN)
441 hlen = (nhoff + offset);
442 }
443 } else { /* version 1, must be PPTP */
444 u8 _ppp_hdr[PPP_HDRLEN];
445 u8 *ppp_hdr;
446
447 if (hdr->flags & GRE_ACK)
448 offset += sizeof(((struct pptp_gre_header *)0)->ack);
449
450 ppp_hdr = skb_header_pointer(skb, nhoff + offset,
451 sizeof(_ppp_hdr), _ppp_hdr);
452 if (!ppp_hdr)
389 goto out_bad; 453 goto out_bad;
390 proto = eth->h_proto; 454
391 nhoff += sizeof(*eth); 455 switch (PPP_PROTOCOL(ppp_hdr)) {
392 456 case PPP_IP:
393 /* Cap headers that we access via pointers at the 457 proto = htons(ETH_P_IP);
394 * end of the Ethernet header as our maximum alignment 458 break;
395 * at that point is only 2 bytes. 459 case PPP_IPV6:
396 */ 460 proto = htons(ETH_P_IPV6);
397 if (NET_IP_ALIGN) 461 break;
398 hlen = nhoff; 462 default:
463 /* Could probably catch some more like MPLS */
464 break;
465 }
466
467 offset += PPP_HDRLEN;
399 } 468 }
400 469
470 nhoff += offset;
401 key_control->flags |= FLOW_DIS_ENCAPSULATION; 471 key_control->flags |= FLOW_DIS_ENCAPSULATION;
402 if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) 472 if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
403 goto out_good; 473 goto out_good;
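
The rewritten GRE branch walks the optional fields in the order mandated by RFC 2784/2890 (checksum plus reserved, key, sequence), and for version 1 it applies the PPTP layout from RFC 2637, which adds an acknowledgment field. The version-0 offset accounting amounts to the sketch below, assuming the in-tree gre_base_hdr and GRE_* definitions:

/*   +---------------+---------------+
 *   | flags/version |   protocol    |   4 bytes (gre_base_hdr)
 *   +---------------+---------------+
 *   |   checksum    |   reserved1   |   4 bytes, present if GRE_CSUM
 *   +-------------------------------+
 *   |              key              |   4 bytes, present if GRE_KEY
 *   +-------------------------------+
 *   |           sequence            |   4 bytes, present if GRE_SEQ
 *   +-------------------------------+
 */
static unsigned int gre_v0_payload_offset(__be16 flags)
{
        unsigned int offset = sizeof(struct gre_base_hdr);

        if (flags & GRE_CSUM)
                offset += 4;    /* csum + reserved1 */
        if (flags & GRE_KEY)
                offset += 4;
        if (flags & GRE_SEQ)
                offset += 4;
        return offset;
}
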
@@ -874,8 +944,8 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
874 .offset = offsetof(struct flow_keys, ports), 944 .offset = offsetof(struct flow_keys, ports),
875 }, 945 },
876 { 946 {
877 .key_id = FLOW_DISSECTOR_KEY_VLANID, 947 .key_id = FLOW_DISSECTOR_KEY_VLAN,
878 .offset = offsetof(struct flow_keys, tags), 948 .offset = offsetof(struct flow_keys, vlan),
879 }, 949 },
880 { 950 {
881 .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, 951 .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
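The reworked GRE branch above stops advancing nhoff field by field: it accumulates a local offset across the base header and the optional checksum, key and sequence words, reads the version bits to tell plain GRE (version 0) from PPTP (version 1), and for PPTP translates the PPP protocol field back into ETH_P_IP or ETH_P_IPV6 before re-dissecting. A stand-alone sketch of the same optional-field walk, with flag values mirroring include/uapi/linux/if_tunnel.h (illustration only, not the kernel code path; version 1 additionally adds an acknowledgement word when GRE_ACK is set, as in the hunk above):

#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>

#define GRE_CSUM	htons(0x8000)
#define GRE_KEY		htons(0x2000)
#define GRE_SEQ		htons(0x1000)

/* Offset of the encapsulated payload from the start of the GRE header,
 * walking the optional fields in the order the dissector does. */
static size_t gre_payload_offset(uint16_t flags)	/* flags in network byte order */
{
	size_t offset = 4;		/* base header: flags + protocol */

	if (flags & GRE_CSUM)
		offset += 4;		/* checksum + reserved1 */
	if (flags & GRE_KEY)
		offset += 4;		/* key (PPTP: payload length + call ID) */
	if (flags & GRE_SEQ)
		offset += 4;		/* sequence number */
	return offset;
}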
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 669ecc9f884e..e5f84c26ba1a 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -251,6 +251,41 @@ drop:
251} 251}
252EXPORT_SYMBOL(lwtunnel_output); 252EXPORT_SYMBOL(lwtunnel_output);
253 253
254int lwtunnel_xmit(struct sk_buff *skb)
255{
256 struct dst_entry *dst = skb_dst(skb);
257 const struct lwtunnel_encap_ops *ops;
258 struct lwtunnel_state *lwtstate;
259 int ret = -EINVAL;
260
261 if (!dst)
262 goto drop;
263
264 lwtstate = dst->lwtstate;
265
266 if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
267 lwtstate->type > LWTUNNEL_ENCAP_MAX)
268 return 0;
269
270 ret = -EOPNOTSUPP;
271 rcu_read_lock();
272 ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
273 if (likely(ops && ops->xmit))
274 ret = ops->xmit(skb);
275 rcu_read_unlock();
276
277 if (ret == -EOPNOTSUPP)
278 goto drop;
279
280 return ret;
281
282drop:
283 kfree_skb(skb);
284
285 return ret;
286}
287EXPORT_SYMBOL(lwtunnel_xmit);
288
254int lwtunnel_input(struct sk_buff *skb) 289int lwtunnel_input(struct sk_buff *skb)
255{ 290{
256 struct dst_entry *dst = skb_dst(skb); 291 struct dst_entry *dst = skb_dst(skb);
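lwtunnel_xmit(), added above, completes the set alongside lwtunnel_output() and lwtunnel_input(): it resolves the encap ops for the dst's lwtstate under RCU, hands the skb to ops->xmit(), and frees the packet itself when no handler is registered. A hedged caller sketch; whether a non-negative return means "transmitted" or "keep going" is up to the encap's own ->xmit() convention, so the caller below is illustrative only:

/* Illustrative caller: a negative return means lwtunnel_xmit() already
 * freed the skb; otherwise the surrounding output path decides, based on
 * the encap type, whether the packet was consumed or should continue. */
static int example_finish_output(struct net *net, struct sock *sk,
				 struct sk_buff *skb)
{
	int ret = lwtunnel_xmit(skb);

	if (ret < 0)
		return ret;	/* packet dropped inside the helper */

	/* continue with neighbour resolution / device transmit here */
	return 0;
}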
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index cf26e04c4046..2ae929f9bd06 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1148,7 +1148,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
1148 } else 1148 } else
1149 goto out; 1149 goto out;
1150 } else { 1150 } else {
1151 if (lladdr == neigh->ha && new == NUD_STALE) 1151 if (lladdr == neigh->ha && new == NUD_STALE &&
1152 !(flags & NEIGH_UPDATE_F_ADMIN))
1152 new = old; 1153 new = old;
1153 } 1154 }
1154 } 1155 }
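The extra NEIGH_UPDATE_F_ADMIN test above keeps administrative neighbour updates from being collapsed back to the old state when the link-layer address is unchanged. A minimal sketch of the kind of update this preserves (the neigh_update() signature is the one in the hunk header above):

static int example_admin_update(struct neighbour *neigh, const u8 *lladdr)
{
	/* e.g. "ip neigh replace ... nud stale": now takes effect even when
	 * lladdr matches the cached address, because the ADMIN flag skips
	 * the new = old shortcut added above. */
	return neigh_update(neigh, lladdr, NUD_STALE,
			    NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN);
}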
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..989434f36f96 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -37,6 +37,8 @@ struct net init_net = {
37}; 37};
38EXPORT_SYMBOL(init_net); 38EXPORT_SYMBOL(init_net);
39 39
40static bool init_net_initialized;
41
40#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ 42#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
41 43
42static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; 44static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
@@ -213,31 +215,29 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id);
213 */ 215 */
214int peernet2id_alloc(struct net *net, struct net *peer) 216int peernet2id_alloc(struct net *net, struct net *peer)
215{ 217{
216 unsigned long flags;
217 bool alloc; 218 bool alloc;
218 int id; 219 int id;
219 220
220 spin_lock_irqsave(&net->nsid_lock, flags); 221 spin_lock_bh(&net->nsid_lock);
221 alloc = atomic_read(&peer->count) == 0 ? false : true; 222 alloc = atomic_read(&peer->count) == 0 ? false : true;
222 id = __peernet2id_alloc(net, peer, &alloc); 223 id = __peernet2id_alloc(net, peer, &alloc);
223 spin_unlock_irqrestore(&net->nsid_lock, flags); 224 spin_unlock_bh(&net->nsid_lock);
224 if (alloc && id >= 0) 225 if (alloc && id >= 0)
225 rtnl_net_notifyid(net, RTM_NEWNSID, id); 226 rtnl_net_notifyid(net, RTM_NEWNSID, id);
226 return id; 227 return id;
227} 228}
228EXPORT_SYMBOL(peernet2id_alloc);
229 229
230/* This function returns, if assigned, the id of a peer netns. */ 230/* This function returns, if assigned, the id of a peer netns. */
231int peernet2id(struct net *net, struct net *peer) 231int peernet2id(struct net *net, struct net *peer)
232{ 232{
233 unsigned long flags;
234 int id; 233 int id;
235 234
236 spin_lock_irqsave(&net->nsid_lock, flags); 235 spin_lock_bh(&net->nsid_lock);
237 id = __peernet2id(net, peer); 236 id = __peernet2id(net, peer);
238 spin_unlock_irqrestore(&net->nsid_lock, flags); 237 spin_unlock_bh(&net->nsid_lock);
239 return id; 238 return id;
240} 239}
240EXPORT_SYMBOL(peernet2id);
241 241
242/* This function returns true is the peer netns has an id assigned into the 242/* This function returns true is the peer netns has an id assigned into the
243 * current netns. 243 * current netns.
@@ -249,18 +249,17 @@ bool peernet_has_id(struct net *net, struct net *peer)
249 249
250struct net *get_net_ns_by_id(struct net *net, int id) 250struct net *get_net_ns_by_id(struct net *net, int id)
251{ 251{
252 unsigned long flags;
253 struct net *peer; 252 struct net *peer;
254 253
255 if (id < 0) 254 if (id < 0)
256 return NULL; 255 return NULL;
257 256
258 rcu_read_lock(); 257 rcu_read_lock();
259 spin_lock_irqsave(&net->nsid_lock, flags); 258 spin_lock_bh(&net->nsid_lock);
260 peer = idr_find(&net->netns_ids, id); 259 peer = idr_find(&net->netns_ids, id);
261 if (peer) 260 if (peer)
262 get_net(peer); 261 get_net(peer);
263 spin_unlock_irqrestore(&net->nsid_lock, flags); 262 spin_unlock_bh(&net->nsid_lock);
264 rcu_read_unlock(); 263 rcu_read_unlock();
265 264
266 return peer; 265 return peer;
@@ -310,6 +309,16 @@ out_undo:
310 309
311 310
312#ifdef CONFIG_NET_NS 311#ifdef CONFIG_NET_NS
312static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
313{
314 return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
315}
316
317static void dec_net_namespaces(struct ucounts *ucounts)
318{
319 dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
320}
321
313static struct kmem_cache *net_cachep; 322static struct kmem_cache *net_cachep;
314static struct workqueue_struct *netns_wq; 323static struct workqueue_struct *netns_wq;
315 324
@@ -351,19 +360,27 @@ void net_drop_ns(void *p)
351struct net *copy_net_ns(unsigned long flags, 360struct net *copy_net_ns(unsigned long flags,
352 struct user_namespace *user_ns, struct net *old_net) 361 struct user_namespace *user_ns, struct net *old_net)
353{ 362{
363 struct ucounts *ucounts;
354 struct net *net; 364 struct net *net;
355 int rv; 365 int rv;
356 366
357 if (!(flags & CLONE_NEWNET)) 367 if (!(flags & CLONE_NEWNET))
358 return get_net(old_net); 368 return get_net(old_net);
359 369
370 ucounts = inc_net_namespaces(user_ns);
371 if (!ucounts)
372 return ERR_PTR(-ENOSPC);
373
360 net = net_alloc(); 374 net = net_alloc();
361 if (!net) 375 if (!net) {
376 dec_net_namespaces(ucounts);
362 return ERR_PTR(-ENOMEM); 377 return ERR_PTR(-ENOMEM);
378 }
363 379
364 get_user_ns(user_ns); 380 get_user_ns(user_ns);
365 381
366 mutex_lock(&net_mutex); 382 mutex_lock(&net_mutex);
383 net->ucounts = ucounts;
367 rv = setup_net(net, user_ns); 384 rv = setup_net(net, user_ns);
368 if (rv == 0) { 385 if (rv == 0) {
369 rtnl_lock(); 386 rtnl_lock();
@@ -372,6 +389,7 @@ struct net *copy_net_ns(unsigned long flags,
372 } 389 }
373 mutex_unlock(&net_mutex); 390 mutex_unlock(&net_mutex);
374 if (rv < 0) { 391 if (rv < 0) {
392 dec_net_namespaces(ucounts);
375 put_user_ns(user_ns); 393 put_user_ns(user_ns);
376 net_drop_ns(net); 394 net_drop_ns(net);
377 return ERR_PTR(rv); 395 return ERR_PTR(rv);
@@ -404,17 +422,17 @@ static void cleanup_net(struct work_struct *work)
404 for_each_net(tmp) { 422 for_each_net(tmp) {
405 int id; 423 int id;
406 424
407 spin_lock_irq(&tmp->nsid_lock); 425 spin_lock_bh(&tmp->nsid_lock);
408 id = __peernet2id(tmp, net); 426 id = __peernet2id(tmp, net);
409 if (id >= 0) 427 if (id >= 0)
410 idr_remove(&tmp->netns_ids, id); 428 idr_remove(&tmp->netns_ids, id);
411 spin_unlock_irq(&tmp->nsid_lock); 429 spin_unlock_bh(&tmp->nsid_lock);
412 if (id >= 0) 430 if (id >= 0)
413 rtnl_net_notifyid(tmp, RTM_DELNSID, id); 431 rtnl_net_notifyid(tmp, RTM_DELNSID, id);
414 } 432 }
415 spin_lock_irq(&net->nsid_lock); 433 spin_lock_bh(&net->nsid_lock);
416 idr_destroy(&net->netns_ids); 434 idr_destroy(&net->netns_ids);
417 spin_unlock_irq(&net->nsid_lock); 435 spin_unlock_bh(&net->nsid_lock);
418 436
419 } 437 }
420 rtnl_unlock(); 438 rtnl_unlock();
@@ -444,6 +462,7 @@ static void cleanup_net(struct work_struct *work)
444 /* Finally it is safe to free my network namespace structure */ 462 /* Finally it is safe to free my network namespace structure */
445 list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { 463 list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
446 list_del_init(&net->exit_list); 464 list_del_init(&net->exit_list);
465 dec_net_namespaces(net->ucounts);
447 put_user_ns(net->user_ns); 466 put_user_ns(net->user_ns);
448 net_drop_ns(net); 467 net_drop_ns(net);
449 } 468 }
@@ -531,7 +550,7 @@ static struct pernet_operations __net_initdata net_ns_ops = {
531 .exit = net_ns_net_exit, 550 .exit = net_ns_net_exit,
532}; 551};
533 552
534static struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { 553static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
535 [NETNSA_NONE] = { .type = NLA_UNSPEC }, 554 [NETNSA_NONE] = { .type = NLA_UNSPEC },
536 [NETNSA_NSID] = { .type = NLA_S32 }, 555 [NETNSA_NSID] = { .type = NLA_S32 },
537 [NETNSA_PID] = { .type = NLA_U32 }, 556 [NETNSA_PID] = { .type = NLA_U32 },
@@ -542,7 +561,6 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
542{ 561{
543 struct net *net = sock_net(skb->sk); 562 struct net *net = sock_net(skb->sk);
544 struct nlattr *tb[NETNSA_MAX + 1]; 563 struct nlattr *tb[NETNSA_MAX + 1];
545 unsigned long flags;
546 struct net *peer; 564 struct net *peer;
547 int nsid, err; 565 int nsid, err;
548 566
@@ -563,15 +581,15 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
563 if (IS_ERR(peer)) 581 if (IS_ERR(peer))
564 return PTR_ERR(peer); 582 return PTR_ERR(peer);
565 583
566 spin_lock_irqsave(&net->nsid_lock, flags); 584 spin_lock_bh(&net->nsid_lock);
567 if (__peernet2id(net, peer) >= 0) { 585 if (__peernet2id(net, peer) >= 0) {
568 spin_unlock_irqrestore(&net->nsid_lock, flags); 586 spin_unlock_bh(&net->nsid_lock);
569 err = -EEXIST; 587 err = -EEXIST;
570 goto out; 588 goto out;
571 } 589 }
572 590
573 err = alloc_netid(net, peer, nsid); 591 err = alloc_netid(net, peer, nsid);
574 spin_unlock_irqrestore(&net->nsid_lock, flags); 592 spin_unlock_bh(&net->nsid_lock);
575 if (err >= 0) { 593 if (err >= 0) {
576 rtnl_net_notifyid(net, RTM_NEWNSID, err); 594 rtnl_net_notifyid(net, RTM_NEWNSID, err);
577 err = 0; 595 err = 0;
@@ -693,11 +711,10 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
693 .idx = 0, 711 .idx = 0,
694 .s_idx = cb->args[0], 712 .s_idx = cb->args[0],
695 }; 713 };
696 unsigned long flags;
697 714
698 spin_lock_irqsave(&net->nsid_lock, flags); 715 spin_lock_bh(&net->nsid_lock);
699 idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); 716 idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
700 spin_unlock_irqrestore(&net->nsid_lock, flags); 717 spin_unlock_bh(&net->nsid_lock);
701 718
702 cb->args[0] = net_cb.idx; 719 cb->args[0] = net_cb.idx;
703 return skb->len; 720 return skb->len;
@@ -750,6 +767,8 @@ static int __init net_ns_init(void)
750 if (setup_net(&init_net, &init_user_ns)) 767 if (setup_net(&init_net, &init_user_ns))
751 panic("Could not setup the initial network namespace"); 768 panic("Could not setup the initial network namespace");
752 769
770 init_net_initialized = true;
771
753 rtnl_lock(); 772 rtnl_lock();
754 list_add_tail_rcu(&init_net.list, &net_namespace_list); 773 list_add_tail_rcu(&init_net.list, &net_namespace_list);
755 rtnl_unlock(); 774 rtnl_unlock();
@@ -811,15 +830,24 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
811static int __register_pernet_operations(struct list_head *list, 830static int __register_pernet_operations(struct list_head *list,
812 struct pernet_operations *ops) 831 struct pernet_operations *ops)
813{ 832{
833 if (!init_net_initialized) {
834 list_add_tail(&ops->list, list);
835 return 0;
836 }
837
814 return ops_init(ops, &init_net); 838 return ops_init(ops, &init_net);
815} 839}
816 840
817static void __unregister_pernet_operations(struct pernet_operations *ops) 841static void __unregister_pernet_operations(struct pernet_operations *ops)
818{ 842{
819 LIST_HEAD(net_exit_list); 843 if (!init_net_initialized) {
820 list_add(&init_net.exit_list, &net_exit_list); 844 list_del(&ops->list);
821 ops_exit_list(ops, &net_exit_list); 845 } else {
822 ops_free_list(ops, &net_exit_list); 846 LIST_HEAD(net_exit_list);
847 list_add(&init_net.exit_list, &net_exit_list);
848 ops_exit_list(ops, &net_exit_list);
849 ops_free_list(ops, &net_exit_list);
850 }
823} 851}
824 852
825#endif /* CONFIG_NET_NS */ 853#endif /* CONFIG_NET_NS */
@@ -996,11 +1024,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
996 return 0; 1024 return 0;
997} 1025}
998 1026
1027static struct user_namespace *netns_owner(struct ns_common *ns)
1028{
1029 return to_net_ns(ns)->user_ns;
1030}
1031
999const struct proc_ns_operations netns_operations = { 1032const struct proc_ns_operations netns_operations = {
1000 .name = "net", 1033 .name = "net",
1001 .type = CLONE_NEWNET, 1034 .type = CLONE_NEWNET,
1002 .get = netns_get, 1035 .get = netns_get,
1003 .put = netns_put, 1036 .put = netns_put,
1004 .install = netns_install, 1037 .install = netns_install,
1038 .owner = netns_owner,
1005}; 1039};
1006#endif 1040#endif
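Two independent cleanups are folded into this file: nsid_lock no longer disables interrupts (its users run in process or BH context, so spin_lock_bh() is enough), and network namespaces are now charged against the owning user namespace's UCOUNT_NET_NAMESPACES limit. The accounting is symmetric; a sketch condensed from the copy_net_ns() and cleanup_net() hunks above:

/* Charge/uncharge pairing added above: the charge is taken before
 * net_alloc(), attached to the new netns, and released on the failure
 * paths and at final teardown in cleanup_net(). */
static struct net *example_create_netns(struct user_namespace *user_ns)
{
	struct ucounts *ucounts;
	struct net *net;

	ucounts = inc_ucount(user_ns, current_euid(), UCOUNT_NET_NAMESPACES);
	if (!ucounts)
		return ERR_PTR(-ENOSPC);	/* per-user namespace limit reached */

	net = net_alloc();
	if (!net) {
		dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
		return ERR_PTR(-ENOMEM);
	}

	net->ucounts = ucounts;			/* dropped via dec_net_namespaces() later */
	return net;
}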
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index bbd118b19aef..5219a9e2127a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2286,7 +2286,7 @@ out:
2286 2286
2287static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) 2287static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
2288{ 2288{
2289 pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev); 2289 pkt_dev->pkt_overhead = 0;
2290 pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32); 2290 pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32);
2291 pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev); 2291 pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev);
2292 pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); 2292 pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev);
@@ -2777,13 +2777,13 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
2777} 2777}
2778 2778
2779static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, 2779static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
2780 struct pktgen_dev *pkt_dev, 2780 struct pktgen_dev *pkt_dev)
2781 unsigned int extralen)
2782{ 2781{
2782 unsigned int extralen = LL_RESERVED_SPACE(dev);
2783 struct sk_buff *skb = NULL; 2783 struct sk_buff *skb = NULL;
2784 unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen + 2784 unsigned int size;
2785 pkt_dev->pkt_overhead;
2786 2785
2786 size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead;
2787 if (pkt_dev->flags & F_NODE) { 2787 if (pkt_dev->flags & F_NODE) {
2788 int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id(); 2788 int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id();
2789 2789
@@ -2796,8 +2796,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
2796 skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); 2796 skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
2797 } 2797 }
2798 2798
2799 /* the caller pre-fetches from skb->data and reserves for the mac hdr */
2799 if (likely(skb)) 2800 if (likely(skb))
2800 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 2801 skb_reserve(skb, extralen - 16);
2801 2802
2802 return skb; 2803 return skb;
2803} 2804}
@@ -2830,16 +2831,14 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
2830 mod_cur_headers(pkt_dev); 2831 mod_cur_headers(pkt_dev);
2831 queue_map = pkt_dev->cur_queue_map; 2832 queue_map = pkt_dev->cur_queue_map;
2832 2833
2833 datalen = (odev->hard_header_len + 16) & ~0xf; 2834 skb = pktgen_alloc_skb(odev, pkt_dev);
2834
2835 skb = pktgen_alloc_skb(odev, pkt_dev, datalen);
2836 if (!skb) { 2835 if (!skb) {
2837 sprintf(pkt_dev->result, "No memory"); 2836 sprintf(pkt_dev->result, "No memory");
2838 return NULL; 2837 return NULL;
2839 } 2838 }
2840 2839
2841 prefetchw(skb->data); 2840 prefetchw(skb->data);
2842 skb_reserve(skb, datalen); 2841 skb_reserve(skb, 16);
2843 2842
2844 /* Reserve for ethernet and IP header */ 2843 /* Reserve for ethernet and IP header */
2845 eth = (__u8 *) skb_push(skb, 14); 2844 eth = (__u8 *) skb_push(skb, 14);
@@ -2959,7 +2958,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2959 mod_cur_headers(pkt_dev); 2958 mod_cur_headers(pkt_dev);
2960 queue_map = pkt_dev->cur_queue_map; 2959 queue_map = pkt_dev->cur_queue_map;
2961 2960
2962 skb = pktgen_alloc_skb(odev, pkt_dev, 16); 2961 skb = pktgen_alloc_skb(odev, pkt_dev);
2963 if (!skb) { 2962 if (!skb) {
2964 sprintf(pkt_dev->result, "No memory"); 2963 sprintf(pkt_dev->result, "No memory");
2965 return NULL; 2964 return NULL;
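The pktgen change above moves the device headroom reservation into pktgen_alloc_skb() so the IPv4 and IPv6 fill paths share it, and drops LL_RESERVED_SPACE() from pkt_overhead, which only affects allocation headroom rather than bytes on the wire. The total headroom is unchanged, only split differently between allocator and caller; a condensed sketch of the split, with values as in the hunks above:

/* The allocator reserves LL_RESERVED_SPACE(dev) - 16 and the caller the
 * remaining 16, so skb->data keeps a fixed, device-independent offset for
 * the caller's prefetchw() and skb_push() of the Ethernet header. */
static struct sk_buff *example_alloc(struct net_device *dev, unsigned int size)
{
	unsigned int extralen = LL_RESERVED_SPACE(dev);
	struct sk_buff *skb = __netdev_alloc_skb(dev, size + extralen, GFP_NOWAIT);

	if (!skb)
		return NULL;
	skb_reserve(skb, extralen - 16);	/* as in pktgen_alloc_skb() */
	skb_reserve(skb, 16);			/* as in fill_packet_ipv4/ipv6() */
	return skb;
}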
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 189cc78c77eb..fb7348f13501 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -704,6 +704,8 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
704 } else if (i == RTAX_FEATURES - 1) { 704 } else if (i == RTAX_FEATURES - 1) {
705 u32 user_features = metrics[i] & RTAX_FEATURE_MASK; 705 u32 user_features = metrics[i] & RTAX_FEATURE_MASK;
706 706
707 if (!user_features)
708 continue;
707 BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); 709 BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
708 if (nla_put_u32(skb, i + 1, user_features)) 710 if (nla_put_u32(skb, i + 1, user_features))
709 goto nla_put_failure; 711 goto nla_put_failure;
@@ -841,7 +843,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
841 size += nla_total_size(num_vfs * sizeof(struct nlattr)); 843 size += nla_total_size(num_vfs * sizeof(struct nlattr));
842 size += num_vfs * 844 size += num_vfs *
843 (nla_total_size(sizeof(struct ifla_vf_mac)) + 845 (nla_total_size(sizeof(struct ifla_vf_mac)) +
844 nla_total_size(sizeof(struct ifla_vf_vlan)) + 846 nla_total_size(MAX_VLAN_LIST_LEN *
847 sizeof(struct nlattr)) +
848 nla_total_size(MAX_VLAN_LIST_LEN *
849 sizeof(struct ifla_vf_vlan_info)) +
845 nla_total_size(sizeof(struct ifla_vf_spoofchk)) + 850 nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
846 nla_total_size(sizeof(struct ifla_vf_rate)) + 851 nla_total_size(sizeof(struct ifla_vf_rate)) +
847 nla_total_size(sizeof(struct ifla_vf_link_state)) + 852 nla_total_size(sizeof(struct ifla_vf_link_state)) +
@@ -1109,14 +1114,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
1109 struct nlattr *vfinfo) 1114 struct nlattr *vfinfo)
1110{ 1115{
1111 struct ifla_vf_rss_query_en vf_rss_query_en; 1116 struct ifla_vf_rss_query_en vf_rss_query_en;
1117 struct nlattr *vf, *vfstats, *vfvlanlist;
1112 struct ifla_vf_link_state vf_linkstate; 1118 struct ifla_vf_link_state vf_linkstate;
1119 struct ifla_vf_vlan_info vf_vlan_info;
1113 struct ifla_vf_spoofchk vf_spoofchk; 1120 struct ifla_vf_spoofchk vf_spoofchk;
1114 struct ifla_vf_tx_rate vf_tx_rate; 1121 struct ifla_vf_tx_rate vf_tx_rate;
1115 struct ifla_vf_stats vf_stats; 1122 struct ifla_vf_stats vf_stats;
1116 struct ifla_vf_trust vf_trust; 1123 struct ifla_vf_trust vf_trust;
1117 struct ifla_vf_vlan vf_vlan; 1124 struct ifla_vf_vlan vf_vlan;
1118 struct ifla_vf_rate vf_rate; 1125 struct ifla_vf_rate vf_rate;
1119 struct nlattr *vf, *vfstats;
1120 struct ifla_vf_mac vf_mac; 1126 struct ifla_vf_mac vf_mac;
1121 struct ifla_vf_info ivi; 1127 struct ifla_vf_info ivi;
1122 1128
@@ -1133,11 +1139,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
1133 * IFLA_VF_LINK_STATE_AUTO which equals zero 1139 * IFLA_VF_LINK_STATE_AUTO which equals zero
1134 */ 1140 */
1135 ivi.linkstate = 0; 1141 ivi.linkstate = 0;
1142 /* VLAN Protocol by default is 802.1Q */
1143 ivi.vlan_proto = htons(ETH_P_8021Q);
1136 if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) 1144 if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
1137 return 0; 1145 return 0;
1138 1146
1147 memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
1148
1139 vf_mac.vf = 1149 vf_mac.vf =
1140 vf_vlan.vf = 1150 vf_vlan.vf =
1151 vf_vlan_info.vf =
1141 vf_rate.vf = 1152 vf_rate.vf =
1142 vf_tx_rate.vf = 1153 vf_tx_rate.vf =
1143 vf_spoofchk.vf = 1154 vf_spoofchk.vf =
@@ -1148,6 +1159,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
1148 memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); 1159 memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
1149 vf_vlan.vlan = ivi.vlan; 1160 vf_vlan.vlan = ivi.vlan;
1150 vf_vlan.qos = ivi.qos; 1161 vf_vlan.qos = ivi.qos;
1162 vf_vlan_info.vlan = ivi.vlan;
1163 vf_vlan_info.qos = ivi.qos;
1164 vf_vlan_info.vlan_proto = ivi.vlan_proto;
1151 vf_tx_rate.rate = ivi.max_tx_rate; 1165 vf_tx_rate.rate = ivi.max_tx_rate;
1152 vf_rate.min_tx_rate = ivi.min_tx_rate; 1166 vf_rate.min_tx_rate = ivi.min_tx_rate;
1153 vf_rate.max_tx_rate = ivi.max_tx_rate; 1167 vf_rate.max_tx_rate = ivi.max_tx_rate;
@@ -1156,10 +1170,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
1156 vf_rss_query_en.setting = ivi.rss_query_en; 1170 vf_rss_query_en.setting = ivi.rss_query_en;
1157 vf_trust.setting = ivi.trusted; 1171 vf_trust.setting = ivi.trusted;
1158 vf = nla_nest_start(skb, IFLA_VF_INFO); 1172 vf = nla_nest_start(skb, IFLA_VF_INFO);
1159 if (!vf) { 1173 if (!vf)
1160 nla_nest_cancel(skb, vfinfo); 1174 goto nla_put_vfinfo_failure;
1161 return -EMSGSIZE;
1162 }
1163 if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || 1175 if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
1164 nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || 1176 nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
1165 nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), 1177 nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
@@ -1175,17 +1187,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
1175 &vf_rss_query_en) || 1187 &vf_rss_query_en) ||
1176 nla_put(skb, IFLA_VF_TRUST, 1188 nla_put(skb, IFLA_VF_TRUST,
1177 sizeof(vf_trust), &vf_trust)) 1189 sizeof(vf_trust), &vf_trust))
1178 return -EMSGSIZE; 1190 goto nla_put_vf_failure;
1191 vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST);
1192 if (!vfvlanlist)
1193 goto nla_put_vf_failure;
1194 if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info),
1195 &vf_vlan_info)) {
1196 nla_nest_cancel(skb, vfvlanlist);
1197 goto nla_put_vf_failure;
1198 }
1199 nla_nest_end(skb, vfvlanlist);
1179 memset(&vf_stats, 0, sizeof(vf_stats)); 1200 memset(&vf_stats, 0, sizeof(vf_stats));
1180 if (dev->netdev_ops->ndo_get_vf_stats) 1201 if (dev->netdev_ops->ndo_get_vf_stats)
1181 dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, 1202 dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
1182 &vf_stats); 1203 &vf_stats);
1183 vfstats = nla_nest_start(skb, IFLA_VF_STATS); 1204 vfstats = nla_nest_start(skb, IFLA_VF_STATS);
1184 if (!vfstats) { 1205 if (!vfstats)
1185 nla_nest_cancel(skb, vf); 1206 goto nla_put_vf_failure;
1186 nla_nest_cancel(skb, vfinfo);
1187 return -EMSGSIZE;
1188 }
1189 if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, 1207 if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
1190 vf_stats.rx_packets, IFLA_VF_STATS_PAD) || 1208 vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
1191 nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, 1209 nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
@@ -1197,11 +1215,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
1197 nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, 1215 nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
1198 vf_stats.broadcast, IFLA_VF_STATS_PAD) || 1216 vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
1199 nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, 1217 nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
1200 vf_stats.multicast, IFLA_VF_STATS_PAD)) 1218 vf_stats.multicast, IFLA_VF_STATS_PAD)) {
1201 return -EMSGSIZE; 1219 nla_nest_cancel(skb, vfstats);
1220 goto nla_put_vf_failure;
1221 }
1202 nla_nest_end(skb, vfstats); 1222 nla_nest_end(skb, vfstats);
1203 nla_nest_end(skb, vf); 1223 nla_nest_end(skb, vf);
1204 return 0; 1224 return 0;
1225
1226nla_put_vf_failure:
1227 nla_nest_cancel(skb, vf);
1228nla_put_vfinfo_failure:
1229 nla_nest_cancel(skb, vfinfo);
1230 return -EMSGSIZE;
1205} 1231}
1206 1232
1207static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) 1233static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
@@ -1446,6 +1472,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
1446static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { 1472static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
1447 [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, 1473 [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
1448 [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, 1474 [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
1475 [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
1449 [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, 1476 [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
1450 [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, 1477 [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) },
1451 [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, 1478 [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) },
@@ -1702,7 +1729,37 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
1702 err = -EOPNOTSUPP; 1729 err = -EOPNOTSUPP;
1703 if (ops->ndo_set_vf_vlan) 1730 if (ops->ndo_set_vf_vlan)
1704 err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, 1731 err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
1705 ivv->qos); 1732 ivv->qos,
1733 htons(ETH_P_8021Q));
1734 if (err < 0)
1735 return err;
1736 }
1737
1738 if (tb[IFLA_VF_VLAN_LIST]) {
1739 struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
1740 struct nlattr *attr;
1741 int rem, len = 0;
1742
1743 err = -EOPNOTSUPP;
1744 if (!ops->ndo_set_vf_vlan)
1745 return err;
1746
1747 nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
1748 if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
1749 nla_len(attr) < NLA_HDRLEN) {
1750 return -EINVAL;
1751 }
1752 if (len >= MAX_VLAN_LIST_LEN)
1753 return -EOPNOTSUPP;
1754 ivvl[len] = nla_data(attr);
1755
1756 len++;
1757 }
1758 if (len == 0)
1759 return -EINVAL;
1760
1761 err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
1762 ivvl[0]->qos, ivvl[0]->vlan_proto);
1706 if (err < 0) 1763 if (err < 0)
1707 return err; 1764 return err;
1708 } 1765 }
@@ -3066,7 +3123,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
3066 seq = cb->nlh->nlmsg_seq; 3123 seq = cb->nlh->nlmsg_seq;
3067 3124
3068 list_for_each_entry(ha, &list->list, list) { 3125 list_for_each_entry(ha, &list->list, list) {
3069 if (*idx < cb->args[0]) 3126 if (*idx < cb->args[2])
3070 goto skip; 3127 goto skip;
3071 3128
3072 err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, 3129 err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
@@ -3093,19 +3150,18 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
3093 struct netlink_callback *cb, 3150 struct netlink_callback *cb,
3094 struct net_device *dev, 3151 struct net_device *dev,
3095 struct net_device *filter_dev, 3152 struct net_device *filter_dev,
3096 int idx) 3153 int *idx)
3097{ 3154{
3098 int err; 3155 int err;
3099 3156
3100 netif_addr_lock_bh(dev); 3157 netif_addr_lock_bh(dev);
3101 err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc); 3158 err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
3102 if (err) 3159 if (err)
3103 goto out; 3160 goto out;
3104 nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc); 3161 nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
3105out: 3162out:
3106 netif_addr_unlock_bh(dev); 3163 netif_addr_unlock_bh(dev);
3107 cb->args[1] = err; 3164 return err;
3108 return idx;
3109} 3165}
3110EXPORT_SYMBOL(ndo_dflt_fdb_dump); 3166EXPORT_SYMBOL(ndo_dflt_fdb_dump);
3111 3167
@@ -3118,9 +3174,13 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
3118 const struct net_device_ops *cops = NULL; 3174 const struct net_device_ops *cops = NULL;
3119 struct ifinfomsg *ifm = nlmsg_data(cb->nlh); 3175 struct ifinfomsg *ifm = nlmsg_data(cb->nlh);
3120 struct net *net = sock_net(skb->sk); 3176 struct net *net = sock_net(skb->sk);
3177 struct hlist_head *head;
3121 int brport_idx = 0; 3178 int brport_idx = 0;
3122 int br_idx = 0; 3179 int br_idx = 0;
3123 int idx = 0; 3180 int h, s_h;
3181 int idx = 0, s_idx;
3182 int err = 0;
3183 int fidx = 0;
3124 3184
3125 if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, 3185 if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
3126 ifla_policy) == 0) { 3186 ifla_policy) == 0) {
@@ -3138,49 +3198,71 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
3138 ops = br_dev->netdev_ops; 3198 ops = br_dev->netdev_ops;
3139 } 3199 }
3140 3200
3141 cb->args[1] = 0; 3201 s_h = cb->args[0];
3142 for_each_netdev(net, dev) { 3202 s_idx = cb->args[1];
3143 if (brport_idx && (dev->ifindex != brport_idx))
3144 continue;
3145 3203
3146 if (!br_idx) { /* user did not specify a specific bridge */ 3204 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
3147 if (dev->priv_flags & IFF_BRIDGE_PORT) { 3205 idx = 0;
3148 br_dev = netdev_master_upper_dev_get(dev); 3206 head = &net->dev_index_head[h];
3149 cops = br_dev->netdev_ops; 3207 hlist_for_each_entry(dev, head, index_hlist) {
3150 }
3151 3208
3152 } else { 3209 if (brport_idx && (dev->ifindex != brport_idx))
3153 if (dev != br_dev &&
3154 !(dev->priv_flags & IFF_BRIDGE_PORT))
3155 continue; 3210 continue;
3156 3211
3157 if (br_dev != netdev_master_upper_dev_get(dev) && 3212 if (!br_idx) { /* user did not specify a specific bridge */
3158 !(dev->priv_flags & IFF_EBRIDGE)) 3213 if (dev->priv_flags & IFF_BRIDGE_PORT) {
3159 continue; 3214 br_dev = netdev_master_upper_dev_get(dev);
3215 cops = br_dev->netdev_ops;
3216 }
3217 } else {
3218 if (dev != br_dev &&
3219 !(dev->priv_flags & IFF_BRIDGE_PORT))
3220 continue;
3160 3221
3161 cops = ops; 3222 if (br_dev != netdev_master_upper_dev_get(dev) &&
3162 } 3223 !(dev->priv_flags & IFF_EBRIDGE))
3224 continue;
3225 cops = ops;
3226 }
3163 3227
3164 if (dev->priv_flags & IFF_BRIDGE_PORT) { 3228 if (idx < s_idx)
3165 if (cops && cops->ndo_fdb_dump) 3229 goto cont;
3166 idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
3167 idx);
3168 }
3169 if (cb->args[1] == -EMSGSIZE)
3170 break;
3171 3230
3172 if (dev->netdev_ops->ndo_fdb_dump) 3231 if (dev->priv_flags & IFF_BRIDGE_PORT) {
3173 idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, 3232 if (cops && cops->ndo_fdb_dump) {
3174 idx); 3233 err = cops->ndo_fdb_dump(skb, cb,
3175 else 3234 br_dev, dev,
3176 idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); 3235 &fidx);
3177 if (cb->args[1] == -EMSGSIZE) 3236 if (err == -EMSGSIZE)
3178 break; 3237 goto out;
3238 }
3239 }
3240
3241 if (dev->netdev_ops->ndo_fdb_dump)
3242 err = dev->netdev_ops->ndo_fdb_dump(skb, cb,
3243 dev, NULL,
3244 &fidx);
3245 else
3246 err = ndo_dflt_fdb_dump(skb, cb, dev, NULL,
3247 &fidx);
3248 if (err == -EMSGSIZE)
3249 goto out;
3250
3251 cops = NULL;
3179 3252
3180 cops = NULL; 3253 /* reset fdb offset to 0 for rest of the interfaces */
3254 cb->args[2] = 0;
3255 fidx = 0;
3256cont:
3257 idx++;
3258 }
3181 } 3259 }
3182 3260
3183 cb->args[0] = idx; 3261out:
3262 cb->args[0] = h;
3263 cb->args[1] = idx;
3264 cb->args[2] = fidx;
3265
3184 return skb->len; 3266 return skb->len;
3185} 3267}
3186 3268
@@ -3550,6 +3632,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
3550 (!idxattr || idxattr == attrid); 3632 (!idxattr || idxattr == attrid);
3551} 3633}
3552 3634
3635#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
3636static int rtnl_get_offload_stats_attr_size(int attr_id)
3637{
3638 switch (attr_id) {
3639 case IFLA_OFFLOAD_XSTATS_CPU_HIT:
3640 return sizeof(struct rtnl_link_stats64);
3641 }
3642
3643 return 0;
3644}
3645
3646static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
3647 int *prividx)
3648{
3649 struct nlattr *attr = NULL;
3650 int attr_id, size;
3651 void *attr_data;
3652 int err;
3653
3654 if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
3655 dev->netdev_ops->ndo_get_offload_stats))
3656 return -ENODATA;
3657
3658 for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
3659 attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
3660 if (attr_id < *prividx)
3661 continue;
3662
3663 size = rtnl_get_offload_stats_attr_size(attr_id);
3664 if (!size)
3665 continue;
3666
3667 if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
3668 continue;
3669
3670 attr = nla_reserve_64bit(skb, attr_id, size,
3671 IFLA_OFFLOAD_XSTATS_UNSPEC);
3672 if (!attr)
3673 goto nla_put_failure;
3674
3675 attr_data = nla_data(attr);
3676 memset(attr_data, 0, size);
3677 err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
3678 attr_data);
3679 if (err)
3680 goto get_offload_stats_failure;
3681 }
3682
3683 if (!attr)
3684 return -ENODATA;
3685
3686 *prividx = 0;
3687 return 0;
3688
3689nla_put_failure:
3690 err = -EMSGSIZE;
3691get_offload_stats_failure:
3692 *prividx = attr_id;
3693 return err;
3694}
3695
3696static int rtnl_get_offload_stats_size(const struct net_device *dev)
3697{
3698 int nla_size = 0;
3699 int attr_id;
3700 int size;
3701
3702 if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
3703 dev->netdev_ops->ndo_get_offload_stats))
3704 return 0;
3705
3706 for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
3707 attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
3708 if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
3709 continue;
3710 size = rtnl_get_offload_stats_attr_size(attr_id);
3711 nla_size += nla_total_size_64bit(size);
3712 }
3713
3714 if (nla_size != 0)
3715 nla_size += nla_total_size(0);
3716
3717 return nla_size;
3718}
3719
3553static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, 3720static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
3554 int type, u32 pid, u32 seq, u32 change, 3721 int type, u32 pid, u32 seq, u32 change,
3555 unsigned int flags, unsigned int filter_mask, 3722 unsigned int flags, unsigned int filter_mask,
@@ -3559,6 +3726,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
3559 struct nlmsghdr *nlh; 3726 struct nlmsghdr *nlh;
3560 struct nlattr *attr; 3727 struct nlattr *attr;
3561 int s_prividx = *prividx; 3728 int s_prividx = *prividx;
3729 int err;
3562 3730
3563 ASSERT_RTNL(); 3731 ASSERT_RTNL();
3564 3732
@@ -3587,8 +3755,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
3587 const struct rtnl_link_ops *ops = dev->rtnl_link_ops; 3755 const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
3588 3756
3589 if (ops && ops->fill_linkxstats) { 3757 if (ops && ops->fill_linkxstats) {
3590 int err;
3591
3592 *idxattr = IFLA_STATS_LINK_XSTATS; 3758 *idxattr = IFLA_STATS_LINK_XSTATS;
3593 attr = nla_nest_start(skb, 3759 attr = nla_nest_start(skb,
3594 IFLA_STATS_LINK_XSTATS); 3760 IFLA_STATS_LINK_XSTATS);
@@ -3612,8 +3778,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
3612 if (master) 3778 if (master)
3613 ops = master->rtnl_link_ops; 3779 ops = master->rtnl_link_ops;
3614 if (ops && ops->fill_linkxstats) { 3780 if (ops && ops->fill_linkxstats) {
3615 int err;
3616
3617 *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; 3781 *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
3618 attr = nla_nest_start(skb, 3782 attr = nla_nest_start(skb,
3619 IFLA_STATS_LINK_XSTATS_SLAVE); 3783 IFLA_STATS_LINK_XSTATS_SLAVE);
@@ -3628,6 +3792,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
3628 } 3792 }
3629 } 3793 }
3630 3794
3795 if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
3796 *idxattr)) {
3797 *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
3798 attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS);
3799 if (!attr)
3800 goto nla_put_failure;
3801
3802 err = rtnl_get_offload_stats(skb, dev, prividx);
3803 if (err == -ENODATA)
3804 nla_nest_cancel(skb, attr);
3805 else
3806 nla_nest_end(skb, attr);
3807
3808 if (err && err != -ENODATA)
3809 goto nla_put_failure;
3810 *idxattr = 0;
3811 }
3812
3631 nlmsg_end(skb, nlh); 3813 nlmsg_end(skb, nlh);
3632 3814
3633 return 0; 3815 return 0;
@@ -3642,10 +3824,6 @@ nla_put_failure:
3642 return -EMSGSIZE; 3824 return -EMSGSIZE;
3643} 3825}
3644 3826
3645static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = {
3646 [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) },
3647};
3648
3649static size_t if_nlmsg_stats_size(const struct net_device *dev, 3827static size_t if_nlmsg_stats_size(const struct net_device *dev,
3650 u32 filter_mask) 3828 u32 filter_mask)
3651{ 3829{
@@ -3685,6 +3863,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
3685 } 3863 }
3686 } 3864 }
3687 3865
3866 if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
3867 size += rtnl_get_offload_stats_size(dev);
3868
3688 return size; 3869 return size;
3689} 3870}
3690 3871
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3864b4b68fa1..1e3e0087245b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1962,37 +1962,13 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
1962 return false; 1962 return false;
1963} 1963}
1964 1964
1965ssize_t skb_socket_splice(struct sock *sk,
1966 struct pipe_inode_info *pipe,
1967 struct splice_pipe_desc *spd)
1968{
1969 int ret;
1970
1971 /* Drop the socket lock, otherwise we have reverse
1972 * locking dependencies between sk_lock and i_mutex
1973 * here as compared to sendfile(). We enter here
1974 * with the socket lock held, and splice_to_pipe() will
1975 * grab the pipe inode lock. For sendfile() emulation,
1976 * we call into ->sendpage() with the i_mutex lock held
1977 * and networking will grab the socket lock.
1978 */
1979 release_sock(sk);
1980 ret = splice_to_pipe(pipe, spd);
1981 lock_sock(sk);
1982
1983 return ret;
1984}
1985
1986/* 1965/*
1987 * Map data from the skb to a pipe. Should handle both the linear part, 1966 * Map data from the skb to a pipe. Should handle both the linear part,
1988 * the fragments, and the frag list. 1967 * the fragments, and the frag list.
1989 */ 1968 */
1990int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, 1969int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
1991 struct pipe_inode_info *pipe, unsigned int tlen, 1970 struct pipe_inode_info *pipe, unsigned int tlen,
1992 unsigned int flags, 1971 unsigned int flags)
1993 ssize_t (*splice_cb)(struct sock *,
1994 struct pipe_inode_info *,
1995 struct splice_pipe_desc *))
1996{ 1972{
1997 struct partial_page partial[MAX_SKB_FRAGS]; 1973 struct partial_page partial[MAX_SKB_FRAGS];
1998 struct page *pages[MAX_SKB_FRAGS]; 1974 struct page *pages[MAX_SKB_FRAGS];
@@ -2009,7 +1985,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
2009 __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); 1985 __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
2010 1986
2011 if (spd.nr_pages) 1987 if (spd.nr_pages)
2012 ret = splice_cb(sk, pipe, &spd); 1988 ret = splice_to_pipe(pipe, &spd);
2013 1989
2014 return ret; 1990 return ret;
2015} 1991}
@@ -2445,6 +2421,25 @@ void skb_queue_purge(struct sk_buff_head *list)
2445EXPORT_SYMBOL(skb_queue_purge); 2421EXPORT_SYMBOL(skb_queue_purge);
2446 2422
2447/** 2423/**
2424 * skb_rbtree_purge - empty a skb rbtree
2425 * @root: root of the rbtree to empty
2426 *
2427 * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
2428 * the list and one reference dropped. This function does not take
2429 * any lock. Synchronization should be handled by the caller (e.g., TCP
2430 * out-of-order queue is protected by the socket lock).
2431 */
2432void skb_rbtree_purge(struct rb_root *root)
2433{
2434 struct sk_buff *skb, *next;
2435
2436 rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
2437 kfree_skb(skb);
2438
2439 *root = RB_ROOT;
2440}
2441
2442/**
2448 * skb_queue_head - queue a buffer at the list head 2443 * skb_queue_head - queue a buffer at the list head
2449 * @list: list to use 2444 * @list: list to use
2450 * @newsk: buffer to queue 2445 * @newsk: buffer to queue
@@ -3078,11 +3073,31 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
3078 sg = !!(features & NETIF_F_SG); 3073 sg = !!(features & NETIF_F_SG);
3079 csum = !!can_checksum_protocol(features, proto); 3074 csum = !!can_checksum_protocol(features, proto);
3080 3075
3081 /* GSO partial only requires that we trim off any excess that 3076 if (sg && csum && (mss != GSO_BY_FRAGS)) {
3082 * doesn't fit into an MSS sized block, so take care of that 3077 if (!(features & NETIF_F_GSO_PARTIAL)) {
3083 * now. 3078 struct sk_buff *iter;
3084 */ 3079
3085 if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) { 3080 if (!list_skb ||
3081 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
3082 goto normal;
3083
3084 /* Split the buffer at the frag_list pointer.
3085 * This is based on the assumption that all
3086 * buffers in the chain excluding the last
3087 * containing the same amount of data.
3088 */
3089 skb_walk_frags(head_skb, iter) {
3090 if (skb_headlen(iter))
3091 goto normal;
3092
3093 len -= iter->len;
3094 }
3095 }
3096
3097 /* GSO partial only requires that we trim off any excess that
3098 * doesn't fit into an MSS sized block, so take care of that
3099 * now.
3100 */
3086 partial_segs = len / mss; 3101 partial_segs = len / mss;
3087 if (partial_segs > 1) 3102 if (partial_segs > 1)
3088 mss *= partial_segs; 3103 mss *= partial_segs;
@@ -3090,6 +3105,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
3090 partial_segs = 0; 3105 partial_segs = 0;
3091 } 3106 }
3092 3107
3108normal:
3093 headroom = skb_headroom(head_skb); 3109 headroom = skb_headroom(head_skb);
3094 pos = skb_headlen(head_skb); 3110 pos = skb_headlen(head_skb);
3095 3111
@@ -3281,21 +3297,29 @@ perform_csum_check:
3281 */ 3297 */
3282 segs->prev = tail; 3298 segs->prev = tail;
3283 3299
3284 /* Update GSO info on first skb in partial sequence. */
3285 if (partial_segs) { 3300 if (partial_segs) {
3301 struct sk_buff *iter;
3286 int type = skb_shinfo(head_skb)->gso_type; 3302 int type = skb_shinfo(head_skb)->gso_type;
3303 unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
3287 3304
3288 /* Update type to add partial and then remove dodgy if set */ 3305 /* Update type to add partial and then remove dodgy if set */
3289 type |= SKB_GSO_PARTIAL; 3306 type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
3290 type &= ~SKB_GSO_DODGY; 3307 type &= ~SKB_GSO_DODGY;
3291 3308
3292 /* Update GSO info and prepare to start updating headers on 3309 /* Update GSO info and prepare to start updating headers on
3293 * our way back down the stack of protocols. 3310 * our way back down the stack of protocols.
3294 */ 3311 */
3295 skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size; 3312 for (iter = segs; iter; iter = iter->next) {
3296 skb_shinfo(segs)->gso_segs = partial_segs; 3313 skb_shinfo(iter)->gso_size = gso_size;
3297 skb_shinfo(segs)->gso_type = type; 3314 skb_shinfo(iter)->gso_segs = partial_segs;
3298 SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset; 3315 skb_shinfo(iter)->gso_type = type;
3316 SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
3317 }
3318
3319 if (tail->len - doffset <= gso_size)
3320 skb_shinfo(tail)->gso_size = 0;
3321 else if (tail != segs)
3322 skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
3299 } 3323 }
3300 3324
3301 /* Following permits correct backpressure, for protocols 3325 /* Following permits correct backpressure, for protocols
@@ -4474,17 +4498,24 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len)
4474} 4498}
4475EXPORT_SYMBOL(skb_ensure_writable); 4499EXPORT_SYMBOL(skb_ensure_writable);
4476 4500
4477/* remove VLAN header from packet and update csum accordingly. */ 4501/* remove VLAN header from packet and update csum accordingly.
4478static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) 4502 * expects a non skb_vlan_tag_present skb with a vlan tag payload
4503 */
4504int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
4479{ 4505{
4480 struct vlan_hdr *vhdr; 4506 struct vlan_hdr *vhdr;
4481 unsigned int offset = skb->data - skb_mac_header(skb); 4507 int offset = skb->data - skb_mac_header(skb);
4482 int err; 4508 int err;
4483 4509
4484 __skb_push(skb, offset); 4510 if (WARN_ONCE(offset,
4511 "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
4512 offset)) {
4513 return -EINVAL;
4514 }
4515
4485 err = skb_ensure_writable(skb, VLAN_ETH_HLEN); 4516 err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
4486 if (unlikely(err)) 4517 if (unlikely(err))
4487 goto pull; 4518 return err;
4488 4519
4489 skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 4520 skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
4490 4521
@@ -4501,12 +4532,14 @@ static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
4501 skb_set_network_header(skb, ETH_HLEN); 4532 skb_set_network_header(skb, ETH_HLEN);
4502 4533
4503 skb_reset_mac_len(skb); 4534 skb_reset_mac_len(skb);
4504pull:
4505 __skb_pull(skb, offset);
4506 4535
4507 return err; 4536 return err;
4508} 4537}
4538EXPORT_SYMBOL(__skb_vlan_pop);
4509 4539
4540/* Pop a vlan tag either from hwaccel or from payload.
4541 * Expects skb->data at mac header.
4542 */
4510int skb_vlan_pop(struct sk_buff *skb) 4543int skb_vlan_pop(struct sk_buff *skb)
4511{ 4544{
4512 u16 vlan_tci; 4545 u16 vlan_tci;
@@ -4516,9 +4549,7 @@ int skb_vlan_pop(struct sk_buff *skb)
4516 if (likely(skb_vlan_tag_present(skb))) { 4549 if (likely(skb_vlan_tag_present(skb))) {
4517 skb->vlan_tci = 0; 4550 skb->vlan_tci = 0;
4518 } else { 4551 } else {
4519 if (unlikely((skb->protocol != htons(ETH_P_8021Q) && 4552 if (unlikely(!eth_type_vlan(skb->protocol)))
4520 skb->protocol != htons(ETH_P_8021AD)) ||
4521 skb->len < VLAN_ETH_HLEN))
4522 return 0; 4553 return 0;
4523 4554
4524 err = __skb_vlan_pop(skb, &vlan_tci); 4555 err = __skb_vlan_pop(skb, &vlan_tci);
@@ -4526,9 +4557,7 @@ int skb_vlan_pop(struct sk_buff *skb)
4526 return err; 4557 return err;
4527 } 4558 }
4528 /* move next vlan tag to hw accel tag */ 4559 /* move next vlan tag to hw accel tag */
4529 if (likely((skb->protocol != htons(ETH_P_8021Q) && 4560 if (likely(!eth_type_vlan(skb->protocol)))
4530 skb->protocol != htons(ETH_P_8021AD)) ||
4531 skb->len < VLAN_ETH_HLEN))
4532 return 0; 4561 return 0;
4533 4562
4534 vlan_proto = skb->protocol; 4563 vlan_proto = skb->protocol;
@@ -4541,29 +4570,30 @@ int skb_vlan_pop(struct sk_buff *skb)
4541} 4570}
4542EXPORT_SYMBOL(skb_vlan_pop); 4571EXPORT_SYMBOL(skb_vlan_pop);
4543 4572
4573/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
4574 * Expects skb->data at mac header.
4575 */
4544int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) 4576int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
4545{ 4577{
4546 if (skb_vlan_tag_present(skb)) { 4578 if (skb_vlan_tag_present(skb)) {
4547 unsigned int offset = skb->data - skb_mac_header(skb); 4579 int offset = skb->data - skb_mac_header(skb);
4548 int err; 4580 int err;
4549 4581
4550 /* __vlan_insert_tag expect skb->data pointing to mac header. 4582 if (WARN_ONCE(offset,
4551 * So change skb->data before calling it and change back to 4583 "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
4552 * original position later 4584 offset)) {
4553 */ 4585 return -EINVAL;
4554 __skb_push(skb, offset); 4586 }
4587
4555 err = __vlan_insert_tag(skb, skb->vlan_proto, 4588 err = __vlan_insert_tag(skb, skb->vlan_proto,
4556 skb_vlan_tag_get(skb)); 4589 skb_vlan_tag_get(skb));
4557 if (err) { 4590 if (err)
4558 __skb_pull(skb, offset);
4559 return err; 4591 return err;
4560 }
4561 4592
4562 skb->protocol = skb->vlan_proto; 4593 skb->protocol = skb->vlan_proto;
4563 skb->mac_len += VLAN_HLEN; 4594 skb->mac_len += VLAN_HLEN;
4564 4595
4565 skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 4596 skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
4566 __skb_pull(skb, offset);
4567 } 4597 }
4568 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 4598 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
4569 return 0; 4599 return 0;
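Besides dropping the socket-splice callback (splice_to_pipe() is now called directly), this file gains skb_rbtree_purge() for rbtree-backed queues, extends GSO partial so segmentation can split at the frag_list pointer even without NETIF_F_GSO_PARTIAL, and makes skb_vlan_push() and the now-exported __skb_vlan_pop() insist that skb->data already sits at the mac header instead of rewinding it themselves. A caller sketch for the new purge helper, following the locking note in its kerneldoc (the TCP out-of-order queue case):

static void example_flush_ooo_queue(struct sock *sk, struct rb_root *ooo_queue)
{
	/* skb_rbtree_purge() takes no lock of its own: it frees every queued
	 * skb and resets the root, so the caller serialises access, here via
	 * the socket lock as the kerneldoc above suggests. */
	lock_sock(sk);
	skb_rbtree_purge(ooo_queue);
	release_sock(sk);
}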
diff --git a/net/core/sock.c b/net/core/sock.c
index fd7b41edf1ce..c73e28fc9c2a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1315,24 +1315,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
1315#endif 1315#endif
1316} 1316}
1317 1317
1318void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1319{
1320 unsigned long nulls1, nulls2;
1321
1322 nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1323 nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1324 if (nulls1 > nulls2)
1325 swap(nulls1, nulls2);
1326
1327 if (nulls1 != 0)
1328 memset((char *)sk, 0, nulls1);
1329 memset((char *)sk + nulls1 + sizeof(void *), 0,
1330 nulls2 - nulls1 - sizeof(void *));
1331 memset((char *)sk + nulls2 + sizeof(void *), 0,
1332 size - nulls2 - sizeof(void *));
1333}
1334EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1335
1336static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1318static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1337 int family) 1319 int family)
1338{ 1320{
@@ -1344,12 +1326,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1344 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1326 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1345 if (!sk) 1327 if (!sk)
1346 return sk; 1328 return sk;
1347 if (priority & __GFP_ZERO) { 1329 if (priority & __GFP_ZERO)
1348 if (prot->clear_sk) 1330 sk_prot_clear_nulls(sk, prot->obj_size);
1349 prot->clear_sk(sk, prot->obj_size);
1350 else
1351 sk_prot_clear_nulls(sk, prot->obj_size);
1352 }
1353 } else 1331 } else
1354 sk = kmalloc(prot->obj_size, priority); 1332 sk = kmalloc(prot->obj_size, priority);
1355 1333
@@ -1385,6 +1363,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
1385 slab = prot->slab; 1363 slab = prot->slab;
1386 1364
1387 cgroup_sk_free(&sk->sk_cgrp_data); 1365 cgroup_sk_free(&sk->sk_cgrp_data);
1366 mem_cgroup_sk_free(sk);
1388 security_sk_free(sk); 1367 security_sk_free(sk);
1389 if (slab != NULL) 1368 if (slab != NULL)
1390 kmem_cache_free(slab, sk); 1369 kmem_cache_free(slab, sk);
@@ -1421,6 +1400,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1421 sock_net_set(sk, net); 1400 sock_net_set(sk, net);
1422 atomic_set(&sk->sk_wmem_alloc, 1); 1401 atomic_set(&sk->sk_wmem_alloc, 1);
1423 1402
1403 mem_cgroup_sk_alloc(sk);
1424 cgroup_sk_alloc(&sk->sk_cgrp_data); 1404 cgroup_sk_alloc(&sk->sk_cgrp_data);
1425 sock_update_classid(&sk->sk_cgrp_data); 1405 sock_update_classid(&sk->sk_cgrp_data);
1426 sock_update_netprioidx(&sk->sk_cgrp_data); 1406 sock_update_netprioidx(&sk->sk_cgrp_data);
@@ -1567,6 +1547,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1567 newsk->sk_incoming_cpu = raw_smp_processor_id(); 1547 newsk->sk_incoming_cpu = raw_smp_processor_id();
1568 atomic64_set(&newsk->sk_cookie, 0); 1548 atomic64_set(&newsk->sk_cookie, 0);
1569 1549
1550 mem_cgroup_sk_alloc(newsk);
1570 cgroup_sk_alloc(&newsk->sk_cgrp_data); 1551 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1571 1552
1572 /* 1553 /*
@@ -1591,9 +1572,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1591 sk_set_socket(newsk, NULL); 1572 sk_set_socket(newsk, NULL);
1592 newsk->sk_wq = NULL; 1573 newsk->sk_wq = NULL;
1593 1574
1594 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1595 sock_update_memcg(newsk);
1596
1597 if (newsk->sk_prot->sockets_allocated) 1575 if (newsk->sk_prot->sockets_allocated)
1598 sk_sockets_allocated_inc(newsk); 1576 sk_sockets_allocated_inc(newsk);
1599 1577
diff --git a/net/core/stream.c b/net/core/stream.c
index 159516a11b7e..1086c8b280a8 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -43,7 +43,6 @@ void sk_stream_write_space(struct sock *sk)
43 rcu_read_unlock(); 43 rcu_read_unlock();
44 } 44 }
45} 45}
46EXPORT_SYMBOL(sk_stream_write_space);
47 46
48/** 47/**
49 * sk_stream_wait_connect - Wait for a socket to get into the connected state 48 * sk_stream_wait_connect - Wait for a socket to get into the connected state
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index ff7736f7ff42..96e47c539bee 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -38,4 +38,7 @@ config NET_DSA_TAG_EDSA
38config NET_DSA_TAG_TRAILER 38config NET_DSA_TAG_TRAILER
39 bool 39 bool
40 40
41config NET_DSA_TAG_QCA
42 bool
43
41endif 44endif
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 8af4ded70f1c..a3380ed0e0be 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -7,3 +7,4 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
7dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o 7dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
8dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o 8dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
9dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o 9dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
10dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
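The Kconfig and Makefile hunks above, together with the dsa_device_ops[] entry in the dsa.c diff below, wire in a new tagging protocol (DSA_TAG_PROTO_QCA) for Qualcomm switches. The same dsa.c diff also renames dsa_switch_driver to dsa_switch_ops and turns the tag protocol into a callback, so a switch driver now advertises its tagger roughly like this (sketch; the driver-side names are illustrative, the callback shape matches the ops->get_tag_protocol(ds) call visible below):

static enum dsa_tag_protocol example_get_tag_protocol(struct dsa_switch *ds)
{
	return DSA_TAG_PROTO_QCA;	/* frames carry the short QCA tag header */
}

static struct dsa_switch_ops example_switch_ops = {
	.get_tag_protocol = example_get_tag_protocol,
	/* .probe, .setup, .adjust_link and friends as before, under the new name */
};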
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 7e68bc6bc853..a6902c1e2f28 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -54,6 +54,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
54#ifdef CONFIG_NET_DSA_TAG_BRCM 54#ifdef CONFIG_NET_DSA_TAG_BRCM
55 [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, 55 [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops,
56#endif 56#endif
57#ifdef CONFIG_NET_DSA_TAG_QCA
58 [DSA_TAG_PROTO_QCA] = &qca_netdev_ops,
59#endif
57 [DSA_TAG_PROTO_NONE] = &none_ops, 60 [DSA_TAG_PROTO_NONE] = &none_ops,
58}; 61};
59 62
@@ -61,27 +64,27 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
61static DEFINE_MUTEX(dsa_switch_drivers_mutex); 64static DEFINE_MUTEX(dsa_switch_drivers_mutex);
62static LIST_HEAD(dsa_switch_drivers); 65static LIST_HEAD(dsa_switch_drivers);
63 66
64void register_switch_driver(struct dsa_switch_driver *drv) 67void register_switch_driver(struct dsa_switch_ops *ops)
65{ 68{
66 mutex_lock(&dsa_switch_drivers_mutex); 69 mutex_lock(&dsa_switch_drivers_mutex);
67 list_add_tail(&drv->list, &dsa_switch_drivers); 70 list_add_tail(&ops->list, &dsa_switch_drivers);
68 mutex_unlock(&dsa_switch_drivers_mutex); 71 mutex_unlock(&dsa_switch_drivers_mutex);
69} 72}
70EXPORT_SYMBOL_GPL(register_switch_driver); 73EXPORT_SYMBOL_GPL(register_switch_driver);
71 74
72void unregister_switch_driver(struct dsa_switch_driver *drv) 75void unregister_switch_driver(struct dsa_switch_ops *ops)
73{ 76{
74 mutex_lock(&dsa_switch_drivers_mutex); 77 mutex_lock(&dsa_switch_drivers_mutex);
75 list_del_init(&drv->list); 78 list_del_init(&ops->list);
76 mutex_unlock(&dsa_switch_drivers_mutex); 79 mutex_unlock(&dsa_switch_drivers_mutex);
77} 80}
78EXPORT_SYMBOL_GPL(unregister_switch_driver); 81EXPORT_SYMBOL_GPL(unregister_switch_driver);
79 82
80static struct dsa_switch_driver * 83static struct dsa_switch_ops *
81dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, 84dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
82 const char **_name, void **priv) 85 const char **_name, void **priv)
83{ 86{
84 struct dsa_switch_driver *ret; 87 struct dsa_switch_ops *ret;
85 struct list_head *list; 88 struct list_head *list;
86 const char *name; 89 const char *name;
87 90
@@ -90,13 +93,13 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
90 93
91 mutex_lock(&dsa_switch_drivers_mutex); 94 mutex_lock(&dsa_switch_drivers_mutex);
92 list_for_each(list, &dsa_switch_drivers) { 95 list_for_each(list, &dsa_switch_drivers) {
93 struct dsa_switch_driver *drv; 96 struct dsa_switch_ops *ops;
94 97
95 drv = list_entry(list, struct dsa_switch_driver, list); 98 ops = list_entry(list, struct dsa_switch_ops, list);
96 99
97 name = drv->probe(parent, host_dev, sw_addr, priv); 100 name = ops->probe(parent, host_dev, sw_addr, priv);
98 if (name != NULL) { 101 if (name != NULL) {
99 ret = drv; 102 ret = ops;
100 break; 103 break;
101 } 104 }
102 } 105 }
@@ -117,7 +120,7 @@ static ssize_t temp1_input_show(struct device *dev,
117 struct dsa_switch *ds = dev_get_drvdata(dev); 120 struct dsa_switch *ds = dev_get_drvdata(dev);
118 int temp, ret; 121 int temp, ret;
119 122
120 ret = ds->drv->get_temp(ds, &temp); 123 ret = ds->ops->get_temp(ds, &temp);
121 if (ret < 0) 124 if (ret < 0)
122 return ret; 125 return ret;
123 126
@@ -131,7 +134,7 @@ static ssize_t temp1_max_show(struct device *dev,
131 struct dsa_switch *ds = dev_get_drvdata(dev); 134 struct dsa_switch *ds = dev_get_drvdata(dev);
132 int temp, ret; 135 int temp, ret;
133 136
134 ret = ds->drv->get_temp_limit(ds, &temp); 137 ret = ds->ops->get_temp_limit(ds, &temp);
135 if (ret < 0) 138 if (ret < 0)
136 return ret; 139 return ret;
137 140
@@ -149,7 +152,7 @@ static ssize_t temp1_max_store(struct device *dev,
149 if (ret < 0) 152 if (ret < 0)
150 return ret; 153 return ret;
151 154
152 ret = ds->drv->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000)); 155 ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
153 if (ret < 0) 156 if (ret < 0)
154 return ret; 157 return ret;
155 158
@@ -164,7 +167,7 @@ static ssize_t temp1_max_alarm_show(struct device *dev,
164 bool alarm; 167 bool alarm;
165 int ret; 168 int ret;
166 169
167 ret = ds->drv->get_temp_alarm(ds, &alarm); 170 ret = ds->ops->get_temp_alarm(ds, &alarm);
168 if (ret < 0) 171 if (ret < 0)
169 return ret; 172 return ret;
170 173
@@ -184,15 +187,15 @@ static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
184{ 187{
185 struct device *dev = container_of(kobj, struct device, kobj); 188 struct device *dev = container_of(kobj, struct device, kobj);
186 struct dsa_switch *ds = dev_get_drvdata(dev); 189 struct dsa_switch *ds = dev_get_drvdata(dev);
187 struct dsa_switch_driver *drv = ds->drv; 190 struct dsa_switch_ops *ops = ds->ops;
188 umode_t mode = attr->mode; 191 umode_t mode = attr->mode;
189 192
190 if (index == 1) { 193 if (index == 1) {
191 if (!drv->get_temp_limit) 194 if (!ops->get_temp_limit)
192 mode = 0; 195 mode = 0;
193 else if (!drv->set_temp_limit) 196 else if (!ops->set_temp_limit)
194 mode &= ~S_IWUSR; 197 mode &= ~S_IWUSR;
195 } else if (index == 2 && !drv->get_temp_alarm) { 198 } else if (index == 2 && !ops->get_temp_alarm) {
196 mode = 0; 199 mode = 0;
197 } 200 }
198 return mode; 201 return mode;
@@ -228,8 +231,8 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
228 231
229 genphy_config_init(phydev); 232 genphy_config_init(phydev);
230 genphy_read_status(phydev); 233 genphy_read_status(phydev);
231 if (ds->drv->adjust_link) 234 if (ds->ops->adjust_link)
232 ds->drv->adjust_link(ds, port, phydev); 235 ds->ops->adjust_link(ds, port, phydev);
233 } 236 }
234 237
235 return 0; 238 return 0;
@@ -303,7 +306,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds)
303 306
304static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) 307static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
305{ 308{
306 struct dsa_switch_driver *drv = ds->drv; 309 struct dsa_switch_ops *ops = ds->ops;
307 struct dsa_switch_tree *dst = ds->dst; 310 struct dsa_switch_tree *dst = ds->dst;
308 struct dsa_chip_data *cd = ds->cd; 311 struct dsa_chip_data *cd = ds->cd;
309 bool valid_name_found = false; 312 bool valid_name_found = false;
@@ -354,7 +357,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
354 * switch. 357 * switch.
355 */ 358 */
356 if (dst->cpu_switch == index) { 359 if (dst->cpu_switch == index) {
357 dst->tag_ops = dsa_resolve_tag_protocol(drv->tag_protocol); 360 enum dsa_tag_protocol tag_protocol;
361
362 tag_protocol = ops->get_tag_protocol(ds);
363 dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol);
358 if (IS_ERR(dst->tag_ops)) { 364 if (IS_ERR(dst->tag_ops)) {
359 ret = PTR_ERR(dst->tag_ops); 365 ret = PTR_ERR(dst->tag_ops);
360 goto out; 366 goto out;
@@ -368,15 +374,17 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
368 /* 374 /*
369 * Do basic register setup. 375 * Do basic register setup.
370 */ 376 */
371 ret = drv->setup(ds); 377 ret = ops->setup(ds);
372 if (ret < 0) 378 if (ret < 0)
373 goto out; 379 goto out;
374 380
375 ret = drv->set_addr(ds, dst->master_netdev->dev_addr); 381 if (ops->set_addr) {
376 if (ret < 0) 382 ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
377 goto out; 383 if (ret < 0)
384 goto out;
385 }
378 386
379 if (!ds->slave_mii_bus && drv->phy_read) { 387 if (!ds->slave_mii_bus && ops->phy_read) {
380 ds->slave_mii_bus = devm_mdiobus_alloc(parent); 388 ds->slave_mii_bus = devm_mdiobus_alloc(parent);
381 if (!ds->slave_mii_bus) { 389 if (!ds->slave_mii_bus) {
382 ret = -ENOMEM; 390 ret = -ENOMEM;
@@ -423,7 +431,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
423 * register with hardware monitoring subsystem. 431 * register with hardware monitoring subsystem.
424 * Treat registration error as non-fatal and ignore it. 432 * Treat registration error as non-fatal and ignore it.
425 */ 433 */
426 if (drv->get_temp) { 434 if (ops->get_temp) {
427 const char *netname = netdev_name(dst->master_netdev); 435 const char *netname = netdev_name(dst->master_netdev);
428 char hname[IFNAMSIZ + 1]; 436 char hname[IFNAMSIZ + 1];
429 int i, j; 437 int i, j;
@@ -454,7 +462,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
454 struct device *parent, struct device *host_dev) 462 struct device *parent, struct device *host_dev)
455{ 463{
456 struct dsa_chip_data *cd = dst->pd->chip + index; 464 struct dsa_chip_data *cd = dst->pd->chip + index;
457 struct dsa_switch_driver *drv; 465 struct dsa_switch_ops *ops;
458 struct dsa_switch *ds; 466 struct dsa_switch *ds;
459 int ret; 467 int ret;
460 const char *name; 468 const char *name;
@@ -463,8 +471,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
463 /* 471 /*
464 * Probe for switch model. 472 * Probe for switch model.
465 */ 473 */
466 drv = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); 474 ops = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv);
467 if (drv == NULL) { 475 if (!ops) {
468 netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n", 476 netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n",
469 index); 477 index);
470 return ERR_PTR(-EINVAL); 478 return ERR_PTR(-EINVAL);
@@ -483,7 +491,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
483 ds->dst = dst; 491 ds->dst = dst;
484 ds->index = index; 492 ds->index = index;
485 ds->cd = cd; 493 ds->cd = cd;
486 ds->drv = drv; 494 ds->ops = ops;
487 ds->priv = priv; 495 ds->priv = priv;
488 ds->dev = parent; 496 ds->dev = parent;
489 497
@@ -538,12 +546,12 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
538 ds->dsa_port_mask |= ~(1 << port); 546 ds->dsa_port_mask |= ~(1 << port);
539 } 547 }
540 548
541 if (ds->slave_mii_bus && ds->drv->phy_read) 549 if (ds->slave_mii_bus && ds->ops->phy_read)
542 mdiobus_unregister(ds->slave_mii_bus); 550 mdiobus_unregister(ds->slave_mii_bus);
543} 551}
544 552
545#ifdef CONFIG_PM_SLEEP 553#ifdef CONFIG_PM_SLEEP
546static int dsa_switch_suspend(struct dsa_switch *ds) 554int dsa_switch_suspend(struct dsa_switch *ds)
547{ 555{
548 int i, ret = 0; 556 int i, ret = 0;
549 557
@@ -557,18 +565,19 @@ static int dsa_switch_suspend(struct dsa_switch *ds)
557 return ret; 565 return ret;
558 } 566 }
559 567
560 if (ds->drv->suspend) 568 if (ds->ops->suspend)
561 ret = ds->drv->suspend(ds); 569 ret = ds->ops->suspend(ds);
562 570
563 return ret; 571 return ret;
564} 572}
573EXPORT_SYMBOL_GPL(dsa_switch_suspend);
565 574
566static int dsa_switch_resume(struct dsa_switch *ds) 575int dsa_switch_resume(struct dsa_switch *ds)
567{ 576{
568 int i, ret = 0; 577 int i, ret = 0;
569 578
570 if (ds->drv->resume) 579 if (ds->ops->resume)
571 ret = ds->drv->resume(ds); 580 ret = ds->ops->resume(ds);
572 581
573 if (ret) 582 if (ret)
574 return ret; 583 return ret;
@@ -585,6 +594,7 @@ static int dsa_switch_resume(struct dsa_switch *ds)
585 594
586 return 0; 595 return 0;
587} 596}
597EXPORT_SYMBOL_GPL(dsa_switch_resume);
588#endif 598#endif
589 599
590/* platform driver init and cleanup *****************************************/ 600/* platform driver init and cleanup *****************************************/
@@ -1086,7 +1096,6 @@ static int dsa_resume(struct device *d)
1086static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume); 1096static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume);
1087 1097
1088static const struct of_device_id dsa_of_match_table[] = { 1098static const struct of_device_id dsa_of_match_table[] = {
1089 { .compatible = "brcm,bcm7445-switch-v4.0" },
1090 { .compatible = "marvell,dsa", }, 1099 { .compatible = "marvell,dsa", },
1091 {} 1100 {}
1092}; 1101};
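
Note: the dsa.c hunks above complete the rename of struct dsa_switch_driver to struct dsa_switch_ops. The static tag_protocol field becomes a get_tag_protocol() callback, set_addr() becomes optional, and dsa_switch_suspend()/dsa_switch_resume() are exported. The sketch below shows what a converted driver's ops table could look like; it is an illustration only, the foo_* names are hypothetical, and only callbacks visible in this diff (probe, get_tag_protocol, setup, set_addr, phy_read) are assumed to exist in the structure.

    #include <net/dsa.h>

    /* Legacy platform probing still goes through ops->probe(), as seen in
     * dsa_switch_probe() above; return a model name or NULL.
     */
    static const char *foo_probe(struct device *parent, struct device *host_dev,
                                 int sw_addr, void **priv)
    {
            return "foo";
    }

    /* Replaces the old static ->tag_protocol field. */
    static enum dsa_tag_protocol foo_get_tag_protocol(struct dsa_switch *ds)
    {
            return DSA_TAG_PROTO_QCA;
    }

    /* Basic register setup, called from dsa_switch_setup_one()/dsa_ds_apply(). */
    static int foo_setup(struct dsa_switch *ds)
    {
            return 0;
    }

    static struct dsa_switch_ops foo_switch_ops = {
            .probe            = foo_probe,
            .get_tag_protocol = foo_get_tag_protocol,
            .setup            = foo_setup,
            /* .set_addr omitted on purpose: both dsa.c and dsa2.c now check it */
    };

    static int __init foo_init(void)
    {
            /* same call as before, only the argument type changed */
            register_switch_driver(&foo_switch_ops);
            return 0;
    }
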
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index f30bad9678f0..f8a7d9aab437 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -294,25 +294,23 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
294 int err; 294 int err;
295 295
296 /* Initialize ds->phys_mii_mask before registering the slave MDIO bus 296 /* Initialize ds->phys_mii_mask before registering the slave MDIO bus
297 * driver and before drv->setup() has run, since the switch drivers and 297 * driver and before ops->setup() has run, since the switch drivers and
298 * the slave MDIO bus driver rely on these values for probing PHY 298 * the slave MDIO bus driver rely on these values for probing PHY
299 * devices or not 299 * devices or not
300 */ 300 */
301 ds->phys_mii_mask = ds->enabled_port_mask; 301 ds->phys_mii_mask = ds->enabled_port_mask;
302 302
303 err = ds->drv->setup(ds); 303 err = ds->ops->setup(ds);
304 if (err < 0) 304 if (err < 0)
305 return err; 305 return err;
306 306
307 err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); 307 if (ds->ops->set_addr) {
308 if (err < 0) 308 err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
309 return err; 309 if (err < 0)
310 310 return err;
311 err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); 311 }
312 if (err < 0)
313 return err;
314 312
315 if (!ds->slave_mii_bus && ds->drv->phy_read) { 313 if (!ds->slave_mii_bus && ds->ops->phy_read) {
316 ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); 314 ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
317 if (!ds->slave_mii_bus) 315 if (!ds->slave_mii_bus)
318 return -ENOMEM; 316 return -ENOMEM;
@@ -374,7 +372,7 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
374 dsa_user_port_unapply(port, index, ds); 372 dsa_user_port_unapply(port, index, ds);
375 } 373 }
376 374
377 if (ds->slave_mii_bus && ds->drv->phy_read) 375 if (ds->slave_mii_bus && ds->ops->phy_read)
378 mdiobus_unregister(ds->slave_mii_bus); 376 mdiobus_unregister(ds->slave_mii_bus);
379} 377}
380 378
@@ -443,6 +441,7 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
443 struct dsa_switch_tree *dst, 441 struct dsa_switch_tree *dst,
444 struct dsa_switch *ds) 442 struct dsa_switch *ds)
445{ 443{
444 enum dsa_tag_protocol tag_protocol;
446 struct net_device *ethernet_dev; 445 struct net_device *ethernet_dev;
447 struct device_node *ethernet; 446 struct device_node *ethernet;
448 447
@@ -465,7 +464,8 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
465 dst->cpu_port = index; 464 dst->cpu_port = index;
466 } 465 }
467 466
468 dst->tag_ops = dsa_resolve_tag_protocol(ds->drv->tag_protocol); 467 tag_protocol = ds->ops->get_tag_protocol(ds);
468 dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol);
469 if (IS_ERR(dst->tag_ops)) { 469 if (IS_ERR(dst->tag_ops)) {
470 dev_warn(ds->dev, "No tagger for this switch\n"); 470 dev_warn(ds->dev, "No tagger for this switch\n");
471 return PTR_ERR(dst->tag_ops); 471 return PTR_ERR(dst->tag_ops);
@@ -541,7 +541,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
541 541
542 ds->ports[reg].dn = port; 542 ds->ports[reg].dn = port;
543 543
544 /* Initialize enabled_port_mask now for drv->setup() 544 /* Initialize enabled_port_mask now for ops->setup()
545 * to have access to a correct value, just like what 545 * to have access to a correct value, just like what
546 * net/dsa/dsa.c::dsa_switch_setup_one does. 546 * net/dsa/dsa.c::dsa_switch_setup_one does.
547 */ 547 */
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 00077a9c97f4..6cfd7388834e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -81,5 +81,7 @@ extern const struct dsa_device_ops trailer_netdev_ops;
81/* tag_brcm.c */ 81/* tag_brcm.c */
82extern const struct dsa_device_ops brcm_netdev_ops; 82extern const struct dsa_device_ops brcm_netdev_ops;
83 83
84/* tag_qca.c */
85extern const struct dsa_device_ops qca_netdev_ops;
84 86
85#endif 87#endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index fc9196745225..6b1282c006b1 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -28,7 +28,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
28 struct dsa_switch *ds = bus->priv; 28 struct dsa_switch *ds = bus->priv;
29 29
30 if (ds->phys_mii_mask & (1 << addr)) 30 if (ds->phys_mii_mask & (1 << addr))
31 return ds->drv->phy_read(ds, addr, reg); 31 return ds->ops->phy_read(ds, addr, reg);
32 32
33 return 0xffff; 33 return 0xffff;
34} 34}
@@ -38,7 +38,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
38 struct dsa_switch *ds = bus->priv; 38 struct dsa_switch *ds = bus->priv;
39 39
40 if (ds->phys_mii_mask & (1 << addr)) 40 if (ds->phys_mii_mask & (1 << addr))
41 return ds->drv->phy_write(ds, addr, reg, val); 41 return ds->ops->phy_write(ds, addr, reg, val);
42 42
43 return 0; 43 return 0;
44} 44}
@@ -69,6 +69,30 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p)
69 return !!p->bridge_dev; 69 return !!p->bridge_dev;
70} 70}
71 71
72static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
73{
74 struct dsa_port *dp = &ds->ports[port];
75
76 if (ds->ops->port_stp_state_set)
77 ds->ops->port_stp_state_set(ds, port, state);
78
79 if (ds->ops->port_fast_age) {
80 /* Fast age FDB entries or flush appropriate forwarding database
81 * for the given port, if we are moving it from Learning or
82 * Forwarding state, to Disabled or Blocking or Listening state.
83 */
84
85 if ((dp->stp_state == BR_STATE_LEARNING ||
86 dp->stp_state == BR_STATE_FORWARDING) &&
87 (state == BR_STATE_DISABLED ||
88 state == BR_STATE_BLOCKING ||
89 state == BR_STATE_LISTENING))
90 ds->ops->port_fast_age(ds, port);
91 }
92
93 dp->stp_state = state;
94}
95
72static int dsa_slave_open(struct net_device *dev) 96static int dsa_slave_open(struct net_device *dev)
73{ 97{
74 struct dsa_slave_priv *p = netdev_priv(dev); 98 struct dsa_slave_priv *p = netdev_priv(dev);
@@ -98,14 +122,13 @@ static int dsa_slave_open(struct net_device *dev)
98 goto clear_allmulti; 122 goto clear_allmulti;
99 } 123 }
100 124
101 if (ds->drv->port_enable) { 125 if (ds->ops->port_enable) {
102 err = ds->drv->port_enable(ds, p->port, p->phy); 126 err = ds->ops->port_enable(ds, p->port, p->phy);
103 if (err) 127 if (err)
104 goto clear_promisc; 128 goto clear_promisc;
105 } 129 }
106 130
107 if (ds->drv->port_stp_state_set) 131 dsa_port_set_stp_state(ds, p->port, stp_state);
108 ds->drv->port_stp_state_set(ds, p->port, stp_state);
109 132
110 if (p->phy) 133 if (p->phy)
111 phy_start(p->phy); 134 phy_start(p->phy);
@@ -144,11 +167,10 @@ static int dsa_slave_close(struct net_device *dev)
144 if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) 167 if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
145 dev_uc_del(master, dev->dev_addr); 168 dev_uc_del(master, dev->dev_addr);
146 169
147 if (ds->drv->port_disable) 170 if (ds->ops->port_disable)
148 ds->drv->port_disable(ds, p->port, p->phy); 171 ds->ops->port_disable(ds, p->port, p->phy);
149 172
150 if (ds->drv->port_stp_state_set) 173 dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED);
151 ds->drv->port_stp_state_set(ds, p->port, BR_STATE_DISABLED);
152 174
153 return 0; 175 return 0;
154} 176}
@@ -209,13 +231,13 @@ static int dsa_slave_port_vlan_add(struct net_device *dev,
209 struct dsa_switch *ds = p->parent; 231 struct dsa_switch *ds = p->parent;
210 232
211 if (switchdev_trans_ph_prepare(trans)) { 233 if (switchdev_trans_ph_prepare(trans)) {
212 if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add) 234 if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
213 return -EOPNOTSUPP; 235 return -EOPNOTSUPP;
214 236
215 return ds->drv->port_vlan_prepare(ds, p->port, vlan, trans); 237 return ds->ops->port_vlan_prepare(ds, p->port, vlan, trans);
216 } 238 }
217 239
218 ds->drv->port_vlan_add(ds, p->port, vlan, trans); 240 ds->ops->port_vlan_add(ds, p->port, vlan, trans);
219 241
220 return 0; 242 return 0;
221} 243}
@@ -226,10 +248,10 @@ static int dsa_slave_port_vlan_del(struct net_device *dev,
226 struct dsa_slave_priv *p = netdev_priv(dev); 248 struct dsa_slave_priv *p = netdev_priv(dev);
227 struct dsa_switch *ds = p->parent; 249 struct dsa_switch *ds = p->parent;
228 250
229 if (!ds->drv->port_vlan_del) 251 if (!ds->ops->port_vlan_del)
230 return -EOPNOTSUPP; 252 return -EOPNOTSUPP;
231 253
232 return ds->drv->port_vlan_del(ds, p->port, vlan); 254 return ds->ops->port_vlan_del(ds, p->port, vlan);
233} 255}
234 256
235static int dsa_slave_port_vlan_dump(struct net_device *dev, 257static int dsa_slave_port_vlan_dump(struct net_device *dev,
@@ -239,8 +261,8 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
239 struct dsa_slave_priv *p = netdev_priv(dev); 261 struct dsa_slave_priv *p = netdev_priv(dev);
240 struct dsa_switch *ds = p->parent; 262 struct dsa_switch *ds = p->parent;
241 263
242 if (ds->drv->port_vlan_dump) 264 if (ds->ops->port_vlan_dump)
243 return ds->drv->port_vlan_dump(ds, p->port, vlan, cb); 265 return ds->ops->port_vlan_dump(ds, p->port, vlan, cb);
244 266
245 return -EOPNOTSUPP; 267 return -EOPNOTSUPP;
246} 268}
@@ -253,13 +275,13 @@ static int dsa_slave_port_fdb_add(struct net_device *dev,
253 struct dsa_switch *ds = p->parent; 275 struct dsa_switch *ds = p->parent;
254 276
255 if (switchdev_trans_ph_prepare(trans)) { 277 if (switchdev_trans_ph_prepare(trans)) {
256 if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add) 278 if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add)
257 return -EOPNOTSUPP; 279 return -EOPNOTSUPP;
258 280
259 return ds->drv->port_fdb_prepare(ds, p->port, fdb, trans); 281 return ds->ops->port_fdb_prepare(ds, p->port, fdb, trans);
260 } 282 }
261 283
262 ds->drv->port_fdb_add(ds, p->port, fdb, trans); 284 ds->ops->port_fdb_add(ds, p->port, fdb, trans);
263 285
264 return 0; 286 return 0;
265} 287}
@@ -271,8 +293,8 @@ static int dsa_slave_port_fdb_del(struct net_device *dev,
271 struct dsa_switch *ds = p->parent; 293 struct dsa_switch *ds = p->parent;
272 int ret = -EOPNOTSUPP; 294 int ret = -EOPNOTSUPP;
273 295
274 if (ds->drv->port_fdb_del) 296 if (ds->ops->port_fdb_del)
275 ret = ds->drv->port_fdb_del(ds, p->port, fdb); 297 ret = ds->ops->port_fdb_del(ds, p->port, fdb);
276 298
277 return ret; 299 return ret;
278} 300}
@@ -284,8 +306,52 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev,
284 struct dsa_slave_priv *p = netdev_priv(dev); 306 struct dsa_slave_priv *p = netdev_priv(dev);
285 struct dsa_switch *ds = p->parent; 307 struct dsa_switch *ds = p->parent;
286 308
287 if (ds->drv->port_fdb_dump) 309 if (ds->ops->port_fdb_dump)
288 return ds->drv->port_fdb_dump(ds, p->port, fdb, cb); 310 return ds->ops->port_fdb_dump(ds, p->port, fdb, cb);
311
312 return -EOPNOTSUPP;
313}
314
315static int dsa_slave_port_mdb_add(struct net_device *dev,
316 const struct switchdev_obj_port_mdb *mdb,
317 struct switchdev_trans *trans)
318{
319 struct dsa_slave_priv *p = netdev_priv(dev);
320 struct dsa_switch *ds = p->parent;
321
322 if (switchdev_trans_ph_prepare(trans)) {
323 if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
324 return -EOPNOTSUPP;
325
326 return ds->ops->port_mdb_prepare(ds, p->port, mdb, trans);
327 }
328
329 ds->ops->port_mdb_add(ds, p->port, mdb, trans);
330
331 return 0;
332}
333
334static int dsa_slave_port_mdb_del(struct net_device *dev,
335 const struct switchdev_obj_port_mdb *mdb)
336{
337 struct dsa_slave_priv *p = netdev_priv(dev);
338 struct dsa_switch *ds = p->parent;
339
340 if (ds->ops->port_mdb_del)
341 return ds->ops->port_mdb_del(ds, p->port, mdb);
342
343 return -EOPNOTSUPP;
344}
345
346static int dsa_slave_port_mdb_dump(struct net_device *dev,
347 struct switchdev_obj_port_mdb *mdb,
348 switchdev_obj_dump_cb_t *cb)
349{
350 struct dsa_slave_priv *p = netdev_priv(dev);
351 struct dsa_switch *ds = p->parent;
352
353 if (ds->ops->port_mdb_dump)
354 return ds->ops->port_mdb_dump(ds, p->port, mdb, cb);
289 355
290 return -EOPNOTSUPP; 356 return -EOPNOTSUPP;
291} 357}
@@ -308,9 +374,9 @@ static int dsa_slave_stp_state_set(struct net_device *dev,
308 struct dsa_switch *ds = p->parent; 374 struct dsa_switch *ds = p->parent;
309 375
310 if (switchdev_trans_ph_prepare(trans)) 376 if (switchdev_trans_ph_prepare(trans))
311 return ds->drv->port_stp_state_set ? 0 : -EOPNOTSUPP; 377 return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP;
312 378
313 ds->drv->port_stp_state_set(ds, p->port, attr->u.stp_state); 379 dsa_port_set_stp_state(ds, p->port, attr->u.stp_state);
314 380
315 return 0; 381 return 0;
316} 382}
@@ -326,8 +392,8 @@ static int dsa_slave_vlan_filtering(struct net_device *dev,
326 if (switchdev_trans_ph_prepare(trans)) 392 if (switchdev_trans_ph_prepare(trans))
327 return 0; 393 return 0;
328 394
329 if (ds->drv->port_vlan_filtering) 395 if (ds->ops->port_vlan_filtering)
330 return ds->drv->port_vlan_filtering(ds, p->port, 396 return ds->ops->port_vlan_filtering(ds, p->port,
331 attr->u.vlan_filtering); 397 attr->u.vlan_filtering);
332 398
333 return 0; 399 return 0;
@@ -365,8 +431,8 @@ static int dsa_slave_ageing_time(struct net_device *dev,
365 ds->ports[p->port].ageing_time = ageing_time; 431 ds->ports[p->port].ageing_time = ageing_time;
366 ageing_time = dsa_fastest_ageing_time(ds, ageing_time); 432 ageing_time = dsa_fastest_ageing_time(ds, ageing_time);
367 433
368 if (ds->drv->set_ageing_time) 434 if (ds->ops->set_ageing_time)
369 return ds->drv->set_ageing_time(ds, ageing_time); 435 return ds->ops->set_ageing_time(ds, ageing_time);
370 436
371 return 0; 437 return 0;
372} 438}
@@ -412,6 +478,10 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
412 SWITCHDEV_OBJ_PORT_FDB(obj), 478 SWITCHDEV_OBJ_PORT_FDB(obj),
413 trans); 479 trans);
414 break; 480 break;
481 case SWITCHDEV_OBJ_ID_PORT_MDB:
482 err = dsa_slave_port_mdb_add(dev, SWITCHDEV_OBJ_PORT_MDB(obj),
483 trans);
484 break;
415 case SWITCHDEV_OBJ_ID_PORT_VLAN: 485 case SWITCHDEV_OBJ_ID_PORT_VLAN:
416 err = dsa_slave_port_vlan_add(dev, 486 err = dsa_slave_port_vlan_add(dev,
417 SWITCHDEV_OBJ_PORT_VLAN(obj), 487 SWITCHDEV_OBJ_PORT_VLAN(obj),
@@ -435,6 +505,9 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
435 err = dsa_slave_port_fdb_del(dev, 505 err = dsa_slave_port_fdb_del(dev,
436 SWITCHDEV_OBJ_PORT_FDB(obj)); 506 SWITCHDEV_OBJ_PORT_FDB(obj));
437 break; 507 break;
508 case SWITCHDEV_OBJ_ID_PORT_MDB:
509 err = dsa_slave_port_mdb_del(dev, SWITCHDEV_OBJ_PORT_MDB(obj));
510 break;
438 case SWITCHDEV_OBJ_ID_PORT_VLAN: 511 case SWITCHDEV_OBJ_ID_PORT_VLAN:
439 err = dsa_slave_port_vlan_del(dev, 512 err = dsa_slave_port_vlan_del(dev,
440 SWITCHDEV_OBJ_PORT_VLAN(obj)); 513 SWITCHDEV_OBJ_PORT_VLAN(obj));
@@ -459,6 +532,10 @@ static int dsa_slave_port_obj_dump(struct net_device *dev,
459 SWITCHDEV_OBJ_PORT_FDB(obj), 532 SWITCHDEV_OBJ_PORT_FDB(obj),
460 cb); 533 cb);
461 break; 534 break;
535 case SWITCHDEV_OBJ_ID_PORT_MDB:
536 err = dsa_slave_port_mdb_dump(dev, SWITCHDEV_OBJ_PORT_MDB(obj),
537 cb);
538 break;
462 case SWITCHDEV_OBJ_ID_PORT_VLAN: 539 case SWITCHDEV_OBJ_ID_PORT_VLAN:
463 err = dsa_slave_port_vlan_dump(dev, 540 err = dsa_slave_port_vlan_dump(dev,
464 SWITCHDEV_OBJ_PORT_VLAN(obj), 541 SWITCHDEV_OBJ_PORT_VLAN(obj),
@@ -481,8 +558,8 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
481 558
482 p->bridge_dev = br; 559 p->bridge_dev = br;
483 560
484 if (ds->drv->port_bridge_join) 561 if (ds->ops->port_bridge_join)
485 ret = ds->drv->port_bridge_join(ds, p->port, br); 562 ret = ds->ops->port_bridge_join(ds, p->port, br);
486 563
487 return ret == -EOPNOTSUPP ? 0 : ret; 564 return ret == -EOPNOTSUPP ? 0 : ret;
488} 565}
@@ -493,16 +570,15 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev)
493 struct dsa_switch *ds = p->parent; 570 struct dsa_switch *ds = p->parent;
494 571
495 572
496 if (ds->drv->port_bridge_leave) 573 if (ds->ops->port_bridge_leave)
497 ds->drv->port_bridge_leave(ds, p->port); 574 ds->ops->port_bridge_leave(ds, p->port);
498 575
499 p->bridge_dev = NULL; 576 p->bridge_dev = NULL;
500 577
501 /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, 578 /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
502 * so allow it to be in BR_STATE_FORWARDING to be kept functional 579 * so allow it to be in BR_STATE_FORWARDING to be kept functional
503 */ 580 */
504 if (ds->drv->port_stp_state_set) 581 dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING);
505 ds->drv->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING);
506} 582}
507 583
508static int dsa_slave_port_attr_get(struct net_device *dev, 584static int dsa_slave_port_attr_get(struct net_device *dev,
@@ -605,8 +681,8 @@ static int dsa_slave_get_regs_len(struct net_device *dev)
605 struct dsa_slave_priv *p = netdev_priv(dev); 681 struct dsa_slave_priv *p = netdev_priv(dev);
606 struct dsa_switch *ds = p->parent; 682 struct dsa_switch *ds = p->parent;
607 683
608 if (ds->drv->get_regs_len) 684 if (ds->ops->get_regs_len)
609 return ds->drv->get_regs_len(ds, p->port); 685 return ds->ops->get_regs_len(ds, p->port);
610 686
611 return -EOPNOTSUPP; 687 return -EOPNOTSUPP;
612} 688}
@@ -617,8 +693,8 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
617 struct dsa_slave_priv *p = netdev_priv(dev); 693 struct dsa_slave_priv *p = netdev_priv(dev);
618 struct dsa_switch *ds = p->parent; 694 struct dsa_switch *ds = p->parent;
619 695
620 if (ds->drv->get_regs) 696 if (ds->ops->get_regs)
621 ds->drv->get_regs(ds, p->port, regs, _p); 697 ds->ops->get_regs(ds, p->port, regs, _p);
622} 698}
623 699
624static int dsa_slave_nway_reset(struct net_device *dev) 700static int dsa_slave_nway_reset(struct net_device *dev)
@@ -651,8 +727,8 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev)
651 if (ds->cd && ds->cd->eeprom_len) 727 if (ds->cd && ds->cd->eeprom_len)
652 return ds->cd->eeprom_len; 728 return ds->cd->eeprom_len;
653 729
654 if (ds->drv->get_eeprom_len) 730 if (ds->ops->get_eeprom_len)
655 return ds->drv->get_eeprom_len(ds); 731 return ds->ops->get_eeprom_len(ds);
656 732
657 return 0; 733 return 0;
658} 734}
@@ -663,8 +739,8 @@ static int dsa_slave_get_eeprom(struct net_device *dev,
663 struct dsa_slave_priv *p = netdev_priv(dev); 739 struct dsa_slave_priv *p = netdev_priv(dev);
664 struct dsa_switch *ds = p->parent; 740 struct dsa_switch *ds = p->parent;
665 741
666 if (ds->drv->get_eeprom) 742 if (ds->ops->get_eeprom)
667 return ds->drv->get_eeprom(ds, eeprom, data); 743 return ds->ops->get_eeprom(ds, eeprom, data);
668 744
669 return -EOPNOTSUPP; 745 return -EOPNOTSUPP;
670} 746}
@@ -675,8 +751,8 @@ static int dsa_slave_set_eeprom(struct net_device *dev,
675 struct dsa_slave_priv *p = netdev_priv(dev); 751 struct dsa_slave_priv *p = netdev_priv(dev);
676 struct dsa_switch *ds = p->parent; 752 struct dsa_switch *ds = p->parent;
677 753
678 if (ds->drv->set_eeprom) 754 if (ds->ops->set_eeprom)
679 return ds->drv->set_eeprom(ds, eeprom, data); 755 return ds->ops->set_eeprom(ds, eeprom, data);
680 756
681 return -EOPNOTSUPP; 757 return -EOPNOTSUPP;
682} 758}
@@ -694,8 +770,8 @@ static void dsa_slave_get_strings(struct net_device *dev,
694 strncpy(data + len, "tx_bytes", len); 770 strncpy(data + len, "tx_bytes", len);
695 strncpy(data + 2 * len, "rx_packets", len); 771 strncpy(data + 2 * len, "rx_packets", len);
696 strncpy(data + 3 * len, "rx_bytes", len); 772 strncpy(data + 3 * len, "rx_bytes", len);
697 if (ds->drv->get_strings != NULL) 773 if (ds->ops->get_strings)
698 ds->drv->get_strings(ds, p->port, data + 4 * len); 774 ds->ops->get_strings(ds, p->port, data + 4 * len);
699 } 775 }
700} 776}
701 777
@@ -714,8 +790,8 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
714 dst->master_ethtool_ops.get_ethtool_stats(dev, stats, data); 790 dst->master_ethtool_ops.get_ethtool_stats(dev, stats, data);
715 } 791 }
716 792
717 if (ds->drv->get_ethtool_stats) 793 if (ds->ops->get_ethtool_stats)
718 ds->drv->get_ethtool_stats(ds, cpu_port, data + count); 794 ds->ops->get_ethtool_stats(ds, cpu_port, data + count);
719} 795}
720 796
721static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) 797static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
@@ -727,8 +803,8 @@ static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
727 if (dst->master_ethtool_ops.get_sset_count) 803 if (dst->master_ethtool_ops.get_sset_count)
728 count += dst->master_ethtool_ops.get_sset_count(dev, sset); 804 count += dst->master_ethtool_ops.get_sset_count(dev, sset);
729 805
730 if (sset == ETH_SS_STATS && ds->drv->get_sset_count) 806 if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
731 count += ds->drv->get_sset_count(ds); 807 count += ds->ops->get_sset_count(ds);
732 808
733 return count; 809 return count;
734} 810}
@@ -755,14 +831,14 @@ static void dsa_cpu_port_get_strings(struct net_device *dev,
755 dst->master_ethtool_ops.get_strings(dev, stringset, data); 831 dst->master_ethtool_ops.get_strings(dev, stringset, data);
756 } 832 }
757 833
758 if (stringset == ETH_SS_STATS && ds->drv->get_strings) { 834 if (stringset == ETH_SS_STATS && ds->ops->get_strings) {
759 ndata = data + mcount * len; 835 ndata = data + mcount * len;
760 /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle 836 /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
761 * the output after to prepend our CPU port prefix we 837 * the output after to prepend our CPU port prefix we
762 * constructed earlier 838 * constructed earlier
763 */ 839 */
764 ds->drv->get_strings(ds, cpu_port, ndata); 840 ds->ops->get_strings(ds, cpu_port, ndata);
765 count = ds->drv->get_sset_count(ds); 841 count = ds->ops->get_sset_count(ds);
766 for (i = 0; i < count; i++) { 842 for (i = 0; i < count; i++) {
767 memmove(ndata + (i * len + sizeof(pfx)), 843 memmove(ndata + (i * len + sizeof(pfx)),
768 ndata + i * len, len - sizeof(pfx)); 844 ndata + i * len, len - sizeof(pfx));
@@ -782,8 +858,8 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
782 data[1] = dev->stats.tx_bytes; 858 data[1] = dev->stats.tx_bytes;
783 data[2] = dev->stats.rx_packets; 859 data[2] = dev->stats.rx_packets;
784 data[3] = dev->stats.rx_bytes; 860 data[3] = dev->stats.rx_bytes;
785 if (ds->drv->get_ethtool_stats != NULL) 861 if (ds->ops->get_ethtool_stats)
786 ds->drv->get_ethtool_stats(ds, p->port, data + 4); 862 ds->ops->get_ethtool_stats(ds, p->port, data + 4);
787} 863}
788 864
789static int dsa_slave_get_sset_count(struct net_device *dev, int sset) 865static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
@@ -795,8 +871,8 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
795 int count; 871 int count;
796 872
797 count = 4; 873 count = 4;
798 if (ds->drv->get_sset_count != NULL) 874 if (ds->ops->get_sset_count)
799 count += ds->drv->get_sset_count(ds); 875 count += ds->ops->get_sset_count(ds);
800 876
801 return count; 877 return count;
802 } 878 }
@@ -809,8 +885,8 @@ static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
809 struct dsa_slave_priv *p = netdev_priv(dev); 885 struct dsa_slave_priv *p = netdev_priv(dev);
810 struct dsa_switch *ds = p->parent; 886 struct dsa_switch *ds = p->parent;
811 887
812 if (ds->drv->get_wol) 888 if (ds->ops->get_wol)
813 ds->drv->get_wol(ds, p->port, w); 889 ds->ops->get_wol(ds, p->port, w);
814} 890}
815 891
816static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) 892static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
@@ -819,8 +895,8 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
819 struct dsa_switch *ds = p->parent; 895 struct dsa_switch *ds = p->parent;
820 int ret = -EOPNOTSUPP; 896 int ret = -EOPNOTSUPP;
821 897
822 if (ds->drv->set_wol) 898 if (ds->ops->set_wol)
823 ret = ds->drv->set_wol(ds, p->port, w); 899 ret = ds->ops->set_wol(ds, p->port, w);
824 900
825 return ret; 901 return ret;
826} 902}
@@ -831,10 +907,10 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
831 struct dsa_switch *ds = p->parent; 907 struct dsa_switch *ds = p->parent;
832 int ret; 908 int ret;
833 909
834 if (!ds->drv->set_eee) 910 if (!ds->ops->set_eee)
835 return -EOPNOTSUPP; 911 return -EOPNOTSUPP;
836 912
837 ret = ds->drv->set_eee(ds, p->port, p->phy, e); 913 ret = ds->ops->set_eee(ds, p->port, p->phy, e);
838 if (ret) 914 if (ret)
839 return ret; 915 return ret;
840 916
@@ -850,10 +926,10 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
850 struct dsa_switch *ds = p->parent; 926 struct dsa_switch *ds = p->parent;
851 int ret; 927 int ret;
852 928
853 if (!ds->drv->get_eee) 929 if (!ds->ops->get_eee)
854 return -EOPNOTSUPP; 930 return -EOPNOTSUPP;
855 931
856 ret = ds->drv->get_eee(ds, p->port, e); 932 ret = ds->ops->get_eee(ds, p->port, e);
857 if (ret) 933 if (ret)
858 return ret; 934 return ret;
859 935
@@ -988,8 +1064,8 @@ static void dsa_slave_adjust_link(struct net_device *dev)
988 p->old_pause = p->phy->pause; 1064 p->old_pause = p->phy->pause;
989 } 1065 }
990 1066
991 if (ds->drv->adjust_link && status_changed) 1067 if (ds->ops->adjust_link && status_changed)
992 ds->drv->adjust_link(ds, p->port, p->phy); 1068 ds->ops->adjust_link(ds, p->port, p->phy);
993 1069
994 if (status_changed) 1070 if (status_changed)
995 phy_print_status(p->phy); 1071 phy_print_status(p->phy);
@@ -1004,8 +1080,8 @@ static int dsa_slave_fixed_link_update(struct net_device *dev,
1004 if (dev) { 1080 if (dev) {
1005 p = netdev_priv(dev); 1081 p = netdev_priv(dev);
1006 ds = p->parent; 1082 ds = p->parent;
1007 if (ds->drv->fixed_link_update) 1083 if (ds->ops->fixed_link_update)
1008 ds->drv->fixed_link_update(ds, p->port, status); 1084 ds->ops->fixed_link_update(ds, p->port, status);
1009 } 1085 }
1010 1086
1011 return 0; 1087 return 0;
@@ -1062,8 +1138,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
1062 phy_dn = port_dn; 1138 phy_dn = port_dn;
1063 } 1139 }
1064 1140
1065 if (ds->drv->get_phy_flags) 1141 if (ds->ops->get_phy_flags)
1066 phy_flags = ds->drv->get_phy_flags(ds, p->port); 1142 phy_flags = ds->ops->get_phy_flags(ds, p->port);
1067 1143
1068 if (phy_dn) { 1144 if (phy_dn) {
1069 int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); 1145 int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn);
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
new file mode 100644
index 000000000000..0c90cacee7aa
--- /dev/null
+++ b/net/dsa/tag_qca.c
@@ -0,0 +1,138 @@
1/*
2 * Copyright (c) 2015, The Linux Foundation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 and
6 * only version 2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 */
13
14#include <linux/etherdevice.h>
15#include "dsa_priv.h"
16
17#define QCA_HDR_LEN 2
18#define QCA_HDR_VERSION 0x2
19
20#define QCA_HDR_RECV_VERSION_MASK GENMASK(15, 14)
21#define QCA_HDR_RECV_VERSION_S 14
22#define QCA_HDR_RECV_PRIORITY_MASK GENMASK(13, 11)
23#define QCA_HDR_RECV_PRIORITY_S 11
24#define QCA_HDR_RECV_TYPE_MASK GENMASK(10, 6)
25#define QCA_HDR_RECV_TYPE_S 6
26#define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3)
27#define QCA_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0)
28
29#define QCA_HDR_XMIT_VERSION_MASK GENMASK(15, 14)
30#define QCA_HDR_XMIT_VERSION_S 14
31#define QCA_HDR_XMIT_PRIORITY_MASK GENMASK(13, 11)
32#define QCA_HDR_XMIT_PRIORITY_S 11
33#define QCA_HDR_XMIT_CONTROL_MASK GENMASK(10, 8)
34#define QCA_HDR_XMIT_CONTROL_S 8
35#define QCA_HDR_XMIT_FROM_CPU BIT(7)
36#define QCA_HDR_XMIT_DP_BIT_MASK GENMASK(6, 0)
37
38static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
39{
40 struct dsa_slave_priv *p = netdev_priv(dev);
41 u16 *phdr, hdr;
42
43 dev->stats.tx_packets++;
44 dev->stats.tx_bytes += skb->len;
45
46 if (skb_cow_head(skb, 0) < 0)
47 goto out_free;
48
49 skb_push(skb, QCA_HDR_LEN);
50
51 memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN);
52 phdr = (u16 *)(skb->data + 2 * ETH_ALEN);
53
54 /* Set the version field, and set destination port information */
55 hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
56 QCA_HDR_XMIT_FROM_CPU |
57 BIT(p->port);
58
59 *phdr = htons(hdr);
60
61 return skb;
62
63out_free:
64 kfree_skb(skb);
65 return NULL;
66}
67
68static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
69 struct packet_type *pt, struct net_device *orig_dev)
70{
71 struct dsa_switch_tree *dst = dev->dsa_ptr;
72 struct dsa_switch *ds;
73 u8 ver;
74 int port;
75 __be16 *phdr, hdr;
76
77 if (unlikely(!dst))
78 goto out_drop;
79
80 skb = skb_unshare(skb, GFP_ATOMIC);
81 if (!skb)
82 goto out;
83
84 if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN)))
85 goto out_drop;
86
87 /* The QCA header is added by the switch between src addr and Ethertype
88 * At this point, skb->data points to ethertype so header should be
89 * right before
90 */
91 phdr = (__be16 *)(skb->data - 2);
92 hdr = ntohs(*phdr);
93
94 /* Make sure the version is correct */
95 ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S;
96 if (unlikely(ver != QCA_HDR_VERSION))
97 goto out_drop;
98
99 /* Remove QCA tag and recalculate checksum */
100 skb_pull_rcsum(skb, QCA_HDR_LEN);
101 memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN,
102 ETH_HLEN - QCA_HDR_LEN);
103
104 /* This protocol doesn't support cascading multiple switches so it's
105 * safe to assume the switch is first in the tree
106 */
107 ds = dst->ds[0];
108 if (!ds)
109 goto out_drop;
110
111 /* Get source port information */
112 port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK);
113 if (!ds->ports[port].netdev)
114 goto out_drop;
115
116 /* Update skb & forward the frame accordingly */
117 skb_push(skb, ETH_HLEN);
118 skb->pkt_type = PACKET_HOST;
119 skb->dev = ds->ports[port].netdev;
120 skb->protocol = eth_type_trans(skb, skb->dev);
121
122 skb->dev->stats.rx_packets++;
123 skb->dev->stats.rx_bytes += skb->len;
124
125 netif_receive_skb(skb);
126
127 return 0;
128
129out_drop:
130 kfree_skb(skb);
131out:
132 return 0;
133}
134
135const struct dsa_device_ops qca_netdev_ops = {
136 .xmit = qca_tag_xmit,
137 .rcv = qca_tag_rcv,
138};
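
Note: the 2-byte header built in qca_tag_xmit() is easy to compute by hand. The fragment below is a worked example, not part of the patch, showing the value for a frame sent out switch port 3 using only the masks defined in the new file.

    u16 hdr;

    hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |  /* 0x2 << 14 = 0x8000 */
          QCA_HDR_XMIT_FROM_CPU |                      /* BIT(7)    = 0x0080 */
          BIT(3);                                      /* port 3    = 0x0008 */
    /* hdr == 0x8088, stored big-endian between the MAC addresses and the
     * EtherType, which is why both xmit and rcv memmove() the two addresses
     * across it.
     */

On receive, the source port is recovered from bits 2:0, i.e. hdr & QCA_HDR_RECV_SOURCE_PORT_MASK, after the version field has been checked.
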
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 50d6a9b49f6c..300b06888fdf 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -640,6 +640,21 @@ config TCP_CONG_CDG
640 D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using 640 D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
641 delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg 641 delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
642 642
643config TCP_CONG_BBR
644 tristate "BBR TCP"
645 default n
646 ---help---
647
648 BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
649 maximize network utilization and minimize queues. It builds an explicit
 650	  model of the bottleneck delivery rate and path round-trip
651 propagation delay. It tolerates packet loss and delay unrelated to
652 congestion. It can operate over LAN, WAN, cellular, wifi, or cable
653 modem links. It can coexist with flows that use loss-based congestion
654 control, and can operate with shallow buffers, deep buffers,
655 bufferbloat, policers, or AQM schemes that do not provide a delay
656 signal. It requires the fq ("Fair Queue") pacing packet scheduler.
657
643choice 658choice
644 prompt "Default TCP congestion control" 659 prompt "Default TCP congestion control"
645 default DEFAULT_CUBIC 660 default DEFAULT_CUBIC
@@ -674,6 +689,9 @@ choice
674 config DEFAULT_CDG 689 config DEFAULT_CDG
675 bool "CDG" if TCP_CONG_CDG=y 690 bool "CDG" if TCP_CONG_CDG=y
676 691
692 config DEFAULT_BBR
693 bool "BBR" if TCP_CONG_BBR=y
694
677 config DEFAULT_RENO 695 config DEFAULT_RENO
678 bool "Reno" 696 bool "Reno"
679endchoice 697endchoice
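
Note: a usage sketch, not part of the patch. Once TCP_CONG_BBR is built, a single socket can opt in through the standard TCP_CONGESTION socket option; system-wide selection goes through the net.ipv4.tcp_congestion_control sysctl, and the help text above points out that BBR expects the fq pacing qdisc on the egress path.

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <string.h>
    #include <sys/socket.h>

    static int socket_use_bbr(int fd)
    {
            static const char name[] = "bbr";

            /* fails with ENOENT if tcp_bbr.ko is not built or loaded */
            return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
                              name, strlen(name));
    }
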
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 24629b6f57cc..bc6a6c8b9bcd 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
8 inet_timewait_sock.o inet_connection_sock.o \ 8 inet_timewait_sock.o inet_connection_sock.o \
9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ 9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
10 tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ 10 tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
11 tcp_recovery.o \ 11 tcp_rate.o tcp_recovery.o \
12 tcp_offload.o datagram.o raw.o udp.o udplite.o \ 12 tcp_offload.o datagram.o raw.o udp.o udplite.o \
13 udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 13 udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
14 fib_frontend.o fib_semantics.o fib_trie.o \ 14 fib_frontend.o fib_semantics.o fib_trie.o \
@@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
41obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o 41obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
42obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o 42obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
43obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o 43obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
44obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
44obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 45obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
45obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o 46obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
46obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o 47obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 55513e654d79..1effc986739e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -211,24 +211,19 @@ int inet_listen(struct socket *sock, int backlog)
211 * we can only allow the backlog to be adjusted. 211 * we can only allow the backlog to be adjusted.
212 */ 212 */
213 if (old_state != TCP_LISTEN) { 213 if (old_state != TCP_LISTEN) {
214 /* Check special setups for testing purpose to enable TFO w/o 214 /* Enable TFO w/o requiring TCP_FASTOPEN socket option.
215 * requiring TCP_FASTOPEN sockopt.
216 * Note that only TCP sockets (SOCK_STREAM) will reach here. 215 * Note that only TCP sockets (SOCK_STREAM) will reach here.
217 * Also fastopenq may already been allocated because this 216 * Also fastopen backlog may already been set via the option
218 * socket was in TCP_LISTEN state previously but was 217 * because the socket was in TCP_LISTEN state previously but
219 * shutdown() (rather than close()). 218 * was shutdown() rather than close().
220 */ 219 */
221 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && 220 if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
221 (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
222 !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { 222 !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
223 if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) 223 fastopen_queue_tune(sk, backlog);
224 fastopen_queue_tune(sk, backlog);
225 else if ((sysctl_tcp_fastopen &
226 TFO_SERVER_WO_SOCKOPT2) != 0)
227 fastopen_queue_tune(sk,
228 ((uint)sysctl_tcp_fastopen) >> 16);
229
230 tcp_fastopen_init_key_once(true); 224 tcp_fastopen_init_key_once(true);
231 } 225 }
226
232 err = inet_csk_listen_start(sk, backlog); 227 err = inet_csk_listen_start(sk, backlog);
233 if (err) 228 if (err)
234 goto out; 229 goto out;
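
Note: the simplified branch above only keeps the sysctl-driven server modes. The per-socket path it refers to is the TCP_FASTOPEN option on the listening socket, sketched below from userspace as a hedged example (not part of the patch); the option value is the fastopen queue length that fastopen_queue_tune() would otherwise derive from the backlog or the sysctl.

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    static int enable_server_tfo(int listen_fd)
    {
            int qlen = 128;   /* max pending SYN+data (fastopen) requests */

            return setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN,
                              &qlen, sizeof(qlen));
    }
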
@@ -921,6 +916,8 @@ const struct proto_ops inet_stream_ops = {
921 .mmap = sock_no_mmap, 916 .mmap = sock_no_mmap,
922 .sendpage = inet_sendpage, 917 .sendpage = inet_sendpage,
923 .splice_read = tcp_splice_read, 918 .splice_read = tcp_splice_read,
919 .read_sock = tcp_read_sock,
920 .peek_len = tcp_peek_len,
924#ifdef CONFIG_COMPAT 921#ifdef CONFIG_COMPAT
925 .compat_setsockopt = compat_sock_common_setsockopt, 922 .compat_setsockopt = compat_sock_common_setsockopt,
926 .compat_getsockopt = compat_sock_common_getsockopt, 923 .compat_getsockopt = compat_sock_common_getsockopt,
@@ -1195,7 +1192,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
1195struct sk_buff *inet_gso_segment(struct sk_buff *skb, 1192struct sk_buff *inet_gso_segment(struct sk_buff *skb,
1196 netdev_features_t features) 1193 netdev_features_t features)
1197{ 1194{
1198 bool udpfrag = false, fixedid = false, encap; 1195 bool udpfrag = false, fixedid = false, gso_partial, encap;
1199 struct sk_buff *segs = ERR_PTR(-EINVAL); 1196 struct sk_buff *segs = ERR_PTR(-EINVAL);
1200 const struct net_offload *ops; 1197 const struct net_offload *ops;
1201 unsigned int offset = 0; 1198 unsigned int offset = 0;
@@ -1248,6 +1245,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
1248 if (IS_ERR_OR_NULL(segs)) 1245 if (IS_ERR_OR_NULL(segs))
1249 goto out; 1246 goto out;
1250 1247
1248 gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
1249
1251 skb = segs; 1250 skb = segs;
1252 do { 1251 do {
1253 iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); 1252 iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
@@ -1262,9 +1261,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
1262 iph->id = htons(id); 1261 iph->id = htons(id);
1263 id += skb_shinfo(skb)->gso_segs; 1262 id += skb_shinfo(skb)->gso_segs;
1264 } 1263 }
1265 tot_len = skb_shinfo(skb)->gso_size + 1264
1266 SKB_GSO_CB(skb)->data_offset + 1265 if (gso_partial)
1267 skb->head - (unsigned char *)iph; 1266 tot_len = skb_shinfo(skb)->gso_size +
1267 SKB_GSO_CB(skb)->data_offset +
1268 skb->head - (unsigned char *)iph;
1269 else
1270 tot_len = skb->len - nhoff;
1268 } else { 1271 } else {
1269 if (!fixedid) 1272 if (!fixedid)
1270 iph->id = htons(id++); 1273 iph->id = htons(id++);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1b25daf8c7f1..c3b80478226e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -93,9 +93,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
93 return NULL; 93 return NULL;
94 94
95 switch (id) { 95 switch (id) {
96 case RT_TABLE_LOCAL:
97 rcu_assign_pointer(net->ipv4.fib_local, tb);
98 break;
99 case RT_TABLE_MAIN: 96 case RT_TABLE_MAIN:
100 rcu_assign_pointer(net->ipv4.fib_main, tb); 97 rcu_assign_pointer(net->ipv4.fib_main, tb);
101 break; 98 break;
@@ -137,9 +134,6 @@ static void fib_replace_table(struct net *net, struct fib_table *old,
137{ 134{
138#ifdef CONFIG_IP_MULTIPLE_TABLES 135#ifdef CONFIG_IP_MULTIPLE_TABLES
139 switch (new->tb_id) { 136 switch (new->tb_id) {
140 case RT_TABLE_LOCAL:
141 rcu_assign_pointer(net->ipv4.fib_local, new);
142 break;
143 case RT_TABLE_MAIN: 137 case RT_TABLE_MAIN:
144 rcu_assign_pointer(net->ipv4.fib_main, new); 138 rcu_assign_pointer(net->ipv4.fib_main, new);
145 break; 139 break;
@@ -188,26 +182,13 @@ static void fib_flush(struct net *net)
188 struct fib_table *tb; 182 struct fib_table *tb;
189 183
190 hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) 184 hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
191 flushed += fib_table_flush(tb); 185 flushed += fib_table_flush(net, tb);
192 } 186 }
193 187
194 if (flushed) 188 if (flushed)
195 rt_cache_flush(net); 189 rt_cache_flush(net);
196} 190}
197 191
198void fib_flush_external(struct net *net)
199{
200 struct fib_table *tb;
201 struct hlist_head *head;
202 unsigned int h;
203
204 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
205 head = &net->ipv4.fib_table_hash[h];
206 hlist_for_each_entry(tb, head, tb_hlist)
207 fib_table_flush_external(tb);
208 }
209}
210
211/* 192/*
212 * Find address type as if only "dev" was present in the system. If 193 * Find address type as if only "dev" was present in the system. If
213 * on_dev is NULL then all interfaces are taken into consideration. 194 * on_dev is NULL then all interfaces are taken into consideration.
@@ -596,13 +577,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
596 if (cmd == SIOCDELRT) { 577 if (cmd == SIOCDELRT) {
597 tb = fib_get_table(net, cfg.fc_table); 578 tb = fib_get_table(net, cfg.fc_table);
598 if (tb) 579 if (tb)
599 err = fib_table_delete(tb, &cfg); 580 err = fib_table_delete(net, tb, &cfg);
600 else 581 else
601 err = -ESRCH; 582 err = -ESRCH;
602 } else { 583 } else {
603 tb = fib_new_table(net, cfg.fc_table); 584 tb = fib_new_table(net, cfg.fc_table);
604 if (tb) 585 if (tb)
605 err = fib_table_insert(tb, &cfg); 586 err = fib_table_insert(net, tb, &cfg);
606 else 587 else
607 err = -ENOBUFS; 588 err = -ENOBUFS;
608 } 589 }
@@ -725,7 +706,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
725 goto errout; 706 goto errout;
726 } 707 }
727 708
728 err = fib_table_delete(tb, &cfg); 709 err = fib_table_delete(net, tb, &cfg);
729errout: 710errout:
730 return err; 711 return err;
731} 712}
@@ -747,7 +728,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
747 goto errout; 728 goto errout;
748 } 729 }
749 730
750 err = fib_table_insert(tb, &cfg); 731 err = fib_table_insert(net, tb, &cfg);
751errout: 732errout:
752 return err; 733 return err;
753} 734}
@@ -834,9 +815,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
834 cfg.fc_scope = RT_SCOPE_HOST; 815 cfg.fc_scope = RT_SCOPE_HOST;
835 816
836 if (cmd == RTM_NEWROUTE) 817 if (cmd == RTM_NEWROUTE)
837 fib_table_insert(tb, &cfg); 818 fib_table_insert(net, tb, &cfg);
838 else 819 else
839 fib_table_delete(tb, &cfg); 820 fib_table_delete(net, tb, &cfg);
840} 821}
841 822
842void fib_add_ifaddr(struct in_ifaddr *ifa) 823void fib_add_ifaddr(struct in_ifaddr *ifa)
@@ -1250,7 +1231,6 @@ static void ip_fib_net_exit(struct net *net)
1250 1231
1251 rtnl_lock(); 1232 rtnl_lock();
1252#ifdef CONFIG_IP_MULTIPLE_TABLES 1233#ifdef CONFIG_IP_MULTIPLE_TABLES
1253 RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
1254 RCU_INIT_POINTER(net->ipv4.fib_main, NULL); 1234 RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
1255 RCU_INIT_POINTER(net->ipv4.fib_default, NULL); 1235 RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
1256#endif 1236#endif
@@ -1261,7 +1241,7 @@ static void ip_fib_net_exit(struct net *net)
1261 1241
1262 hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { 1242 hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
1263 hlist_del(&tb->tb_hlist); 1243 hlist_del(&tb->tb_hlist);
1264 fib_table_flush(tb); 1244 fib_table_flush(net, tb);
1265 fib_free_table(tb); 1245 fib_free_table(tb);
1266 } 1246 }
1267 } 1247 }
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 6e9ea69e5f75..2e50062f642d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -56,6 +56,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp,
56 }; 56 };
57 int err; 57 int err;
58 58
59 /* update flow if oif or iif point to device enslaved to l3mdev */
60 l3mdev_update_flow(net, flowi4_to_flowi(flp));
61
59 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); 62 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
60#ifdef CONFIG_IP_ROUTE_CLASSID 63#ifdef CONFIG_IP_ROUTE_CLASSID
61 if (arg.rule) 64 if (arg.rule)
@@ -161,6 +164,14 @@ static struct fib_table *fib_empty_table(struct net *net)
161 return NULL; 164 return NULL;
162} 165}
163 166
167static int call_fib_rule_notifiers(struct net *net,
168 enum fib_event_type event_type)
169{
170 struct fib_notifier_info info;
171
172 return call_fib_notifiers(net, event_type, &info);
173}
174
164static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { 175static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
165 FRA_GENERIC_POLICY, 176 FRA_GENERIC_POLICY,
166 [FRA_FLOW] = { .type = NLA_U32 }, 177 [FRA_FLOW] = { .type = NLA_U32 },
@@ -217,7 +228,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
217 rule4->tos = frh->tos; 228 rule4->tos = frh->tos;
218 229
219 net->ipv4.fib_has_custom_rules = true; 230 net->ipv4.fib_has_custom_rules = true;
220 fib_flush_external(rule->fr_net); 231 call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD);
221 232
222 err = 0; 233 err = 0;
223errout: 234errout:
@@ -239,7 +250,7 @@ static int fib4_rule_delete(struct fib_rule *rule)
239 net->ipv4.fib_num_tclassid_users--; 250 net->ipv4.fib_num_tclassid_users--;
240#endif 251#endif
241 net->ipv4.fib_has_custom_rules = true; 252 net->ipv4.fib_has_custom_rules = true;
242 fib_flush_external(rule->fr_net); 253 call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL);
243errout: 254errout:
244 return err; 255 return err;
245} 256}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e9f56225e53f..388d3e21629b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1580,7 +1580,8 @@ static bool fib_good_nh(const struct fib_nh *nh)
1580 1580
1581 rcu_read_lock_bh(); 1581 rcu_read_lock_bh();
1582 1582
1583 n = __ipv4_neigh_lookup_noref(nh->nh_dev, nh->nh_gw); 1583 n = __ipv4_neigh_lookup_noref(nh->nh_dev,
1584 (__force u32)nh->nh_gw);
1584 if (n) 1585 if (n)
1585 state = n->nud_state; 1586 state = n->nud_state;
1586 1587
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e2ffc2a5c7db..31cef3602585 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -73,6 +73,7 @@
73#include <linux/slab.h> 73#include <linux/slab.h>
74#include <linux/export.h> 74#include <linux/export.h>
75#include <linux/vmalloc.h> 75#include <linux/vmalloc.h>
76#include <linux/notifier.h>
76#include <net/net_namespace.h> 77#include <net/net_namespace.h>
77#include <net/ip.h> 78#include <net/ip.h>
78#include <net/protocol.h> 79#include <net/protocol.h>
@@ -80,10 +81,47 @@
80#include <net/tcp.h> 81#include <net/tcp.h>
81#include <net/sock.h> 82#include <net/sock.h>
82#include <net/ip_fib.h> 83#include <net/ip_fib.h>
83#include <net/switchdev.h>
84#include <trace/events/fib.h> 84#include <trace/events/fib.h>
85#include "fib_lookup.h" 85#include "fib_lookup.h"
86 86
87static BLOCKING_NOTIFIER_HEAD(fib_chain);
88
89int register_fib_notifier(struct notifier_block *nb)
90{
91 return blocking_notifier_chain_register(&fib_chain, nb);
92}
93EXPORT_SYMBOL(register_fib_notifier);
94
95int unregister_fib_notifier(struct notifier_block *nb)
96{
97 return blocking_notifier_chain_unregister(&fib_chain, nb);
98}
99EXPORT_SYMBOL(unregister_fib_notifier);
100
101int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
102 struct fib_notifier_info *info)
103{
104 info->net = net;
105 return blocking_notifier_call_chain(&fib_chain, event_type, info);
106}
107
108static int call_fib_entry_notifiers(struct net *net,
109 enum fib_event_type event_type, u32 dst,
110 int dst_len, struct fib_info *fi,
111 u8 tos, u8 type, u32 tb_id, u32 nlflags)
112{
113 struct fib_entry_notifier_info info = {
114 .dst = dst,
115 .dst_len = dst_len,
116 .fi = fi,
117 .tos = tos,
118 .type = type,
119 .tb_id = tb_id,
120 .nlflags = nlflags,
121 };
122 return call_fib_notifiers(net, event_type, &info.info);
123}
124
87#define MAX_STAT_DEPTH 32 125#define MAX_STAT_DEPTH 32
88 126
89#define KEYLENGTH (8*sizeof(t_key)) 127#define KEYLENGTH (8*sizeof(t_key))
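
Note: this notifier chain replaces the direct switchdev_fib_ipv4_add()/switchdev_fib_ipv4_abort() calls removed in the hunks further down, so route offload now happens in whoever subscribes to the chain. A hedged consumer sketch is below; only the registration helpers and the fib_entry_notifier_info fields visible in this hunk are relied on, and FIB_EVENT_ENTRY_DEL is assumed to exist alongside FIB_EVENT_ENTRY_ADD.

    #include <linux/notifier.h>
    #include <net/ip_fib.h>

    static int foo_fib_event(struct notifier_block *nb, unsigned long event,
                             void *ptr)
    {
            /* the chain is invoked with &info.info, so recover the outer struct */
            struct fib_entry_notifier_info *fen_info =
                    container_of(ptr, struct fib_entry_notifier_info, info);

            switch (event) {
            case FIB_EVENT_ENTRY_ADD:
                    /* program fen_info->dst/dst_len into hardware via fen_info->fi */
                    break;
            case FIB_EVENT_ENTRY_DEL:
                    /* withdraw the route from hardware */
                    break;
            }
            return NOTIFY_DONE;
    }

    static struct notifier_block foo_fib_nb = {
            .notifier_call = foo_fib_event,
    };

    /* at driver init: register_fib_notifier(&foo_fib_nb); */
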
@@ -1076,12 +1114,13 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
1076} 1114}
1077 1115
1078/* Caller must hold RTNL. */ 1116/* Caller must hold RTNL. */
1079int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) 1117int fib_table_insert(struct net *net, struct fib_table *tb,
1118 struct fib_config *cfg)
1080{ 1119{
1081 struct trie *t = (struct trie *)tb->tb_data; 1120 struct trie *t = (struct trie *)tb->tb_data;
1082 struct fib_alias *fa, *new_fa; 1121 struct fib_alias *fa, *new_fa;
1083 struct key_vector *l, *tp; 1122 struct key_vector *l, *tp;
1084 unsigned int nlflags = 0; 1123 u16 nlflags = NLM_F_EXCL;
1085 struct fib_info *fi; 1124 struct fib_info *fi;
1086 u8 plen = cfg->fc_dst_len; 1125 u8 plen = cfg->fc_dst_len;
1087 u8 slen = KEYLENGTH - plen; 1126 u8 slen = KEYLENGTH - plen;
@@ -1126,6 +1165,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1126 if (cfg->fc_nlflags & NLM_F_EXCL) 1165 if (cfg->fc_nlflags & NLM_F_EXCL)
1127 goto out; 1166 goto out;
1128 1167
1168 nlflags &= ~NLM_F_EXCL;
1169
1129 /* We have 2 goals: 1170 /* We have 2 goals:
1130 * 1. Find exact match for type, scope, fib_info to avoid 1171 * 1. Find exact match for type, scope, fib_info to avoid
1131 * duplicate routes 1172 * duplicate routes
@@ -1151,6 +1192,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1151 struct fib_info *fi_drop; 1192 struct fib_info *fi_drop;
1152 u8 state; 1193 u8 state;
1153 1194
1195 nlflags |= NLM_F_REPLACE;
1154 fa = fa_first; 1196 fa = fa_first;
1155 if (fa_match) { 1197 if (fa_match) {
1156 if (fa == fa_match) 1198 if (fa == fa_match)
@@ -1172,17 +1214,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1172 new_fa->tb_id = tb->tb_id; 1214 new_fa->tb_id = tb->tb_id;
1173 new_fa->fa_default = -1; 1215 new_fa->fa_default = -1;
1174 1216
1175 err = switchdev_fib_ipv4_add(key, plen, fi,
1176 new_fa->fa_tos,
1177 cfg->fc_type,
1178 cfg->fc_nlflags,
1179 tb->tb_id);
1180 if (err) {
1181 switchdev_fib_ipv4_abort(fi);
1182 kmem_cache_free(fn_alias_kmem, new_fa);
1183 goto out;
1184 }
1185
1186 hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); 1217 hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1187 1218
1188 alias_free_mem_rcu(fa); 1219 alias_free_mem_rcu(fa);
@@ -1190,8 +1221,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1190 fib_release_info(fi_drop); 1221 fib_release_info(fi_drop);
1191 if (state & FA_S_ACCESSED) 1222 if (state & FA_S_ACCESSED)
1192 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1223 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1224
1225 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
1226 key, plen, fi,
1227 new_fa->fa_tos, cfg->fc_type,
1228 tb->tb_id, cfg->fc_nlflags);
1193 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, 1229 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1194 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); 1230 tb->tb_id, &cfg->fc_nlinfo, nlflags);
1195 1231
1196 goto succeeded; 1232 goto succeeded;
1197 } 1233 }
@@ -1203,7 +1239,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1203 goto out; 1239 goto out;
1204 1240
1205 if (cfg->fc_nlflags & NLM_F_APPEND) 1241 if (cfg->fc_nlflags & NLM_F_APPEND)
1206 nlflags = NLM_F_APPEND; 1242 nlflags |= NLM_F_APPEND;
1207 else 1243 else
1208 fa = fa_first; 1244 fa = fa_first;
1209 } 1245 }
@@ -1211,6 +1247,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1211 if (!(cfg->fc_nlflags & NLM_F_CREATE)) 1247 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1212 goto out; 1248 goto out;
1213 1249
1250 nlflags |= NLM_F_CREATE;
1214 err = -ENOBUFS; 1251 err = -ENOBUFS;
1215 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); 1252 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1216 if (!new_fa) 1253 if (!new_fa)
@@ -1224,30 +1261,22 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1224 new_fa->tb_id = tb->tb_id; 1261 new_fa->tb_id = tb->tb_id;
1225 new_fa->fa_default = -1; 1262 new_fa->fa_default = -1;
1226 1263
1227 /* (Optionally) offload fib entry to switch hardware. */
1228 err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
1229 cfg->fc_nlflags, tb->tb_id);
1230 if (err) {
1231 switchdev_fib_ipv4_abort(fi);
1232 goto out_free_new_fa;
1233 }
1234
1235 /* Insert new entry to the list. */ 1264 /* Insert new entry to the list. */
1236 err = fib_insert_alias(t, tp, l, new_fa, fa, key); 1265 err = fib_insert_alias(t, tp, l, new_fa, fa, key);
1237 if (err) 1266 if (err)
1238 goto out_sw_fib_del; 1267 goto out_free_new_fa;
1239 1268
1240 if (!plen) 1269 if (!plen)
1241 tb->tb_num_default++; 1270 tb->tb_num_default++;
1242 1271
1243 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1272 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1273 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
1274 cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
1244 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, 1275 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
1245 &cfg->fc_nlinfo, nlflags); 1276 &cfg->fc_nlinfo, nlflags);
1246succeeded: 1277succeeded:
1247 return 0; 1278 return 0;
1248 1279
1249out_sw_fib_del:
1250 switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
1251out_free_new_fa: 1280out_free_new_fa:
1252 kmem_cache_free(fn_alias_kmem, new_fa); 1281 kmem_cache_free(fn_alias_kmem, new_fa);
1253out: 1282out:
@@ -1486,7 +1515,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
1486} 1515}
1487 1516
1488/* Caller must hold RTNL. */ 1517/* Caller must hold RTNL. */
1489int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) 1518int fib_table_delete(struct net *net, struct fib_table *tb,
1519 struct fib_config *cfg)
1490{ 1520{
1491 struct trie *t = (struct trie *) tb->tb_data; 1521 struct trie *t = (struct trie *) tb->tb_data;
1492 struct fib_alias *fa, *fa_to_delete; 1522 struct fib_alias *fa, *fa_to_delete;
@@ -1539,9 +1569,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1539 if (!fa_to_delete) 1569 if (!fa_to_delete)
1540 return -ESRCH; 1570 return -ESRCH;
1541 1571
1542 switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, 1572 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
1543 cfg->fc_type, tb->tb_id); 1573 fa_to_delete->fa_info, tos, cfg->fc_type,
1544 1574 tb->tb_id, 0);
1545 rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, 1575 rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
1546 &cfg->fc_nlinfo, 0); 1576 &cfg->fc_nlinfo, 0);
1547 1577
@@ -1730,82 +1760,8 @@ out:
1730 return NULL; 1760 return NULL;
1731} 1761}
1732 1762
1733/* Caller must hold RTNL */
1734void fib_table_flush_external(struct fib_table *tb)
1735{
1736 struct trie *t = (struct trie *)tb->tb_data;
1737 struct key_vector *pn = t->kv;
1738 unsigned long cindex = 1;
1739 struct hlist_node *tmp;
1740 struct fib_alias *fa;
1741
1742 /* walk trie in reverse order */
1743 for (;;) {
1744 unsigned char slen = 0;
1745 struct key_vector *n;
1746
1747 if (!(cindex--)) {
1748 t_key pkey = pn->key;
1749
1750 /* cannot resize the trie vector */
1751 if (IS_TRIE(pn))
1752 break;
1753
1754 /* resize completed node */
1755 pn = resize(t, pn);
1756 cindex = get_index(pkey, pn);
1757
1758 continue;
1759 }
1760
1761 /* grab the next available node */
1762 n = get_child(pn, cindex);
1763 if (!n)
1764 continue;
1765
1766 if (IS_TNODE(n)) {
1767 /* record pn and cindex for leaf walking */
1768 pn = n;
1769 cindex = 1ul << n->bits;
1770
1771 continue;
1772 }
1773
1774 hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1775 struct fib_info *fi = fa->fa_info;
1776
1777 /* if alias was cloned to local then we just
1778 * need to remove the local copy from main
1779 */
1780 if (tb->tb_id != fa->tb_id) {
1781 hlist_del_rcu(&fa->fa_list);
1782 alias_free_mem_rcu(fa);
1783 continue;
1784 }
1785
1786 /* record local slen */
1787 slen = fa->fa_slen;
1788
1789 if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
1790 continue;
1791
1792 switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
1793 fi, fa->fa_tos, fa->fa_type,
1794 tb->tb_id);
1795 }
1796
1797 /* update leaf slen */
1798 n->slen = slen;
1799
1800 if (hlist_empty(&n->leaf)) {
1801 put_child_root(pn, n->key, NULL);
1802 node_free(n);
1803 }
1804 }
1805}
1806
1807/* Caller must hold RTNL. */ 1763/* Caller must hold RTNL. */
1808int fib_table_flush(struct fib_table *tb) 1764int fib_table_flush(struct net *net, struct fib_table *tb)
1809{ 1765{
1810 struct trie *t = (struct trie *)tb->tb_data; 1766 struct trie *t = (struct trie *)tb->tb_data;
1811 struct key_vector *pn = t->kv; 1767 struct key_vector *pn = t->kv;
@@ -1854,9 +1810,11 @@ int fib_table_flush(struct fib_table *tb)
1854 continue; 1810 continue;
1855 } 1811 }
1856 1812
1857 switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, 1813 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
1858 fi, fa->fa_tos, fa->fa_type, 1814 n->key,
1859 tb->tb_id); 1815 KEYLENGTH - fa->fa_slen,
1816 fi, fa->fa_tos, fa->fa_type,
1817 tb->tb_id, 0);
1860 hlist_del_rcu(&fa->fa_list); 1818 hlist_del_rcu(&fa->fa_list);
1861 fib_release_info(fa->fa_info); 1819 fib_release_info(fa->fa_info);
1862 alias_free_mem_rcu(fa); 1820 alias_free_mem_rcu(fa);
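
[Note on the fib_trie.c hunks above] The hard-wired switchdev_fib_ipv4_add/del calls are replaced by a blocking notifier chain, so any in-kernel consumer can subscribe to IPv4 route changes through register_fib_notifier() and receive FIB_EVENT_ENTRY_ADD / FIB_EVENT_ENTRY_DEL with a struct fib_entry_notifier_info describing the route. A minimal subscriber sketch follows; it uses only the entry points visible in this hunk and assumes their declarations (and the fib_entry_notifier_info layout with its embedded "info" member) are exported via <net/ip_fib.h> by the companion header change, which is not shown here.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <net/ip_fib.h>

/* Hypothetical consumer: log every IPv4 route insertion and removal. */
static int demo_fib_event(struct notifier_block *nb, unsigned long event,
			  void *ptr)
{
	struct fib_entry_notifier_info *fen_info;

	switch (event) {
	case FIB_EVENT_ENTRY_ADD:
	case FIB_EVENT_ENTRY_DEL:
		fen_info = container_of(ptr, struct fib_entry_notifier_info,
					info);
		pr_info("fib %s %pI4h/%d table %u\n",
			event == FIB_EVENT_ENTRY_ADD ? "add" : "del",
			&fen_info->dst, fen_info->dst_len, fen_info->tb_id);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block demo_fib_nb = {
	.notifier_call = demo_fib_event,
};

static int __init demo_fib_init(void)
{
	return register_fib_notifier(&demo_fib_nb);
}

static void __exit demo_fib_exit(void)
{
	unregister_fib_notifier(&demo_fib_nb);
}

module_init(demo_fib_init);
module_exit(demo_fib_exit);
MODULE_LICENSE("GPL");

Because fib_table_insert()/fib_table_delete() run under RTNL and the chain is a blocking notifier, callbacks execute synchronously in process context with RTNL held.
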
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 321d57f825ce..cf50f7e2b012 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -631,7 +631,7 @@ static struct genl_family fou_nl_family = {
631 .netnsok = true, 631 .netnsok = true,
632}; 632};
633 633
634static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { 634static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
635 [FOU_ATTR_PORT] = { .type = NLA_U16, }, 635 [FOU_ATTR_PORT] = { .type = NLA_U16, },
636 [FOU_ATTR_AF] = { .type = NLA_U8, }, 636 [FOU_ATTR_AF] = { .type = NLA_U8, },
637 [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, 637 [FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index ecd1e09dbbf1..96e0efecefa6 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
24 __be16 protocol = skb->protocol; 24 __be16 protocol = skb->protocol;
25 u16 mac_len = skb->mac_len; 25 u16 mac_len = skb->mac_len;
26 int gre_offset, outer_hlen; 26 int gre_offset, outer_hlen;
27 bool need_csum, ufo; 27 bool need_csum, ufo, gso_partial;
28 28
29 if (!skb->encapsulation) 29 if (!skb->encapsulation)
30 goto out; 30 goto out;
@@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
69 goto out; 69 goto out;
70 } 70 }
71 71
72 gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
73
72 outer_hlen = skb_tnl_header_len(skb); 74 outer_hlen = skb_tnl_header_len(skb);
73 gre_offset = outer_hlen - tnl_hlen; 75 gre_offset = outer_hlen - tnl_hlen;
74 skb = segs; 76 skb = segs;
@@ -96,7 +98,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
96 greh = (struct gre_base_hdr *)skb_transport_header(skb); 98 greh = (struct gre_base_hdr *)skb_transport_header(skb);
97 pcsum = (__sum16 *)(greh + 1); 99 pcsum = (__sum16 *)(greh + 1);
98 100
99 if (skb_is_gso(skb)) { 101 if (gso_partial) {
100 unsigned int partial_adj; 102 unsigned int partial_adj;
101 103
102 /* Adjust checksum to account for the fact that 104 /* Adjust checksum to account for the fact that
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 9b4ca87f70ba..606cc3e85d2b 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -472,6 +472,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
472 continue; 472 continue;
473 } 473 }
474 474
475 /* Based on RFC3376 5.1. Should not send source-list change
476 * records when there is a filter mode change.
477 */
478 if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) ||
479 (!gdeleted && pmc->crcount)) &&
480 (type == IGMPV3_ALLOW_NEW_SOURCES ||
481 type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount)
482 goto decrease_sf_crcount;
483
475 /* clear marks on query responses */ 484 /* clear marks on query responses */
476 if (isquery) 485 if (isquery)
477 psf->sf_gsresp = 0; 486 psf->sf_gsresp = 0;
@@ -499,6 +508,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
499 scount++; stotal++; 508 scount++; stotal++;
500 if ((type == IGMPV3_ALLOW_NEW_SOURCES || 509 if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
501 type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) { 510 type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
511decrease_sf_crcount:
502 psf->sf_crcount--; 512 psf->sf_crcount--;
503 if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { 513 if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
504 if (psf_prev) 514 if (psf_prev)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 38c2c47fe0e8..e4d16fc5bbb3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -45,6 +45,7 @@ struct inet_diag_entry {
45 u16 family; 45 u16 family;
46 u16 userlocks; 46 u16 userlocks;
47 u32 ifindex; 47 u32 ifindex;
48 u32 mark;
48}; 49};
49 50
50static DEFINE_MUTEX(inet_diag_table_mutex); 51static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -98,6 +99,7 @@ static size_t inet_sk_attr_size(void)
98 + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ 99 + nla_total_size(1) /* INET_DIAG_SHUTDOWN */
99 + nla_total_size(1) /* INET_DIAG_TOS */ 100 + nla_total_size(1) /* INET_DIAG_TOS */
100 + nla_total_size(1) /* INET_DIAG_TCLASS */ 101 + nla_total_size(1) /* INET_DIAG_TCLASS */
102 + nla_total_size(4) /* INET_DIAG_MARK */
101 + nla_total_size(sizeof(struct inet_diag_meminfo)) 103 + nla_total_size(sizeof(struct inet_diag_meminfo))
102 + nla_total_size(sizeof(struct inet_diag_msg)) 104 + nla_total_size(sizeof(struct inet_diag_msg))
103 + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) 105 + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
@@ -108,7 +110,8 @@ static size_t inet_sk_attr_size(void)
108 110
109int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, 111int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
110 struct inet_diag_msg *r, int ext, 112 struct inet_diag_msg *r, int ext,
111 struct user_namespace *user_ns) 113 struct user_namespace *user_ns,
114 bool net_admin)
112{ 115{
113 const struct inet_sock *inet = inet_sk(sk); 116 const struct inet_sock *inet = inet_sk(sk);
114 117
@@ -135,6 +138,9 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
135 } 138 }
136#endif 139#endif
137 140
141 if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
142 goto errout;
143
138 r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); 144 r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
139 r->idiag_inode = sock_i_ino(sk); 145 r->idiag_inode = sock_i_ino(sk);
140 146
@@ -148,7 +154,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
148 struct sk_buff *skb, const struct inet_diag_req_v2 *req, 154 struct sk_buff *skb, const struct inet_diag_req_v2 *req,
149 struct user_namespace *user_ns, 155 struct user_namespace *user_ns,
150 u32 portid, u32 seq, u16 nlmsg_flags, 156 u32 portid, u32 seq, u16 nlmsg_flags,
151 const struct nlmsghdr *unlh) 157 const struct nlmsghdr *unlh,
158 bool net_admin)
152{ 159{
153 const struct tcp_congestion_ops *ca_ops; 160 const struct tcp_congestion_ops *ca_ops;
154 const struct inet_diag_handler *handler; 161 const struct inet_diag_handler *handler;
@@ -174,7 +181,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
174 r->idiag_timer = 0; 181 r->idiag_timer = 0;
175 r->idiag_retrans = 0; 182 r->idiag_retrans = 0;
176 183
177 if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) 184 if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
178 goto errout; 185 goto errout;
179 186
180 if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { 187 if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -273,10 +280,11 @@ static int inet_csk_diag_fill(struct sock *sk,
273 const struct inet_diag_req_v2 *req, 280 const struct inet_diag_req_v2 *req,
274 struct user_namespace *user_ns, 281 struct user_namespace *user_ns,
275 u32 portid, u32 seq, u16 nlmsg_flags, 282 u32 portid, u32 seq, u16 nlmsg_flags,
276 const struct nlmsghdr *unlh) 283 const struct nlmsghdr *unlh,
284 bool net_admin)
277{ 285{
278 return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, 286 return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
279 user_ns, portid, seq, nlmsg_flags, unlh); 287 portid, seq, nlmsg_flags, unlh, net_admin);
280} 288}
281 289
282static int inet_twsk_diag_fill(struct sock *sk, 290static int inet_twsk_diag_fill(struct sock *sk,
@@ -318,8 +326,9 @@ static int inet_twsk_diag_fill(struct sock *sk,
318 326
319static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, 327static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
320 u32 portid, u32 seq, u16 nlmsg_flags, 328 u32 portid, u32 seq, u16 nlmsg_flags,
321 const struct nlmsghdr *unlh) 329 const struct nlmsghdr *unlh, bool net_admin)
322{ 330{
331 struct request_sock *reqsk = inet_reqsk(sk);
323 struct inet_diag_msg *r; 332 struct inet_diag_msg *r;
324 struct nlmsghdr *nlh; 333 struct nlmsghdr *nlh;
325 long tmo; 334 long tmo;
@@ -333,7 +342,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
333 inet_diag_msg_common_fill(r, sk); 342 inet_diag_msg_common_fill(r, sk);
334 r->idiag_state = TCP_SYN_RECV; 343 r->idiag_state = TCP_SYN_RECV;
335 r->idiag_timer = 1; 344 r->idiag_timer = 1;
336 r->idiag_retrans = inet_reqsk(sk)->num_retrans; 345 r->idiag_retrans = reqsk->num_retrans;
337 346
338 BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != 347 BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
339 offsetof(struct sock, sk_cookie)); 348 offsetof(struct sock, sk_cookie));
@@ -345,6 +354,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
345 r->idiag_uid = 0; 354 r->idiag_uid = 0;
346 r->idiag_inode = 0; 355 r->idiag_inode = 0;
347 356
357 if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
358 inet_rsk(reqsk)->ir_mark))
359 return -EMSGSIZE;
360
348 nlmsg_end(skb, nlh); 361 nlmsg_end(skb, nlh);
349 return 0; 362 return 0;
350} 363}
@@ -353,7 +366,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
353 const struct inet_diag_req_v2 *r, 366 const struct inet_diag_req_v2 *r,
354 struct user_namespace *user_ns, 367 struct user_namespace *user_ns,
355 u32 portid, u32 seq, u16 nlmsg_flags, 368 u32 portid, u32 seq, u16 nlmsg_flags,
356 const struct nlmsghdr *unlh) 369 const struct nlmsghdr *unlh, bool net_admin)
357{ 370{
358 if (sk->sk_state == TCP_TIME_WAIT) 371 if (sk->sk_state == TCP_TIME_WAIT)
359 return inet_twsk_diag_fill(sk, skb, portid, seq, 372 return inet_twsk_diag_fill(sk, skb, portid, seq,
@@ -361,10 +374,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
361 374
362 if (sk->sk_state == TCP_NEW_SYN_RECV) 375 if (sk->sk_state == TCP_NEW_SYN_RECV)
363 return inet_req_diag_fill(sk, skb, portid, seq, 376 return inet_req_diag_fill(sk, skb, portid, seq,
364 nlmsg_flags, unlh); 377 nlmsg_flags, unlh, net_admin);
365 378
366 return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, 379 return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
367 nlmsg_flags, unlh); 380 nlmsg_flags, unlh, net_admin);
368} 381}
369 382
370struct sock *inet_diag_find_one_icsk(struct net *net, 383struct sock *inet_diag_find_one_icsk(struct net *net,
@@ -434,7 +447,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
434 err = sk_diag_fill(sk, rep, req, 447 err = sk_diag_fill(sk, rep, req,
435 sk_user_ns(NETLINK_CB(in_skb).sk), 448 sk_user_ns(NETLINK_CB(in_skb).sk),
436 NETLINK_CB(in_skb).portid, 449 NETLINK_CB(in_skb).portid,
437 nlh->nlmsg_seq, 0, nlh); 450 nlh->nlmsg_seq, 0, nlh,
451 netlink_net_capable(in_skb, CAP_NET_ADMIN));
438 if (err < 0) { 452 if (err < 0) {
439 WARN_ON(err == -EMSGSIZE); 453 WARN_ON(err == -EMSGSIZE);
440 nlmsg_free(rep); 454 nlmsg_free(rep);
@@ -580,6 +594,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
580 yes = 0; 594 yes = 0;
581 break; 595 break;
582 } 596 }
597 case INET_DIAG_BC_MARK_COND: {
598 struct inet_diag_markcond *cond;
599
600 cond = (struct inet_diag_markcond *)(op + 1);
601 if ((entry->mark & cond->mask) != cond->mark)
602 yes = 0;
603 break;
604 }
583 } 605 }
584 606
585 if (yes) { 607 if (yes) {
@@ -624,6 +646,12 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
624 entry.dport = ntohs(inet->inet_dport); 646 entry.dport = ntohs(inet->inet_dport);
625 entry.ifindex = sk->sk_bound_dev_if; 647 entry.ifindex = sk->sk_bound_dev_if;
626 entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; 648 entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
649 if (sk_fullsock(sk))
650 entry.mark = sk->sk_mark;
651 else if (sk->sk_state == TCP_NEW_SYN_RECV)
652 entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
653 else
654 entry.mark = 0;
627 655
628 return inet_diag_bc_run(bc, &entry); 656 return inet_diag_bc_run(bc, &entry);
629} 657}
@@ -706,10 +734,25 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op,
706 return true; 734 return true;
707} 735}
708 736
709static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) 737static bool valid_markcond(const struct inet_diag_bc_op *op, int len,
738 int *min_len)
739{
740 *min_len += sizeof(struct inet_diag_markcond);
741 return len >= *min_len;
742}
743
744static int inet_diag_bc_audit(const struct nlattr *attr,
745 const struct sk_buff *skb)
710{ 746{
711 const void *bc = bytecode; 747 bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
712 int len = bytecode_len; 748 const void *bytecode, *bc;
749 int bytecode_len, len;
750
751 if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op))
752 return -EINVAL;
753
754 bytecode = bc = nla_data(attr);
755 len = bytecode_len = nla_len(attr);
713 756
714 while (len > 0) { 757 while (len > 0) {
715 int min_len = sizeof(struct inet_diag_bc_op); 758 int min_len = sizeof(struct inet_diag_bc_op);
@@ -732,6 +775,12 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
732 if (!valid_port_comparison(bc, len, &min_len)) 775 if (!valid_port_comparison(bc, len, &min_len))
733 return -EINVAL; 776 return -EINVAL;
734 break; 777 break;
778 case INET_DIAG_BC_MARK_COND:
779 if (!net_admin)
780 return -EPERM;
781 if (!valid_markcond(bc, len, &min_len))
782 return -EINVAL;
783 break;
735 case INET_DIAG_BC_AUTO: 784 case INET_DIAG_BC_AUTO:
736 case INET_DIAG_BC_JMP: 785 case INET_DIAG_BC_JMP:
737 case INET_DIAG_BC_NOP: 786 case INET_DIAG_BC_NOP:
@@ -760,7 +809,8 @@ static int inet_csk_diag_dump(struct sock *sk,
760 struct sk_buff *skb, 809 struct sk_buff *skb,
761 struct netlink_callback *cb, 810 struct netlink_callback *cb,
762 const struct inet_diag_req_v2 *r, 811 const struct inet_diag_req_v2 *r,
763 const struct nlattr *bc) 812 const struct nlattr *bc,
813 bool net_admin)
764{ 814{
765 if (!inet_diag_bc_sk(bc, sk)) 815 if (!inet_diag_bc_sk(bc, sk))
766 return 0; 816 return 0;
@@ -768,7 +818,8 @@ static int inet_csk_diag_dump(struct sock *sk,
768 return inet_csk_diag_fill(sk, skb, r, 818 return inet_csk_diag_fill(sk, skb, r,
769 sk_user_ns(NETLINK_CB(cb->skb).sk), 819 sk_user_ns(NETLINK_CB(cb->skb).sk),
770 NETLINK_CB(cb->skb).portid, 820 NETLINK_CB(cb->skb).portid,
771 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 821 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
822 net_admin);
772} 823}
773 824
774static void twsk_build_assert(void) 825static void twsk_build_assert(void)
@@ -804,6 +855,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
804 struct net *net = sock_net(skb->sk); 855 struct net *net = sock_net(skb->sk);
805 int i, num, s_i, s_num; 856 int i, num, s_i, s_num;
806 u32 idiag_states = r->idiag_states; 857 u32 idiag_states = r->idiag_states;
858 bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
807 859
808 if (idiag_states & TCPF_SYN_RECV) 860 if (idiag_states & TCPF_SYN_RECV)
809 idiag_states |= TCPF_NEW_SYN_RECV; 861 idiag_states |= TCPF_NEW_SYN_RECV;
@@ -844,7 +896,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
844 cb->args[3] > 0) 896 cb->args[3] > 0)
845 goto next_listen; 897 goto next_listen;
846 898
847 if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { 899 if (inet_csk_diag_dump(sk, skb, cb, r,
900 bc, net_admin) < 0) {
848 spin_unlock_bh(&ilb->lock); 901 spin_unlock_bh(&ilb->lock);
849 goto done; 902 goto done;
850 } 903 }
@@ -912,7 +965,7 @@ skip_listen_ht:
912 sk_user_ns(NETLINK_CB(cb->skb).sk), 965 sk_user_ns(NETLINK_CB(cb->skb).sk),
913 NETLINK_CB(cb->skb).portid, 966 NETLINK_CB(cb->skb).portid,
914 cb->nlh->nlmsg_seq, NLM_F_MULTI, 967 cb->nlh->nlmsg_seq, NLM_F_MULTI,
915 cb->nlh); 968 cb->nlh, net_admin);
916 if (res < 0) { 969 if (res < 0) {
917 spin_unlock_bh(lock); 970 spin_unlock_bh(lock);
918 goto done; 971 goto done;
@@ -1020,13 +1073,13 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
1020 if (nlh->nlmsg_flags & NLM_F_DUMP) { 1073 if (nlh->nlmsg_flags & NLM_F_DUMP) {
1021 if (nlmsg_attrlen(nlh, hdrlen)) { 1074 if (nlmsg_attrlen(nlh, hdrlen)) {
1022 struct nlattr *attr; 1075 struct nlattr *attr;
1076 int err;
1023 1077
1024 attr = nlmsg_find_attr(nlh, hdrlen, 1078 attr = nlmsg_find_attr(nlh, hdrlen,
1025 INET_DIAG_REQ_BYTECODE); 1079 INET_DIAG_REQ_BYTECODE);
1026 if (!attr || 1080 err = inet_diag_bc_audit(attr, skb);
1027 nla_len(attr) < sizeof(struct inet_diag_bc_op) || 1081 if (err)
1028 inet_diag_bc_audit(nla_data(attr), nla_len(attr))) 1082 return err;
1029 return -EINVAL;
1030 } 1083 }
1031 { 1084 {
1032 struct netlink_dump_control c = { 1085 struct netlink_dump_control c = {
@@ -1051,13 +1104,13 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
1051 h->nlmsg_flags & NLM_F_DUMP) { 1104 h->nlmsg_flags & NLM_F_DUMP) {
1052 if (nlmsg_attrlen(h, hdrlen)) { 1105 if (nlmsg_attrlen(h, hdrlen)) {
1053 struct nlattr *attr; 1106 struct nlattr *attr;
1107 int err;
1054 1108
1055 attr = nlmsg_find_attr(h, hdrlen, 1109 attr = nlmsg_find_attr(h, hdrlen,
1056 INET_DIAG_REQ_BYTECODE); 1110 INET_DIAG_REQ_BYTECODE);
1057 if (!attr || 1111 err = inet_diag_bc_audit(attr, skb);
1058 nla_len(attr) < sizeof(struct inet_diag_bc_op) || 1112 if (err)
1059 inet_diag_bc_audit(nla_data(attr), nla_len(attr))) 1113 return err;
1060 return -EINVAL;
1061 } 1114 }
1062 { 1115 {
1063 struct netlink_dump_control c = { 1116 struct netlink_dump_control c = {
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 113cc43df789..576f705d8180 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -246,25 +246,6 @@ static void gre_err(struct sk_buff *skb, u32 info)
246 ipgre_err(skb, info, &tpi); 246 ipgre_err(skb, info, &tpi);
247} 247}
248 248
249static __be64 key_to_tunnel_id(__be32 key)
250{
251#ifdef __BIG_ENDIAN
252 return (__force __be64)((__force u32)key);
253#else
254 return (__force __be64)((__force u64)key << 32);
255#endif
256}
257
258/* Returns the least-significant 32 bits of a __be64. */
259static __be32 tunnel_id_to_key(__be64 x)
260{
261#ifdef __BIG_ENDIAN
262 return (__force __be32)x;
263#else
264 return (__force __be32)((__force u64)x >> 32);
265#endif
266}
267
268static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, 249static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
269 struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) 250 struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
270{ 251{
@@ -290,7 +271,7 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
290 __be64 tun_id; 271 __be64 tun_id;
291 272
292 flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); 273 flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
293 tun_id = key_to_tunnel_id(tpi->key); 274 tun_id = key32_to_tunnel_id(tpi->key);
294 tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); 275 tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
295 if (!tun_dst) 276 if (!tun_dst)
296 return PACKET_REJECT; 277 return PACKET_REJECT;
@@ -446,7 +427,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
446 427
447 flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); 428 flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
448 gre_build_header(skb, tunnel_hlen, flags, proto, 429 gre_build_header(skb, tunnel_hlen, flags, proto,
449 tunnel_id_to_key(tun_info->key.tun_id), 0); 430 tunnel_id_to_key32(tun_info->key.tun_id), 0);
450 431
451 df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; 432 df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
452 433
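
[Note on the ip_gre.c hunk above] The two static helpers removed here reappear under the names key32_to_tunnel_id()/tunnel_id_to_key32(); this diff does not show where they now live (a shared tunnel header such as <net/ip_tunnels.h> is a reasonable guess), but the conversion is unchanged: numerically, the 32-bit GRE key occupies the low-order 32 bits of the 64-bit tunnel ID, both in network byte order. Equivalent sketch, mirroring the deleted code:

#include <linux/types.h>

static inline __be64 key32_to_tunnel_id(__be32 key)
{
#ifdef __BIG_ENDIAN
	return (__force __be64)((__force u32)key);
#else
	return (__force __be64)((__force u64)key << 32);
#endif
}

/* Returns the least-significant 32 bits of a __be64 tunnel ID. */
static inline __be32 tunnel_id_to_key32(__be64 tun_id)
{
#ifdef __BIG_ENDIAN
	return (__force __be32)tun_id;
#else
	return (__force __be32)((__force u64)tun_id >> 32);
#endif
}
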
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index dde37fb340bf..05d105832bdb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -73,6 +73,7 @@
73#include <net/icmp.h> 73#include <net/icmp.h>
74#include <net/checksum.h> 74#include <net/checksum.h>
75#include <net/inetpeer.h> 75#include <net/inetpeer.h>
76#include <net/lwtunnel.h>
76#include <linux/igmp.h> 77#include <linux/igmp.h>
77#include <linux/netfilter_ipv4.h> 78#include <linux/netfilter_ipv4.h>
78#include <linux/netfilter_bridge.h> 79#include <linux/netfilter_bridge.h>
@@ -98,6 +99,14 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
98 99
99 iph->tot_len = htons(skb->len); 100 iph->tot_len = htons(skb->len);
100 ip_send_check(iph); 101 ip_send_check(iph);
102
103 /* if egress device is enslaved to an L3 master device pass the
104 * skb to its handler for processing
105 */
106 skb = l3mdev_ip_out(sk, skb);
107 if (unlikely(!skb))
108 return 0;
109
101 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, 110 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
102 net, sk, skb, NULL, skb_dst(skb)->dev, 111 net, sk, skb, NULL, skb_dst(skb)->dev,
103 dst_output); 112 dst_output);
@@ -197,6 +206,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
197 skb = skb2; 206 skb = skb2;
198 } 207 }
199 208
209 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
210 int res = lwtunnel_xmit(skb);
211
212 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
213 return res;
214 }
215
200 rcu_read_lock_bh(); 216 rcu_read_lock_bh();
201 nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); 217 nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
202 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); 218 neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
@@ -482,7 +498,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
482 to->tc_index = from->tc_index; 498 to->tc_index = from->tc_index;
483#endif 499#endif
484 nf_copy(to, from); 500 nf_copy(to, from);
485#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) 501#if IS_ENABLED(CONFIG_IP_VS)
486 to->ipvs_property = from->ipvs_property; 502 to->ipvs_property = from->ipvs_property;
487#endif 503#endif
488 skb_copy_secmark(to, from); 504 skb_copy_secmark(to, from);
@@ -1566,8 +1582,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1566 } 1582 }
1567 1583
1568 oif = arg->bound_dev_if; 1584 oif = arg->bound_dev_if;
1569 if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) 1585 oif = oif ? : skb->skb_iif;
1570 oif = skb->skb_iif;
1571 1586
1572 flowi4_init_output(&fl4, oif, 1587 flowi4_init_output(&fl4, oif,
1573 IP4_REPLY_MARK(net, skb->mark), 1588 IP4_REPLY_MARK(net, skb->mark),
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 71a52f4d4cff..af4919792b6a 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -284,9 +284,12 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
284 ipc->ttl = val; 284 ipc->ttl = val;
285 break; 285 break;
286 case IP_TOS: 286 case IP_TOS:
287 if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) 287 if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)))
288 val = *(int *)CMSG_DATA(cmsg);
289 else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8)))
290 val = *(u8 *)CMSG_DATA(cmsg);
291 else
288 return -EINVAL; 292 return -EINVAL;
289 val = *(int *)CMSG_DATA(cmsg);
290 if (val < 0 || val > 255) 293 if (val < 0 || val > 255)
291 return -EINVAL; 294 return -EINVAL;
292 ipc->tos = val; 295 ipc->tos = val;
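
[Note on the ip_sockglue.c hunk above] ip_cmsg_send() now accepts IP_TOS ancillary data with either an int-sized or a single-byte payload; previously only the int form was accepted. A userspace sketch of the newly accepted one-byte form for a UDP socket; the helper name and addressing are illustrative:

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>

/* Send one datagram with a per-packet TOS supplied as a 1-byte IP_TOS cmsg. */
static ssize_t send_with_tos(int fd, const void *payload, size_t len,
			     const struct sockaddr_in *dst, unsigned char tos)
{
	char cbuf[CMSG_SPACE(sizeof(tos))];
	struct iovec iov = { .iov_base = (void *)payload, .iov_len = len };
	struct msghdr msg = {
		.msg_name = (void *)dst,
		.msg_namelen = sizeof(*dst),
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = IPPROTO_IP;
	cmsg->cmsg_type = IP_TOS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(tos));	/* 1-byte form accepted by this hunk */
	memcpy(CMSG_DATA(cmsg), &tos, sizeof(tos));

	return sendmsg(fd, &msg, 0);
}

The value range check is unchanged: anything outside 0..255 (only reachable with the int form) is still rejected with -EINVAL.
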
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 95649ebd2874..5719d6ba0824 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -55,6 +55,7 @@
55#include <net/netns/generic.h> 55#include <net/netns/generic.h>
56#include <net/rtnetlink.h> 56#include <net/rtnetlink.h>
57#include <net/udp.h> 57#include <net/udp.h>
58#include <net/dst_metadata.h>
58 59
59#if IS_ENABLED(CONFIG_IPV6) 60#if IS_ENABLED(CONFIG_IPV6)
60#include <net/ipv6.h> 61#include <net/ipv6.h>
@@ -546,6 +547,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
546 return 0; 547 return 0;
547} 548}
548 549
550void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
551{
552 struct ip_tunnel *tunnel = netdev_priv(dev);
553 u32 headroom = sizeof(struct iphdr);
554 struct ip_tunnel_info *tun_info;
555 const struct ip_tunnel_key *key;
556 const struct iphdr *inner_iph;
557 struct rtable *rt;
558 struct flowi4 fl4;
559 __be16 df = 0;
560 u8 tos, ttl;
561
562 tun_info = skb_tunnel_info(skb);
563 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
564 ip_tunnel_info_af(tun_info) != AF_INET))
565 goto tx_error;
566 key = &tun_info->key;
567 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
568 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
569 tos = key->tos;
570 if (tos == 1) {
571 if (skb->protocol == htons(ETH_P_IP))
572 tos = inner_iph->tos;
573 else if (skb->protocol == htons(ETH_P_IPV6))
574 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
575 }
576 init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
577 RT_TOS(tos), tunnel->parms.link);
578 if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
579 goto tx_error;
580 rt = ip_route_output_key(tunnel->net, &fl4);
581 if (IS_ERR(rt)) {
582 dev->stats.tx_carrier_errors++;
583 goto tx_error;
584 }
585 if (rt->dst.dev == dev) {
586 ip_rt_put(rt);
587 dev->stats.collisions++;
588 goto tx_error;
589 }
590 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
591 ttl = key->ttl;
592 if (ttl == 0) {
593 if (skb->protocol == htons(ETH_P_IP))
594 ttl = inner_iph->ttl;
595 else if (skb->protocol == htons(ETH_P_IPV6))
596 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
597 else
598 ttl = ip4_dst_hoplimit(&rt->dst);
599 }
600 if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
601 df = htons(IP_DF);
602 else if (skb->protocol == htons(ETH_P_IP))
603 df = inner_iph->frag_off & htons(IP_DF);
604 headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
605 if (headroom > dev->needed_headroom)
606 dev->needed_headroom = headroom;
607
608 if (skb_cow_head(skb, dev->needed_headroom)) {
609 ip_rt_put(rt);
610 goto tx_dropped;
611 }
612 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
613 key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
614 return;
615tx_error:
616 dev->stats.tx_errors++;
617 goto kfree;
618tx_dropped:
619 dev->stats.tx_dropped++;
620kfree:
621 kfree_skb(skb);
622}
623EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
624
549void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, 625void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
550 const struct iphdr *tnl_params, u8 protocol) 626 const struct iphdr *tnl_params, u8 protocol)
551{ 627{
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 0f227db0e9ac..777bc1883870 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -69,7 +69,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
69 69
70 skb_scrub_packet(skb, xnet); 70 skb_scrub_packet(skb, xnet);
71 71
72 skb_clear_hash(skb); 72 skb_clear_hash_if_not_l4(skb);
73 skb_dst_set(skb, &rt->dst); 73 skb_dst_set(skb, &rt->dst);
74 memset(IPCB(skb), 0, sizeof(*IPCB(skb))); 74 memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
75 75
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 1d71c40eaaf3..071a785c65eb 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -85,7 +85,6 @@
85/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ 85/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
86#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ 86#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
87#define CONF_SEND_RETRIES 6 /* Send six requests per open */ 87#define CONF_SEND_RETRIES 6 /* Send six requests per open */
88#define CONF_INTER_TIMEOUT (HZ) /* Inter-device timeout: 1 second */
89#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */ 88#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
90#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */ 89#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
91#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */ 90#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
@@ -188,7 +187,7 @@ struct ic_device {
188}; 187};
189 188
190static struct ic_device *ic_first_dev __initdata; /* List of open device */ 189static struct ic_device *ic_first_dev __initdata; /* List of open device */
191static struct net_device *ic_dev __initdata; /* Selected device */ 190static struct ic_device *ic_dev __initdata; /* Selected device */
192 191
193static bool __init ic_is_init_dev(struct net_device *dev) 192static bool __init ic_is_init_dev(struct net_device *dev)
194{ 193{
@@ -307,7 +306,7 @@ static void __init ic_close_devs(void)
307 while ((d = next)) { 306 while ((d = next)) {
308 next = d->next; 307 next = d->next;
309 dev = d->dev; 308 dev = d->dev;
310 if (dev != ic_dev && !netdev_uses_dsa(dev)) { 309 if ((!ic_dev || dev != ic_dev->dev) && !netdev_uses_dsa(dev)) {
311 pr_debug("IP-Config: Downing %s\n", dev->name); 310 pr_debug("IP-Config: Downing %s\n", dev->name);
312 dev_change_flags(dev, d->flags); 311 dev_change_flags(dev, d->flags);
313 } 312 }
@@ -372,7 +371,7 @@ static int __init ic_setup_if(void)
372 int err; 371 int err;
373 372
374 memset(&ir, 0, sizeof(ir)); 373 memset(&ir, 0, sizeof(ir));
375 strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); 374 strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->dev->name);
376 set_sockaddr(sin, ic_myaddr, 0); 375 set_sockaddr(sin, ic_myaddr, 0);
377 if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { 376 if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) {
378 pr_err("IP-Config: Unable to set interface address (%d)\n", 377 pr_err("IP-Config: Unable to set interface address (%d)\n",
@@ -396,7 +395,7 @@ static int __init ic_setup_if(void)
396 * out, we'll try to muddle along. 395 * out, we'll try to muddle along.
397 */ 396 */
398 if (ic_dev_mtu != 0) { 397 if (ic_dev_mtu != 0) {
399 strcpy(ir.ifr_name, ic_dev->name); 398 strcpy(ir.ifr_name, ic_dev->dev->name);
400 ir.ifr_mtu = ic_dev_mtu; 399 ir.ifr_mtu = ic_dev_mtu;
401 if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) 400 if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0)
402 pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n", 401 pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n",
@@ -568,7 +567,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
568 goto drop_unlock; 567 goto drop_unlock;
569 568
570 /* We have a winner! */ 569 /* We have a winner! */
571 ic_dev = dev; 570 ic_dev = d;
572 if (ic_myaddr == NONE) 571 if (ic_myaddr == NONE)
573 ic_myaddr = tip; 572 ic_myaddr = tip;
574 ic_servaddr = sip; 573 ic_servaddr = sip;
@@ -655,8 +654,6 @@ static struct packet_type bootp_packet_type __initdata = {
655 .func = ic_bootp_recv, 654 .func = ic_bootp_recv,
656}; 655};
657 656
658static __be32 ic_dev_xid; /* Device under configuration */
659
660/* 657/*
661 * Initialize DHCP/BOOTP extension fields in the request. 658 * Initialize DHCP/BOOTP extension fields in the request.
662 */ 659 */
@@ -666,14 +663,14 @@ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
666#ifdef IPCONFIG_DHCP 663#ifdef IPCONFIG_DHCP
667 664
668static void __init 665static void __init
669ic_dhcp_init_options(u8 *options) 666ic_dhcp_init_options(u8 *options, struct ic_device *d)
670{ 667{
671 u8 mt = ((ic_servaddr == NONE) 668 u8 mt = ((ic_servaddr == NONE)
672 ? DHCPDISCOVER : DHCPREQUEST); 669 ? DHCPDISCOVER : DHCPREQUEST);
673 u8 *e = options; 670 u8 *e = options;
674 int len; 671 int len;
675 672
676 pr_debug("DHCP: Sending message type %d\n", mt); 673 pr_debug("DHCP: Sending message type %d (%s)\n", mt, d->dev->name);
677 674
678 memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */ 675 memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
679 e += 4; 676 e += 4;
@@ -857,7 +854,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
857 /* add DHCP options or BOOTP extensions */ 854 /* add DHCP options or BOOTP extensions */
858#ifdef IPCONFIG_DHCP 855#ifdef IPCONFIG_DHCP
859 if (ic_proto_enabled & IC_USE_DHCP) 856 if (ic_proto_enabled & IC_USE_DHCP)
860 ic_dhcp_init_options(b->exten); 857 ic_dhcp_init_options(b->exten, d);
861 else 858 else
862#endif 859#endif
863 ic_bootp_init_ext(b->exten); 860 ic_bootp_init_ext(b->exten);
@@ -1033,14 +1030,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
1033 /* Is it a reply to our BOOTP request? */ 1030 /* Is it a reply to our BOOTP request? */
1034 if (b->op != BOOTP_REPLY || 1031 if (b->op != BOOTP_REPLY ||
1035 b->xid != d->xid) { 1032 b->xid != d->xid) {
1036 net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n", 1033 net_err_ratelimited("DHCP/BOOTP: Reply not for us on %s, op[%x] xid[%x]\n",
1037 b->op, b->xid); 1034 d->dev->name, b->op, b->xid);
1038 goto drop_unlock;
1039 }
1040
1041 /* Is it a reply for the device we are configuring? */
1042 if (b->xid != ic_dev_xid) {
1043 net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n");
1044 goto drop_unlock; 1035 goto drop_unlock;
1045 } 1036 }
1046 1037
@@ -1075,7 +1066,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
1075 } 1066 }
1076 } 1067 }
1077 1068
1078 pr_debug("DHCP: Got message type %d\n", mt); 1069 pr_debug("DHCP: Got message type %d (%s)\n", mt, d->dev->name);
1079 1070
1080 switch (mt) { 1071 switch (mt) {
1081 case DHCPOFFER: 1072 case DHCPOFFER:
@@ -1130,7 +1121,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
1130 } 1121 }
1131 1122
1132 /* We have a winner! */ 1123 /* We have a winner! */
1133 ic_dev = dev; 1124 ic_dev = d;
1134 ic_myaddr = b->your_ip; 1125 ic_myaddr = b->your_ip;
1135 ic_servaddr = b->server_ip; 1126 ic_servaddr = b->server_ip;
1136 ic_addrservaddr = b->iph.saddr; 1127 ic_addrservaddr = b->iph.saddr;
@@ -1225,9 +1216,6 @@ static int __init ic_dynamic(void)
1225 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM); 1216 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
1226 for (;;) { 1217 for (;;) {
1227#ifdef IPCONFIG_BOOTP 1218#ifdef IPCONFIG_BOOTP
1228 /* Track the device we are configuring */
1229 ic_dev_xid = d->xid;
1230
1231 if (do_bootp && (d->able & IC_BOOTP)) 1219 if (do_bootp && (d->able & IC_BOOTP))
1232 ic_bootp_send_if(d, jiffies - start_jiffies); 1220 ic_bootp_send_if(d, jiffies - start_jiffies);
1233#endif 1221#endif
@@ -1236,15 +1224,19 @@ static int __init ic_dynamic(void)
1236 ic_rarp_send_if(d); 1224 ic_rarp_send_if(d);
1237#endif 1225#endif
1238 1226
1239 jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout); 1227 if (!d->next) {
1240 while (time_before(jiffies, jiff) && !ic_got_reply) 1228 jiff = jiffies + timeout;
1241 schedule_timeout_uninterruptible(1); 1229 while (time_before(jiffies, jiff) && !ic_got_reply)
1230 schedule_timeout_uninterruptible(1);
1231 }
1242#ifdef IPCONFIG_DHCP 1232#ifdef IPCONFIG_DHCP
1243 /* DHCP isn't done until we get a DHCPACK. */ 1233 /* DHCP isn't done until we get a DHCPACK. */
1244 if ((ic_got_reply & IC_BOOTP) && 1234 if ((ic_got_reply & IC_BOOTP) &&
1245 (ic_proto_enabled & IC_USE_DHCP) && 1235 (ic_proto_enabled & IC_USE_DHCP) &&
1246 ic_dhcp_msgtype != DHCPACK) { 1236 ic_dhcp_msgtype != DHCPACK) {
1247 ic_got_reply = 0; 1237 ic_got_reply = 0;
1238 /* continue on device that got the reply */
1239 d = ic_dev;
1248 pr_cont(","); 1240 pr_cont(",");
1249 continue; 1241 continue;
1250 } 1242 }
@@ -1487,7 +1479,7 @@ static int __init ip_auto_config(void)
1487#endif /* IPCONFIG_DYNAMIC */ 1479#endif /* IPCONFIG_DYNAMIC */
1488 } else { 1480 } else {
1489 /* Device selected manually or only one device -> use it */ 1481 /* Device selected manually or only one device -> use it */
1490 ic_dev = ic_first_dev->dev; 1482 ic_dev = ic_first_dev;
1491 } 1483 }
1492 1484
1493 addr = root_nfs_parse_addr(root_server_path); 1485 addr = root_nfs_parse_addr(root_server_path);
@@ -1501,14 +1493,6 @@ static int __init ip_auto_config(void)
1501 return -1; 1493 return -1;
1502 1494
1503 /* 1495 /*
1504 * Close all network devices except the device we've
1505 * autoconfigured and set up routes.
1506 */
1507 ic_close_devs();
1508 if (ic_setup_if() < 0 || ic_setup_routes() < 0)
1509 return -1;
1510
1511 /*
1512 * Record which protocol was actually used. 1496 * Record which protocol was actually used.
1513 */ 1497 */
1514#ifdef IPCONFIG_DYNAMIC 1498#ifdef IPCONFIG_DYNAMIC
@@ -1522,7 +1506,7 @@ static int __init ip_auto_config(void)
1522 pr_info("IP-Config: Complete:\n"); 1506 pr_info("IP-Config: Complete:\n");
1523 1507
1524 pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n", 1508 pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n",
1525 ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr, 1509 ic_dev->dev->name, ic_dev->dev->addr_len, ic_dev->dev->dev_addr,
1526 &ic_myaddr, &ic_netmask, &ic_gateway); 1510 &ic_myaddr, &ic_netmask, &ic_gateway);
1527 pr_info(" host=%s, domain=%s, nis-domain=%s\n", 1511 pr_info(" host=%s, domain=%s, nis-domain=%s\n",
1528 utsname()->nodename, ic_domain, utsname()->domainname); 1512 utsname()->nodename, ic_domain, utsname()->domainname);
@@ -1542,7 +1526,18 @@ static int __init ip_auto_config(void)
1542 pr_cont("\n"); 1526 pr_cont("\n");
1543#endif /* !SILENT */ 1527#endif /* !SILENT */
1544 1528
1545 return 0; 1529 /*
1530 * Close all network devices except the device we've
1531 * autoconfigured and set up routes.
1532 */
1533 if (ic_setup_if() < 0 || ic_setup_routes() < 0)
1534 err = -1;
1535 else
1536 err = 0;
1537
1538 ic_close_devs();
1539
1540 return err;
1546} 1541}
1547 1542
1548late_initcall(ip_auto_config); 1543late_initcall(ip_auto_config);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4ae3f8e6c6cc..c9392589c415 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -115,6 +115,7 @@
115#include <net/xfrm.h> 115#include <net/xfrm.h>
116#include <net/net_namespace.h> 116#include <net/net_namespace.h>
117#include <net/netns/generic.h> 117#include <net/netns/generic.h>
118#include <net/dst_metadata.h>
118 119
119static bool log_ecn_error = true; 120static bool log_ecn_error = true;
120module_param(log_ecn_error, bool, 0644); 121module_param(log_ecn_error, bool, 0644);
@@ -193,6 +194,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
193{ 194{
194 struct net *net = dev_net(skb->dev); 195 struct net *net = dev_net(skb->dev);
195 struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); 196 struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
197 struct metadata_dst *tun_dst = NULL;
196 struct ip_tunnel *tunnel; 198 struct ip_tunnel *tunnel;
197 const struct iphdr *iph; 199 const struct iphdr *iph;
198 200
@@ -216,7 +218,12 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
216 tpi = &ipip_tpi; 218 tpi = &ipip_tpi;
217 if (iptunnel_pull_header(skb, 0, tpi->proto, false)) 219 if (iptunnel_pull_header(skb, 0, tpi->proto, false))
218 goto drop; 220 goto drop;
219 return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); 221 if (tunnel->collect_md) {
222 tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
223 if (!tun_dst)
224 return 0;
225 }
226 return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
220 } 227 }
221 228
222 return -1; 229 return -1;
@@ -270,7 +277,10 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
270 277
271 skb_set_inner_ipproto(skb, ipproto); 278 skb_set_inner_ipproto(skb, ipproto);
272 279
273 ip_tunnel_xmit(skb, dev, tiph, ipproto); 280 if (tunnel->collect_md)
281 ip_md_tunnel_xmit(skb, dev, ipproto);
282 else
283 ip_tunnel_xmit(skb, dev, tiph, ipproto);
274 return NETDEV_TX_OK; 284 return NETDEV_TX_OK;
275 285
276tx_error: 286tx_error:
@@ -380,13 +390,14 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
380} 390}
381 391
382static void ipip_netlink_parms(struct nlattr *data[], 392static void ipip_netlink_parms(struct nlattr *data[],
383 struct ip_tunnel_parm *parms) 393 struct ip_tunnel_parm *parms, bool *collect_md)
384{ 394{
385 memset(parms, 0, sizeof(*parms)); 395 memset(parms, 0, sizeof(*parms));
386 396
387 parms->iph.version = 4; 397 parms->iph.version = 4;
388 parms->iph.protocol = IPPROTO_IPIP; 398 parms->iph.protocol = IPPROTO_IPIP;
389 parms->iph.ihl = 5; 399 parms->iph.ihl = 5;
400 *collect_md = false;
390 401
391 if (!data) 402 if (!data)
392 return; 403 return;
@@ -414,6 +425,9 @@ static void ipip_netlink_parms(struct nlattr *data[],
414 425
415 if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) 426 if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
416 parms->iph.frag_off = htons(IP_DF); 427 parms->iph.frag_off = htons(IP_DF);
428
429 if (data[IFLA_IPTUN_COLLECT_METADATA])
430 *collect_md = true;
417} 431}
418 432
419/* This function returns true when ENCAP attributes are present in the nl msg */ 433/* This function returns true when ENCAP attributes are present in the nl msg */
@@ -453,18 +467,18 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[],
453static int ipip_newlink(struct net *src_net, struct net_device *dev, 467static int ipip_newlink(struct net *src_net, struct net_device *dev,
454 struct nlattr *tb[], struct nlattr *data[]) 468 struct nlattr *tb[], struct nlattr *data[])
455{ 469{
470 struct ip_tunnel *t = netdev_priv(dev);
456 struct ip_tunnel_parm p; 471 struct ip_tunnel_parm p;
457 struct ip_tunnel_encap ipencap; 472 struct ip_tunnel_encap ipencap;
458 473
459 if (ipip_netlink_encap_parms(data, &ipencap)) { 474 if (ipip_netlink_encap_parms(data, &ipencap)) {
460 struct ip_tunnel *t = netdev_priv(dev);
461 int err = ip_tunnel_encap_setup(t, &ipencap); 475 int err = ip_tunnel_encap_setup(t, &ipencap);
462 476
463 if (err < 0) 477 if (err < 0)
464 return err; 478 return err;
465 } 479 }
466 480
467 ipip_netlink_parms(data, &p); 481 ipip_netlink_parms(data, &p, &t->collect_md);
468 return ip_tunnel_newlink(dev, tb, &p); 482 return ip_tunnel_newlink(dev, tb, &p);
469} 483}
470 484
@@ -473,6 +487,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
473{ 487{
474 struct ip_tunnel_parm p; 488 struct ip_tunnel_parm p;
475 struct ip_tunnel_encap ipencap; 489 struct ip_tunnel_encap ipencap;
490 bool collect_md;
476 491
477 if (ipip_netlink_encap_parms(data, &ipencap)) { 492 if (ipip_netlink_encap_parms(data, &ipencap)) {
478 struct ip_tunnel *t = netdev_priv(dev); 493 struct ip_tunnel *t = netdev_priv(dev);
@@ -482,7 +497,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
482 return err; 497 return err;
483 } 498 }
484 499
485 ipip_netlink_parms(data, &p); 500 ipip_netlink_parms(data, &p, &collect_md);
501 if (collect_md)
502 return -EINVAL;
486 503
487 if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || 504 if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
488 (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) 505 (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
@@ -516,6 +533,8 @@ static size_t ipip_get_size(const struct net_device *dev)
516 nla_total_size(2) + 533 nla_total_size(2) +
517 /* IFLA_IPTUN_ENCAP_DPORT */ 534 /* IFLA_IPTUN_ENCAP_DPORT */
518 nla_total_size(2) + 535 nla_total_size(2) +
536 /* IFLA_IPTUN_COLLECT_METADATA */
537 nla_total_size(0) +
519 0; 538 0;
520} 539}
521 540
@@ -544,6 +563,9 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
544 tunnel->encap.flags)) 563 tunnel->encap.flags))
545 goto nla_put_failure; 564 goto nla_put_failure;
546 565
566 if (tunnel->collect_md)
567 if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
568 goto nla_put_failure;
547 return 0; 569 return 0;
548 570
549nla_put_failure: 571nla_put_failure:
@@ -562,6 +584,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
562 [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, 584 [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
563 [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, 585 [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
564 [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, 586 [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
587 [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
565}; 588};
566 589
567static struct rtnl_link_ops ipip_link_ops __read_mostly = { 590static struct rtnl_link_ops ipip_link_ops __read_mostly = {
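
[Note on the ipip.c hunks above] They add collect_md ("metadata mode") support: a device created with the new IFLA_IPTUN_COLLECT_METADATA flag has no fixed endpoints; on receive it attaches an ip_tun_rx_dst() metadata dst to the skb, on transmit it takes its parameters from per-packet tunnel metadata via the new ip_md_tunnel_xmit(), and ipip_changelink() refuses to toggle the flag on an existing device. As a hedged usage sketch, the per-packet metadata can be supplied for instance from a tc eBPF program attached to the egress of such a device; the section name, attach point and libbpf header are assumptions about the surrounding tooling, not part of this patch:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int ipip_set_dst(struct __sk_buff *skb)
{
	struct bpf_tunnel_key key = {};

	key.remote_ipv4 = 0xc0000201;	/* 192.0.2.1, host byte order */
	key.tunnel_ttl  = 64;

	/* Stored as skb tunnel metadata; consumed by ip_md_tunnel_xmit(). */
	if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0))
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";

iproute2 builds that know about the flag typically expose it as the "external" keyword when creating the ipip link; that mapping is an assumption about userspace tooling rather than something this patch defines.
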
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index c187c60e3e0c..d613309e3e5d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -25,17 +25,6 @@ config NF_CONNTRACK_IPV4
25 25
26 To compile it as a module, choose M here. If unsure, say N. 26 To compile it as a module, choose M here. If unsure, say N.
27 27
28config NF_CONNTRACK_PROC_COMPAT
29 bool "proc/sysctl compatibility with old connection tracking"
30 depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4
31 default y
32 help
33 This option enables /proc and sysctl compatibility with the old
34 layer 3 dependent connection tracking. This is needed to keep
35 old programs that have not been adapted to the new names working.
36
37 If unsure, say Y.
38
39if NF_TABLES 28if NF_TABLES
40 29
41config NF_TABLES_IPV4 30config NF_TABLES_IPV4
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 87b073da14c9..853328f8fd05 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -4,11 +4,6 @@
4 4
5# objects for l3 independent conntrack 5# objects for l3 independent conntrack
6nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o 6nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
8ifeq ($(CONFIG_PROC_FS),y)
9nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
10endif
11endif
12 7
13# connection tracking 8# connection tracking
14obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 9obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index f993545a3373..7c00ce90adb8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -156,7 +156,7 @@ static struct nf_loginfo trace_loginfo = {
156 .u = { 156 .u = {
157 .log = { 157 .log = {
158 .level = 4, 158 .level = 4,
159 .logflags = NF_LOG_MASK, 159 .logflags = NF_LOG_DEFAULT_MASK,
160 }, 160 },
161 }, 161 },
162}; 162};
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index ae1a71a97132..713c09a74b90 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -110,7 +110,7 @@ static unsigned int ipv4_helper(void *priv,
110 if (!help) 110 if (!help)
111 return NF_ACCEPT; 111 return NF_ACCEPT;
112 112
113 /* rcu_read_lock()ed by nf_hook_slow */ 113 /* rcu_read_lock()ed by nf_hook_thresh */
114 helper = rcu_dereference(help->helper); 114 helper = rcu_dereference(help->helper);
115 if (!helper) 115 if (!helper)
116 return NF_ACCEPT; 116 return NF_ACCEPT;
@@ -202,47 +202,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
202 }, 202 },
203}; 203};
204 204
205#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
206static int log_invalid_proto_min = 0;
207static int log_invalid_proto_max = 255;
208
209static struct ctl_table ip_ct_sysctl_table[] = {
210 {
211 .procname = "ip_conntrack_max",
212 .maxlen = sizeof(int),
213 .mode = 0644,
214 .proc_handler = proc_dointvec,
215 },
216 {
217 .procname = "ip_conntrack_count",
218 .maxlen = sizeof(int),
219 .mode = 0444,
220 .proc_handler = proc_dointvec,
221 },
222 {
223 .procname = "ip_conntrack_buckets",
224 .maxlen = sizeof(unsigned int),
225 .mode = 0444,
226 .proc_handler = proc_dointvec,
227 },
228 {
229 .procname = "ip_conntrack_checksum",
230 .maxlen = sizeof(int),
231 .mode = 0644,
232 .proc_handler = proc_dointvec,
233 },
234 {
235 .procname = "ip_conntrack_log_invalid",
236 .maxlen = sizeof(unsigned int),
237 .mode = 0644,
238 .proc_handler = proc_dointvec_minmax,
239 .extra1 = &log_invalid_proto_min,
240 .extra2 = &log_invalid_proto_max,
241 },
242 { }
243};
244#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
245
246/* Fast function for those who don't want to parse /proc (and I don't 205/* Fast function for those who don't want to parse /proc (and I don't
247 blame them). */ 206 blame them). */
248/* Reversing the socket's dst/src point of view gives us the reply 207/* Reversing the socket's dst/src point of view gives us the reply
@@ -350,20 +309,6 @@ static struct nf_sockopt_ops so_getorigdst = {
350 309
351static int ipv4_init_net(struct net *net) 310static int ipv4_init_net(struct net *net)
352{ 311{
353#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
354 struct nf_ip_net *in = &net->ct.nf_ct_proto;
355 in->ctl_table = kmemdup(ip_ct_sysctl_table,
356 sizeof(ip_ct_sysctl_table),
357 GFP_KERNEL);
358 if (!in->ctl_table)
359 return -ENOMEM;
360
361 in->ctl_table[0].data = &nf_conntrack_max;
362 in->ctl_table[1].data = &net->ct.count;
363 in->ctl_table[2].data = &nf_conntrack_htable_size;
364 in->ctl_table[3].data = &net->ct.sysctl_checksum;
365 in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
366#endif
367 return 0; 312 return 0;
368} 313}
369 314
@@ -380,9 +325,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
380 .nlattr_to_tuple = ipv4_nlattr_to_tuple, 325 .nlattr_to_tuple = ipv4_nlattr_to_tuple,
381 .nla_policy = ipv4_nla_policy, 326 .nla_policy = ipv4_nla_policy,
382#endif 327#endif
383#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
384 .ctl_table_path = "net/ipv4/netfilter",
385#endif
386 .init_net = ipv4_init_net, 328 .init_net = ipv4_init_net,
387 .me = THIS_MODULE, 329 .me = THIS_MODULE,
388}; 330};
@@ -492,16 +434,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
492 goto cleanup_icmpv4; 434 goto cleanup_icmpv4;
493 } 435 }
494 436
495#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
496 ret = nf_conntrack_ipv4_compat_init();
497 if (ret < 0)
498 goto cleanup_proto;
499#endif
500 return ret; 437 return ret;
501#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
502 cleanup_proto:
503 nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
504#endif
505 cleanup_icmpv4: 438 cleanup_icmpv4:
506 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); 439 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
507 cleanup_udp4: 440 cleanup_udp4:
@@ -520,9 +453,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
520static void __exit nf_conntrack_l3proto_ipv4_fini(void) 453static void __exit nf_conntrack_l3proto_ipv4_fini(void)
521{ 454{
522 synchronize_net(); 455 synchronize_net();
523#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
524 nf_conntrack_ipv4_compat_fini();
525#endif
526 nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); 456 nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
527 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); 457 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
528 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); 458 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
deleted file mode 100644
index 63923710f325..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ /dev/null
@@ -1,492 +0,0 @@
1/* ip_conntrack proc compat - based on ip_conntrack_standalone.c
2 *
3 * (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
5 * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/types.h>
12#include <linux/proc_fs.h>
13#include <linux/seq_file.h>
14#include <linux/percpu.h>
15#include <linux/security.h>
16#include <net/net_namespace.h>
17
18#include <linux/netfilter.h>
19#include <net/netfilter/nf_conntrack_core.h>
20#include <net/netfilter/nf_conntrack_l3proto.h>
21#include <net/netfilter/nf_conntrack_l4proto.h>
22#include <net/netfilter/nf_conntrack_expect.h>
23#include <net/netfilter/nf_conntrack_acct.h>
24#include <linux/rculist_nulls.h>
25#include <linux/export.h>
26
27struct ct_iter_state {
28 struct seq_net_private p;
29 struct hlist_nulls_head *hash;
30 unsigned int htable_size;
31 unsigned int bucket;
32};
33
34static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
35{
36 struct ct_iter_state *st = seq->private;
37 struct hlist_nulls_node *n;
38
39 for (st->bucket = 0;
40 st->bucket < st->htable_size;
41 st->bucket++) {
42 n = rcu_dereference(
43 hlist_nulls_first_rcu(&st->hash[st->bucket]));
44 if (!is_a_nulls(n))
45 return n;
46 }
47 return NULL;
48}
49
50static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
51 struct hlist_nulls_node *head)
52{
53 struct ct_iter_state *st = seq->private;
54
55 head = rcu_dereference(hlist_nulls_next_rcu(head));
56 while (is_a_nulls(head)) {
57 if (likely(get_nulls_value(head) == st->bucket)) {
58 if (++st->bucket >= st->htable_size)
59 return NULL;
60 }
61 head = rcu_dereference(
62 hlist_nulls_first_rcu(&st->hash[st->bucket]));
63 }
64 return head;
65}
66
67static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
68{
69 struct hlist_nulls_node *head = ct_get_first(seq);
70
71 if (head)
72 while (pos && (head = ct_get_next(seq, head)))
73 pos--;
74 return pos ? NULL : head;
75}
76
77static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
78 __acquires(RCU)
79{
80 struct ct_iter_state *st = seq->private;
81
82 rcu_read_lock();
83
84 nf_conntrack_get_ht(&st->hash, &st->htable_size);
85 return ct_get_idx(seq, *pos);
86}
87
88static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
89{
90 (*pos)++;
91 return ct_get_next(s, v);
92}
93
94static void ct_seq_stop(struct seq_file *s, void *v)
95 __releases(RCU)
96{
97 rcu_read_unlock();
98}
99
100#ifdef CONFIG_NF_CONNTRACK_SECMARK
101static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
102{
103 int ret;
104 u32 len;
105 char *secctx;
106
107 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
108 if (ret)
109 return;
110
111 seq_printf(s, "secctx=%s ", secctx);
112
113 security_release_secctx(secctx, len);
114}
115#else
116static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
117{
118}
119#endif
120
121static bool ct_seq_should_skip(const struct nf_conn *ct,
122 const struct net *net,
123 const struct nf_conntrack_tuple_hash *hash)
124{
125 /* we only want to print DIR_ORIGINAL */
126 if (NF_CT_DIRECTION(hash))
127 return true;
128
129 if (nf_ct_l3num(ct) != AF_INET)
130 return true;
131
132 if (!net_eq(nf_ct_net(ct), net))
133 return true;
134
135 return false;
136}
137
138static int ct_seq_show(struct seq_file *s, void *v)
139{
140 struct nf_conntrack_tuple_hash *hash = v;
141 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
142 const struct nf_conntrack_l3proto *l3proto;
143 const struct nf_conntrack_l4proto *l4proto;
144 int ret = 0;
145
146 NF_CT_ASSERT(ct);
147 if (ct_seq_should_skip(ct, seq_file_net(s), hash))
148 return 0;
149
150 if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
151 return 0;
152
153 /* check if we raced w. object reuse */
154 if (!nf_ct_is_confirmed(ct) ||
155 ct_seq_should_skip(ct, seq_file_net(s), hash))
156 goto release;
157
158 l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
159 NF_CT_ASSERT(l3proto);
160 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
161 NF_CT_ASSERT(l4proto);
162
163 ret = -ENOSPC;
164 seq_printf(s, "%-8s %u %ld ",
165 l4proto->name, nf_ct_protonum(ct),
166 timer_pending(&ct->timeout)
167 ? (long)(ct->timeout.expires - jiffies)/HZ : 0);
168
169 if (l4proto->print_conntrack)
170 l4proto->print_conntrack(s, ct);
171
172 if (seq_has_overflowed(s))
173 goto release;
174
175 print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
176 l3proto, l4proto);
177
178 if (seq_has_overflowed(s))
179 goto release;
180
181 if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
182 goto release;
183
184 if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
185 seq_printf(s, "[UNREPLIED] ");
186
187 print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
188 l3proto, l4proto);
189
190 if (seq_has_overflowed(s))
191 goto release;
192
193 if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
194 goto release;
195
196 if (test_bit(IPS_ASSURED_BIT, &ct->status))
197 seq_printf(s, "[ASSURED] ");
198
199#ifdef CONFIG_NF_CONNTRACK_MARK
200 seq_printf(s, "mark=%u ", ct->mark);
201#endif
202
203 ct_show_secctx(s, ct);
204
205 seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
206
207 if (seq_has_overflowed(s))
208 goto release;
209
210 ret = 0;
211release:
212 nf_ct_put(ct);
213 return ret;
214}
215
216static const struct seq_operations ct_seq_ops = {
217 .start = ct_seq_start,
218 .next = ct_seq_next,
219 .stop = ct_seq_stop,
220 .show = ct_seq_show
221};
222
223static int ct_open(struct inode *inode, struct file *file)
224{
225 return seq_open_net(inode, file, &ct_seq_ops,
226 sizeof(struct ct_iter_state));
227}
228
229static const struct file_operations ct_file_ops = {
230 .owner = THIS_MODULE,
231 .open = ct_open,
232 .read = seq_read,
233 .llseek = seq_lseek,
234 .release = seq_release_net,
235};
236
237/* expects */
238struct ct_expect_iter_state {
239 struct seq_net_private p;
240 unsigned int bucket;
241};
242
243static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
244{
245 struct ct_expect_iter_state *st = seq->private;
246 struct hlist_node *n;
247
248 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
249 n = rcu_dereference(
250 hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
251 if (n)
252 return n;
253 }
254 return NULL;
255}
256
257static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
258 struct hlist_node *head)
259{
260 struct ct_expect_iter_state *st = seq->private;
261
262 head = rcu_dereference(hlist_next_rcu(head));
263 while (head == NULL) {
264 if (++st->bucket >= nf_ct_expect_hsize)
265 return NULL;
266 head = rcu_dereference(
267 hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
268 }
269 return head;
270}
271
272static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
273{
274 struct hlist_node *head = ct_expect_get_first(seq);
275
276 if (head)
277 while (pos && (head = ct_expect_get_next(seq, head)))
278 pos--;
279 return pos ? NULL : head;
280}
281
282static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
283 __acquires(RCU)
284{
285 rcu_read_lock();
286 return ct_expect_get_idx(seq, *pos);
287}
288
289static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
290{
291 (*pos)++;
292 return ct_expect_get_next(seq, v);
293}
294
295static void exp_seq_stop(struct seq_file *seq, void *v)
296 __releases(RCU)
297{
298 rcu_read_unlock();
299}
300
301static int exp_seq_show(struct seq_file *s, void *v)
302{
303 struct nf_conntrack_expect *exp;
304 const struct hlist_node *n = v;
305
306 exp = hlist_entry(n, struct nf_conntrack_expect, hnode);
307
308 if (!net_eq(nf_ct_net(exp->master), seq_file_net(s)))
309 return 0;
310
311 if (exp->tuple.src.l3num != AF_INET)
312 return 0;
313
314 if (exp->timeout.function)
315 seq_printf(s, "%ld ", timer_pending(&exp->timeout)
316 ? (long)(exp->timeout.expires - jiffies)/HZ : 0);
317 else
318 seq_printf(s, "- ");
319
320 seq_printf(s, "proto=%u ", exp->tuple.dst.protonum);
321
322 print_tuple(s, &exp->tuple,
323 __nf_ct_l3proto_find(exp->tuple.src.l3num),
324 __nf_ct_l4proto_find(exp->tuple.src.l3num,
325 exp->tuple.dst.protonum));
326 seq_putc(s, '\n');
327
328 return 0;
329}
330
331static const struct seq_operations exp_seq_ops = {
332 .start = exp_seq_start,
333 .next = exp_seq_next,
334 .stop = exp_seq_stop,
335 .show = exp_seq_show
336};
337
338static int exp_open(struct inode *inode, struct file *file)
339{
340 return seq_open_net(inode, file, &exp_seq_ops,
341 sizeof(struct ct_expect_iter_state));
342}
343
344static const struct file_operations ip_exp_file_ops = {
345 .owner = THIS_MODULE,
346 .open = exp_open,
347 .read = seq_read,
348 .llseek = seq_lseek,
349 .release = seq_release_net,
350};
351
352static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
353{
354 struct net *net = seq_file_net(seq);
355 int cpu;
356
357 if (*pos == 0)
358 return SEQ_START_TOKEN;
359
360 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
361 if (!cpu_possible(cpu))
362 continue;
363 *pos = cpu+1;
364 return per_cpu_ptr(net->ct.stat, cpu);
365 }
366
367 return NULL;
368}
369
370static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
371{
372 struct net *net = seq_file_net(seq);
373 int cpu;
374
375 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
376 if (!cpu_possible(cpu))
377 continue;
378 *pos = cpu+1;
379 return per_cpu_ptr(net->ct.stat, cpu);
380 }
381
382 return NULL;
383}
384
385static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
386{
387}
388
389static int ct_cpu_seq_show(struct seq_file *seq, void *v)
390{
391 struct net *net = seq_file_net(seq);
392 unsigned int nr_conntracks = atomic_read(&net->ct.count);
393 const struct ip_conntrack_stat *st = v;
394
395 if (v == SEQ_START_TOKEN) {
396 seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
397 return 0;
398 }
399
400 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
401 "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
402 nr_conntracks,
403 st->searched,
404 st->found,
405 st->new,
406 st->invalid,
407 st->ignore,
408 st->delete,
409 st->delete_list,
410 st->insert,
411 st->insert_failed,
412 st->drop,
413 st->early_drop,
414 st->error,
415
416 st->expect_new,
417 st->expect_create,
418 st->expect_delete,
419 st->search_restart
420 );
421 return 0;
422}
423
424static const struct seq_operations ct_cpu_seq_ops = {
425 .start = ct_cpu_seq_start,
426 .next = ct_cpu_seq_next,
427 .stop = ct_cpu_seq_stop,
428 .show = ct_cpu_seq_show,
429};
430
431static int ct_cpu_seq_open(struct inode *inode, struct file *file)
432{
433 return seq_open_net(inode, file, &ct_cpu_seq_ops,
434 sizeof(struct seq_net_private));
435}
436
437static const struct file_operations ct_cpu_seq_fops = {
438 .owner = THIS_MODULE,
439 .open = ct_cpu_seq_open,
440 .read = seq_read,
441 .llseek = seq_lseek,
442 .release = seq_release_net,
443};
444
445static int __net_init ip_conntrack_net_init(struct net *net)
446{
447 struct proc_dir_entry *proc, *proc_exp, *proc_stat;
448
449 proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);
450 if (!proc)
451 goto err1;
452
453 proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net,
454 &ip_exp_file_ops);
455 if (!proc_exp)
456 goto err2;
457
458 proc_stat = proc_create("ip_conntrack", S_IRUGO,
459 net->proc_net_stat, &ct_cpu_seq_fops);
460 if (!proc_stat)
461 goto err3;
462 return 0;
463
464err3:
465 remove_proc_entry("ip_conntrack_expect", net->proc_net);
466err2:
467 remove_proc_entry("ip_conntrack", net->proc_net);
468err1:
469 return -ENOMEM;
470}
471
472static void __net_exit ip_conntrack_net_exit(struct net *net)
473{
474 remove_proc_entry("ip_conntrack", net->proc_net_stat);
475 remove_proc_entry("ip_conntrack_expect", net->proc_net);
476 remove_proc_entry("ip_conntrack", net->proc_net);
477}
478
479static struct pernet_operations ip_conntrack_net_ops = {
480 .init = ip_conntrack_net_init,
481 .exit = ip_conntrack_net_exit,
482};
483
484int __init nf_conntrack_ipv4_compat_init(void)
485{
486 return register_pernet_subsys(&ip_conntrack_net_ops);
487}
488
489void __exit nf_conntrack_ipv4_compat_fini(void)
490{
491 unregister_pernet_subsys(&ip_conntrack_net_ops);
492}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index c567e1b5d799..d075b3cf2400 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -149,7 +149,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
149 return -NF_ACCEPT; 149 return -NF_ACCEPT;
150 } 150 }
151 151
152 /* rcu_read_lock()ed by nf_hook_slow */ 152 /* rcu_read_lock()ed by nf_hook_thresh */
153 innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); 153 innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
154 154
155 /* Ordinarily, we'd expect the inverted tupleproto, but it's 155 /* Ordinarily, we'd expect the inverted tupleproto, but it's
@@ -327,17 +327,6 @@ static struct ctl_table icmp_sysctl_table[] = {
327 }, 327 },
328 { } 328 { }
329}; 329};
330#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
331static struct ctl_table icmp_compat_sysctl_table[] = {
332 {
333 .procname = "ip_conntrack_icmp_timeout",
334 .maxlen = sizeof(unsigned int),
335 .mode = 0644,
336 .proc_handler = proc_dointvec_jiffies,
337 },
338 { }
339};
340#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
341#endif /* CONFIG_SYSCTL */ 330#endif /* CONFIG_SYSCTL */
342 331
343static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, 332static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -355,40 +344,14 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
355 return 0; 344 return 0;
356} 345}
357 346
358static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
359 struct nf_icmp_net *in)
360{
361#ifdef CONFIG_SYSCTL
362#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
363 pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
364 sizeof(icmp_compat_sysctl_table),
365 GFP_KERNEL);
366 if (!pn->ctl_compat_table)
367 return -ENOMEM;
368
369 pn->ctl_compat_table[0].data = &in->timeout;
370#endif
371#endif
372 return 0;
373}
374
375static int icmp_init_net(struct net *net, u_int16_t proto) 347static int icmp_init_net(struct net *net, u_int16_t proto)
376{ 348{
377 int ret;
378 struct nf_icmp_net *in = icmp_pernet(net); 349 struct nf_icmp_net *in = icmp_pernet(net);
379 struct nf_proto_net *pn = &in->pn; 350 struct nf_proto_net *pn = &in->pn;
380 351
381 in->timeout = nf_ct_icmp_timeout; 352 in->timeout = nf_ct_icmp_timeout;
382 353
383 ret = icmp_kmemdup_compat_sysctl_table(pn, in); 354 return icmp_kmemdup_sysctl_table(pn, in);
384 if (ret < 0)
385 return ret;
386
387 ret = icmp_kmemdup_sysctl_table(pn, in);
388 if (ret < 0)
389 nf_ct_kfree_compat_sysctl_table(pn);
390
391 return ret;
392} 355}
393 356
394static struct nf_proto_net *icmp_get_net_proto(struct net *net) 357static struct nf_proto_net *icmp_get_net_proto(struct net *net)
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index ceb187308120..cf986e1c7bbd 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -74,21 +74,19 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
74 nf_conntrack_get(skb->nfct); 74 nf_conntrack_get(skb->nfct);
75#endif 75#endif
76 /* 76 /*
77 * If we are in PREROUTING/INPUT, the checksum must be recalculated 77 * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
78 * since the length could have changed as a result of defragmentation. 78 * loops between two hosts.
79 *
80 * We also decrease the TTL to mitigate potential loops between two
81 * hosts.
82 * 79 *
83 * Set %IP_DF so that the original source is notified of a potentially 80 * Set %IP_DF so that the original source is notified of a potentially
84 * decreased MTU on the clone route. IPv6 does this too. 81 * decreased MTU on the clone route. IPv6 does this too.
82 *
83 * IP header checksum will be recalculated at ip_local_out.
85 */ 84 */
86 iph = ip_hdr(skb); 85 iph = ip_hdr(skb);
87 iph->frag_off |= htons(IP_DF); 86 iph->frag_off |= htons(IP_DF);
88 if (hooknum == NF_INET_PRE_ROUTING || 87 if (hooknum == NF_INET_PRE_ROUTING ||
89 hooknum == NF_INET_LOCAL_IN) 88 hooknum == NF_INET_LOCAL_IN)
90 --iph->ttl; 89 --iph->ttl;
91 ip_send_check(iph);
92 90
93 if (nf_dup_ipv4_route(net, skb, gw, oif)) { 91 if (nf_dup_ipv4_route(net, skb, gw, oif)) {
94 __this_cpu_write(nf_skb_duplicated, true); 92 __this_cpu_write(nf_skb_duplicated, true);
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index e7ad950cf9ef..b24795e2ee6d 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = {
30 .u = { 30 .u = {
31 .log = { 31 .log = {
32 .level = LOGLEVEL_NOTICE, 32 .level = LOGLEVEL_NOTICE,
33 .logflags = NF_LOG_MASK, 33 .logflags = NF_LOG_DEFAULT_MASK,
34 }, 34 },
35 }, 35 },
36}; 36};
@@ -62,7 +62,7 @@ static void dump_arp_packet(struct nf_log_buf *m,
62 /* If it's for Ethernet and the lengths are OK, then log the ARP 62 /* If it's for Ethernet and the lengths are OK, then log the ARP
63 * payload. 63 * payload.
64 */ 64 */
65 if (ah->ar_hrd != htons(1) || 65 if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
66 ah->ar_hln != ETH_ALEN || 66 ah->ar_hln != ETH_ALEN ||
67 ah->ar_pln != sizeof(__be32)) 67 ah->ar_pln != sizeof(__be32))
68 return; 68 return;
@@ -111,8 +111,7 @@ static struct nf_logger nf_arp_logger __read_mostly = {
111 111
112static int __net_init nf_log_arp_net_init(struct net *net) 112static int __net_init nf_log_arp_net_init(struct net *net)
113{ 113{
114 nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); 114 return nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
115 return 0;
116} 115}
117 116
118static void __net_exit nf_log_arp_net_exit(struct net *net) 117static void __net_exit nf_log_arp_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 076aadda0473..856648966f4c 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -29,7 +29,7 @@ static struct nf_loginfo default_loginfo = {
29 .u = { 29 .u = {
30 .log = { 30 .log = {
31 .level = LOGLEVEL_NOTICE, 31 .level = LOGLEVEL_NOTICE,
32 .logflags = NF_LOG_MASK, 32 .logflags = NF_LOG_DEFAULT_MASK,
33 }, 33 },
34 }, 34 },
35}; 35};
@@ -46,7 +46,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
46 if (info->type == NF_LOG_TYPE_LOG) 46 if (info->type == NF_LOG_TYPE_LOG)
47 logflags = info->u.log.logflags; 47 logflags = info->u.log.logflags;
48 else 48 else
49 logflags = NF_LOG_MASK; 49 logflags = NF_LOG_DEFAULT_MASK;
50 50
51 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); 51 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
52 if (ih == NULL) { 52 if (ih == NULL) {
@@ -76,7 +76,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
76 if (ntohs(ih->frag_off) & IP_OFFSET) 76 if (ntohs(ih->frag_off) & IP_OFFSET)
77 nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); 77 nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
78 78
79 if ((logflags & XT_LOG_IPOPT) && 79 if ((logflags & NF_LOG_IPOPT) &&
80 ih->ihl * 4 > sizeof(struct iphdr)) { 80 ih->ihl * 4 > sizeof(struct iphdr)) {
81 const unsigned char *op; 81 const unsigned char *op;
82 unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; 82 unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
@@ -250,7 +250,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
250 } 250 }
251 251
252 /* Max length: 15 "UID=4294967295 " */ 252 /* Max length: 15 "UID=4294967295 " */
253 if ((logflags & XT_LOG_UID) && !iphoff) 253 if ((logflags & NF_LOG_UID) && !iphoff)
254 nf_log_dump_sk_uid_gid(m, skb->sk); 254 nf_log_dump_sk_uid_gid(m, skb->sk);
255 255
256 /* Max length: 16 "MARK=0xFFFFFFFF " */ 256 /* Max length: 16 "MARK=0xFFFFFFFF " */
@@ -282,7 +282,7 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m,
282 if (info->type == NF_LOG_TYPE_LOG) 282 if (info->type == NF_LOG_TYPE_LOG)
283 logflags = info->u.log.logflags; 283 logflags = info->u.log.logflags;
284 284
285 if (!(logflags & XT_LOG_MACDECODE)) 285 if (!(logflags & NF_LOG_MACDECODE))
286 goto fallback; 286 goto fallback;
287 287
288 switch (dev->type) { 288 switch (dev->type) {
@@ -347,8 +347,7 @@ static struct nf_logger nf_ip_logger __read_mostly = {
347 347
348static int __net_init nf_log_ipv4_net_init(struct net *net) 348static int __net_init nf_log_ipv4_net_init(struct net *net)
349{ 349{
350 nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); 350 return nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
351 return 0;
352} 351}
353 352
354static void __net_exit nf_log_ipv4_net_exit(struct net *net) 353static void __net_exit nf_log_ipv4_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 9414923f1e15..edf05002d674 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -88,8 +88,8 @@ gre_manip_pkt(struct sk_buff *skb,
88 const struct nf_conntrack_tuple *tuple, 88 const struct nf_conntrack_tuple *tuple,
89 enum nf_nat_manip_type maniptype) 89 enum nf_nat_manip_type maniptype)
90{ 90{
91 const struct gre_hdr *greh; 91 const struct gre_base_hdr *greh;
92 struct gre_hdr_pptp *pgreh; 92 struct pptp_gre_header *pgreh;
93 93
94 /* pgreh includes two optional 32bit fields which are not required 94 /* pgreh includes two optional 32bit fields which are not required
95 * to be there. That's where the magic '8' comes from */ 95 * to be there. That's where the magic '8' comes from */
@@ -97,18 +97,19 @@ gre_manip_pkt(struct sk_buff *skb,
97 return false; 97 return false;
98 98
99 greh = (void *)skb->data + hdroff; 99 greh = (void *)skb->data + hdroff;
100 pgreh = (struct gre_hdr_pptp *)greh; 100 pgreh = (struct pptp_gre_header *)greh;
101 101
102 /* we only have destination manip of a packet, since 'source key' 102 /* we only have destination manip of a packet, since 'source key'
103 * is not present in the packet itself */ 103 * is not present in the packet itself */
104 if (maniptype != NF_NAT_MANIP_DST) 104 if (maniptype != NF_NAT_MANIP_DST)
105 return true; 105 return true;
106 switch (greh->version) { 106
107 case GRE_VERSION_1701: 107 switch (greh->flags & GRE_VERSION) {
108 case GRE_VERSION_0:
108 /* We do not currently NAT any GREv0 packets. 109 /* We do not currently NAT any GREv0 packets.
109 * Try to behave like "nf_nat_proto_unknown" */ 110 * Try to behave like "nf_nat_proto_unknown" */
110 break; 111 break;
111 case GRE_VERSION_PPTP: 112 case GRE_VERSION_1:
112 pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); 113 pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
113 pgreh->call_id = tuple->dst.u.gre.key; 114 pgreh->call_id = tuple->dst.u.gre.key;
114 break; 115 break;
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index cd84d4295a20..805c8ddfe860 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -21,7 +21,7 @@ nft_do_chain_arp(void *priv,
21{ 21{
22 struct nft_pktinfo pkt; 22 struct nft_pktinfo pkt;
23 23
24 nft_set_pktinfo(&pkt, skb, state); 24 nft_set_pktinfo_unspec(&pkt, skb, state);
25 25
26 return nft_do_chain(&pkt, priv); 26 return nft_do_chain(&pkt, priv);
27} 27}
@@ -80,7 +80,10 @@ static int __init nf_tables_arp_init(void)
80{ 80{
81 int ret; 81 int ret;
82 82
83 nft_register_chain_type(&filter_arp); 83 ret = nft_register_chain_type(&filter_arp);
84 if (ret < 0)
85 return ret;
86
84 ret = register_pernet_subsys(&nf_tables_arp_net_ops); 87 ret = register_pernet_subsys(&nf_tables_arp_net_ops);
85 if (ret < 0) 88 if (ret < 0)
86 nft_unregister_chain_type(&filter_arp); 89 nft_unregister_chain_type(&filter_arp);
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index e44ba3b12fbb..2840a29b2e04 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -103,7 +103,10 @@ static int __init nf_tables_ipv4_init(void)
103{ 103{
104 int ret; 104 int ret;
105 105
106 nft_register_chain_type(&filter_ipv4); 106 ret = nft_register_chain_type(&filter_ipv4);
107 if (ret < 0)
108 return ret;
109
107 ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); 110 ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
108 if (ret < 0) 111 if (ret < 0)
109 nft_unregister_chain_type(&filter_ipv4); 112 nft_unregister_chain_type(&filter_ipv4);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 66ddcb60519a..7cf7d6e380c2 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -258,7 +258,7 @@ int ping_init_sock(struct sock *sk)
258 struct net *net = sock_net(sk); 258 struct net *net = sock_net(sk);
259 kgid_t group = current_egid(); 259 kgid_t group = current_egid();
260 struct group_info *group_info; 260 struct group_info *group_info;
261 int i, j, count; 261 int i;
262 kgid_t low, high; 262 kgid_t low, high;
263 int ret = 0; 263 int ret = 0;
264 264
@@ -270,16 +270,11 @@ int ping_init_sock(struct sock *sk)
270 return 0; 270 return 0;
271 271
272 group_info = get_current_groups(); 272 group_info = get_current_groups();
273 count = group_info->ngroups; 273 for (i = 0; i < group_info->ngroups; i++) {
274 for (i = 0; i < group_info->nblocks; i++) { 274 kgid_t gid = group_info->gid[i];
275 int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
276 for (j = 0; j < cp_count; j++) {
277 kgid_t gid = group_info->blocks[i][j];
278 if (gid_lte(low, gid) && gid_lte(gid, high))
279 goto out_release_group;
280 }
281 275
282 count -= cp_count; 276 if (gid_lte(low, gid) && gid_lte(gid, high))
277 goto out_release_group;
283 } 278 }
284 279
285 ret = -EACCES; 280 ret = -EACCES;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 9f665b63a927..7143ca1a6af9 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -46,6 +46,8 @@
46#include <net/sock.h> 46#include <net/sock.h>
47#include <net/raw.h> 47#include <net/raw.h>
48 48
49#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX)
50
49/* 51/*
50 * Report socket allocation statistics [mea@utu.fi] 52 * Report socket allocation statistics [mea@utu.fi]
51 */ 53 */
@@ -257,6 +259,7 @@ static const struct snmp_mib snmp4_net_list[] = {
257 SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), 259 SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
258 SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), 260 SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
259 SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), 261 SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
262 SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE),
260 SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), 263 SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
261 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), 264 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
262 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), 265 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
@@ -355,22 +358,22 @@ static void icmp_put(struct seq_file *seq)
355 atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; 358 atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
356 359
357 seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); 360 seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
358 for (i = 0; icmpmibmap[i].name != NULL; i++) 361 for (i = 0; icmpmibmap[i].name; i++)
359 seq_printf(seq, " In%s", icmpmibmap[i].name); 362 seq_printf(seq, " In%s", icmpmibmap[i].name);
360 seq_puts(seq, " OutMsgs OutErrors"); 363 seq_puts(seq, " OutMsgs OutErrors");
361 for (i = 0; icmpmibmap[i].name != NULL; i++) 364 for (i = 0; icmpmibmap[i].name; i++)
362 seq_printf(seq, " Out%s", icmpmibmap[i].name); 365 seq_printf(seq, " Out%s", icmpmibmap[i].name);
363 seq_printf(seq, "\nIcmp: %lu %lu %lu", 366 seq_printf(seq, "\nIcmp: %lu %lu %lu",
364 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), 367 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
365 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), 368 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
366 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); 369 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
367 for (i = 0; icmpmibmap[i].name != NULL; i++) 370 for (i = 0; icmpmibmap[i].name; i++)
368 seq_printf(seq, " %lu", 371 seq_printf(seq, " %lu",
369 atomic_long_read(ptr + icmpmibmap[i].index)); 372 atomic_long_read(ptr + icmpmibmap[i].index));
370 seq_printf(seq, " %lu %lu", 373 seq_printf(seq, " %lu %lu",
371 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), 374 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
372 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); 375 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
373 for (i = 0; icmpmibmap[i].name != NULL; i++) 376 for (i = 0; icmpmibmap[i].name; i++)
374 seq_printf(seq, " %lu", 377 seq_printf(seq, " %lu",
375 atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); 378 atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
376} 379}
@@ -378,14 +381,16 @@ static void icmp_put(struct seq_file *seq)
378/* 381/*
379 * Called from the PROCfs module. This outputs /proc/net/snmp. 382 * Called from the PROCfs module. This outputs /proc/net/snmp.
380 */ 383 */
381static int snmp_seq_show(struct seq_file *seq, void *v) 384static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
382{ 385{
383 int i;
384 struct net *net = seq->private; 386 struct net *net = seq->private;
387 u64 buff64[IPSTATS_MIB_MAX];
388 int i;
385 389
386 seq_puts(seq, "Ip: Forwarding DefaultTTL"); 390 memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64));
387 391
388 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) 392 seq_puts(seq, "Ip: Forwarding DefaultTTL");
393 for (i = 0; snmp4_ipstats_list[i].name; i++)
389 seq_printf(seq, " %s", snmp4_ipstats_list[i].name); 394 seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
390 395
391 seq_printf(seq, "\nIp: %d %d", 396 seq_printf(seq, "\nIp: %d %d",
@@ -393,57 +398,77 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
393 net->ipv4.sysctl_ip_default_ttl); 398 net->ipv4.sysctl_ip_default_ttl);
394 399
395 BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); 400 BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
396 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) 401 snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list,
397 seq_printf(seq, " %llu", 402 net->mib.ip_statistics,
398 snmp_fold_field64(net->mib.ip_statistics, 403 offsetof(struct ipstats_mib, syncp));
399 snmp4_ipstats_list[i].entry, 404 for (i = 0; snmp4_ipstats_list[i].name; i++)
400 offsetof(struct ipstats_mib, syncp))); 405 seq_printf(seq, " %llu", buff64[i]);
401 406
402 icmp_put(seq); /* RFC 2011 compatibility */ 407 return 0;
403 icmpmsg_put(seq); 408}
409
410static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
411{
412 unsigned long buff[TCPUDP_MIB_MAX];
413 struct net *net = seq->private;
414 int i;
415
416 memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
404 417
405 seq_puts(seq, "\nTcp:"); 418 seq_puts(seq, "\nTcp:");
406 for (i = 0; snmp4_tcp_list[i].name != NULL; i++) 419 for (i = 0; snmp4_tcp_list[i].name; i++)
407 seq_printf(seq, " %s", snmp4_tcp_list[i].name); 420 seq_printf(seq, " %s", snmp4_tcp_list[i].name);
408 421
409 seq_puts(seq, "\nTcp:"); 422 seq_puts(seq, "\nTcp:");
410 for (i = 0; snmp4_tcp_list[i].name != NULL; i++) { 423 snmp_get_cpu_field_batch(buff, snmp4_tcp_list,
424 net->mib.tcp_statistics);
425 for (i = 0; snmp4_tcp_list[i].name; i++) {
411 /* MaxConn field is signed, RFC 2012 */ 426 /* MaxConn field is signed, RFC 2012 */
412 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) 427 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
413 seq_printf(seq, " %ld", 428 seq_printf(seq, " %ld", buff[i]);
414 snmp_fold_field(net->mib.tcp_statistics,
415 snmp4_tcp_list[i].entry));
416 else 429 else
417 seq_printf(seq, " %lu", 430 seq_printf(seq, " %lu", buff[i]);
418 snmp_fold_field(net->mib.tcp_statistics,
419 snmp4_tcp_list[i].entry));
420 } 431 }
421 432
433 memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
434
435 snmp_get_cpu_field_batch(buff, snmp4_udp_list,
436 net->mib.udp_statistics);
422 seq_puts(seq, "\nUdp:"); 437 seq_puts(seq, "\nUdp:");
423 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 438 for (i = 0; snmp4_udp_list[i].name; i++)
424 seq_printf(seq, " %s", snmp4_udp_list[i].name); 439 seq_printf(seq, " %s", snmp4_udp_list[i].name);
425
426 seq_puts(seq, "\nUdp:"); 440 seq_puts(seq, "\nUdp:");
427 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 441 for (i = 0; snmp4_udp_list[i].name; i++)
428 seq_printf(seq, " %lu", 442 seq_printf(seq, " %lu", buff[i]);
429 snmp_fold_field(net->mib.udp_statistics, 443
430 snmp4_udp_list[i].entry)); 444 memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
431 445
432 /* the UDP and UDP-Lite MIBs are the same */ 446 /* the UDP and UDP-Lite MIBs are the same */
433 seq_puts(seq, "\nUdpLite:"); 447 seq_puts(seq, "\nUdpLite:");
434 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 448 snmp_get_cpu_field_batch(buff, snmp4_udp_list,
449 net->mib.udplite_statistics);
450 for (i = 0; snmp4_udp_list[i].name; i++)
435 seq_printf(seq, " %s", snmp4_udp_list[i].name); 451 seq_printf(seq, " %s", snmp4_udp_list[i].name);
436
437 seq_puts(seq, "\nUdpLite:"); 452 seq_puts(seq, "\nUdpLite:");
438 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 453 for (i = 0; snmp4_udp_list[i].name; i++)
439 seq_printf(seq, " %lu", 454 seq_printf(seq, " %lu", buff[i]);
440 snmp_fold_field(net->mib.udplite_statistics,
441 snmp4_udp_list[i].entry));
442 455
443 seq_putc(seq, '\n'); 456 seq_putc(seq, '\n');
444 return 0; 457 return 0;
445} 458}
446 459
460static int snmp_seq_show(struct seq_file *seq, void *v)
461{
462 snmp_seq_show_ipstats(seq, v);
463
464 icmp_put(seq); /* RFC 2011 compatibility */
465 icmpmsg_put(seq);
466
467 snmp_seq_show_tcp_udp(seq, v);
468
469 return 0;
470}
471
447static int snmp_seq_open(struct inode *inode, struct file *file) 472static int snmp_seq_open(struct inode *inode, struct file *file)
448{ 473{
449 return single_open_net(inode, file, snmp_seq_show); 474 return single_open_net(inode, file, snmp_seq_show);
@@ -468,21 +493,21 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
468 struct net *net = seq->private; 493 struct net *net = seq->private;
469 494
470 seq_puts(seq, "TcpExt:"); 495 seq_puts(seq, "TcpExt:");
471 for (i = 0; snmp4_net_list[i].name != NULL; i++) 496 for (i = 0; snmp4_net_list[i].name; i++)
472 seq_printf(seq, " %s", snmp4_net_list[i].name); 497 seq_printf(seq, " %s", snmp4_net_list[i].name);
473 498
474 seq_puts(seq, "\nTcpExt:"); 499 seq_puts(seq, "\nTcpExt:");
475 for (i = 0; snmp4_net_list[i].name != NULL; i++) 500 for (i = 0; snmp4_net_list[i].name; i++)
476 seq_printf(seq, " %lu", 501 seq_printf(seq, " %lu",
477 snmp_fold_field(net->mib.net_statistics, 502 snmp_fold_field(net->mib.net_statistics,
478 snmp4_net_list[i].entry)); 503 snmp4_net_list[i].entry));
479 504
480 seq_puts(seq, "\nIpExt:"); 505 seq_puts(seq, "\nIpExt:");
481 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) 506 for (i = 0; snmp4_ipextstats_list[i].name; i++)
482 seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); 507 seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
483 508
484 seq_puts(seq, "\nIpExt:"); 509 seq_puts(seq, "\nIpExt:");
485 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) 510 for (i = 0; snmp4_ipextstats_list[i].name; i++)
486 seq_printf(seq, " %llu", 511 seq_printf(seq, " %llu",
487 snmp_fold_field64(net->mib.ip_statistics, 512 snmp_fold_field64(net->mib.ip_statistics,
488 snmp4_ipextstats_list[i].entry, 513 snmp4_ipextstats_list[i].entry,
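The /proc/net/snmp rework above stops calling snmp_fold_field() once per MIB field (each call walks every possible CPU) and instead folds a whole table of fields into a stack buffer in one pass (snmp_get_cpu_field_batch / snmp_get_cpu_field64_batch), printing from that buffer afterwards. A minimal user-space sketch of the batching idea follows; the array sizes, names and data here are invented for illustration and this is not the kernel helper.

/* Standalone sketch of batched per-CPU counter folding.
 * NR_CPUS_SIM, per_cpu_mib and fold_fields_batch are made-up names.
 */
#include <stdio.h>
#include <string.h>

#define NR_CPUS_SIM 4
#define NR_FIELDS   3

/* one counter array per (simulated) CPU */
static unsigned long per_cpu_mib[NR_CPUS_SIM][NR_FIELDS] = {
	{ 1, 10, 100 },
	{ 2, 20, 200 },
	{ 3, 30, 300 },
	{ 4, 40, 400 },
};

/* old style: one full walk over all CPUs for every single field */
static unsigned long fold_one_field(int field)
{
	unsigned long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS_SIM; cpu++)
		sum += per_cpu_mib[cpu][field];
	return sum;
}

/* new style: one walk over all CPUs folds every field into buff[] */
static void fold_fields_batch(unsigned long *buff)
{
	int cpu, field;

	memset(buff, 0, NR_FIELDS * sizeof(*buff));
	for (cpu = 0; cpu < NR_CPUS_SIM; cpu++)
		for (field = 0; field < NR_FIELDS; field++)
			buff[field] += per_cpu_mib[cpu][field];
}

int main(void)
{
	unsigned long buff[NR_FIELDS];
	int field;

	fold_fields_batch(buff);
	for (field = 0; field < NR_FIELDS; field++)
		printf("field %d: batched=%lu per-field=%lu\n",
		       field, buff[field], fold_one_field(field));
	return 0;
}

With F fields and N CPUs the per-field variant makes F separate passes over all N per-CPU counters; the batched variant touches them once and then prints F values from the local buffer.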
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 438f50c1a676..90a85c955872 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -606,12 +606,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
606 (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), 606 (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
607 daddr, saddr, 0, 0); 607 daddr, saddr, 0, 0);
608 608
609 if (!saddr && ipc.oif) {
610 err = l3mdev_get_saddr(net, ipc.oif, &fl4);
611 if (err < 0)
612 goto done;
613 }
614
615 if (!inet->hdrincl) { 609 if (!inet->hdrincl) {
616 rfv.msg = msg; 610 rfv.msg = msg;
617 rfv.hlen = 0; 611 rfv.hlen = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 62c3ed0b7556..62d4d90c1389 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1252,7 +1252,9 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1252 mtu = 576; 1252 mtu = 576;
1253 } 1253 }
1254 1254
1255 return min_t(unsigned int, mtu, IP_MAX_MTU); 1255 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1256
1257 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1256} 1258}
1257 1259
1258static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) 1260static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
@@ -1835,7 +1837,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1835 * Now we are ready to route packet. 1837 * Now we are ready to route packet.
1836 */ 1838 */
1837 fl4.flowi4_oif = 0; 1839 fl4.flowi4_oif = 0;
1838 fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev); 1840 fl4.flowi4_iif = dev->ifindex;
1839 fl4.flowi4_mark = skb->mark; 1841 fl4.flowi4_mark = skb->mark;
1840 fl4.flowi4_tos = tos; 1842 fl4.flowi4_tos = tos;
1841 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 1843 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
@@ -2022,7 +2024,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2022 return ERR_PTR(-EINVAL); 2024 return ERR_PTR(-EINVAL);
2023 2025
2024 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) 2026 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2025 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) 2027 if (ipv4_is_loopback(fl4->saddr) &&
2028 !(dev_out->flags & IFF_LOOPBACK) &&
2029 !netif_is_l3_master(dev_out))
2026 return ERR_PTR(-EINVAL); 2030 return ERR_PTR(-EINVAL);
2027 2031
2028 if (ipv4_is_lbcast(fl4->daddr)) 2032 if (ipv4_is_lbcast(fl4->daddr))
@@ -2152,7 +2156,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2152 unsigned int flags = 0; 2156 unsigned int flags = 0;
2153 struct fib_result res; 2157 struct fib_result res;
2154 struct rtable *rth; 2158 struct rtable *rth;
2155 int master_idx;
2156 int orig_oif; 2159 int orig_oif;
2157 int err = -ENETUNREACH; 2160 int err = -ENETUNREACH;
2158 2161
@@ -2162,9 +2165,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2162 2165
2163 orig_oif = fl4->flowi4_oif; 2166 orig_oif = fl4->flowi4_oif;
2164 2167
2165 master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
2166 if (master_idx)
2167 fl4->flowi4_oif = master_idx;
2168 fl4->flowi4_iif = LOOPBACK_IFINDEX; 2168 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2169 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2169 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2170 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 2170 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
@@ -2248,10 +2248,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2248 fl4->saddr = inet_select_addr(dev_out, 0, 2248 fl4->saddr = inet_select_addr(dev_out, 0,
2249 RT_SCOPE_HOST); 2249 RT_SCOPE_HOST);
2250 } 2250 }
2251
2252 rth = l3mdev_get_rtable(dev_out, fl4);
2253 if (rth)
2254 goto out;
2255 } 2251 }
2256 2252
2257 if (!fl4->daddr) { 2253 if (!fl4->daddr) {
@@ -2306,7 +2302,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2306 else 2302 else
2307 fl4->saddr = fl4->daddr; 2303 fl4->saddr = fl4->daddr;
2308 } 2304 }
2309 dev_out = net->loopback_dev; 2305
2306 /* L3 master device is the loopback for that domain */
2307 dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2310 fl4->flowi4_oif = dev_out->ifindex; 2308 fl4->flowi4_oif = dev_out->ifindex;
2311 flags |= RTCF_LOCAL; 2309 flags |= RTCF_LOCAL;
2312 goto make_route; 2310 goto make_route;
@@ -2582,9 +2580,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2582 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; 2580 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2583 fl4.flowi4_mark = mark; 2581 fl4.flowi4_mark = mark;
2584 2582
2585 if (netif_index_is_l3_master(net, fl4.flowi4_oif))
2586 fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
2587
2588 if (iif) { 2583 if (iif) {
2589 struct net_device *dev; 2584 struct net_device *dev;
2590 2585
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ffbb218de520..3251fe71f39f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -380,14 +380,14 @@ void tcp_init_sock(struct sock *sk)
380 struct inet_connection_sock *icsk = inet_csk(sk); 380 struct inet_connection_sock *icsk = inet_csk(sk);
381 struct tcp_sock *tp = tcp_sk(sk); 381 struct tcp_sock *tp = tcp_sk(sk);
382 382
383 __skb_queue_head_init(&tp->out_of_order_queue); 383 tp->out_of_order_queue = RB_ROOT;
384 tcp_init_xmit_timers(sk); 384 tcp_init_xmit_timers(sk);
385 tcp_prequeue_init(tp); 385 tcp_prequeue_init(tp);
386 INIT_LIST_HEAD(&tp->tsq_node); 386 INIT_LIST_HEAD(&tp->tsq_node);
387 387
388 icsk->icsk_rto = TCP_TIMEOUT_INIT; 388 icsk->icsk_rto = TCP_TIMEOUT_INIT;
389 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); 389 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
390 tp->rtt_min[0].rtt = ~0U; 390 minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
391 391
392 /* So many TCP implementations out there (incorrectly) count the 392 /* So many TCP implementations out there (incorrectly) count the
393 * initial SYN frame in their delayed-ACK and congestion control 393 * initial SYN frame in their delayed-ACK and congestion control
@@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk)
396 */ 396 */
397 tp->snd_cwnd = TCP_INIT_CWND; 397 tp->snd_cwnd = TCP_INIT_CWND;
398 398
399 /* There's a bubble in the pipe until at least the first ACK. */
400 tp->app_limited = ~0U;
401
399 /* See draft-stevens-tcpca-spec-01 for discussion of the 402 /* See draft-stevens-tcpca-spec-01 for discussion of the
400 * initialization of these values. 403 * initialization of these values.
401 */ 404 */
@@ -421,8 +424,6 @@ void tcp_init_sock(struct sock *sk)
421 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 424 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
422 425
423 local_bh_disable(); 426 local_bh_disable();
424 if (mem_cgroup_sockets_enabled)
425 sock_update_memcg(sk);
426 sk_sockets_allocated_inc(sk); 427 sk_sockets_allocated_inc(sk);
427 local_bh_enable(); 428 local_bh_enable();
428} 429}
@@ -688,8 +689,7 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
688 int ret; 689 int ret;
689 690
690 ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, 691 ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
691 min(rd_desc->count, len), tss->flags, 692 min(rd_desc->count, len), tss->flags);
692 skb_socket_splice);
693 if (ret > 0) 693 if (ret > 0)
694 rd_desc->count -= ret; 694 rd_desc->count -= ret;
695 return ret; 695 return ret;
@@ -1014,23 +1014,40 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
1014 flags); 1014 flags);
1015 1015
1016 lock_sock(sk); 1016 lock_sock(sk);
1017
1018 tcp_rate_check_app_limited(sk); /* is sending application-limited? */
1019
1017 res = do_tcp_sendpages(sk, page, offset, size, flags); 1020 res = do_tcp_sendpages(sk, page, offset, size, flags);
1018 release_sock(sk); 1021 release_sock(sk);
1019 return res; 1022 return res;
1020} 1023}
1021EXPORT_SYMBOL(tcp_sendpage); 1024EXPORT_SYMBOL(tcp_sendpage);
1022 1025
1023static inline int select_size(const struct sock *sk, bool sg) 1026/* Do not bother using a page frag for very small frames.
1027 * But use this heuristic only for the first skb in write queue.
1028 *
1029 * Having no payload in skb->head allows better SACK shifting
1030 * in tcp_shift_skb_data(), reducing sack/rack overhead, because
1031 * write queue has less skbs.
1032 * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
1033 * This also speeds up tso_fragment(), since it wont fallback
1034 * to tcp_fragment().
1035 */
1036static int linear_payload_sz(bool first_skb)
1037{
1038 if (first_skb)
1039 return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
1040 return 0;
1041}
1042
1043static int select_size(const struct sock *sk, bool sg, bool first_skb)
1024{ 1044{
1025 const struct tcp_sock *tp = tcp_sk(sk); 1045 const struct tcp_sock *tp = tcp_sk(sk);
1026 int tmp = tp->mss_cache; 1046 int tmp = tp->mss_cache;
1027 1047
1028 if (sg) { 1048 if (sg) {
1029 if (sk_can_gso(sk)) { 1049 if (sk_can_gso(sk)) {
1030 /* Small frames wont use a full page: 1050 tmp = linear_payload_sz(first_skb);
1031 * Payload will immediately follow tcp header.
1032 */
1033 tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
1034 } else { 1051 } else {
1035 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 1052 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1036 1053
@@ -1101,6 +1118,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1101 1118
1102 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 1119 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1103 1120
1121 tcp_rate_check_app_limited(sk); /* is sending application-limited? */
1122
1104 /* Wait for a connection to finish. One exception is TCP Fast Open 1123 /* Wait for a connection to finish. One exception is TCP Fast Open
1105 * (passive side) where data is allowed to be sent before a connection 1124 * (passive side) where data is allowed to be sent before a connection
1106 * is fully established. 1125 * is fully established.
@@ -1161,6 +1180,8 @@ restart:
1161 } 1180 }
1162 1181
1163 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { 1182 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1183 bool first_skb;
1184
1164new_segment: 1185new_segment:
1165 /* Allocate new segment. If the interface is SG, 1186 /* Allocate new segment. If the interface is SG,
1166 * allocate skb fitting to single page. 1187 * allocate skb fitting to single page.
@@ -1172,10 +1193,11 @@ new_segment:
1172 process_backlog = false; 1193 process_backlog = false;
1173 goto restart; 1194 goto restart;
1174 } 1195 }
1196 first_skb = skb_queue_empty(&sk->sk_write_queue);
1175 skb = sk_stream_alloc_skb(sk, 1197 skb = sk_stream_alloc_skb(sk,
1176 select_size(sk, sg), 1198 select_size(sk, sg, first_skb),
1177 sk->sk_allocation, 1199 sk->sk_allocation,
1178 skb_queue_empty(&sk->sk_write_queue)); 1200 first_skb);
1179 if (!skb) 1201 if (!skb)
1180 goto wait_for_memory; 1202 goto wait_for_memory;
1181 1203
@@ -1570,6 +1592,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1570} 1592}
1571EXPORT_SYMBOL(tcp_read_sock); 1593EXPORT_SYMBOL(tcp_read_sock);
1572 1594
1595int tcp_peek_len(struct socket *sock)
1596{
1597 return tcp_inq(sock->sk);
1598}
1599EXPORT_SYMBOL(tcp_peek_len);
1600
1573/* 1601/*
1574 * This routine copies from a sock struct into the user buffer. 1602 * This routine copies from a sock struct into the user buffer.
1575 * 1603 *
@@ -2237,7 +2265,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2237 tcp_clear_xmit_timers(sk); 2265 tcp_clear_xmit_timers(sk);
2238 __skb_queue_purge(&sk->sk_receive_queue); 2266 __skb_queue_purge(&sk->sk_receive_queue);
2239 tcp_write_queue_purge(sk); 2267 tcp_write_queue_purge(sk);
2240 __skb_queue_purge(&tp->out_of_order_queue); 2268 skb_rbtree_purge(&tp->out_of_order_queue);
2241 2269
2242 inet->inet_dport = 0; 2270 inet->inet_dport = 0;
2243 2271
@@ -2681,7 +2709,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2681{ 2709{
2682 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ 2710 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
2683 const struct inet_connection_sock *icsk = inet_csk(sk); 2711 const struct inet_connection_sock *icsk = inet_csk(sk);
2684 u32 now = tcp_time_stamp; 2712 u32 now = tcp_time_stamp, intv;
2685 unsigned int start; 2713 unsigned int start;
2686 int notsent_bytes; 2714 int notsent_bytes;
2687 u64 rate64; 2715 u64 rate64;
@@ -2771,6 +2799,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2771 info->tcpi_min_rtt = tcp_min_rtt(tp); 2799 info->tcpi_min_rtt = tcp_min_rtt(tp);
2772 info->tcpi_data_segs_in = tp->data_segs_in; 2800 info->tcpi_data_segs_in = tp->data_segs_in;
2773 info->tcpi_data_segs_out = tp->data_segs_out; 2801 info->tcpi_data_segs_out = tp->data_segs_out;
2802
2803 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
2804 rate = READ_ONCE(tp->rate_delivered);
2805 intv = READ_ONCE(tp->rate_interval_us);
2806 if (rate && intv) {
2807 rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
2808 do_div(rate64, intv);
2809 put_unaligned(rate64, &info->tcpi_delivery_rate);
2810 }
2774} 2811}
2775EXPORT_SYMBOL_GPL(tcp_get_info); 2812EXPORT_SYMBOL_GPL(tcp_get_info);
2776 2813
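The tcpi_delivery_rate value added above is derived as packets delivered * mss_cache * USEC_PER_SEC / interval-in-microseconds, i.e. the most recent delivery sample scaled to bytes per second. A self-contained sketch of that arithmetic, with illustrative names and numbers rather than the kernel's fields:

/* Standalone illustration of the delivery-rate arithmetic added above.
 * The kernel reads rate_delivered, rate_interval_us and mss_cache from
 * the tcp_sock under READ_ONCE(); the values below are just examples.
 */
#include <stdio.h>
#include <stdint.h>

#define USEC_PER_SEC 1000000ULL

static uint64_t delivery_rate_bytes_per_sec(uint32_t pkts_delivered,
					    uint32_t mss_bytes,
					    uint32_t interval_us)
{
	if (!pkts_delivered || !interval_us)
		return 0;
	/* bytes delivered per second = pkts * mss * 1e6 / interval_us */
	return (uint64_t)pkts_delivered * mss_bytes * USEC_PER_SEC / interval_us;
}

int main(void)
{
	/* e.g. 100 packets of 1448 bytes delivered in 10 ms */
	uint64_t r = delivery_rate_bytes_per_sec(100, 1448, 10000);

	printf("%llu bytes/sec (~%.1f Mbit/s)\n",
	       (unsigned long long)r, r * 8 / 1e6);
	return 0;
}

For example, 100 packets of 1448 bytes delivered in 10 ms comes out to 14,480,000 bytes/s, about 115.8 Mbit/s.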
@@ -3092,23 +3129,6 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3092} 3129}
3093EXPORT_SYMBOL(tcp_get_md5sig_pool); 3130EXPORT_SYMBOL(tcp_get_md5sig_pool);
3094 3131
3095int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
3096 const struct tcphdr *th)
3097{
3098 struct scatterlist sg;
3099 struct tcphdr hdr;
3100
3101 /* We are not allowed to change tcphdr, make a local copy */
3102 memcpy(&hdr, th, sizeof(hdr));
3103 hdr.check = 0;
3104
3105 /* options aren't included in the hash */
3106 sg_init_one(&sg, &hdr, sizeof(hdr));
3107 ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(hdr));
3108 return crypto_ahash_update(hp->md5_req);
3109}
3110EXPORT_SYMBOL(tcp_md5_hash_header);
3111
3112int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, 3132int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3113 const struct sk_buff *skb, unsigned int header_len) 3133 const struct sk_buff *skb, unsigned int header_len)
3114{ 3134{
@@ -3255,11 +3275,12 @@ static void __init tcp_init_mem(void)
3255 3275
3256void __init tcp_init(void) 3276void __init tcp_init(void)
3257{ 3277{
3258 unsigned long limit;
3259 int max_rshare, max_wshare, cnt; 3278 int max_rshare, max_wshare, cnt;
3279 unsigned long limit;
3260 unsigned int i; 3280 unsigned int i;
3261 3281
3262 sock_skb_cb_check_size(sizeof(struct tcp_skb_cb)); 3282 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
3283 FIELD_SIZEOF(struct sk_buff, cb));
3263 3284
3264 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); 3285 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
3265 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); 3286 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
new file mode 100644
index 000000000000..0ea66c2c9344
--- /dev/null
+++ b/net/ipv4/tcp_bbr.c
@@ -0,0 +1,896 @@
1/* Bottleneck Bandwidth and RTT (BBR) congestion control
2 *
3 * BBR congestion control computes the sending rate based on the delivery
4 * rate (throughput) estimated from ACKs. In a nutshell:
5 *
6 * On each ACK, update our model of the network path:
7 * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
8 * min_rtt = windowed_min(rtt, 10 seconds)
9 * pacing_rate = pacing_gain * bottleneck_bandwidth
10 * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
11 *
12 * The core algorithm does not react directly to packet losses or delays,
13 * although BBR may adjust the size of next send per ACK when loss is
14 * observed, or adjust the sending rate if it estimates there is a
15 * traffic policer, in order to keep the drop rate reasonable.
16 *
17 * BBR is described in detail in:
18 * "BBR: Congestion-Based Congestion Control",
19 * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
20 * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
21 *
22 * There is a public e-mail list for discussing BBR development and testing:
23 * https://groups.google.com/forum/#!forum/bbr-dev
24 *
25 * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
26 * since pacing is integral to the BBR design and implementation.
27 * BBR without pacing would not function properly, and may incur unnecessary
28 * high packet loss rates.
29 */
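As a rough illustration of the four update rules quoted above, the toy program below applies them to one delivery sample in plain doubles; it deliberately omits the windowed filters, the BW_SCALE/BBR_SCALE fixed-point units and the mode state machine that the rest of this file implements, so it is a sketch of the model, not of the code.

/* Toy, self-contained sketch of the BBR model equations only. */
#include <stdio.h>

struct bbr_toy {
	double max_bw_pps;	/* windowed max delivery rate, packets/sec */
	double min_rtt_s;	/* windowed min RTT, seconds */
};

static void on_ack(struct bbr_toy *b, double delivered_pkts,
		   double elapsed_s, double rtt_s)
{
	double bw = delivered_pkts / elapsed_s;

	/* real BBR uses windowed filters (10 round trips / 10 seconds);
	 * here we just keep running extrema for illustration */
	if (bw > b->max_bw_pps)
		b->max_bw_pps = bw;
	if (b->min_rtt_s == 0 || rtt_s < b->min_rtt_s)
		b->min_rtt_s = rtt_s;
}

int main(void)
{
	struct bbr_toy b = { 0, 0 };
	double pacing_gain = 2.885;	/* startup gain, ~2/ln(2) */
	double cwnd_gain = 2.0;
	double pacing_rate, cwnd;

	/* one sample: 10 packets delivered over 20 ms, RTT sample 20 ms */
	on_ack(&b, 10, 0.020, 0.020);

	pacing_rate = pacing_gain * b.max_bw_pps;	/* packets/sec */
	cwnd = cwnd_gain * b.max_bw_pps * b.min_rtt_s;	/* packets */
	if (cwnd < 4)
		cwnd = 4;

	printf("bw=%.0f pps rtt=%.0f ms pacing=%.1f pps cwnd=%.0f pkts\n",
	       b.max_bw_pps, b.min_rtt_s * 1000, pacing_rate, cwnd);
	return 0;
}

For 10 packets delivered over 20 ms with a 20 ms RTT sample, the bandwidth estimate is 500 packets/s, giving a startup pacing rate of about 1442.5 packets/s and a cwnd target of 20 packets.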
30#include <linux/module.h>
31#include <net/tcp.h>
32#include <linux/inet_diag.h>
33#include <linux/inet.h>
34#include <linux/random.h>
35#include <linux/win_minmax.h>
36
37/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
38 * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
39 * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
40 * Since the minimum window is >=4 packets, the lower bound isn't
41 * an issue. The upper bound isn't an issue with existing technologies.
42 */
43#define BW_SCALE 24
44#define BW_UNIT (1 << BW_SCALE)
45
46#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */
47#define BBR_UNIT (1 << BBR_SCALE)
48
49/* BBR has the following modes for deciding how fast to send: */
50enum bbr_mode {
51 BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
52 BBR_DRAIN, /* drain any queue created during startup */
53 BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
54 BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */
55};
56
57/* BBR congestion control block */
58struct bbr {
59 u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */
60 u32 min_rtt_stamp; /* timestamp of min_rtt_us */
61 u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */
62 struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */
63 u32 rtt_cnt; /* count of packet-timed rounds elapsed */
64 u32 next_rtt_delivered; /* scb->tx.delivered at end of round */
65 struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */
66 u32 mode:3, /* current bbr_mode in state machine */
67 prev_ca_state:3, /* CA state on previous ACK */
68 packet_conservation:1, /* use packet conservation? */
69 restore_cwnd:1, /* decided to revert cwnd to old value */
70 round_start:1, /* start of packet-timed tx->ack round? */
71 tso_segs_goal:7, /* segments we want in each skb we send */
72 idle_restart:1, /* restarting after idle? */
73 probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
74 unused:5,
75 lt_is_sampling:1, /* taking long-term ("LT") samples now? */
76 lt_rtt_cnt:7, /* round trips in long-term interval */
77 lt_use_bw:1; /* use lt_bw as our bw estimate? */
78 u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */
79 u32 lt_last_delivered; /* LT intvl start: tp->delivered */
80 u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */
81 u32 lt_last_lost; /* LT intvl start: tp->lost */
82 u32 pacing_gain:10, /* current gain for setting pacing rate */
83 cwnd_gain:10, /* current gain for setting cwnd */
84 full_bw_cnt:3, /* number of rounds without large bw gains */
85 cycle_idx:3, /* current index in pacing_gain cycle array */
86 unused_b:6;
87 u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
88 u32 full_bw; /* recent bw, to estimate if pipe is full */
89};
90
91#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
92
93/* Window length of bw filter (in rounds): */
94static const int bbr_bw_rtts = CYCLE_LEN + 2;
95/* Window length of min_rtt filter (in sec): */
96static const u32 bbr_min_rtt_win_sec = 10;
97/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
98static const u32 bbr_probe_rtt_mode_ms = 200;
99/* Skip TSO below the following bandwidth (bits/sec): */
100static const int bbr_min_tso_rate = 1200000;
101
102/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
103 * that allows a smoothly increasing pacing rate to double each RTT while
104 * sending the same number of packets per RTT that an un-paced, slow-starting
105 * Reno or CUBIC flow would:
106 */
107static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
108/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
109 * the queue created in BBR_STARTUP in a single round:
110 */
111static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
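/* With BBR_UNIT = 256 these evaluate to bbr_high_gain = 256 * 2885 / 1000 + 1
 * = 739 (a gain of 739/256 ~= 2.886 ~= 2/ln(2)) and bbr_drain_gain =
 * 256 * 1000 / 2885 = 88 (~0.344 ~= ln(2)/2), so one DRAIN round typically
 * undoes the queue built by one STARTUP round.
 */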
112/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
113static const int bbr_cwnd_gain = BBR_UNIT * 2;
114/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
115static const int bbr_pacing_gain[] = {
116 BBR_UNIT * 5 / 4, /* probe for more available bw */
117 BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */
118 BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */
119 BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */
120};
121/* Randomize the starting gain cycling phase over N phases: */
122static const u32 bbr_cycle_rand = 7;
123
124/* Try to keep at least this many packets in flight, if things go smoothly. For
125 * smooth functioning, a sliding window protocol ACKing every other packet
126 * needs at least 4 packets in flight:
127 */
128static const u32 bbr_cwnd_min_target = 4;
129
130/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
131/* If bw has increased significantly (1.25x), there may be more bw available: */
132static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
133/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
134static const u32 bbr_full_bw_cnt = 3;
135
136/* "long-term" ("LT") bandwidth estimator parameters... */
137/* The minimum number of rounds in an LT bw sampling interval: */
138static const u32 bbr_lt_intvl_min_rtts = 4;
139/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
140static const u32 bbr_lt_loss_thresh = 50;
141/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
142static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
143/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
144static const u32 bbr_lt_bw_diff = 4000 / 8;
145/* If we estimate we're policed, use lt_bw for this many round trips: */
146static const u32 bbr_lt_bw_max_rtts = 48;
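/* Worked example of these fixed-point thresholds (illustrative figures): with
 * BBR_SCALE = 8, bbr_lt_loss_thresh = 50 marks an interval as lossy once
 * lost/delivered >= 50/256 ~= 20% (e.g. 200 lost vs 1000 delivered:
 * 200 << 8 = 51200 >= 50 * 1000). bbr_lt_bw_ratio = 256/8 = 32 treats two
 * interval rates within 1/8 of each other as consistent, and bbr_lt_bw_diff
 * = 500 is in bytes/sec, i.e. 4 Kbit/sec.
 */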
147
148/* Do we estimate that STARTUP filled the pipe? */
149static bool bbr_full_bw_reached(const struct sock *sk)
150{
151 const struct bbr *bbr = inet_csk_ca(sk);
152
153 return bbr->full_bw_cnt >= bbr_full_bw_cnt;
154}
155
156/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
157static u32 bbr_max_bw(const struct sock *sk)
158{
159 struct bbr *bbr = inet_csk_ca(sk);
160
161 return minmax_get(&bbr->bw);
162}
163
164/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
165static u32 bbr_bw(const struct sock *sk)
166{
167 struct bbr *bbr = inet_csk_ca(sk);
168
169 return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
170}
171
172/* Return rate in bytes per second, optionally with a gain.
173 * The order here is chosen carefully to avoid overflow of u64. This should
174 * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
175 */
176static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
177{
178 rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache);
179 rate *= gain;
180 rate >>= BBR_SCALE;
181 rate *= USEC_PER_SEC;
182 return rate >> BW_SCALE;
183}
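/* Illustrative sketch of the conversion above with assumed figures (not part
 * of the original flow): ~10,000 pkt/sec expressed as 167772 in
 * pkt/uSec << BW_SCALE, a 1500-byte MTU, and unit gain.
 */
static u64 __maybe_unused bbr_rate_bytes_per_sec_example(void)
{
	u64 rate = 167772;	/* ~0.01 pkt/uSec << BW_SCALE (~10,000 pkt/sec) */

	rate *= 1500;		/* assumed MTU-sized packets */
	rate *= BBR_UNIT;	/* gain of 1.0 in BBR_UNIT fixed point */
	rate >>= BBR_SCALE;
	rate *= USEC_PER_SEC;
	return rate >> BW_SCALE;	/* ~15,000,000 bytes/sec (~120 Mbit/s) */
}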
184
185/* Pace using current bw estimate and a gain factor. In order to help drive the
186 * network toward lower queues while maintaining high utilization and low
187 * latency, the average pacing rate aims to be slightly (~1%) lower than the
188 * estimated bandwidth. This is an important aspect of the design. In this
189 * implementation this slightly lower pacing rate is achieved implicitly by not
190 * including link-layer headers in the packet size used for the pacing rate.
191 */
192static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
193{
194 struct bbr *bbr = inet_csk_ca(sk);
195 u64 rate = bw;
196
197 rate = bbr_rate_bytes_per_sec(sk, rate, gain);
198 rate = min_t(u64, rate, sk->sk_max_pacing_rate);
199 if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate)
200 sk->sk_pacing_rate = rate;
201}
202
203/* Return count of segments we want in the skbs we send, or 0 for default. */
204static u32 bbr_tso_segs_goal(struct sock *sk)
205{
206 struct bbr *bbr = inet_csk_ca(sk);
207
208 return bbr->tso_segs_goal;
209}
210
211static void bbr_set_tso_segs_goal(struct sock *sk)
212{
213 struct tcp_sock *tp = tcp_sk(sk);
214 struct bbr *bbr = inet_csk_ca(sk);
215 u32 min_segs;
216
217 min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
218 bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs),
219 0x7FU);
220}
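/* Illustrative note: bbr_min_tso_rate is in bits/sec while sk_pacing_rate is
 * in bytes/sec, hence the >> 3 above. Below ~1.2 Mbit/s (150,000 bytes/sec)
 * the autosizing floor drops from 2 segments to 1, so slow flows are not
 * forced into 2-segment bursts; the result is always capped at 0x7F = 127
 * segments to fit the 7-bit tso_segs_goal field.
 */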
221
222/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
223static void bbr_save_cwnd(struct sock *sk)
224{
225 struct tcp_sock *tp = tcp_sk(sk);
226 struct bbr *bbr = inet_csk_ca(sk);
227
228 if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
229 bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */
230 else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
231 bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
232}
233
234static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
235{
236 struct tcp_sock *tp = tcp_sk(sk);
237 struct bbr *bbr = inet_csk_ca(sk);
238
239 if (event == CA_EVENT_TX_START && tp->app_limited) {
240 bbr->idle_restart = 1;
241 /* Avoid pointless buffer overflows: pace at est. bw if we don't
242 * need more speed (we're restarting from idle and app-limited).
243 */
244 if (bbr->mode == BBR_PROBE_BW)
245 bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
246 }
247}
248
249/* Find target cwnd. Right-size the cwnd based on min RTT and the
250 * estimated bottleneck bandwidth:
251 *
252 * cwnd = bw * min_rtt * gain = BDP * gain
253 *
254 * The key factor, gain, controls the amount of queue. While a small gain
255 * builds a smaller queue, it becomes more vulnerable to noise in RTT
256 * measurements (e.g., delayed ACKs or other ACK compression effects). This
257 * noise may cause BBR to under-estimate the rate.
258 *
259 * To achieve full performance in high-speed paths, we budget enough cwnd to
260 * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
261 * - one skb in sending host Qdisc,
262 * - one skb in sending host TSO/GSO engine
263 * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
264 * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
265 * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
266 * which allows 2 outstanding 2-packet sequences, to try to keep pipe
267 * full even with ACK-every-other-packet delayed ACKs.
268 */
269static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
270{
271 struct bbr *bbr = inet_csk_ca(sk);
272 u32 cwnd;
273 u64 w;
274
275 /* If we've never had a valid RTT sample, cap cwnd at the initial
276 * default. This should only happen when the connection is not using TCP
277 * timestamps and has retransmitted all of the SYN/SYNACK/data packets
278 * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
279 * case we need to slow-start up toward something safe: TCP_INIT_CWND.
280 */
281 if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */
282 return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/
283
284 w = (u64)bw * bbr->min_rtt_us;
285
286 /* Apply a gain to the given value, then remove the BW_SCALE shift. */
287 cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
288
289 /* Allow enough full-sized skbs in flight to utilize end systems. */
290 cwnd += 3 * bbr->tso_segs_goal;
291
292 /* Reduce delayed ACKs by rounding up cwnd to the next even number. */
293 cwnd = (cwnd + 1) & ~1U;
294
295 return cwnd;
296}
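/* Worked example of the arithmetic above (illustrative figures): for
 * bw ~= 10 pkt/msec (167772 in pkt/uSec << BW_SCALE), min_rtt_us = 20000,
 * gain = 2 * BBR_UNIT and an assumed tso_segs_goal of 8:
 *   w    = 167772 * 20000 ~= 3.36e9        (BDP of ~200 pkts << BW_SCALE)
 *   cwnd = ceil((w * 512 >> 8) / BW_UNIT)  = 400  (2 * BDP)
 *        + 3 * 8                           = 424  (end-system budget)
 *   rounded up to the next even number     = 424
 */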
297
298/* An optimization in BBR to reduce losses: On the first round of recovery, we
299 * follow the packet conservation principle: send P packets per P packets acked.
300 * After that, we slow-start and send at most 2*P packets per P packets acked.
301 * After recovery finishes, or upon undo, we restore the cwnd we had when
302 * recovery started (capped by the target cwnd based on estimated BDP).
303 *
304 * TODO(ycheng/ncardwell): implement a rate-based approach.
305 */
306static bool bbr_set_cwnd_to_recover_or_restore(
307 struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
308{
309 struct tcp_sock *tp = tcp_sk(sk);
310 struct bbr *bbr = inet_csk_ca(sk);
311 u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
312 u32 cwnd = tp->snd_cwnd;
313
314 /* An ACK for P pkts should release at most 2*P packets. We do this
315 * in two steps. First, here we deduct the number of lost packets.
316 * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
317 */
318 if (rs->losses > 0)
319 cwnd = max_t(s32, cwnd - rs->losses, 1);
320
321 if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
322 /* Starting 1st round of Recovery, so do packet conservation. */
323 bbr->packet_conservation = 1;
324 bbr->next_rtt_delivered = tp->delivered; /* start round now */
325 /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
326 cwnd = tcp_packets_in_flight(tp) + acked;
327 } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
328 /* Exiting loss recovery; restore cwnd saved before recovery. */
329 bbr->restore_cwnd = 1;
330 bbr->packet_conservation = 0;
331 }
332 bbr->prev_ca_state = state;
333
334 if (bbr->restore_cwnd) {
335 /* Restore cwnd after exiting loss recovery or PROBE_RTT. */
336 cwnd = max(cwnd, bbr->prior_cwnd);
337 bbr->restore_cwnd = 0;
338 }
339
340 if (bbr->packet_conservation) {
341 *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
342 return true; /* yes, using packet conservation */
343 }
344 *new_cwnd = cwnd;
345 return false;
346}
347
348/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
349 * has drawn us down below target), or snap down to target if we're above it.
350 */
351static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
352 u32 acked, u32 bw, int gain)
353{
354 struct tcp_sock *tp = tcp_sk(sk);
355 struct bbr *bbr = inet_csk_ca(sk);
356 u32 cwnd = 0, target_cwnd = 0;
357
358 if (!acked)
359 return;
360
361 if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
362 goto done;
363
364 /* If we're below target cwnd, slow start cwnd toward target cwnd. */
365 target_cwnd = bbr_target_cwnd(sk, bw, gain);
366 if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
367 cwnd = min(cwnd + acked, target_cwnd);
368 else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
369 cwnd = cwnd + acked;
370 cwnd = max(cwnd, bbr_cwnd_min_target);
371
372done:
373 tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */
374 if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */
375 tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
376}
377
378/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
379static bool bbr_is_next_cycle_phase(struct sock *sk,
380 const struct rate_sample *rs)
381{
382 struct tcp_sock *tp = tcp_sk(sk);
383 struct bbr *bbr = inet_csk_ca(sk);
384 bool is_full_length =
385 skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) >
386 bbr->min_rtt_us;
387 u32 inflight, bw;
388
389 /* The pacing_gain of 1.0 paces at the estimated bw to try to fully
390 * use the pipe without increasing the queue.
391 */
392 if (bbr->pacing_gain == BBR_UNIT)
393 return is_full_length; /* just use wall clock time */
394
395 inflight = rs->prior_in_flight; /* what was in-flight before ACK? */
396 bw = bbr_max_bw(sk);
397
398 /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
399 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
400 * small (e.g. on a LAN). We do not persist if packets are lost, since
401 * a path with small buffers may not hold that much.
402 */
403 if (bbr->pacing_gain > BBR_UNIT)
404 return is_full_length &&
405 (rs->losses || /* perhaps pacing_gain*BDP won't fit */
406 inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
407
408 /* A pacing_gain < 1.0 tries to drain extra queue we added if bw
409 * probing didn't find more bw. If inflight falls to match BDP then we
410 * estimate queue is drained; persisting would underutilize the pipe.
411 */
412 return is_full_length ||
413 inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
414}
415
416static void bbr_advance_cycle_phase(struct sock *sk)
417{
418 struct tcp_sock *tp = tcp_sk(sk);
419 struct bbr *bbr = inet_csk_ca(sk);
420
421 bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
422 bbr->cycle_mstamp = tp->delivered_mstamp;
423 bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
424}
425
426/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
427static void bbr_update_cycle_phase(struct sock *sk,
428 const struct rate_sample *rs)
429{
430 struct bbr *bbr = inet_csk_ca(sk);
431
432 if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw &&
433 bbr_is_next_cycle_phase(sk, rs))
434 bbr_advance_cycle_phase(sk);
435}
436
437static void bbr_reset_startup_mode(struct sock *sk)
438{
439 struct bbr *bbr = inet_csk_ca(sk);
440
441 bbr->mode = BBR_STARTUP;
442 bbr->pacing_gain = bbr_high_gain;
443 bbr->cwnd_gain = bbr_high_gain;
444}
445
446static void bbr_reset_probe_bw_mode(struct sock *sk)
447{
448 struct bbr *bbr = inet_csk_ca(sk);
449
450 bbr->mode = BBR_PROBE_BW;
451 bbr->pacing_gain = BBR_UNIT;
452 bbr->cwnd_gain = bbr_cwnd_gain;
453 bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
454 bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */
455}
456
457static void bbr_reset_mode(struct sock *sk)
458{
459 if (!bbr_full_bw_reached(sk))
460 bbr_reset_startup_mode(sk);
461 else
462 bbr_reset_probe_bw_mode(sk);
463}
464
465/* Start a new long-term sampling interval. */
466static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
467{
468 struct tcp_sock *tp = tcp_sk(sk);
469 struct bbr *bbr = inet_csk_ca(sk);
470
471 bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies;
472 bbr->lt_last_delivered = tp->delivered;
473 bbr->lt_last_lost = tp->lost;
474 bbr->lt_rtt_cnt = 0;
475}
476
477/* Completely reset long-term bandwidth sampling. */
478static void bbr_reset_lt_bw_sampling(struct sock *sk)
479{
480 struct bbr *bbr = inet_csk_ca(sk);
481
482 bbr->lt_bw = 0;
483 bbr->lt_use_bw = 0;
484 bbr->lt_is_sampling = false;
485 bbr_reset_lt_bw_sampling_interval(sk);
486}
487
488/* Long-term bw sampling interval is done. Estimate whether we're policed. */
489static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
490{
491 struct bbr *bbr = inet_csk_ca(sk);
492 u32 diff;
493
494 if (bbr->lt_bw) { /* do we have bw from a previous interval? */
495 /* Is new bw close to the lt_bw from the previous interval? */
496 diff = abs(bw - bbr->lt_bw);
497 if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
498 (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
499 bbr_lt_bw_diff)) {
500 /* All criteria are met; estimate we're policed. */
501 bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */
502 bbr->lt_use_bw = 1;
503 bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */
504 bbr->lt_rtt_cnt = 0;
505 return;
506 }
507 }
508 bbr->lt_bw = bw;
509 bbr_reset_lt_bw_sampling_interval(sk);
510}
511
512/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
513 * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
514 * explicitly models their policed rate, to reduce unnecessary losses. We
515 * estimate that we're policed if we see 2 consecutive sampling intervals with
516 * consistent throughput and high packet loss. If we think we're being policed,
517 * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
518 */
519static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
520{
521 struct tcp_sock *tp = tcp_sk(sk);
522 struct bbr *bbr = inet_csk_ca(sk);
523 u32 lost, delivered;
524 u64 bw;
525 s32 t;
526
527 if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */
528 if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
529 ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
530 bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */
531 bbr_reset_probe_bw_mode(sk); /* restart gain cycling */
532 }
533 return;
534 }
535
536 /* Wait for the first loss before sampling, to let the policer exhaust
537 * its tokens and estimate the steady-state rate allowed by the policer.
538 * Starting samples earlier includes bursts that over-estimate the bw.
539 */
540 if (!bbr->lt_is_sampling) {
541 if (!rs->losses)
542 return;
543 bbr_reset_lt_bw_sampling_interval(sk);
544 bbr->lt_is_sampling = true;
545 }
546
547 /* To avoid underestimates, reset sampling if we run out of data. */
548 if (rs->is_app_limited) {
549 bbr_reset_lt_bw_sampling(sk);
550 return;
551 }
552
553 if (bbr->round_start)
554 bbr->lt_rtt_cnt++; /* count round trips in this interval */
555 if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
556 return; /* sampling interval needs to be longer */
557 if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
558 bbr_reset_lt_bw_sampling(sk); /* interval is too long */
559 return;
560 }
561
562 /* End sampling interval when a packet is lost, so we estimate the
563 * policer tokens were exhausted. Stopping the sampling before the
564 * tokens are exhausted under-estimates the policed rate.
565 */
566 if (!rs->losses)
567 return;
568
569 /* Calculate packets lost and delivered in sampling interval. */
570 lost = tp->lost - bbr->lt_last_lost;
571 delivered = tp->delivered - bbr->lt_last_delivered;
572 /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
573 if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
574 return;
575
576 /* Find average delivery rate in this sampling interval. */
577 t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp);
578 if (t < 1)
579 return; /* interval is less than one jiffy, so wait */
580 t = jiffies_to_usecs(t);
581 /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */
582 if (t < 1) {
583 bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */
584 return;
585 }
586 bw = (u64)delivered * BW_UNIT;
587 do_div(bw, t);
588 bbr_lt_bw_interval_done(sk, bw);
589}
590
591/* Estimate the bandwidth based on how fast packets are delivered */
592static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
593{
594 struct tcp_sock *tp = tcp_sk(sk);
595 struct bbr *bbr = inet_csk_ca(sk);
596 u64 bw;
597
598 bbr->round_start = 0;
599 if (rs->delivered < 0 || rs->interval_us <= 0)
600 return; /* Not a valid observation */
601
602 /* See if we've reached the next RTT */
603 if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
604 bbr->next_rtt_delivered = tp->delivered;
605 bbr->rtt_cnt++;
606 bbr->round_start = 1;
607 bbr->packet_conservation = 0;
608 }
609
610 bbr_lt_bw_sampling(sk, rs);
611
612 /* Divide delivered by the interval to find a (lower bound) bottleneck
613 * bandwidth sample. Delivered is in packets and interval_us in uS and
614 * ratio will be <<1 for most connections. So delivered is first scaled.
615 */
616 bw = (u64)rs->delivered * BW_UNIT;
617 do_div(bw, rs->interval_us);
618
619 /* If this sample is application-limited, it is likely to have a very
620 * low delivered count that represents application behavior rather than
621 * the available network rate. Such a sample could drag down estimated
622 * bw, causing needless slow-down. Thus, to continue to send at the
623 * last measured network rate, we filter out app-limited samples unless
624 * they describe the path bw at least as well as our bw model.
625 *
626 * So the goal during app-limited phase is to proceed with the best
627 * network rate no matter how long. We automatically leave this
628 * phase when app writes faster than the network can deliver :)
629 */
630 if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
631 /* Incorporate new sample into our max bw filter. */
632 minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
633 }
634}
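/* Illustrative sketch with assumed figures: a rate sample of 50 packets
 * delivered over 5000 usec yields bw = 50 * BW_UNIT / 5000 = 167772
 * (~0.01 pkt/uSec, ~10,000 pkt/sec). That sample then competes in the
 * bbr_bw_rtts = 10 round windowed-max filter above, and an app-limited
 * sample only enters the filter if it is at least as large as the current
 * estimate.
 */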
635
636/* Estimate when the pipe is full, using the change in delivery rate: BBR
637 * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
638 * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
639 * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
640 * higher rwin, 3: we get higher delivery rate samples. Or transient
641 * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
642 * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
643 */
644static void bbr_check_full_bw_reached(struct sock *sk,
645 const struct rate_sample *rs)
646{
647 struct bbr *bbr = inet_csk_ca(sk);
648 u32 bw_thresh;
649
650 if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
651 return;
652
653 bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
654 if (bbr_max_bw(sk) >= bw_thresh) {
655 bbr->full_bw = bbr_max_bw(sk);
656 bbr->full_bw_cnt = 0;
657 return;
658 }
659 ++bbr->full_bw_cnt;
660}
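/* Worked example (illustrative figures): if full_bw = 1000 at the end of a
 * round, bw_thresh = 1000 * 320 >> 8 = 1250, so the next round must raise the
 * windowed max bw by at least 25% to reset the count; after bbr_full_bw_cnt
 * = 3 consecutive rounds below that threshold, bbr_full_bw_reached() becomes
 * true and bbr_check_drain() will move the flow from STARTUP to DRAIN.
 */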
661
662/* If pipe is probably full, drain the queue and then enter steady-state. */
663static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
664{
665 struct bbr *bbr = inet_csk_ca(sk);
666
667 if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
668 bbr->mode = BBR_DRAIN; /* drain queue we created */
669 bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
670 bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
671 } /* fall through to check if in-flight is already small: */
672 if (bbr->mode == BBR_DRAIN &&
673 tcp_packets_in_flight(tcp_sk(sk)) <=
674 bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
675 bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
676}
677
678/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
679 * periodically drain the bottleneck queue, to converge to measure the true
680 * min_rtt (unloaded propagation delay). This allows the flows to keep queues
681 * small (reducing queuing delay and packet loss) and achieve fairness among
682 * BBR flows.
683 *
684 * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
685 * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
686 * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
687 * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
688 * re-enter the previous mode. BBR uses 200ms to approximately bound the
689 * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
690 *
691 * Note that flows need only pay 2% if they are busy sending over the last 10
692 * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
693 * natural silences or low-rate periods within 10 seconds where the rate is low
694 * enough for long enough to drain its queue in the bottleneck. We pick up
695 * these min RTT measurements opportunistically with our min_rtt filter. :-)
696 */
697static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
698{
699 struct tcp_sock *tp = tcp_sk(sk);
700 struct bbr *bbr = inet_csk_ca(sk);
701 bool filter_expired;
702
703 /* Track min RTT seen in the min_rtt_win_sec filter window: */
704 filter_expired = after(tcp_time_stamp,
705 bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
706 if (rs->rtt_us >= 0 &&
707 (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
708 bbr->min_rtt_us = rs->rtt_us;
709 bbr->min_rtt_stamp = tcp_time_stamp;
710 }
711
712 if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
713 !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
714 bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
715 bbr->pacing_gain = BBR_UNIT;
716 bbr->cwnd_gain = BBR_UNIT;
717 bbr_save_cwnd(sk); /* note cwnd so we can restore it */
718 bbr->probe_rtt_done_stamp = 0;
719 }
720
721 if (bbr->mode == BBR_PROBE_RTT) {
722 /* Ignore low rate samples during this mode. */
723 tp->app_limited =
724 (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
725 /* Maintain min packets in flight for max(200 ms, 1 round). */
726 if (!bbr->probe_rtt_done_stamp &&
727 tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
728 bbr->probe_rtt_done_stamp = tcp_time_stamp +
729 msecs_to_jiffies(bbr_probe_rtt_mode_ms);
730 bbr->probe_rtt_round_done = 0;
731 bbr->next_rtt_delivered = tp->delivered;
732 } else if (bbr->probe_rtt_done_stamp) {
733 if (bbr->round_start)
734 bbr->probe_rtt_round_done = 1;
735 if (bbr->probe_rtt_round_done &&
736 after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) {
737 bbr->min_rtt_stamp = tcp_time_stamp;
738 bbr->restore_cwnd = 1; /* snap to prior_cwnd */
739 bbr_reset_mode(sk);
740 }
741 }
742 }
743 bbr->idle_restart = 0;
744}
745
746static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
747{
748 bbr_update_bw(sk, rs);
749 bbr_update_cycle_phase(sk, rs);
750 bbr_check_full_bw_reached(sk, rs);
751 bbr_check_drain(sk, rs);
752 bbr_update_min_rtt(sk, rs);
753}
754
755static void bbr_main(struct sock *sk, const struct rate_sample *rs)
756{
757 struct bbr *bbr = inet_csk_ca(sk);
758 u32 bw;
759
760 bbr_update_model(sk, rs);
761
762 bw = bbr_bw(sk);
763 bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
764 bbr_set_tso_segs_goal(sk);
765 bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
766}
767
768static void bbr_init(struct sock *sk)
769{
770 struct tcp_sock *tp = tcp_sk(sk);
771 struct bbr *bbr = inet_csk_ca(sk);
772 u64 bw;
773
774 bbr->prior_cwnd = 0;
775 bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */
776 bbr->rtt_cnt = 0;
777 bbr->next_rtt_delivered = 0;
778 bbr->prev_ca_state = TCP_CA_Open;
779 bbr->packet_conservation = 0;
780
781 bbr->probe_rtt_done_stamp = 0;
782 bbr->probe_rtt_round_done = 0;
783 bbr->min_rtt_us = tcp_min_rtt(tp);
784 bbr->min_rtt_stamp = tcp_time_stamp;
785
786 minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */
787
788 /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
789 bw = (u64)tp->snd_cwnd * BW_UNIT;
790 do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC);
791 sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */
792 bbr_set_pacing_rate(sk, bw, bbr_high_gain);
793
794 bbr->restore_cwnd = 0;
795 bbr->round_start = 0;
796 bbr->idle_restart = 0;
797 bbr->full_bw = 0;
798 bbr->full_bw_cnt = 0;
799 bbr->cycle_mstamp.v64 = 0;
800 bbr->cycle_idx = 0;
801 bbr_reset_lt_bw_sampling(sk);
802 bbr_reset_startup_mode(sk);
803}
804
805static u32 bbr_sndbuf_expand(struct sock *sk)
806{
807 /* Provision 3 * cwnd since BBR may slow-start even during recovery. */
808 return 3;
809}
810
811/* In theory BBR does not need to undo the cwnd since it does not
812 * always reduce cwnd on losses (see bbr_main()). Keep it for now.
813 */
814static u32 bbr_undo_cwnd(struct sock *sk)
815{
816 return tcp_sk(sk)->snd_cwnd;
817}
818
819/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
820static u32 bbr_ssthresh(struct sock *sk)
821{
822 bbr_save_cwnd(sk);
823 return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */
824}
825
826static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
827 union tcp_cc_info *info)
828{
829 if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
830 ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
831 struct tcp_sock *tp = tcp_sk(sk);
832 struct bbr *bbr = inet_csk_ca(sk);
833 u64 bw = bbr_bw(sk);
834
835 bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
836 memset(&info->bbr, 0, sizeof(info->bbr));
837 info->bbr.bbr_bw_lo = (u32)bw;
838 info->bbr.bbr_bw_hi = (u32)(bw >> 32);
839 info->bbr.bbr_min_rtt = bbr->min_rtt_us;
840 info->bbr.bbr_pacing_gain = bbr->pacing_gain;
841 info->bbr.bbr_cwnd_gain = bbr->cwnd_gain;
842 *attr = INET_DIAG_BBRINFO;
843 return sizeof(info->bbr);
844 }
845 return 0;
846}
847
848static void bbr_set_state(struct sock *sk, u8 new_state)
849{
850 struct bbr *bbr = inet_csk_ca(sk);
851
852 if (new_state == TCP_CA_Loss) {
853 struct rate_sample rs = { .losses = 1 };
854
855 bbr->prev_ca_state = TCP_CA_Loss;
856 bbr->full_bw = 0;
857 bbr->round_start = 1; /* treat RTO like end of a round */
858 bbr_lt_bw_sampling(sk, &rs);
859 }
860}
861
862static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
863 .flags = TCP_CONG_NON_RESTRICTED,
864 .name = "bbr",
865 .owner = THIS_MODULE,
866 .init = bbr_init,
867 .cong_control = bbr_main,
868 .sndbuf_expand = bbr_sndbuf_expand,
869 .undo_cwnd = bbr_undo_cwnd,
870 .cwnd_event = bbr_cwnd_event,
871 .ssthresh = bbr_ssthresh,
872 .tso_segs_goal = bbr_tso_segs_goal,
873 .get_info = bbr_get_info,
874 .set_state = bbr_set_state,
875};
876
877static int __init bbr_register(void)
878{
879 BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
880 return tcp_register_congestion_control(&tcp_bbr_cong_ops);
881}
882
883static void __exit bbr_unregister(void)
884{
885 tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
886}
887
888module_init(bbr_register);
889module_exit(bbr_unregister);
890
891MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
892MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
893MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
894MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
895MODULE_LICENSE("Dual BSD/GPL");
896MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 03725b294286..35b280361cb2 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -56,7 +56,7 @@ MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
56module_param(use_tolerance, bool, 0644); 56module_param(use_tolerance, bool, 0644);
57MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); 57MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
58 58
59struct minmax { 59struct cdg_minmax {
60 union { 60 union {
61 struct { 61 struct {
62 s32 min; 62 s32 min;
@@ -74,10 +74,10 @@ enum cdg_state {
74}; 74};
75 75
76struct cdg { 76struct cdg {
77 struct minmax rtt; 77 struct cdg_minmax rtt;
78 struct minmax rtt_prev; 78 struct cdg_minmax rtt_prev;
79 struct minmax *gradients; 79 struct cdg_minmax *gradients;
80 struct minmax gsum; 80 struct cdg_minmax gsum;
81 bool gfilled; 81 bool gfilled;
82 u8 tail; 82 u8 tail;
83 u8 state; 83 u8 state;
@@ -353,7 +353,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
353{ 353{
354 struct cdg *ca = inet_csk_ca(sk); 354 struct cdg *ca = inet_csk_ca(sk);
355 struct tcp_sock *tp = tcp_sk(sk); 355 struct tcp_sock *tp = tcp_sk(sk);
356 struct minmax *gradients; 356 struct cdg_minmax *gradients;
357 357
358 switch (ev) { 358 switch (ev) {
359 case CA_EVENT_CWND_RESTART: 359 case CA_EVENT_CWND_RESTART:
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 882caa4e72bc..1294af4e0127 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
69 int ret = 0; 69 int ret = 0;
70 70
71 /* all algorithms must implement ssthresh and cong_avoid ops */ 71 /* all algorithms must implement ssthresh and cong_avoid ops */
72 if (!ca->ssthresh || !ca->cong_avoid) { 72 if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) {
73 pr_err("%s does not implement required ops\n", ca->name); 73 pr_err("%s does not implement required ops\n", ca->name);
74 return -EINVAL; 74 return -EINVAL;
75 } 75 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a756b8749a26..a27b9c0e27c0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
289static void tcp_sndbuf_expand(struct sock *sk) 289static void tcp_sndbuf_expand(struct sock *sk)
290{ 290{
291 const struct tcp_sock *tp = tcp_sk(sk); 291 const struct tcp_sock *tp = tcp_sk(sk);
292 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
292 int sndmem, per_mss; 293 int sndmem, per_mss;
293 u32 nr_segs; 294 u32 nr_segs;
294 295
@@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
309 * Cubic needs 1.7 factor, rounded to 2 to include 310 * Cubic needs 1.7 factor, rounded to 2 to include
310 * extra cushion (application might react slowly to POLLOUT) 311 * extra cushion (application might react slowly to POLLOUT)
311 */ 312 */
312 sndmem = 2 * nr_segs * per_mss; 313 sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
314 sndmem *= nr_segs * per_mss;
313 315
314 if (sk->sk_sndbuf < sndmem) 316 if (sk->sk_sndbuf < sndmem)
315 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); 317 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
@@ -899,12 +901,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
899 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; 901 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
900} 902}
901 903
904/* Sum the number of packets on the wire we have marked as lost.
905 * There are two cases we care about here:
906 * a) Packet hasn't been marked lost (nor retransmitted),
907 * and this is the first loss.
908 * b) Packet has been marked both lost and retransmitted,
909 * and this means we think it was lost again.
910 */
911static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
912{
913 __u8 sacked = TCP_SKB_CB(skb)->sacked;
914
915 if (!(sacked & TCPCB_LOST) ||
916 ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
917 tp->lost += tcp_skb_pcount(skb);
918}
919
902static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) 920static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
903{ 921{
904 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { 922 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
905 tcp_verify_retransmit_hint(tp, skb); 923 tcp_verify_retransmit_hint(tp, skb);
906 924
907 tp->lost_out += tcp_skb_pcount(skb); 925 tp->lost_out += tcp_skb_pcount(skb);
926 tcp_sum_lost(tp, skb);
908 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 927 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
909 } 928 }
910} 929}
@@ -913,6 +932,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
913{ 932{
914 tcp_verify_retransmit_hint(tp, skb); 933 tcp_verify_retransmit_hint(tp, skb);
915 934
935 tcp_sum_lost(tp, skb);
916 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { 936 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
917 tp->lost_out += tcp_skb_pcount(skb); 937 tp->lost_out += tcp_skb_pcount(skb);
918 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 938 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1094,6 +1114,7 @@ struct tcp_sacktag_state {
1094 */ 1114 */
1095 struct skb_mstamp first_sackt; 1115 struct skb_mstamp first_sackt;
1096 struct skb_mstamp last_sackt; 1116 struct skb_mstamp last_sackt;
1117 struct rate_sample *rate;
1097 int flag; 1118 int flag;
1098}; 1119};
1099 1120
@@ -1261,6 +1282,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1261 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, 1282 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1262 start_seq, end_seq, dup_sack, pcount, 1283 start_seq, end_seq, dup_sack, pcount,
1263 &skb->skb_mstamp); 1284 &skb->skb_mstamp);
1285 tcp_rate_skb_delivered(sk, skb, state->rate);
1264 1286
1265 if (skb == tp->lost_skb_hint) 1287 if (skb == tp->lost_skb_hint)
1266 tp->lost_cnt_hint += pcount; 1288 tp->lost_cnt_hint += pcount;
@@ -1311,6 +1333,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1311 tcp_advance_highest_sack(sk, skb); 1333 tcp_advance_highest_sack(sk, skb);
1312 1334
1313 tcp_skb_collapse_tstamp(prev, skb); 1335 tcp_skb_collapse_tstamp(prev, skb);
1336 if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
1337 TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
1338
1314 tcp_unlink_write_queue(skb, sk); 1339 tcp_unlink_write_queue(skb, sk);
1315 sk_wmem_free_skb(sk, skb); 1340 sk_wmem_free_skb(sk, skb);
1316 1341
@@ -1540,6 +1565,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1540 dup_sack, 1565 dup_sack,
1541 tcp_skb_pcount(skb), 1566 tcp_skb_pcount(skb),
1542 &skb->skb_mstamp); 1567 &skb->skb_mstamp);
1568 tcp_rate_skb_delivered(sk, skb, state->rate);
1543 1569
1544 if (!before(TCP_SKB_CB(skb)->seq, 1570 if (!before(TCP_SKB_CB(skb)->seq,
1545 tcp_highest_sack_seq(tp))) 1571 tcp_highest_sack_seq(tp)))
@@ -1622,8 +1648,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1622 1648
1623 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, 1649 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
1624 num_sacks, prior_snd_una); 1650 num_sacks, prior_snd_una);
1625 if (found_dup_sack) 1651 if (found_dup_sack) {
1626 state->flag |= FLAG_DSACKING_ACK; 1652 state->flag |= FLAG_DSACKING_ACK;
1653 tp->delivered++; /* A spurious retransmission is delivered */
1654 }
1627 1655
1628 /* Eliminate too old ACKs, but take into 1656 /* Eliminate too old ACKs, but take into
1629 * account more or less fresh ones, they can 1657 * account more or less fresh ones, they can
@@ -1890,6 +1918,7 @@ void tcp_enter_loss(struct sock *sk)
1890 struct sk_buff *skb; 1918 struct sk_buff *skb;
1891 bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; 1919 bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
1892 bool is_reneg; /* is receiver reneging on SACKs? */ 1920 bool is_reneg; /* is receiver reneging on SACKs? */
1921 bool mark_lost;
1893 1922
1894 /* Reduce ssthresh if it has not yet been made inside this window. */ 1923 /* Reduce ssthresh if it has not yet been made inside this window. */
1895 if (icsk->icsk_ca_state <= TCP_CA_Disorder || 1924 if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1923,8 +1952,12 @@ void tcp_enter_loss(struct sock *sk)
1923 if (skb == tcp_send_head(sk)) 1952 if (skb == tcp_send_head(sk))
1924 break; 1953 break;
1925 1954
1955 mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
1956 is_reneg);
1957 if (mark_lost)
1958 tcp_sum_lost(tp, skb);
1926 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; 1959 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
1927 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { 1960 if (mark_lost) {
1928 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; 1961 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1929 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1962 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1930 tp->lost_out += tcp_skb_pcount(skb); 1963 tp->lost_out += tcp_skb_pcount(skb);
@@ -2502,6 +2535,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
2502{ 2535{
2503 struct tcp_sock *tp = tcp_sk(sk); 2536 struct tcp_sock *tp = tcp_sk(sk);
2504 2537
2538 if (inet_csk(sk)->icsk_ca_ops->cong_control)
2539 return;
2540
2505 /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ 2541 /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
2506 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || 2542 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
2507 (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { 2543 (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
@@ -2878,67 +2914,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2878 *rexmit = REXMIT_LOST; 2914 *rexmit = REXMIT_LOST;
2879} 2915}
2880 2916
2881/* Kathleen Nichols' algorithm for tracking the minimum value of
2882 * a data stream over some fixed time interval. (E.g., the minimum
2883 * RTT over the past five minutes.) It uses constant space and constant
2884 * time per update yet almost always delivers the same minimum as an
2885 * implementation that has to keep all the data in the window.
2886 *
2887 * The algorithm keeps track of the best, 2nd best & 3rd best min
2888 * values, maintaining an invariant that the measurement time of the
2889 * n'th best >= n-1'th best. It also makes sure that the three values
2890 * are widely separated in the time window since that bounds the worse
2891 * case error when that data is monotonically increasing over the window.
2892 *
2893 * Upon getting a new min, we can forget everything earlier because it
2894 * has no value - the new min is <= everything else in the window by
2895 * definition and it's the most recent. So we restart fresh on every new min
2896 * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
2897 * best.
2898 */
2899static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) 2917static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
2900{ 2918{
2901 const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; 2919 struct tcp_sock *tp = tcp_sk(sk);
2902 struct rtt_meas *m = tcp_sk(sk)->rtt_min; 2920 u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
2903 struct rtt_meas rttm = { 2921
2904 .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1), 2922 minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
2905 .ts = now, 2923 rtt_us ? : jiffies_to_usecs(1));
2906 };
2907 u32 elapsed;
2908
2909 /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
2910 if (unlikely(rttm.rtt <= m[0].rtt))
2911 m[0] = m[1] = m[2] = rttm;
2912 else if (rttm.rtt <= m[1].rtt)
2913 m[1] = m[2] = rttm;
2914 else if (rttm.rtt <= m[2].rtt)
2915 m[2] = rttm;
2916
2917 elapsed = now - m[0].ts;
2918 if (unlikely(elapsed > wlen)) {
2919 /* Passed entire window without a new min so make 2nd choice
2920 * the new min & 3rd choice the new 2nd. So forth and so on.
2921 */
2922 m[0] = m[1];
2923 m[1] = m[2];
2924 m[2] = rttm;
2925 if (now - m[0].ts > wlen) {
2926 m[0] = m[1];
2927 m[1] = rttm;
2928 if (now - m[0].ts > wlen)
2929 m[0] = rttm;
2930 }
2931 } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
2932 /* Passed a quarter of the window without a new min so
2933 * take 2nd choice from the 2nd quarter of the window.
2934 */
2935 m[2] = m[1] = rttm;
2936 } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
2937 /* Passed half the window without a new min so take the 3rd
2938 * choice from the last half of the window.
2939 */
2940 m[2] = rttm;
2941 }
2942} 2924}
2943 2925
2944static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, 2926static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
@@ -3101,10 +3083,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
3101 */ 3083 */
3102static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, 3084static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3103 u32 prior_snd_una, int *acked, 3085 u32 prior_snd_una, int *acked,
3104 struct tcp_sacktag_state *sack) 3086 struct tcp_sacktag_state *sack,
3087 struct skb_mstamp *now)
3105{ 3088{
3106 const struct inet_connection_sock *icsk = inet_csk(sk); 3089 const struct inet_connection_sock *icsk = inet_csk(sk);
3107 struct skb_mstamp first_ackt, last_ackt, now; 3090 struct skb_mstamp first_ackt, last_ackt;
3108 struct tcp_sock *tp = tcp_sk(sk); 3091 struct tcp_sock *tp = tcp_sk(sk);
3109 u32 prior_sacked = tp->sacked_out; 3092 u32 prior_sacked = tp->sacked_out;
3110 u32 reord = tp->packets_out; 3093 u32 reord = tp->packets_out;
@@ -3136,7 +3119,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3136 acked_pcount = tcp_tso_acked(sk, skb); 3119 acked_pcount = tcp_tso_acked(sk, skb);
3137 if (!acked_pcount) 3120 if (!acked_pcount)
3138 break; 3121 break;
3139
3140 fully_acked = false; 3122 fully_acked = false;
3141 } else { 3123 } else {
3142 /* Speedup tcp_unlink_write_queue() and next loop */ 3124 /* Speedup tcp_unlink_write_queue() and next loop */
@@ -3172,6 +3154,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3172 3154
3173 tp->packets_out -= acked_pcount; 3155 tp->packets_out -= acked_pcount;
3174 pkts_acked += acked_pcount; 3156 pkts_acked += acked_pcount;
3157 tcp_rate_skb_delivered(sk, skb, sack->rate);
3175 3158
3176 /* Initial outgoing SYN's get put onto the write_queue 3159 /* Initial outgoing SYN's get put onto the write_queue
3177 * just like anything else we transmit. It is not 3160 * just like anything else we transmit. It is not
@@ -3204,16 +3187,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3204 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 3187 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3205 flag |= FLAG_SACK_RENEGING; 3188 flag |= FLAG_SACK_RENEGING;
3206 3189
3207 skb_mstamp_get(&now);
3208 if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { 3190 if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
3209 seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); 3191 seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
3210 ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); 3192 ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
3211 } 3193 }
3212 if (sack->first_sackt.v64) { 3194 if (sack->first_sackt.v64) {
3213 sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt); 3195 sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
3214 ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); 3196 ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
3215 } 3197 }
3216 3198 sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
3217 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, 3199 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3218 ca_rtt_us); 3200 ca_rtt_us);
3219 3201
@@ -3241,7 +3223,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3241 tp->fackets_out -= min(pkts_acked, tp->fackets_out); 3223 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
3242 3224
3243 } else if (skb && rtt_update && sack_rtt_us >= 0 && 3225 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3244 sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { 3226 sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
3245 /* Do not re-arm RTO if the sack RTT is measured from data sent 3227 /* Do not re-arm RTO if the sack RTT is measured from data sent
3246 * after when the head was last (re)transmitted. Otherwise the 3228 * after when the head was last (re)transmitted. Otherwise the
3247 * timeout may continue to extend in loss recovery. 3229 * timeout may continue to extend in loss recovery.
@@ -3332,8 +3314,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3332 * information. All transmission or retransmission are delayed afterwards. 3314 * information. All transmission or retransmission are delayed afterwards.
3333 */ 3315 */
3334static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, 3316static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
3335 int flag) 3317 int flag, const struct rate_sample *rs)
3336{ 3318{
3319 const struct inet_connection_sock *icsk = inet_csk(sk);
3320
3321 if (icsk->icsk_ca_ops->cong_control) {
3322 icsk->icsk_ca_ops->cong_control(sk, rs);
3323 return;
3324 }
3325
3337 if (tcp_in_cwnd_reduction(sk)) { 3326 if (tcp_in_cwnd_reduction(sk)) {
3338 /* Reduce cwnd if state mandates */ 3327 /* Reduce cwnd if state mandates */
3339 tcp_cwnd_reduction(sk, acked_sacked, flag); 3328 tcp_cwnd_reduction(sk, acked_sacked, flag);
@@ -3578,17 +3567,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3578 struct inet_connection_sock *icsk = inet_csk(sk); 3567 struct inet_connection_sock *icsk = inet_csk(sk);
3579 struct tcp_sock *tp = tcp_sk(sk); 3568 struct tcp_sock *tp = tcp_sk(sk);
3580 struct tcp_sacktag_state sack_state; 3569 struct tcp_sacktag_state sack_state;
3570 struct rate_sample rs = { .prior_delivered = 0 };
3581 u32 prior_snd_una = tp->snd_una; 3571 u32 prior_snd_una = tp->snd_una;
3582 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3572 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3583 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3573 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3584 bool is_dupack = false; 3574 bool is_dupack = false;
3585 u32 prior_fackets; 3575 u32 prior_fackets;
3586 int prior_packets = tp->packets_out; 3576 int prior_packets = tp->packets_out;
3587 u32 prior_delivered = tp->delivered; 3577 u32 delivered = tp->delivered;
3578 u32 lost = tp->lost;
3588 int acked = 0; /* Number of packets newly acked */ 3579 int acked = 0; /* Number of packets newly acked */
3589 int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ 3580 int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
3581 struct skb_mstamp now;
3590 3582
3591 sack_state.first_sackt.v64 = 0; 3583 sack_state.first_sackt.v64 = 0;
3584 sack_state.rate = &rs;
3592 3585
3593 /* We very likely will need to access write queue head. */ 3586 /* We very likely will need to access write queue head. */
3594 prefetchw(sk->sk_write_queue.next); 3587 prefetchw(sk->sk_write_queue.next);
@@ -3611,6 +3604,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3611 if (after(ack, tp->snd_nxt)) 3604 if (after(ack, tp->snd_nxt))
3612 goto invalid_ack; 3605 goto invalid_ack;
3613 3606
3607 skb_mstamp_get(&now);
3608
3614 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 3609 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3615 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) 3610 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3616 tcp_rearm_rto(sk); 3611 tcp_rearm_rto(sk);
@@ -3621,6 +3616,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3621 } 3616 }
3622 3617
3623 prior_fackets = tp->fackets_out; 3618 prior_fackets = tp->fackets_out;
3619 rs.prior_in_flight = tcp_packets_in_flight(tp);
3624 3620
3625 /* ts_recent update must be made after we are sure that the packet 3621 /* ts_recent update must be made after we are sure that the packet
3626 * is in window. 3622 * is in window.
@@ -3676,7 +3672,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3676 3672
3677 /* See if we can take anything off of the retransmit queue. */ 3673 /* See if we can take anything off of the retransmit queue. */
3678 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, 3674 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
3679 &sack_state); 3675 &sack_state, &now);
3680 3676
3681 if (tcp_ack_is_dubious(sk, flag)) { 3677 if (tcp_ack_is_dubious(sk, flag)) {
3682 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3678 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3693,7 +3689,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3693 3689
3694 if (icsk->icsk_pending == ICSK_TIME_RETRANS) 3690 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3695 tcp_schedule_loss_probe(sk); 3691 tcp_schedule_loss_probe(sk);
3696 tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag); 3692 delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
3693 lost = tp->lost - lost; /* freshly marked lost */
3694 tcp_rate_gen(sk, delivered, lost, &now, &rs);
3695 tcp_cong_control(sk, ack, delivered, flag, &rs);
3697 tcp_xmit_recovery(sk, rexmit); 3696 tcp_xmit_recovery(sk, rexmit);
3698 return 1; 3697 return 1;
3699 3698
@@ -4107,7 +4106,7 @@ void tcp_fin(struct sock *sk)
4107 /* It _is_ possible, that we have something out-of-order _after_ FIN. 4106 /* It _is_ possible, that we have something out-of-order _after_ FIN.
4108 * Probably, we should reset in this case. For now drop them. 4107 * Probably, we should reset in this case. For now drop them.
4109 */ 4108 */
4110 __skb_queue_purge(&tp->out_of_order_queue); 4109 skb_rbtree_purge(&tp->out_of_order_queue);
4111 if (tcp_is_sack(tp)) 4110 if (tcp_is_sack(tp))
4112 tcp_sack_reset(&tp->rx_opt); 4111 tcp_sack_reset(&tp->rx_opt);
4113 sk_mem_reclaim(sk); 4112 sk_mem_reclaim(sk);
@@ -4267,7 +4266,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
4267 int this_sack; 4266 int this_sack;
4268 4267
4269 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ 4268 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
4270 if (skb_queue_empty(&tp->out_of_order_queue)) { 4269 if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4271 tp->rx_opt.num_sacks = 0; 4270 tp->rx_opt.num_sacks = 0;
4272 return; 4271 return;
4273 } 4272 }
@@ -4343,10 +4342,13 @@ static void tcp_ofo_queue(struct sock *sk)
4343{ 4342{
4344 struct tcp_sock *tp = tcp_sk(sk); 4343 struct tcp_sock *tp = tcp_sk(sk);
4345 __u32 dsack_high = tp->rcv_nxt; 4344 __u32 dsack_high = tp->rcv_nxt;
4345 bool fin, fragstolen, eaten;
4346 struct sk_buff *skb, *tail; 4346 struct sk_buff *skb, *tail;
4347 bool fragstolen, eaten; 4347 struct rb_node *p;
4348 4348
4349 while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { 4349 p = rb_first(&tp->out_of_order_queue);
4350 while (p) {
4351 skb = rb_entry(p, struct sk_buff, rbnode);
4350 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 4352 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4351 break; 4353 break;
4352 4354
@@ -4356,9 +4358,10 @@ static void tcp_ofo_queue(struct sock *sk)
4356 dsack_high = TCP_SKB_CB(skb)->end_seq; 4358 dsack_high = TCP_SKB_CB(skb)->end_seq;
4357 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); 4359 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
4358 } 4360 }
4361 p = rb_next(p);
4362 rb_erase(&skb->rbnode, &tp->out_of_order_queue);
4359 4363
4360 __skb_unlink(skb, &tp->out_of_order_queue); 4364 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4361 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4362 SOCK_DEBUG(sk, "ofo packet was already received\n"); 4365 SOCK_DEBUG(sk, "ofo packet was already received\n");
4363 tcp_drop(sk, skb); 4366 tcp_drop(sk, skb);
4364 continue; 4367 continue;
@@ -4370,12 +4373,19 @@ static void tcp_ofo_queue(struct sock *sk)
4370 tail = skb_peek_tail(&sk->sk_receive_queue); 4373 tail = skb_peek_tail(&sk->sk_receive_queue);
4371 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); 4374 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
4372 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); 4375 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4376 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
4373 if (!eaten) 4377 if (!eaten)
4374 __skb_queue_tail(&sk->sk_receive_queue, skb); 4378 __skb_queue_tail(&sk->sk_receive_queue, skb);
4375 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 4379 else
4376 tcp_fin(sk);
4377 if (eaten)
4378 kfree_skb_partial(skb, fragstolen); 4380 kfree_skb_partial(skb, fragstolen);
4381
4382 if (unlikely(fin)) {
4383 tcp_fin(sk);
4384 /* tcp_fin() purges tp->out_of_order_queue,
4385 * so we must end this loop right now.
4386 */
4387 break;
4388 }
4379 } 4389 }
4380} 4390}
4381 4391
@@ -4391,12 +4401,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4391 if (tcp_prune_queue(sk) < 0) 4401 if (tcp_prune_queue(sk) < 0)
4392 return -1; 4402 return -1;
4393 4403
4394 if (!sk_rmem_schedule(sk, skb, size)) { 4404 while (!sk_rmem_schedule(sk, skb, size)) {
4395 if (!tcp_prune_ofo_queue(sk)) 4405 if (!tcp_prune_ofo_queue(sk))
4396 return -1; 4406 return -1;
4397
4398 if (!sk_rmem_schedule(sk, skb, size))
4399 return -1;
4400 } 4407 }
4401 } 4408 }
4402 return 0; 4409 return 0;
@@ -4405,8 +4412,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4405static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) 4412static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4406{ 4413{
4407 struct tcp_sock *tp = tcp_sk(sk); 4414 struct tcp_sock *tp = tcp_sk(sk);
4415 struct rb_node **p, *q, *parent;
4408 struct sk_buff *skb1; 4416 struct sk_buff *skb1;
4409 u32 seq, end_seq; 4417 u32 seq, end_seq;
4418 bool fragstolen;
4410 4419
4411 tcp_ecn_check_ce(tp, skb); 4420 tcp_ecn_check_ce(tp, skb);
4412 4421
@@ -4421,88 +4430,92 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4421 inet_csk_schedule_ack(sk); 4430 inet_csk_schedule_ack(sk);
4422 4431
4423 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); 4432 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
4433 seq = TCP_SKB_CB(skb)->seq;
4434 end_seq = TCP_SKB_CB(skb)->end_seq;
4424 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", 4435 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4425 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); 4436 tp->rcv_nxt, seq, end_seq);
4426 4437
4427 skb1 = skb_peek_tail(&tp->out_of_order_queue); 4438 p = &tp->out_of_order_queue.rb_node;
4428 if (!skb1) { 4439 if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4429 /* Initial out of order segment, build 1 SACK. */ 4440 /* Initial out of order segment, build 1 SACK. */
4430 if (tcp_is_sack(tp)) { 4441 if (tcp_is_sack(tp)) {
4431 tp->rx_opt.num_sacks = 1; 4442 tp->rx_opt.num_sacks = 1;
4432 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; 4443 tp->selective_acks[0].start_seq = seq;
4433 tp->selective_acks[0].end_seq = 4444 tp->selective_acks[0].end_seq = end_seq;
4434 TCP_SKB_CB(skb)->end_seq;
4435 } 4445 }
4436 __skb_queue_head(&tp->out_of_order_queue, skb); 4446 rb_link_node(&skb->rbnode, NULL, p);
4447 rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
4448 tp->ooo_last_skb = skb;
4437 goto end; 4449 goto end;
4438 } 4450 }
4439 4451
4440 seq = TCP_SKB_CB(skb)->seq; 4452 /* In the typical case, we are adding an skb to the end of the list.
4441 end_seq = TCP_SKB_CB(skb)->end_seq; 4453 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
4442 4454 */
4443 if (seq == TCP_SKB_CB(skb1)->end_seq) { 4455 if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
4444 bool fragstolen; 4456coalesce_done:
4445 4457 tcp_grow_window(sk, skb);
4446 if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { 4458 kfree_skb_partial(skb, fragstolen);
4447 __skb_queue_after(&tp->out_of_order_queue, skb1, skb); 4459 skb = NULL;
4448 } else { 4460 goto add_sack;
4449 tcp_grow_window(sk, skb); 4461 }
4450 kfree_skb_partial(skb, fragstolen); 4462 /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
4451 skb = NULL; 4463 if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
4452 } 4464 parent = &tp->ooo_last_skb->rbnode;
4453 4465 p = &parent->rb_right;
4454 if (!tp->rx_opt.num_sacks || 4466 goto insert;
4455 tp->selective_acks[0].end_seq != seq) 4467 }
4456 goto add_sack; 4468
4457 4469 /* Find place to insert this segment. Handle overlaps on the way. */
4458 /* Common case: data arrive in order after hole. */ 4470 parent = NULL;
4459 tp->selective_acks[0].end_seq = end_seq; 4471 while (*p) {
4460 goto end; 4472 parent = *p;
4461 } 4473 skb1 = rb_entry(parent, struct sk_buff, rbnode);
4462 4474 if (before(seq, TCP_SKB_CB(skb1)->seq)) {
4463 /* Find place to insert this segment. */ 4475 p = &parent->rb_left;
4464 while (1) { 4476 continue;
4465 if (!after(TCP_SKB_CB(skb1)->seq, seq))
4466 break;
4467 if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
4468 skb1 = NULL;
4469 break;
4470 }
4471 skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
4472 }
4473
4474 /* Do skb overlap to previous one? */
4475 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4476 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4477 /* All the bits are present. Drop. */
4478 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4479 tcp_drop(sk, skb);
4480 skb = NULL;
4481 tcp_dsack_set(sk, seq, end_seq);
4482 goto add_sack;
4483 } 4477 }
4484 if (after(seq, TCP_SKB_CB(skb1)->seq)) { 4478 if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4485 /* Partial overlap. */ 4479 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4486 tcp_dsack_set(sk, seq, 4480 /* All the bits are present. Drop. */
4487 TCP_SKB_CB(skb1)->end_seq); 4481 NET_INC_STATS(sock_net(sk),
4488 } else { 4482 LINUX_MIB_TCPOFOMERGE);
4489 if (skb_queue_is_first(&tp->out_of_order_queue, 4483 __kfree_skb(skb);
4490 skb1)) 4484 skb = NULL;
4491 skb1 = NULL; 4485 tcp_dsack_set(sk, seq, end_seq);
4492 else 4486 goto add_sack;
4493 skb1 = skb_queue_prev( 4487 }
4494 &tp->out_of_order_queue, 4488 if (after(seq, TCP_SKB_CB(skb1)->seq)) {
4495 skb1); 4489 /* Partial overlap. */
4490 tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
4491 } else {
4492 /* skb's seq == skb1's seq and skb covers skb1.
4493 * Replace skb1 with skb.
4494 */
4495 rb_replace_node(&skb1->rbnode, &skb->rbnode,
4496 &tp->out_of_order_queue);
4497 tcp_dsack_extend(sk,
4498 TCP_SKB_CB(skb1)->seq,
4499 TCP_SKB_CB(skb1)->end_seq);
4500 NET_INC_STATS(sock_net(sk),
4501 LINUX_MIB_TCPOFOMERGE);
4502 __kfree_skb(skb1);
4503 goto merge_right;
4504 }
4505 } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
4506 goto coalesce_done;
4496 } 4507 }
4508 p = &parent->rb_right;
4497 } 4509 }
4498 if (!skb1) 4510insert:
4499 __skb_queue_head(&tp->out_of_order_queue, skb); 4511 /* Insert segment into RB tree. */
4500 else 4512 rb_link_node(&skb->rbnode, parent, p);
4501 __skb_queue_after(&tp->out_of_order_queue, skb1, skb); 4513 rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
4502 4514
4503 /* And clean segments covered by new one as whole. */ 4515merge_right:
4504 while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { 4516 /* Remove other segments covered by skb. */
4505 skb1 = skb_queue_next(&tp->out_of_order_queue, skb); 4517 while ((q = rb_next(&skb->rbnode)) != NULL) {
4518 skb1 = rb_entry(q, struct sk_buff, rbnode);
4506 4519
4507 if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) 4520 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
4508 break; 4521 break;
@@ -4511,12 +4524,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4511 end_seq); 4524 end_seq);
4512 break; 4525 break;
4513 } 4526 }
4514 __skb_unlink(skb1, &tp->out_of_order_queue); 4527 rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
4515 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, 4528 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4516 TCP_SKB_CB(skb1)->end_seq); 4529 TCP_SKB_CB(skb1)->end_seq);
4517 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); 4530 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4518 tcp_drop(sk, skb1); 4531 tcp_drop(sk, skb1);
4519 } 4532 }
4533 /* If there is no skb after us, we are the last_skb ! */
4534 if (!q)
4535 tp->ooo_last_skb = skb;
4520 4536
4521add_sack: 4537add_sack:
4522 if (tcp_is_sack(tp)) 4538 if (tcp_is_sack(tp))
@@ -4653,13 +4669,13 @@ queue_and_out:
4653 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 4669 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
4654 tcp_fin(sk); 4670 tcp_fin(sk);
4655 4671
4656 if (!skb_queue_empty(&tp->out_of_order_queue)) { 4672 if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4657 tcp_ofo_queue(sk); 4673 tcp_ofo_queue(sk);
4658 4674
4659 /* RFC2581. 4.2. SHOULD send immediate ACK, when 4675 /* RFC2581. 4.2. SHOULD send immediate ACK, when
4660 * gap in queue is filled. 4676 * gap in queue is filled.
4661 */ 4677 */
4662 if (skb_queue_empty(&tp->out_of_order_queue)) 4678 if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
4663 inet_csk(sk)->icsk_ack.pingpong = 0; 4679 inet_csk(sk)->icsk_ack.pingpong = 0;
4664 } 4680 }
4665 4681
@@ -4713,48 +4729,76 @@ drop:
4713 tcp_data_queue_ofo(sk, skb); 4729 tcp_data_queue_ofo(sk, skb);
4714} 4730}
4715 4731
4732static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
4733{
4734 if (list)
4735 return !skb_queue_is_last(list, skb) ? skb->next : NULL;
4736
4737 return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
4738}
4739
4716static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, 4740static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
4717 struct sk_buff_head *list) 4741 struct sk_buff_head *list,
4742 struct rb_root *root)
4718{ 4743{
4719 struct sk_buff *next = NULL; 4744 struct sk_buff *next = tcp_skb_next(skb, list);
4720 4745
4721 if (!skb_queue_is_last(list, skb)) 4746 if (list)
4722 next = skb_queue_next(list, skb); 4747 __skb_unlink(skb, list);
4748 else
4749 rb_erase(&skb->rbnode, root);
4723 4750
4724 __skb_unlink(skb, list);
4725 __kfree_skb(skb); 4751 __kfree_skb(skb);
4726 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); 4752 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4727 4753
4728 return next; 4754 return next;
4729} 4755}
4730 4756
4757/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
4758static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
4759{
4760 struct rb_node **p = &root->rb_node;
4761 struct rb_node *parent = NULL;
4762 struct sk_buff *skb1;
4763
4764 while (*p) {
4765 parent = *p;
4766 skb1 = rb_entry(parent, struct sk_buff, rbnode);
4767 if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
4768 p = &parent->rb_left;
4769 else
4770 p = &parent->rb_right;
4771 }
4772 rb_link_node(&skb->rbnode, parent, p);
4773 rb_insert_color(&skb->rbnode, root);
4774}
4775
4731/* Collapse contiguous sequence of skbs head..tail with 4776/* Collapse contiguous sequence of skbs head..tail with
4732 * sequence numbers start..end. 4777 * sequence numbers start..end.
4733 * 4778 *
4734 * If tail is NULL, this means until the end of the list. 4779 * If tail is NULL, this means until the end of the queue.
4735 * 4780 *
4736 * Segments with FIN/SYN are not collapsed (only because this 4781 * Segments with FIN/SYN are not collapsed (only because this
4737 * simplifies code) 4782 * simplifies code)
4738 */ 4783 */
4739static void 4784static void
4740tcp_collapse(struct sock *sk, struct sk_buff_head *list, 4785tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
4741 struct sk_buff *head, struct sk_buff *tail, 4786 struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
4742 u32 start, u32 end)
4743{ 4787{
4744 struct sk_buff *skb, *n; 4788 struct sk_buff *skb = head, *n;
4789 struct sk_buff_head tmp;
4745 bool end_of_skbs; 4790 bool end_of_skbs;
4746 4791
4747 /* First, check that queue is collapsible and find 4792 /* First, check that queue is collapsible and find
4748 * the point where collapsing can be useful. */ 4793 * the point where collapsing can be useful.
4749 skb = head; 4794 */
4750restart: 4795restart:
4751 end_of_skbs = true; 4796 for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
4752 skb_queue_walk_from_safe(list, skb, n) { 4797 n = tcp_skb_next(skb, list);
4753 if (skb == tail) 4798
4754 break;
4755 /* No new bits? It is possible on ofo queue. */ 4799 /* No new bits? It is possible on ofo queue. */
4756 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 4800 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4757 skb = tcp_collapse_one(sk, skb, list); 4801 skb = tcp_collapse_one(sk, skb, list, root);
4758 if (!skb) 4802 if (!skb)
4759 break; 4803 break;
4760 goto restart; 4804 goto restart;
@@ -4772,13 +4816,10 @@ restart:
4772 break; 4816 break;
4773 } 4817 }
4774 4818
4775 if (!skb_queue_is_last(list, skb)) { 4819 if (n && n != tail &&
4776 struct sk_buff *next = skb_queue_next(list, skb); 4820 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
4777 if (next != tail && 4821 end_of_skbs = false;
4778 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { 4822 break;
4779 end_of_skbs = false;
4780 break;
4781 }
4782 } 4823 }
4783 4824
4784 /* Decided to skip this, advance start seq. */ 4825 /* Decided to skip this, advance start seq. */
@@ -4788,17 +4829,22 @@ restart:
4788 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) 4829 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4789 return; 4830 return;
4790 4831
4832 __skb_queue_head_init(&tmp);
4833
4791 while (before(start, end)) { 4834 while (before(start, end)) {
4792 int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); 4835 int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
4793 struct sk_buff *nskb; 4836 struct sk_buff *nskb;
4794 4837
4795 nskb = alloc_skb(copy, GFP_ATOMIC); 4838 nskb = alloc_skb(copy, GFP_ATOMIC);
4796 if (!nskb) 4839 if (!nskb)
4797 return; 4840 break;
4798 4841
4799 memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); 4842 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
4800 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; 4843 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
4801 __skb_queue_before(list, skb, nskb); 4844 if (list)
4845 __skb_queue_before(list, skb, nskb);
4846 else
4847 __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
4802 skb_set_owner_r(nskb, sk); 4848 skb_set_owner_r(nskb, sk);
4803 4849
4804 /* Copy data, releasing collapsed skbs. */ 4850 /* Copy data, releasing collapsed skbs. */
@@ -4816,14 +4862,17 @@ restart:
4816 start += size; 4862 start += size;
4817 } 4863 }
4818 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 4864 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4819 skb = tcp_collapse_one(sk, skb, list); 4865 skb = tcp_collapse_one(sk, skb, list, root);
4820 if (!skb || 4866 if (!skb ||
4821 skb == tail || 4867 skb == tail ||
4822 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) 4868 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4823 return; 4869 goto end;
4824 } 4870 }
4825 } 4871 }
4826 } 4872 }
4873end:
4874 skb_queue_walk_safe(&tmp, skb, n)
4875 tcp_rbtree_insert(root, skb);
4827} 4876}
4828 4877
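tcp_rbtree_insert() above is the ordered insert used now that the receive-side queues can be backed by an rb-tree: descend left while the new segment's start sequence is smaller, right otherwise, then link the node and let rb_insert_color() recolour and rotate. A minimal userspace sketch of the same descent, using an ordinary unbalanced binary search tree instead of a red-black tree; struct seg and its helpers are invented for the example.

#include <stdio.h>

struct seg {
        unsigned int seq, end_seq;
        struct seg *left, *right;
};

/* Ordered insert keyed by seq: the same descent tcp_rbtree_insert()
 * performs before handing the node to the rb-tree rebalancing code.
 */
static void seg_insert(struct seg **root, struct seg *node)
{
        struct seg **p = root;

        while (*p) {
                if (node->seq < (*p)->seq)
                        p = &(*p)->left;
                else
                        p = &(*p)->right;
        }
        node->left = node->right = NULL;
        *p = node;
}

static void walk(const struct seg *s)
{
        if (!s)
                return;
        walk(s->left);
        printf("[%u,%u) ", s->seq, s->end_seq);
        walk(s->right);
}

int main(void)
{
        struct seg a = {300, 400}, b = {100, 200}, c = {200, 300};
        struct seg *root = NULL;

        seg_insert(&root, &a);
        seg_insert(&root, &b);
        seg_insert(&root, &c);
        walk(root);             /* prints the segments in sequence order */
        printf("\n");
        return 0;
}
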
4829/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs 4878/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
@@ -4832,70 +4881,86 @@ restart:
4832static void tcp_collapse_ofo_queue(struct sock *sk) 4881static void tcp_collapse_ofo_queue(struct sock *sk)
4833{ 4882{
4834 struct tcp_sock *tp = tcp_sk(sk); 4883 struct tcp_sock *tp = tcp_sk(sk);
4835 struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); 4884 struct sk_buff *skb, *head;
4836 struct sk_buff *head; 4885 struct rb_node *p;
4837 u32 start, end; 4886 u32 start, end;
4838 4887
4839 if (!skb) 4888 p = rb_first(&tp->out_of_order_queue);
4889 skb = rb_entry_safe(p, struct sk_buff, rbnode);
4890new_range:
4891 if (!skb) {
4892 p = rb_last(&tp->out_of_order_queue);
4893 /* Note: it is possible that p is NULL here. We do not
4894 * use rb_entry_safe(), as ooo_last_skb is valid only
4895 * if rbtree is not empty.
4896 */
4897 tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
4840 return; 4898 return;
4841 4899 }
4842 start = TCP_SKB_CB(skb)->seq; 4900 start = TCP_SKB_CB(skb)->seq;
4843 end = TCP_SKB_CB(skb)->end_seq; 4901 end = TCP_SKB_CB(skb)->end_seq;
4844 head = skb;
4845
4846 for (;;) {
4847 struct sk_buff *next = NULL;
4848 4902
4849 if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) 4903 for (head = skb;;) {
4850 next = skb_queue_next(&tp->out_of_order_queue, skb); 4904 skb = tcp_skb_next(skb, NULL);
4851 skb = next;
4852 4905
4853 /* Segment is terminated when we see gap or when 4906 /* Range is terminated when we see a gap or when
4854 * we are at the end of all the queue. */ 4907 * we are at the queue end.
4908 */
4855 if (!skb || 4909 if (!skb ||
4856 after(TCP_SKB_CB(skb)->seq, end) || 4910 after(TCP_SKB_CB(skb)->seq, end) ||
4857 before(TCP_SKB_CB(skb)->end_seq, start)) { 4911 before(TCP_SKB_CB(skb)->end_seq, start)) {
4858 tcp_collapse(sk, &tp->out_of_order_queue, 4912 tcp_collapse(sk, NULL, &tp->out_of_order_queue,
4859 head, skb, start, end); 4913 head, skb, start, end);
4860 head = skb; 4914 goto new_range;
4861 if (!skb) 4915 }
4862 break; 4916
4863 /* Start new segment */ 4917 if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
4864 start = TCP_SKB_CB(skb)->seq; 4918 start = TCP_SKB_CB(skb)->seq;
4919 if (after(TCP_SKB_CB(skb)->end_seq, end))
4865 end = TCP_SKB_CB(skb)->end_seq; 4920 end = TCP_SKB_CB(skb)->end_seq;
4866 } else {
4867 if (before(TCP_SKB_CB(skb)->seq, start))
4868 start = TCP_SKB_CB(skb)->seq;
4869 if (after(TCP_SKB_CB(skb)->end_seq, end))
4870 end = TCP_SKB_CB(skb)->end_seq;
4871 }
4872 } 4921 }
4873} 4922}
4874 4923
4875/* 4924/*
4876 * Purge the out-of-order queue. 4925 * Clean the out-of-order queue to make room.
4877 * Return true if queue was pruned. 4926 * We drop high sequences packets to :
4927 * 1) Let a chance for holes to be filled.
4928 * 2) not add too big latencies if thousands of packets sit there.
4929 * (But if application shrinks SO_RCVBUF, we could still end up
4930 * freeing whole queue here)
4931 *
4932 * Return true if queue has shrunk.
4878 */ 4933 */
4879static bool tcp_prune_ofo_queue(struct sock *sk) 4934static bool tcp_prune_ofo_queue(struct sock *sk)
4880{ 4935{
4881 struct tcp_sock *tp = tcp_sk(sk); 4936 struct tcp_sock *tp = tcp_sk(sk);
4882 bool res = false; 4937 struct rb_node *node, *prev;
4883 4938
4884 if (!skb_queue_empty(&tp->out_of_order_queue)) { 4939 if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
4885 NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); 4940 return false;
4886 __skb_queue_purge(&tp->out_of_order_queue);
4887 4941
4888 /* Reset SACK state. A conforming SACK implementation will 4942 NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
4889 * do the same at a timeout based retransmit. When a connection 4943 node = &tp->ooo_last_skb->rbnode;
4890 * is in a sad state like this, we care only about integrity 4944 do {
4891 * of the connection not performance. 4945 prev = rb_prev(node);
4892 */ 4946 rb_erase(node, &tp->out_of_order_queue);
4893 if (tp->rx_opt.sack_ok) 4947 tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
4894 tcp_sack_reset(&tp->rx_opt);
4895 sk_mem_reclaim(sk); 4948 sk_mem_reclaim(sk);
4896 res = true; 4949 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
4897 } 4950 !tcp_under_memory_pressure(sk))
4898 return res; 4951 break;
4952 node = prev;
4953 } while (node);
4954 tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
4955
4956 /* Reset SACK state. A conforming SACK implementation will
4957 * do the same at a timeout based retransmit. When a connection
4958 * is in a sad state like this, we care only about integrity
4959 * of the connection not performance.
4960 */
4961 if (tp->rx_opt.sack_ok)
4962 tcp_sack_reset(&tp->rx_opt);
4963 return true;
4899} 4964}
4900 4965
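The rewritten tcp_prune_ofo_queue() above frees memory by walking backwards from ooo_last_skb with rb_prev(), dropping the highest-sequence segments first so that earlier holes still have a chance to be filled. A rough sketch of that policy over a sorted array with a memory budget; all structures and numbers are invented for illustration.

#include <stdio.h>

struct range { unsigned int seq, end_seq; unsigned int truesize; };

/* Drop the highest-sequence ranges first until memory use fits the
 * budget, mirroring the backward walk from the queue tail.
 */
static int prune(struct range *q, int n, unsigned int used, unsigned int budget)
{
        while (n > 0 && used > budget) {
                n--;                            /* drop the last (highest seq) entry */
                used -= q[n].truesize;
                printf("drop [%u,%u)\n", q[n].seq, q[n].end_seq);
        }
        return n;                               /* entries kept; q[n-1] is the new tail */
}

int main(void)
{
        struct range q[] = {
                {1000, 2000, 1500}, {2000, 3000, 1500}, {5000, 6000, 1500},
        };
        int kept = prune(q, 3, 4500, 2000);

        printf("kept %d ranges\n", kept);       /* 1: only [1000,2000) survives */
        return 0;
}
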
4901/* Reduce allocated memory if we can, trying to get 4966/* Reduce allocated memory if we can, trying to get
@@ -4920,7 +4985,7 @@ static int tcp_prune_queue(struct sock *sk)
4920 4985
4921 tcp_collapse_ofo_queue(sk); 4986 tcp_collapse_ofo_queue(sk);
4922 if (!skb_queue_empty(&sk->sk_receive_queue)) 4987 if (!skb_queue_empty(&sk->sk_receive_queue))
4923 tcp_collapse(sk, &sk->sk_receive_queue, 4988 tcp_collapse(sk, &sk->sk_receive_queue, NULL,
4924 skb_peek(&sk->sk_receive_queue), 4989 skb_peek(&sk->sk_receive_queue),
4925 NULL, 4990 NULL,
4926 tp->copied_seq, tp->rcv_nxt); 4991 tp->copied_seq, tp->rcv_nxt);
@@ -5025,7 +5090,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
5025 /* We ACK each frame or... */ 5090 /* We ACK each frame or... */
5026 tcp_in_quickack_mode(sk) || 5091 tcp_in_quickack_mode(sk) ||
5027 /* We have out of order data. */ 5092 /* We have out of order data. */
5028 (ofo_possible && skb_peek(&tp->out_of_order_queue))) { 5093 (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
5029 /* Then ack it now */ 5094 /* Then ack it now */
5030 tcp_send_ack(sk); 5095 tcp_send_ack(sk);
5031 } else { 5096 } else {
@@ -5926,7 +5991,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
5926 } else 5991 } else
5927 tcp_init_metrics(sk); 5992 tcp_init_metrics(sk);
5928 5993
5929 tcp_update_pacing_rate(sk); 5994 if (!inet_csk(sk)->icsk_ca_ops->cong_control)
5995 tcp_update_pacing_rate(sk);
5930 5996
5931 /* Prevent spurious tcp_cwnd_restart() on first data packet */ 5997 /* Prevent spurious tcp_cwnd_restart() on first data packet */
5932 tp->lsndtime = tcp_time_stamp; 5998 tp->lsndtime = tcp_time_stamp;
@@ -6259,6 +6325,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6259 6325
6260 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; 6326 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
6261 tcp_openreq_init(req, &tmp_opt, skb, sk); 6327 tcp_openreq_init(req, &tmp_opt, skb, sk);
6328 inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
6262 6329
6263 /* Note: tcp_v6_init_req() might override ir_iif for link locals */ 6330 /* Note: tcp_v6_init_req() might override ir_iif for link locals */
6264 inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); 6331 inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7158d4f8dae4..bd5e8d10893f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1175,6 +1175,7 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1175 NULL, skb); 1175 NULL, skb);
1176 1176
1177 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1177 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1178 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1178 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", 1179 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1179 &iph->saddr, ntohs(th->source), 1180 &iph->saddr, ntohs(th->source),
1180 &iph->daddr, ntohs(th->dest), 1181 &iph->daddr, ntohs(th->dest),
@@ -1195,7 +1196,6 @@ static void tcp_v4_init_req(struct request_sock *req,
1195 1196
1196 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1197 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1197 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1198 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1198 ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1199 ireq->opt = tcp_v4_save_options(skb); 1199 ireq->opt = tcp_v4_save_options(skb);
1200} 1200}
1201 1201
@@ -1537,6 +1537,34 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1537} 1537}
1538EXPORT_SYMBOL(tcp_prequeue); 1538EXPORT_SYMBOL(tcp_prequeue);
1539 1539
1540bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1541{
1542 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1543
1544 /* Only socket owner can try to collapse/prune rx queues
1545 * to reduce memory overhead, so add a little headroom here.
1546 * Only a few socket backlogs are likely to be non-empty at once.
1547 */
1548 limit += 64*1024;
1549
1550 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1551 * we can fix skb->truesize to its real value to avoid future drops.
1552 * This is valid because skb is not yet charged to the socket.
1553 * It has been noticed that pure SACK packets were sometimes dropped
1554 * (if cooked by drivers without copybreak feature).
1555 */
1556 if (!skb->data_len)
1557 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
1558
1559 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1560 bh_unlock_sock(sk);
1561 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1562 return true;
1563 }
1564 return false;
1565}
1566EXPORT_SYMBOL(tcp_add_backlog);
1567
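tcp_add_backlog() above centralises the backlog admission test: the budget is the sum of the receive and send buffer sizes plus a small 64 KB headroom, and the packet is dropped (and TCPBacklogDrop counted) when queued memory would exceed it. Below is a rough userspace model of that check, with invented buffer sizes; the exact accounting inside sk_add_backlog() differs in detail.

#include <stdio.h>
#include <stdbool.h>

/* Rough model of the admission test: budget = rcvbuf + sndbuf + headroom,
 * drop when queued memory plus the new packet would exceed it.
 */
static bool backlog_would_drop(unsigned int rmem, unsigned int backlog_len,
                               unsigned int truesize,
                               unsigned int rcvbuf, unsigned int sndbuf)
{
        unsigned int limit = rcvbuf + sndbuf + 64 * 1024;      /* headroom */

        return rmem + backlog_len + truesize > limit;
}

int main(void)
{
        unsigned int rcvbuf = 212992, sndbuf = 212992;

        printf("%d\n", backlog_would_drop(200000, 180000, 2048, rcvbuf, sndbuf)); /* 0: fits    */
        printf("%d\n", backlog_would_drop(400000, 120000, 2048, rcvbuf, sndbuf)); /* 1: dropped */
        return 0;
}
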
1540/* 1568/*
1541 * From tcp_input.c 1569 * From tcp_input.c
1542 */ 1570 */
@@ -1608,6 +1636,7 @@ process:
1608 1636
1609 sk = req->rsk_listener; 1637 sk = req->rsk_listener;
1610 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { 1638 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1639 sk_drops_add(sk, skb);
1611 reqsk_put(req); 1640 reqsk_put(req);
1612 goto discard_it; 1641 goto discard_it;
1613 } 1642 }
@@ -1666,10 +1695,7 @@ process:
1666 if (!sock_owned_by_user(sk)) { 1695 if (!sock_owned_by_user(sk)) {
1667 if (!tcp_prequeue(sk, skb)) 1696 if (!tcp_prequeue(sk, skb))
1668 ret = tcp_v4_do_rcv(sk, skb); 1697 ret = tcp_v4_do_rcv(sk, skb);
1669 } else if (unlikely(sk_add_backlog(sk, skb, 1698 } else if (tcp_add_backlog(sk, skb)) {
1670 sk->sk_rcvbuf + sk->sk_sndbuf))) {
1671 bh_unlock_sock(sk);
1672 __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
1673 goto discard_and_relse; 1699 goto discard_and_relse;
1674 } 1700 }
1675 bh_unlock_sock(sk); 1701 bh_unlock_sock(sk);
@@ -1818,7 +1844,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
1818 tcp_write_queue_purge(sk); 1844 tcp_write_queue_purge(sk);
1819 1845
1820 /* Cleans up our, hopefully empty, out_of_order_queue. */ 1846 /* Cleans up our, hopefully empty, out_of_order_queue. */
1821 __skb_queue_purge(&tp->out_of_order_queue); 1847 skb_rbtree_purge(&tp->out_of_order_queue);
1822 1848
1823#ifdef CONFIG_TCP_MD5SIG 1849#ifdef CONFIG_TCP_MD5SIG
1824 /* Clean up the MD5 key list, if any */ 1850 /* Clean up the MD5 key list, if any */
@@ -1845,9 +1871,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
1845 local_bh_disable(); 1871 local_bh_disable();
1846 sk_sockets_allocated_dec(sk); 1872 sk_sockets_allocated_dec(sk);
1847 local_bh_enable(); 1873 local_bh_enable();
1848
1849 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1850 sock_release_memcg(sk);
1851} 1874}
1852EXPORT_SYMBOL(tcp_v4_destroy_sock); 1875EXPORT_SYMBOL(tcp_v4_destroy_sock);
1853 1876
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index b617826e2477..bf1f3b2b29d1 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -751,7 +751,7 @@ static struct genl_family tcp_metrics_nl_family = {
751 .netnsok = true, 751 .netnsok = true,
752}; 752};
753 753
754static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { 754static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
755 [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, 755 [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
756 [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY, 756 [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
757 .len = sizeof(struct in6_addr), }, 757 .len = sizeof(struct in6_addr), },
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4b95ec4ed2c8..6234ebaa7db1 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
464 464
465 newtp->srtt_us = 0; 465 newtp->srtt_us = 0;
466 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); 466 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
467 newtp->rtt_min[0].rtt = ~0U; 467 minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U);
468 newicsk->icsk_rto = TCP_TIMEOUT_INIT; 468 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
469 469
470 newtp->packets_out = 0; 470 newtp->packets_out = 0;
@@ -487,8 +487,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
487 newtp->snd_cwnd = TCP_INIT_CWND; 487 newtp->snd_cwnd = TCP_INIT_CWND;
488 newtp->snd_cwnd_cnt = 0; 488 newtp->snd_cwnd_cnt = 0;
489 489
490 /* There's a bubble in the pipe until at least the first ACK. */
491 newtp->app_limited = ~0U;
492
490 tcp_init_xmit_timers(newsk); 493 tcp_init_xmit_timers(newsk);
491 __skb_queue_head_init(&newtp->out_of_order_queue);
492 newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; 494 newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
493 495
494 newtp->rx_opt.saw_tstamp = 0; 496 newtp->rx_opt.saw_tstamp = 0;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 5c5964962d0c..bc68da38ea86 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -90,12 +90,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
90 goto out; 90 goto out;
91 } 91 }
92 92
93 /* GSO partial only requires splitting the frame into an MSS
94 * multiple and possibly a remainder. So update the mss now.
95 */
96 if (features & NETIF_F_GSO_PARTIAL)
97 mss = skb->len - (skb->len % mss);
98
99 copy_destructor = gso_skb->destructor == tcp_wfree; 93 copy_destructor = gso_skb->destructor == tcp_wfree;
100 ooo_okay = gso_skb->ooo_okay; 94 ooo_okay = gso_skb->ooo_okay;
101 /* All segments but the first should have ooo_okay cleared */ 95 /* All segments but the first should have ooo_okay cleared */
@@ -108,6 +102,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
108 /* Only first segment might have ooo_okay set */ 102 /* Only first segment might have ooo_okay set */
109 segs->ooo_okay = ooo_okay; 103 segs->ooo_okay = ooo_okay;
110 104
105 /* GSO partial and frag_list segmentation only requires splitting
106 * the frame into an MSS multiple and possibly a remainder, both
107 * cases return a GSO skb. So update the mss now.
108 */
109 if (skb_is_gso(segs))
110 mss *= skb_shinfo(segs)->gso_segs;
111
111 delta = htonl(oldlen + (thlen + mss)); 112 delta = htonl(oldlen + (thlen + mss));
112 113
113 skb = segs; 114 skb = segs;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d48d5571e62a..896e9dfbdb5c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -734,9 +734,16 @@ static void tcp_tsq_handler(struct sock *sk)
734{ 734{
735 if ((1 << sk->sk_state) & 735 if ((1 << sk->sk_state) &
736 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | 736 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
737 TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) 737 TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
738 tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, 738 struct tcp_sock *tp = tcp_sk(sk);
739
740 if (tp->lost_out > tp->retrans_out &&
741 tp->snd_cwnd > tcp_packets_in_flight(tp))
742 tcp_xmit_retransmit_queue(sk);
743
744 tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
739 0, GFP_ATOMIC); 745 0, GFP_ATOMIC);
746 }
740} 747}
741/* 748/*
742 * One tasklet per cpu tries to send more skbs. 749 * One tasklet per cpu tries to send more skbs.
@@ -918,6 +925,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
918 skb_mstamp_get(&skb->skb_mstamp); 925 skb_mstamp_get(&skb->skb_mstamp);
919 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq 926 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
920 - tp->snd_una; 927 - tp->snd_una;
928 tcp_rate_skb_sent(sk, skb);
921 929
922 if (unlikely(skb_cloned(skb))) 930 if (unlikely(skb_cloned(skb)))
923 skb = pskb_copy(skb, gfp_mask); 931 skb = pskb_copy(skb, gfp_mask);
@@ -1213,6 +1221,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1213 tcp_set_skb_tso_segs(skb, mss_now); 1221 tcp_set_skb_tso_segs(skb, mss_now);
1214 tcp_set_skb_tso_segs(buff, mss_now); 1222 tcp_set_skb_tso_segs(buff, mss_now);
1215 1223
1224 /* Update delivered info for the new segment */
1225 TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
1226
1216 /* If this packet has been sent out already, we must 1227 /* If this packet has been sent out already, we must
1217 * adjust the various packet counters. 1228 * adjust the various packet counters.
1218 */ 1229 */
@@ -1358,6 +1369,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
1358 } 1369 }
1359 return mtu; 1370 return mtu;
1360} 1371}
1372EXPORT_SYMBOL(tcp_mss_to_mtu);
1361 1373
1362/* MTU probing init per socket */ 1374/* MTU probing init per socket */
1363void tcp_mtup_init(struct sock *sk) 1375void tcp_mtup_init(struct sock *sk)
@@ -1545,7 +1557,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
1545/* Return how many segs we'd like on a TSO packet, 1557/* Return how many segs we'd like on a TSO packet,
1546 * to send one TSO packet per ms 1558 * to send one TSO packet per ms
1547 */ 1559 */
1548static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) 1560u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
1561 int min_tso_segs)
1549{ 1562{
1550 u32 bytes, segs; 1563 u32 bytes, segs;
1551 1564
@@ -1557,10 +1570,23 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
1557 * This preserves ACK clocking and is consistent 1570 * This preserves ACK clocking and is consistent
1558 * with tcp_tso_should_defer() heuristic. 1571 * with tcp_tso_should_defer() heuristic.
1559 */ 1572 */
1560 segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs); 1573 segs = max_t(u32, bytes / mss_now, min_tso_segs);
1561 1574
1562 return min_t(u32, segs, sk->sk_gso_max_segs); 1575 return min_t(u32, segs, sk->sk_gso_max_segs);
1563} 1576}
1577EXPORT_SYMBOL(tcp_tso_autosize);
1578
1579/* Return the number of segments we want in the skb we are transmitting.
1580 * See if congestion control module wants to decide; otherwise, autosize.
1581 */
1582static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
1583{
1584 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1585 u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
1586
1587 return tso_segs ? :
1588 tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
1589}
1564 1590
1565/* Returns the portion of skb which can be sent right away */ 1591/* Returns the portion of skb which can be sent right away */
1566static unsigned int tcp_mss_split_point(const struct sock *sk, 1592static unsigned int tcp_mss_split_point(const struct sock *sk,
@@ -2022,6 +2048,39 @@ static int tcp_mtu_probe(struct sock *sk)
2022 return -1; 2048 return -1;
2023} 2049}
2024 2050
2051/* TCP Small Queues :
2052 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
2053 * (These limits are doubled for retransmits)
2054 * This allows for :
2055 * - better RTT estimation and ACK scheduling
2056 * - faster recovery
2057 * - high rates
2058 * Alas, some drivers / subsystems require a fair amount
2059 * of queued bytes to ensure line rate.
2060 * One example is wifi aggregation (802.11 AMPDU)
2061 */
2062static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
2063 unsigned int factor)
2064{
2065 unsigned int limit;
2066
2067 limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
2068 limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
2069 limit <<= factor;
2070
2071 if (atomic_read(&sk->sk_wmem_alloc) > limit) {
2072 set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
2073 /* It is possible TX completion already happened
2074 * before we set TSQ_THROTTLED, so we must
2075 * test again the condition.
2076 */
2077 smp_mb__after_atomic();
2078 if (atomic_read(&sk->sk_wmem_alloc) > limit)
2079 return true;
2080 }
2081 return false;
2082}
2083
2025/* This routine writes packets to the network. It advances the 2084/* This routine writes packets to the network. It advances the
2026 * send_head. This happens as incoming acks open up the remote 2085 * send_head. This happens as incoming acks open up the remote
2027 * window for us. 2086 * window for us.
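The TCP Small Queues helper added above limits how many bytes a flow may have sitting in qdiscs and device queues: roughly one millisecond of data at the current pacing rate (pacing_rate >> 10), never less than two packets, clamped by the tcp_limit_output_bytes sysctl, and doubled (factor = 1) for retransmits. A small sketch of just that arithmetic, with invented example rates:

#include <stdio.h>

static unsigned int tsq_limit(unsigned int truesize, unsigned long pacing_rate,
                              unsigned int sysctl_limit, unsigned int factor)
{
        unsigned long limit = 2UL * truesize;           /* at least two packets      */

        if (pacing_rate >> 10 > limit)                  /* ~1 ms at the pacing rate  */
                limit = pacing_rate >> 10;
        if (limit > sysctl_limit)                       /* sysctl cap                */
                limit = sysctl_limit;
        return (unsigned int)(limit << factor);         /* factor=1 for retransmits  */
}

int main(void)
{
        /* 1.25 MB/s pacing (~10 Mbit/s), 2 KB skbs, default 256 KB cap. */
        printf("%u\n", tsq_limit(2048, 1250000, 262144, 0));    /* 4096: two skbs          */
        /* 125 MB/s pacing (~1 Gbit/s): ~1 ms of data, below the cap.    */
        printf("%u\n", tsq_limit(2048, 125000000, 262144, 0));  /* 122070 bytes             */
        return 0;
}
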
@@ -2059,7 +2118,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2059 } 2118 }
2060 } 2119 }
2061 2120
2062 max_segs = tcp_tso_autosize(sk, mss_now); 2121 max_segs = tcp_tso_segs(sk, mss_now);
2063 while ((skb = tcp_send_head(sk))) { 2122 while ((skb = tcp_send_head(sk))) {
2064 unsigned int limit; 2123 unsigned int limit;
2065 2124
@@ -2108,29 +2167,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2108 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) 2167 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
2109 break; 2168 break;
2110 2169
2111 /* TCP Small Queues : 2170 if (tcp_small_queue_check(sk, skb, 0))
2112 * Control number of packets in qdisc/devices to two packets / or ~1 ms. 2171 break;
2113 * This allows for :
2114 * - better RTT estimation and ACK scheduling
2115 * - faster recovery
2116 * - high rates
2117 * Alas, some drivers / subsystems require a fair amount
2118 * of queued bytes to ensure line rate.
2119 * One example is wifi aggregation (802.11 AMPDU)
2120 */
2121 limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
2122 limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
2123
2124 if (atomic_read(&sk->sk_wmem_alloc) > limit) {
2125 set_bit(TSQ_THROTTLED, &tp->tsq_flags);
2126 /* It is possible TX completion already happened
2127 * before we set TSQ_THROTTLED, so we must
2128 * test again the condition.
2129 */
2130 smp_mb__after_atomic();
2131 if (atomic_read(&sk->sk_wmem_alloc) > limit)
2132 break;
2133 }
2134 2172
2135 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) 2173 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
2136 break; 2174 break;
@@ -2777,9 +2815,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2777 last_lost = tp->snd_una; 2815 last_lost = tp->snd_una;
2778 } 2816 }
2779 2817
2780 max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk)); 2818 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
2781 tcp_for_write_queue_from(skb, sk) { 2819 tcp_for_write_queue_from(skb, sk) {
2782 __u8 sacked = TCP_SKB_CB(skb)->sacked; 2820 __u8 sacked;
2783 int segs; 2821 int segs;
2784 2822
2785 if (skb == tcp_send_head(sk)) 2823 if (skb == tcp_send_head(sk))
@@ -2791,6 +2829,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2791 segs = tp->snd_cwnd - tcp_packets_in_flight(tp); 2829 segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
2792 if (segs <= 0) 2830 if (segs <= 0)
2793 return; 2831 return;
2832 sacked = TCP_SKB_CB(skb)->sacked;
2794 /* In case tcp_shift_skb_data() has aggregated large skbs, 2833
2795 * we need to make sure we are not sending TSO packets that are too big 2834
2796 */ 2835 */
@@ -2830,6 +2869,9 @@ begin_fwd:
2830 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) 2869 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
2831 continue; 2870 continue;
2832 2871
2872 if (tcp_small_queue_check(sk, skb, 1))
2873 return;
2874
2833 if (tcp_retransmit_skb(sk, skb, segs)) 2875 if (tcp_retransmit_skb(sk, skb, segs))
2834 return; 2876 return;
2835 2877
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
new file mode 100644
index 000000000000..9be1581a5a08
--- /dev/null
+++ b/net/ipv4/tcp_rate.c
@@ -0,0 +1,186 @@
1#include <net/tcp.h>
2
3/* The bandwidth estimator estimates the rate at which the network
4 * can currently deliver outbound data packets for this flow. At a high
5 * level, it operates by taking a delivery rate sample for each ACK.
6 *
7 * A rate sample records the rate at which the network delivered packets
8 * for this flow, calculated over the time interval between the transmission
9 * of a data packet and the acknowledgment of that packet.
10 *
11 * Specifically, over the interval between each transmit and corresponding ACK,
12 * the estimator generates a delivery rate sample. Typically it uses the rate
13 * at which packets were acknowledged. However, the approach of using only the
14 * acknowledgment rate faces a challenge under the prevalent ACK decimation or
15 * compression: packets can temporarily appear to be delivered much quicker
16 * than the bottleneck rate. Since it is physically impossible to do that in a
17 * sustained fashion, when the estimator notices that the ACK rate is faster
18 * than the transmit rate, it uses the latter:
19 *
20 * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
21 * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
22 * bw = min(send_rate, ack_rate)
23 *
24 * Notice the estimator essentially estimates the goodput, not always the
25 * network bottleneck link rate when the sending or receiving is limited by
26 * other factors like applications or receiver window limits. The estimator
27 * deliberately avoids using the inter-packet spacing approach because that
28 * approach requires a large number of samples and sophisticated filtering.
29 *
30 * TCP flows can often be application-limited in request/response workloads.
31 * The estimator marks a bandwidth sample as application-limited if there
32 * was some moment during the sampled window of packets when there was no data
33 * ready to send in the write queue.
34 */
35
36/* Snapshot the current delivery information in the skb, to generate
37 * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
38 */
39void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
40{
41 struct tcp_sock *tp = tcp_sk(sk);
42
43 /* In general we need to start delivery rate samples from the
44 * time we received the most recent ACK, to ensure we include
45 * the full time the network needs to deliver all in-flight
46 * packets. If there are no packets in flight yet, then we
47 * know that any ACKs after now indicate that the network was
48 * able to deliver those packets completely in the sampling
49 * interval between now and the next ACK.
50 *
51 * Note that we use packets_out instead of tcp_packets_in_flight(tp)
52 * because the latter is a guess based on RTO and loss-marking
53 * heuristics. We don't want spurious RTOs or loss markings to cause
54 * a spuriously small time interval, causing a spuriously high
55 * bandwidth estimate.
56 */
57 if (!tp->packets_out) {
58 tp->first_tx_mstamp = skb->skb_mstamp;
59 tp->delivered_mstamp = skb->skb_mstamp;
60 }
61
62 TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
63 TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
64 TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
65 TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
66}
67
68/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
69 * delivery information when the skb was last transmitted.
70 *
71 * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
72 * called multiple times. We favor the information from the most recently
73 * sent skb, i.e., the skb with the highest prior_delivered count.
74 */
75void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
76 struct rate_sample *rs)
77{
78 struct tcp_sock *tp = tcp_sk(sk);
79 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
80
81 if (!scb->tx.delivered_mstamp.v64)
82 return;
83
84 if (!rs->prior_delivered ||
85 after(scb->tx.delivered, rs->prior_delivered)) {
86 rs->prior_delivered = scb->tx.delivered;
87 rs->prior_mstamp = scb->tx.delivered_mstamp;
88 rs->is_app_limited = scb->tx.is_app_limited;
89 rs->is_retrans = scb->sacked & TCPCB_RETRANS;
90
91 /* Find the duration of the "send phase" of this window: */
92 rs->interval_us = skb_mstamp_us_delta(
93 &skb->skb_mstamp,
94 &scb->tx.first_tx_mstamp);
95
96 /* Record send time of most recently ACKed packet: */
97 tp->first_tx_mstamp = skb->skb_mstamp;
98 }
99 /* Mark off the skb delivered once it's sacked to avoid being
100 * used again when it's cumulatively acked. For acked packets
101 * we don't need to reset since it'll be freed soon.
102 */
103 if (scb->sacked & TCPCB_SACKED_ACKED)
104 scb->tx.delivered_mstamp.v64 = 0;
105}
106
107/* Update the connection delivery information and generate a rate sample. */
108void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
109 struct skb_mstamp *now, struct rate_sample *rs)
110{
111 struct tcp_sock *tp = tcp_sk(sk);
112 u32 snd_us, ack_us;
113
114 /* Clear app limited if bubble is acked and gone. */
115 if (tp->app_limited && after(tp->delivered, tp->app_limited))
116 tp->app_limited = 0;
117
118 /* TODO: there are multiple places throughout tcp_ack() to get
119 * current time. Refactor the code using a new "tcp_acktag_state"
120 * to carry current time, flags, stats like "tcp_sacktag_state".
121 */
122 if (delivered)
123 tp->delivered_mstamp = *now;
124
125 rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
126 rs->losses = lost; /* freshly marked lost */
127 /* Return an invalid sample if no timing information is available. */
128 if (!rs->prior_mstamp.v64) {
129 rs->delivered = -1;
130 rs->interval_us = -1;
131 return;
132 }
133 rs->delivered = tp->delivered - rs->prior_delivered;
134
135 /* Model sending data and receiving ACKs as separate pipeline phases
136 * for a window. Usually the ACK phase is longer, but with ACK
137 * compression the send phase can be longer. To be safe we use the
138 * longer phase.
139 */
140 snd_us = rs->interval_us; /* send phase */
141 ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */
142 rs->interval_us = max(snd_us, ack_us);
143
144 /* Normally we expect interval_us >= min-rtt.
145 * Note that rate may still be over-estimated when a spuriously
146 * retransmitted skb was first (s)acked because "interval_us"
147 * is under-estimated (up to an RTT). However continuously
148 * measuring the delivery rate during loss recovery is crucial
149 * for connections that suffer heavy or prolonged losses.
150 */
151 if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
152 if (!rs->is_retrans)
153 pr_debug("tcp rate: %ld %d %u %u %u\n",
154 rs->interval_us, rs->delivered,
155 inet_csk(sk)->icsk_ca_state,
156 tp->rx_opt.sack_ok, tcp_min_rtt(tp));
157 rs->interval_us = -1;
158 return;
159 }
160
161 /* Record the last non-app-limited or the highest app-limited bw */
162 if (!rs->is_app_limited ||
163 ((u64)rs->delivered * tp->rate_interval_us >=
164 (u64)tp->rate_delivered * rs->interval_us)) {
165 tp->rate_delivered = rs->delivered;
166 tp->rate_interval_us = rs->interval_us;
167 tp->rate_app_limited = rs->is_app_limited;
168 }
169}
170
171/* If a gap is detected between sends, mark the socket application-limited. */
172void tcp_rate_check_app_limited(struct sock *sk)
173{
174 struct tcp_sock *tp = tcp_sk(sk);
175
176 if (/* We have less than one packet to send. */
177 tp->write_seq - tp->snd_nxt < tp->mss_cache &&
178 /* Nothing in sending host's qdisc queues or NIC tx queue. */
179 sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
180 /* We are not limited by CWND. */
181 tcp_packets_in_flight(tp) < tp->snd_cwnd &&
182 /* All lost packets have been retransmitted. */
183 tp->lost_out <= tp->retrans_out)
184 tp->app_limited =
185 (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
186}
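To make the send_rate/ack_rate formulas in the tcp_rate.c header comment above concrete, the sketch below computes one sample the same way tcp_rate_gen() does: it takes the longer of the send phase and the ACK phase as the interval, which is equivalent to bw = min(send_rate, ack_rate) and prevents ACK compression from inflating the estimate. All values are invented for this example; timestamps are in microseconds.

#include <stdio.h>

int main(void)
{
        unsigned long long delivered = 10;      /* packets (s)acked in the window   */
        unsigned long long mss = 1460;          /* bytes per packet                  */
        unsigned long long snd_us = 8000;       /* last_snd_time - first_snd_time    */
        unsigned long long ack_us = 4000;       /* last_ack_time - first_ack_time    */

        /* Longer of the two phases; here the send phase wins, so the
         * sample equals send_rate = min(send_rate, ack_rate).
         */
        unsigned long long interval_us = snd_us > ack_us ? snd_us : ack_us;
        unsigned long long bw = delivered * mss * 1000000ULL / interval_us;

        printf("interval %llu us, bw %llu bytes/s\n", interval_us, bw); /* 8000 us, 1825000 B/s */
        return 0;
}
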
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f712b411f6ed..3ea1cf804748 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -192,6 +192,8 @@ static int tcp_write_timeout(struct sock *sk)
192 if (tp->syn_data && icsk->icsk_retransmits == 1) 192 if (tp->syn_data && icsk->icsk_retransmits == 1)
193 NET_INC_STATS(sock_net(sk), 193 NET_INC_STATS(sock_net(sk),
194 LINUX_MIB_TCPFASTOPENACTIVEFAIL); 194 LINUX_MIB_TCPFASTOPENACTIVEFAIL);
195 } else if (!tp->syn_data && !tp->syn_fastopen) {
196 sk_rethink_txhash(sk);
195 } 197 }
196 retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; 198 retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
197 syn_set = true; 199 syn_set = true;
@@ -213,6 +215,8 @@ static int tcp_write_timeout(struct sock *sk)
213 tcp_mtu_probing(icsk, sk); 215 tcp_mtu_probing(icsk, sk);
214 216
215 dst_negative_advice(sk); 217 dst_negative_advice(sk);
218 } else {
219 sk_rethink_txhash(sk);
216 } 220 }
217 221
218 retry_until = net->ipv4.sysctl_tcp_retries2; 222 retry_until = net->ipv4.sysctl_tcp_retries2;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5fdcb8d108d4..7d96dc2d3d08 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -114,6 +114,7 @@
114#include <net/busy_poll.h> 114#include <net/busy_poll.h>
115#include "udp_impl.h" 115#include "udp_impl.h"
116#include <net/sock_reuseport.h> 116#include <net/sock_reuseport.h>
117#include <net/addrconf.h>
117 118
118struct udp_table udp_table __read_mostly; 119struct udp_table udp_table __read_mostly;
119EXPORT_SYMBOL(udp_table); 120EXPORT_SYMBOL(udp_table);
@@ -1020,12 +1021,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1020 flow_flags, 1021 flow_flags,
1021 faddr, saddr, dport, inet->inet_sport); 1022 faddr, saddr, dport, inet->inet_sport);
1022 1023
1023 if (!saddr && ipc.oif) {
1024 err = l3mdev_get_saddr(net, ipc.oif, fl4);
1025 if (err < 0)
1026 goto out;
1027 }
1028
1029 security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); 1024 security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
1030 rt = ip_route_output_flow(net, fl4, sk); 1025 rt = ip_route_output_flow(net, fl4, sk);
1031 if (IS_ERR(rt)) { 1026 if (IS_ERR(rt)) {
@@ -2192,6 +2187,20 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
2192} 2187}
2193EXPORT_SYMBOL(udp_poll); 2188EXPORT_SYMBOL(udp_poll);
2194 2189
2190int udp_abort(struct sock *sk, int err)
2191{
2192 lock_sock(sk);
2193
2194 sk->sk_err = err;
2195 sk->sk_error_report(sk);
2196 udp_disconnect(sk, 0);
2197
2198 release_sock(sk);
2199
2200 return 0;
2201}
2202EXPORT_SYMBOL_GPL(udp_abort);
2203
2195struct proto udp_prot = { 2204struct proto udp_prot = {
2196 .name = "UDP", 2205 .name = "UDP",
2197 .owner = THIS_MODULE, 2206 .owner = THIS_MODULE,
@@ -2221,7 +2230,7 @@ struct proto udp_prot = {
2221 .compat_setsockopt = compat_udp_setsockopt, 2230 .compat_setsockopt = compat_udp_setsockopt,
2222 .compat_getsockopt = compat_udp_getsockopt, 2231 .compat_getsockopt = compat_udp_getsockopt,
2223#endif 2232#endif
2224 .clear_sk = sk_prot_clear_portaddr_nulls, 2233 .diag_destroy = udp_abort,
2225}; 2234};
2226EXPORT_SYMBOL(udp_prot); 2235EXPORT_SYMBOL(udp_prot);
2227 2236
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 3d5ccf4b1412..9a89c10a55f0 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -20,7 +20,7 @@
20static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, 20static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
21 struct netlink_callback *cb, 21 struct netlink_callback *cb,
22 const struct inet_diag_req_v2 *req, 22 const struct inet_diag_req_v2 *req,
23 struct nlattr *bc) 23 struct nlattr *bc, bool net_admin)
24{ 24{
25 if (!inet_diag_bc_sk(bc, sk)) 25 if (!inet_diag_bc_sk(bc, sk))
26 return 0; 26 return 0;
@@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
28 return inet_sk_diag_fill(sk, NULL, skb, req, 28 return inet_sk_diag_fill(sk, NULL, skb, req,
29 sk_user_ns(NETLINK_CB(cb->skb).sk), 29 sk_user_ns(NETLINK_CB(cb->skb).sk),
30 NETLINK_CB(cb->skb).portid, 30 NETLINK_CB(cb->skb).portid,
31 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 31 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin);
32} 32}
33 33
34static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, 34static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
@@ -76,7 +76,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
76 err = inet_sk_diag_fill(sk, NULL, rep, req, 76 err = inet_sk_diag_fill(sk, NULL, rep, req,
77 sk_user_ns(NETLINK_CB(in_skb).sk), 77 sk_user_ns(NETLINK_CB(in_skb).sk),
78 NETLINK_CB(in_skb).portid, 78 NETLINK_CB(in_skb).portid,
79 nlh->nlmsg_seq, 0, nlh); 79 nlh->nlmsg_seq, 0, nlh,
80 netlink_net_capable(in_skb, CAP_NET_ADMIN));
80 if (err < 0) { 81 if (err < 0) {
81 WARN_ON(err == -EMSGSIZE); 82 WARN_ON(err == -EMSGSIZE);
82 kfree_skb(rep); 83 kfree_skb(rep);
@@ -97,6 +98,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
97 struct netlink_callback *cb, 98 struct netlink_callback *cb,
98 const struct inet_diag_req_v2 *r, struct nlattr *bc) 99 const struct inet_diag_req_v2 *r, struct nlattr *bc)
99{ 100{
101 bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
100 struct net *net = sock_net(skb->sk); 102 struct net *net = sock_net(skb->sk);
101 int num, s_num, slot, s_slot; 103 int num, s_num, slot, s_slot;
102 104
@@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
132 r->id.idiag_dport) 134 r->id.idiag_dport)
133 goto next; 135 goto next;
134 136
135 if (sk_diag_dump(sk, skb, cb, r, bc) < 0) { 137 if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
136 spin_unlock_bh(&hslot->lock); 138 spin_unlock_bh(&hslot->lock);
137 goto done; 139 goto done;
138 } 140 }
@@ -165,12 +167,88 @@ static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
165 r->idiag_wqueue = sk_wmem_alloc_get(sk); 167 r->idiag_wqueue = sk_wmem_alloc_get(sk);
166} 168}
167 169
170#ifdef CONFIG_INET_DIAG_DESTROY
171static int __udp_diag_destroy(struct sk_buff *in_skb,
172 const struct inet_diag_req_v2 *req,
173 struct udp_table *tbl)
174{
175 struct net *net = sock_net(in_skb->sk);
176 struct sock *sk;
177 int err;
178
179 rcu_read_lock();
180
181 if (req->sdiag_family == AF_INET)
182 sk = __udp4_lib_lookup(net,
183 req->id.idiag_dst[0], req->id.idiag_dport,
184 req->id.idiag_src[0], req->id.idiag_sport,
185 req->id.idiag_if, tbl, NULL);
186#if IS_ENABLED(CONFIG_IPV6)
187 else if (req->sdiag_family == AF_INET6) {
188 if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
189 ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
190 sk = __udp4_lib_lookup(net,
191 req->id.idiag_dst[3], req->id.idiag_dport,
192 req->id.idiag_src[3], req->id.idiag_sport,
193 req->id.idiag_if, tbl, NULL);
194
195 else
196 sk = __udp6_lib_lookup(net,
197 (struct in6_addr *)req->id.idiag_dst,
198 req->id.idiag_dport,
199 (struct in6_addr *)req->id.idiag_src,
200 req->id.idiag_sport,
201 req->id.idiag_if, tbl, NULL);
202 }
203#endif
204 else {
205 rcu_read_unlock();
206 return -EINVAL;
207 }
208
209 if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
210 sk = NULL;
211
212 rcu_read_unlock();
213
214 if (!sk)
215 return -ENOENT;
216
217 if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
218 sock_put(sk);
219 return -ENOENT;
220 }
221
222 err = sock_diag_destroy(sk, ECONNABORTED);
223
224 sock_put(sk);
225
226 return err;
227}
228
229static int udp_diag_destroy(struct sk_buff *in_skb,
230 const struct inet_diag_req_v2 *req)
231{
232 return __udp_diag_destroy(in_skb, req, &udp_table);
233}
234
235static int udplite_diag_destroy(struct sk_buff *in_skb,
236 const struct inet_diag_req_v2 *req)
237{
238 return __udp_diag_destroy(in_skb, req, &udplite_table);
239}
240
241#endif
242
168static const struct inet_diag_handler udp_diag_handler = { 243static const struct inet_diag_handler udp_diag_handler = {
169 .dump = udp_diag_dump, 244 .dump = udp_diag_dump,
170 .dump_one = udp_diag_dump_one, 245 .dump_one = udp_diag_dump_one,
171 .idiag_get_info = udp_diag_get_info, 246 .idiag_get_info = udp_diag_get_info,
172 .idiag_type = IPPROTO_UDP, 247 .idiag_type = IPPROTO_UDP,
173 .idiag_info_size = 0, 248 .idiag_info_size = 0,
249#ifdef CONFIG_INET_DIAG_DESTROY
250 .destroy = udp_diag_destroy,
251#endif
174}; 252};
175 253
176static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, 254static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
@@ -192,6 +270,9 @@ static const struct inet_diag_handler udplite_diag_handler = {
192 .idiag_get_info = udp_diag_get_info, 270 .idiag_get_info = udp_diag_get_info,
193 .idiag_type = IPPROTO_UDPLITE, 271 .idiag_type = IPPROTO_UDPLITE,
194 .idiag_info_size = 0, 272 .idiag_info_size = 0,
273#ifdef CONFIG_INET_DIAG_DESTROY
274 .destroy = udplite_diag_destroy,
275#endif
195}; 276};
196 277
197static int __init udp_diag_init(void) 278static int __init udp_diag_init(void)
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 81f253b6ff36..f9333c963607 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
21 __be16 new_protocol, bool is_ipv6) 21 __be16 new_protocol, bool is_ipv6)
22{ 22{
23 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); 23 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
24 bool remcsum, need_csum, offload_csum, ufo; 24 bool remcsum, need_csum, offload_csum, ufo, gso_partial;
25 struct sk_buff *segs = ERR_PTR(-EINVAL); 25 struct sk_buff *segs = ERR_PTR(-EINVAL);
26 struct udphdr *uh = udp_hdr(skb); 26 struct udphdr *uh = udp_hdr(skb);
27 u16 mac_offset = skb->mac_header; 27 u16 mac_offset = skb->mac_header;
@@ -88,6 +88,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
88 goto out; 88 goto out;
89 } 89 }
90 90
91 gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
92
91 outer_hlen = skb_tnl_header_len(skb); 93 outer_hlen = skb_tnl_header_len(skb);
92 udp_offset = outer_hlen - tnl_hlen; 94 udp_offset = outer_hlen - tnl_hlen;
93 skb = segs; 95 skb = segs;
@@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
117 * will be using a length value equal to only one MSS sized 119 * will be using a length value equal to only one MSS sized
118 * segment instead of the entire frame. 120 * segment instead of the entire frame.
119 */ 121 */
120 if (skb_is_gso(skb)) { 122 if (gso_partial) {
121 uh->len = htons(skb_shinfo(skb)->gso_size + 123 uh->len = htons(skb_shinfo(skb)->gso_size +
122 SKB_GSO_CB(skb)->data_offset + 124 SKB_GSO_CB(skb)->data_offset +
123 skb->head - (unsigned char *)uh); 125 skb->head - (unsigned char *)uh);
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 2eea073e27ef..af817158d830 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -60,7 +60,6 @@ struct proto udplite_prot = {
60 .compat_setsockopt = compat_udp_setsockopt, 60 .compat_setsockopt = compat_udp_setsockopt,
61 .compat_getsockopt = compat_udp_getsockopt, 61 .compat_getsockopt = compat_udp_getsockopt,
62#endif 62#endif
63 .clear_sk = sk_prot_clear_portaddr_nulls,
64}; 63};
65EXPORT_SYMBOL(udplite_prot); 64EXPORT_SYMBOL(udplite_prot);
66 65
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 41f5b504a782..6a7ff6957535 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -112,7 +112,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
112 int oif = 0; 112 int oif = 0;
113 113
114 if (skb_dst(skb)) 114 if (skb_dst(skb))
115 oif = l3mdev_fib_oif(skb_dst(skb)->dev); 115 oif = skb_dst(skb)->dev->ifindex;
116 116
117 memset(fl4, 0, sizeof(struct flowi4)); 117 memset(fl4, 0, sizeof(struct flowi4));
118 fl4->flowi4_mark = skb->mark; 118 fl4->flowi4_mark = skb->mark;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2f1f5d439788..d8983e15f859 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -112,6 +112,27 @@ static inline u32 cstamp_delta(unsigned long cstamp)
112 return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; 112 return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
113} 113}
114 114
115static inline s32 rfc3315_s14_backoff_init(s32 irt)
116{
117 /* multiply 'initial retransmission time' by 0.9 .. 1.1 */
118 u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt;
119 do_div(tmp, 1000000);
120 return (s32)tmp;
121}
122
123static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt)
124{
125 /* multiply 'retransmission timeout' by 1.9 .. 2.1 */
126 u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt;
127 do_div(tmp, 1000000);
128 if ((s32)tmp > mrt) {
129 /* multiply 'maximum retransmission time' by 0.9 .. 1.1 */
130 tmp = (900000 + prandom_u32() % 200001) * (u64)mrt;
131 do_div(tmp, 1000000);
132 }
133 return (s32)tmp;
134}
135
115#ifdef CONFIG_SYSCTL 136#ifdef CONFIG_SYSCTL
116static int addrconf_sysctl_register(struct inet6_dev *idev); 137static int addrconf_sysctl_register(struct inet6_dev *idev);
117static void addrconf_sysctl_unregister(struct inet6_dev *idev); 138static void addrconf_sysctl_unregister(struct inet6_dev *idev);
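
The two helpers added above implement the randomized exponential backoff of RFC 3315 section 14 in fixed point: the first retransmission interval is the configured value jittered by +/-10%, each later interval is roughly doubled (again with +/-10% jitter), and once it would exceed the configured maximum it is pinned near that maximum. A standalone sketch of the same arithmetic, with rand() standing in for prandom_u32() and made-up interval values, runs like this:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Userspace re-creation of the rfc3315_s14_backoff_*() arithmetic above.
 * Values are illustrative; the kernel works in jiffies, not milliseconds. */
static int32_t backoff_init(int32_t irt)
{
	uint64_t tmp = (900000 + rand() % 200001) * (uint64_t)irt; /* 0.9x .. 1.1x */
	return (int32_t)(tmp / 1000000);
}

static int32_t backoff_update(int32_t rt, int32_t mrt)
{
	uint64_t tmp = (1900000 + rand() % 200001) * (uint64_t)rt; /* 1.9x .. 2.1x */
	tmp /= 1000000;
	if ((int32_t)tmp > mrt) {
		tmp = (900000 + rand() % 200001) * (uint64_t)mrt;  /* pin near the max */
		tmp /= 1000000;
	}
	return (int32_t)tmp;
}

int main(void)
{
	int32_t rt = backoff_init(4000);       /* 4 s initial interval, in ms */

	for (int i = 0; i < 6; i++) {
		printf("solicit %d: wait ~%d ms\n", i, rt);
		rt = backoff_update(rt, 3600000);  /* cap near one hour */
	}
	return 0;
}
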
@@ -187,6 +208,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
187 .dad_transmits = 1, 208 .dad_transmits = 1,
188 .rtr_solicits = MAX_RTR_SOLICITATIONS, 209 .rtr_solicits = MAX_RTR_SOLICITATIONS,
189 .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, 210 .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
211 .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
190 .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, 212 .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
191 .use_tempaddr = 0, 213 .use_tempaddr = 0,
192 .temp_valid_lft = TEMP_VALID_LIFETIME, 214 .temp_valid_lft = TEMP_VALID_LIFETIME,
@@ -232,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
232 .dad_transmits = 1, 254 .dad_transmits = 1,
233 .rtr_solicits = MAX_RTR_SOLICITATIONS, 255 .rtr_solicits = MAX_RTR_SOLICITATIONS,
234 .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, 256 .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
257 .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
235 .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, 258 .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
236 .use_tempaddr = 0, 259 .use_tempaddr = 0,
237 .temp_valid_lft = TEMP_VALID_LIFETIME, 260 .temp_valid_lft = TEMP_VALID_LIFETIME,
@@ -3687,7 +3710,7 @@ static void addrconf_rs_timer(unsigned long data)
3687 if (idev->if_flags & IF_RA_RCVD) 3710 if (idev->if_flags & IF_RA_RCVD)
3688 goto out; 3711 goto out;
3689 3712
3690 if (idev->rs_probes++ < idev->cnf.rtr_solicits) { 3713 if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) {
3691 write_unlock(&idev->lock); 3714 write_unlock(&idev->lock);
3692 if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) 3715 if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
3693 ndisc_send_rs(dev, &lladdr, 3716 ndisc_send_rs(dev, &lladdr,
@@ -3696,11 +3719,13 @@ static void addrconf_rs_timer(unsigned long data)
3696 goto put; 3719 goto put;
3697 3720
3698 write_lock(&idev->lock); 3721 write_lock(&idev->lock);
3722 idev->rs_interval = rfc3315_s14_backoff_update(
3723 idev->rs_interval, idev->cnf.rtr_solicit_max_interval);
3699 /* The wait after the last probe can be shorter */ 3724 /* The wait after the last probe can be shorter */
3700 addrconf_mod_rs_timer(idev, (idev->rs_probes == 3725 addrconf_mod_rs_timer(idev, (idev->rs_probes ==
3701 idev->cnf.rtr_solicits) ? 3726 idev->cnf.rtr_solicits) ?
3702 idev->cnf.rtr_solicit_delay : 3727 idev->cnf.rtr_solicit_delay :
3703 idev->cnf.rtr_solicit_interval); 3728 idev->rs_interval);
3704 } else { 3729 } else {
3705 /* 3730 /*
3706 * Note: we do not support deprecated "all on-link" 3731 * Note: we do not support deprecated "all on-link"
@@ -3949,7 +3974,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
3949 send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp); 3974 send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp);
3950 send_rs = send_mld && 3975 send_rs = send_mld &&
3951 ipv6_accept_ra(ifp->idev) && 3976 ipv6_accept_ra(ifp->idev) &&
3952 ifp->idev->cnf.rtr_solicits > 0 && 3977 ifp->idev->cnf.rtr_solicits != 0 &&
3953 (dev->flags&IFF_LOOPBACK) == 0; 3978 (dev->flags&IFF_LOOPBACK) == 0;
3954 read_unlock_bh(&ifp->idev->lock); 3979 read_unlock_bh(&ifp->idev->lock);
3955 3980
@@ -3971,10 +3996,11 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
3971 3996
3972 write_lock_bh(&ifp->idev->lock); 3997 write_lock_bh(&ifp->idev->lock);
3973 spin_lock(&ifp->lock); 3998 spin_lock(&ifp->lock);
3999 ifp->idev->rs_interval = rfc3315_s14_backoff_init(
4000 ifp->idev->cnf.rtr_solicit_interval);
3974 ifp->idev->rs_probes = 1; 4001 ifp->idev->rs_probes = 1;
3975 ifp->idev->if_flags |= IF_RS_SENT; 4002 ifp->idev->if_flags |= IF_RS_SENT;
3976 addrconf_mod_rs_timer(ifp->idev, 4003 addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval);
3977 ifp->idev->cnf.rtr_solicit_interval);
3978 spin_unlock(&ifp->lock); 4004 spin_unlock(&ifp->lock);
3979 write_unlock_bh(&ifp->idev->lock); 4005 write_unlock_bh(&ifp->idev->lock);
3980 } 4006 }
@@ -4891,6 +4917,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
4891 array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; 4917 array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits;
4892 array[DEVCONF_RTR_SOLICIT_INTERVAL] = 4918 array[DEVCONF_RTR_SOLICIT_INTERVAL] =
4893 jiffies_to_msecs(cnf->rtr_solicit_interval); 4919 jiffies_to_msecs(cnf->rtr_solicit_interval);
4920 array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] =
4921 jiffies_to_msecs(cnf->rtr_solicit_max_interval);
4894 array[DEVCONF_RTR_SOLICIT_DELAY] = 4922 array[DEVCONF_RTR_SOLICIT_DELAY] =
4895 jiffies_to_msecs(cnf->rtr_solicit_delay); 4923 jiffies_to_msecs(cnf->rtr_solicit_delay);
4896 array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; 4924 array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
@@ -4961,18 +4989,18 @@ static inline size_t inet6_if_nlmsg_size(void)
4961} 4989}
4962 4990
4963static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, 4991static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
4964 int items, int bytes) 4992 int bytes)
4965{ 4993{
4966 int i; 4994 int i;
4967 int pad = bytes - sizeof(u64) * items; 4995 int pad = bytes - sizeof(u64) * ICMP6_MIB_MAX;
4968 BUG_ON(pad < 0); 4996 BUG_ON(pad < 0);
4969 4997
4970 /* Use put_unaligned() because stats may not be aligned for u64. */ 4998 /* Use put_unaligned() because stats may not be aligned for u64. */
4971 put_unaligned(items, &stats[0]); 4999 put_unaligned(ICMP6_MIB_MAX, &stats[0]);
4972 for (i = 1; i < items; i++) 5000 for (i = 1; i < ICMP6_MIB_MAX; i++)
4973 put_unaligned(atomic_long_read(&mib[i]), &stats[i]); 5001 put_unaligned(atomic_long_read(&mib[i]), &stats[i]);
4974 5002
4975 memset(&stats[items], 0, pad); 5003 memset(&stats[ICMP6_MIB_MAX], 0, pad);
4976} 5004}
4977 5005
4978static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib, 5006static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib,
@@ -5005,7 +5033,7 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
5005 offsetof(struct ipstats_mib, syncp)); 5033 offsetof(struct ipstats_mib, syncp));
5006 break; 5034 break;
5007 case IFLA_INET6_ICMP6STATS: 5035 case IFLA_INET6_ICMP6STATS:
5008 __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes); 5036 __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, bytes);
5009 break; 5037 break;
5010 } 5038 }
5011} 5039}
@@ -5099,7 +5127,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
5099 return -EINVAL; 5127 return -EINVAL;
5100 if (!ipv6_accept_ra(idev)) 5128 if (!ipv6_accept_ra(idev))
5101 return -EINVAL; 5129 return -EINVAL;
5102 if (idev->cnf.rtr_solicits <= 0) 5130 if (idev->cnf.rtr_solicits == 0)
5103 return -EINVAL; 5131 return -EINVAL;
5104 5132
5105 write_lock_bh(&idev->lock); 5133 write_lock_bh(&idev->lock);
@@ -5128,8 +5156,10 @@ update_lft:
5128 5156
5129 if (update_rs) { 5157 if (update_rs) {
5130 idev->if_flags |= IF_RS_SENT; 5158 idev->if_flags |= IF_RS_SENT;
5159 idev->rs_interval = rfc3315_s14_backoff_init(
5160 idev->cnf.rtr_solicit_interval);
5131 idev->rs_probes = 1; 5161 idev->rs_probes = 1;
5132 addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval); 5162 addrconf_mod_rs_timer(idev, idev->rs_interval);
5133 } 5163 }
5134 5164
5135 /* Well, that's kinda nasty ... */ 5165 /* Well, that's kinda nasty ... */
@@ -5467,20 +5497,6 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
5467} 5497}
5468 5498
5469static 5499static
5470int addrconf_sysctl_hop_limit(struct ctl_table *ctl, int write,
5471 void __user *buffer, size_t *lenp, loff_t *ppos)
5472{
5473 struct ctl_table lctl;
5474 int min_hl = 1, max_hl = 255;
5475
5476 lctl = *ctl;
5477 lctl.extra1 = &min_hl;
5478 lctl.extra2 = &max_hl;
5479
5480 return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos);
5481}
5482
5483static
5484int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, 5500int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
5485 void __user *buffer, size_t *lenp, loff_t *ppos) 5501 void __user *buffer, size_t *lenp, loff_t *ppos)
5486{ 5502{
@@ -5713,6 +5729,10 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
5713 return ret; 5729 return ret;
5714} 5730}
5715 5731
5732static int minus_one = -1;
5733static const int one = 1;
5734static const int two_five_five = 255;
5735
5716static const struct ctl_table addrconf_sysctl[] = { 5736static const struct ctl_table addrconf_sysctl[] = {
5717 { 5737 {
5718 .procname = "forwarding", 5738 .procname = "forwarding",
@@ -5726,7 +5746,9 @@ static const struct ctl_table addrconf_sysctl[] = {
5726 .data = &ipv6_devconf.hop_limit, 5746 .data = &ipv6_devconf.hop_limit,
5727 .maxlen = sizeof(int), 5747 .maxlen = sizeof(int),
5728 .mode = 0644, 5748 .mode = 0644,
5729 .proc_handler = addrconf_sysctl_hop_limit, 5749 .proc_handler = proc_dointvec_minmax,
5750 .extra1 = (void *)&one,
5751 .extra2 = (void *)&two_five_five,
5730 }, 5752 },
5731 { 5753 {
5732 .procname = "mtu", 5754 .procname = "mtu",
@@ -5768,7 +5790,8 @@ static const struct ctl_table addrconf_sysctl[] = {
5768 .data = &ipv6_devconf.rtr_solicits, 5790 .data = &ipv6_devconf.rtr_solicits,
5769 .maxlen = sizeof(int), 5791 .maxlen = sizeof(int),
5770 .mode = 0644, 5792 .mode = 0644,
5771 .proc_handler = proc_dointvec, 5793 .proc_handler = proc_dointvec_minmax,
5794 .extra1 = &minus_one,
5772 }, 5795 },
5773 { 5796 {
5774 .procname = "router_solicitation_interval", 5797 .procname = "router_solicitation_interval",
@@ -5778,6 +5801,13 @@ static const struct ctl_table addrconf_sysctl[] = {
5778 .proc_handler = proc_dointvec_jiffies, 5801 .proc_handler = proc_dointvec_jiffies,
5779 }, 5802 },
5780 { 5803 {
5804 .procname = "router_solicitation_max_interval",
5805 .data = &ipv6_devconf.rtr_solicit_max_interval,
5806 .maxlen = sizeof(int),
5807 .mode = 0644,
5808 .proc_handler = proc_dointvec_jiffies,
5809 },
5810 {
5781 .procname = "router_solicitation_delay", 5811 .procname = "router_solicitation_delay",
5782 .data = &ipv6_devconf.rtr_solicit_delay, 5812 .data = &ipv6_devconf.rtr_solicit_delay,
5783 .maxlen = sizeof(int), 5813 .maxlen = sizeof(int),
@@ -6044,8 +6074,14 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name,
6044 6074
6045 for (i = 0; table[i].data; i++) { 6075 for (i = 0; table[i].data; i++) {
6046 table[i].data += (char *)p - (char *)&ipv6_devconf; 6076 table[i].data += (char *)p - (char *)&ipv6_devconf;
6047 table[i].extra1 = idev; /* embedded; no ref */ 6077 /* If one of these is already set, then it is not safe to
6048 table[i].extra2 = net; 6078 * overwrite either of them: this makes proc_dointvec_minmax
6079 * usable.
6080 */
6081 if (!table[i].extra1 && !table[i].extra2) {
6082 table[i].extra1 = idev; /* embedded; no ref */
6083 table[i].extra2 = net;
6084 }
6049 } 6085 }
6050 6086
6051 snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); 6087 snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name);
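
Several sysctl behaviour changes sit in the hunks above: hop_limit now goes through proc_dointvec_minmax with bounds 1..255 instead of a dedicated handler, the rtr_solicits knob gains a lower bound of -1 (which the timer hunk earlier treats as "keep soliciting indefinitely"), and the per-device table setup only reuses extra1/extra2 for the idev/net cookies when no explicit bounds are present, so the minmax entries keep their limits. proc_dointvec_minmax rejects out-of-range writes rather than clamping them; a toy model of that contract, with invented names, looks like:

#include <errno.h>
#include <stdio.h>

/* Illustration of the proc_dointvec_minmax contract used above:
 * out-of-range writes fail with -EINVAL, they are not clamped. */
static int write_bounded(int *dst, int val, const int *min, const int *max)
{
	if ((min && val < *min) || (max && val > *max))
		return -EINVAL;
	*dst = val;
	return 0;
}

int main(void)
{
	static const int one = 1, two_five_five = 255, minus_one = -1;
	int hop_limit = 64, rtr_solicits = 3;

	printf("hop_limit=300   -> %d\n",
	       write_bounded(&hop_limit, 300, &one, &two_five_five));
	printf("hop_limit=255   -> %d (now %d)\n",
	       write_bounded(&hop_limit, 255, &one, &two_five_five), hop_limit);
	printf("rtr_solicits=-1 -> %d (now %d, i.e. solicit forever)\n",
	       write_bounded(&rtr_solicits, -1, &minus_one, NULL), rtr_solicits);
	return 0;
}
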
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b454055ba625..46ad699937fd 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -545,6 +545,8 @@ const struct proto_ops inet6_stream_ops = {
545 .mmap = sock_no_mmap, 545 .mmap = sock_no_mmap,
546 .sendpage = inet_sendpage, 546 .sendpage = inet_sendpage,
547 .splice_read = tcp_splice_read, 547 .splice_read = tcp_splice_read,
548 .read_sock = tcp_read_sock,
549 .peek_len = tcp_peek_len,
548#ifdef CONFIG_COMPAT 550#ifdef CONFIG_COMPAT
549 .compat_setsockopt = compat_sock_common_setsockopt, 551 .compat_setsockopt = compat_sock_common_setsockopt,
550 .compat_getsockopt = compat_sock_common_getsockopt, 552 .compat_getsockopt = compat_sock_common_getsockopt,
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 5857c1fc8b67..eea23b57c6a5 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -38,6 +38,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
38 .flags = FIB_LOOKUP_NOREF, 38 .flags = FIB_LOOKUP_NOREF,
39 }; 39 };
40 40
41 /* update flow if oif or iif point to device enslaved to l3mdev */
42 l3mdev_update_flow(net, flowi6_to_flowi(fl6));
43
41 fib_rules_lookup(net->ipv6.fib6_rules_ops, 44 fib_rules_lookup(net->ipv6.fib6_rules_ops,
42 flowi6_to_flowi(fl6), flags, &arg); 45 flowi6_to_flowi(fl6), flags, &arg);
43 46
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index ec9efbcdad35..aba0998ddbfb 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -172,6 +172,5 @@ static void __exit ila_fini(void)
172 172
173module_init(ila_init); 173module_init(ila_init);
174module_exit(ila_fini); 174module_exit(ila_fini);
175MODULE_ALIAS_RTNL_LWT(ILA);
176MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); 175MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
177MODULE_LICENSE("GPL"); 176MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index c8314c6b6154..e50c27a93e17 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -51,7 +51,7 @@ drop:
51 return -EINVAL; 51 return -EINVAL;
52} 52}
53 53
54static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { 54static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
55 [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, 55 [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
56 [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, 56 [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
57}; 57};
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index e6eca5fdf4c9..e604013dd814 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -128,7 +128,7 @@ static struct genl_family ila_nl_family = {
128 .parallel_ops = true, 128 .parallel_ops = true,
129}; 129};
130 130
131static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { 131static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
132 [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, 132 [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
133 [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, 133 [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
134 [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, 134 [ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 771be1fa4176..ef5485204522 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -743,6 +743,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
743 (info->nlh->nlmsg_flags & NLM_F_CREATE)); 743 (info->nlh->nlmsg_flags & NLM_F_CREATE));
744 int found = 0; 744 int found = 0;
745 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); 745 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
746 u16 nlflags = NLM_F_EXCL;
746 int err; 747 int err;
747 748
748 ins = &fn->leaf; 749 ins = &fn->leaf;
@@ -759,6 +760,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
759 if (info->nlh && 760 if (info->nlh &&
760 (info->nlh->nlmsg_flags & NLM_F_EXCL)) 761 (info->nlh->nlmsg_flags & NLM_F_EXCL))
761 return -EEXIST; 762 return -EEXIST;
763
764 nlflags &= ~NLM_F_EXCL;
762 if (replace) { 765 if (replace) {
763 if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { 766 if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
764 found++; 767 found++;
@@ -856,6 +859,7 @@ next_iter:
856 pr_warn("NLM_F_CREATE should be set when creating new route\n"); 859 pr_warn("NLM_F_CREATE should be set when creating new route\n");
857 860
858add: 861add:
862 nlflags |= NLM_F_CREATE;
859 err = fib6_commit_metrics(&rt->dst, mxc); 863 err = fib6_commit_metrics(&rt->dst, mxc);
860 if (err) 864 if (err)
861 return err; 865 return err;
@@ -864,7 +868,7 @@ add:
864 *ins = rt; 868 *ins = rt;
865 rt->rt6i_node = fn; 869 rt->rt6i_node = fn;
866 atomic_inc(&rt->rt6i_ref); 870 atomic_inc(&rt->rt6i_ref);
867 inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); 871 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
868 info->nl_net->ipv6.rt6_stats->fib_rt_entries++; 872 info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
869 873
870 if (!(fn->fn_flags & RTN_RTINFO)) { 874 if (!(fn->fn_flags & RTN_RTINFO)) {
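
The nlflags plumbing above makes RTM_NEWROUTE notifications describe what actually happened: the flags start as NLM_F_EXCL, NLM_F_EXCL is cleared once an existing route with the same metric is encountered, and NLM_F_CREATE is OR'd in when a new entry is really linked into the node, so netlink listeners can tell a freshly created route from one added next to existing ones. A toy of the same bookkeeping (flag values as defined in linux/netlink.h):

#include <stdio.h>
#include <stdbool.h>

#define NLM_F_EXCL   0x200
#define NLM_F_CREATE 0x400

/* Mirrors the nlflags handling in the hunk above, outside the kernel. */
static unsigned int notify_flags(bool found_same_metric)
{
	unsigned int nlflags = NLM_F_EXCL;

	if (found_same_metric)
		nlflags &= ~NLM_F_EXCL;  /* another route already sits here */
	nlflags |= NLM_F_CREATE;         /* a new rt6_info is being linked in */
	return nlflags;
}

int main(void)
{
	printf("brand new route : 0x%x\n", notify_flags(false)); /* CREATE|EXCL */
	printf("added beside one: 0x%x\n", notify_flags(true));  /* CREATE */
	return 0;
}
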
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index edc3daab354e..d7d6d3ae0b3b 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -61,12 +61,12 @@ static bool log_ecn_error = true;
61module_param(log_ecn_error, bool, 0644); 61module_param(log_ecn_error, bool, 0644);
62MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 62MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
63 63
64#define HASH_SIZE_SHIFT 5 64#define IP6_GRE_HASH_SIZE_SHIFT 5
65#define HASH_SIZE (1 << HASH_SIZE_SHIFT) 65#define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT)
66 66
67static int ip6gre_net_id __read_mostly; 67static int ip6gre_net_id __read_mostly;
68struct ip6gre_net { 68struct ip6gre_net {
69 struct ip6_tnl __rcu *tunnels[4][HASH_SIZE]; 69 struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
70 70
71 struct net_device *fb_tunnel_dev; 71 struct net_device *fb_tunnel_dev;
72}; 72};
@@ -96,12 +96,12 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu);
96 will match fallback tunnel. 96 will match fallback tunnel.
97 */ 97 */
98 98
99#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(HASH_SIZE - 1)) 99#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(IP6_GRE_HASH_SIZE - 1))
100static u32 HASH_ADDR(const struct in6_addr *addr) 100static u32 HASH_ADDR(const struct in6_addr *addr)
101{ 101{
102 u32 hash = ipv6_addr_hash(addr); 102 u32 hash = ipv6_addr_hash(addr);
103 103
104 return hash_32(hash, HASH_SIZE_SHIFT); 104 return hash_32(hash, IP6_GRE_HASH_SIZE_SHIFT);
105} 105}
106 106
107#define tunnels_r_l tunnels[3] 107#define tunnels_r_l tunnels[3]
@@ -1086,7 +1086,7 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
1086 1086
1087 for (prio = 0; prio < 4; prio++) { 1087 for (prio = 0; prio < 4; prio++) {
1088 int h; 1088 int h;
1089 for (h = 0; h < HASH_SIZE; h++) { 1089 for (h = 0; h < IP6_GRE_HASH_SIZE; h++) {
1090 struct ip6_tnl *t; 1090 struct ip6_tnl *t;
1091 1091
1092 t = rtnl_dereference(ign->tunnels[prio][h]); 1092 t = rtnl_dereference(ign->tunnels[prio][h]);
@@ -1238,7 +1238,7 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
1238 parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]); 1238 parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]);
1239 1239
1240 if (data[IFLA_GRE_FLOWINFO]) 1240 if (data[IFLA_GRE_FLOWINFO])
1241 parms->flowinfo = nla_get_u32(data[IFLA_GRE_FLOWINFO]); 1241 parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]);
1242 1242
1243 if (data[IFLA_GRE_FLAGS]) 1243 if (data[IFLA_GRE_FLAGS])
1244 parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]); 1244 parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 22e90e56b5a9..e7bfd55899a3 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
69 int offset = 0; 69 int offset = 0;
70 bool encap, udpfrag; 70 bool encap, udpfrag;
71 int nhoff; 71 int nhoff;
72 bool gso_partial;
72 73
73 skb_reset_network_header(skb); 74 skb_reset_network_header(skb);
74 nhoff = skb_network_header(skb) - skb_mac_header(skb); 75 nhoff = skb_network_header(skb) - skb_mac_header(skb);
@@ -101,9 +102,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
101 if (IS_ERR(segs)) 102 if (IS_ERR(segs))
102 goto out; 103 goto out;
103 104
105 gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
106
104 for (skb = segs; skb; skb = skb->next) { 107 for (skb = segs; skb; skb = skb->next) {
105 ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); 108 ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
106 if (skb_is_gso(skb)) 109 if (gso_partial)
107 payload_len = skb_shinfo(skb)->gso_size + 110 payload_len = skb_shinfo(skb)->gso_size +
108 SKB_GSO_CB(skb)->data_offset + 111 SKB_GSO_CB(skb)->data_offset +
109 skb->head - (unsigned char *)(ipv6h + 1); 112 skb->head - (unsigned char *)(ipv6h + 1);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 1dfc402d9ad1..6001e781164e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -56,6 +56,7 @@
56#include <net/checksum.h> 56#include <net/checksum.h>
57#include <linux/mroute6.h> 57#include <linux/mroute6.h>
58#include <net/l3mdev.h> 58#include <net/l3mdev.h>
59#include <net/lwtunnel.h>
59 60
60static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) 61static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61{ 62{
@@ -104,6 +105,13 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
104 } 105 }
105 } 106 }
106 107
108 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
109 int res = lwtunnel_xmit(skb);
110
111 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
112 return res;
113 }
114
107 rcu_read_lock_bh(); 115 rcu_read_lock_bh();
108 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); 116 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
109 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); 117 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
@@ -228,6 +236,14 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
228 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { 236 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
229 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), 237 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
230 IPSTATS_MIB_OUT, skb->len); 238 IPSTATS_MIB_OUT, skb->len);
239
240 /* if egress device is enslaved to an L3 master device pass the
241 * skb to its handler for processing
242 */
243 skb = l3mdev_ip6_out((struct sock *)sk, skb);
244 if (unlikely(!skb))
245 return 0;
246
231 /* hooks should never assume socket lock is held. 247 /* hooks should never assume socket lock is held.
232 * we promote our socket to non const 248 * we promote our socket to non const
233 */ 249 */
@@ -910,13 +926,6 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
910 int err; 926 int err;
911 int flags = 0; 927 int flags = 0;
912 928
913 if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif &&
914 (!*dst || !(*dst)->error)) {
915 err = l3mdev_get_saddr6(net, sk, fl6);
916 if (err)
917 goto out_err;
918 }
919
920 /* The correct way to handle this would be to do 929 /* The correct way to handle this would be to do
921 * ip6_route_get_saddr, and then ip6_route_output; however, 930 * ip6_route_get_saddr, and then ip6_route_output; however,
922 * the route-specific preferred source forces the 931 * the route-specific preferred source forces the
@@ -1008,7 +1017,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1008out_err_release: 1017out_err_release:
1009 dst_release(*dst); 1018 dst_release(*dst);
1010 *dst = NULL; 1019 *dst = NULL;
1011out_err: 1020
1012 if (err == -ENETUNREACH) 1021 if (err == -ENETUNREACH)
1013 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); 1022 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1014 return err; 1023 return err;
@@ -1054,8 +1063,6 @@ struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1054 return ERR_PTR(err); 1063 return ERR_PTR(err);
1055 if (final_dst) 1064 if (final_dst)
1056 fl6->daddr = *final_dst; 1065 fl6->daddr = *final_dst;
1057 if (!fl6->flowi6_oif)
1058 fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
1059 1066
1060 return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); 1067 return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1061} 1068}
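
Several hunks in this file, and in output_core.c and ndisc.c further down, belong to the same VRF/l3mdev rework: instead of every caller patching up the flow with l3mdev_fib_oif() or FLOWI_FLAG_L3MDEV_SRC, locally generated IPv6 packets whose egress device is enslaved to an L3 master device are handed to that master's output hook (l3mdev_ip6_out), and fib6_rule_lookup() now updates the flow itself. A purely illustrative sketch of the "hand off to the master if enslaved" shape, with none of the real kernel types:

#include <stdio.h>
#include <stddef.h>

struct toy_dev { const char *name; struct toy_dev *l3_master; };
struct toy_pkt { struct toy_dev *dev; };

/* Toy stand-in for the l3mdev output hook: a packet leaving an enslaved
 * device is redirected to its L3 master for further processing. */
static struct toy_pkt *l3mdev_out(struct toy_pkt *pkt)
{
	if (pkt->dev->l3_master)
		pkt->dev = pkt->dev->l3_master;
	return pkt;
}

int main(void)
{
	struct toy_dev vrf_red = { "vrf-red", NULL };
	struct toy_dev eth0    = { "eth0", &vrf_red };
	struct toy_pkt pkt     = { &eth0 };

	printf("before: %s\n", pkt.dev->name);
	l3mdev_out(&pkt);
	printf("after:  %s\n", pkt.dev->name);
	return 0;
}
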
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 888543debe4e..6a66adba0c22 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -57,6 +57,7 @@
57#include <net/inet_ecn.h> 57#include <net/inet_ecn.h>
58#include <net/net_namespace.h> 58#include <net/net_namespace.h>
59#include <net/netns/generic.h> 59#include <net/netns/generic.h>
60#include <net/dst_metadata.h>
60 61
61MODULE_AUTHOR("Ville Nuorvala"); 62MODULE_AUTHOR("Ville Nuorvala");
62MODULE_DESCRIPTION("IPv6 tunneling device"); 63MODULE_DESCRIPTION("IPv6 tunneling device");
@@ -64,8 +65,8 @@ MODULE_LICENSE("GPL");
64MODULE_ALIAS_RTNL_LINK("ip6tnl"); 65MODULE_ALIAS_RTNL_LINK("ip6tnl");
65MODULE_ALIAS_NETDEV("ip6tnl0"); 66MODULE_ALIAS_NETDEV("ip6tnl0");
66 67
67#define HASH_SIZE_SHIFT 5 68#define IP6_TUNNEL_HASH_SIZE_SHIFT 5
68#define HASH_SIZE (1 << HASH_SIZE_SHIFT) 69#define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT)
69 70
70static bool log_ecn_error = true; 71static bool log_ecn_error = true;
71module_param(log_ecn_error, bool, 0644); 72module_param(log_ecn_error, bool, 0644);
@@ -75,7 +76,7 @@ static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
75{ 76{
76 u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); 77 u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
77 78
78 return hash_32(hash, HASH_SIZE_SHIFT); 79 return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT);
79} 80}
80 81
81static int ip6_tnl_dev_init(struct net_device *dev); 82static int ip6_tnl_dev_init(struct net_device *dev);
@@ -87,9 +88,10 @@ struct ip6_tnl_net {
87 /* the IPv6 tunnel fallback device */ 88 /* the IPv6 tunnel fallback device */
88 struct net_device *fb_tnl_dev; 89 struct net_device *fb_tnl_dev;
89 /* lists for storing tunnels in use */ 90 /* lists for storing tunnels in use */
90 struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; 91 struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE];
91 struct ip6_tnl __rcu *tnls_wc[1]; 92 struct ip6_tnl __rcu *tnls_wc[1];
92 struct ip6_tnl __rcu **tnls[2]; 93 struct ip6_tnl __rcu **tnls[2];
94 struct ip6_tnl __rcu *collect_md_tun;
93}; 95};
94 96
95static struct net_device_stats *ip6_get_stats(struct net_device *dev) 97static struct net_device_stats *ip6_get_stats(struct net_device *dev)
@@ -166,6 +168,10 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
166 return t; 168 return t;
167 } 169 }
168 170
171 t = rcu_dereference(ip6n->collect_md_tun);
172 if (t)
173 return t;
174
169 t = rcu_dereference(ip6n->tnls_wc[0]); 175 t = rcu_dereference(ip6n->tnls_wc[0]);
170 if (t && (t->dev->flags & IFF_UP)) 176 if (t && (t->dev->flags & IFF_UP))
171 return t; 177 return t;
@@ -209,6 +215,8 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
209{ 215{
210 struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); 216 struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
211 217
218 if (t->parms.collect_md)
219 rcu_assign_pointer(ip6n->collect_md_tun, t);
212 rcu_assign_pointer(t->next , rtnl_dereference(*tp)); 220 rcu_assign_pointer(t->next , rtnl_dereference(*tp));
213 rcu_assign_pointer(*tp, t); 221 rcu_assign_pointer(*tp, t);
214} 222}
@@ -224,6 +232,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
224 struct ip6_tnl __rcu **tp; 232 struct ip6_tnl __rcu **tp;
225 struct ip6_tnl *iter; 233 struct ip6_tnl *iter;
226 234
235 if (t->parms.collect_md)
236 rcu_assign_pointer(ip6n->collect_md_tun, NULL);
237
227 for (tp = ip6_tnl_bucket(ip6n, &t->parms); 238 for (tp = ip6_tnl_bucket(ip6n, &t->parms);
228 (iter = rtnl_dereference(*tp)) != NULL; 239 (iter = rtnl_dereference(*tp)) != NULL;
229 tp = &iter->next) { 240 tp = &iter->next) {
@@ -829,6 +840,9 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
829 840
830 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); 841 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
831 842
843 if (tun_dst)
844 skb_dst_set(skb, (struct dst_entry *)tun_dst);
845
832 gro_cells_receive(&tunnel->gro_cells, skb); 846 gro_cells_receive(&tunnel->gro_cells, skb);
833 return 0; 847 return 0;
834 848
@@ -865,6 +879,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
865{ 879{
866 struct ip6_tnl *t; 880 struct ip6_tnl *t;
867 const struct ipv6hdr *ipv6h = ipv6_hdr(skb); 881 const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
882 struct metadata_dst *tun_dst = NULL;
868 int ret = -1; 883 int ret = -1;
869 884
870 rcu_read_lock(); 885 rcu_read_lock();
@@ -881,7 +896,12 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
881 goto drop; 896 goto drop;
882 if (iptunnel_pull_header(skb, 0, tpi->proto, false)) 897 if (iptunnel_pull_header(skb, 0, tpi->proto, false))
883 goto drop; 898 goto drop;
884 ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate, 899 if (t->parms.collect_md) {
900 tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
901 if (!tun_dst)
902 return 0;
903 }
904 ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
885 log_ecn_error); 905 log_ecn_error);
886 } 906 }
887 907
@@ -1012,8 +1032,16 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
1012 int mtu; 1032 int mtu;
1013 unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; 1033 unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
1014 unsigned int max_headroom = psh_hlen; 1034 unsigned int max_headroom = psh_hlen;
1035 u8 hop_limit;
1015 int err = -1; 1036 int err = -1;
1016 1037
1038 if (t->parms.collect_md) {
1039 hop_limit = skb_tunnel_info(skb)->key.ttl;
1040 goto route_lookup;
1041 } else {
1042 hop_limit = t->parms.hop_limit;
1043 }
1044
1017 /* NBMA tunnel */ 1045 /* NBMA tunnel */
1018 if (ipv6_addr_any(&t->parms.raddr)) { 1046 if (ipv6_addr_any(&t->parms.raddr)) {
1019 struct in6_addr *addr6; 1047 struct in6_addr *addr6;
@@ -1043,6 +1071,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
1043 goto tx_err_link_failure; 1071 goto tx_err_link_failure;
1044 1072
1045 if (!dst) { 1073 if (!dst) {
1074route_lookup:
1046 dst = ip6_route_output(net, NULL, fl6); 1075 dst = ip6_route_output(net, NULL, fl6);
1047 1076
1048 if (dst->error) 1077 if (dst->error)
@@ -1053,6 +1082,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
1053 dst = NULL; 1082 dst = NULL;
1054 goto tx_err_link_failure; 1083 goto tx_err_link_failure;
1055 } 1084 }
1085 if (t->parms.collect_md &&
1086 ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
1087 &fl6->daddr, 0, &fl6->saddr))
1088 goto tx_err_link_failure;
1056 ndst = dst; 1089 ndst = dst;
1057 } 1090 }
1058 1091
@@ -1071,7 +1104,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
1071 } 1104 }
1072 if (mtu < IPV6_MIN_MTU) 1105 if (mtu < IPV6_MIN_MTU)
1073 mtu = IPV6_MIN_MTU; 1106 mtu = IPV6_MIN_MTU;
1074 if (skb_dst(skb)) 1107 if (skb_dst(skb) && !t->parms.collect_md)
1075 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); 1108 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
1076 if (skb->len > mtu && !skb_is_gso(skb)) { 1109 if (skb->len > mtu && !skb_is_gso(skb)) {
1077 *pmtu = mtu; 1110 *pmtu = mtu;
@@ -1111,8 +1144,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
1111 skb = new_skb; 1144 skb = new_skb;
1112 } 1145 }
1113 1146
1114 if (!fl6->flowi6_mark && ndst) 1147 if (t->parms.collect_md) {
1115 dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); 1148 if (t->encap.type != TUNNEL_ENCAP_NONE)
1149 goto tx_err_dst_release;
1150 } else {
1151 if (!fl6->flowi6_mark && ndst)
1152 dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
1153 }
1116 skb_dst_set(skb, dst); 1154 skb_dst_set(skb, dst);
1117 1155
1118 if (encap_limit >= 0) { 1156 if (encap_limit >= 0) {
@@ -1137,7 +1175,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
1137 ipv6h = ipv6_hdr(skb); 1175 ipv6h = ipv6_hdr(skb);
1138 ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), 1176 ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
1139 ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); 1177 ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
1140 ipv6h->hop_limit = t->parms.hop_limit; 1178 ipv6h->hop_limit = hop_limit;
1141 ipv6h->nexthdr = proto; 1179 ipv6h->nexthdr = proto;
1142 ipv6h->saddr = fl6->saddr; 1180 ipv6h->saddr = fl6->saddr;
1143 ipv6h->daddr = fl6->daddr; 1181 ipv6h->daddr = fl6->daddr;
@@ -1170,19 +1208,34 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
1170 if (tproto != IPPROTO_IPIP && tproto != 0) 1208 if (tproto != IPPROTO_IPIP && tproto != 0)
1171 return -1; 1209 return -1;
1172 1210
1173 if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) 1211 dsfield = ipv4_get_dsfield(iph);
1174 encap_limit = t->parms.encap_limit;
1175 1212
1176 memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); 1213 if (t->parms.collect_md) {
1177 fl6.flowi6_proto = IPPROTO_IPIP; 1214 struct ip_tunnel_info *tun_info;
1215 const struct ip_tunnel_key *key;
1178 1216
1179 dsfield = ipv4_get_dsfield(iph); 1217 tun_info = skb_tunnel_info(skb);
1218 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
1219 ip_tunnel_info_af(tun_info) != AF_INET6))
1220 return -1;
1221 key = &tun_info->key;
1222 memset(&fl6, 0, sizeof(fl6));
1223 fl6.flowi6_proto = IPPROTO_IPIP;
1224 fl6.daddr = key->u.ipv6.dst;
1225 fl6.flowlabel = key->label;
1226 } else {
1227 if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
1228 encap_limit = t->parms.encap_limit;
1180 1229
1181 if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) 1230 memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
1182 fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) 1231 fl6.flowi6_proto = IPPROTO_IPIP;
1183 & IPV6_TCLASS_MASK; 1232
1184 if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) 1233 if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
1185 fl6.flowi6_mark = skb->mark; 1234 fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
1235 & IPV6_TCLASS_MASK;
1236 if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
1237 fl6.flowi6_mark = skb->mark;
1238 }
1186 1239
1187 if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) 1240 if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
1188 return -1; 1241 return -1;
@@ -1220,29 +1273,47 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
1220 ip6_tnl_addr_conflict(t, ipv6h)) 1273 ip6_tnl_addr_conflict(t, ipv6h))
1221 return -1; 1274 return -1;
1222 1275
1223 offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); 1276 dsfield = ipv6_get_dsfield(ipv6h);
1224 if (offset > 0) { 1277
1225 struct ipv6_tlv_tnl_enc_lim *tel; 1278 if (t->parms.collect_md) {
1226 tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; 1279 struct ip_tunnel_info *tun_info;
1227 if (tel->encap_limit == 0) { 1280 const struct ip_tunnel_key *key;
1228 icmpv6_send(skb, ICMPV6_PARAMPROB, 1281
1229 ICMPV6_HDR_FIELD, offset + 2); 1282 tun_info = skb_tunnel_info(skb);
1283 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
1284 ip_tunnel_info_af(tun_info) != AF_INET6))
1230 return -1; 1285 return -1;
1286 key = &tun_info->key;
1287 memset(&fl6, 0, sizeof(fl6));
1288 fl6.flowi6_proto = IPPROTO_IPV6;
1289 fl6.daddr = key->u.ipv6.dst;
1290 fl6.flowlabel = key->label;
1291 } else {
1292 offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
1293 if (offset > 0) {
1294 struct ipv6_tlv_tnl_enc_lim *tel;
1295
1296 tel = (void *)&skb_network_header(skb)[offset];
1297 if (tel->encap_limit == 0) {
1298 icmpv6_send(skb, ICMPV6_PARAMPROB,
1299 ICMPV6_HDR_FIELD, offset + 2);
1300 return -1;
1301 }
1302 encap_limit = tel->encap_limit - 1;
1303 } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
1304 encap_limit = t->parms.encap_limit;
1231 } 1305 }
1232 encap_limit = tel->encap_limit - 1;
1233 } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
1234 encap_limit = t->parms.encap_limit;
1235 1306
1236 memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); 1307 memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
1237 fl6.flowi6_proto = IPPROTO_IPV6; 1308 fl6.flowi6_proto = IPPROTO_IPV6;
1238 1309
1239 dsfield = ipv6_get_dsfield(ipv6h); 1310 if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
1240 if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) 1311 fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK);
1241 fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); 1312 if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
1242 if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) 1313 fl6.flowlabel |= ip6_flowlabel(ipv6h);
1243 fl6.flowlabel |= ip6_flowlabel(ipv6h); 1314 if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
1244 if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) 1315 fl6.flowi6_mark = skb->mark;
1245 fl6.flowi6_mark = skb->mark; 1316 }
1246 1317
1247 if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) 1318 if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
1248 return -1; 1319 return -1;
@@ -1741,6 +1812,10 @@ static int ip6_tnl_dev_init(struct net_device *dev)
1741 if (err) 1812 if (err)
1742 return err; 1813 return err;
1743 ip6_tnl_link_config(t); 1814 ip6_tnl_link_config(t);
1815 if (t->parms.collect_md) {
1816 dev->features |= NETIF_F_NETNS_LOCAL;
1817 netif_keep_dst(dev);
1818 }
1744 return 0; 1819 return 0;
1745} 1820}
1746 1821
@@ -1811,6 +1886,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
1811 1886
1812 if (data[IFLA_IPTUN_PROTO]) 1887 if (data[IFLA_IPTUN_PROTO])
1813 parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); 1888 parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
1889
1890 if (data[IFLA_IPTUN_COLLECT_METADATA])
1891 parms->collect_md = true;
1814} 1892}
1815 1893
1816static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[], 1894static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
@@ -1850,6 +1928,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
1850 struct nlattr *tb[], struct nlattr *data[]) 1928 struct nlattr *tb[], struct nlattr *data[])
1851{ 1929{
1852 struct net *net = dev_net(dev); 1930 struct net *net = dev_net(dev);
1931 struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
1853 struct ip6_tnl *nt, *t; 1932 struct ip6_tnl *nt, *t;
1854 struct ip_tunnel_encap ipencap; 1933 struct ip_tunnel_encap ipencap;
1855 1934
@@ -1864,9 +1943,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
1864 1943
1865 ip6_tnl_netlink_parms(data, &nt->parms); 1944 ip6_tnl_netlink_parms(data, &nt->parms);
1866 1945
1867 t = ip6_tnl_locate(net, &nt->parms, 0); 1946 if (nt->parms.collect_md) {
1868 if (!IS_ERR(t)) 1947 if (rtnl_dereference(ip6n->collect_md_tun))
1869 return -EEXIST; 1948 return -EEXIST;
1949 } else {
1950 t = ip6_tnl_locate(net, &nt->parms, 0);
1951 if (!IS_ERR(t))
1952 return -EEXIST;
1953 }
1870 1954
1871 return ip6_tnl_create2(dev); 1955 return ip6_tnl_create2(dev);
1872} 1956}
@@ -1890,6 +1974,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
1890 return err; 1974 return err;
1891 } 1975 }
1892 ip6_tnl_netlink_parms(data, &p); 1976 ip6_tnl_netlink_parms(data, &p);
1977 if (p.collect_md)
1978 return -EINVAL;
1893 1979
1894 t = ip6_tnl_locate(net, &p, 0); 1980 t = ip6_tnl_locate(net, &p, 0);
1895 if (!IS_ERR(t)) { 1981 if (!IS_ERR(t)) {
@@ -1937,6 +2023,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev)
1937 nla_total_size(2) + 2023 nla_total_size(2) +
1938 /* IFLA_IPTUN_ENCAP_DPORT */ 2024 /* IFLA_IPTUN_ENCAP_DPORT */
1939 nla_total_size(2) + 2025 nla_total_size(2) +
2026 /* IFLA_IPTUN_COLLECT_METADATA */
2027 nla_total_size(0) +
1940 0; 2028 0;
1941} 2029}
1942 2030
@@ -1955,16 +2043,15 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
1955 nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto)) 2043 nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
1956 goto nla_put_failure; 2044 goto nla_put_failure;
1957 2045
1958 if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, 2046 if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) ||
1959 tunnel->encap.type) || 2047 nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) ||
1960 nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, 2048 nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) ||
1961 tunnel->encap.sport) || 2049 nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags))
1962 nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
1963 tunnel->encap.dport) ||
1964 nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
1965 tunnel->encap.flags))
1966 goto nla_put_failure; 2050 goto nla_put_failure;
1967 2051
2052 if (parm->collect_md)
2053 if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
2054 goto nla_put_failure;
1968 return 0; 2055 return 0;
1969 2056
1970nla_put_failure: 2057nla_put_failure:
@@ -1992,6 +2079,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
1992 [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, 2079 [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
1993 [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, 2080 [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
1994 [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, 2081 [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
2082 [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
1995}; 2083};
1996 2084
1997static struct rtnl_link_ops ip6_link_ops __read_mostly = { 2085static struct rtnl_link_ops ip6_link_ops __read_mostly = {
@@ -2033,7 +2121,7 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net)
2033 if (dev->rtnl_link_ops == &ip6_link_ops) 2121 if (dev->rtnl_link_ops == &ip6_link_ops)
2034 unregister_netdevice_queue(dev, &list); 2122 unregister_netdevice_queue(dev, &list);
2035 2123
2036 for (h = 0; h < HASH_SIZE; h++) { 2124 for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) {
2037 t = rtnl_dereference(ip6n->tnls_r_l[h]); 2125 t = rtnl_dereference(ip6n->tnls_r_l[h]);
2038 while (t) { 2126 while (t) {
2039 /* If dev is in the same netns, it has already 2127 /* If dev is in the same netns, it has already
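
The collect_md additions throughout this file turn ip6_tnl into an optional metadata-based ("external") tunnel: at most one such device exists per namespace (the collect_md_tun pointer), it matches any incoming flow on receive, and on transmit the endpoints, hop limit and flow label come from per-packet tunnel metadata rather than the device's fixed parms. A toy model of the transmit-side decision; nothing here is kernel API:

#include <stdio.h>

struct toy_tunnel {
	int collect_md;
	const char *fixed_remote;
};

/* In metadata mode the device has no fixed remote; each packet carries its
 * own destination (set, for example, by a routing rule or a BPF program). */
static const char *tx_remote(const struct toy_tunnel *t, const char *md_remote)
{
	return t->collect_md ? md_remote : t->fixed_remote;
}

int main(void)
{
	struct toy_tunnel classic = { 0, "2001:db8::2" };
	struct toy_tunnel meta    = { 1, NULL };

	printf("classic ip6tnl sends to    %s\n", tx_remote(&classic, "2001:db8::99"));
	printf("collect_md ip6tnl sends to %s\n", tx_remote(&meta, "2001:db8::99"));
	return 0;
}
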
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 5bd3afdcc771..8a02ca8a11af 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -50,14 +50,14 @@
50#include <net/net_namespace.h> 50#include <net/net_namespace.h>
51#include <net/netns/generic.h> 51#include <net/netns/generic.h>
52 52
53#define HASH_SIZE_SHIFT 5 53#define IP6_VTI_HASH_SIZE_SHIFT 5
54#define HASH_SIZE (1 << HASH_SIZE_SHIFT) 54#define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT)
55 55
56static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) 56static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
57{ 57{
58 u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); 58 u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
59 59
60 return hash_32(hash, HASH_SIZE_SHIFT); 60 return hash_32(hash, IP6_VTI_HASH_SIZE_SHIFT);
61} 61}
62 62
63static int vti6_dev_init(struct net_device *dev); 63static int vti6_dev_init(struct net_device *dev);
@@ -69,7 +69,7 @@ struct vti6_net {
69 /* the vti6 tunnel fallback device */ 69 /* the vti6 tunnel fallback device */
70 struct net_device *fb_tnl_dev; 70 struct net_device *fb_tnl_dev;
71 /* lists for storing tunnels in use */ 71 /* lists for storing tunnels in use */
72 struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; 72 struct ip6_tnl __rcu *tnls_r_l[IP6_VTI_HASH_SIZE];
73 struct ip6_tnl __rcu *tnls_wc[1]; 73 struct ip6_tnl __rcu *tnls_wc[1];
74 struct ip6_tnl __rcu **tnls[2]; 74 struct ip6_tnl __rcu **tnls[2];
75}; 75};
@@ -1051,7 +1051,7 @@ static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n)
1051 struct ip6_tnl *t; 1051 struct ip6_tnl *t;
1052 LIST_HEAD(list); 1052 LIST_HEAD(list);
1053 1053
1054 for (h = 0; h < HASH_SIZE; h++) { 1054 for (h = 0; h < IP6_VTI_HASH_SIZE; h++) {
1055 t = rtnl_dereference(ip6n->tnls_r_l[h]); 1055 t = rtnl_dereference(ip6n->tnls_r_l[h]);
1056 while (t) { 1056 while (t) {
1057 unregister_netdevice_queue(t->dev, &list); 1057 unregister_netdevice_queue(t->dev, &list);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index d64ee7e83664..75c1fc54f188 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1739,6 +1739,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
1739 continue; 1739 continue;
1740 } 1740 }
1741 1741
1742 /* Based on RFC3810 6.1. Should not send source-list change
1743 * records when there is a filter mode change.
1744 */
1745 if (((gdeleted && pmc->mca_sfmode == MCAST_EXCLUDE) ||
1746 (!gdeleted && pmc->mca_crcount)) &&
1747 (type == MLD2_ALLOW_NEW_SOURCES ||
1748 type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount)
1749 goto decrease_sf_crcount;
1750
1742 /* clear marks on query responses */ 1751 /* clear marks on query responses */
1743 if (isquery) 1752 if (isquery)
1744 psf->sf_gsresp = 0; 1753 psf->sf_gsresp = 0;
@@ -1766,6 +1775,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
1766 scount++; stotal++; 1775 scount++; stotal++;
1767 if ((type == MLD2_ALLOW_NEW_SOURCES || 1776 if ((type == MLD2_ALLOW_NEW_SOURCES ||
1768 type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { 1777 type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
1778decrease_sf_crcount:
1769 psf->sf_crcount--; 1779 psf->sf_crcount--;
1770 if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { 1780 if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
1771 if (psf_prev) 1781 if (psf_prev)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index fe65cdc28a45..d8e671457d10 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -67,7 +67,6 @@
67#include <net/flow.h> 67#include <net/flow.h>
68#include <net/ip6_checksum.h> 68#include <net/ip6_checksum.h>
69#include <net/inet_common.h> 69#include <net/inet_common.h>
70#include <net/l3mdev.h>
71#include <linux/proc_fs.h> 70#include <linux/proc_fs.h>
72 71
73#include <linux/netfilter.h> 72#include <linux/netfilter.h>
@@ -457,11 +456,9 @@ static void ndisc_send_skb(struct sk_buff *skb,
457 456
458 if (!dst) { 457 if (!dst) {
459 struct flowi6 fl6; 458 struct flowi6 fl6;
460 int oif = l3mdev_fib_oif(skb->dev); 459 int oif = skb->dev->ifindex;
461 460
462 icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); 461 icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif);
463 if (oif != skb->dev->ifindex)
464 fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
465 dst = icmp6_dst_alloc(skb->dev, &fl6); 462 dst = icmp6_dst_alloc(skb->dev, &fl6);
466 if (IS_ERR(dst)) { 463 if (IS_ERR(dst)) {
467 kfree_skb(skb); 464 kfree_skb(skb);
@@ -1538,7 +1535,6 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
1538 int rd_len; 1535 int rd_len;
1539 u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, 1536 u8 ha_buf[MAX_ADDR_LEN], *ha = NULL,
1540 ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; 1537 ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
1541 int oif = l3mdev_fib_oif(dev);
1542 bool ret; 1538 bool ret;
1543 1539
1544 if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { 1540 if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
@@ -1555,10 +1551,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
1555 } 1551 }
1556 1552
1557 icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, 1553 icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
1558 &saddr_buf, &ipv6_hdr(skb)->saddr, oif); 1554 &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
1559
1560 if (oif != skb->dev->ifindex)
1561 fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
1562 1555
1563 dst = ip6_route_output(net, NULL, &fl6); 1556 dst = ip6_route_output(net, NULL, &fl6);
1564 if (dst->error) { 1557 if (dst->error) {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 552fac2f390a..55aacea24396 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -190,7 +190,7 @@ static struct nf_loginfo trace_loginfo = {
190 .u = { 190 .u = {
191 .log = { 191 .log = {
192 .level = LOGLEVEL_WARNING, 192 .level = LOGLEVEL_WARNING,
193 .logflags = NF_LOG_MASK, 193 .logflags = NF_LOG_DEFAULT_MASK,
194 }, 194 },
195 }, 195 },
196}; 196};
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 1aa5848764a7..963ee3848675 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -115,7 +115,7 @@ static unsigned int ipv6_helper(void *priv,
115 help = nfct_help(ct); 115 help = nfct_help(ct);
116 if (!help) 116 if (!help)
117 return NF_ACCEPT; 117 return NF_ACCEPT;
118 /* rcu_read_lock()ed by nf_hook_slow */ 118 /* rcu_read_lock()ed by nf_hook_thresh */
119 helper = rcu_dereference(help->helper); 119 helper = rcu_dereference(help->helper);
120 if (!helper) 120 if (!helper)
121 return NF_ACCEPT; 121 return NF_ACCEPT;
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 660bc10c7a9c..f5a61bc3ec2b 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -165,7 +165,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
165 return -NF_ACCEPT; 165 return -NF_ACCEPT;
166 } 166 }
167 167
168 /* rcu_read_lock()ed by nf_hook_slow */ 168 /* rcu_read_lock()ed by nf_hook_thresh */
169 inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); 169 inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum);
170 170
171 /* Ordinarily, we'd expect the inverted tupleproto, but it's 171 /* Ordinarily, we'd expect the inverted tupleproto, but it's
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 8dd869642f45..57d86066a13b 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = {
30 .u = { 30 .u = {
31 .log = { 31 .log = {
32 .level = LOGLEVEL_NOTICE, 32 .level = LOGLEVEL_NOTICE,
33 .logflags = NF_LOG_MASK, 33 .logflags = NF_LOG_DEFAULT_MASK,
34 }, 34 },
35 }, 35 },
36}; 36};
@@ -52,7 +52,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
52 if (info->type == NF_LOG_TYPE_LOG) 52 if (info->type == NF_LOG_TYPE_LOG)
53 logflags = info->u.log.logflags; 53 logflags = info->u.log.logflags;
54 else 54 else
55 logflags = NF_LOG_MASK; 55 logflags = NF_LOG_DEFAULT_MASK;
56 56
57 ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); 57 ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
58 if (ih == NULL) { 58 if (ih == NULL) {
@@ -84,7 +84,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
84 } 84 }
85 85
86 /* Max length: 48 "OPT (...) " */ 86 /* Max length: 48 "OPT (...) " */
87 if (logflags & XT_LOG_IPOPT) 87 if (logflags & NF_LOG_IPOPT)
88 nf_log_buf_add(m, "OPT ( "); 88 nf_log_buf_add(m, "OPT ( ");
89 89
90 switch (currenthdr) { 90 switch (currenthdr) {
@@ -121,7 +121,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
121 case IPPROTO_ROUTING: 121 case IPPROTO_ROUTING:
122 case IPPROTO_HOPOPTS: 122 case IPPROTO_HOPOPTS:
123 if (fragment) { 123 if (fragment) {
124 if (logflags & XT_LOG_IPOPT) 124 if (logflags & NF_LOG_IPOPT)
125 nf_log_buf_add(m, ")"); 125 nf_log_buf_add(m, ")");
126 return; 126 return;
127 } 127 }
@@ -129,7 +129,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
129 break; 129 break;
130 /* Max Length */ 130 /* Max Length */
131 case IPPROTO_AH: 131 case IPPROTO_AH:
132 if (logflags & XT_LOG_IPOPT) { 132 if (logflags & NF_LOG_IPOPT) {
133 struct ip_auth_hdr _ahdr; 133 struct ip_auth_hdr _ahdr;
134 const struct ip_auth_hdr *ah; 134 const struct ip_auth_hdr *ah;
135 135
@@ -161,7 +161,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
161 hdrlen = (hp->hdrlen+2)<<2; 161 hdrlen = (hp->hdrlen+2)<<2;
162 break; 162 break;
163 case IPPROTO_ESP: 163 case IPPROTO_ESP:
164 if (logflags & XT_LOG_IPOPT) { 164 if (logflags & NF_LOG_IPOPT) {
165 struct ip_esp_hdr _esph; 165 struct ip_esp_hdr _esph;
166 const struct ip_esp_hdr *eh; 166 const struct ip_esp_hdr *eh;
167 167
@@ -194,7 +194,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
194 nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr); 194 nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
195 return; 195 return;
196 } 196 }
197 if (logflags & XT_LOG_IPOPT) 197 if (logflags & NF_LOG_IPOPT)
198 nf_log_buf_add(m, ") "); 198 nf_log_buf_add(m, ") ");
199 199
200 currenthdr = hp->nexthdr; 200 currenthdr = hp->nexthdr;
@@ -277,7 +277,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
277 } 277 }
278 278
279 /* Max length: 15 "UID=4294967295 " */ 279 /* Max length: 15 "UID=4294967295 " */
280 if ((logflags & XT_LOG_UID) && recurse) 280 if ((logflags & NF_LOG_UID) && recurse)
281 nf_log_dump_sk_uid_gid(m, skb->sk); 281 nf_log_dump_sk_uid_gid(m, skb->sk);
282 282
283 /* Max length: 16 "MARK=0xFFFFFFFF " */ 283 /* Max length: 16 "MARK=0xFFFFFFFF " */
@@ -295,7 +295,7 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m,
295 if (info->type == NF_LOG_TYPE_LOG) 295 if (info->type == NF_LOG_TYPE_LOG)
296 logflags = info->u.log.logflags; 296 logflags = info->u.log.logflags;
297 297
298 if (!(logflags & XT_LOG_MACDECODE)) 298 if (!(logflags & NF_LOG_MACDECODE))
299 goto fallback; 299 goto fallback;
300 300
301 switch (dev->type) { 301 switch (dev->type) {
@@ -379,8 +379,7 @@ static struct nf_logger nf_ip6_logger __read_mostly = {
379 379
380static int __net_init nf_log_ipv6_net_init(struct net *net) 380static int __net_init nf_log_ipv6_net_init(struct net *net)
381{ 381{
382 nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); 382 return nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
383 return 0;
384} 383}
385 384
386static void __net_exit nf_log_ipv6_net_exit(struct net *net) 385static void __net_exit nf_log_ipv6_net_exit(struct net *net)
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 30b22f4dff55..d6e4ba5de916 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -22,9 +22,7 @@ static unsigned int nft_do_chain_ipv6(void *priv,
22{ 22{
23 struct nft_pktinfo pkt; 23 struct nft_pktinfo pkt;
24 24
25 /* malformed packet, drop it */ 25 nft_set_pktinfo_ipv6(&pkt, skb, state);
26 if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
27 return NF_DROP;
28 26
29 return nft_do_chain(&pkt, priv); 27 return nft_do_chain(&pkt, priv);
30} 28}
@@ -102,7 +100,10 @@ static int __init nf_tables_ipv6_init(void)
102{ 100{
103 int ret; 101 int ret;
104 102
105 nft_register_chain_type(&filter_ipv6); 103 ret = nft_register_chain_type(&filter_ipv6);
104 if (ret < 0)
105 return ret;
106
106 ret = register_pernet_subsys(&nf_tables_ipv6_net_ops); 107 ret = register_pernet_subsys(&nf_tables_ipv6_net_ops);
107 if (ret < 0) 108 if (ret < 0)
108 nft_unregister_chain_type(&filter_ipv6); 109 nft_unregister_chain_type(&filter_ipv6);
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 2535223ba956..f2727475895e 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -33,9 +33,7 @@ static unsigned int nf_route_table_hook(void *priv,
33 u32 mark, flowlabel; 33 u32 mark, flowlabel;
34 int err; 34 int err;
35 35
36 /* malformed packet, drop it */ 36 nft_set_pktinfo_ipv6(&pkt, skb, state);
37 if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
38 return NF_DROP;
39 37
40 /* save source/dest address, mark, hoplimit, flowlabel, priority */ 38 /* save source/dest address, mark, hoplimit, flowlabel, priority */
41 memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); 39 memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 462f2a76b5c2..7cca8ac66fe9 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -148,6 +148,13 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
148 ipv6_hdr(skb)->payload_len = htons(len); 148 ipv6_hdr(skb)->payload_len = htons(len);
149 IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); 149 IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
150 150
151 /* if egress device is enslaved to an L3 master device pass the
152 * skb to its handler for processing
153 */
154 skb = l3mdev_ip6_out(sk, skb);
155 if (unlikely(!skb))
156 return 0;
157
151 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, 158 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
152 net, sk, skb, NULL, skb_dst(skb)->dev, 159 net, sk, skb, NULL, skb_dst(skb)->dev,
153 dst_output); 160 dst_output);
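
The l3mdev_ip6_out() call added above (and again in rawv6_send_hdrinc() further down) follows a consume-or-pass-through convention: the hook may take ownership of the skb and return NULL, in which case the caller reports success without running the netfilter LOCAL_OUT hook. A hedged plain-C sketch of that convention, with invented types and a stand-in hook rather than the kernel l3mdev API:

#include <stdio.h>
#include <stdlib.h>

struct buf { int handled_by_master; };    /* stand-in for the skb */

/* Stand-in for l3mdev_ip6_out(): either passes the buffer through
 * unchanged or consumes it and returns NULL.
 */
static struct buf *l3_master_out_hook(struct buf *b)
{
        if (b->handled_by_master) {
                free(b);                  /* the hook now owns and frees it */
                return NULL;
        }
        return b;                         /* untouched: continue normally */
}

static int local_out(struct buf *b)
{
        b = l3_master_out_hook(b);
        if (!b)
                return 0;                 /* consumed by the hook: done */

        puts("continue to NF_INET_LOCAL_OUT and dst_output");
        free(b);
        return 0;
}

int main(void)
{
        struct buf *b = calloc(1, sizeof(*b));

        if (!b)
                return 1;
        b->handled_by_master = 1;         /* pretend the device is enslaved */
        return local_out(b);
}
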
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 679253d0af84..cc8e3ae9ca73 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -30,6 +30,11 @@
30#include <net/transp_v6.h> 30#include <net/transp_v6.h>
31#include <net/ipv6.h> 31#include <net/ipv6.h>
32 32
33#define MAX4(a, b, c, d) \
34 max_t(u32, max_t(u32, a, b), max_t(u32, c, d))
35#define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \
36 IPSTATS_MIB_MAX, ICMP_MIB_MAX)
37
33static int sockstat6_seq_show(struct seq_file *seq, void *v) 38static int sockstat6_seq_show(struct seq_file *seq, void *v)
34{ 39{
35 struct net *net = seq->private; 40 struct net *net = seq->private;
@@ -191,25 +196,34 @@ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib,
191 atomic_long_t *smib, 196 atomic_long_t *smib,
192 const struct snmp_mib *itemlist) 197 const struct snmp_mib *itemlist)
193{ 198{
199 unsigned long buff[SNMP_MIB_MAX];
194 int i; 200 int i;
195 unsigned long val;
196 201
197 for (i = 0; itemlist[i].name; i++) { 202 if (pcpumib) {
198 val = pcpumib ? 203 memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
199 snmp_fold_field(pcpumib, itemlist[i].entry) : 204
200 atomic_long_read(smib + itemlist[i].entry); 205 snmp_get_cpu_field_batch(buff, itemlist, pcpumib);
201 seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, val); 206 for (i = 0; itemlist[i].name; i++)
207 seq_printf(seq, "%-32s\t%lu\n",
208 itemlist[i].name, buff[i]);
209 } else {
210 for (i = 0; itemlist[i].name; i++)
211 seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name,
212 atomic_long_read(smib + itemlist[i].entry));
202 } 213 }
203} 214}
204 215
205static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, 216static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib,
206 const struct snmp_mib *itemlist, size_t syncpoff) 217 const struct snmp_mib *itemlist, size_t syncpoff)
207{ 218{
219 u64 buff64[SNMP_MIB_MAX];
208 int i; 220 int i;
209 221
222 memset(buff64, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
223
224 snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff);
210 for (i = 0; itemlist[i].name; i++) 225 for (i = 0; itemlist[i].name; i++)
211 seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, 226 seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]);
212 snmp_fold_field64(mib, itemlist[i].entry, syncpoff));
213} 227}
214 228
215static int snmp6_seq_show(struct seq_file *seq, void *v) 229static int snmp6_seq_show(struct seq_file *seq, void *v)
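
The snmp6 changes above size one stack buffer with SNMP_MIB_MAX (computed by the new MAX4() macro over the four MIB sizes) and fill it in a single batched pass over the per-CPU counters, instead of calling snmp_fold_field() once per item. A minimal userspace sketch of that batched fold, assuming an invented CPU/counter layout and sample item names:

#include <stdio.h>
#include <string.h>

#define NR_CPUS  4                        /* invented layout for the demo */
#define NR_ITEMS 3

static const unsigned long percpu[NR_CPUS][NR_ITEMS] = {
        { 1, 10, 100 }, { 2, 20, 200 }, { 3, 30, 300 }, { 4, 40, 400 },
};
static const char *names[NR_ITEMS] = { "Ip6InReceives", "Ip6InDelivers", "Ip6OutRequests" };

int main(void)
{
        unsigned long buff[NR_ITEMS];     /* sized for the largest item list */
        int cpu, i;

        memset(buff, 0, sizeof(buff));
        for (cpu = 0; cpu < NR_CPUS; cpu++)        /* one pass over the CPUs ... */
                for (i = 0; i < NR_ITEMS; i++)
                        buff[i] += percpu[cpu][i]; /* ... accumulating every counter */

        for (i = 0; i < NR_ITEMS; i++)             /* print from the batch buffer */
                printf("%-32s\t%lu\n", names[i], buff[i]);
        return 0;
}
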
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 590dd1f7746f..54404f08efcc 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -653,6 +653,13 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
653 if (err) 653 if (err)
654 goto error_fault; 654 goto error_fault;
655 655
656 /* if egress device is enslaved to an L3 master device pass the
657 * skb to its handler for processing
658 */
659 skb = l3mdev_ip6_out(sk, skb);
660 if (unlikely(!skb))
661 return 0;
662
656 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); 663 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
657 err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, 664 err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
658 NULL, rt->dst.dev, dst_output); 665 NULL, rt->dst.dev, dst_output);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 269218aacbea..bdbc38e8bf29 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *
1147 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); 1147 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1148} 1148}
1149 1149
1150static struct dst_entry *ip6_route_input_lookup(struct net *net, 1150struct dst_entry *ip6_route_input_lookup(struct net *net,
1151 struct net_device *dev, 1151 struct net_device *dev,
1152 struct flowi6 *fl6, int flags) 1152 struct flowi6 *fl6, int flags)
1153{ 1153{
1154 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) 1154 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1155 flags |= RT6_LOOKUP_F_IFACE; 1155 flags |= RT6_LOOKUP_F_IFACE;
1156 1156
1157 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); 1157 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1158} 1158}
1159EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1159 1160
1160void ip6_route_input(struct sk_buff *skb) 1161void ip6_route_input(struct sk_buff *skb)
1161{ 1162{
@@ -1164,7 +1165,7 @@ void ip6_route_input(struct sk_buff *skb)
1164 int flags = RT6_LOOKUP_F_HAS_SADDR; 1165 int flags = RT6_LOOKUP_F_HAS_SADDR;
1165 struct ip_tunnel_info *tun_info; 1166 struct ip_tunnel_info *tun_info;
1166 struct flowi6 fl6 = { 1167 struct flowi6 fl6 = {
1167 .flowi6_iif = l3mdev_fib_oif(skb->dev), 1168 .flowi6_iif = skb->dev->ifindex,
1168 .daddr = iph->daddr, 1169 .daddr = iph->daddr,
1169 .saddr = iph->saddr, 1170 .saddr = iph->saddr,
1170 .flowlabel = ip6_flowinfo(iph), 1171 .flowlabel = ip6_flowinfo(iph),
@@ -1188,12 +1189,15 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table
1188struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 1189struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1189 struct flowi6 *fl6, int flags) 1190 struct flowi6 *fl6, int flags)
1190{ 1191{
1191 struct dst_entry *dst;
1192 bool any_src; 1192 bool any_src;
1193 1193
1194 dst = l3mdev_get_rt6_dst(net, fl6); 1194 if (rt6_need_strict(&fl6->daddr)) {
1195 if (dst) 1195 struct dst_entry *dst;
1196 return dst; 1196
1197 dst = l3mdev_link_scope_lookup(net, fl6);
1198 if (dst)
1199 return dst;
1200 }
1197 1201
1198 fl6->flowi6_iif = LOOPBACK_IFINDEX; 1202 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1199 1203
@@ -1604,7 +1608,9 @@ static unsigned int ip6_mtu(const struct dst_entry *dst)
1604 rcu_read_unlock(); 1608 rcu_read_unlock();
1605 1609
1606out: 1610out:
1607 return min_t(unsigned int, mtu, IP6_MAX_MTU); 1611 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1612
1613 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1608} 1614}
1609 1615
1610static struct dst_entry *icmp6_dst_gc_list; 1616static struct dst_entry *icmp6_dst_gc_list;
@@ -2565,8 +2571,16 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2565{ 2571{
2566 u32 tb_id; 2572 u32 tb_id;
2567 struct net *net = dev_net(idev->dev); 2573 struct net *net = dev_net(idev->dev);
2568 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 2574 struct net_device *dev = net->loopback_dev;
2569 DST_NOCOUNT); 2575 struct rt6_info *rt;
2576
2577 /* use L3 Master device as loopback for host routes if device
2578 * is enslaved and address is not link local or multicast
2579 */
2580 if (!rt6_need_strict(addr))
2581 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2582
2583 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2570 if (!rt) 2584 if (!rt)
2571 return ERR_PTR(-ENOMEM); 2585 return ERR_PTR(-ENOMEM);
2572 2586
@@ -3347,11 +3361,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3347 } else { 3361 } else {
3348 fl6.flowi6_oif = oif; 3362 fl6.flowi6_oif = oif;
3349 3363
3350 if (netif_index_is_l3_master(net, oif)) {
3351 fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
3352 FLOWI_FLAG_SKIP_NH_OIF;
3353 }
3354
3355 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); 3364 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3356 } 3365 }
3357 3366
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 182b6a9be29d..b1cdf8009d29 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -62,7 +62,7 @@
62 For comments look at net/ipv4/ip_gre.c --ANK 62 For comments look at net/ipv4/ip_gre.c --ANK
63 */ 63 */
64 64
65#define HASH_SIZE 16 65#define IP6_SIT_HASH_SIZE 16
66#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 66#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
67 67
68static bool log_ecn_error = true; 68static bool log_ecn_error = true;
@@ -78,9 +78,9 @@ static struct rtnl_link_ops sit_link_ops __read_mostly;
78 78
79static int sit_net_id __read_mostly; 79static int sit_net_id __read_mostly;
80struct sit_net { 80struct sit_net {
81 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; 81 struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE];
82 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; 82 struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE];
83 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; 83 struct ip_tunnel __rcu *tunnels_l[IP6_SIT_HASH_SIZE];
84 struct ip_tunnel __rcu *tunnels_wc[1]; 84 struct ip_tunnel __rcu *tunnels_wc[1];
85 struct ip_tunnel __rcu **tunnels[4]; 85 struct ip_tunnel __rcu **tunnels[4];
86 86
@@ -1126,7 +1126,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
1126} 1126}
1127#endif 1127#endif
1128 1128
1129bool ipip6_valid_ip_proto(u8 ipproto) 1129static bool ipip6_valid_ip_proto(u8 ipproto)
1130{ 1130{
1131 return ipproto == IPPROTO_IPV6 || 1131 return ipproto == IPPROTO_IPV6 ||
1132 ipproto == IPPROTO_IPIP || 1132 ipproto == IPPROTO_IPIP ||
@@ -1783,7 +1783,7 @@ static void __net_exit sit_destroy_tunnels(struct net *net,
1783 1783
1784 for (prio = 1; prio < 4; prio++) { 1784 for (prio = 1; prio < 4; prio++) {
1785 int h; 1785 int h;
1786 for (h = 0; h < HASH_SIZE; h++) { 1786 for (h = 0; h < IP6_SIT_HASH_SIZE; h++) {
1787 struct ip_tunnel *t; 1787 struct ip_tunnel *t;
1788 1788
1789 t = rtnl_dereference(sitn->tunnels[prio][h]); 1789 t = rtnl_dereference(sitn->tunnels[prio][h]);
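
The sit rename above keeps the same 16-bucket tunnel table; only the identifier changes from the generic HASH_SIZE to the more specific IP6_SIT_HASH_SIZE. A standalone check of the bucket hash, with the macro copied from the code above and arbitrary sample addresses:

#include <stdio.h>
#include <stdint.h>

#define IP6_SIT_HASH_SIZE 16
#define HASH(addr) ((((uint32_t)(addr)) ^ (((uint32_t)(addr)) >> 4)) & 0xF)

int main(void)
{
        const uint32_t samples[] = { 0x0a000001, 0xc0a80101, 0x08080808 };  /* arbitrary */
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("addr %08x -> bucket %u of %d\n",
                       (unsigned int)samples[i],
                       (unsigned int)HASH(samples[i]), IP6_SIT_HASH_SIZE);
        return 0;
}
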
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 94f4f89d73e7..5a27ab4eab39 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -671,6 +671,7 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
671 NULL, skb); 671 NULL, skb);
672 672
673 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 673 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
674 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
674 net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", 675 net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n",
675 genhash ? "failed" : "mismatch", 676 genhash ? "failed" : "mismatch",
676 &ip6h->saddr, ntohs(th->source), 677 &ip6h->saddr, ntohs(th->source),
@@ -817,12 +818,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
817 fl6.flowi6_proto = IPPROTO_TCP; 818 fl6.flowi6_proto = IPPROTO_TCP;
818 if (rt6_need_strict(&fl6.daddr) && !oif) 819 if (rt6_need_strict(&fl6.daddr) && !oif)
819 fl6.flowi6_oif = tcp_v6_iif(skb); 820 fl6.flowi6_oif = tcp_v6_iif(skb);
820 else { 821 else
821 if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) 822 fl6.flowi6_oif = oif ? : skb->skb_iif;
822 oif = skb->skb_iif;
823
824 fl6.flowi6_oif = oif;
825 }
826 823
827 fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); 824 fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
828 fl6.fl6_dport = t1->dest; 825 fl6.fl6_dport = t1->dest;
@@ -1193,6 +1190,16 @@ out:
1193 return NULL; 1190 return NULL;
1194} 1191}
1195 1192
1193static void tcp_v6_restore_cb(struct sk_buff *skb)
1194{
1195 /* We need to move header back to the beginning if xfrm6_policy_check()
1196 * and tcp_v6_fill_cb() are going to be called again.
1197 * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
1198 */
1199 memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
1200 sizeof(struct inet6_skb_parm));
1201}
1202
1196/* The socket must have it's spinlock held when we get 1203/* The socket must have it's spinlock held when we get
1197 * here, unless it is a TCP_LISTEN socket. 1204 * here, unless it is a TCP_LISTEN socket.
1198 * 1205 *
@@ -1322,6 +1329,7 @@ ipv6_pktoptions:
1322 np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); 1329 np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
1323 if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { 1330 if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
1324 skb_set_owner_r(opt_skb, sk); 1331 skb_set_owner_r(opt_skb, sk);
1332 tcp_v6_restore_cb(opt_skb);
1325 opt_skb = xchg(&np->pktoptions, opt_skb); 1333 opt_skb = xchg(&np->pktoptions, opt_skb);
1326 } else { 1334 } else {
1327 __kfree_skb(opt_skb); 1335 __kfree_skb(opt_skb);
@@ -1355,15 +1363,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
1355 TCP_SKB_CB(skb)->sacked = 0; 1363 TCP_SKB_CB(skb)->sacked = 0;
1356} 1364}
1357 1365
1358static void tcp_v6_restore_cb(struct sk_buff *skb)
1359{
1360 /* We need to move header back to the beginning if xfrm6_policy_check()
1361 * and tcp_v6_fill_cb() are going to be called again.
1362 */
1363 memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
1364 sizeof(struct inet6_skb_parm));
1365}
1366
1367static int tcp_v6_rcv(struct sk_buff *skb) 1366static int tcp_v6_rcv(struct sk_buff *skb)
1368{ 1367{
1369 const struct tcphdr *th; 1368 const struct tcphdr *th;
@@ -1415,6 +1414,7 @@ process:
1415 sk = req->rsk_listener; 1414 sk = req->rsk_listener;
1416 tcp_v6_fill_cb(skb, hdr, th); 1415 tcp_v6_fill_cb(skb, hdr, th);
1417 if (tcp_v6_inbound_md5_hash(sk, skb)) { 1416 if (tcp_v6_inbound_md5_hash(sk, skb)) {
1417 sk_drops_add(sk, skb);
1418 reqsk_put(req); 1418 reqsk_put(req);
1419 goto discard_it; 1419 goto discard_it;
1420 } 1420 }
@@ -1471,10 +1471,7 @@ process:
1471 if (!sock_owned_by_user(sk)) { 1471 if (!sock_owned_by_user(sk)) {
1472 if (!tcp_prequeue(sk, skb)) 1472 if (!tcp_prequeue(sk, skb))
1473 ret = tcp_v6_do_rcv(sk, skb); 1473 ret = tcp_v6_do_rcv(sk, skb);
1474 } else if (unlikely(sk_add_backlog(sk, skb, 1474 } else if (tcp_add_backlog(sk, skb)) {
1475 sk->sk_rcvbuf + sk->sk_sndbuf))) {
1476 bh_unlock_sock(sk);
1477 __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
1478 goto discard_and_relse; 1475 goto discard_and_relse;
1479 } 1476 }
1480 bh_unlock_sock(sk); 1477 bh_unlock_sock(sk);
@@ -1868,17 +1865,6 @@ void tcp6_proc_exit(struct net *net)
1868} 1865}
1869#endif 1866#endif
1870 1867
1871static void tcp_v6_clear_sk(struct sock *sk, int size)
1872{
1873 struct inet_sock *inet = inet_sk(sk);
1874
1875 /* we do not want to clear pinet6 field, because of RCU lookups */
1876 sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6));
1877
1878 size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
1879 memset(&inet->pinet6 + 1, 0, size);
1880}
1881
1882struct proto tcpv6_prot = { 1868struct proto tcpv6_prot = {
1883 .name = "TCPv6", 1869 .name = "TCPv6",
1884 .owner = THIS_MODULE, 1870 .owner = THIS_MODULE,
@@ -1920,7 +1906,6 @@ struct proto tcpv6_prot = {
1920 .compat_setsockopt = compat_tcp_setsockopt, 1906 .compat_setsockopt = compat_tcp_setsockopt,
1921 .compat_getsockopt = compat_tcp_getsockopt, 1907 .compat_getsockopt = compat_tcp_getsockopt,
1922#endif 1908#endif
1923 .clear_sk = tcp_v6_clear_sk,
1924 .diag_destroy = tcp_abort, 1909 .diag_destroy = tcp_abort,
1925}; 1910};
1926 1911
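
tcp_v6_restore_cb() above is moved earlier so the ipv6_pktoptions path can call it before stashing the skb in np->pktoptions; what it does is copy the IPv6 control-block data back to the start of skb->cb[], where IP6CB() and ip6_datagram_recv_specific_ctl() expect it. A userspace sketch of that move with invented stand-in layouts; only the memmove-back-to-the-front idea mirrors the kernel code:

#include <stdio.h>
#include <string.h>

struct inet6_parm { int iif; int flags; };            /* stand-in for inet6_skb_parm */
struct tcp_parm {                                     /* stand-in, layout invented */
        unsigned int seq;
        struct inet6_parm h6;                         /* IPv6 info parked inside the TCP cb */
};

union cb_area {                                       /* stand-in for skb->cb[] */
        struct inet6_parm ip6;                        /* what IP6CB() expects at offset 0 */
        struct tcp_parm tcp;                          /* what TCP uses while it owns the skb */
};

int main(void)
{
        union cb_area cb;

        cb.tcp.seq = 12345;
        cb.tcp.h6.iif = 2;
        cb.tcp.h6.flags = 0x10;

        /* "restore": move the IPv6 parm back to the start of the cb area */
        memmove(&cb.ip6, &cb.tcp.h6, sizeof(cb.ip6));

        printf("restored iif=%d flags=%#x\n", cb.ip6.iif, cb.ip6.flags);
        return 0;
}
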
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 19ac3a1c308d..9aa7c1c7a9ce 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1424,17 +1424,6 @@ void udp6_proc_exit(struct net *net)
1424} 1424}
1425#endif /* CONFIG_PROC_FS */ 1425#endif /* CONFIG_PROC_FS */
1426 1426
1427void udp_v6_clear_sk(struct sock *sk, int size)
1428{
1429 struct inet_sock *inet = inet_sk(sk);
1430
1431 /* we do not want to clear pinet6 field, because of RCU lookups */
1432 sk_prot_clear_portaddr_nulls(sk, offsetof(struct inet_sock, pinet6));
1433
1434 size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
1435 memset(&inet->pinet6 + 1, 0, size);
1436}
1437
1438/* ------------------------------------------------------------------------ */ 1427/* ------------------------------------------------------------------------ */
1439 1428
1440struct proto udpv6_prot = { 1429struct proto udpv6_prot = {
@@ -1465,7 +1454,7 @@ struct proto udpv6_prot = {
1465 .compat_setsockopt = compat_udpv6_setsockopt, 1454 .compat_setsockopt = compat_udpv6_setsockopt,
1466 .compat_getsockopt = compat_udpv6_getsockopt, 1455 .compat_getsockopt = compat_udpv6_getsockopt,
1467#endif 1456#endif
1468 .clear_sk = udp_v6_clear_sk, 1457 .diag_destroy = udp_abort,
1469}; 1458};
1470 1459
1471static struct inet_protosw udpv6_protosw = { 1460static struct inet_protosw udpv6_protosw = {
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 0682c031ccdc..f6eb1ab34f4b 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -29,8 +29,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
29int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 29int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
30void udpv6_destroy_sock(struct sock *sk); 30void udpv6_destroy_sock(struct sock *sk);
31 31
32void udp_v6_clear_sk(struct sock *sk, int size);
33
34#ifdef CONFIG_PROC_FS 32#ifdef CONFIG_PROC_FS
35int udp6_seq_show(struct seq_file *seq, void *v); 33int udp6_seq_show(struct seq_file *seq, void *v);
36#endif 34#endif
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index fd6ef414899b..47d0d2b87106 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -55,7 +55,6 @@ struct proto udplitev6_prot = {
55 .compat_setsockopt = compat_udpv6_setsockopt, 55 .compat_setsockopt = compat_udpv6_setsockopt,
56 .compat_getsockopt = compat_udpv6_getsockopt, 56 .compat_getsockopt = compat_udpv6_getsockopt,
57#endif 57#endif
58 .clear_sk = udp_v6_clear_sk,
59}; 58};
60 59
61static struct inet_protosw udplite6_protosw = { 60static struct inet_protosw udplite6_protosw = {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 70a86adad875..e0f71c01d728 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -134,7 +134,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
134 nexthdr = nh[nhoff]; 134 nexthdr = nh[nhoff];
135 135
136 if (skb_dst(skb)) 136 if (skb_dst(skb))
137 oif = l3mdev_fib_oif(skb_dst(skb)->dev); 137 oif = skb_dst(skb)->dev->ifindex;
138 138
139 memset(fl6, 0, sizeof(struct flowi6)); 139 memset(fl6, 0, sizeof(struct flowi6));
140 fl6->flowi6_mark = skb->mark; 140 fl6->flowi6_mark = skb->mark;
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index ccc244406fb9..391c3cbd2eed 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -845,9 +845,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
845 if (sock->state != SS_UNCONNECTED) 845 if (sock->state != SS_UNCONNECTED)
846 goto out; 846 goto out;
847 847
848 if ((sk = sock->sk) == NULL)
849 goto out;
850
851 err = -EOPNOTSUPP; 848 err = -EOPNOTSUPP;
852 if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) && 849 if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) &&
853 (sk->sk_type != SOCK_DGRAM)) 850 (sk->sk_type != SOCK_DGRAM))
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
index 5db94d940ecc..87fca36e6c47 100644
--- a/net/kcm/Kconfig
+++ b/net/kcm/Kconfig
@@ -3,6 +3,7 @@ config AF_KCM
3 tristate "KCM sockets" 3 tristate "KCM sockets"
4 depends on INET 4 depends on INET
5 select BPF_SYSCALL 5 select BPF_SYSCALL
6 select STREAM_PARSER
6 ---help--- 7 ---help---
7 KCM (Kernel Connection Multiplexor) sockets provide a method 8 KCM (Kernel Connection Multiplexor) sockets provide a method
8 for multiplexing messages of a message based application 9 for multiplexing messages of a message based application
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 16c2e03bd388..bf75c9231cca 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -155,8 +155,8 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq,
155 seq_printf(seq, 155 seq_printf(seq,
156 " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", 156 " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ",
157 psock->index, 157 psock->index,
158 psock->stats.rx_msgs, 158 psock->strp.stats.rx_msgs,
159 psock->stats.rx_bytes, 159 psock->strp.stats.rx_bytes,
160 psock->stats.tx_msgs, 160 psock->stats.tx_msgs,
161 psock->stats.tx_bytes, 161 psock->stats.tx_bytes,
162 psock->sk->sk_receive_queue.qlen, 162 psock->sk->sk_receive_queue.qlen,
@@ -170,14 +170,27 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq,
170 if (psock->tx_stopped) 170 if (psock->tx_stopped)
171 seq_puts(seq, "TxStop "); 171 seq_puts(seq, "TxStop ");
172 172
173 if (psock->rx_stopped) 173 if (psock->strp.rx_stopped)
174 seq_puts(seq, "RxStop "); 174 seq_puts(seq, "RxStop ");
175 175
176 if (psock->tx_kcm) 176 if (psock->tx_kcm)
177 seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); 177 seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index);
178 178
179 if (psock->ready_rx_msg) 179 if (!psock->strp.rx_paused && !psock->ready_rx_msg) {
180 seq_puts(seq, "RdyRx "); 180 if (psock->sk->sk_receive_queue.qlen) {
181 if (psock->strp.rx_need_bytes)
182 seq_printf(seq, "RxWait=%u ",
183 psock->strp.rx_need_bytes);
184 else
185 seq_printf(seq, "RxWait ");
186 }
187 } else {
188 if (psock->strp.rx_paused)
189 seq_puts(seq, "RxPause ");
190
191 if (psock->ready_rx_msg)
192 seq_puts(seq, "RdyRx ");
193 }
181 194
182 seq_puts(seq, "\n"); 195 seq_puts(seq, "\n");
183} 196}
@@ -275,6 +288,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
275{ 288{
276 struct kcm_psock_stats psock_stats; 289 struct kcm_psock_stats psock_stats;
277 struct kcm_mux_stats mux_stats; 290 struct kcm_mux_stats mux_stats;
291 struct strp_aggr_stats strp_stats;
278 struct kcm_mux *mux; 292 struct kcm_mux *mux;
279 struct kcm_psock *psock; 293 struct kcm_psock *psock;
280 struct net *net = seq->private; 294 struct net *net = seq->private;
@@ -282,20 +296,28 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
282 296
283 memset(&mux_stats, 0, sizeof(mux_stats)); 297 memset(&mux_stats, 0, sizeof(mux_stats));
284 memset(&psock_stats, 0, sizeof(psock_stats)); 298 memset(&psock_stats, 0, sizeof(psock_stats));
299 memset(&strp_stats, 0, sizeof(strp_stats));
285 300
286 mutex_lock(&knet->mutex); 301 mutex_lock(&knet->mutex);
287 302
288 aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats); 303 aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats);
289 aggregate_psock_stats(&knet->aggregate_psock_stats, 304 aggregate_psock_stats(&knet->aggregate_psock_stats,
290 &psock_stats); 305 &psock_stats);
306 aggregate_strp_stats(&knet->aggregate_strp_stats,
307 &strp_stats);
291 308
292 list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) { 309 list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) {
293 spin_lock_bh(&mux->lock); 310 spin_lock_bh(&mux->lock);
294 aggregate_mux_stats(&mux->stats, &mux_stats); 311 aggregate_mux_stats(&mux->stats, &mux_stats);
295 aggregate_psock_stats(&mux->aggregate_psock_stats, 312 aggregate_psock_stats(&mux->aggregate_psock_stats,
296 &psock_stats); 313 &psock_stats);
297 list_for_each_entry(psock, &mux->psocks, psock_list) 314 aggregate_strp_stats(&mux->aggregate_strp_stats,
315 &strp_stats);
316 list_for_each_entry(psock, &mux->psocks, psock_list) {
298 aggregate_psock_stats(&psock->stats, &psock_stats); 317 aggregate_psock_stats(&psock->stats, &psock_stats);
318 save_strp_stats(&psock->strp, &strp_stats);
319 }
320
299 spin_unlock_bh(&mux->lock); 321 spin_unlock_bh(&mux->lock);
300 } 322 }
301 323
@@ -328,7 +350,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
328 mux_stats.rx_ready_drops); 350 mux_stats.rx_ready_drops);
329 351
330 seq_printf(seq, 352 seq_printf(seq,
331 "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", 353 "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
332 "Psock", 354 "Psock",
333 "RX-Msgs", 355 "RX-Msgs",
334 "RX-Bytes", 356 "RX-Bytes",
@@ -337,6 +359,8 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
337 "Reserved", 359 "Reserved",
338 "Unreserved", 360 "Unreserved",
339 "RX-Aborts", 361 "RX-Aborts",
362 "RX-Intr",
363 "RX-Unrecov",
340 "RX-MemFail", 364 "RX-MemFail",
341 "RX-NeedMor", 365 "RX-NeedMor",
342 "RX-BadLen", 366 "RX-BadLen",
@@ -345,20 +369,22 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
345 "TX-Aborts"); 369 "TX-Aborts");
346 370
347 seq_printf(seq, 371 seq_printf(seq,
348 "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", 372 "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n",
349 "", 373 "",
350 psock_stats.rx_msgs, 374 strp_stats.rx_msgs,
351 psock_stats.rx_bytes, 375 strp_stats.rx_bytes,
352 psock_stats.tx_msgs, 376 psock_stats.tx_msgs,
353 psock_stats.tx_bytes, 377 psock_stats.tx_bytes,
354 psock_stats.reserved, 378 psock_stats.reserved,
355 psock_stats.unreserved, 379 psock_stats.unreserved,
356 psock_stats.rx_aborts, 380 strp_stats.rx_aborts,
357 psock_stats.rx_mem_fail, 381 strp_stats.rx_interrupted,
358 psock_stats.rx_need_more_hdr, 382 strp_stats.rx_unrecov_intr,
359 psock_stats.rx_bad_hdr_len, 383 strp_stats.rx_mem_fail,
360 psock_stats.rx_msg_too_big, 384 strp_stats.rx_need_more_hdr,
361 psock_stats.rx_msg_timeouts, 385 strp_stats.rx_bad_hdr_len,
386 strp_stats.rx_msg_too_big,
387 strp_stats.rx_msg_timeouts,
362 psock_stats.tx_aborts); 388 psock_stats.tx_aborts);
363 389
364 return 0; 390 return 0;
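
The /proc output above now reads the receive counters from the embedded strparser (psock->strp.stats) and folds them into a new strparser aggregate alongside the existing psock aggregate. A trivial sketch of that fold, using a reduced stand-in stats struct rather than the kernel's:

#include <stdio.h>

struct strp_stats { unsigned long long rx_msgs, rx_bytes; };   /* reduced stand-in */

static void aggregate_strp_stats(const struct strp_stats *src, struct strp_stats *agg)
{
        agg->rx_msgs  += src->rx_msgs;
        agg->rx_bytes += src->rx_bytes;
}

int main(void)
{
        struct strp_stats psock1 = { 10, 1500 }, psock2 = { 4, 600 };
        struct strp_stats agg = { 0, 0 };

        aggregate_strp_stats(&psock1, &agg);
        aggregate_strp_stats(&psock2, &agg);
        printf("RX-Msgs %llu RX-Bytes %llu\n", agg.rx_msgs, agg.rx_bytes);
        return 0;
}
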
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 411693288648..7e08a4d3d77d 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1,3 +1,13 @@
1/*
2 * Kernel Connection Multiplexor
3 *
4 * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2
8 * as published by the Free Software Foundation.
9 */
10
1#include <linux/bpf.h> 11#include <linux/bpf.h>
2#include <linux/errno.h> 12#include <linux/errno.h>
3#include <linux/errqueue.h> 13#include <linux/errqueue.h>
@@ -17,7 +27,6 @@
17#include <net/kcm.h> 27#include <net/kcm.h>
18#include <net/netns/generic.h> 28#include <net/netns/generic.h>
19#include <net/sock.h> 29#include <net/sock.h>
20#include <net/tcp.h>
21#include <uapi/linux/kcm.h> 30#include <uapi/linux/kcm.h>
22 31
23unsigned int kcm_net_id; 32unsigned int kcm_net_id;
@@ -36,38 +45,12 @@ static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
36 return (struct kcm_tx_msg *)skb->cb; 45 return (struct kcm_tx_msg *)skb->cb;
37} 46}
38 47
39static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb)
40{
41 return (struct kcm_rx_msg *)((void *)skb->cb +
42 offsetof(struct qdisc_skb_cb, data));
43}
44
45static void report_csk_error(struct sock *csk, int err) 48static void report_csk_error(struct sock *csk, int err)
46{ 49{
47 csk->sk_err = EPIPE; 50 csk->sk_err = EPIPE;
48 csk->sk_error_report(csk); 51 csk->sk_error_report(csk);
49} 52}
50 53
51/* Callback lock held */
52static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
53 struct sk_buff *skb)
54{
55 struct sock *csk = psock->sk;
56
57 /* Unrecoverable error in receive */
58
59 del_timer(&psock->rx_msg_timer);
60
61 if (psock->rx_stopped)
62 return;
63
64 psock->rx_stopped = 1;
65 KCM_STATS_INCR(psock->stats.rx_aborts);
66
67 /* Report an error on the lower socket */
68 report_csk_error(csk, err);
69}
70
71static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, 54static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
72 bool wakeup_kcm) 55 bool wakeup_kcm)
73{ 56{
@@ -110,12 +93,13 @@ static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
110static void kcm_update_rx_mux_stats(struct kcm_mux *mux, 93static void kcm_update_rx_mux_stats(struct kcm_mux *mux,
111 struct kcm_psock *psock) 94 struct kcm_psock *psock)
112{ 95{
113 KCM_STATS_ADD(mux->stats.rx_bytes, 96 STRP_STATS_ADD(mux->stats.rx_bytes,
114 psock->stats.rx_bytes - psock->saved_rx_bytes); 97 psock->strp.stats.rx_bytes -
98 psock->saved_rx_bytes);
115 mux->stats.rx_msgs += 99 mux->stats.rx_msgs +=
116 psock->stats.rx_msgs - psock->saved_rx_msgs; 100 psock->strp.stats.rx_msgs - psock->saved_rx_msgs;
117 psock->saved_rx_msgs = psock->stats.rx_msgs; 101 psock->saved_rx_msgs = psock->strp.stats.rx_msgs;
118 psock->saved_rx_bytes = psock->stats.rx_bytes; 102 psock->saved_rx_bytes = psock->strp.stats.rx_bytes;
119} 103}
120 104
121static void kcm_update_tx_mux_stats(struct kcm_mux *mux, 105static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
@@ -168,11 +152,11 @@ static void kcm_rcv_ready(struct kcm_sock *kcm)
168 */ 152 */
169 list_del(&psock->psock_ready_list); 153 list_del(&psock->psock_ready_list);
170 psock->ready_rx_msg = NULL; 154 psock->ready_rx_msg = NULL;
171
172 /* Commit clearing of ready_rx_msg for queuing work */ 155 /* Commit clearing of ready_rx_msg for queuing work */
173 smp_mb(); 156 smp_mb();
174 157
175 queue_work(kcm_wq, &psock->rx_work); 158 strp_unpause(&psock->strp);
159 strp_check_rcv(&psock->strp);
176 } 160 }
177 161
178 /* Buffer limit is okay now, add to ready list */ 162 /* Buffer limit is okay now, add to ready list */
@@ -286,6 +270,7 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
286 270
287 if (list_empty(&mux->kcm_rx_waiters)) { 271 if (list_empty(&mux->kcm_rx_waiters)) {
288 psock->ready_rx_msg = head; 272 psock->ready_rx_msg = head;
273 strp_pause(&psock->strp);
289 list_add_tail(&psock->psock_ready_list, 274 list_add_tail(&psock->psock_ready_list,
290 &mux->psocks_ready); 275 &mux->psocks_ready);
291 spin_unlock_bh(&mux->rx_lock); 276 spin_unlock_bh(&mux->rx_lock);
@@ -354,346 +339,60 @@ static void unreserve_rx_kcm(struct kcm_psock *psock,
354 spin_unlock_bh(&mux->rx_lock); 339 spin_unlock_bh(&mux->rx_lock);
355} 340}
356 341
357static void kcm_start_rx_timer(struct kcm_psock *psock)
358{
359 if (psock->sk->sk_rcvtimeo)
360 mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo);
361}
362
363/* Macro to invoke filter function. */
364#define KCM_RUN_FILTER(prog, ctx) \
365 (*prog->bpf_func)(ctx, prog->insnsi)
366
367/* Lower socket lock held */
368static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
369 unsigned int orig_offset, size_t orig_len)
370{
371 struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
372 struct kcm_rx_msg *rxm;
373 struct kcm_sock *kcm;
374 struct sk_buff *head, *skb;
375 size_t eaten = 0, cand_len;
376 ssize_t extra;
377 int err;
378 bool cloned_orig = false;
379
380 if (psock->ready_rx_msg)
381 return 0;
382
383 head = psock->rx_skb_head;
384 if (head) {
385 /* Message already in progress */
386
387 rxm = kcm_rx_msg(head);
388 if (unlikely(rxm->early_eaten)) {
389 /* Already some number of bytes on the receive sock
390 * data saved in rx_skb_head, just indicate they
391 * are consumed.
392 */
393 eaten = orig_len <= rxm->early_eaten ?
394 orig_len : rxm->early_eaten;
395 rxm->early_eaten -= eaten;
396
397 return eaten;
398 }
399
400 if (unlikely(orig_offset)) {
401 /* Getting data with a non-zero offset when a message is
402 * in progress is not expected. If it does happen, we
403 * need to clone and pull since we can't deal with
404 * offsets in the skbs for a message expect in the head.
405 */
406 orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
407 if (!orig_skb) {
408 KCM_STATS_INCR(psock->stats.rx_mem_fail);
409 desc->error = -ENOMEM;
410 return 0;
411 }
412 if (!pskb_pull(orig_skb, orig_offset)) {
413 KCM_STATS_INCR(psock->stats.rx_mem_fail);
414 kfree_skb(orig_skb);
415 desc->error = -ENOMEM;
416 return 0;
417 }
418 cloned_orig = true;
419 orig_offset = 0;
420 }
421
422 if (!psock->rx_skb_nextp) {
423 /* We are going to append to the frags_list of head.
424 * Need to unshare the frag_list.
425 */
426 err = skb_unclone(head, GFP_ATOMIC);
427 if (err) {
428 KCM_STATS_INCR(psock->stats.rx_mem_fail);
429 desc->error = err;
430 return 0;
431 }
432
433 if (unlikely(skb_shinfo(head)->frag_list)) {
434 /* We can't append to an sk_buff that already
435 * has a frag_list. We create a new head, point
436 * the frag_list of that to the old head, and
437 * then are able to use the old head->next for
438 * appending to the message.
439 */
440 if (WARN_ON(head->next)) {
441 desc->error = -EINVAL;
442 return 0;
443 }
444
445 skb = alloc_skb(0, GFP_ATOMIC);
446 if (!skb) {
447 KCM_STATS_INCR(psock->stats.rx_mem_fail);
448 desc->error = -ENOMEM;
449 return 0;
450 }
451 skb->len = head->len;
452 skb->data_len = head->len;
453 skb->truesize = head->truesize;
454 *kcm_rx_msg(skb) = *kcm_rx_msg(head);
455 psock->rx_skb_nextp = &head->next;
456 skb_shinfo(skb)->frag_list = head;
457 psock->rx_skb_head = skb;
458 head = skb;
459 } else {
460 psock->rx_skb_nextp =
461 &skb_shinfo(head)->frag_list;
462 }
463 }
464 }
465
466 while (eaten < orig_len) {
467 /* Always clone since we will consume something */
468 skb = skb_clone(orig_skb, GFP_ATOMIC);
469 if (!skb) {
470 KCM_STATS_INCR(psock->stats.rx_mem_fail);
471 desc->error = -ENOMEM;
472 break;
473 }
474
475 cand_len = orig_len - eaten;
476
477 head = psock->rx_skb_head;
478 if (!head) {
479 head = skb;
480 psock->rx_skb_head = head;
481 /* Will set rx_skb_nextp on next packet if needed */
482 psock->rx_skb_nextp = NULL;
483 rxm = kcm_rx_msg(head);
484 memset(rxm, 0, sizeof(*rxm));
485 rxm->offset = orig_offset + eaten;
486 } else {
487 /* Unclone since we may be appending to an skb that we
488 * already share a frag_list with.
489 */
490 err = skb_unclone(skb, GFP_ATOMIC);
491 if (err) {
492 KCM_STATS_INCR(psock->stats.rx_mem_fail);
493 desc->error = err;
494 break;
495 }
496
497 rxm = kcm_rx_msg(head);
498 *psock->rx_skb_nextp = skb;
499 psock->rx_skb_nextp = &skb->next;
500 head->data_len += skb->len;
501 head->len += skb->len;
502 head->truesize += skb->truesize;
503 }
504
505 if (!rxm->full_len) {
506 ssize_t len;
507
508 len = KCM_RUN_FILTER(psock->bpf_prog, head);
509
510 if (!len) {
511 /* Need more header to determine length */
512 if (!rxm->accum_len) {
513 /* Start RX timer for new message */
514 kcm_start_rx_timer(psock);
515 }
516 rxm->accum_len += cand_len;
517 eaten += cand_len;
518 KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
519 WARN_ON(eaten != orig_len);
520 break;
521 } else if (len > psock->sk->sk_rcvbuf) {
522 /* Message length exceeds maximum allowed */
523 KCM_STATS_INCR(psock->stats.rx_msg_too_big);
524 desc->error = -EMSGSIZE;
525 psock->rx_skb_head = NULL;
526 kcm_abort_rx_psock(psock, EMSGSIZE, head);
527 break;
528 } else if (len <= (ssize_t)head->len -
529 skb->len - rxm->offset) {
530 /* Length must be into new skb (and also
531 * greater than zero)
532 */
533 KCM_STATS_INCR(psock->stats.rx_bad_hdr_len);
534 desc->error = -EPROTO;
535 psock->rx_skb_head = NULL;
536 kcm_abort_rx_psock(psock, EPROTO, head);
537 break;
538 }
539
540 rxm->full_len = len;
541 }
542
543 extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;
544
545 if (extra < 0) {
546 /* Message not complete yet. */
547 if (rxm->full_len - rxm->accum_len >
548 tcp_inq(psock->sk)) {
549 /* Don't have the whole messages in the socket
550 * buffer. Set psock->rx_need_bytes to wait for
551 * the rest of the message. Also, set "early
552 * eaten" since we've already buffered the skb
553 * but don't consume yet per tcp_read_sock.
554 */
555
556 if (!rxm->accum_len) {
557 /* Start RX timer for new message */
558 kcm_start_rx_timer(psock);
559 }
560
561 psock->rx_need_bytes = rxm->full_len -
562 rxm->accum_len;
563 rxm->accum_len += cand_len;
564 rxm->early_eaten = cand_len;
565 KCM_STATS_ADD(psock->stats.rx_bytes, cand_len);
566 desc->count = 0; /* Stop reading socket */
567 break;
568 }
569 rxm->accum_len += cand_len;
570 eaten += cand_len;
571 WARN_ON(eaten != orig_len);
572 break;
573 }
574
575 /* Positive extra indicates ore bytes than needed for the
576 * message
577 */
578
579 WARN_ON(extra > cand_len);
580
581 eaten += (cand_len - extra);
582
583 /* Hurray, we have a new message! */
584 del_timer(&psock->rx_msg_timer);
585 psock->rx_skb_head = NULL;
586 KCM_STATS_INCR(psock->stats.rx_msgs);
587
588try_queue:
589 kcm = reserve_rx_kcm(psock, head);
590 if (!kcm) {
591 /* Unable to reserve a KCM, message is held in psock. */
592 break;
593 }
594
595 if (kcm_queue_rcv_skb(&kcm->sk, head)) {
596 /* Should mean socket buffer full */
597 unreserve_rx_kcm(psock, false);
598 goto try_queue;
599 }
600 }
601
602 if (cloned_orig)
603 kfree_skb(orig_skb);
604
605 KCM_STATS_ADD(psock->stats.rx_bytes, eaten);
606
607 return eaten;
608}
609
610/* Called with lock held on lower socket */
611static int psock_tcp_read_sock(struct kcm_psock *psock)
612{
613 read_descriptor_t desc;
614
615 desc.arg.data = psock;
616 desc.error = 0;
617 desc.count = 1; /* give more than one skb per call */
618
619 /* sk should be locked here, so okay to do tcp_read_sock */
620 tcp_read_sock(psock->sk, &desc, kcm_tcp_recv);
621
622 unreserve_rx_kcm(psock, true);
623
624 return desc.error;
625}
626
627/* Lower sock lock held */ 342/* Lower sock lock held */
628static void psock_tcp_data_ready(struct sock *sk) 343static void psock_data_ready(struct sock *sk)
629{ 344{
630 struct kcm_psock *psock; 345 struct kcm_psock *psock;
631 346
632 read_lock_bh(&sk->sk_callback_lock); 347 read_lock_bh(&sk->sk_callback_lock);
633 348
634 psock = (struct kcm_psock *)sk->sk_user_data; 349 psock = (struct kcm_psock *)sk->sk_user_data;
635 if (unlikely(!psock || psock->rx_stopped)) 350 if (likely(psock))
636 goto out; 351 strp_data_ready(&psock->strp);
637
638 if (psock->ready_rx_msg)
639 goto out;
640 352
641 if (psock->rx_need_bytes) {
642 if (tcp_inq(sk) >= psock->rx_need_bytes)
643 psock->rx_need_bytes = 0;
644 else
645 goto out;
646 }
647
648 if (psock_tcp_read_sock(psock) == -ENOMEM)
649 queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
650
651out:
652 read_unlock_bh(&sk->sk_callback_lock); 353 read_unlock_bh(&sk->sk_callback_lock);
653} 354}
654 355
655static void do_psock_rx_work(struct kcm_psock *psock) 356/* Called with lower sock held */
357static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb)
656{ 358{
657 read_descriptor_t rd_desc; 359 struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
658 struct sock *csk = psock->sk; 360 struct kcm_sock *kcm;
659
660 /* We need the read lock to synchronize with psock_tcp_data_ready. We
661 * need the socket lock for calling tcp_read_sock.
662 */
663 lock_sock(csk);
664 read_lock_bh(&csk->sk_callback_lock);
665
666 if (unlikely(csk->sk_user_data != psock))
667 goto out;
668
669 if (unlikely(psock->rx_stopped))
670 goto out;
671
672 if (psock->ready_rx_msg)
673 goto out;
674
675 rd_desc.arg.data = psock;
676 361
677 if (psock_tcp_read_sock(psock) == -ENOMEM) 362try_queue:
678 queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); 363 kcm = reserve_rx_kcm(psock, skb);
364 if (!kcm) {
365 /* Unable to reserve a KCM, message is held in psock and strp
366 * is paused.
367 */
368 return;
369 }
679 370
680out: 371 if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
681 read_unlock_bh(&csk->sk_callback_lock); 372 /* Should mean socket buffer full */
682 release_sock(csk); 373 unreserve_rx_kcm(psock, false);
374 goto try_queue;
375 }
683} 376}
684 377
685static void psock_rx_work(struct work_struct *w) 378static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
686{ 379{
687 do_psock_rx_work(container_of(w, struct kcm_psock, rx_work)); 380 struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
381 struct bpf_prog *prog = psock->bpf_prog;
382
383 return (*prog->bpf_func)(skb, prog->insnsi);
688} 384}
689 385
690static void psock_rx_delayed_work(struct work_struct *w) 386static int kcm_read_sock_done(struct strparser *strp, int err)
691{ 387{
692 do_psock_rx_work(container_of(w, struct kcm_psock, 388 struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
693 rx_delayed_work.work)); 389
390 unreserve_rx_kcm(psock, true);
391
392 return err;
694} 393}
695 394
696static void psock_tcp_state_change(struct sock *sk) 395static void psock_state_change(struct sock *sk)
697{ 396{
698 /* TCP only does a POLLIN for a half close. Do a POLLHUP here 397 /* TCP only does a POLLIN for a half close. Do a POLLHUP here
699 * since application will normally not poll with POLLIN 398 * since application will normally not poll with POLLIN
@@ -703,7 +402,7 @@ static void psock_tcp_state_change(struct sock *sk)
703 report_csk_error(sk, EPIPE); 402 report_csk_error(sk, EPIPE);
704} 403}
705 404
706static void psock_tcp_write_space(struct sock *sk) 405static void psock_write_space(struct sock *sk)
707{ 406{
708 struct kcm_psock *psock; 407 struct kcm_psock *psock;
709 struct kcm_mux *mux; 408 struct kcm_mux *mux;
@@ -714,14 +413,13 @@ static void psock_tcp_write_space(struct sock *sk)
714 psock = (struct kcm_psock *)sk->sk_user_data; 413 psock = (struct kcm_psock *)sk->sk_user_data;
715 if (unlikely(!psock)) 414 if (unlikely(!psock))
716 goto out; 415 goto out;
717
718 mux = psock->mux; 416 mux = psock->mux;
719 417
720 spin_lock_bh(&mux->lock); 418 spin_lock_bh(&mux->lock);
721 419
722 /* Check if the socket is reserved so someone is waiting for sending. */ 420 /* Check if the socket is reserved so someone is waiting for sending. */
723 kcm = psock->tx_kcm; 421 kcm = psock->tx_kcm;
724 if (kcm) 422 if (kcm && !unlikely(kcm->tx_stopped))
725 queue_work(kcm_wq, &kcm->tx_work); 423 queue_work(kcm_wq, &kcm->tx_work);
726 424
727 spin_unlock_bh(&mux->lock); 425 spin_unlock_bh(&mux->lock);
@@ -1412,7 +1110,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
1412 struct kcm_sock *kcm = kcm_sk(sk); 1110 struct kcm_sock *kcm = kcm_sk(sk);
1413 int err = 0; 1111 int err = 0;
1414 long timeo; 1112 long timeo;
1415 struct kcm_rx_msg *rxm; 1113 struct strp_rx_msg *rxm;
1416 int copied = 0; 1114 int copied = 0;
1417 struct sk_buff *skb; 1115 struct sk_buff *skb;
1418 1116
@@ -1426,7 +1124,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
1426 1124
1427 /* Okay, have a message on the receive queue */ 1125 /* Okay, have a message on the receive queue */
1428 1126
1429 rxm = kcm_rx_msg(skb); 1127 rxm = strp_rx_msg(skb);
1430 1128
1431 if (len > rxm->full_len) 1129 if (len > rxm->full_len)
1432 len = rxm->full_len; 1130 len = rxm->full_len;
@@ -1462,19 +1160,6 @@ out:
1462 return copied ? : err; 1160 return copied ? : err;
1463} 1161}
1464 1162
1465static ssize_t kcm_sock_splice(struct sock *sk,
1466 struct pipe_inode_info *pipe,
1467 struct splice_pipe_desc *spd)
1468{
1469 int ret;
1470
1471 release_sock(sk);
1472 ret = splice_to_pipe(pipe, spd);
1473 lock_sock(sk);
1474
1475 return ret;
1476}
1477
1478static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, 1163static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
1479 struct pipe_inode_info *pipe, size_t len, 1164 struct pipe_inode_info *pipe, size_t len,
1480 unsigned int flags) 1165 unsigned int flags)
@@ -1482,7 +1167,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
1482 struct sock *sk = sock->sk; 1167 struct sock *sk = sock->sk;
1483 struct kcm_sock *kcm = kcm_sk(sk); 1168 struct kcm_sock *kcm = kcm_sk(sk);
1484 long timeo; 1169 long timeo;
1485 struct kcm_rx_msg *rxm; 1170 struct strp_rx_msg *rxm;
1486 int err = 0; 1171 int err = 0;
1487 ssize_t copied; 1172 ssize_t copied;
1488 struct sk_buff *skb; 1173 struct sk_buff *skb;
@@ -1499,13 +1184,12 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
1499 1184
1500 /* Okay, have a message on the receive queue */ 1185 /* Okay, have a message on the receive queue */
1501 1186
1502 rxm = kcm_rx_msg(skb); 1187 rxm = strp_rx_msg(skb);
1503 1188
1504 if (len > rxm->full_len) 1189 if (len > rxm->full_len)
1505 len = rxm->full_len; 1190 len = rxm->full_len;
1506 1191
1507 copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags, 1192 copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags);
1508 kcm_sock_splice);
1509 if (copied < 0) { 1193 if (copied < 0) {
1510 err = copied; 1194 err = copied;
1511 goto err_out; 1195 goto err_out;
@@ -1675,15 +1359,6 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
1675 spin_unlock_bh(&mux->rx_lock); 1359 spin_unlock_bh(&mux->rx_lock);
1676} 1360}
1677 1361
1678static void kcm_rx_msg_timeout(unsigned long arg)
1679{
1680 struct kcm_psock *psock = (struct kcm_psock *)arg;
1681
1682 /* Message assembly timed out */
1683 KCM_STATS_INCR(psock->stats.rx_msg_timeouts);
1684 kcm_abort_rx_psock(psock, ETIMEDOUT, NULL);
1685}
1686
1687static int kcm_attach(struct socket *sock, struct socket *csock, 1362static int kcm_attach(struct socket *sock, struct socket *csock,
1688 struct bpf_prog *prog) 1363 struct bpf_prog *prog)
1689{ 1364{
@@ -1693,19 +1368,13 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
1693 struct kcm_psock *psock = NULL, *tpsock; 1368 struct kcm_psock *psock = NULL, *tpsock;
1694 struct list_head *head; 1369 struct list_head *head;
1695 int index = 0; 1370 int index = 0;
1696 1371 struct strp_callbacks cb;
1697 if (csock->ops->family != PF_INET && 1372 int err;
1698 csock->ops->family != PF_INET6)
1699 return -EINVAL;
1700 1373
1701 csk = csock->sk; 1374 csk = csock->sk;
1702 if (!csk) 1375 if (!csk)
1703 return -EINVAL; 1376 return -EINVAL;
1704 1377
1705 /* Only support TCP for now */
1706 if (csk->sk_protocol != IPPROTO_TCP)
1707 return -EINVAL;
1708
1709 psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); 1378 psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
1710 if (!psock) 1379 if (!psock)
1711 return -ENOMEM; 1380 return -ENOMEM;
@@ -1714,11 +1383,16 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
1714 psock->sk = csk; 1383 psock->sk = csk;
1715 psock->bpf_prog = prog; 1384 psock->bpf_prog = prog;
1716 1385
1717 setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout, 1386 cb.rcv_msg = kcm_rcv_strparser;
1718 (unsigned long)psock); 1387 cb.abort_parser = NULL;
1388 cb.parse_msg = kcm_parse_func_strparser;
1389 cb.read_sock_done = kcm_read_sock_done;
1719 1390
1720 INIT_WORK(&psock->rx_work, psock_rx_work); 1391 err = strp_init(&psock->strp, csk, &cb);
1721 INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work); 1392 if (err) {
1393 kmem_cache_free(kcm_psockp, psock);
1394 return err;
1395 }
1722 1396
1723 sock_hold(csk); 1397 sock_hold(csk);
1724 1398
@@ -1727,9 +1401,9 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
1727 psock->save_write_space = csk->sk_write_space; 1401 psock->save_write_space = csk->sk_write_space;
1728 psock->save_state_change = csk->sk_state_change; 1402 psock->save_state_change = csk->sk_state_change;
1729 csk->sk_user_data = psock; 1403 csk->sk_user_data = psock;
1730 csk->sk_data_ready = psock_tcp_data_ready; 1404 csk->sk_data_ready = psock_data_ready;
1731 csk->sk_write_space = psock_tcp_write_space; 1405 csk->sk_write_space = psock_write_space;
1732 csk->sk_state_change = psock_tcp_state_change; 1406 csk->sk_state_change = psock_state_change;
1733 write_unlock_bh(&csk->sk_callback_lock); 1407 write_unlock_bh(&csk->sk_callback_lock);
1734 1408
1735 /* Finished initialization, now add the psock to the MUX. */ 1409 /* Finished initialization, now add the psock to the MUX. */
@@ -1751,7 +1425,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
1751 spin_unlock_bh(&mux->lock); 1425 spin_unlock_bh(&mux->lock);
1752 1426
1753 /* Schedule RX work in case there are already bytes queued */ 1427 /* Schedule RX work in case there are already bytes queued */
1754 queue_work(kcm_wq, &psock->rx_work); 1428 strp_check_rcv(&psock->strp);
1755 1429
1756 return 0; 1430 return 0;
1757} 1431}
@@ -1791,6 +1465,8 @@ static void kcm_unattach(struct kcm_psock *psock)
1791 struct sock *csk = psock->sk; 1465 struct sock *csk = psock->sk;
1792 struct kcm_mux *mux = psock->mux; 1466 struct kcm_mux *mux = psock->mux;
1793 1467
1468 lock_sock(csk);
1469
1794 /* Stop getting callbacks from TCP socket. After this there should 1470 /* Stop getting callbacks from TCP socket. After this there should
1795 * be no way to reserve a kcm for this psock. 1471 * be no way to reserve a kcm for this psock.
1796 */ 1472 */
@@ -1799,7 +1475,7 @@ static void kcm_unattach(struct kcm_psock *psock)
1799 csk->sk_data_ready = psock->save_data_ready; 1475 csk->sk_data_ready = psock->save_data_ready;
1800 csk->sk_write_space = psock->save_write_space; 1476 csk->sk_write_space = psock->save_write_space;
1801 csk->sk_state_change = psock->save_state_change; 1477 csk->sk_state_change = psock->save_state_change;
1802 psock->rx_stopped = 1; 1478 strp_stop(&psock->strp);
1803 1479
1804 if (WARN_ON(psock->rx_kcm)) { 1480 if (WARN_ON(psock->rx_kcm)) {
1805 write_unlock_bh(&csk->sk_callback_lock); 1481 write_unlock_bh(&csk->sk_callback_lock);
@@ -1822,18 +1498,17 @@ static void kcm_unattach(struct kcm_psock *psock)
1822 1498
1823 write_unlock_bh(&csk->sk_callback_lock); 1499 write_unlock_bh(&csk->sk_callback_lock);
1824 1500
1825 del_timer_sync(&psock->rx_msg_timer); 1501 /* Call strp_done without sock lock */
1826 cancel_work_sync(&psock->rx_work); 1502 release_sock(csk);
1827 cancel_delayed_work_sync(&psock->rx_delayed_work); 1503 strp_done(&psock->strp);
1504 lock_sock(csk);
1828 1505
1829 bpf_prog_put(psock->bpf_prog); 1506 bpf_prog_put(psock->bpf_prog);
1830 1507
1831 kfree_skb(psock->rx_skb_head);
1832 psock->rx_skb_head = NULL;
1833
1834 spin_lock_bh(&mux->lock); 1508 spin_lock_bh(&mux->lock);
1835 1509
1836 aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); 1510 aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats);
1511 save_strp_stats(&psock->strp, &mux->aggregate_strp_stats);
1837 1512
1838 KCM_STATS_INCR(mux->stats.psock_unattach); 1513 KCM_STATS_INCR(mux->stats.psock_unattach);
1839 1514
@@ -1876,6 +1551,8 @@ no_reserved:
1876 fput(csk->sk_socket->file); 1551 fput(csk->sk_socket->file);
1877 kmem_cache_free(kcm_psockp, psock); 1552 kmem_cache_free(kcm_psockp, psock);
1878 } 1553 }
1554
1555 release_sock(csk);
1879} 1556}
1880 1557
1881static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) 1558static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
@@ -1916,6 +1593,7 @@ static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
1916 1593
1917 spin_unlock_bh(&mux->lock); 1594 spin_unlock_bh(&mux->lock);
1918 1595
1596 /* Lower socket lock should already be held */
1919 kcm_unattach(psock); 1597 kcm_unattach(psock);
1920 1598
1921 err = 0; 1599 err = 0;
@@ -2073,6 +1751,8 @@ static void release_mux(struct kcm_mux *mux)
2073 aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); 1751 aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats);
2074 aggregate_psock_stats(&mux->aggregate_psock_stats, 1752 aggregate_psock_stats(&mux->aggregate_psock_stats,
2075 &knet->aggregate_psock_stats); 1753 &knet->aggregate_psock_stats);
1754 aggregate_strp_stats(&mux->aggregate_strp_stats,
1755 &knet->aggregate_strp_stats);
2076 list_del_rcu(&mux->kcm_mux_list); 1756 list_del_rcu(&mux->kcm_mux_list);
2077 knet->count--; 1757 knet->count--;
2078 mutex_unlock(&knet->mutex); 1758 mutex_unlock(&knet->mutex);
@@ -2152,6 +1832,13 @@ static int kcm_release(struct socket *sock)
2152 * it will just return. 1832 * it will just return.
2153 */ 1833 */
2154 __skb_queue_purge(&sk->sk_write_queue); 1834 __skb_queue_purge(&sk->sk_write_queue);
1835
1836 /* Set tx_stopped. This is checked when psock is bound to a kcm and we
1837 * get a writespace callback. This prevents further work being queued
1838 * from the callback (unbinding the psock occurs after canceling work.
1839 */
1840 kcm->tx_stopped = 1;
1841
2155 release_sock(sk); 1842 release_sock(sk);
2156 1843
2157 spin_lock_bh(&mux->lock); 1844 spin_lock_bh(&mux->lock);
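
The kcmsock.c conversion above replaces the open-coded tcp_read_sock() receive path with the generic stream parser: kcm_attach() fills a strp_callbacks struct (parse_msg runs the BPF program to get the next message length, rcv_msg queues each complete message on a kcm socket, read_sock_done unreserves the kcm) and calls strp_init(). A self-contained userspace mock of that callback wiring; struct strparser, the driver loop, and the one-byte length framing below are simplified stand-ins, not the kernel API:

#include <stdio.h>

struct strparser;                                     /* simplified stand-ins, not the kernel API */
struct strp_callbacks {
        int  (*parse_msg)(struct strparser *strp, const char *data, int len);
        void (*rcv_msg)(struct strparser *strp, const char *msg, int len);
};
struct strparser { struct strp_callbacks cb; };

/* One-byte length-header framing, playing the role of the BPF parse program */
static int demo_parse(struct strparser *strp, const char *data, int len)
{
        (void)strp;
        return len < 1 ? 0 : 1 + (unsigned char)data[0];   /* 0 means "need more data" */
}

static void demo_rcv(struct strparser *strp, const char *msg, int len)
{
        (void)strp;
        printf("complete message: %.*s\n", len - 1, msg + 1);
}

/* Driver loop standing in for the parser core: ask for the length,
 * deliver each complete message, stop when bytes run short.
 */
static void strp_feed(struct strparser *strp, const char *data, int len)
{
        while (len > 0) {
                int mlen = strp->cb.parse_msg(strp, data, len);

                if (mlen <= 0 || mlen > len)
                        break;                        /* need more bytes first */
                strp->cb.rcv_msg(strp, data, mlen);
                data += mlen;
                len -= mlen;
        }
}

int main(void)
{
        struct strparser strp = { { demo_parse, demo_rcv } };
        const char stream[] = "\x05hello\x05world";   /* two framed messages */

        strp_feed(&strp, stream, (int)sizeof(stream) - 1);
        return 0;
}
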
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 5871537af387..2599af6378e4 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -139,7 +139,7 @@ struct l2tp_session {
139 void (*session_close)(struct l2tp_session *session); 139 void (*session_close)(struct l2tp_session *session);
140 void (*ref)(struct l2tp_session *session); 140 void (*ref)(struct l2tp_session *session);
141 void (*deref)(struct l2tp_session *session); 141 void (*deref)(struct l2tp_session *session);
142#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) 142#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
143 void (*show)(struct seq_file *m, void *priv); 143 void (*show)(struct seq_file *m, void *priv);
144#endif 144#endif
145 uint8_t priv[0]; /* private data */ 145 uint8_t priv[0]; /* private data */
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 57fc5a46ce06..965f7e344cef 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -121,7 +121,7 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
121} 121}
122 122
123 123
124static struct net_device_ops l2tp_eth_netdev_ops = { 124static const struct net_device_ops l2tp_eth_netdev_ops = {
125 .ndo_init = l2tp_eth_dev_init, 125 .ndo_init = l2tp_eth_dev_init,
126 .ndo_uninit = l2tp_eth_dev_uninit, 126 .ndo_uninit = l2tp_eth_dev_uninit,
127 .ndo_start_xmit = l2tp_eth_dev_xmit, 127 .ndo_start_xmit = l2tp_eth_dev_xmit,
@@ -195,7 +195,7 @@ static void l2tp_eth_delete(struct l2tp_session *session)
195 } 195 }
196} 196}
197 197
198#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) 198#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
199static void l2tp_eth_show(struct seq_file *m, void *arg) 199static void l2tp_eth_show(struct seq_file *m, void *arg)
200{ 200{
201 struct l2tp_session *session = arg; 201 struct l2tp_session *session = arg;
@@ -268,7 +268,7 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
268 priv->tunnel_sock = tunnel->sock; 268 priv->tunnel_sock = tunnel->sock;
269 session->recv_skb = l2tp_eth_dev_recv; 269 session->recv_skb = l2tp_eth_dev_recv;
270 session->session_close = l2tp_eth_delete; 270 session->session_close = l2tp_eth_delete;
271#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) 271#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
272 session->show = l2tp_eth_show; 272 session->show = l2tp_eth_show;
273#endif 273#endif
274 274
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 1d02e8d20e56..bf3117771822 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -867,7 +867,7 @@ out:
867 return skb->len; 867 return skb->len;
868} 868}
869 869
870static struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = { 870static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {
871 [L2TP_ATTR_NONE] = { .type = NLA_UNSPEC, }, 871 [L2TP_ATTR_NONE] = { .type = NLA_UNSPEC, },
872 [L2TP_ATTR_PW_TYPE] = { .type = NLA_U16, }, 872 [L2TP_ATTR_PW_TYPE] = { .type = NLA_U16, },
873 [L2TP_ATTR_ENCAP_TYPE] = { .type = NLA_U16, }, 873 [L2TP_ATTR_ENCAP_TYPE] = { .type = NLA_U16, },
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 232cb92033e8..41d47bfda15c 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -177,7 +177,7 @@ static int pppol2tp_recv_payload_hook(struct sk_buff *skb)
177 if (!pskb_may_pull(skb, 2)) 177 if (!pskb_may_pull(skb, 2))
178 return 1; 178 return 1;
179 179
180 if ((skb->data[0] == 0xff) && (skb->data[1] == 0x03)) 180 if ((skb->data[0] == PPP_ALLSTATIONS) && (skb->data[1] == PPP_UI))
181 skb_pull(skb, 2); 181 skb_pull(skb, 2);
182 182
183 return 0; 183 return 0;
@@ -282,7 +282,6 @@ static void pppol2tp_session_sock_put(struct l2tp_session *session)
282static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, 282static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m,
283 size_t total_len) 283 size_t total_len)
284{ 284{
285 static const unsigned char ppph[2] = { 0xff, 0x03 };
286 struct sock *sk = sock->sk; 285 struct sock *sk = sock->sk;
287 struct sk_buff *skb; 286 struct sk_buff *skb;
288 int error; 287 int error;
@@ -312,7 +311,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m,
312 error = -ENOMEM; 311 error = -ENOMEM;
313 skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) + 312 skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) +
314 uhlen + session->hdr_len + 313 uhlen + session->hdr_len +
315 sizeof(ppph) + total_len, 314 2 + total_len, /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */
316 0, GFP_KERNEL); 315 0, GFP_KERNEL);
317 if (!skb) 316 if (!skb)
318 goto error_put_sess_tun; 317 goto error_put_sess_tun;
@@ -325,8 +324,8 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m,
325 skb_reserve(skb, uhlen); 324 skb_reserve(skb, uhlen);
326 325
327 /* Add PPP header */ 326 /* Add PPP header */
328 skb->data[0] = ppph[0]; 327 skb->data[0] = PPP_ALLSTATIONS;
329 skb->data[1] = ppph[1]; 328 skb->data[1] = PPP_UI;
330 skb_put(skb, 2); 329 skb_put(skb, 2);
331 330
332 /* Copy user data into skb */ 331 /* Copy user data into skb */
@@ -369,7 +368,6 @@ error:
369 */ 368 */
370static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) 369static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
371{ 370{
372 static const u8 ppph[2] = { 0xff, 0x03 };
373 struct sock *sk = (struct sock *) chan->private; 371 struct sock *sk = (struct sock *) chan->private;
374 struct sock *sk_tun; 372 struct sock *sk_tun;
375 struct l2tp_session *session; 373 struct l2tp_session *session;
@@ -398,14 +396,14 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
398 sizeof(struct iphdr) + /* IP header */ 396 sizeof(struct iphdr) + /* IP header */
399 uhlen + /* UDP header (if L2TP_ENCAPTYPE_UDP) */ 397 uhlen + /* UDP header (if L2TP_ENCAPTYPE_UDP) */
400 session->hdr_len + /* L2TP header */ 398 session->hdr_len + /* L2TP header */
401 sizeof(ppph); /* PPP header */ 399 2; /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */
402 if (skb_cow_head(skb, headroom)) 400 if (skb_cow_head(skb, headroom))
403 goto abort_put_sess_tun; 401 goto abort_put_sess_tun;
404 402
405 /* Setup PPP header */ 403 /* Setup PPP header */
406 __skb_push(skb, sizeof(ppph)); 404 __skb_push(skb, 2);
407 skb->data[0] = ppph[0]; 405 skb->data[0] = PPP_ALLSTATIONS;
408 skb->data[1] = ppph[1]; 406 skb->data[1] = PPP_UI;
409 407
410 local_bh_disable(); 408 local_bh_disable();
411 l2tp_xmit_skb(session, skb, session->hdr_len); 409 l2tp_xmit_skb(session, skb, session->hdr_len);
@@ -440,7 +438,7 @@ static void pppol2tp_session_close(struct l2tp_session *session)
440 BUG_ON(session->magic != L2TP_SESSION_MAGIC); 438 BUG_ON(session->magic != L2TP_SESSION_MAGIC);
441 439
442 if (sock) { 440 if (sock) {
443 inet_shutdown(sock, 2); 441 inet_shutdown(sock, SEND_SHUTDOWN);
444 /* Don't let the session go away before our socket does */ 442 /* Don't let the session go away before our socket does */
445 l2tp_session_inc_refcount(session); 443 l2tp_session_inc_refcount(session);
446 } 444 }
@@ -554,7 +552,7 @@ out:
554 return error; 552 return error;
555} 553}
556 554
557#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) 555#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
558static void pppol2tp_show(struct seq_file *m, void *arg) 556static void pppol2tp_show(struct seq_file *m, void *arg)
559{ 557{
560 struct l2tp_session *session = arg; 558 struct l2tp_session *session = arg;
@@ -725,7 +723,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
725 723
726 session->recv_skb = pppol2tp_recv; 724 session->recv_skb = pppol2tp_recv;
727 session->session_close = pppol2tp_session_close; 725 session->session_close = pppol2tp_session_close;
728#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) 726#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
729 session->show = pppol2tp_show; 727 session->show = pppol2tp_show;
730#endif 728#endif
731 729
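
The 0xff/0x03 pair that l2tp_ppp.c used to open-code (and duplicate in two local ppph[] arrays) is the HDLC-style PPP address and control field; <linux/ppp_defs.h> already names these bytes PPP_ALLSTATIONS and PPP_UI, so the commit drops the arrays and uses the named constants plus a literal 2-byte headroom reservation. A standalone userspace sketch, building against the exported uapi header:

    /* Userspace illustration of the two PPP framing bytes the driver
     * prepends to each packet. */
    #include <stdio.h>
    #include <linux/ppp_defs.h>     /* PPP_ALLSTATIONS (0xff), PPP_UI (0x03) */

    int main(void)
    {
            unsigned char hdr[2] = { PPP_ALLSTATIONS, PPP_UI };

            printf("PPP address/control header: %02x %02x\n", hdr[0], hdr[1]);
            return 0;
    }
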
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index c4a1c3e84e12..8da86ceca33d 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -100,15 +100,14 @@ u32 l3mdev_fib_table_by_index(struct net *net, int ifindex)
100EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); 100EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
101 101
102/** 102/**
103 * l3mdev_get_rt6_dst - IPv6 route lookup based on flow. Returns 103 * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link
104 * cached route for L3 master device if relevant 104 * local and multicast addresses
105 * to flow
106 * @net: network namespace for device index lookup 105 * @net: network namespace for device index lookup
107 * @fl6: IPv6 flow struct for lookup 106 * @fl6: IPv6 flow struct for lookup
108 */ 107 */
109 108
110struct dst_entry *l3mdev_get_rt6_dst(struct net *net, 109struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
111 struct flowi6 *fl6) 110 struct flowi6 *fl6)
112{ 111{
113 struct dst_entry *dst = NULL; 112 struct dst_entry *dst = NULL;
114 struct net_device *dev; 113 struct net_device *dev;
@@ -121,70 +120,15 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net,
121 dev = netdev_master_upper_dev_get_rcu(dev); 120 dev = netdev_master_upper_dev_get_rcu(dev);
122 121
123 if (dev && netif_is_l3_master(dev) && 122 if (dev && netif_is_l3_master(dev) &&
124 dev->l3mdev_ops->l3mdev_get_rt6_dst) 123 dev->l3mdev_ops->l3mdev_link_scope_lookup)
125 dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6); 124 dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6);
126 125
127 rcu_read_unlock(); 126 rcu_read_unlock();
128 } 127 }
129 128
130 return dst; 129 return dst;
131} 130}
132EXPORT_SYMBOL_GPL(l3mdev_get_rt6_dst); 131EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup);
133
134/**
135 * l3mdev_get_saddr - get source address for a flow based on an interface
136 * enslaved to an L3 master device
137 * @net: network namespace for device index lookup
138 * @ifindex: Interface index
139 * @fl4: IPv4 flow struct
140 */
141
142int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4)
143{
144 struct net_device *dev;
145 int rc = 0;
146
147 if (ifindex) {
148 rcu_read_lock();
149
150 dev = dev_get_by_index_rcu(net, ifindex);
151 if (dev && netif_is_l3_slave(dev))
152 dev = netdev_master_upper_dev_get_rcu(dev);
153
154 if (dev && netif_is_l3_master(dev) &&
155 dev->l3mdev_ops->l3mdev_get_saddr)
156 rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4);
157
158 rcu_read_unlock();
159 }
160
161 return rc;
162}
163EXPORT_SYMBOL_GPL(l3mdev_get_saddr);
164
165int l3mdev_get_saddr6(struct net *net, const struct sock *sk,
166 struct flowi6 *fl6)
167{
168 struct net_device *dev;
169 int rc = 0;
170
171 if (fl6->flowi6_oif) {
172 rcu_read_lock();
173
174 dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
175 if (dev && netif_is_l3_slave(dev))
176 dev = netdev_master_upper_dev_get_rcu(dev);
177
178 if (dev && netif_is_l3_master(dev) &&
179 dev->l3mdev_ops->l3mdev_get_saddr6)
180 rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6);
181
182 rcu_read_unlock();
183 }
184
185 return rc;
186}
187EXPORT_SYMBOL_GPL(l3mdev_get_saddr6);
188 132
189/** 133/**
190 * l3mdev_fib_rule_match - Determine if flowi references an 134 * l3mdev_fib_rule_match - Determine if flowi references an
@@ -222,3 +166,38 @@ out:
222 166
223 return rc; 167 return rc;
224} 168}
169
170void l3mdev_update_flow(struct net *net, struct flowi *fl)
171{
172 struct net_device *dev;
173 int ifindex;
174
175 rcu_read_lock();
176
177 if (fl->flowi_oif) {
178 dev = dev_get_by_index_rcu(net, fl->flowi_oif);
179 if (dev) {
180 ifindex = l3mdev_master_ifindex_rcu(dev);
181 if (ifindex) {
182 fl->flowi_oif = ifindex;
183 fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF;
184 goto out;
185 }
186 }
187 }
188
189 if (fl->flowi_iif) {
190 dev = dev_get_by_index_rcu(net, fl->flowi_iif);
191 if (dev) {
192 ifindex = l3mdev_master_ifindex_rcu(dev);
193 if (ifindex) {
194 fl->flowi_iif = ifindex;
195 fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF;
196 }
197 }
198 }
199
200out:
201 rcu_read_unlock();
202}
203EXPORT_SYMBOL_GPL(l3mdev_update_flow);
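
In l3mdev.c the per-family l3mdev_get_saddr()/l3mdev_get_saddr6() hooks are removed and the IPv6 route hook is renamed to the narrower l3mdev_link_scope_lookup(); in their place the new l3mdev_update_flow() rewrites the flow's oif/iif to the L3 master (VRF) device and sets FLOWI_FLAG_SKIP_NH_OIF so the FIB lookup is steered by the master's table. A hedged sketch of how a caller might use it ahead of an IPv4 route lookup; this caller is illustrative and not part of the commit.

    /* Illustrative caller, assuming the usual flowi4 setup has been done */
    static struct rtable *demo_output_route(struct net *net, struct flowi4 *fl4)
    {
            /* if fl4->flowi4_oif names an l3mdev slave, redirect the flow
             * to the master device and skip nexthop oif checks */
            l3mdev_update_flow(net, flowi4_to_flowi(fl4));

            return ip_route_output_key(net, fl4);
    }
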
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 8ae3ed97d95c..db916cf51ffe 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -38,7 +38,7 @@ static u16 llc_ui_sap_link_no_max[256];
38static struct sockaddr_llc llc_ui_addrnull; 38static struct sockaddr_llc llc_ui_addrnull;
39static const struct proto_ops llc_ui_ops; 39static const struct proto_ops llc_ui_ops;
40 40
41static long llc_ui_wait_for_conn(struct sock *sk, long timeout); 41static bool llc_ui_wait_for_conn(struct sock *sk, long timeout);
42static int llc_ui_wait_for_disc(struct sock *sk, long timeout); 42static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
43static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); 43static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout);
44 44
@@ -551,7 +551,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
551 return rc; 551 return rc;
552} 552}
553 553
554static long llc_ui_wait_for_conn(struct sock *sk, long timeout) 554static bool llc_ui_wait_for_conn(struct sock *sk, long timeout)
555{ 555{
556 DEFINE_WAIT(wait); 556 DEFINE_WAIT(wait);
557 557
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index afa94687d5e1..f6749dced021 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -304,10 +304,13 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
304 buf_size = IEEE80211_MAX_AMPDU_BUF; 304 buf_size = IEEE80211_MAX_AMPDU_BUF;
305 305
306 /* make sure the size doesn't exceed the maximum supported by the hw */ 306 /* make sure the size doesn't exceed the maximum supported by the hw */
307 if (buf_size > local->hw.max_rx_aggregation_subframes) 307 if (buf_size > sta->sta.max_rx_aggregation_subframes)
308 buf_size = local->hw.max_rx_aggregation_subframes; 308 buf_size = sta->sta.max_rx_aggregation_subframes;
309 params.buf_size = buf_size; 309 params.buf_size = buf_size;
310 310
311 ht_dbg(sta->sdata, "AddBA Req buf_size=%d for %pM\n",
312 buf_size, sta->sta.addr);
313
311 /* examine state machine */ 314 /* examine state machine */
312 mutex_lock(&sta->ampdu_mlme.mtx); 315 mutex_lock(&sta->ampdu_mlme.mtx);
313 316
@@ -412,8 +415,10 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
412 } 415 }
413 416
414end: 417end:
415 if (status == WLAN_STATUS_SUCCESS) 418 if (status == WLAN_STATUS_SUCCESS) {
416 __set_bit(tid, sta->ampdu_mlme.agg_session_valid); 419 __set_bit(tid, sta->ampdu_mlme.agg_session_valid);
420 __clear_bit(tid, sta->ampdu_mlme.unexpected_agg);
421 }
417 mutex_unlock(&sta->ampdu_mlme.mtx); 422 mutex_unlock(&sta->ampdu_mlme.mtx);
418 423
419end_no_lock: 424end_no_lock:
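
The agg-rx.c hunk clamps the RX aggregation reorder buffer against the per-station limit (sta->sta.max_rx_aggregation_subframes) instead of the hardware-wide one, and logs the negotiated AddBA buffer size via ht_dbg(). The clamp could equally be written with the kernel's min_t() helper; shown only as an equivalent form, the commit keeps the explicit if():

    buf_size = min_t(u16, buf_size, sta->sta.max_rx_aggregation_subframes);
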
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 543b1d4fc33d..fd6541f3ade3 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> 4 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
5 * Copyright 2013-2015 Intel Mobile Communications GmbH 5 * Copyright 2013-2015 Intel Mobile Communications GmbH
6 * Copyright (C) 2015-2016 Intel Deutschland GmbH
6 * 7 *
7 * This file is GPLv2 as found in COPYING. 8 * This file is GPLv2 as found in COPYING.
8 */ 9 */
@@ -39,7 +40,7 @@ static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy,
39 40
40 if (type == NL80211_IFTYPE_MONITOR && flags) { 41 if (type == NL80211_IFTYPE_MONITOR && flags) {
41 sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); 42 sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
42 sdata->u.mntr_flags = *flags; 43 sdata->u.mntr.flags = *flags;
43 } 44 }
44 45
45 return wdev; 46 return wdev;
@@ -73,8 +74,29 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
73 sdata->u.mgd.use_4addr = params->use_4addr; 74 sdata->u.mgd.use_4addr = params->use_4addr;
74 } 75 }
75 76
76 if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) { 77 if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
77 struct ieee80211_local *local = sdata->local; 78 struct ieee80211_local *local = sdata->local;
79 struct ieee80211_sub_if_data *monitor_sdata;
80 u32 mu_mntr_cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER;
81
82 monitor_sdata = rtnl_dereference(local->monitor_sdata);
83 if (monitor_sdata &&
84 wiphy_ext_feature_isset(wiphy, mu_mntr_cap_flag)) {
85 memcpy(monitor_sdata->vif.bss_conf.mu_group.membership,
86 params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN);
87 memcpy(monitor_sdata->vif.bss_conf.mu_group.position,
88 params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN,
89 WLAN_USER_POSITION_LEN);
90 monitor_sdata->vif.mu_mimo_owner = true;
91 ieee80211_bss_info_change_notify(monitor_sdata,
92 BSS_CHANGED_MU_GROUPS);
93
94 ether_addr_copy(monitor_sdata->u.mntr.mu_follow_addr,
95 params->macaddr);
96 }
97
98 if (!flags)
99 return 0;
78 100
79 if (ieee80211_sdata_running(sdata)) { 101 if (ieee80211_sdata_running(sdata)) {
80 u32 mask = MONITOR_FLAG_COOK_FRAMES | 102 u32 mask = MONITOR_FLAG_COOK_FRAMES |
@@ -89,11 +111,11 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
89 * cooked_mntrs, monitor and all fif_* counters 111 * cooked_mntrs, monitor and all fif_* counters
90 * reconfigure hardware 112 * reconfigure hardware
91 */ 113 */
92 if ((*flags & mask) != (sdata->u.mntr_flags & mask)) 114 if ((*flags & mask) != (sdata->u.mntr.flags & mask))
93 return -EBUSY; 115 return -EBUSY;
94 116
95 ieee80211_adjust_monitor_flags(sdata, -1); 117 ieee80211_adjust_monitor_flags(sdata, -1);
96 sdata->u.mntr_flags = *flags; 118 sdata->u.mntr.flags = *flags;
97 ieee80211_adjust_monitor_flags(sdata, 1); 119 ieee80211_adjust_monitor_flags(sdata, 1);
98 120
99 ieee80211_configure_filter(local); 121 ieee80211_configure_filter(local);
@@ -103,7 +125,7 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
103 * and ieee80211_do_open take care of "everything" 125 * and ieee80211_do_open take care of "everything"
104 * mentioned in the comment above. 126 * mentioned in the comment above.
105 */ 127 */
106 sdata->u.mntr_flags = *flags; 128 sdata->u.mntr.flags = *flags;
107 } 129 }
108 } 130 }
109 131
@@ -131,6 +153,149 @@ static void ieee80211_stop_p2p_device(struct wiphy *wiphy,
131 ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev)); 153 ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev));
132} 154}
133 155
156static int ieee80211_start_nan(struct wiphy *wiphy,
157 struct wireless_dev *wdev,
158 struct cfg80211_nan_conf *conf)
159{
160 struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
161 int ret;
162
163 mutex_lock(&sdata->local->chanctx_mtx);
164 ret = ieee80211_check_combinations(sdata, NULL, 0, 0);
165 mutex_unlock(&sdata->local->chanctx_mtx);
166 if (ret < 0)
167 return ret;
168
169 ret = ieee80211_do_open(wdev, true);
170 if (ret)
171 return ret;
172
173 ret = drv_start_nan(sdata->local, sdata, conf);
174 if (ret)
175 ieee80211_sdata_stop(sdata);
176
177 sdata->u.nan.conf = *conf;
178
179 return ret;
180}
181
182static void ieee80211_stop_nan(struct wiphy *wiphy,
183 struct wireless_dev *wdev)
184{
185 struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
186
187 drv_stop_nan(sdata->local, sdata);
188 ieee80211_sdata_stop(sdata);
189}
190
191static int ieee80211_nan_change_conf(struct wiphy *wiphy,
192 struct wireless_dev *wdev,
193 struct cfg80211_nan_conf *conf,
194 u32 changes)
195{
196 struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
197 struct cfg80211_nan_conf new_conf;
198 int ret = 0;
199
200 if (sdata->vif.type != NL80211_IFTYPE_NAN)
201 return -EOPNOTSUPP;
202
203 if (!ieee80211_sdata_running(sdata))
204 return -ENETDOWN;
205
206 new_conf = sdata->u.nan.conf;
207
208 if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
209 new_conf.master_pref = conf->master_pref;
210
211 if (changes & CFG80211_NAN_CONF_CHANGED_DUAL)
212 new_conf.dual = conf->dual;
213
214 ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes);
215 if (!ret)
216 sdata->u.nan.conf = new_conf;
217
218 return ret;
219}
220
221static int ieee80211_add_nan_func(struct wiphy *wiphy,
222 struct wireless_dev *wdev,
223 struct cfg80211_nan_func *nan_func)
224{
225 struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
226 int ret;
227
228 if (sdata->vif.type != NL80211_IFTYPE_NAN)
229 return -EOPNOTSUPP;
230
231 if (!ieee80211_sdata_running(sdata))
232 return -ENETDOWN;
233
234 spin_lock_bh(&sdata->u.nan.func_lock);
235
236 ret = idr_alloc(&sdata->u.nan.function_inst_ids,
237 nan_func, 1, sdata->local->hw.max_nan_de_entries + 1,
238 GFP_ATOMIC);
239 spin_unlock_bh(&sdata->u.nan.func_lock);
240
241 if (ret < 0)
242 return ret;
243
244 nan_func->instance_id = ret;
245
246 WARN_ON(nan_func->instance_id == 0);
247
248 ret = drv_add_nan_func(sdata->local, sdata, nan_func);
249 if (ret) {
250 spin_lock_bh(&sdata->u.nan.func_lock);
251 idr_remove(&sdata->u.nan.function_inst_ids,
252 nan_func->instance_id);
253 spin_unlock_bh(&sdata->u.nan.func_lock);
254 }
255
256 return ret;
257}
258
259static struct cfg80211_nan_func *
260ieee80211_find_nan_func_by_cookie(struct ieee80211_sub_if_data *sdata,
261 u64 cookie)
262{
263 struct cfg80211_nan_func *func;
264 int id;
265
266 lockdep_assert_held(&sdata->u.nan.func_lock);
267
268 idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) {
269 if (func->cookie == cookie)
270 return func;
271 }
272
273 return NULL;
274}
275
276static void ieee80211_del_nan_func(struct wiphy *wiphy,
277 struct wireless_dev *wdev, u64 cookie)
278{
279 struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
280 struct cfg80211_nan_func *func;
281 u8 instance_id = 0;
282
283 if (sdata->vif.type != NL80211_IFTYPE_NAN ||
284 !ieee80211_sdata_running(sdata))
285 return;
286
287 spin_lock_bh(&sdata->u.nan.func_lock);
288
289 func = ieee80211_find_nan_func_by_cookie(sdata, cookie);
290 if (func)
291 instance_id = func->instance_id;
292
293 spin_unlock_bh(&sdata->u.nan.func_lock);
294
295 if (instance_id)
296 drv_del_nan_func(sdata->local, sdata, instance_id);
297}
298
134static int ieee80211_set_noack_map(struct wiphy *wiphy, 299static int ieee80211_set_noack_map(struct wiphy *wiphy,
135 struct net_device *dev, 300 struct net_device *dev,
136 u16 noack_map) 301 u16 noack_map)
@@ -236,6 +401,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
236 case NL80211_IFTYPE_WDS: 401 case NL80211_IFTYPE_WDS:
237 case NL80211_IFTYPE_MONITOR: 402 case NL80211_IFTYPE_MONITOR:
238 case NL80211_IFTYPE_P2P_DEVICE: 403 case NL80211_IFTYPE_P2P_DEVICE:
404 case NL80211_IFTYPE_NAN:
239 case NL80211_IFTYPE_UNSPECIFIED: 405 case NL80211_IFTYPE_UNSPECIFIED:
240 case NUM_NL80211_IFTYPES: 406 case NUM_NL80211_IFTYPES:
241 case NL80211_IFTYPE_P2P_CLIENT: 407 case NL80211_IFTYPE_P2P_CLIENT:
@@ -2015,6 +2181,7 @@ static int ieee80211_scan(struct wiphy *wiphy,
2015 !(req->flags & NL80211_SCAN_FLAG_AP))) 2181 !(req->flags & NL80211_SCAN_FLAG_AP)))
2016 return -EOPNOTSUPP; 2182 return -EOPNOTSUPP;
2017 break; 2183 break;
2184 case NL80211_IFTYPE_NAN:
2018 default: 2185 default:
2019 return -EOPNOTSUPP; 2186 return -EOPNOTSUPP;
2020 } 2187 }
@@ -2940,10 +3107,6 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
2940 } 3107 }
2941 3108
2942 chanctx = container_of(conf, struct ieee80211_chanctx, conf); 3109 chanctx = container_of(conf, struct ieee80211_chanctx, conf);
2943 if (!chanctx) {
2944 err = -EBUSY;
2945 goto out;
2946 }
2947 3110
2948 ch_switch.timestamp = 0; 3111 ch_switch.timestamp = 0;
2949 ch_switch.device_timestamp = 0; 3112 ch_switch.device_timestamp = 0;
@@ -3360,6 +3523,63 @@ static int ieee80211_del_tx_ts(struct wiphy *wiphy, struct net_device *dev,
3360 return -ENOENT; 3523 return -ENOENT;
3361} 3524}
3362 3525
3526void ieee80211_nan_func_terminated(struct ieee80211_vif *vif,
3527 u8 inst_id,
3528 enum nl80211_nan_func_term_reason reason,
3529 gfp_t gfp)
3530{
3531 struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
3532 struct cfg80211_nan_func *func;
3533 u64 cookie;
3534
3535 if (WARN_ON(vif->type != NL80211_IFTYPE_NAN))
3536 return;
3537
3538 spin_lock_bh(&sdata->u.nan.func_lock);
3539
3540 func = idr_find(&sdata->u.nan.function_inst_ids, inst_id);
3541 if (WARN_ON(!func)) {
3542 spin_unlock_bh(&sdata->u.nan.func_lock);
3543 return;
3544 }
3545
3546 cookie = func->cookie;
3547 idr_remove(&sdata->u.nan.function_inst_ids, inst_id);
3548
3549 spin_unlock_bh(&sdata->u.nan.func_lock);
3550
3551 cfg80211_free_nan_func(func);
3552
3553 cfg80211_nan_func_terminated(ieee80211_vif_to_wdev(vif), inst_id,
3554 reason, cookie, gfp);
3555}
3556EXPORT_SYMBOL(ieee80211_nan_func_terminated);
3557
3558void ieee80211_nan_func_match(struct ieee80211_vif *vif,
3559 struct cfg80211_nan_match_params *match,
3560 gfp_t gfp)
3561{
3562 struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
3563 struct cfg80211_nan_func *func;
3564
3565 if (WARN_ON(vif->type != NL80211_IFTYPE_NAN))
3566 return;
3567
3568 spin_lock_bh(&sdata->u.nan.func_lock);
3569
3570 func = idr_find(&sdata->u.nan.function_inst_ids, match->inst_id);
3571 if (WARN_ON(!func)) {
3572 spin_unlock_bh(&sdata->u.nan.func_lock);
3573 return;
3574 }
3575 match->cookie = func->cookie;
3576
3577 spin_unlock_bh(&sdata->u.nan.func_lock);
3578
3579 cfg80211_nan_match(ieee80211_vif_to_wdev(vif), match, gfp);
3580}
3581EXPORT_SYMBOL(ieee80211_nan_func_match);
3582
3363const struct cfg80211_ops mac80211_config_ops = { 3583const struct cfg80211_ops mac80211_config_ops = {
3364 .add_virtual_intf = ieee80211_add_iface, 3584 .add_virtual_intf = ieee80211_add_iface,
3365 .del_virtual_intf = ieee80211_del_iface, 3585 .del_virtual_intf = ieee80211_del_iface,
@@ -3445,4 +3665,9 @@ const struct cfg80211_ops mac80211_config_ops = {
3445 .set_ap_chanwidth = ieee80211_set_ap_chanwidth, 3665 .set_ap_chanwidth = ieee80211_set_ap_chanwidth,
3446 .add_tx_ts = ieee80211_add_tx_ts, 3666 .add_tx_ts = ieee80211_add_tx_ts,
3447 .del_tx_ts = ieee80211_del_tx_ts, 3667 .del_tx_ts = ieee80211_del_tx_ts,
3668 .start_nan = ieee80211_start_nan,
3669 .stop_nan = ieee80211_stop_nan,
3670 .nan_change_conf = ieee80211_nan_change_conf,
3671 .add_nan_func = ieee80211_add_nan_func,
3672 .del_nan_func = ieee80211_del_nan_func,
3448}; 3673};
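
The new NAN (Neighbor Awareness Networking) cfg80211 ops in cfg.c track function instances through an IDR keyed by instance id: idr_alloc() returns the allocated id (constrained here to [1, max_nan_de_entries]) or a negative errno, which is why the code assigns the return value to nan_func->instance_id and unwinds with idr_remove() if the driver rejects the function. A compressed sketch of that allocate-then-rollback pattern, with illustrative "demo_*" names that are not part of the commit:

    static int demo_add_func(struct demo_state *st, struct demo_func *func)
    {
            int id;

            spin_lock_bh(&st->lock);
            /* ids start at 1 so that 0 can keep meaning "not allocated" */
            id = idr_alloc(&st->ids, func, 1, st->max_ids + 1, GFP_ATOMIC);
            spin_unlock_bh(&st->lock);
            if (id < 0)
                    return id;

            func->instance_id = id;

            if (demo_driver_add(func)) {            /* driver refused: roll back */
                    spin_lock_bh(&st->lock);
                    idr_remove(&st->ids, id);
                    spin_unlock_bh(&st->lock);
                    return -EIO;
            }
            return 0;
    }
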
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 74142d07ad31..e75cbf6ecc26 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -274,6 +274,7 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
274 ieee80211_get_max_required_bw(sdata)); 274 ieee80211_get_max_required_bw(sdata));
275 break; 275 break;
276 case NL80211_IFTYPE_P2P_DEVICE: 276 case NL80211_IFTYPE_P2P_DEVICE:
277 case NL80211_IFTYPE_NAN:
277 continue; 278 continue;
278 case NL80211_IFTYPE_ADHOC: 279 case NL80211_IFTYPE_ADHOC:
279 case NL80211_IFTYPE_WDS: 280 case NL80211_IFTYPE_WDS:
@@ -646,6 +647,9 @@ static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata,
646 struct ieee80211_chanctx *curr_ctx = NULL; 647 struct ieee80211_chanctx *curr_ctx = NULL;
647 int ret = 0; 648 int ret = 0;
648 649
650 if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN))
651 return -ENOTSUPP;
652
649 conf = rcu_dereference_protected(sdata->vif.chanctx_conf, 653 conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
650 lockdep_is_held(&local->chanctx_mtx)); 654 lockdep_is_held(&local->chanctx_mtx));
651 655
@@ -718,6 +722,7 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
718 722
719 switch (sdata->vif.type) { 723 switch (sdata->vif.type) {
720 case NL80211_IFTYPE_P2P_DEVICE: 724 case NL80211_IFTYPE_P2P_DEVICE:
725 case NL80211_IFTYPE_NAN:
721 continue; 726 continue;
722 case NL80211_IFTYPE_STATION: 727 case NL80211_IFTYPE_STATION:
723 if (!sdata->u.mgd.associated) 728 if (!sdata->u.mgd.associated)
@@ -980,6 +985,7 @@ ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata)
980 case NL80211_IFTYPE_P2P_CLIENT: 985 case NL80211_IFTYPE_P2P_CLIENT:
981 case NL80211_IFTYPE_P2P_GO: 986 case NL80211_IFTYPE_P2P_GO:
982 case NL80211_IFTYPE_P2P_DEVICE: 987 case NL80211_IFTYPE_P2P_DEVICE:
988 case NL80211_IFTYPE_NAN:
983 case NUM_NL80211_IFTYPES: 989 case NUM_NL80211_IFTYPES:
984 WARN_ON(1); 990 WARN_ON(1);
985 break; 991 break;
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 2906c1004e1a..f56e2f487d09 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -71,138 +71,45 @@ DEBUGFS_READONLY_FILE(wep_iv, "%#08x",
71DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s", 71DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s",
72 local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver"); 72 local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver");
73 73
74struct aqm_info { 74static ssize_t aqm_read(struct file *file,
75 struct ieee80211_local *local; 75 char __user *user_buf,
76 size_t size; 76 size_t count,
77 size_t len; 77 loff_t *ppos)
78 unsigned char buf[0];
79};
80
81#define AQM_HDR_LEN 200
82#define AQM_HW_ENTRY_LEN 40
83#define AQM_TXQ_ENTRY_LEN 110
84
85static int aqm_open(struct inode *inode, struct file *file)
86{ 78{
87 struct ieee80211_local *local = inode->i_private; 79 struct ieee80211_local *local = file->private_data;
88 struct ieee80211_sub_if_data *sdata;
89 struct sta_info *sta;
90 struct txq_info *txqi;
91 struct fq *fq = &local->fq; 80 struct fq *fq = &local->fq;
92 struct aqm_info *info = NULL; 81 char buf[200];
93 int len = 0; 82 int len = 0;
94 int i;
95
96 if (!local->ops->wake_tx_queue)
97 return -EOPNOTSUPP;
98
99 len += AQM_HDR_LEN;
100 len += 6 * AQM_HW_ENTRY_LEN;
101
102 rcu_read_lock();
103 list_for_each_entry_rcu(sdata, &local->interfaces, list)
104 len += AQM_TXQ_ENTRY_LEN;
105 list_for_each_entry_rcu(sta, &local->sta_list, list)
106 len += AQM_TXQ_ENTRY_LEN * ARRAY_SIZE(sta->sta.txq);
107 rcu_read_unlock();
108
109 info = vmalloc(len);
110 if (!info)
111 return -ENOMEM;
112 83
113 spin_lock_bh(&local->fq.lock); 84 spin_lock_bh(&local->fq.lock);
114 rcu_read_lock(); 85 rcu_read_lock();
115 86
116 file->private_data = info; 87 len = scnprintf(buf, sizeof(buf),
117 info->local = local; 88 "access name value\n"
118 info->size = len; 89 "R fq_flows_cnt %u\n"
119 len = 0; 90 "R fq_backlog %u\n"
120 91 "R fq_overlimit %u\n"
121 len += scnprintf(info->buf + len, info->size - len, 92 "R fq_overmemory %u\n"
122 "* hw\n" 93 "R fq_collisions %u\n"
123 "access name value\n" 94 "R fq_memory_usage %u\n"
124 "R fq_flows_cnt %u\n" 95 "RW fq_memory_limit %u\n"
125 "R fq_backlog %u\n" 96 "RW fq_limit %u\n"
126 "R fq_overlimit %u\n" 97 "RW fq_quantum %u\n",
127 "R fq_collisions %u\n" 98 fq->flows_cnt,
128 "RW fq_limit %u\n" 99 fq->backlog,
129 "RW fq_quantum %u\n", 100 fq->overmemory,
130 fq->flows_cnt, 101 fq->overlimit,
131 fq->backlog, 102 fq->collisions,
132 fq->overlimit, 103 fq->memory_usage,
133 fq->collisions, 104 fq->memory_limit,
134 fq->limit, 105 fq->limit,
135 fq->quantum); 106 fq->quantum);
136
137 len += scnprintf(info->buf + len,
138 info->size - len,
139 "* vif\n"
140 "ifname addr ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n");
141
142 list_for_each_entry_rcu(sdata, &local->interfaces, list) {
143 txqi = to_txq_info(sdata->vif.txq);
144 len += scnprintf(info->buf + len, info->size - len,
145 "%s %pM %u %u %u %u %u %u %u %u\n",
146 sdata->name,
147 sdata->vif.addr,
148 txqi->txq.ac,
149 txqi->tin.backlog_bytes,
150 txqi->tin.backlog_packets,
151 txqi->tin.flows,
152 txqi->tin.overlimit,
153 txqi->tin.collisions,
154 txqi->tin.tx_bytes,
155 txqi->tin.tx_packets);
156 }
157
158 len += scnprintf(info->buf + len,
159 info->size - len,
160 "* sta\n"
161 "ifname addr tid ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n");
162
163 list_for_each_entry_rcu(sta, &local->sta_list, list) {
164 sdata = sta->sdata;
165 for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
166 txqi = to_txq_info(sta->sta.txq[i]);
167 len += scnprintf(info->buf + len, info->size - len,
168 "%s %pM %d %d %u %u %u %u %u %u %u\n",
169 sdata->name,
170 sta->sta.addr,
171 txqi->txq.tid,
172 txqi->txq.ac,
173 txqi->tin.backlog_bytes,
174 txqi->tin.backlog_packets,
175 txqi->tin.flows,
176 txqi->tin.overlimit,
177 txqi->tin.collisions,
178 txqi->tin.tx_bytes,
179 txqi->tin.tx_packets);
180 }
181 }
182
183 info->len = len;
184 107
185 rcu_read_unlock(); 108 rcu_read_unlock();
186 spin_unlock_bh(&local->fq.lock); 109 spin_unlock_bh(&local->fq.lock);
187 110
188 return 0;
189}
190
191static int aqm_release(struct inode *inode, struct file *file)
192{
193 vfree(file->private_data);
194 return 0;
195}
196
197static ssize_t aqm_read(struct file *file,
198 char __user *user_buf,
199 size_t count,
200 loff_t *ppos)
201{
202 struct aqm_info *info = file->private_data;
203
204 return simple_read_from_buffer(user_buf, count, ppos, 111 return simple_read_from_buffer(user_buf, count, ppos,
205 info->buf, info->len); 112 buf, len);
206} 113}
207 114
208static ssize_t aqm_write(struct file *file, 115static ssize_t aqm_write(struct file *file,
@@ -210,8 +117,7 @@ static ssize_t aqm_write(struct file *file,
210 size_t count, 117 size_t count,
211 loff_t *ppos) 118 loff_t *ppos)
212{ 119{
213 struct aqm_info *info = file->private_data; 120 struct ieee80211_local *local = file->private_data;
214 struct ieee80211_local *local = info->local;
215 char buf[100]; 121 char buf[100];
216 size_t len; 122 size_t len;
217 123
@@ -228,6 +134,8 @@ static ssize_t aqm_write(struct file *file,
228 134
229 if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) 135 if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1)
230 return count; 136 return count;
137 else if (sscanf(buf, "fq_memory_limit %u", &local->fq.memory_limit) == 1)
138 return count;
231 else if (sscanf(buf, "fq_quantum %u", &local->fq.quantum) == 1) 139 else if (sscanf(buf, "fq_quantum %u", &local->fq.quantum) == 1)
232 return count; 140 return count;
233 141
@@ -237,8 +145,7 @@ static ssize_t aqm_write(struct file *file,
237static const struct file_operations aqm_ops = { 145static const struct file_operations aqm_ops = {
238 .write = aqm_write, 146 .write = aqm_write,
239 .read = aqm_read, 147 .read = aqm_read,
240 .open = aqm_open, 148 .open = simple_open,
241 .release = aqm_release,
242 .llseek = default_llseek, 149 .llseek = default_llseek,
243}; 150};
244 151
@@ -302,6 +209,7 @@ static const char *hw_flag_names[] = {
302 FLAG(USES_RSS), 209 FLAG(USES_RSS),
303 FLAG(TX_AMSDU), 210 FLAG(TX_AMSDU),
304 FLAG(TX_FRAG_LIST), 211 FLAG(TX_FRAG_LIST),
212 FLAG(REPORTS_LOW_ACK),
305#undef FLAG 213#undef FLAG
306}; 214};
307 215
@@ -428,7 +336,9 @@ void debugfs_hw_add(struct ieee80211_local *local)
428 DEBUGFS_ADD(hwflags); 336 DEBUGFS_ADD(hwflags);
429 DEBUGFS_ADD(user_power); 337 DEBUGFS_ADD(user_power);
430 DEBUGFS_ADD(power); 338 DEBUGFS_ADD(power);
431 DEBUGFS_ADD_MODE(aqm, 0600); 339
340 if (local->ops->wake_tx_queue)
341 DEBUGFS_ADD_MODE(aqm, 0600);
432 342
433 statsd = debugfs_create_dir("statistics", phyd); 343 statsd = debugfs_create_dir("statistics", phyd);
434 344
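
The aqm debugfs file in debugfs.c loses its vmalloc'ed snapshot (the per-interface and per-station rows move to the new files added further down in debugfs_netdev.c and debugfs_sta.c); what remains is the global fq state, which fits in a 200-byte stack buffer, so open() collapses to simple_open() and read() becomes scnprintf() plus simple_read_from_buffer(). The generic shape of that pattern, with illustrative names:

    /* Sketch of the simplified debugfs read path.  simple_open() stores the
     * inode's i_private pointer in file->private_data; scnprintf() returns
     * the number of bytes actually written into the bounded buffer. */
    static ssize_t demo_read(struct file *file, char __user *user_buf,
                             size_t count, loff_t *ppos)
    {
            struct demo_state *st = file->private_data;
            char buf[64];
            int len;

            len = scnprintf(buf, sizeof(buf), "value %u\n", st->value);
            return simple_read_from_buffer(user_buf, count, ppos, buf, len);
    }

    static const struct file_operations demo_ops = {
            .read   = demo_read,
            .open   = simple_open,
            .llseek = default_llseek,
    };
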
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index a5ba739cd2a7..bcec1240f41d 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -30,7 +30,7 @@ static ssize_t ieee80211_if_read(
30 size_t count, loff_t *ppos, 30 size_t count, loff_t *ppos,
31 ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int)) 31 ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int))
32{ 32{
33 char buf[70]; 33 char buf[200];
34 ssize_t ret = -EINVAL; 34 ssize_t ret = -EINVAL;
35 35
36 read_lock(&dev_base_lock); 36 read_lock(&dev_base_lock);
@@ -486,6 +486,38 @@ static ssize_t ieee80211_if_fmt_num_buffered_multicast(
486} 486}
487IEEE80211_IF_FILE_R(num_buffered_multicast); 487IEEE80211_IF_FILE_R(num_buffered_multicast);
488 488
489static ssize_t ieee80211_if_fmt_aqm(
490 const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
491{
492 struct ieee80211_local *local = sdata->local;
493 struct txq_info *txqi = to_txq_info(sdata->vif.txq);
494 int len;
495
496 spin_lock_bh(&local->fq.lock);
497 rcu_read_lock();
498
499 len = scnprintf(buf,
500 buflen,
501 "ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n"
502 "%u %u %u %u %u %u %u %u %u %u\n",
503 txqi->txq.ac,
504 txqi->tin.backlog_bytes,
505 txqi->tin.backlog_packets,
506 txqi->tin.flows,
507 txqi->cstats.drop_count,
508 txqi->cstats.ecn_mark,
509 txqi->tin.overlimit,
510 txqi->tin.collisions,
511 txqi->tin.tx_bytes,
512 txqi->tin.tx_packets);
513
514 rcu_read_unlock();
515 spin_unlock_bh(&local->fq.lock);
516
517 return len;
518}
519IEEE80211_IF_FILE_R(aqm);
520
489/* IBSS attributes */ 521/* IBSS attributes */
490static ssize_t ieee80211_if_fmt_tsf( 522static ssize_t ieee80211_if_fmt_tsf(
491 const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) 523 const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -524,9 +556,15 @@ static ssize_t ieee80211_if_parse_tsf(
524 ret = kstrtoull(buf, 10, &tsf); 556 ret = kstrtoull(buf, 10, &tsf);
525 if (ret < 0) 557 if (ret < 0)
526 return ret; 558 return ret;
527 if (tsf_is_delta) 559 if (tsf_is_delta && local->ops->offset_tsf) {
528 tsf = drv_get_tsf(local, sdata) + tsf_is_delta * tsf; 560 drv_offset_tsf(local, sdata, tsf_is_delta * tsf);
529 if (local->ops->set_tsf) { 561 wiphy_info(local->hw.wiphy,
562 "debugfs offset TSF by %018lld\n",
563 tsf_is_delta * tsf);
564 } else if (local->ops->set_tsf) {
565 if (tsf_is_delta)
566 tsf = drv_get_tsf(local, sdata) +
567 tsf_is_delta * tsf;
530 drv_set_tsf(local, sdata, tsf); 568 drv_set_tsf(local, sdata, tsf);
531 wiphy_info(local->hw.wiphy, 569 wiphy_info(local->hw.wiphy,
532 "debugfs set TSF to %#018llx\n", tsf); 570 "debugfs set TSF to %#018llx\n", tsf);
@@ -618,6 +656,9 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata)
618 DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_2ghz); 656 DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_2ghz);
619 DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz); 657 DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz);
620 DEBUGFS_ADD(hw_queues); 658 DEBUGFS_ADD(hw_queues);
659
660 if (sdata->local->ops->wake_tx_queue)
661 DEBUGFS_ADD(aqm);
621} 662}
622 663
623static void add_sta_files(struct ieee80211_sub_if_data *sdata) 664static void add_sta_files(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index fd334133ff45..a2fcdb47a0e6 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -133,6 +133,55 @@ static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf,
133} 133}
134STA_OPS(last_seq_ctrl); 134STA_OPS(last_seq_ctrl);
135 135
136#define AQM_TXQ_ENTRY_LEN 130
137
138static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
139 size_t count, loff_t *ppos)
140{
141 struct sta_info *sta = file->private_data;
142 struct ieee80211_local *local = sta->local;
143 size_t bufsz = AQM_TXQ_ENTRY_LEN*(IEEE80211_NUM_TIDS+1);
144 char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
145 struct txq_info *txqi;
146 ssize_t rv;
147 int i;
148
149 if (!buf)
150 return -ENOMEM;
151
152 spin_lock_bh(&local->fq.lock);
153 rcu_read_lock();
154
155 p += scnprintf(p,
156 bufsz+buf-p,
157 "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n");
158
159 for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
160 txqi = to_txq_info(sta->sta.txq[i]);
161 p += scnprintf(p, bufsz+buf-p,
162 "%d %d %u %u %u %u %u %u %u %u %u\n",
163 txqi->txq.tid,
164 txqi->txq.ac,
165 txqi->tin.backlog_bytes,
166 txqi->tin.backlog_packets,
167 txqi->tin.flows,
168 txqi->cstats.drop_count,
169 txqi->cstats.ecn_mark,
170 txqi->tin.overlimit,
171 txqi->tin.collisions,
172 txqi->tin.tx_bytes,
173 txqi->tin.tx_packets);
174 }
175
176 rcu_read_unlock();
177 spin_unlock_bh(&local->fq.lock);
178
179 rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
180 kfree(buf);
181 return rv;
182}
183STA_OPS(aqm);
184
136static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, 185static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
137 size_t count, loff_t *ppos) 186 size_t count, loff_t *ppos)
138{ 187{
@@ -478,6 +527,9 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
478 DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); 527 DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments);
479 DEBUGFS_ADD_COUNTER(tx_filtered, status_stats.filtered); 528 DEBUGFS_ADD_COUNTER(tx_filtered, status_stats.filtered);
480 529
530 if (local->ops->wake_tx_queue)
531 DEBUGFS_ADD(aqm);
532
481 if (sizeof(sta->driver_buffered_tids) == sizeof(u32)) 533 if (sizeof(sta->driver_buffered_tids) == sizeof(u32))
482 debugfs_create_x32("driver_buffered_tids", 0400, 534 debugfs_create_x32("driver_buffered_tids", 0400,
483 sta->debugfs_dir, 535 sta->debugfs_dir,
@@ -492,10 +544,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
492 544
493void ieee80211_sta_debugfs_remove(struct sta_info *sta) 545void ieee80211_sta_debugfs_remove(struct sta_info *sta)
494{ 546{
495 struct ieee80211_local *local = sta->local;
496 struct ieee80211_sub_if_data *sdata = sta->sdata;
497
498 drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs_dir);
499 debugfs_remove_recursive(sta->debugfs_dir); 547 debugfs_remove_recursive(sta->debugfs_dir);
500 sta->debugfs_dir = NULL; 548 sta->debugfs_dir = NULL;
501} 549}
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index c258f1041d33..bb886e7db47f 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -62,7 +62,7 @@ int drv_add_interface(struct ieee80211_local *local,
62 if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || 62 if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
63 (sdata->vif.type == NL80211_IFTYPE_MONITOR && 63 (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
64 !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && 64 !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) &&
65 !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)))) 65 !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))))
66 return -EINVAL; 66 return -EINVAL;
67 67
68 trace_drv_add_interface(local, sdata); 68 trace_drv_add_interface(local, sdata);
@@ -215,6 +215,21 @@ void drv_set_tsf(struct ieee80211_local *local,
215 trace_drv_return_void(local); 215 trace_drv_return_void(local);
216} 216}
217 217
218void drv_offset_tsf(struct ieee80211_local *local,
219 struct ieee80211_sub_if_data *sdata,
220 s64 offset)
221{
222 might_sleep();
223
224 if (!check_sdata_in_driver(sdata))
225 return;
226
227 trace_drv_offset_tsf(local, sdata, offset);
228 if (local->ops->offset_tsf)
229 local->ops->offset_tsf(&local->hw, &sdata->vif, offset);
230 trace_drv_return_void(local);
231}
232
218void drv_reset_tsf(struct ieee80211_local *local, 233void drv_reset_tsf(struct ieee80211_local *local,
219 struct ieee80211_sub_if_data *sdata) 234 struct ieee80211_sub_if_data *sdata)
220{ 235{
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index ba5fc1f01e53..09f77e4a8a79 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -162,7 +162,9 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local,
162 return; 162 return;
163 163
164 if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || 164 if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE ||
165 sdata->vif.type == NL80211_IFTYPE_MONITOR)) 165 sdata->vif.type == NL80211_IFTYPE_NAN ||
166 (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
167 !sdata->vif.mu_mimo_owner)))
166 return; 168 return;
167 169
168 if (!check_sdata_in_driver(sdata)) 170 if (!check_sdata_in_driver(sdata))
@@ -498,21 +500,6 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local,
498 local->ops->sta_add_debugfs(&local->hw, &sdata->vif, 500 local->ops->sta_add_debugfs(&local->hw, &sdata->vif,
499 sta, dir); 501 sta, dir);
500} 502}
501
502static inline void drv_sta_remove_debugfs(struct ieee80211_local *local,
503 struct ieee80211_sub_if_data *sdata,
504 struct ieee80211_sta *sta,
505 struct dentry *dir)
506{
507 might_sleep();
508
509 sdata = get_bss_sdata(sdata);
510 check_sdata_in_driver(sdata);
511
512 if (local->ops->sta_remove_debugfs)
513 local->ops->sta_remove_debugfs(&local->hw, &sdata->vif,
514 sta, dir);
515}
516#endif 503#endif
517 504
518static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, 505static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local,
@@ -582,6 +569,9 @@ u64 drv_get_tsf(struct ieee80211_local *local,
582void drv_set_tsf(struct ieee80211_local *local, 569void drv_set_tsf(struct ieee80211_local *local,
583 struct ieee80211_sub_if_data *sdata, 570 struct ieee80211_sub_if_data *sdata,
584 u64 tsf); 571 u64 tsf);
572void drv_offset_tsf(struct ieee80211_local *local,
573 struct ieee80211_sub_if_data *sdata,
574 s64 offset);
585void drv_reset_tsf(struct ieee80211_local *local, 575void drv_reset_tsf(struct ieee80211_local *local,
586 struct ieee80211_sub_if_data *sdata); 576 struct ieee80211_sub_if_data *sdata);
587 577
@@ -1088,13 +1078,13 @@ static inline void drv_leave_ibss(struct ieee80211_local *local,
1088} 1078}
1089 1079
1090static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, 1080static inline u32 drv_get_expected_throughput(struct ieee80211_local *local,
1091 struct ieee80211_sta *sta) 1081 struct sta_info *sta)
1092{ 1082{
1093 u32 ret = 0; 1083 u32 ret = 0;
1094 1084
1095 trace_drv_get_expected_throughput(sta); 1085 trace_drv_get_expected_throughput(&sta->sta);
1096 if (local->ops->get_expected_throughput) 1086 if (local->ops->get_expected_throughput && sta->uploaded)
1097 ret = local->ops->get_expected_throughput(&local->hw, sta); 1087 ret = local->ops->get_expected_throughput(&local->hw, &sta->sta);
1098 trace_drv_return_u32(local, ret); 1088 trace_drv_return_u32(local, ret);
1099 1089
1100 return ret; 1090 return ret;
@@ -1179,4 +1169,83 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local,
1179 local->ops->wake_tx_queue(&local->hw, &txq->txq); 1169 local->ops->wake_tx_queue(&local->hw, &txq->txq);
1180} 1170}
1181 1171
1172static inline int drv_start_nan(struct ieee80211_local *local,
1173 struct ieee80211_sub_if_data *sdata,
1174 struct cfg80211_nan_conf *conf)
1175{
1176 int ret;
1177
1178 might_sleep();
1179 check_sdata_in_driver(sdata);
1180
1181 trace_drv_start_nan(local, sdata, conf);
1182 ret = local->ops->start_nan(&local->hw, &sdata->vif, conf);
1183 trace_drv_return_int(local, ret);
1184 return ret;
1185}
1186
1187static inline void drv_stop_nan(struct ieee80211_local *local,
1188 struct ieee80211_sub_if_data *sdata)
1189{
1190 might_sleep();
1191 check_sdata_in_driver(sdata);
1192
1193 trace_drv_stop_nan(local, sdata);
1194 local->ops->stop_nan(&local->hw, &sdata->vif);
1195 trace_drv_return_void(local);
1196}
1197
1198static inline int drv_nan_change_conf(struct ieee80211_local *local,
1199 struct ieee80211_sub_if_data *sdata,
1200 struct cfg80211_nan_conf *conf,
1201 u32 changes)
1202{
1203 int ret;
1204
1205 might_sleep();
1206 check_sdata_in_driver(sdata);
1207
1208 if (!local->ops->nan_change_conf)
1209 return -EOPNOTSUPP;
1210
1211 trace_drv_nan_change_conf(local, sdata, conf, changes);
1212 ret = local->ops->nan_change_conf(&local->hw, &sdata->vif, conf,
1213 changes);
1214 trace_drv_return_int(local, ret);
1215
1216 return ret;
1217}
1218
1219static inline int drv_add_nan_func(struct ieee80211_local *local,
1220 struct ieee80211_sub_if_data *sdata,
1221 const struct cfg80211_nan_func *nan_func)
1222{
1223 int ret;
1224
1225 might_sleep();
1226 check_sdata_in_driver(sdata);
1227
1228 if (!local->ops->add_nan_func)
1229 return -EOPNOTSUPP;
1230
1231 trace_drv_add_nan_func(local, sdata, nan_func);
1232 ret = local->ops->add_nan_func(&local->hw, &sdata->vif, nan_func);
1233 trace_drv_return_int(local, ret);
1234
1235 return ret;
1236}
1237
1238static inline void drv_del_nan_func(struct ieee80211_local *local,
1239 struct ieee80211_sub_if_data *sdata,
1240 u8 instance_id)
1241{
1242 might_sleep();
1243 check_sdata_in_driver(sdata);
1244
1245 trace_drv_del_nan_func(local, sdata, instance_id);
1246 if (local->ops->del_nan_func)
1247 local->ops->del_nan_func(&local->hw, &sdata->vif, instance_id);
1248 trace_drv_return_void(local);
1249}
1250
1182#endif /* __MAC80211_DRIVER_OPS */ 1251#endif /* __MAC80211_DRIVER_OPS */
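
The new drv_*_nan() inlines in driver-ops.h follow the usual mac80211 driver-op wrapper convention: assert sleepable context with might_sleep(), check that the interface was actually added to the driver, emit an entry tracepoint, call the driver callback if it exists (returning -EOPNOTSUPP for optional ops otherwise), and trace the result. A generic skeleton of that shape; the "demo" op and its tracepoint are illustrative, not real symbols.

    static inline int drv_demo_op(struct ieee80211_local *local,
                                  struct ieee80211_sub_if_data *sdata)
    {
            int ret;

            might_sleep();                  /* driver callbacks may sleep here */
            check_sdata_in_driver(sdata);   /* WARNs if the vif was never added */

            if (!local->ops->demo_op)       /* optional op */
                    return -EOPNOTSUPP;

            trace_drv_demo_op(local, sdata);
            ret = local->ops->demo_op(&local->hw, &sdata->vif);
            trace_drv_return_int(local, ret);

            return ret;
    }
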
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f56d342c31b8..34c2add2c455 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -3,7 +3,7 @@
3 * Copyright 2005, Devicescape Software, Inc. 3 * Copyright 2005, Devicescape Software, Inc.
4 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> 4 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
5 * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> 5 * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
6 * Copyright 2013-2014 Intel Mobile Communications GmbH 6 * Copyright 2013-2015 Intel Mobile Communications GmbH
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
@@ -86,6 +86,8 @@ struct ieee80211_local;
86 86
87#define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */) 87#define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */)
88 88
89#define IEEE80211_MAX_NAN_INSTANCE_ID 255
90
89struct ieee80211_fragment_entry { 91struct ieee80211_fragment_entry {
90 struct sk_buff_head skb_list; 92 struct sk_buff_head skb_list;
91 unsigned long first_frag_time; 93 unsigned long first_frag_time;
@@ -813,17 +815,39 @@ enum txq_info_flags {
813 * @def_flow: used as a fallback flow when a packet destined to @tin hashes to 815 * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
814 * a fq_flow which is already owned by a different tin 816 * a fq_flow which is already owned by a different tin
815 * @def_cvars: codel vars for @def_flow 817 * @def_cvars: codel vars for @def_flow
818 * @frags: used to keep fragments created after dequeue
816 */ 819 */
817struct txq_info { 820struct txq_info {
818 struct fq_tin tin; 821 struct fq_tin tin;
819 struct fq_flow def_flow; 822 struct fq_flow def_flow;
820 struct codel_vars def_cvars; 823 struct codel_vars def_cvars;
824 struct codel_stats cstats;
825 struct sk_buff_head frags;
821 unsigned long flags; 826 unsigned long flags;
822 827
823 /* keep last! */ 828 /* keep last! */
824 struct ieee80211_txq txq; 829 struct ieee80211_txq txq;
825}; 830};
826 831
832struct ieee80211_if_mntr {
833 u32 flags;
834 u8 mu_follow_addr[ETH_ALEN] __aligned(2);
835};
836
837/**
838 * struct ieee80211_if_nan - NAN state
839 *
840 * @conf: current NAN configuration
841 * @func_ids: a bitmap of available instance_id's
842 */
843struct ieee80211_if_nan {
844 struct cfg80211_nan_conf conf;
845
846 /* protects function_inst_ids */
847 spinlock_t func_lock;
848 struct idr function_inst_ids;
849};
850
827struct ieee80211_sub_if_data { 851struct ieee80211_sub_if_data {
828 struct list_head list; 852 struct list_head list;
829 853
@@ -922,7 +946,8 @@ struct ieee80211_sub_if_data {
922 struct ieee80211_if_ibss ibss; 946 struct ieee80211_if_ibss ibss;
923 struct ieee80211_if_mesh mesh; 947 struct ieee80211_if_mesh mesh;
924 struct ieee80211_if_ocb ocb; 948 struct ieee80211_if_ocb ocb;
925 u32 mntr_flags; 949 struct ieee80211_if_mntr mntr;
950 struct ieee80211_if_nan nan;
926 } u; 951 } u;
927 952
928#ifdef CONFIG_MAC80211_DEBUGFS 953#ifdef CONFIG_MAC80211_DEBUGFS
@@ -1112,7 +1137,6 @@ struct ieee80211_local {
1112 struct fq fq; 1137 struct fq fq;
1113 struct codel_vars *cvars; 1138 struct codel_vars *cvars;
1114 struct codel_params cparams; 1139 struct codel_params cparams;
1115 struct codel_stats cstats;
1116 1140
1117 const struct ieee80211_ops *ops; 1141 const struct ieee80211_ops *ops;
1118 1142
@@ -1208,7 +1232,7 @@ struct ieee80211_local {
1208 spinlock_t tim_lock; 1232 spinlock_t tim_lock;
1209 unsigned long num_sta; 1233 unsigned long num_sta;
1210 struct list_head sta_list; 1234 struct list_head sta_list;
1211 struct rhashtable sta_hash; 1235 struct rhltable sta_hash;
1212 struct timer_list sta_cleanup; 1236 struct timer_list sta_cleanup;
1213 int sta_generation; 1237 int sta_generation;
1214 1238
@@ -1476,6 +1500,13 @@ static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq)
1476 return container_of(txq, struct txq_info, txq); 1500 return container_of(txq, struct txq_info, txq);
1477} 1501}
1478 1502
1503static inline bool txq_has_queue(struct ieee80211_txq *txq)
1504{
1505 struct txq_info *txqi = to_txq_info(txq);
1506
1507 return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
1508}
1509
1479static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr) 1510static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr)
1480{ 1511{
1481 return ether_addr_equal(raddr, addr) || 1512 return ether_addr_equal(raddr, addr) ||
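
In ieee80211_i.h, txq_info grows codel stats and a frags queue for fragments produced after dequeue, and the new txq_has_queue() helper reports whether a txq still has work pending. Its negated-conjunction form is equivalent, by De Morgan, to the disjunction below, shown only as a reading aid:

    /* Equivalent formulation: pending post-dequeue fragments, or any
     * backlogged packets left in the fq tin. */
    static inline bool txq_has_queue(struct ieee80211_txq *txq)
    {
            struct txq_info *txqi = to_txq_info(txq);

            return !skb_queue_empty(&txqi->frags) || txqi->tin.backlog_packets;
    }
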
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index b123a9e325b3..638ec0759078 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -43,6 +43,8 @@
43 * by either the RTNL, the iflist_mtx or RCU. 43 * by either the RTNL, the iflist_mtx or RCU.
44 */ 44 */
45 45
46static void ieee80211_iface_work(struct work_struct *work);
47
46bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) 48bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata)
47{ 49{
48 struct ieee80211_chanctx_conf *chanctx_conf; 50 struct ieee80211_chanctx_conf *chanctx_conf;
@@ -188,7 +190,7 @@ static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr,
188 continue; 190 continue;
189 191
190 if (iter->vif.type == NL80211_IFTYPE_MONITOR && 192 if (iter->vif.type == NL80211_IFTYPE_MONITOR &&
191 !(iter->u.mntr_flags & MONITOR_FLAG_ACTIVE)) 193 !(iter->u.mntr.flags & MONITOR_FLAG_ACTIVE))
192 continue; 194 continue;
193 195
194 m = iter->vif.addr; 196 m = iter->vif.addr;
@@ -217,7 +219,7 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr)
217 return -EBUSY; 219 return -EBUSY;
218 220
219 if (sdata->vif.type == NL80211_IFTYPE_MONITOR && 221 if (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
220 !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)) 222 !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))
221 check_dup = false; 223 check_dup = false;
222 224
223 ret = ieee80211_verify_mac(sdata, sa->sa_data, check_dup); 225 ret = ieee80211_verify_mac(sdata, sa->sa_data, check_dup);
@@ -325,6 +327,9 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata,
325 int n_queues = sdata->local->hw.queues; 327 int n_queues = sdata->local->hw.queues;
326 int i; 328 int i;
327 329
330 if (iftype == NL80211_IFTYPE_NAN)
331 return 0;
332
328 if (iftype != NL80211_IFTYPE_P2P_DEVICE) { 333 if (iftype != NL80211_IFTYPE_P2P_DEVICE) {
329 for (i = 0; i < IEEE80211_NUM_ACS; i++) { 334 for (i = 0; i < IEEE80211_NUM_ACS; i++) {
330 if (WARN_ON_ONCE(sdata->vif.hw_queue[i] == 335 if (WARN_ON_ONCE(sdata->vif.hw_queue[i] ==
@@ -357,7 +362,7 @@ void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
357 const int offset) 362 const int offset)
358{ 363{
359 struct ieee80211_local *local = sdata->local; 364 struct ieee80211_local *local = sdata->local;
360 u32 flags = sdata->u.mntr_flags; 365 u32 flags = sdata->u.mntr.flags;
361 366
362#define ADJUST(_f, _s) do { \ 367#define ADJUST(_f, _s) do { \
363 if (flags & MONITOR_FLAG_##_f) \ 368 if (flags & MONITOR_FLAG_##_f) \
@@ -448,6 +453,9 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
448 return ret; 453 return ret;
449 } 454 }
450 455
456 skb_queue_head_init(&sdata->skb_queue);
457 INIT_WORK(&sdata->work, ieee80211_iface_work);
458
451 return 0; 459 return 0;
452} 460}
453 461
@@ -540,6 +548,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
540 case NL80211_IFTYPE_ADHOC: 548 case NL80211_IFTYPE_ADHOC:
541 case NL80211_IFTYPE_P2P_DEVICE: 549 case NL80211_IFTYPE_P2P_DEVICE:
542 case NL80211_IFTYPE_OCB: 550 case NL80211_IFTYPE_OCB:
551 case NL80211_IFTYPE_NAN:
543 /* no special treatment */ 552 /* no special treatment */
544 break; 553 break;
545 case NL80211_IFTYPE_UNSPECIFIED: 554 case NL80211_IFTYPE_UNSPECIFIED:
@@ -589,12 +598,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
589 } 598 }
590 break; 599 break;
591 case NL80211_IFTYPE_MONITOR: 600 case NL80211_IFTYPE_MONITOR:
592 if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) { 601 if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
593 local->cooked_mntrs++; 602 local->cooked_mntrs++;
594 break; 603 break;
595 } 604 }
596 605
597 if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) { 606 if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
598 res = drv_add_interface(local, sdata); 607 res = drv_add_interface(local, sdata);
599 if (res) 608 if (res)
600 goto err_stop; 609 goto err_stop;
@@ -641,7 +650,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
641 local->fif_probe_req++; 650 local->fif_probe_req++;
642 } 651 }
643 652
644 if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) 653 if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
654 sdata->vif.type != NL80211_IFTYPE_NAN)
645 changed |= ieee80211_reset_erp_info(sdata); 655 changed |= ieee80211_reset_erp_info(sdata);
646 ieee80211_bss_info_change_notify(sdata, changed); 656 ieee80211_bss_info_change_notify(sdata, changed);
647 657
@@ -655,6 +665,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
655 break; 665 break;
656 case NL80211_IFTYPE_WDS: 666 case NL80211_IFTYPE_WDS:
657 case NL80211_IFTYPE_P2P_DEVICE: 667 case NL80211_IFTYPE_P2P_DEVICE:
668 case NL80211_IFTYPE_NAN:
658 break; 669 break;
659 default: 670 default:
660 /* not reached */ 671 /* not reached */
@@ -787,6 +798,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
787 struct ps_data *ps; 798 struct ps_data *ps;
788 struct cfg80211_chan_def chandef; 799 struct cfg80211_chan_def chandef;
789 bool cancel_scan; 800 bool cancel_scan;
801 struct cfg80211_nan_func *func;
790 802
791 clear_bit(SDATA_STATE_RUNNING, &sdata->state); 803 clear_bit(SDATA_STATE_RUNNING, &sdata->state);
792 804
@@ -926,7 +938,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
926 /* no need to tell driver */ 938 /* no need to tell driver */
927 break; 939 break;
928 case NL80211_IFTYPE_MONITOR: 940 case NL80211_IFTYPE_MONITOR:
929 if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) { 941 if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
930 local->cooked_mntrs--; 942 local->cooked_mntrs--;
931 break; 943 break;
932 } 944 }
@@ -939,6 +951,18 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
939 951
940 ieee80211_adjust_monitor_flags(sdata, -1); 952 ieee80211_adjust_monitor_flags(sdata, -1);
941 break; 953 break;
954 case NL80211_IFTYPE_NAN:
955 /* clean all the functions */
956 spin_lock_bh(&sdata->u.nan.func_lock);
957
958 idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, i) {
959 idr_remove(&sdata->u.nan.function_inst_ids, i);
960 cfg80211_free_nan_func(func);
961 }
962 idr_destroy(&sdata->u.nan.function_inst_ids);
963
964 spin_unlock_bh(&sdata->u.nan.func_lock);
965 break;
942 case NL80211_IFTYPE_P2P_DEVICE: 966 case NL80211_IFTYPE_P2P_DEVICE:
943 /* relies on synchronize_rcu() below */ 967 /* relies on synchronize_rcu() below */
944 RCU_INIT_POINTER(local->p2p_sdata, NULL); 968 RCU_INIT_POINTER(local->p2p_sdata, NULL);
@@ -1012,7 +1036,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
1012 ieee80211_recalc_idle(local); 1036 ieee80211_recalc_idle(local);
1013 mutex_unlock(&local->mtx); 1037 mutex_unlock(&local->mtx);
1014 1038
1015 if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)) 1039 if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))
1016 break; 1040 break;
1017 1041
1018 /* fall through */ 1042 /* fall through */
@@ -1444,12 +1468,17 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
1444 case NL80211_IFTYPE_MONITOR: 1468 case NL80211_IFTYPE_MONITOR:
1445 sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP; 1469 sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP;
1446 sdata->dev->netdev_ops = &ieee80211_monitorif_ops; 1470 sdata->dev->netdev_ops = &ieee80211_monitorif_ops;
1447 sdata->u.mntr_flags = MONITOR_FLAG_CONTROL | 1471 sdata->u.mntr.flags = MONITOR_FLAG_CONTROL |
1448 MONITOR_FLAG_OTHER_BSS; 1472 MONITOR_FLAG_OTHER_BSS;
1449 break; 1473 break;
1450 case NL80211_IFTYPE_WDS: 1474 case NL80211_IFTYPE_WDS:
1451 sdata->vif.bss_conf.bssid = NULL; 1475 sdata->vif.bss_conf.bssid = NULL;
1452 break; 1476 break;
1477 case NL80211_IFTYPE_NAN:
1478 idr_init(&sdata->u.nan.function_inst_ids);
1479 spin_lock_init(&sdata->u.nan.func_lock);
1480 sdata->vif.bss_conf.bssid = sdata->vif.addr;
1481 break;
1453 case NL80211_IFTYPE_AP_VLAN: 1482 case NL80211_IFTYPE_AP_VLAN:
1454 case NL80211_IFTYPE_P2P_DEVICE: 1483 case NL80211_IFTYPE_P2P_DEVICE:
1455 sdata->vif.bss_conf.bssid = sdata->vif.addr; 1484 sdata->vif.bss_conf.bssid = sdata->vif.addr;
@@ -1717,7 +1746,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
1717 1746
1718 ASSERT_RTNL(); 1747 ASSERT_RTNL();
1719 1748
1720 if (type == NL80211_IFTYPE_P2P_DEVICE) { 1749 if (type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN) {
1721 struct wireless_dev *wdev; 1750 struct wireless_dev *wdev;
1722 1751
1723 sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, 1752 sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size,
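
The NAN hunks above pair idr_init()/spin_lock_init() at interface setup with a full idr_for_each_entry()/idr_remove()/idr_destroy() sweep under u.nan.func_lock at stop time, so no registered cfg80211_nan_func outlives the interface. A minimal, self-contained sketch of that IDR lifecycle (hypothetical my_* names, not mac80211 code):

	#include <linux/idr.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct my_func { int dummy; };			/* hypothetical payload */
	struct my_state {
		struct idr ids;
		spinlock_t lock;
	};

	static void my_setup(struct my_state *st)
	{
		idr_init(&st->ids);			/* as in ieee80211_setup_sdata() */
		spin_lock_init(&st->lock);
	}

	static int my_add(struct my_state *st, struct my_func *f, int max_id)
	{
		int id;

		spin_lock_bh(&st->lock);
		/* NAN instance ids start at 1; idr_alloc()'s end is exclusive */
		id = idr_alloc(&st->ids, f, 1, max_id + 1, GFP_ATOMIC);
		spin_unlock_bh(&st->lock);
		return id;				/* negative errno on failure */
	}

	static void my_stop(struct my_state *st)
	{
		struct my_func *f;
		int i;

		spin_lock_bh(&st->lock);
		idr_for_each_entry(&st->ids, f, i) {	/* as in ieee80211_do_stop() */
			idr_remove(&st->ids, i);
			kfree(f);
		}
		idr_destroy(&st->ids);
		spin_unlock_bh(&st->lock);
	}
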
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index d00ea9b13f49..1075ac24c8c5 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -660,6 +660,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
660 660
661 ieee80211_roc_setup(local); 661 ieee80211_roc_setup(local);
662 662
663 local->hw.radiotap_timestamp.units_pos = -1;
664 local->hw.radiotap_timestamp.accuracy = -1;
665
663 return &local->hw; 666 return &local->hw;
664 err_free: 667 err_free:
665 wiphy_free(wiphy); 668 wiphy_free(wiphy);
@@ -818,6 +821,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
818 !local->ops->tdls_recv_channel_switch)) 821 !local->ops->tdls_recv_channel_switch))
819 return -EOPNOTSUPP; 822 return -EOPNOTSUPP;
820 823
824 if (WARN_ON(local->hw.wiphy->interface_modes &
825 BIT(NL80211_IFTYPE_NAN) &&
826 (!local->ops->start_nan || !local->ops->stop_nan)))
827 return -EINVAL;
828
821#ifdef CONFIG_PM 829#ifdef CONFIG_PM
822 if (hw->wiphy->wowlan && (!local->ops->suspend || !local->ops->resume)) 830 if (hw->wiphy->wowlan && (!local->ops->suspend || !local->ops->resume))
823 return -EINVAL; 831 return -EINVAL;
@@ -1055,6 +1063,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
1055 1063
1056 local->dynamic_ps_forced_timeout = -1; 1064 local->dynamic_ps_forced_timeout = -1;
1057 1065
1066 if (!local->hw.max_nan_de_entries)
1067 local->hw.max_nan_de_entries = IEEE80211_MAX_NAN_INSTANCE_ID;
1068
1058 result = ieee80211_wep_init(local); 1069 result = ieee80211_wep_init(local);
1059 if (result < 0) 1070 if (result < 0)
1060 wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", 1071 wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n",
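
The new WARN_ON in ieee80211_register_hw() above enforces that a driver advertising BIT(NL80211_IFTYPE_NAN) also wires up the NAN start/stop ops. A hedged sketch of the driver side; the op signatures follow the NAN support added in this series and should be treated as an assumption, and all my_* names are hypothetical:

	#include <net/mac80211.h>

	static int my_start_nan(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
				struct cfg80211_nan_conf *conf)
	{
		/* program the device's NAN discovery engine here */
		return 0;
	}

	static int my_stop_nan(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
	{
		return 0;
	}

	static const struct ieee80211_ops my_ops = {
		/* ...mandatory ops (tx, start, stop, ...) elided... */
		.start_nan = my_start_nan,
		.stop_nan = my_stop_nan,
	};

	static void my_setup_modes(struct ieee80211_hw *hw)
	{
		/* only safe together with .start_nan/.stop_nan, per the WARN_ON */
		hw->wiphy->interface_modes |= BIT(NL80211_IFTYPE_NAN);
	}
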
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index faccef977670..b747c9645e43 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -326,22 +326,33 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local,
326 u32 tx_time, estimated_retx; 326 u32 tx_time, estimated_retx;
327 u64 result; 327 u64 result;
328 328
329 if (sta->mesh->fail_avg >= 100) 329 /* Try to get rate based on HW/SW RC algorithm.
330 return MAX_METRIC; 330 * Rate is returned in units of Kbps, correct this
331 * to comply with airtime calculation units
332 * Round up in case we get rate < 100Kbps
333 */
334 rate = DIV_ROUND_UP(sta_get_expected_throughput(sta), 100);
331 335
332 sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo); 336 if (rate) {
333 rate = cfg80211_calculate_bitrate(&rinfo); 337 err = 0;
334 if (WARN_ON(!rate)) 338 } else {
335 return MAX_METRIC; 339 if (sta->mesh->fail_avg >= 100)
340 return MAX_METRIC;
336 341
337 err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100; 342 sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo);
343 rate = cfg80211_calculate_bitrate(&rinfo);
344 if (WARN_ON(!rate))
345 return MAX_METRIC;
346
347 err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100;
348 }
338 349
339 /* bitrate is in units of 100 Kbps, while we need rate in units of 350 /* bitrate is in units of 100 Kbps, while we need rate in units of
340 * 1Mbps. This will be corrected on tx_time computation. 351 * 1Mbps. This will be corrected on tx_time computation.
341 */ 352 */
342 tx_time = (device_constant + 10 * test_frame_len / rate); 353 tx_time = (device_constant + 10 * test_frame_len / rate);
343 estimated_retx = ((1 << (2 * ARITH_SHIFT)) / (s_unit - err)); 354 estimated_retx = ((1 << (2 * ARITH_SHIFT)) / (s_unit - err));
344 result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT) ; 355 result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT);
345 return (u32)result; 356 return (u32)result;
346} 357}
347 358
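
Two unit details carry the hunk above: sta_get_expected_throughput() reports Kbps while the metric, like cfg80211_calculate_bitrate(), works in 100 Kbps units, and DIV_ROUND_UP() guarantees a non-zero rate for sub-100 Kbps estimates so the later tx_time division cannot hit zero; a result of 0 instead selects the legacy fail_avg/last-rate fallback. A tiny illustration of just that conversion (plain userspace C, illustrative numbers):

	#include <stdio.h>

	#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

	int main(void)
	{
		/* expected throughput in Kbps -> rate in units of 100 Kbps */
		printf("%u\n", DIV_ROUND_UP(58500u, 100u));	/* 585 */
		printf("%u\n", DIV_ROUND_UP(50u, 100u));	/* 1, never 0 for a live link */
		printf("%u\n", DIV_ROUND_UP(0u, 100u));		/* 0 -> take the fallback path */
		return 0;
	}
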
diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c
index 64bc22ad9496..faca22cd02b5 100644
--- a/net/mac80211/mesh_sync.c
+++ b/net/mac80211/mesh_sync.c
@@ -28,7 +28,7 @@
28 * could be, for instance, in case a neighbor is restarted and its TSF counter 28 * could be, for instance, in case a neighbor is restarted and its TSF counter
29 * reset. 29 * reset.
30 */ 30 */
31#define TOFFSET_MAXIMUM_ADJUSTMENT 30000 /* 30 ms */ 31#define TOFFSET_MAXIMUM_ADJUSTMENT 800 /* 0.8 ms */
32 32
33struct sync_method { 33struct sync_method {
34 u8 method; 34 u8 method;
@@ -70,9 +70,13 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
70 } 70 }
71 spin_unlock_bh(&ifmsh->sync_offset_lock); 71 spin_unlock_bh(&ifmsh->sync_offset_lock);
72 72
73 tsf = drv_get_tsf(local, sdata); 73 if (local->ops->offset_tsf) {
74 if (tsf != -1ULL) 74 drv_offset_tsf(local, sdata, tsfdelta);
75 drv_set_tsf(local, sdata, tsf + tsfdelta); 75 } else {
76 tsf = drv_get_tsf(local, sdata);
77 if (tsf != -1ULL)
78 drv_set_tsf(local, sdata, tsf + tsfdelta);
79 }
76} 80}
77 81
78static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, 82static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
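
mesh_sync_adjust_tbtt() now prefers the new drv_offset_tsf() op, letting the driver apply the delta atomically instead of the racy get/set pair, and the per-beacon clamp drops from 30 ms to 0.8 ms. A hedged sketch of how a driver without a hardware offset register might still provide the op via its own read/write helpers (my_* names are hypothetical; the op signature follows this series and is an assumption):

	#include <net/mac80211.h>

	struct my_priv { u64 tsf; };			/* hypothetical driver state */

	static u64 my_read_tsf(struct my_priv *p)  { return p->tsf; }
	static void my_write_tsf(struct my_priv *p, u64 v) { p->tsf = v; }

	static void my_offset_tsf(struct ieee80211_hw *hw,
				  struct ieee80211_vif *vif, s64 offset)
	{
		struct my_priv *priv = hw->priv;

		/* one read-modify-write close to the hardware, instead of the
		 * mac80211-level get_tsf()/set_tsf() fallback shown above */
		my_write_tsf(priv, my_read_tsf(priv) + offset);
	}
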
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 8d426f637f58..7486f2dab4ba 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1672,11 +1672,15 @@ __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata)
1672 non_acm_ac++) 1672 non_acm_ac++)
1673 if (!(sdata->wmm_acm & BIT(7 - 2 * non_acm_ac))) 1673 if (!(sdata->wmm_acm & BIT(7 - 2 * non_acm_ac)))
1674 break; 1674 break;
1675 /* The loop will result in using BK even if it requires 1675 /* Usually the loop will result in using BK even if it
1676 * admission control, such configuration makes no sense 1676 * requires admission control, but such a configuration
1677 * and we have to transmit somehow - the AC selection 1677 * makes no sense and we have to transmit somehow - the
1678 * does the same thing. 1678 * AC selection does the same thing.
1679 * If we started out trying to downgrade from BK, then
1680 * the extra condition here might be needed.
1679 */ 1681 */
1682 if (non_acm_ac >= IEEE80211_NUM_ACS)
1683 non_acm_ac = IEEE80211_AC_BK;
1680 if (drv_conf_tx(local, sdata, ac, 1684 if (drv_conf_tx(local, sdata, ac,
1681 &sdata->tx_conf[non_acm_ac])) 1685 &sdata->tx_conf[non_acm_ac]))
1682 sdata_err(sdata, 1686 sdata_err(sdata,
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 55a9c5b94ce1..c3f610bba3fe 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -128,7 +128,8 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local)
128 if (!ieee80211_sdata_running(sdata)) 128 if (!ieee80211_sdata_running(sdata))
129 continue; 129 continue;
130 130
131 if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) 131 if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE ||
132 sdata->vif.type == NL80211_IFTYPE_NAN)
132 continue; 133 continue;
133 134
134 if (sdata->vif.type != NL80211_IFTYPE_MONITOR) 135 if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
@@ -838,6 +839,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
838 case NL80211_IFTYPE_P2P_DEVICE: 839 case NL80211_IFTYPE_P2P_DEVICE:
839 need_offchan = true; 840 need_offchan = true;
840 break; 841 break;
842 case NL80211_IFTYPE_NAN:
841 default: 843 default:
842 return -EOPNOTSUPP; 844 return -EOPNOTSUPP;
843 } 845 }
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index 00a43a70e1fc..28a3a0957c9e 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -178,8 +178,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
178 WARN_ON(!list_empty(&local->chanctx_list)); 178 WARN_ON(!list_empty(&local->chanctx_list));
179 179
180 /* stop hardware - this must stop RX */ 180 /* stop hardware - this must stop RX */
181 if (local->open_count) 181 ieee80211_stop_device(local);
182 ieee80211_stop_device(local);
183 182
184 suspend: 183 suspend:
185 local->suspended = true; 184 local->suspended = true;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 9dce3b157908..6175db385ba7 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -180,6 +180,11 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
180 len += 12; 180 len += 12;
181 } 181 }
182 182
183 if (local->hw.radiotap_timestamp.units_pos >= 0) {
184 len = ALIGN(len, 8);
185 len += 12;
186 }
187
183 if (status->chains) { 188 if (status->chains) {
184 /* antenna and antenna signal fields */ 189 /* antenna and antenna signal fields */
185 len += 2 * hweight8(status->chains); 190 len += 2 * hweight8(status->chains);
@@ -447,6 +452,31 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
447 pos += 2; 452 pos += 2;
448 } 453 }
449 454
455 if (local->hw.radiotap_timestamp.units_pos >= 0) {
456 u16 accuracy = 0;
457 u8 flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT;
458
459 rthdr->it_present |=
460 cpu_to_le32(1 << IEEE80211_RADIOTAP_TIMESTAMP);
461
462 /* ensure 8 byte alignment */
463 while ((pos - (u8 *)rthdr) & 7)
464 pos++;
465
466 put_unaligned_le64(status->device_timestamp, pos);
467 pos += sizeof(u64);
468
469 if (local->hw.radiotap_timestamp.accuracy >= 0) {
470 accuracy = local->hw.radiotap_timestamp.accuracy;
471 flags |= IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY;
472 }
473 put_unaligned_le16(accuracy, pos);
474 pos += sizeof(u16);
475
476 *pos++ = local->hw.radiotap_timestamp.units_pos;
477 *pos++ = flags;
478 }
479
450 for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { 480 for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
451 *pos++ = status->chain_signal[chain]; 481 *pos++ = status->chain_signal[chain];
452 *pos++ = chain; 482 *pos++ = chain;
@@ -485,6 +515,9 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
485 struct net_device *prev_dev = NULL; 515 struct net_device *prev_dev = NULL;
486 int present_fcs_len = 0; 516 int present_fcs_len = 0;
487 unsigned int rtap_vendor_space = 0; 517 unsigned int rtap_vendor_space = 0;
518 struct ieee80211_mgmt *mgmt;
519 struct ieee80211_sub_if_data *monitor_sdata =
520 rcu_dereference(local->monitor_sdata);
488 521
489 if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { 522 if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
490 struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; 523 struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data;
@@ -567,7 +600,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
567 if (sdata->vif.type != NL80211_IFTYPE_MONITOR) 600 if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
568 continue; 601 continue;
569 602
570 if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) 603 if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)
571 continue; 604 continue;
572 605
573 if (!ieee80211_sdata_running(sdata)) 606 if (!ieee80211_sdata_running(sdata))
@@ -585,6 +618,23 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
585 ieee80211_rx_stats(sdata->dev, skb->len); 618 ieee80211_rx_stats(sdata->dev, skb->len);
586 } 619 }
587 620
621 mgmt = (void *)skb->data;
622 if (monitor_sdata &&
623 skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + VHT_MUMIMO_GROUPS_DATA_LEN &&
624 ieee80211_is_action(mgmt->frame_control) &&
625 mgmt->u.action.category == WLAN_CATEGORY_VHT &&
626 mgmt->u.action.u.vht_group_notif.action_code == WLAN_VHT_ACTION_GROUPID_MGMT &&
627 is_valid_ether_addr(monitor_sdata->u.mntr.mu_follow_addr) &&
628 ether_addr_equal(mgmt->da, monitor_sdata->u.mntr.mu_follow_addr)) {
629 struct sk_buff *mu_skb = skb_copy(skb, GFP_ATOMIC);
630
631 if (mu_skb) {
632 mu_skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
633 skb_queue_tail(&monitor_sdata->skb_queue, mu_skb);
634 ieee80211_queue_work(&local->hw, &monitor_sdata->work);
635 }
636 }
637
588 if (prev_dev) { 638 if (prev_dev) {
589 skb->dev = prev_dev; 639 skb->dev = prev_dev;
590 netif_receive_skb(skb); 640 netif_receive_skb(skb);
@@ -1072,8 +1122,15 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx,
1072 tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; 1122 tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
1073 1123
1074 tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); 1124 tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
1075 if (!tid_agg_rx) 1125 if (!tid_agg_rx) {
1126 if (ack_policy == IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK &&
1127 !test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) &&
1128 !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg))
1129 ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid,
1130 WLAN_BACK_RECIPIENT,
1131 WLAN_REASON_QSTA_REQUIRE_SETUP);
1076 goto dont_reorder; 1132 goto dont_reorder;
1133 }
1077 1134
1078 /* qos null data frames are excluded */ 1135 /* qos null data frames are excluded */
1079 if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC))) 1136 if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC)))
@@ -1266,9 +1323,7 @@ static void sta_ps_start(struct sta_info *sta)
1266 return; 1323 return;
1267 1324
1268 for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { 1325 for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
1269 struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); 1326 if (txq_has_queue(sta->sta.txq[tid]))
1270
1271 if (txqi->tin.backlog_packets)
1272 set_bit(tid, &sta->txq_buffered_tids); 1327 set_bit(tid, &sta->txq_buffered_tids);
1273 else 1328 else
1274 clear_bit(tid, &sta->txq_buffered_tids); 1329 clear_bit(tid, &sta->txq_buffered_tids);
@@ -2535,6 +2590,12 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
2535 2590
2536 tid = le16_to_cpu(bar_data.control) >> 12; 2591 tid = le16_to_cpu(bar_data.control) >> 12;
2537 2592
2593 if (!test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) &&
2594 !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg))
2595 ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid,
2596 WLAN_BACK_RECIPIENT,
2597 WLAN_REASON_QSTA_REQUIRE_SETUP);
2598
2538 tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]); 2599 tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]);
2539 if (!tid_agg_rx) 2600 if (!tid_agg_rx)
2540 return RX_DROP_MONITOR; 2601 return RX_DROP_MONITOR;
@@ -3147,7 +3208,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
3147 continue; 3208 continue;
3148 3209
3149 if (sdata->vif.type != NL80211_IFTYPE_MONITOR || 3210 if (sdata->vif.type != NL80211_IFTYPE_MONITOR ||
3150 !(sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)) 3211 !(sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES))
3151 continue; 3212 continue;
3152 3213
3153 if (prev_dev) { 3214 if (prev_dev) {
@@ -3523,6 +3584,9 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
3523 ieee80211_is_probe_req(hdr->frame_control) || 3584 ieee80211_is_probe_req(hdr->frame_control) ||
3524 ieee80211_is_probe_resp(hdr->frame_control) || 3585 ieee80211_is_probe_resp(hdr->frame_control) ||
3525 ieee80211_is_beacon(hdr->frame_control); 3586 ieee80211_is_beacon(hdr->frame_control);
3587 case NL80211_IFTYPE_NAN:
3588 /* Currently no frames on NAN interface are allowed */
3589 return false;
3526 default: 3590 default:
3527 break; 3591 break;
3528 } 3592 }
@@ -3940,7 +4004,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
3940 __le16 fc; 4004 __le16 fc;
3941 struct ieee80211_rx_data rx; 4005 struct ieee80211_rx_data rx;
3942 struct ieee80211_sub_if_data *prev; 4006 struct ieee80211_sub_if_data *prev;
3943 struct rhash_head *tmp; 4007 struct rhlist_head *tmp;
3944 int err = 0; 4008 int err = 0;
3945 4009
3946 fc = ((struct ieee80211_hdr *)skb->data)->frame_control; 4010 fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
@@ -3983,13 +4047,10 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
3983 goto out; 4047 goto out;
3984 } else if (ieee80211_is_data(fc)) { 4048 } else if (ieee80211_is_data(fc)) {
3985 struct sta_info *sta, *prev_sta; 4049 struct sta_info *sta, *prev_sta;
3986 const struct bucket_table *tbl;
3987 4050
3988 prev_sta = NULL; 4051 prev_sta = NULL;
3989 4052
3990 tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); 4053 for_each_sta_info(local, hdr->addr2, sta, tmp) {
3991
3992 for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) {
3993 if (!prev_sta) { 4054 if (!prev_sta) {
3994 prev_sta = sta; 4055 prev_sta = sta;
3995 continue; 4056 continue;
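
For the radiotap timestamp additions above, the reserved space is ALIGN(len, 8) + 12 because the field body is a u64 timestamp, a u16 accuracy, one unit/position byte and one flags byte (8 + 2 + 1 + 1 = 12), and the u64 must start on an 8-byte boundary inside the header. A standalone sketch of the same packing (plain C, not the mac80211 code):

	#include <stdint.h>

	static uint8_t *put_le64(uint8_t *p, uint64_t v)
	{
		int i;

		for (i = 0; i < 8; i++)
			*p++ = v >> (8 * i);
		return p;
	}

	static uint8_t *emit_timestamp(uint8_t *hdr_start, uint8_t *pos,
				       uint64_t ts, uint16_t accuracy,
				       uint8_t units_pos, uint8_t flags)
	{
		while ((pos - hdr_start) & 7)	/* pad so the u64 is 8-byte aligned */
			*pos++ = 0;
		pos = put_le64(pos, ts);	/* 8 bytes */
		*pos++ = accuracy & 0xff;	/* 2 bytes, little endian */
		*pos++ = accuracy >> 8;
		*pos++ = units_pos;		/* 1 byte */
		*pos++ = flags;			/* 1 byte -> 12 bytes total */
		return pos;
	}
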
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 070b40f15850..23d8ac829279 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -420,7 +420,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw,
420{ 420{
421 struct ieee80211_local *local = hw_to_local(hw); 421 struct ieee80211_local *local = hw_to_local(hw);
422 422
423 trace_api_scan_completed(local, info); 423 trace_api_scan_completed(local, info->aborted);
424 424
425 set_bit(SCAN_COMPLETED, &local->scanning); 425 set_bit(SCAN_COMPLETED, &local->scanning);
426 if (info->aborted) 426 if (info->aborted)
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index aa58df80ede0..78e9ecbc96e6 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -67,12 +67,10 @@
67 67
68static const struct rhashtable_params sta_rht_params = { 68static const struct rhashtable_params sta_rht_params = {
69 .nelem_hint = 3, /* start small */ 69 .nelem_hint = 3, /* start small */
70 .insecure_elasticity = true, /* Disable chain-length checks. */
71 .automatic_shrinking = true, 70 .automatic_shrinking = true,
72 .head_offset = offsetof(struct sta_info, hash_node), 71 .head_offset = offsetof(struct sta_info, hash_node),
73 .key_offset = offsetof(struct sta_info, addr), 72 .key_offset = offsetof(struct sta_info, addr),
74 .key_len = ETH_ALEN, 73 .key_len = ETH_ALEN,
75 .hashfn = sta_addr_hash,
76 .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, 74 .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
77}; 75};
78 76
@@ -80,8 +78,8 @@ static const struct rhashtable_params sta_rht_params = {
80static int sta_info_hash_del(struct ieee80211_local *local, 78static int sta_info_hash_del(struct ieee80211_local *local,
81 struct sta_info *sta) 79 struct sta_info *sta)
82{ 80{
83 return rhashtable_remove_fast(&local->sta_hash, &sta->hash_node, 81 return rhltable_remove(&local->sta_hash, &sta->hash_node,
84 sta_rht_params); 82 sta_rht_params);
85} 83}
86 84
87static void __cleanup_single_sta(struct sta_info *sta) 85static void __cleanup_single_sta(struct sta_info *sta)
@@ -157,19 +155,22 @@ static void cleanup_single_sta(struct sta_info *sta)
157 sta_info_free(local, sta); 155 sta_info_free(local, sta);
158} 156}
159 157
158struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
159 const u8 *addr)
160{
161 return rhltable_lookup(&local->sta_hash, addr, sta_rht_params);
162}
163
160/* protected by RCU */ 164/* protected by RCU */
161struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, 165struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
162 const u8 *addr) 166 const u8 *addr)
163{ 167{
164 struct ieee80211_local *local = sdata->local; 168 struct ieee80211_local *local = sdata->local;
169 struct rhlist_head *tmp;
165 struct sta_info *sta; 170 struct sta_info *sta;
166 struct rhash_head *tmp;
167 const struct bucket_table *tbl;
168 171
169 rcu_read_lock(); 172 rcu_read_lock();
170 tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); 173 for_each_sta_info(local, addr, sta, tmp) {
171
172 for_each_sta_info(local, tbl, addr, sta, tmp) {
173 if (sta->sdata == sdata) { 174 if (sta->sdata == sdata) {
174 rcu_read_unlock(); 175 rcu_read_unlock();
175 /* this is safe as the caller must already hold 176 /* this is safe as the caller must already hold
@@ -190,14 +191,11 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
190 const u8 *addr) 191 const u8 *addr)
191{ 192{
192 struct ieee80211_local *local = sdata->local; 193 struct ieee80211_local *local = sdata->local;
194 struct rhlist_head *tmp;
193 struct sta_info *sta; 195 struct sta_info *sta;
194 struct rhash_head *tmp;
195 const struct bucket_table *tbl;
196 196
197 rcu_read_lock(); 197 rcu_read_lock();
198 tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); 198 for_each_sta_info(local, addr, sta, tmp) {
199
200 for_each_sta_info(local, tbl, addr, sta, tmp) {
201 if (sta->sdata == sdata || 199 if (sta->sdata == sdata ||
202 (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { 200 (sta->sdata->bss && sta->sdata->bss == sdata->bss)) {
203 rcu_read_unlock(); 201 rcu_read_unlock();
@@ -263,8 +261,8 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
263static int sta_info_hash_add(struct ieee80211_local *local, 261static int sta_info_hash_add(struct ieee80211_local *local,
264 struct sta_info *sta) 262 struct sta_info *sta)
265{ 263{
266 return rhashtable_insert_fast(&local->sta_hash, &sta->hash_node, 264 return rhltable_insert(&local->sta_hash, &sta->hash_node,
267 sta_rht_params); 265 sta_rht_params);
268} 266}
269 267
270static void sta_deliver_ps_frames(struct work_struct *wk) 268static void sta_deliver_ps_frames(struct work_struct *wk)
@@ -340,6 +338,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
340 338
341 memcpy(sta->addr, addr, ETH_ALEN); 339 memcpy(sta->addr, addr, ETH_ALEN);
342 memcpy(sta->sta.addr, addr, ETH_ALEN); 340 memcpy(sta->sta.addr, addr, ETH_ALEN);
341 sta->sta.max_rx_aggregation_subframes =
342 local->hw.max_rx_aggregation_subframes;
343
343 sta->local = local; 344 sta->local = local;
344 sta->sdata = sdata; 345 sta->sdata = sdata;
345 sta->rx_stats.last_rx = jiffies; 346 sta->rx_stats.last_rx = jiffies;
@@ -450,9 +451,9 @@ static int sta_info_insert_check(struct sta_info *sta)
450 is_multicast_ether_addr(sta->sta.addr))) 451 is_multicast_ether_addr(sta->sta.addr)))
451 return -EINVAL; 452 return -EINVAL;
452 453
453 /* Strictly speaking this isn't necessary as we hold the mutex, but 454 /* The RCU read lock is required by rhashtable due to
454 * the rhashtable code can't really deal with that distinction. We 455 * asynchronous resize/rehash. We also require the mutex
455 * do require the mutex for correctness though. 456 * for correctness.
456 */ 457 */
457 rcu_read_lock(); 458 rcu_read_lock();
458 lockdep_assert_held(&sdata->local->sta_mtx); 459 lockdep_assert_held(&sdata->local->sta_mtx);
@@ -687,7 +688,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
687 } 688 }
688 689
689 /* No need to do anything if the driver does all */ 690 /* No need to do anything if the driver does all */
690 if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) 691 if (!local->ops->set_tim)
691 return; 692 return;
692 693
693 if (sta->dead) 694 if (sta->dead)
@@ -1040,16 +1041,11 @@ static void sta_info_cleanup(unsigned long data)
1040 round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); 1041 round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL));
1041} 1042}
1042 1043
1043u32 sta_addr_hash(const void *key, u32 length, u32 seed)
1044{
1045 return jhash(key, ETH_ALEN, seed);
1046}
1047
1048int sta_info_init(struct ieee80211_local *local) 1044int sta_info_init(struct ieee80211_local *local)
1049{ 1045{
1050 int err; 1046 int err;
1051 1047
1052 err = rhashtable_init(&local->sta_hash, &sta_rht_params); 1048 err = rhltable_init(&local->sta_hash, &sta_rht_params);
1053 if (err) 1049 if (err)
1054 return err; 1050 return err;
1055 1051
@@ -1065,7 +1061,7 @@ int sta_info_init(struct ieee80211_local *local)
1065void sta_info_stop(struct ieee80211_local *local) 1061void sta_info_stop(struct ieee80211_local *local)
1066{ 1062{
1067 del_timer_sync(&local->sta_cleanup); 1063 del_timer_sync(&local->sta_cleanup);
1068 rhashtable_destroy(&local->sta_hash); 1064 rhltable_destroy(&local->sta_hash);
1069} 1065}
1070 1066
1071 1067
@@ -1135,17 +1131,14 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
1135 const u8 *localaddr) 1131 const u8 *localaddr)
1136{ 1132{
1137 struct ieee80211_local *local = hw_to_local(hw); 1133 struct ieee80211_local *local = hw_to_local(hw);
1134 struct rhlist_head *tmp;
1138 struct sta_info *sta; 1135 struct sta_info *sta;
1139 struct rhash_head *tmp;
1140 const struct bucket_table *tbl;
1141
1142 tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
1143 1136
1144 /* 1137 /*
1145 * Just return a random station if localaddr is NULL 1138 * Just return a random station if localaddr is NULL
1146 * ... first in list. 1139 * ... first in list.
1147 */ 1140 */
1148 for_each_sta_info(local, tbl, addr, sta, tmp) { 1141 for_each_sta_info(local, addr, sta, tmp) {
1149 if (localaddr && 1142 if (localaddr &&
1150 !ether_addr_equal(sta->sdata->vif.addr, localaddr)) 1143 !ether_addr_equal(sta->sdata->vif.addr, localaddr))
1151 continue; 1144 continue;
@@ -1209,12 +1202,10 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
1209 1202
1210 if (sta->sta.txq[0]) { 1203 if (sta->sta.txq[0]) {
1211 for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { 1204 for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
1212 struct txq_info *txqi = to_txq_info(sta->sta.txq[i]); 1205 if (!txq_has_queue(sta->sta.txq[i]))
1213
1214 if (!txqi->tin.backlog_packets)
1215 continue; 1206 continue;
1216 1207
1217 drv_wake_tx_queue(local, txqi); 1208 drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
1218 } 1209 }
1219 } 1210 }
1220 1211
@@ -1645,10 +1636,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
1645 return; 1636 return;
1646 1637
1647 for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { 1638 for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
1648 struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
1649
1650 if (!(driver_release_tids & BIT(tid)) || 1639 if (!(driver_release_tids & BIT(tid)) ||
1651 txqi->tin.backlog_packets) 1640 txq_has_queue(sta->sta.txq[tid]))
1652 continue; 1641 continue;
1653 1642
1654 sta_info_recalc_tim(sta); 1643 sta_info_recalc_tim(sta);
@@ -2279,11 +2268,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
2279 if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) 2268 if (test_sta_flag(sta, WLAN_STA_TDLS_PEER))
2280 sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); 2269 sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER);
2281 2270
2282 /* check if the driver has a SW RC implementation */ 2271 thr = sta_get_expected_throughput(sta);
2283 if (ref && ref->ops->get_expected_throughput)
2284 thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv);
2285 else
2286 thr = drv_get_expected_throughput(local, &sta->sta);
2287 2272
2288 if (thr != 0) { 2273 if (thr != 0) {
2289 sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT); 2274 sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT);
@@ -2291,6 +2276,25 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
2291 } 2276 }
2292} 2277}
2293 2278
2279u32 sta_get_expected_throughput(struct sta_info *sta)
2280{
2281 struct ieee80211_sub_if_data *sdata = sta->sdata;
2282 struct ieee80211_local *local = sdata->local;
2283 struct rate_control_ref *ref = NULL;
2284 u32 thr = 0;
2285
2286 if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
2287 ref = local->rate_ctrl;
2288
2289 /* check if the driver has a SW RC implementation */
2290 if (ref && ref->ops->get_expected_throughput)
2291 thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv);
2292 else
2293 thr = drv_get_expected_throughput(local, sta);
2294
2295 return thr;
2296}
2297
2294unsigned long ieee80211_sta_last_active(struct sta_info *sta) 2298unsigned long ieee80211_sta_last_active(struct sta_info *sta)
2295{ 2299{
2296 struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); 2300 struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta);
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 78b0ef32dddd..ed5fcb984a01 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -230,6 +230,8 @@ struct tid_ampdu_rx {
230 * @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the 230 * @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the
231 * driver requested to close until the work for it runs 231 * driver requested to close until the work for it runs
232 * @agg_session_valid: bitmap indicating which TID has a rx BA session open on 232 * @agg_session_valid: bitmap indicating which TID has a rx BA session open on
233 * @unexpected_agg: bitmap indicating which TID already sent a delBA due to
234 * unexpected aggregation related frames outside a session
233 * @work: work struct for starting/stopping aggregation 235 * @work: work struct for starting/stopping aggregation
234 * @tid_tx: aggregation info for Tx per TID 236 * @tid_tx: aggregation info for Tx per TID
235 * @tid_start_tx: sessions where start was requested 237 * @tid_start_tx: sessions where start was requested
@@ -244,6 +246,7 @@ struct sta_ampdu_mlme {
244 unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; 246 unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
245 unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; 247 unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
246 unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; 248 unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
249 unsigned long unexpected_agg[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
247 /* tx */ 250 /* tx */
248 struct work_struct work; 251 struct work_struct work;
249 struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS]; 252 struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS];
@@ -452,7 +455,7 @@ struct sta_info {
452 /* General information, mostly static */ 455 /* General information, mostly static */
453 struct list_head list, free_list; 456 struct list_head list, free_list;
454 struct rcu_head rcu_head; 457 struct rcu_head rcu_head;
455 struct rhash_head hash_node; 458 struct rhlist_head hash_node;
456 u8 addr[ETH_ALEN]; 459 u8 addr[ETH_ALEN];
457 struct ieee80211_local *local; 460 struct ieee80211_local *local;
458 struct ieee80211_sub_if_data *sdata; 461 struct ieee80211_sub_if_data *sdata;
@@ -635,6 +638,9 @@ rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid)
635 */ 638 */
636#define STA_INFO_CLEANUP_INTERVAL (10 * HZ) 639#define STA_INFO_CLEANUP_INTERVAL (10 * HZ)
637 640
641struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
642 const u8 *addr);
643
638/* 644/*
639 * Get a STA info, must be under RCU read lock. 645 * Get a STA info, must be under RCU read lock.
640 */ 646 */
@@ -644,17 +650,9 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
644struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, 650struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
645 const u8 *addr); 651 const u8 *addr);
646 652
647u32 sta_addr_hash(const void *key, u32 length, u32 seed); 653#define for_each_sta_info(local, _addr, _sta, _tmp) \
648 654 rhl_for_each_entry_rcu(_sta, _tmp, \
649#define _sta_bucket_idx(_tbl, _a) \ 655 sta_info_hash_lookup(local, _addr), hash_node)
650 rht_bucket_index(_tbl, sta_addr_hash(_a, ETH_ALEN, (_tbl)->hash_rnd))
651
652#define for_each_sta_info(local, tbl, _addr, _sta, _tmp) \
653 rht_for_each_entry_rcu(_sta, _tmp, tbl, \
654 _sta_bucket_idx(tbl, _addr), \
655 hash_node) \
656 /* compare address and run code only if it matches */ \
657 if (ether_addr_equal(_sta->addr, (_addr)))
658 656
659/* 657/*
660 * Get STA info by index, BROKEN! 658 * Get STA info by index, BROKEN!
@@ -712,6 +710,8 @@ void sta_set_rate_info_tx(struct sta_info *sta,
712 struct rate_info *rinfo); 710 struct rate_info *rinfo);
713void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo); 711void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo);
714 712
713u32 sta_get_expected_throughput(struct sta_info *sta);
714
715void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, 715void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
716 unsigned long exp_time); 716 unsigned long exp_time);
717u8 sta_info_tx_streams(struct sta_info *sta); 717u8 sta_info_tx_streams(struct sta_info *sta);
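
With sta_hash switched from rhashtable to rhltable, stations that share a MAC address (the same peer seen on different vifs) now sit on one chain and callers filter by sdata themselves, which is what the new for_each_sta_info() wrapper around sta_info_hash_lookup() enables. A hedged usage sketch mirroring sta_info_get() above (not new API; the helper name is hypothetical):

	static struct sta_info *my_find_sta(struct ieee80211_local *local,
					    struct ieee80211_sub_if_data *sdata,
					    const u8 *addr)
	{
		struct rhlist_head *tmp;
		struct sta_info *sta;

		rcu_read_lock();
		for_each_sta_info(local, addr, sta, tmp) {
			/* several entries may share 'addr'; pick the right vif */
			if (sta->sdata == sdata) {
				rcu_read_unlock();
				/* valid only under the caller's RCU/sta_mtx locking */
				return sta;
			}
		}
		rcu_read_unlock();
		return NULL;
	}
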
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index a2a68269675d..ddf71c648cab 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -557,6 +557,12 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
557static void ieee80211_lost_packet(struct sta_info *sta, 557static void ieee80211_lost_packet(struct sta_info *sta,
558 struct ieee80211_tx_info *info) 558 struct ieee80211_tx_info *info)
559{ 559{
560 /* If driver relies on its own algorithm for station kickout, skip
561 * mac80211 packet loss mechanism.
562 */
563 if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK))
564 return;
565
560 /* This packet was aggregated but doesn't carry status info */ 566 /* This packet was aggregated but doesn't carry status info */
561 if ((info->flags & IEEE80211_TX_CTL_AMPDU) && 567 if ((info->flags & IEEE80211_TX_CTL_AMPDU) &&
562 !(info->flags & IEEE80211_TX_STAT_AMPDU)) 568 !(info->flags & IEEE80211_TX_STAT_AMPDU))
@@ -709,7 +715,7 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
709 if (!ieee80211_sdata_running(sdata)) 715 if (!ieee80211_sdata_running(sdata))
710 continue; 716 continue;
711 717
712 if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) && 718 if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) &&
713 !send_to_cooked) 719 !send_to_cooked)
714 continue; 720 continue;
715 721
@@ -740,8 +746,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
740 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); 746 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
741 __le16 fc; 747 __le16 fc;
742 struct ieee80211_supported_band *sband; 748 struct ieee80211_supported_band *sband;
749 struct rhlist_head *tmp;
743 struct sta_info *sta; 750 struct sta_info *sta;
744 struct rhash_head *tmp;
745 int retry_count; 751 int retry_count;
746 int rates_idx; 752 int rates_idx;
747 bool send_to_cooked; 753 bool send_to_cooked;
@@ -749,7 +755,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
749 struct ieee80211_bar *bar; 755 struct ieee80211_bar *bar;
750 int shift = 0; 756 int shift = 0;
751 int tid = IEEE80211_NUM_TIDS; 757 int tid = IEEE80211_NUM_TIDS;
752 const struct bucket_table *tbl;
753 758
754 rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); 759 rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
755 760
@@ -758,9 +763,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
758 sband = local->hw.wiphy->bands[info->band]; 763 sband = local->hw.wiphy->bands[info->band];
759 fc = hdr->frame_control; 764 fc = hdr->frame_control;
760 765
761 tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); 766 for_each_sta_info(local, hdr->addr1, sta, tmp) {
762
763 for_each_sta_info(local, tbl, hdr->addr1, sta, tmp) {
764 /* skip wrong virtual interface */ 767 /* skip wrong virtual interface */
765 if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr)) 768 if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr))
766 continue; 769 continue;
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 77e4c53baefb..92a47afaa989 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -984,6 +984,32 @@ TRACE_EVENT(drv_set_tsf,
984 ) 984 )
985); 985);
986 986
987TRACE_EVENT(drv_offset_tsf,
988 TP_PROTO(struct ieee80211_local *local,
989 struct ieee80211_sub_if_data *sdata,
990 s64 offset),
991
992 TP_ARGS(local, sdata, offset),
993
994 TP_STRUCT__entry(
995 LOCAL_ENTRY
996 VIF_ENTRY
997 __field(s64, tsf_offset)
998 ),
999
1000 TP_fast_assign(
1001 LOCAL_ASSIGN;
1002 VIF_ASSIGN;
1003 __entry->tsf_offset = offset;
1004 ),
1005
1006 TP_printk(
1007 LOCAL_PR_FMT VIF_PR_FMT " tsf offset:%lld",
1008 LOCAL_PR_ARG, VIF_PR_ARG,
1009 (unsigned long long)__entry->tsf_offset
1010 )
1011);
1012
987DEFINE_EVENT(local_sdata_evt, drv_reset_tsf, 1013DEFINE_EVENT(local_sdata_evt, drv_reset_tsf,
988 TP_PROTO(struct ieee80211_local *local, 1014 TP_PROTO(struct ieee80211_local *local,
989 struct ieee80211_sub_if_data *sdata), 1015 struct ieee80211_sub_if_data *sdata),
@@ -1700,6 +1726,139 @@ TRACE_EVENT(drv_get_expected_throughput,
1700 ) 1726 )
1701); 1727);
1702 1728
1729TRACE_EVENT(drv_start_nan,
1730 TP_PROTO(struct ieee80211_local *local,
1731 struct ieee80211_sub_if_data *sdata,
1732 struct cfg80211_nan_conf *conf),
1733
1734 TP_ARGS(local, sdata, conf),
1735 TP_STRUCT__entry(
1736 LOCAL_ENTRY
1737 VIF_ENTRY
1738 __field(u8, master_pref)
1739 __field(u8, dual)
1740 ),
1741
1742 TP_fast_assign(
1743 LOCAL_ASSIGN;
1744 VIF_ASSIGN;
1745 __entry->master_pref = conf->master_pref;
1746 __entry->dual = conf->dual;
1747 ),
1748
1749 TP_printk(
1750 LOCAL_PR_FMT VIF_PR_FMT
1751 ", master preference: %u, dual: %d",
1752 LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
1753 __entry->dual
1754 )
1755);
1756
1757TRACE_EVENT(drv_stop_nan,
1758 TP_PROTO(struct ieee80211_local *local,
1759 struct ieee80211_sub_if_data *sdata),
1760
1761 TP_ARGS(local, sdata),
1762
1763 TP_STRUCT__entry(
1764 LOCAL_ENTRY
1765 VIF_ENTRY
1766 ),
1767
1768 TP_fast_assign(
1769 LOCAL_ASSIGN;
1770 VIF_ASSIGN;
1771 ),
1772
1773 TP_printk(
1774 LOCAL_PR_FMT VIF_PR_FMT,
1775 LOCAL_PR_ARG, VIF_PR_ARG
1776 )
1777);
1778
1779TRACE_EVENT(drv_nan_change_conf,
1780 TP_PROTO(struct ieee80211_local *local,
1781 struct ieee80211_sub_if_data *sdata,
1782 struct cfg80211_nan_conf *conf,
1783 u32 changes),
1784
1785 TP_ARGS(local, sdata, conf, changes),
1786 TP_STRUCT__entry(
1787 LOCAL_ENTRY
1788 VIF_ENTRY
1789 __field(u8, master_pref)
1790 __field(u8, dual)
1791 __field(u32, changes)
1792 ),
1793
1794 TP_fast_assign(
1795 LOCAL_ASSIGN;
1796 VIF_ASSIGN;
1797 __entry->master_pref = conf->master_pref;
1798 __entry->dual = conf->dual;
1799 __entry->changes = changes;
1800 ),
1801
1802 TP_printk(
1803 LOCAL_PR_FMT VIF_PR_FMT
1804 ", master preference: %u, dual: %d, changes: 0x%x",
1805 LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
1806 __entry->dual, __entry->changes
1807 )
1808);
1809
1810TRACE_EVENT(drv_add_nan_func,
1811 TP_PROTO(struct ieee80211_local *local,
1812 struct ieee80211_sub_if_data *sdata,
1813 const struct cfg80211_nan_func *func),
1814
1815 TP_ARGS(local, sdata, func),
1816 TP_STRUCT__entry(
1817 LOCAL_ENTRY
1818 VIF_ENTRY
1819 __field(u8, type)
1820 __field(u8, inst_id)
1821 ),
1822
1823 TP_fast_assign(
1824 LOCAL_ASSIGN;
1825 VIF_ASSIGN;
1826 __entry->type = func->type;
1827 __entry->inst_id = func->instance_id;
1828 ),
1829
1830 TP_printk(
1831 LOCAL_PR_FMT VIF_PR_FMT
1832 ", type: %u, inst_id: %u",
1833 LOCAL_PR_ARG, VIF_PR_ARG, __entry->type, __entry->inst_id
1834 )
1835);
1836
1837TRACE_EVENT(drv_del_nan_func,
1838 TP_PROTO(struct ieee80211_local *local,
1839 struct ieee80211_sub_if_data *sdata,
1840 u8 instance_id),
1841
1842 TP_ARGS(local, sdata, instance_id),
1843 TP_STRUCT__entry(
1844 LOCAL_ENTRY
1845 VIF_ENTRY
1846 __field(u8, instance_id)
1847 ),
1848
1849 TP_fast_assign(
1850 LOCAL_ASSIGN;
1851 VIF_ASSIGN;
1852 __entry->instance_id = instance_id;
1853 ),
1854
1855 TP_printk(
1856 LOCAL_PR_FMT VIF_PR_FMT
1857 ", instance_id: %u",
1858 LOCAL_PR_ARG, VIF_PR_ARG, __entry->instance_id
1859 )
1860);
1861
1703/* 1862/*
1704 * Tracing for API calls that drivers call. 1863 * Tracing for API calls that drivers call.
1705 */ 1864 */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 18b285e06bc8..1c56abc49627 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -796,36 +796,6 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
796 return ret; 796 return ret;
797} 797}
798 798
799static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
800 struct ieee80211_vif *vif,
801 struct ieee80211_sta *pubsta,
802 struct sk_buff *skb)
803{
804 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
805 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
806 struct ieee80211_txq *txq = NULL;
807
808 if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) ||
809 (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
810 return NULL;
811
812 if (!ieee80211_is_data(hdr->frame_control))
813 return NULL;
814
815 if (pubsta) {
816 u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
817
818 txq = pubsta->txq[tid];
819 } else if (vif) {
820 txq = vif->txq;
821 }
822
823 if (!txq)
824 return NULL;
825
826 return to_txq_info(txq);
827}
828
829static ieee80211_tx_result debug_noinline 799static ieee80211_tx_result debug_noinline
830ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) 800ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
831{ 801{
@@ -883,9 +853,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
883 tid = *qc & IEEE80211_QOS_CTL_TID_MASK; 853 tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
884 tx->sta->tx_stats.msdu[tid]++; 854 tx->sta->tx_stats.msdu[tid]++;
885 855
886 if (!ieee80211_get_txq(tx->local, info->control.vif, &tx->sta->sta, 856 hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
887 tx->skb))
888 hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
889 857
890 return TX_CONTINUE; 858 return TX_CONTINUE;
891} 859}
@@ -1274,6 +1242,36 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
1274 return TX_CONTINUE; 1242 return TX_CONTINUE;
1275} 1243}
1276 1244
1245static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
1246 struct ieee80211_vif *vif,
1247 struct ieee80211_sta *pubsta,
1248 struct sk_buff *skb)
1249{
1250 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
1251 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
1252 struct ieee80211_txq *txq = NULL;
1253
1254 if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) ||
1255 (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
1256 return NULL;
1257
1258 if (!ieee80211_is_data(hdr->frame_control))
1259 return NULL;
1260
1261 if (pubsta) {
1262 u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
1263
1264 txq = pubsta->txq[tid];
1265 } else if (vif) {
1266 txq = vif->txq;
1267 }
1268
1269 if (!txq)
1270 return NULL;
1271
1272 return to_txq_info(txq);
1273}
1274
1277static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) 1275static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb)
1278{ 1276{
1279 IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); 1277 IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time();
@@ -1344,7 +1342,7 @@ static struct sk_buff *fq_tin_dequeue_func(struct fq *fq,
1344 local = container_of(fq, struct ieee80211_local, fq); 1342 local = container_of(fq, struct ieee80211_local, fq);
1345 txqi = container_of(tin, struct txq_info, tin); 1343 txqi = container_of(tin, struct txq_info, tin);
1346 cparams = &local->cparams; 1344 cparams = &local->cparams;
1347 cstats = &local->cstats; 1345 cstats = &txqi->cstats;
1348 1346
1349 if (flow == &txqi->def_flow) 1347 if (flow == &txqi->def_flow)
1350 cvars = &txqi->def_cvars; 1348 cvars = &txqi->def_cvars;
@@ -1404,6 +1402,8 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
1404 fq_tin_init(&txqi->tin); 1402 fq_tin_init(&txqi->tin);
1405 fq_flow_init(&txqi->def_flow); 1403 fq_flow_init(&txqi->def_flow);
1406 codel_vars_init(&txqi->def_cvars); 1404 codel_vars_init(&txqi->def_cvars);
1405 codel_stats_init(&txqi->cstats);
1406 __skb_queue_head_init(&txqi->frags);
1407 1407
1408 txqi->txq.vif = &sdata->vif; 1408 txqi->txq.vif = &sdata->vif;
1409 1409
@@ -1426,6 +1426,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
1426 struct fq_tin *tin = &txqi->tin; 1426 struct fq_tin *tin = &txqi->tin;
1427 1427
1428 fq_tin_reset(fq, tin, fq_skb_free_func); 1428 fq_tin_reset(fq, tin, fq_skb_free_func);
1429 ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
1429} 1430}
1430 1431
1431int ieee80211_txq_setup_flows(struct ieee80211_local *local) 1432int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1433,6 +1434,8 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
1433 struct fq *fq = &local->fq; 1434 struct fq *fq = &local->fq;
1434 int ret; 1435 int ret;
1435 int i; 1436 int i;
1437 bool supp_vht = false;
1438 enum nl80211_band band;
1436 1439
1437 if (!local->ops->wake_tx_queue) 1440 if (!local->ops->wake_tx_queue)
1438 return 0; 1441 return 0;
@@ -1441,8 +1444,24 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
1441 if (ret) 1444 if (ret)
1442 return ret; 1445 return ret;
1443 1446
1447 /*
1448 * If the hardware doesn't support VHT, it is safe to limit the maximum
1449 * queue size. 4 Mbytes is 64 max-size aggregates in 802.11n.
1450 */
1451 for (band = 0; band < NUM_NL80211_BANDS; band++) {
1452 struct ieee80211_supported_band *sband;
1453
1454 sband = local->hw.wiphy->bands[band];
1455 if (!sband)
1456 continue;
1457
1458 supp_vht = supp_vht || sband->vht_cap.vht_supported;
1459 }
1460
1461 if (!supp_vht)
1462 fq->memory_limit = 4 << 20; /* 4 Mbytes */
1463
1444 codel_params_init(&local->cparams); 1464 codel_params_init(&local->cparams);
1445 codel_stats_init(&local->cstats);
1446 local->cparams.interval = MS2TIME(100); 1465 local->cparams.interval = MS2TIME(100);
1447 local->cparams.target = MS2TIME(20); 1466 local->cparams.target = MS2TIME(20);
1448 local->cparams.ecn = true; 1467 local->cparams.ecn = true;
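
The 4 Mbyte figure in the new comment is simply 64 maximum-length 802.11n A-MPDUs of 65,535 bytes each; a quick check (plain userspace C):

	#include <stdio.h>

	int main(void)
	{
		unsigned int max_ht_ampdu = 65535;	/* 802.11n A-MPDU length cap */

		printf("%u\n", 64 * max_ht_ampdu);	/* 4194240 */
		printf("%u\n", 4u << 20);		/* 4194304, i.e. "4 Mbytes" */
		return 0;
	}
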
@@ -1477,54 +1496,46 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local)
1477 spin_unlock_bh(&fq->lock); 1496 spin_unlock_bh(&fq->lock);
1478} 1497}
1479 1498
1480struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, 1499static bool ieee80211_queue_skb(struct ieee80211_local *local,
1481 struct ieee80211_txq *txq) 1500 struct ieee80211_sub_if_data *sdata,
1501 struct sta_info *sta,
1502 struct sk_buff *skb)
1482{ 1503{
1483 struct ieee80211_local *local = hw_to_local(hw); 1504 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
1484 struct txq_info *txqi = container_of(txq, struct txq_info, txq);
1485 struct ieee80211_hdr *hdr;
1486 struct sk_buff *skb = NULL;
1487 struct fq *fq = &local->fq; 1505 struct fq *fq = &local->fq;
1488 struct fq_tin *tin = &txqi->tin; 1506 struct ieee80211_vif *vif;
1507 struct txq_info *txqi;
1508 struct ieee80211_sta *pubsta;
1489 1509
1490 spin_lock_bh(&fq->lock); 1510 if (!local->ops->wake_tx_queue ||
1511 sdata->vif.type == NL80211_IFTYPE_MONITOR)
1512 return false;
1491 1513
1492 if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags)) 1514 if (sta && sta->uploaded)
1493 goto out; 1515 pubsta = &sta->sta;
1516 else
1517 pubsta = NULL;
1494 1518
1495 skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); 1519 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
1496 if (!skb) 1520 sdata = container_of(sdata->bss,
1497 goto out; 1521 struct ieee80211_sub_if_data, u.ap);
1498 1522
1499 ieee80211_set_skb_vif(skb, txqi); 1523 vif = &sdata->vif;
1524 txqi = ieee80211_get_txq(local, vif, pubsta, skb);
1500 1525
1501 hdr = (struct ieee80211_hdr *)skb->data; 1526 if (!txqi)
1502 if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) { 1527 return false;
1503 struct sta_info *sta = container_of(txq->sta, struct sta_info,
1504 sta);
1505 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
1506 1528
1507 hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid); 1529 info->control.vif = vif;
1508 if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
1509 info->flags |= IEEE80211_TX_CTL_AMPDU;
1510 else
1511 info->flags &= ~IEEE80211_TX_CTL_AMPDU;
1512 }
1513 1530
1514out: 1531 spin_lock_bh(&fq->lock);
1532 ieee80211_txq_enqueue(local, txqi, skb);
1515 spin_unlock_bh(&fq->lock); 1533 spin_unlock_bh(&fq->lock);
1516 1534
1517 if (skb && skb_has_frag_list(skb) && 1535 drv_wake_tx_queue(local, txqi);
1518 !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) {
1519 if (skb_linearize(skb)) {
1520 ieee80211_free_txskb(&local->hw, skb);
1521 return NULL;
1522 }
1523 }
1524 1536
1525 return skb; 1537 return true;
1526} 1538}
1527EXPORT_SYMBOL(ieee80211_tx_dequeue);
1528 1539
1529static bool ieee80211_tx_frags(struct ieee80211_local *local, 1540static bool ieee80211_tx_frags(struct ieee80211_local *local,
1530 struct ieee80211_vif *vif, 1541 struct ieee80211_vif *vif,
@@ -1533,9 +1544,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
1533 bool txpending) 1544 bool txpending)
1534{ 1545{
1535 struct ieee80211_tx_control control = {}; 1546 struct ieee80211_tx_control control = {};
1536 struct fq *fq = &local->fq;
1537 struct sk_buff *skb, *tmp; 1547 struct sk_buff *skb, *tmp;
1538 struct txq_info *txqi;
1539 unsigned long flags; 1548 unsigned long flags;
1540 1549
1541 skb_queue_walk_safe(skbs, skb, tmp) { 1550 skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1550,21 +1559,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
1550 } 1559 }
1551#endif 1560#endif
1552 1561
1553 txqi = ieee80211_get_txq(local, vif, sta, skb);
1554 if (txqi) {
1555 info->control.vif = vif;
1556
1557 __skb_unlink(skb, skbs);
1558
1559 spin_lock_bh(&fq->lock);
1560 ieee80211_txq_enqueue(local, txqi, skb);
1561 spin_unlock_bh(&fq->lock);
1562
1563 drv_wake_tx_queue(local, txqi);
1564
1565 continue;
1566 }
1567
1568 spin_lock_irqsave(&local->queue_stop_reason_lock, flags); 1562 spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
1569 if (local->queue_stop_reasons[q] || 1563 if (local->queue_stop_reasons[q] ||
1570 (!txpending && !skb_queue_empty(&local->pending[q]))) { 1564 (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1648,7 +1642,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
1648 1642
1649 switch (sdata->vif.type) { 1643 switch (sdata->vif.type) {
1650 case NL80211_IFTYPE_MONITOR: 1644 case NL80211_IFTYPE_MONITOR:
1651 if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) { 1645 if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
1652 vif = &sdata->vif; 1646 vif = &sdata->vif;
1653 break; 1647 break;
1654 } 1648 }
@@ -1685,10 +1679,13 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
1685/* 1679/*
1686 * Invoke TX handlers, return 0 on success and non-zero if the 1680 * Invoke TX handlers, return 0 on success and non-zero if the
1687 * frame was dropped or queued. 1681 * frame was dropped or queued.
1682 *
1683 * The handlers are split into an early and late part. The latter is everything
1684 * that can be sensitive to reordering, and will be deferred to after packets
1685 * are dequeued from the intermediate queues (when they are enabled).
1688 */ 1686 */
1689static int invoke_tx_handlers(struct ieee80211_tx_data *tx) 1687static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
1690{ 1688{
1691 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
1692 ieee80211_tx_result res = TX_DROP; 1689 ieee80211_tx_result res = TX_DROP;
1693 1690
1694#define CALL_TXH(txh) \ 1691#define CALL_TXH(txh) \
@@ -1706,6 +1703,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
1706 if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) 1703 if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
1707 CALL_TXH(ieee80211_tx_h_rate_ctrl); 1704 CALL_TXH(ieee80211_tx_h_rate_ctrl);
1708 1705
1706 txh_done:
1707 if (unlikely(res == TX_DROP)) {
1708 I802_DEBUG_INC(tx->local->tx_handlers_drop);
1709 if (tx->skb)
1710 ieee80211_free_txskb(&tx->local->hw, tx->skb);
1711 else
1712 ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
1713 return -1;
1714 } else if (unlikely(res == TX_QUEUED)) {
1715 I802_DEBUG_INC(tx->local->tx_handlers_queued);
1716 return -1;
1717 }
1718
1719 return 0;
1720}
1721
1722/*
1723 * Late handlers can be called while the sta lock is held. Handlers that can
1724 * cause packets to be generated will cause deadlock!
1725 */
1726static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
1727{
1728 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
1729 ieee80211_tx_result res = TX_CONTINUE;
1730
1709 if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) { 1731 if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
1710 __skb_queue_tail(&tx->skbs, tx->skb); 1732 __skb_queue_tail(&tx->skbs, tx->skb);
1711 tx->skb = NULL; 1733 tx->skb = NULL;
@@ -1738,6 +1760,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
1738 return 0; 1760 return 0;
1739} 1761}
1740 1762
1763static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
1764{
1765 int r = invoke_tx_handlers_early(tx);
1766
1767 if (r)
1768 return r;
1769 return invoke_tx_handlers_late(tx);
1770}
1771
1741bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, 1772bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
1742 struct ieee80211_vif *vif, struct sk_buff *skb, 1773 struct ieee80211_vif *vif, struct sk_buff *skb,
1743 int band, struct ieee80211_sta **sta) 1774 int band, struct ieee80211_sta **sta)
@@ -1812,7 +1843,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
1812 info->hw_queue = 1843 info->hw_queue =
1813 sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; 1844 sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
1814 1845
1815 if (!invoke_tx_handlers(&tx)) 1846 if (invoke_tx_handlers_early(&tx))
1847 return false;
1848
1849 if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
1850 return true;
1851
1852 if (!invoke_tx_handlers_late(&tx))
1816 result = __ieee80211_tx(local, &tx.skbs, led_len, 1853 result = __ieee80211_tx(local, &tx.skbs, led_len,
1817 tx.sta, txpending); 1854 tx.sta, txpending);
1818 1855
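
The hunk above is the heart of the TX-path split: the early handlers run before the frame is parked on an intermediate TXQ, and everything ordering-sensitive (sequence numbers, fragmentation, crypto, stats) is deferred to the late handlers, which run at dequeue time on drivers that implement wake_tx_queue. A condensed control-flow sketch, not the literal mac80211 code:

	static bool tx_path_sketch(struct ieee80211_local *local,
				   struct ieee80211_sub_if_data *sdata,
				   struct ieee80211_tx_data *tx,
				   int led_len, bool txpending)
	{
		bool result = true;

		if (invoke_tx_handlers_early(tx))
			return false;		/* dropped (or queued by a handler) */

		/* wake_tx_queue drivers: park the skb on its per-sta/per-vif TXQ;
		 * the late handlers run later, from the dequeue path */
		if (ieee80211_queue_skb(local, sdata, tx->sta, tx->skb))
			return true;

		/* legacy path: run the late handlers now and transmit directly */
		if (!invoke_tx_handlers_late(tx))
			result = __ieee80211_tx(local, &tx->skbs, led_len,
						tx->sta, txpending);

		return result;
	}
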
@@ -2268,15 +2305,9 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
2268 case NL80211_IFTYPE_STATION: 2305 case NL80211_IFTYPE_STATION:
2269 if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) { 2306 if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) {
2270 sta = sta_info_get(sdata, skb->data); 2307 sta = sta_info_get(sdata, skb->data);
2271 if (sta) { 2308 if (sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
2272 bool tdls_peer, tdls_auth; 2309 if (test_sta_flag(sta,
2273 2310 WLAN_STA_TDLS_PEER_AUTH)) {
2274 tdls_peer = test_sta_flag(sta,
2275 WLAN_STA_TDLS_PEER);
2276 tdls_auth = test_sta_flag(sta,
2277 WLAN_STA_TDLS_PEER_AUTH);
2278
2279 if (tdls_peer && tdls_auth) {
2280 *sta_out = sta; 2311 *sta_out = sta;
2281 return 0; 2312 return 0;
2282 } 2313 }
@@ -2288,8 +2319,7 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
2288 * after a TDLS sta is removed due to being 2319 * after a TDLS sta is removed due to being
2289 * unreachable. 2320 * unreachable.
2290 */ 2321 */
2291 if (tdls_peer && !tdls_auth && 2322 if (!ieee80211_is_tdls_setup(skb))
2292 !ieee80211_is_tdls_setup(skb))
2293 return -EINVAL; 2323 return -EINVAL;
2294 } 2324 }
2295 2325
@@ -2339,7 +2369,6 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
2339 struct mesh_path __maybe_unused *mppath = NULL, *mpath = NULL; 2369 struct mesh_path __maybe_unused *mppath = NULL, *mpath = NULL;
2340 const u8 *encaps_data; 2370 const u8 *encaps_data;
2341 int encaps_len, skip_header_bytes; 2371 int encaps_len, skip_header_bytes;
2342 int nh_pos, h_pos;
2343 bool wme_sta = false, authorized = false; 2372 bool wme_sta = false, authorized = false;
2344 bool tdls_peer; 2373 bool tdls_peer;
2345 bool multicast; 2374 bool multicast;
@@ -2645,13 +2674,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
2645 encaps_len = 0; 2674 encaps_len = 0;
2646 } 2675 }
2647 2676
2648 nh_pos = skb_network_header(skb) - skb->data;
2649 h_pos = skb_transport_header(skb) - skb->data;
2650
2651 skb_pull(skb, skip_header_bytes); 2677 skb_pull(skb, skip_header_bytes);
2652 nh_pos -= skip_header_bytes;
2653 h_pos -= skip_header_bytes;
2654
2655 head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb); 2678 head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb);
2656 2679
2657 /* 2680 /*
@@ -2677,18 +2700,12 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
2677 } 2700 }
2678 } 2701 }
2679 2702
2680 if (encaps_data) { 2703 if (encaps_data)
2681 memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len); 2704 memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len);
2682 nh_pos += encaps_len;
2683 h_pos += encaps_len;
2684 }
2685 2705
2686#ifdef CONFIG_MAC80211_MESH 2706#ifdef CONFIG_MAC80211_MESH
2687 if (meshhdrlen > 0) { 2707 if (meshhdrlen > 0)
2688 memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen); 2708 memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen);
2689 nh_pos += meshhdrlen;
2690 h_pos += meshhdrlen;
2691 }
2692#endif 2709#endif
2693 2710
2694 if (ieee80211_is_data_qos(fc)) { 2711 if (ieee80211_is_data_qos(fc)) {
@@ -2704,15 +2721,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
2704 } else 2721 } else
2705 memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); 2722 memcpy(skb_push(skb, hdrlen), &hdr, hdrlen);
2706 2723
2707 nh_pos += hdrlen;
2708 h_pos += hdrlen;
2709
2710 /* Update skb pointers to various headers since this modified frame
2711 * is going to go through Linux networking code that may potentially
2712 * need things like pointer to IP header. */
2713 skb_reset_mac_header(skb); 2724 skb_reset_mac_header(skb);
2714 skb_set_network_header(skb, nh_pos);
2715 skb_set_transport_header(skb, h_pos);
2716 2725
2717 info = IEEE80211_SKB_CB(skb); 2726 info = IEEE80211_SKB_CB(skb);
2718 memset(info, 0, sizeof(*info)); 2727 memset(info, 0, sizeof(*info));
@@ -3184,8 +3193,71 @@ out:
3184 return ret; 3193 return ret;
3185} 3194}
3186 3195
3196/*
3197 * Can be called while the sta lock is held. Anything that can cause packets to
3198 * be generated will cause deadlock!
3199 */
3200static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
3201 struct sta_info *sta, u8 pn_offs,
3202 struct ieee80211_key *key,
3203 struct sk_buff *skb)
3204{
3205 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
3206 struct ieee80211_hdr *hdr = (void *)skb->data;
3207 u8 tid = IEEE80211_NUM_TIDS;
3208
3209 if (key)
3210 info->control.hw_key = &key->conf;
3211
3212 ieee80211_tx_stats(skb->dev, skb->len);
3213
3214 if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
3215 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
3216 *ieee80211_get_qos_ctl(hdr) = tid;
3217 hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
3218 } else {
3219 info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
3220 hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
3221 sdata->sequence_number += 0x10;
3222 }
3223
3224 if (skb_shinfo(skb)->gso_size)
3225 sta->tx_stats.msdu[tid] +=
3226 DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
3227 else
3228 sta->tx_stats.msdu[tid]++;
3229
3230 info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
3231
3232 /* statistics normally done by ieee80211_tx_h_stats (but that
3233 * has to consider fragmentation, so is more complex)
3234 */
3235 sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
3236 sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
3237
3238 if (pn_offs) {
3239 u64 pn;
3240 u8 *crypto_hdr = skb->data + pn_offs;
3241
3242 switch (key->conf.cipher) {
3243 case WLAN_CIPHER_SUITE_CCMP:
3244 case WLAN_CIPHER_SUITE_CCMP_256:
3245 case WLAN_CIPHER_SUITE_GCMP:
3246 case WLAN_CIPHER_SUITE_GCMP_256:
3247 pn = atomic64_inc_return(&key->conf.tx_pn);
3248 crypto_hdr[0] = pn;
3249 crypto_hdr[1] = pn >> 8;
3250 crypto_hdr[4] = pn >> 16;
3251 crypto_hdr[5] = pn >> 24;
3252 crypto_hdr[6] = pn >> 32;
3253 crypto_hdr[7] = pn >> 40;
3254 break;
3255 }
3256 }
3257}
3258
3187static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, 3259static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
3188 struct net_device *dev, struct sta_info *sta, 3260 struct sta_info *sta,
3189 struct ieee80211_fast_tx *fast_tx, 3261 struct ieee80211_fast_tx *fast_tx,
3190 struct sk_buff *skb) 3262 struct sk_buff *skb)
3191{ 3263{
@@ -3236,8 +3308,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
3236 return true; 3308 return true;
3237 } 3309 }
3238 3310
3239 ieee80211_tx_stats(dev, skb->len + extra_head);
3240
3241 if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) && 3311 if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
3242 ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb)) 3312 ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
3243 return true; 3313 return true;
@@ -3266,24 +3336,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
3266 info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT | 3336 info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
3267 IEEE80211_TX_CTL_DONTFRAG | 3337 IEEE80211_TX_CTL_DONTFRAG |
3268 (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0); 3338 (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
3269 3339 info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
3270 if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
3271 *ieee80211_get_qos_ctl(hdr) = tid;
3272 if (!ieee80211_get_txq(local, &sdata->vif, &sta->sta, skb))
3273 hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
3274 } else {
3275 info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
3276 hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
3277 sdata->sequence_number += 0x10;
3278 }
3279
3280 if (skb_shinfo(skb)->gso_size)
3281 sta->tx_stats.msdu[tid] +=
3282 DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
3283 else
3284 sta->tx_stats.msdu[tid]++;
3285
3286 info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
3287 3340
3288 __skb_queue_head_init(&tx.skbs); 3341 __skb_queue_head_init(&tx.skbs);
3289 3342
@@ -3293,9 +3346,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
3293 tx.sta = sta; 3346 tx.sta = sta;
3294 tx.key = fast_tx->key; 3347 tx.key = fast_tx->key;
3295 3348
3296 if (fast_tx->key)
3297 info->control.hw_key = &fast_tx->key->conf;
3298
3299 if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { 3349 if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
3300 tx.skb = skb; 3350 tx.skb = skb;
3301 r = ieee80211_tx_h_rate_ctrl(&tx); 3351 r = ieee80211_tx_h_rate_ctrl(&tx);
@@ -3309,31 +3359,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
3309 } 3359 }
3310 } 3360 }
3311 3361
3312 /* statistics normally done by ieee80211_tx_h_stats (but that 3362 if (ieee80211_queue_skb(local, sdata, sta, skb))
3313 * has to consider fragmentation, so is more complex) 3363 return true;
3314 */
3315 sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
3316 sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
3317
3318 if (fast_tx->pn_offs) {
3319 u64 pn;
3320 u8 *crypto_hdr = skb->data + fast_tx->pn_offs;
3321 3364
3322 switch (fast_tx->key->conf.cipher) { 3365 ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs,
3323 case WLAN_CIPHER_SUITE_CCMP: 3366 fast_tx->key, skb);
3324 case WLAN_CIPHER_SUITE_CCMP_256:
3325 case WLAN_CIPHER_SUITE_GCMP:
3326 case WLAN_CIPHER_SUITE_GCMP_256:
3327 pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn);
3328 crypto_hdr[0] = pn;
3329 crypto_hdr[1] = pn >> 8;
3330 crypto_hdr[4] = pn >> 16;
3331 crypto_hdr[5] = pn >> 24;
3332 crypto_hdr[6] = pn >> 32;
3333 crypto_hdr[7] = pn >> 40;
3334 break;
3335 }
3336 }
3337 3367
3338 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 3368 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
3339 sdata = container_of(sdata->bss, 3369 sdata = container_of(sdata->bss,
@@ -3344,6 +3374,94 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
3344 return true; 3374 return true;
3345} 3375}
3346 3376
3377struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
3378 struct ieee80211_txq *txq)
3379{
3380 struct ieee80211_local *local = hw_to_local(hw);
3381 struct txq_info *txqi = container_of(txq, struct txq_info, txq);
3382 struct ieee80211_hdr *hdr;
3383 struct sk_buff *skb = NULL;
3384 struct fq *fq = &local->fq;
3385 struct fq_tin *tin = &txqi->tin;
3386 struct ieee80211_tx_info *info;
3387 struct ieee80211_tx_data tx;
3388 ieee80211_tx_result r;
3389
3390 spin_lock_bh(&fq->lock);
3391
3392 if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
3393 goto out;
3394
3395 /* Make sure fragments stay together. */
3396 skb = __skb_dequeue(&txqi->frags);
3397 if (skb)
3398 goto out;
3399
3400begin:
3401 skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
3402 if (!skb)
3403 goto out;
3404
3405 ieee80211_set_skb_vif(skb, txqi);
3406
3407 hdr = (struct ieee80211_hdr *)skb->data;
3408 info = IEEE80211_SKB_CB(skb);
3409
3410 memset(&tx, 0, sizeof(tx));
3411 __skb_queue_head_init(&tx.skbs);
3412 tx.local = local;
3413 tx.skb = skb;
3414 tx.sdata = vif_to_sdata(info->control.vif);
3415
3416 if (txq->sta)
3417 tx.sta = container_of(txq->sta, struct sta_info, sta);
3418
3419 /*
3420 * The key can be removed while the packet was queued, so need to call
3421 * this here to get the current key.
3422 */
3423 r = ieee80211_tx_h_select_key(&tx);
3424 if (r != TX_CONTINUE) {
3425 ieee80211_free_txskb(&local->hw, skb);
3426 goto begin;
3427 }
3428
3429 if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
3430 struct sta_info *sta = container_of(txq->sta, struct sta_info,
3431 sta);
3432 u8 pn_offs = 0;
3433
3434 if (tx.key &&
3435 (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV))
3436 pn_offs = ieee80211_hdrlen(hdr->frame_control);
3437
3438 ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
3439 tx.key, skb);
3440 } else {
3441 if (invoke_tx_handlers_late(&tx))
3442 goto begin;
3443
3444 skb = __skb_dequeue(&tx.skbs);
3445
3446 if (!skb_queue_empty(&tx.skbs))
3447 skb_queue_splice_tail(&tx.skbs, &txqi->frags);
3448 }
3449
3450 if (skb && skb_has_frag_list(skb) &&
3451 !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) {
3452 if (skb_linearize(skb)) {
3453 ieee80211_free_txskb(&local->hw, skb);
3454 goto begin;
3455 }
3456 }
3457
3458out:
3459 spin_unlock_bh(&fq->lock);
3460
3461 return skb;
3462}
3463EXPORT_SYMBOL(ieee80211_tx_dequeue);
3464
3347void __ieee80211_subif_start_xmit(struct sk_buff *skb, 3465void __ieee80211_subif_start_xmit(struct sk_buff *skb,
3348 struct net_device *dev, 3466 struct net_device *dev,
3349 u32 info_flags) 3467 u32 info_flags)
@@ -3368,7 +3486,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
3368 fast_tx = rcu_dereference(sta->fast_tx); 3486 fast_tx = rcu_dereference(sta->fast_tx);
3369 3487
3370 if (fast_tx && 3488 if (fast_tx &&
3371 ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb)) 3489 ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
3372 goto out; 3490 goto out;
3373 } 3491 }
3374 3492
@@ -4395,9 +4513,6 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
4395 int ac = ieee802_1d_to_ac[tid & 7]; 4513 int ac = ieee802_1d_to_ac[tid & 7];
4396 4514
4397 skb_reset_mac_header(skb); 4515 skb_reset_mac_header(skb);
4398 skb_reset_network_header(skb);
4399 skb_reset_transport_header(skb);
4400
4401 skb_set_queue_mapping(skb, ac); 4516 skb_set_queue_mapping(skb, ac);
4402 skb->priority = tid; 4517 skb->priority = tid;
4403 4518
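
The ieee80211_xmit_fast_finish() helper factored out above writes the 48-bit CCMP/GCMP packet number straight into the frame's crypto header, so the fast-xmit path and the new ieee80211_tx_dequeue() can share that step. Below is a minimal, self-contained userspace sketch of that byte layout only; the buffer, the example PN and the key-ID byte are illustrative and not taken from the kernel.

/* Minimal userspace sketch of the CCMP/GCMP PN byte layout written by
 * ieee80211_xmit_fast_finish() above.  Bytes 0-1 and 4-7 of the 8-byte
 * crypto header carry the 48-bit packet number; bytes 2-3 (reserved /
 * Ext IV + key ID) are left to the caller.  All values are examples.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void write_pn(uint8_t *crypto_hdr, uint64_t pn)
{
	crypto_hdr[0] = pn;
	crypto_hdr[1] = pn >> 8;
	crypto_hdr[4] = pn >> 16;
	crypto_hdr[5] = pn >> 24;
	crypto_hdr[6] = pn >> 32;
	crypto_hdr[7] = pn >> 40;
}

int main(void)
{
	uint8_t hdr[8];
	uint64_t pn = 0x0000beefcafe1234ULL;	/* example 48-bit PN */

	memset(hdr, 0, sizeof(hdr));
	hdr[3] = 0x20;				/* Ext IV bit, key ID 0 */
	write_pn(hdr, pn);

	for (size_t i = 0; i < sizeof(hdr); i++)
		printf("%02x ", (unsigned)hdr[i]);
	printf("\n");
	return 0;
}

Bytes 2 and 3 are skipped because the CCMP/GCMP header reserves them for the reserved and key-ID/Ext-IV fields, which are filled separately from the PN.
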
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 42bf0b6685e8..545c79a42a77 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -598,7 +598,7 @@ static void __iterate_interfaces(struct ieee80211_local *local,
598 list_for_each_entry_rcu(sdata, &local->interfaces, list) { 598 list_for_each_entry_rcu(sdata, &local->interfaces, list) {
599 switch (sdata->vif.type) { 599 switch (sdata->vif.type) {
600 case NL80211_IFTYPE_MONITOR: 600 case NL80211_IFTYPE_MONITOR:
601 if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)) 601 if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))
602 continue; 602 continue;
603 break; 603 break;
604 case NL80211_IFTYPE_AP_VLAN: 604 case NL80211_IFTYPE_AP_VLAN:
@@ -1209,7 +1209,8 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
1209 } 1209 }
1210 1210
1211 if (sdata->vif.type != NL80211_IFTYPE_MONITOR && 1211 if (sdata->vif.type != NL80211_IFTYPE_MONITOR &&
1212 sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) { 1212 sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
1213 sdata->vif.type != NL80211_IFTYPE_NAN) {
1213 sdata->vif.bss_conf.qos = enable_qos; 1214 sdata->vif.bss_conf.qos = enable_qos;
1214 if (bss_notify) 1215 if (bss_notify)
1215 ieee80211_bss_info_change_notify(sdata, 1216 ieee80211_bss_info_change_notify(sdata,
@@ -1748,6 +1749,46 @@ static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata)
1748 mutex_unlock(&local->sta_mtx); 1749 mutex_unlock(&local->sta_mtx);
1749} 1750}
1750 1751
1752static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata)
1753{
1754 struct cfg80211_nan_func *func, **funcs;
1755 int res, id, i = 0;
1756
1757 res = drv_start_nan(sdata->local, sdata,
1758 &sdata->u.nan.conf);
1759 if (WARN_ON(res))
1760 return res;
1761
1762 funcs = kzalloc((sdata->local->hw.max_nan_de_entries + 1) *
1763 sizeof(*funcs), GFP_KERNEL);
1764 if (!funcs)
1765 return -ENOMEM;
1766
1767 /* Add all the functions:
1768 * This is a little bit ugly. We need to call a potentially sleeping
1769 * callback for each NAN function, so we can't hold the spinlock.
1770 */
1771 spin_lock_bh(&sdata->u.nan.func_lock);
1772
1773 idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id)
1774 funcs[i++] = func;
1775
1776 spin_unlock_bh(&sdata->u.nan.func_lock);
1777
1778 for (i = 0; funcs[i]; i++) {
1779 res = drv_add_nan_func(sdata->local, sdata, funcs[i]);
1780 if (WARN_ON(res))
1781 ieee80211_nan_func_terminated(&sdata->vif,
1782 funcs[i]->instance_id,
1783 NL80211_NAN_FUNC_TERM_REASON_ERROR,
1784 GFP_KERNEL);
1785 }
1786
1787 kfree(funcs);
1788
1789 return 0;
1790}
1791
1751int ieee80211_reconfig(struct ieee80211_local *local) 1792int ieee80211_reconfig(struct ieee80211_local *local)
1752{ 1793{
1753 struct ieee80211_hw *hw = &local->hw; 1794 struct ieee80211_hw *hw = &local->hw;
@@ -1971,6 +2012,13 @@ int ieee80211_reconfig(struct ieee80211_local *local)
1971 ieee80211_bss_info_change_notify(sdata, changed); 2012 ieee80211_bss_info_change_notify(sdata, changed);
1972 } 2013 }
1973 break; 2014 break;
2015 case NL80211_IFTYPE_NAN:
2016 res = ieee80211_reconfig_nan(sdata);
2017 if (res < 0) {
2018 ieee80211_handle_reconfig_failure(local);
2019 return res;
2020 }
2021 break;
1974 case NL80211_IFTYPE_WDS: 2022 case NL80211_IFTYPE_WDS:
1975 case NL80211_IFTYPE_AP_VLAN: 2023 case NL80211_IFTYPE_AP_VLAN:
1976 case NL80211_IFTYPE_MONITOR: 2024 case NL80211_IFTYPE_MONITOR:
@@ -2555,7 +2603,6 @@ int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
2555 2603
2556 if (need_basic && basic_rates & BIT(i)) 2604 if (need_basic && basic_rates & BIT(i))
2557 basic = 0x80; 2605 basic = 0x80;
2558 rate = sband->bitrates[i].bitrate;
2559 rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 2606 rate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
2560 5 * (1 << shift)); 2607 5 * (1 << shift));
2561 *pos++ = basic | (u8) rate; 2608 *pos++ = basic | (u8) rate;
@@ -3394,11 +3441,18 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
3394 unsigned long *byte_cnt) 3441 unsigned long *byte_cnt)
3395{ 3442{
3396 struct txq_info *txqi = to_txq_info(txq); 3443 struct txq_info *txqi = to_txq_info(txq);
3444 u32 frag_cnt = 0, frag_bytes = 0;
3445 struct sk_buff *skb;
3446
3447 skb_queue_walk(&txqi->frags, skb) {
3448 frag_cnt++;
3449 frag_bytes += skb->len;
3450 }
3397 3451
3398 if (frame_cnt) 3452 if (frame_cnt)
3399 *frame_cnt = txqi->tin.backlog_packets; 3453 *frame_cnt = txqi->tin.backlog_packets + frag_cnt;
3400 3454
3401 if (byte_cnt) 3455 if (byte_cnt)
3402 *byte_cnt = txqi->tin.backlog_bytes; 3456 *byte_cnt = txqi->tin.backlog_bytes + frag_bytes;
3403} 3457}
3404EXPORT_SYMBOL(ieee80211_txq_get_depth); 3458EXPORT_SYMBOL(ieee80211_txq_get_depth);
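
ieee80211_txq_get_depth() above now folds the frames parked on txqi->frags into the reported counts. A small userspace model of that accounting follows; the list type and field names are stand-ins, not the kernel's sk_buff/fq structures.

#include <stdio.h>

struct frag {
	unsigned int len;
	struct frag *next;
};

static void txq_get_depth(unsigned long backlog_packets,
			  unsigned long backlog_bytes,
			  const struct frag *frags,
			  unsigned long *frame_cnt, unsigned long *byte_cnt)
{
	unsigned long frag_cnt = 0, frag_bytes = 0;

	/* count whatever is still waiting on the fragment queue */
	for (const struct frag *f = frags; f; f = f->next) {
		frag_cnt++;
		frag_bytes += f->len;
	}

	if (frame_cnt)
		*frame_cnt = backlog_packets + frag_cnt;
	if (byte_cnt)
		*byte_cnt = backlog_bytes + frag_bytes;
}

int main(void)
{
	struct frag b = { .len = 300, .next = NULL };
	struct frag a = { .len = 1200, .next = &b };
	unsigned long frames, bytes;

	txq_get_depth(10, 15000, &a, &frames, &bytes);
	printf("frames=%lu bytes=%lu\n", frames, bytes);	/* 12, 16500 */
	return 0;
}
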
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 7079cd32a7ad..06019dba4b10 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -663,6 +663,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
663 663
664 /* TODO check this */ 664 /* TODO check this */
665 SET_NETDEV_DEV(ndev, &local->phy->dev); 665 SET_NETDEV_DEV(ndev, &local->phy->dev);
666 dev_net_set(ndev, wpan_phy_net(local->hw.phy));
666 sdata = netdev_priv(ndev); 667 sdata = netdev_priv(ndev);
667 ndev->ieee802154_ptr = &sdata->wpan_dev; 668 ndev->ieee802154_ptr = &sdata->wpan_dev;
668 memcpy(sdata->name, ndev->name, IFNAMSIZ); 669 memcpy(sdata->name, ndev->name, IFNAMSIZ);
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index 446e1300383e..4dcf6e18563a 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -101,11 +101,16 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
101 sdata->dev->stats.rx_bytes += skb->len; 101 sdata->dev->stats.rx_bytes += skb->len;
102 102
103 switch (mac_cb(skb)->type) { 103 switch (mac_cb(skb)->type) {
104 case IEEE802154_FC_TYPE_BEACON:
105 case IEEE802154_FC_TYPE_ACK:
106 case IEEE802154_FC_TYPE_MAC_CMD:
107 goto fail;
108
104 case IEEE802154_FC_TYPE_DATA: 109 case IEEE802154_FC_TYPE_DATA:
105 return ieee802154_deliver_skb(skb); 110 return ieee802154_deliver_skb(skb);
106 default: 111 default:
107 pr_warn("ieee802154: bad frame received (type = %d)\n", 112 pr_warn_ratelimited("ieee802154: bad frame received "
108 mac_cb(skb)->type); 113 "(type = %d)\n", mac_cb(skb)->type);
109 goto fail; 114 goto fail;
110 } 115 }
111 116
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 5c161e7759b5..0e4334cbde17 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -961,9 +961,6 @@ static void mpls_ifdown(struct net_device *dev, int event)
961 RCU_INIT_POINTER(nh->nh_dev, NULL); 961 RCU_INIT_POINTER(nh->nh_dev, NULL);
962 } endfor_nexthops(rt); 962 } endfor_nexthops(rt);
963 } 963 }
964
965
966 return;
967} 964}
968 965
969static void mpls_ifup(struct net_device *dev, unsigned int nh_flags) 966static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
@@ -997,8 +994,6 @@ static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
997 994
998 ACCESS_ONCE(rt->rt_nhn_alive) = alive; 995 ACCESS_ONCE(rt->rt_nhn_alive) = alive;
999 } 996 }
1000
1001 return;
1002} 997}
1003 998
1004static int mpls_dev_notify(struct notifier_block *this, unsigned long event, 999static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 732a5c17e986..bdfef6c3271a 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -1,9 +1,6 @@
1#ifndef MPLS_INTERNAL_H 1#ifndef MPLS_INTERNAL_H
2#define MPLS_INTERNAL_H 2#define MPLS_INTERNAL_H
3 3#include <net/mpls.h>
4struct mpls_shim_hdr {
5 __be32 label_stack_entry;
6};
7 4
8struct mpls_entry_decoded { 5struct mpls_entry_decoded {
9 u32 label; 6 u32 label;
@@ -93,11 +90,6 @@ struct mpls_route { /* next hop label forwarding entry */
93 90
94#define endfor_nexthops(rt) } 91#define endfor_nexthops(rt) }
95 92
96static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
97{
98 return (struct mpls_shim_hdr *)skb_network_header(skb);
99}
100
101static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos) 93static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos)
102{ 94{
103 struct mpls_shim_hdr result; 95 struct mpls_shim_hdr result;
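
mpls_entry_encode(), kept in internal.h above, packs a label stack entry in the standard RFC 3032 layout: 20-bit label, 3-bit traffic class, 1-bit bottom-of-stack, 8-bit TTL, stored in network byte order. A self-contained sketch of that packing; the shift values follow the RFC and the label/TTL values are arbitrary examples.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Pack label/tc/bos/ttl into one label stack entry, wire order. */
static uint32_t mpls_lse_encode(uint32_t label, unsigned int tc,
				int bos, unsigned int ttl)
{
	uint32_t host = (label << 12) | ((tc & 0x7) << 9) |
			((bos ? 1 : 0) << 8) | (ttl & 0xff);

	return htonl(host);	/* like cpu_to_be32() in the kernel helper */
}

int main(void)
{
	uint32_t lse = mpls_lse_encode(16, 0, 1, 64);	/* label 16, S=1, TTL 64 */

	printf("LSE = 0x%08x\n", (unsigned)ntohl(lse));	/* 0x00010140 */
	return 0;
}
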
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 2055e57ed1c3..b4da6d8e8632 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -23,32 +23,50 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
23 netdev_features_t features) 23 netdev_features_t features)
24{ 24{
25 struct sk_buff *segs = ERR_PTR(-EINVAL); 25 struct sk_buff *segs = ERR_PTR(-EINVAL);
26 u16 mac_offset = skb->mac_header;
26 netdev_features_t mpls_features; 27 netdev_features_t mpls_features;
28 u16 mac_len = skb->mac_len;
27 __be16 mpls_protocol; 29 __be16 mpls_protocol;
30 unsigned int mpls_hlen;
31
32 skb_reset_network_header(skb);
33 mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb);
34 if (unlikely(!pskb_may_pull(skb, mpls_hlen)))
35 goto out;
28 36
29 /* Setup inner SKB. */ 37 /* Setup inner SKB. */
30 mpls_protocol = skb->protocol; 38 mpls_protocol = skb->protocol;
31 skb->protocol = skb->inner_protocol; 39 skb->protocol = skb->inner_protocol;
32 40
33 /* Push back the mac header that skb_mac_gso_segment() has pulled. 41 __skb_pull(skb, mpls_hlen);
34 * It will be re-pulled by the call to skb_mac_gso_segment() below 42
35 */ 43 skb->mac_len = 0;
36 __skb_push(skb, skb->mac_len); 44 skb_reset_mac_header(skb);
37 45
38 /* Segment inner packet. */ 46 /* Segment inner packet. */
39 mpls_features = skb->dev->mpls_features & features; 47 mpls_features = skb->dev->mpls_features & features;
40 segs = skb_mac_gso_segment(skb, mpls_features); 48 segs = skb_mac_gso_segment(skb, mpls_features);
49 if (IS_ERR_OR_NULL(segs)) {
50 skb_gso_error_unwind(skb, mpls_protocol, mpls_hlen, mac_offset,
51 mac_len);
52 goto out;
53 }
54 skb = segs;
55
56 mpls_hlen += mac_len;
57 do {
58 skb->mac_len = mac_len;
59 skb->protocol = mpls_protocol;
41 60
61 skb_reset_inner_network_header(skb);
42 62
43 /* Restore outer protocol. */ 63 __skb_push(skb, mpls_hlen);
44 skb->protocol = mpls_protocol;
45 64
46 /* Re-pull the mac header that the call to skb_mac_gso_segment() 65 skb_reset_mac_header(skb);
47 * above pulled. It will be re-pushed after returning 66 skb_set_network_header(skb, mac_len);
48 * skb_mac_gso_segment(), an indirect caller of this function. 67 } while ((skb = skb->next));
49 */
50 __skb_pull(skb, skb->data - skb_mac_header(skb));
51 68
69out:
52 return segs; 70 return segs;
53} 71}
54 72
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 644a8da6d4bd..cf52cf30ac4b 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -37,7 +37,7 @@ static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
37 return en->labels * sizeof(struct mpls_shim_hdr); 37 return en->labels * sizeof(struct mpls_shim_hdr);
38} 38}
39 39
40static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) 40static int mpls_xmit(struct sk_buff *skb)
41{ 41{
42 struct mpls_iptunnel_encap *tun_encap_info; 42 struct mpls_iptunnel_encap *tun_encap_info;
43 struct mpls_shim_hdr *hdr; 43 struct mpls_shim_hdr *hdr;
@@ -90,7 +90,11 @@ static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
90 if (skb_cow(skb, hh_len + new_header_size)) 90 if (skb_cow(skb, hh_len + new_header_size))
91 goto drop; 91 goto drop;
92 92
93 skb_set_inner_protocol(skb, skb->protocol);
94 skb_reset_inner_network_header(skb);
95
93 skb_push(skb, new_header_size); 96 skb_push(skb, new_header_size);
97
94 skb_reset_network_header(skb); 98 skb_reset_network_header(skb);
95 99
96 skb->dev = out_dev; 100 skb->dev = out_dev;
@@ -115,7 +119,7 @@ static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
115 net_dbg_ratelimited("%s: packet transmission failed: %d\n", 119 net_dbg_ratelimited("%s: packet transmission failed: %d\n",
116 __func__, err); 120 __func__, err);
117 121
118 return 0; 122 return LWTUNNEL_XMIT_DONE;
119 123
120drop: 124drop:
121 kfree_skb(skb); 125 kfree_skb(skb);
@@ -153,7 +157,8 @@ static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
153 if (ret) 157 if (ret)
154 goto errout; 158 goto errout;
155 newts->type = LWTUNNEL_ENCAP_MPLS; 159 newts->type = LWTUNNEL_ENCAP_MPLS;
156 newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; 160 newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
161 newts->headroom = mpls_encap_size(tun_encap_info);
157 162
158 *ts = newts; 163 *ts = newts;
159 164
@@ -209,7 +214,7 @@ static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
209 214
210static const struct lwtunnel_encap_ops mpls_iptun_ops = { 215static const struct lwtunnel_encap_ops mpls_iptun_ops = {
211 .build_state = mpls_build_state, 216 .build_state = mpls_build_state,
212 .output = mpls_output, 217 .xmit = mpls_xmit,
213 .fill_encap = mpls_fill_encap_info, 218 .fill_encap = mpls_fill_encap_info,
214 .get_encap_size = mpls_encap_nlsize, 219 .get_encap_size = mpls_encap_nlsize,
215 .cmp_encap = mpls_encap_cmp, 220 .cmp_encap = mpls_encap_cmp,
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 33738c060547..13290a70fa71 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -170,6 +170,7 @@ struct ncsi_package;
170 170
171#define NCSI_PACKAGE_SHIFT 5 171#define NCSI_PACKAGE_SHIFT 5
172#define NCSI_PACKAGE_INDEX(c) (((c) >> NCSI_PACKAGE_SHIFT) & 0x7) 172#define NCSI_PACKAGE_INDEX(c) (((c) >> NCSI_PACKAGE_SHIFT) & 0x7)
173#define NCSI_RESERVED_CHANNEL 0x1f
173#define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1)) 174#define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1))
174#define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c)) 175#define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c))
175 176
@@ -186,9 +187,15 @@ struct ncsi_channel {
186 struct ncsi_channel_mode modes[NCSI_MODE_MAX]; 187 struct ncsi_channel_mode modes[NCSI_MODE_MAX];
187 struct ncsi_channel_filter *filters[NCSI_FILTER_MAX]; 188 struct ncsi_channel_filter *filters[NCSI_FILTER_MAX];
188 struct ncsi_channel_stats stats; 189 struct ncsi_channel_stats stats;
189 struct timer_list timer; /* Link monitor timer */ 190 struct {
190 bool enabled; /* Timer is enabled */ 191 struct timer_list timer;
191 unsigned int timeout; /* Times of timeout */ 192 bool enabled;
193 unsigned int state;
194#define NCSI_CHANNEL_MONITOR_START 0
195#define NCSI_CHANNEL_MONITOR_RETRY 1
196#define NCSI_CHANNEL_MONITOR_WAIT 2
197#define NCSI_CHANNEL_MONITOR_WAIT_MAX 5
198 } monitor;
192 struct list_head node; 199 struct list_head node;
193 struct list_head link; 200 struct list_head link;
194}; 201};
@@ -206,7 +213,8 @@ struct ncsi_package {
206struct ncsi_request { 213struct ncsi_request {
207 unsigned char id; /* Request ID - 0 to 255 */ 214 unsigned char id; /* Request ID - 0 to 255 */
208 bool used; /* Request that has been assigned */ 215 bool used; /* Request that has been assigned */
209 bool driven; /* Drive state machine */ 216 unsigned int flags; /* NCSI request property */
217#define NCSI_REQ_FLAG_EVENT_DRIVEN 1
210 struct ncsi_dev_priv *ndp; /* Associated NCSI device */ 218 struct ncsi_dev_priv *ndp; /* Associated NCSI device */
211 struct sk_buff *cmd; /* Associated NCSI command packet */ 219 struct sk_buff *cmd; /* Associated NCSI command packet */
212 struct sk_buff *rsp; /* Associated NCSI response packet */ 220 struct sk_buff *rsp; /* Associated NCSI response packet */
@@ -258,6 +266,7 @@ struct ncsi_dev_priv {
258 struct list_head packages; /* List of packages */ 266 struct list_head packages; /* List of packages */
259 struct ncsi_request requests[256]; /* Request table */ 267 struct ncsi_request requests[256]; /* Request table */
260 unsigned int request_id; /* Last used request ID */ 268 unsigned int request_id; /* Last used request ID */
269#define NCSI_REQ_START_IDX 1
261 unsigned int pending_req_num; /* Number of pending requests */ 270 unsigned int pending_req_num; /* Number of pending requests */
262 struct ncsi_package *active_package; /* Currently handled package */ 271 struct ncsi_package *active_package; /* Currently handled package */
263 struct ncsi_channel *active_channel; /* Currently handled channel */ 272 struct ncsi_channel *active_channel; /* Currently handled channel */
@@ -274,7 +283,7 @@ struct ncsi_cmd_arg {
274 unsigned char package; /* Destination package ID */ 283 unsigned char package; /* Destination package ID */
275 unsigned char channel; /* Destination channel ID or 0x1f */ 284 unsigned char channel; /* Destination channel ID or 0x1f */
276 unsigned short payload; /* Command packet payload length */ 285 unsigned short payload; /* Command packet payload length */
277 bool driven; /* Drive the state machine? */ 286 unsigned int req_flags; /* NCSI request properties */
278 union { 287 union {
279 unsigned char bytes[16]; /* Command packet specific data */ 288 unsigned char bytes[16]; /* Command packet specific data */
280 unsigned short words[8]; 289 unsigned short words[8];
@@ -313,7 +322,8 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp,
313 unsigned char id, 322 unsigned char id,
314 struct ncsi_package **np, 323 struct ncsi_package **np,
315 struct ncsi_channel **nc); 324 struct ncsi_channel **nc);
316struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven); 325struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp,
326 unsigned int req_flags);
317void ncsi_free_request(struct ncsi_request *nr); 327void ncsi_free_request(struct ncsi_request *nr);
318struct ncsi_dev *ncsi_find_dev(struct net_device *dev); 328struct ncsi_dev *ncsi_find_dev(struct net_device *dev);
319int ncsi_process_next_channel(struct ncsi_dev_priv *ndp); 329int ncsi_process_next_channel(struct ncsi_dev_priv *ndp);
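
The new NCSI_RESERVED_CHANNEL definition sits next to the existing channel-addressing macros: a channel ID is a 3-bit package index above a 5-bit channel index, and 0x1f addresses a whole package rather than one channel. A quick standalone illustration using the same macro definitions; the package/channel values are examples only.

#include <stdio.h>

#define NCSI_PACKAGE_SHIFT	5
#define NCSI_PACKAGE_INDEX(c)	(((c) >> NCSI_PACKAGE_SHIFT) & 0x7)
#define NCSI_RESERVED_CHANNEL	0x1f
#define NCSI_CHANNEL_INDEX(c)	((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1))
#define NCSI_TO_CHANNEL(p, c)	(((p) << NCSI_PACKAGE_SHIFT) | (c))

int main(void)
{
	unsigned int id = NCSI_TO_CHANNEL(2, 3);

	printf("id=0x%02x package=%u channel=%u\n",
	       id, NCSI_PACKAGE_INDEX(id), NCSI_CHANNEL_INDEX(id));

	/* addressing the whole package, e.g. for SP/DP commands */
	id = NCSI_TO_CHANNEL(2, NCSI_RESERVED_CHANNEL);
	printf("package-addressed id=0x%02x\n", id);
	return 0;
}
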
diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index d463468442ae..b41a6617d498 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -53,7 +53,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
53 struct ncsi_aen_lsc_pkt *lsc; 53 struct ncsi_aen_lsc_pkt *lsc;
54 struct ncsi_channel *nc; 54 struct ncsi_channel *nc;
55 struct ncsi_channel_mode *ncm; 55 struct ncsi_channel_mode *ncm;
56 unsigned long old_data; 56 bool chained;
57 int state;
58 unsigned long old_data, data;
57 unsigned long flags; 59 unsigned long flags;
58 60
59 /* Find the NCSI channel */ 61 /* Find the NCSI channel */
@@ -62,20 +64,27 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
62 return -ENODEV; 64 return -ENODEV;
63 65
64 /* Update the link status */ 66 /* Update the link status */
65 ncm = &nc->modes[NCSI_MODE_LINK];
66 lsc = (struct ncsi_aen_lsc_pkt *)h; 67 lsc = (struct ncsi_aen_lsc_pkt *)h;
68
69 spin_lock_irqsave(&nc->lock, flags);
70 ncm = &nc->modes[NCSI_MODE_LINK];
67 old_data = ncm->data[2]; 71 old_data = ncm->data[2];
68 ncm->data[2] = ntohl(lsc->status); 72 data = ntohl(lsc->status);
73 ncm->data[2] = data;
69 ncm->data[4] = ntohl(lsc->oem_status); 74 ncm->data[4] = ntohl(lsc->oem_status);
70 if (!((old_data ^ ncm->data[2]) & 0x1) || 75
71 !list_empty(&nc->link)) 76 chained = !list_empty(&nc->link);
77 state = nc->state;
78 spin_unlock_irqrestore(&nc->lock, flags);
79
80 if (!((old_data ^ data) & 0x1) || chained)
72 return 0; 81 return 0;
73 if (!(nc->state == NCSI_CHANNEL_INACTIVE && (ncm->data[2] & 0x1)) && 82 if (!(state == NCSI_CHANNEL_INACTIVE && (data & 0x1)) &&
74 !(nc->state == NCSI_CHANNEL_ACTIVE && !(ncm->data[2] & 0x1))) 83 !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1)))
75 return 0; 84 return 0;
76 85
77 if (!(ndp->flags & NCSI_DEV_HWA) && 86 if (!(ndp->flags & NCSI_DEV_HWA) &&
78 nc->state == NCSI_CHANNEL_ACTIVE) 87 state == NCSI_CHANNEL_ACTIVE)
79 ndp->flags |= NCSI_DEV_RESHUFFLE; 88 ndp->flags |= NCSI_DEV_RESHUFFLE;
80 89
81 ncsi_stop_channel_monitor(nc); 90 ncsi_stop_channel_monitor(nc);
@@ -97,13 +106,21 @@ static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp,
97 if (!nc) 106 if (!nc)
98 return -ENODEV; 107 return -ENODEV;
99 108
109 spin_lock_irqsave(&nc->lock, flags);
100 if (!list_empty(&nc->link) || 110 if (!list_empty(&nc->link) ||
101 nc->state != NCSI_CHANNEL_ACTIVE) 111 nc->state != NCSI_CHANNEL_ACTIVE) {
112 spin_unlock_irqrestore(&nc->lock, flags);
102 return 0; 113 return 0;
114 }
115 spin_unlock_irqrestore(&nc->lock, flags);
103 116
104 ncsi_stop_channel_monitor(nc); 117 ncsi_stop_channel_monitor(nc);
118 spin_lock_irqsave(&nc->lock, flags);
119 nc->state = NCSI_CHANNEL_INVISIBLE;
120 spin_unlock_irqrestore(&nc->lock, flags);
121
105 spin_lock_irqsave(&ndp->lock, flags); 122 spin_lock_irqsave(&ndp->lock, flags);
106 xchg(&nc->state, NCSI_CHANNEL_INACTIVE); 123 nc->state = NCSI_CHANNEL_INACTIVE;
107 list_add_tail_rcu(&nc->link, &ndp->channel_queue); 124 list_add_tail_rcu(&nc->link, &ndp->channel_queue);
108 spin_unlock_irqrestore(&ndp->lock, flags); 125 spin_unlock_irqrestore(&ndp->lock, flags);
109 126
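
The reworked LSC handler above snapshots the status word and queue state under the channel lock and then acts only when bit 0 (link up) actually toggled and the channel is not already queued. A tiny sketch of just that transition test, with illustrative values:

#include <stdbool.h>
#include <stdio.h>

static bool lsc_needs_action(unsigned long old_status,
			     unsigned long new_status, bool chained)
{
	/* no change in the link-up bit, or already on the work queue */
	if (!((old_status ^ new_status) & 0x1) || chained)
		return false;
	return true;
}

int main(void)
{
	printf("%d %d %d\n",
	       lsc_needs_action(0x0, 0x1, false),	/* link came up -> 1 */
	       lsc_needs_action(0x1, 0x1, false),	/* no change    -> 0 */
	       lsc_needs_action(0x1, 0x0, true));	/* chained      -> 0 */
	return 0;
}
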
diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c
index 21057a8ceeac..db7083bfd476 100644
--- a/net/ncsi/ncsi-cmd.c
+++ b/net/ncsi/ncsi-cmd.c
@@ -272,7 +272,7 @@ static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca)
272 struct sk_buff *skb; 272 struct sk_buff *skb;
273 struct ncsi_request *nr; 273 struct ncsi_request *nr;
274 274
275 nr = ncsi_alloc_request(ndp, nca->driven); 275 nr = ncsi_alloc_request(ndp, nca->req_flags);
276 if (!nr) 276 if (!nr)
277 return NULL; 277 return NULL;
278 278
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index ef017b871857..5e509e547c2d 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -132,6 +132,7 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
132 struct ncsi_dev *nd = &ndp->ndev; 132 struct ncsi_dev *nd = &ndp->ndev;
133 struct ncsi_package *np; 133 struct ncsi_package *np;
134 struct ncsi_channel *nc; 134 struct ncsi_channel *nc;
135 unsigned long flags;
135 136
136 nd->state = ncsi_dev_state_functional; 137 nd->state = ncsi_dev_state_functional;
137 if (force_down) { 138 if (force_down) {
@@ -142,14 +143,21 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
142 nd->link_up = 0; 143 nd->link_up = 0;
143 NCSI_FOR_EACH_PACKAGE(ndp, np) { 144 NCSI_FOR_EACH_PACKAGE(ndp, np) {
144 NCSI_FOR_EACH_CHANNEL(np, nc) { 145 NCSI_FOR_EACH_CHANNEL(np, nc) {
146 spin_lock_irqsave(&nc->lock, flags);
147
145 if (!list_empty(&nc->link) || 148 if (!list_empty(&nc->link) ||
146 nc->state != NCSI_CHANNEL_ACTIVE) 149 nc->state != NCSI_CHANNEL_ACTIVE) {
150 spin_unlock_irqrestore(&nc->lock, flags);
147 continue; 151 continue;
152 }
148 153
149 if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { 154 if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) {
155 spin_unlock_irqrestore(&nc->lock, flags);
150 nd->link_up = 1; 156 nd->link_up = 1;
151 goto report; 157 goto report;
152 } 158 }
159
160 spin_unlock_irqrestore(&nc->lock, flags);
153 } 161 }
154 } 162 }
155 163
@@ -163,43 +171,55 @@ static void ncsi_channel_monitor(unsigned long data)
163 struct ncsi_package *np = nc->package; 171 struct ncsi_package *np = nc->package;
164 struct ncsi_dev_priv *ndp = np->ndp; 172 struct ncsi_dev_priv *ndp = np->ndp;
165 struct ncsi_cmd_arg nca; 173 struct ncsi_cmd_arg nca;
166 bool enabled; 174 bool enabled, chained;
167 unsigned int timeout; 175 unsigned int monitor_state;
168 unsigned long flags; 176 unsigned long flags;
169 int ret; 177 int state, ret;
170 178
171 spin_lock_irqsave(&nc->lock, flags); 179 spin_lock_irqsave(&nc->lock, flags);
172 timeout = nc->timeout; 180 state = nc->state;
173 enabled = nc->enabled; 181 chained = !list_empty(&nc->link);
182 enabled = nc->monitor.enabled;
183 monitor_state = nc->monitor.state;
174 spin_unlock_irqrestore(&nc->lock, flags); 184 spin_unlock_irqrestore(&nc->lock, flags);
175 185
176 if (!enabled || !list_empty(&nc->link)) 186 if (!enabled || chained)
177 return; 187 return;
178 if (nc->state != NCSI_CHANNEL_INACTIVE && 188 if (state != NCSI_CHANNEL_INACTIVE &&
179 nc->state != NCSI_CHANNEL_ACTIVE) 189 state != NCSI_CHANNEL_ACTIVE)
180 return; 190 return;
181 191
182 if (!(timeout % 2)) { 192 switch (monitor_state) {
193 case NCSI_CHANNEL_MONITOR_START:
194 case NCSI_CHANNEL_MONITOR_RETRY:
183 nca.ndp = ndp; 195 nca.ndp = ndp;
184 nca.package = np->id; 196 nca.package = np->id;
185 nca.channel = nc->id; 197 nca.channel = nc->id;
186 nca.type = NCSI_PKT_CMD_GLS; 198 nca.type = NCSI_PKT_CMD_GLS;
187 nca.driven = false; 199 nca.req_flags = 0;
188 ret = ncsi_xmit_cmd(&nca); 200 ret = ncsi_xmit_cmd(&nca);
189 if (ret) { 201 if (ret) {
190 netdev_err(ndp->ndev.dev, "Error %d sending GLS\n", 202 netdev_err(ndp->ndev.dev, "Error %d sending GLS\n",
191 ret); 203 ret);
192 return; 204 return;
193 } 205 }
194 }
195 206
196 if (timeout + 1 >= 3) { 207 break;
208 case NCSI_CHANNEL_MONITOR_WAIT ... NCSI_CHANNEL_MONITOR_WAIT_MAX:
209 break;
210 default:
197 if (!(ndp->flags & NCSI_DEV_HWA) && 211 if (!(ndp->flags & NCSI_DEV_HWA) &&
198 nc->state == NCSI_CHANNEL_ACTIVE) 212 state == NCSI_CHANNEL_ACTIVE) {
199 ncsi_report_link(ndp, true); 213 ncsi_report_link(ndp, true);
214 ndp->flags |= NCSI_DEV_RESHUFFLE;
215 }
216
217 spin_lock_irqsave(&nc->lock, flags);
218 nc->state = NCSI_CHANNEL_INVISIBLE;
219 spin_unlock_irqrestore(&nc->lock, flags);
200 220
201 spin_lock_irqsave(&ndp->lock, flags); 221 spin_lock_irqsave(&ndp->lock, flags);
202 xchg(&nc->state, NCSI_CHANNEL_INACTIVE); 222 nc->state = NCSI_CHANNEL_INACTIVE;
203 list_add_tail_rcu(&nc->link, &ndp->channel_queue); 223 list_add_tail_rcu(&nc->link, &ndp->channel_queue);
204 spin_unlock_irqrestore(&ndp->lock, flags); 224 spin_unlock_irqrestore(&ndp->lock, flags);
205 ncsi_process_next_channel(ndp); 225 ncsi_process_next_channel(ndp);
@@ -207,10 +227,9 @@ static void ncsi_channel_monitor(unsigned long data)
207 } 227 }
208 228
209 spin_lock_irqsave(&nc->lock, flags); 229 spin_lock_irqsave(&nc->lock, flags);
210 nc->timeout = timeout + 1; 230 nc->monitor.state++;
211 nc->enabled = true;
212 spin_unlock_irqrestore(&nc->lock, flags); 231 spin_unlock_irqrestore(&nc->lock, flags);
213 mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); 232 mod_timer(&nc->monitor.timer, jiffies + HZ);
214} 233}
215 234
216void ncsi_start_channel_monitor(struct ncsi_channel *nc) 235void ncsi_start_channel_monitor(struct ncsi_channel *nc)
@@ -218,12 +237,12 @@ void ncsi_start_channel_monitor(struct ncsi_channel *nc)
218 unsigned long flags; 237 unsigned long flags;
219 238
220 spin_lock_irqsave(&nc->lock, flags); 239 spin_lock_irqsave(&nc->lock, flags);
221 WARN_ON_ONCE(nc->enabled); 240 WARN_ON_ONCE(nc->monitor.enabled);
222 nc->timeout = 0; 241 nc->monitor.enabled = true;
223 nc->enabled = true; 242 nc->monitor.state = NCSI_CHANNEL_MONITOR_START;
224 spin_unlock_irqrestore(&nc->lock, flags); 243 spin_unlock_irqrestore(&nc->lock, flags);
225 244
226 mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); 245 mod_timer(&nc->monitor.timer, jiffies + HZ);
227} 246}
228 247
229void ncsi_stop_channel_monitor(struct ncsi_channel *nc) 248void ncsi_stop_channel_monitor(struct ncsi_channel *nc)
@@ -231,14 +250,14 @@ void ncsi_stop_channel_monitor(struct ncsi_channel *nc)
231 unsigned long flags; 250 unsigned long flags;
232 251
233 spin_lock_irqsave(&nc->lock, flags); 252 spin_lock_irqsave(&nc->lock, flags);
234 if (!nc->enabled) { 253 if (!nc->monitor.enabled) {
235 spin_unlock_irqrestore(&nc->lock, flags); 254 spin_unlock_irqrestore(&nc->lock, flags);
236 return; 255 return;
237 } 256 }
238 nc->enabled = false; 257 nc->monitor.enabled = false;
239 spin_unlock_irqrestore(&nc->lock, flags); 258 spin_unlock_irqrestore(&nc->lock, flags);
240 259
241 del_timer_sync(&nc->timer); 260 del_timer_sync(&nc->monitor.timer);
242} 261}
243 262
244struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, 263struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
@@ -267,8 +286,9 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id)
267 nc->id = id; 286 nc->id = id;
268 nc->package = np; 287 nc->package = np;
269 nc->state = NCSI_CHANNEL_INACTIVE; 288 nc->state = NCSI_CHANNEL_INACTIVE;
270 nc->enabled = false; 289 nc->monitor.enabled = false;
271 setup_timer(&nc->timer, ncsi_channel_monitor, (unsigned long)nc); 290 setup_timer(&nc->monitor.timer,
291 ncsi_channel_monitor, (unsigned long)nc);
272 spin_lock_init(&nc->lock); 292 spin_lock_init(&nc->lock);
273 INIT_LIST_HEAD(&nc->link); 293 INIT_LIST_HEAD(&nc->link);
274 for (index = 0; index < NCSI_CAP_MAX; index++) 294 for (index = 0; index < NCSI_CAP_MAX; index++)
@@ -405,7 +425,8 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp,
405 * be same. Otherwise, the bogus response might be replied. So 425 * be same. Otherwise, the bogus response might be replied. So
406 * the available IDs are allocated in round-robin fashion. 426 * the available IDs are allocated in round-robin fashion.
407 */ 427 */
408struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven) 428struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp,
429 unsigned int req_flags)
409{ 430{
410 struct ncsi_request *nr = NULL; 431 struct ncsi_request *nr = NULL;
411 int i, limit = ARRAY_SIZE(ndp->requests); 432 int i, limit = ARRAY_SIZE(ndp->requests);
@@ -413,30 +434,31 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven)
413 434
414 /* Check if there is one available request until the ceiling */ 435 /* Check if there is one available request until the ceiling */
415 spin_lock_irqsave(&ndp->lock, flags); 436 spin_lock_irqsave(&ndp->lock, flags);
416 for (i = ndp->request_id; !nr && i < limit; i++) { 437 for (i = ndp->request_id; i < limit; i++) {
417 if (ndp->requests[i].used) 438 if (ndp->requests[i].used)
418 continue; 439 continue;
419 440
420 nr = &ndp->requests[i]; 441 nr = &ndp->requests[i];
421 nr->used = true; 442 nr->used = true;
422 nr->driven = driven; 443 nr->flags = req_flags;
423 if (++ndp->request_id >= limit) 444 ndp->request_id = i + 1;
424 ndp->request_id = 0; 445 goto found;
425 } 446 }
426 447
427 /* Fall back to check from the starting cursor */ 448 /* Fall back to check from the starting cursor */
428 for (i = 0; !nr && i < ndp->request_id; i++) { 449 for (i = NCSI_REQ_START_IDX; i < ndp->request_id; i++) {
429 if (ndp->requests[i].used) 450 if (ndp->requests[i].used)
430 continue; 451 continue;
431 452
432 nr = &ndp->requests[i]; 453 nr = &ndp->requests[i];
433 nr->used = true; 454 nr->used = true;
434 nr->driven = driven; 455 nr->flags = req_flags;
435 if (++ndp->request_id >= limit) 456 ndp->request_id = i + 1;
436 ndp->request_id = 0; 457 goto found;
437 } 458 }
438 spin_unlock_irqrestore(&ndp->lock, flags);
439 459
460found:
461 spin_unlock_irqrestore(&ndp->lock, flags);
440 return nr; 462 return nr;
441} 463}
442 464
@@ -458,7 +480,7 @@ void ncsi_free_request(struct ncsi_request *nr)
458 nr->cmd = NULL; 480 nr->cmd = NULL;
459 nr->rsp = NULL; 481 nr->rsp = NULL;
460 nr->used = false; 482 nr->used = false;
461 driven = nr->driven; 483 driven = !!(nr->flags & NCSI_REQ_FLAG_EVENT_DRIVEN);
462 spin_unlock_irqrestore(&ndp->lock, flags); 484 spin_unlock_irqrestore(&ndp->lock, flags);
463 485
464 if (driven && cmd && --ndp->pending_req_num == 0) 486 if (driven && cmd && --ndp->pending_req_num == 0)
@@ -508,10 +530,11 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
508 struct ncsi_package *np = ndp->active_package; 530 struct ncsi_package *np = ndp->active_package;
509 struct ncsi_channel *nc = ndp->active_channel; 531 struct ncsi_channel *nc = ndp->active_channel;
510 struct ncsi_cmd_arg nca; 532 struct ncsi_cmd_arg nca;
533 unsigned long flags;
511 int ret; 534 int ret;
512 535
513 nca.ndp = ndp; 536 nca.ndp = ndp;
514 nca.driven = true; 537 nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN;
515 switch (nd->state) { 538 switch (nd->state) {
516 case ncsi_dev_state_suspend: 539 case ncsi_dev_state_suspend:
517 nd->state = ncsi_dev_state_suspend_select; 540 nd->state = ncsi_dev_state_suspend_select;
@@ -527,7 +550,7 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
527 nca.package = np->id; 550 nca.package = np->id;
528 if (nd->state == ncsi_dev_state_suspend_select) { 551 if (nd->state == ncsi_dev_state_suspend_select) {
529 nca.type = NCSI_PKT_CMD_SP; 552 nca.type = NCSI_PKT_CMD_SP;
530 nca.channel = 0x1f; 553 nca.channel = NCSI_RESERVED_CHANNEL;
531 if (ndp->flags & NCSI_DEV_HWA) 554 if (ndp->flags & NCSI_DEV_HWA)
532 nca.bytes[0] = 0; 555 nca.bytes[0] = 0;
533 else 556 else
@@ -544,7 +567,7 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
544 nd->state = ncsi_dev_state_suspend_deselect; 567 nd->state = ncsi_dev_state_suspend_deselect;
545 } else if (nd->state == ncsi_dev_state_suspend_deselect) { 568 } else if (nd->state == ncsi_dev_state_suspend_deselect) {
546 nca.type = NCSI_PKT_CMD_DP; 569 nca.type = NCSI_PKT_CMD_DP;
547 nca.channel = 0x1f; 570 nca.channel = NCSI_RESERVED_CHANNEL;
548 nd->state = ncsi_dev_state_suspend_done; 571 nd->state = ncsi_dev_state_suspend_done;
549 } 572 }
550 573
@@ -556,7 +579,9 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
556 579
557 break; 580 break;
558 case ncsi_dev_state_suspend_done: 581 case ncsi_dev_state_suspend_done:
559 xchg(&nc->state, NCSI_CHANNEL_INACTIVE); 582 spin_lock_irqsave(&nc->lock, flags);
583 nc->state = NCSI_CHANNEL_INACTIVE;
584 spin_unlock_irqrestore(&nc->lock, flags);
560 ncsi_process_next_channel(ndp); 585 ncsi_process_next_channel(ndp);
561 586
562 break; 587 break;
@@ -574,10 +599,11 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
574 struct ncsi_channel *nc = ndp->active_channel; 599 struct ncsi_channel *nc = ndp->active_channel;
575 struct ncsi_cmd_arg nca; 600 struct ncsi_cmd_arg nca;
576 unsigned char index; 601 unsigned char index;
602 unsigned long flags;
577 int ret; 603 int ret;
578 604
579 nca.ndp = ndp; 605 nca.ndp = ndp;
580 nca.driven = true; 606 nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN;
581 switch (nd->state) { 607 switch (nd->state) {
582 case ncsi_dev_state_config: 608 case ncsi_dev_state_config:
583 case ncsi_dev_state_config_sp: 609 case ncsi_dev_state_config_sp:
@@ -590,7 +616,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
590 else 616 else
591 nca.bytes[0] = 1; 617 nca.bytes[0] = 1;
592 nca.package = np->id; 618 nca.package = np->id;
593 nca.channel = 0x1f; 619 nca.channel = NCSI_RESERVED_CHANNEL;
594 ret = ncsi_xmit_cmd(&nca); 620 ret = ncsi_xmit_cmd(&nca);
595 if (ret) 621 if (ret)
596 goto error; 622 goto error;
@@ -675,10 +701,12 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
675 goto error; 701 goto error;
676 break; 702 break;
677 case ncsi_dev_state_config_done: 703 case ncsi_dev_state_config_done:
704 spin_lock_irqsave(&nc->lock, flags);
678 if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) 705 if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1)
679 xchg(&nc->state, NCSI_CHANNEL_ACTIVE); 706 nc->state = NCSI_CHANNEL_ACTIVE;
680 else 707 else
681 xchg(&nc->state, NCSI_CHANNEL_INACTIVE); 708 nc->state = NCSI_CHANNEL_INACTIVE;
709 spin_unlock_irqrestore(&nc->lock, flags);
682 710
683 ncsi_start_channel_monitor(nc); 711 ncsi_start_channel_monitor(nc);
684 ncsi_process_next_channel(ndp); 712 ncsi_process_next_channel(ndp);
@@ -707,18 +735,25 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
707 found = NULL; 735 found = NULL;
708 NCSI_FOR_EACH_PACKAGE(ndp, np) { 736 NCSI_FOR_EACH_PACKAGE(ndp, np) {
709 NCSI_FOR_EACH_CHANNEL(np, nc) { 737 NCSI_FOR_EACH_CHANNEL(np, nc) {
738 spin_lock_irqsave(&nc->lock, flags);
739
710 if (!list_empty(&nc->link) || 740 if (!list_empty(&nc->link) ||
711 nc->state != NCSI_CHANNEL_INACTIVE) 741 nc->state != NCSI_CHANNEL_INACTIVE) {
742 spin_unlock_irqrestore(&nc->lock, flags);
712 continue; 743 continue;
744 }
713 745
714 if (!found) 746 if (!found)
715 found = nc; 747 found = nc;
716 748
717 ncm = &nc->modes[NCSI_MODE_LINK]; 749 ncm = &nc->modes[NCSI_MODE_LINK];
718 if (ncm->data[2] & 0x1) { 750 if (ncm->data[2] & 0x1) {
751 spin_unlock_irqrestore(&nc->lock, flags);
719 found = nc; 752 found = nc;
720 goto out; 753 goto out;
721 } 754 }
755
756 spin_unlock_irqrestore(&nc->lock, flags);
722 } 757 }
723 } 758 }
724 759
@@ -797,7 +832,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
797 int ret; 832 int ret;
798 833
799 nca.ndp = ndp; 834 nca.ndp = ndp;
800 nca.driven = true; 835 nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN;
801 switch (nd->state) { 836 switch (nd->state) {
802 case ncsi_dev_state_probe: 837 case ncsi_dev_state_probe:
803 nd->state = ncsi_dev_state_probe_deselect; 838 nd->state = ncsi_dev_state_probe_deselect;
@@ -807,7 +842,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
807 842
808 /* Deselect all possible packages */ 843 /* Deselect all possible packages */
809 nca.type = NCSI_PKT_CMD_DP; 844 nca.type = NCSI_PKT_CMD_DP;
810 nca.channel = 0x1f; 845 nca.channel = NCSI_RESERVED_CHANNEL;
811 for (index = 0; index < 8; index++) { 846 for (index = 0; index < 8; index++) {
812 nca.package = index; 847 nca.package = index;
813 ret = ncsi_xmit_cmd(&nca); 848 ret = ncsi_xmit_cmd(&nca);
@@ -823,7 +858,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
823 /* Select all possible packages */ 858 /* Select all possible packages */
824 nca.type = NCSI_PKT_CMD_SP; 859 nca.type = NCSI_PKT_CMD_SP;
825 nca.bytes[0] = 1; 860 nca.bytes[0] = 1;
826 nca.channel = 0x1f; 861 nca.channel = NCSI_RESERVED_CHANNEL;
827 for (index = 0; index < 8; index++) { 862 for (index = 0; index < 8; index++) {
828 nca.package = index; 863 nca.package = index;
829 ret = ncsi_xmit_cmd(&nca); 864 ret = ncsi_xmit_cmd(&nca);
@@ -876,7 +911,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
876 nca.type = NCSI_PKT_CMD_SP; 911 nca.type = NCSI_PKT_CMD_SP;
877 nca.bytes[0] = 1; 912 nca.bytes[0] = 1;
878 nca.package = ndp->active_package->id; 913 nca.package = ndp->active_package->id;
879 nca.channel = 0x1f; 914 nca.channel = NCSI_RESERVED_CHANNEL;
880 ret = ncsi_xmit_cmd(&nca); 915 ret = ncsi_xmit_cmd(&nca);
881 if (ret) 916 if (ret)
882 goto error; 917 goto error;
@@ -884,12 +919,12 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
884 nd->state = ncsi_dev_state_probe_cis; 919 nd->state = ncsi_dev_state_probe_cis;
885 break; 920 break;
886 case ncsi_dev_state_probe_cis: 921 case ncsi_dev_state_probe_cis:
887 ndp->pending_req_num = 32; 922 ndp->pending_req_num = NCSI_RESERVED_CHANNEL;
888 923
889 /* Clear initial state */ 924 /* Clear initial state */
890 nca.type = NCSI_PKT_CMD_CIS; 925 nca.type = NCSI_PKT_CMD_CIS;
891 nca.package = ndp->active_package->id; 926 nca.package = ndp->active_package->id;
892 for (index = 0; index < 0x20; index++) { 927 for (index = 0; index < NCSI_RESERVED_CHANNEL; index++) {
893 nca.channel = index; 928 nca.channel = index;
894 ret = ncsi_xmit_cmd(&nca); 929 ret = ncsi_xmit_cmd(&nca);
895 if (ret) 930 if (ret)
@@ -933,7 +968,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
933 /* Deselect the active package */ 968 /* Deselect the active package */
934 nca.type = NCSI_PKT_CMD_DP; 969 nca.type = NCSI_PKT_CMD_DP;
935 nca.package = ndp->active_package->id; 970 nca.package = ndp->active_package->id;
936 nca.channel = 0x1f; 971 nca.channel = NCSI_RESERVED_CHANNEL;
937 ret = ncsi_xmit_cmd(&nca); 972 ret = ncsi_xmit_cmd(&nca);
938 if (ret) 973 if (ret)
939 goto error; 974 goto error;
@@ -987,11 +1022,14 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp)
987 goto out; 1022 goto out;
988 } 1023 }
989 1024
990 old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE);
991 list_del_init(&nc->link); 1025 list_del_init(&nc->link);
992
993 spin_unlock_irqrestore(&ndp->lock, flags); 1026 spin_unlock_irqrestore(&ndp->lock, flags);
994 1027
1028 spin_lock_irqsave(&nc->lock, flags);
1029 old_state = nc->state;
1030 nc->state = NCSI_CHANNEL_INVISIBLE;
1031 spin_unlock_irqrestore(&nc->lock, flags);
1032
995 ndp->active_channel = nc; 1033 ndp->active_channel = nc;
996 ndp->active_package = nc->package; 1034 ndp->active_package = nc->package;
997 1035
@@ -1006,7 +1044,7 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp)
1006 break; 1044 break;
1007 default: 1045 default:
1008 netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n", 1046 netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n",
1009 nc->state, nc->package->id, nc->id); 1047 old_state, nc->package->id, nc->id);
1010 ncsi_report_link(ndp, false); 1048 ncsi_report_link(ndp, false);
1011 return -EINVAL; 1049 return -EINVAL;
1012 } 1050 }
@@ -1070,7 +1108,7 @@ static int ncsi_inet6addr_event(struct notifier_block *this,
1070 return NOTIFY_OK; 1108 return NOTIFY_OK;
1071 1109
1072 nca.ndp = ndp; 1110 nca.ndp = ndp;
1073 nca.driven = false; 1111 nca.req_flags = 0;
1074 nca.package = np->id; 1112 nca.package = np->id;
1075 nca.channel = nc->id; 1113 nca.channel = nc->id;
1076 nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; 1114 nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
@@ -1118,7 +1156,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
1118 /* Initialize private NCSI device */ 1156 /* Initialize private NCSI device */
1119 spin_lock_init(&ndp->lock); 1157 spin_lock_init(&ndp->lock);
1120 INIT_LIST_HEAD(&ndp->packages); 1158 INIT_LIST_HEAD(&ndp->packages);
1121 ndp->request_id = 0; 1159 ndp->request_id = NCSI_REQ_START_IDX;
1122 for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) { 1160 for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) {
1123 ndp->requests[i].id = i; 1161 ndp->requests[i].id = i;
1124 ndp->requests[i].ndp = ndp; 1162 ndp->requests[i].ndp = ndp;
@@ -1149,9 +1187,7 @@ EXPORT_SYMBOL_GPL(ncsi_register_dev);
1149int ncsi_start_dev(struct ncsi_dev *nd) 1187int ncsi_start_dev(struct ncsi_dev *nd)
1150{ 1188{
1151 struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); 1189 struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
1152 struct ncsi_package *np; 1190 int ret;
1153 struct ncsi_channel *nc;
1154 int old_state, ret;
1155 1191
1156 if (nd->state != ncsi_dev_state_registered && 1192 if (nd->state != ncsi_dev_state_registered &&
1157 nd->state != ncsi_dev_state_functional) 1193 nd->state != ncsi_dev_state_functional)
@@ -1163,15 +1199,6 @@ int ncsi_start_dev(struct ncsi_dev *nd)
1163 return 0; 1199 return 0;
1164 } 1200 }
1165 1201
1166 /* Reset channel's state and start over */
1167 NCSI_FOR_EACH_PACKAGE(ndp, np) {
1168 NCSI_FOR_EACH_CHANNEL(np, nc) {
1169 old_state = xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
1170 WARN_ON_ONCE(!list_empty(&nc->link) ||
1171 old_state == NCSI_CHANNEL_INVISIBLE);
1172 }
1173 }
1174
1175 if (ndp->flags & NCSI_DEV_HWA) 1202 if (ndp->flags & NCSI_DEV_HWA)
1176 ret = ncsi_enable_hwa(ndp); 1203 ret = ncsi_enable_hwa(ndp);
1177 else 1204 else
@@ -1181,6 +1208,35 @@ int ncsi_start_dev(struct ncsi_dev *nd)
1181} 1208}
1182EXPORT_SYMBOL_GPL(ncsi_start_dev); 1209EXPORT_SYMBOL_GPL(ncsi_start_dev);
1183 1210
1211void ncsi_stop_dev(struct ncsi_dev *nd)
1212{
1213 struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
1214 struct ncsi_package *np;
1215 struct ncsi_channel *nc;
1216 bool chained;
1217 int old_state;
1218 unsigned long flags;
1219
1220 /* Stop the channel monitor and reset channel's state */
1221 NCSI_FOR_EACH_PACKAGE(ndp, np) {
1222 NCSI_FOR_EACH_CHANNEL(np, nc) {
1223 ncsi_stop_channel_monitor(nc);
1224
1225 spin_lock_irqsave(&nc->lock, flags);
1226 chained = !list_empty(&nc->link);
1227 old_state = nc->state;
1228 nc->state = NCSI_CHANNEL_INACTIVE;
1229 spin_unlock_irqrestore(&nc->lock, flags);
1230
1231 WARN_ON_ONCE(chained ||
1232 old_state == NCSI_CHANNEL_INVISIBLE);
1233 }
1234 }
1235
1236 ncsi_report_link(ndp, true);
1237}
1238EXPORT_SYMBOL_GPL(ncsi_stop_dev);
1239
1184void ncsi_unregister_dev(struct ncsi_dev *nd) 1240void ncsi_unregister_dev(struct ncsi_dev *nd)
1185{ 1241{
1186 struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); 1242 struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
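
The ncsi_alloc_request() hunk above replaces the wrap-to-zero cursor with an allocator that never hands out slot 0: scan from the saved cursor to the end of the table, fall back to scanning from NCSI_REQ_START_IDX, and remember the next cursor on success. A userspace model of just that search; the table size and the 'used' flags are stand-ins for the kernel structures.

#include <stdbool.h>
#include <stdio.h>

#define NCSI_REQ_START_IDX	1
#define NCSI_REQ_TABLE_SIZE	256

static bool used[NCSI_REQ_TABLE_SIZE];
static unsigned int request_id = NCSI_REQ_START_IDX;

static int alloc_request_id(void)
{
	/* check from the cursor up to the ceiling */
	for (unsigned int i = request_id; i < NCSI_REQ_TABLE_SIZE; i++) {
		if (used[i])
			continue;
		used[i] = true;
		request_id = i + 1;
		return i;
	}

	/* fall back to checking from the starting index */
	for (unsigned int i = NCSI_REQ_START_IDX; i < request_id; i++) {
		if (used[i])
			continue;
		used[i] = true;
		request_id = i + 1;
		return i;
	}

	return -1;	/* table exhausted */
}

int main(void)
{
	int a = alloc_request_id();
	int b = alloc_request_id();

	printf("first=%d second=%d\n", a, b);	/* 1 and 2 */
	return 0;
}
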
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index af84389a6bf1..087db775b3dc 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -317,12 +317,12 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr)
317 ncm->data[3] = ntohl(rsp->other); 317 ncm->data[3] = ntohl(rsp->other);
318 ncm->data[4] = ntohl(rsp->oem_status); 318 ncm->data[4] = ntohl(rsp->oem_status);
319 319
320 if (nr->driven) 320 if (nr->flags & NCSI_REQ_FLAG_EVENT_DRIVEN)
321 return 0; 321 return 0;
322 322
323 /* Reset the channel monitor if it has been enabled */ 323 /* Reset the channel monitor if it has been enabled */
324 spin_lock_irqsave(&nc->lock, flags); 324 spin_lock_irqsave(&nc->lock, flags);
325 nc->timeout = 0; 325 nc->monitor.state = NCSI_CHANNEL_MONITOR_START;
326 spin_unlock_irqrestore(&nc->lock, flags); 326 spin_unlock_irqrestore(&nc->lock, flags);
327 327
328 return 0; 328 return 0;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 9266ceebd112..e8d56d9a4df2 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -474,6 +474,12 @@ config NFT_META
474 This option adds the "meta" expression that you can use to match and 474 This option adds the "meta" expression that you can use to match and
475 to set packet metainformation such as the packet mark. 475 to set packet metainformation such as the packet mark.
476 476
477config NFT_NUMGEN
478 tristate "Netfilter nf_tables number generator module"
479 help
480 This option adds the number generator expression used to perform
481 incremental counting and random numbers bound to an upper limit.
482
477config NFT_CT 483config NFT_CT
478 depends on NF_CONNTRACK 484 depends on NF_CONNTRACK
479 tristate "Netfilter nf_tables conntrack module" 485 tristate "Netfilter nf_tables conntrack module"
@@ -481,13 +487,13 @@ config NFT_CT
481 This option adds the "ct" expression that you can use to match 487 This option adds the "ct" expression that you can use to match
482 connection tracking information such as the flow state. 488 connection tracking information such as the flow state.
483 489
484config NFT_RBTREE 490config NFT_SET_RBTREE
485 tristate "Netfilter nf_tables rbtree set module" 491 tristate "Netfilter nf_tables rbtree set module"
486 help 492 help
487 This option adds the "rbtree" set type (Red Black tree) that is used 493 This option adds the "rbtree" set type (Red Black tree) that is used
488 to build interval-based sets. 494 to build interval-based sets.
489 495
490config NFT_HASH 496config NFT_SET_HASH
491 tristate "Netfilter nf_tables hash set module" 497 tristate "Netfilter nf_tables hash set module"
492 help 498 help
493 This option adds the "hash" set type that is used to build one-way 499 This option adds the "hash" set type that is used to build one-way
@@ -542,6 +548,12 @@ config NFT_QUEUE
542 This is required if you intend to use the userspace queueing 548 This is required if you intend to use the userspace queueing
543 infrastructure (also known as NFQUEUE) from nftables. 549 infrastructure (also known as NFQUEUE) from nftables.
544 550
551config NFT_QUOTA
552 tristate "Netfilter nf_tables quota module"
553 help
 554	  This option adds the "quota" expression that you can use to
 555	  enforce byte quotas.
556
545config NFT_REJECT 557config NFT_REJECT
546 default m if NETFILTER_ADVANCED=n 558 default m if NETFILTER_ADVANCED=n
547 tristate "Netfilter nf_tables reject support" 559 tristate "Netfilter nf_tables reject support"
@@ -563,6 +575,12 @@ config NFT_COMPAT
563 x_tables match/target extensions over the nf_tables 575 x_tables match/target extensions over the nf_tables
564 framework. 576 framework.
565 577
578config NFT_HASH
579 tristate "Netfilter nf_tables hash module"
580 help
581 This option adds the "hash" expression that you can use to perform
582 a hash operation on registers.
583
566if NF_TABLES_NETDEV 584if NF_TABLES_NETDEV
567 585
568config NF_DUP_NETDEV 586config NF_DUP_NETDEV
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 69134541d65b..c23c3c84416f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -71,8 +71,9 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
71 71
72# nf_tables 72# nf_tables
73nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o 73nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o
74nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o 74nf_tables-objs += nft_immediate.o nft_cmp.o nft_range.o
75nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o 75nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
76nf_tables-objs += nft_lookup.o nft_dynset.o
76 77
77obj-$(CONFIG_NF_TABLES) += nf_tables.o 78obj-$(CONFIG_NF_TABLES) += nf_tables.o
78obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o 79obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o
@@ -80,18 +81,21 @@ obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o
80obj-$(CONFIG_NFT_COMPAT) += nft_compat.o 81obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
81obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o 82obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o
82obj-$(CONFIG_NFT_META) += nft_meta.o 83obj-$(CONFIG_NFT_META) += nft_meta.o
84obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
83obj-$(CONFIG_NFT_CT) += nft_ct.o 85obj-$(CONFIG_NFT_CT) += nft_ct.o
84obj-$(CONFIG_NFT_LIMIT) += nft_limit.o 86obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
85obj-$(CONFIG_NFT_NAT) += nft_nat.o 87obj-$(CONFIG_NFT_NAT) += nft_nat.o
86obj-$(CONFIG_NFT_QUEUE) += nft_queue.o 88obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
89obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
87obj-$(CONFIG_NFT_REJECT) += nft_reject.o 90obj-$(CONFIG_NFT_REJECT) += nft_reject.o
88obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o 91obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
89obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o 92obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o
90obj-$(CONFIG_NFT_HASH) += nft_hash.o 93obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o
91obj-$(CONFIG_NFT_COUNTER) += nft_counter.o 94obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
92obj-$(CONFIG_NFT_LOG) += nft_log.o 95obj-$(CONFIG_NFT_LOG) += nft_log.o
93obj-$(CONFIG_NFT_MASQ) += nft_masq.o 96obj-$(CONFIG_NFT_MASQ) += nft_masq.o
94obj-$(CONFIG_NFT_REDIR) += nft_redir.o 97obj-$(CONFIG_NFT_REDIR) += nft_redir.o
98obj-$(CONFIG_NFT_HASH) += nft_hash.o
95 99
96# nf_tables netdev 100# nf_tables netdev
97obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o 101obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index f39276d1c2d7..fcb5d1df11e9 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -22,6 +22,7 @@
22#include <linux/proc_fs.h> 22#include <linux/proc_fs.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/rcupdate.h>
25#include <net/net_namespace.h> 26#include <net/net_namespace.h>
26#include <net/sock.h> 27#include <net/sock.h>
27 28
@@ -61,33 +62,41 @@ EXPORT_SYMBOL(nf_hooks_needed);
61#endif 62#endif
62 63
63static DEFINE_MUTEX(nf_hook_mutex); 64static DEFINE_MUTEX(nf_hook_mutex);
65#define nf_entry_dereference(e) \
66 rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex))
64 67
65static struct list_head *nf_find_hook_list(struct net *net, 68static struct nf_hook_entry __rcu **nf_hook_entry_head(struct net *net, const struct nf_hook_ops *reg)
66 const struct nf_hook_ops *reg)
67{ 69{
68 struct list_head *hook_list = NULL;
69
70 if (reg->pf != NFPROTO_NETDEV) 70 if (reg->pf != NFPROTO_NETDEV)
71 hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; 71 return net->nf.hooks[reg->pf]+reg->hooknum;
72 else if (reg->hooknum == NF_NETDEV_INGRESS) { 72
73#ifdef CONFIG_NETFILTER_INGRESS 73#ifdef CONFIG_NETFILTER_INGRESS
74 if (reg->hooknum == NF_NETDEV_INGRESS) {
74 if (reg->dev && dev_net(reg->dev) == net) 75 if (reg->dev && dev_net(reg->dev) == net)
75 hook_list = &reg->dev->nf_hooks_ingress; 76 return &reg->dev->nf_hooks_ingress;
76#endif
77 } 77 }
78 return hook_list; 78#endif
79 return NULL;
79} 80}
80 81
81struct nf_hook_entry {
82 const struct nf_hook_ops *orig_ops;
83 struct nf_hook_ops ops;
84};
85
86int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) 82int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
87{ 83{
88 struct list_head *hook_list; 84 struct nf_hook_entry __rcu **pp;
89 struct nf_hook_entry *entry; 85 struct nf_hook_entry *entry, *p;
90 struct nf_hook_ops *elem; 86
87 if (reg->pf == NFPROTO_NETDEV) {
88#ifndef CONFIG_NETFILTER_INGRESS
89 if (reg->hooknum == NF_NETDEV_INGRESS)
90 return -EOPNOTSUPP;
91#endif
92 if (reg->hooknum != NF_NETDEV_INGRESS ||
93 !reg->dev || dev_net(reg->dev) != net)
94 return -EINVAL;
95 }
96
97 pp = nf_hook_entry_head(net, reg);
98 if (!pp)
99 return -EINVAL;
91 100
92 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 101 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
93 if (!entry) 102 if (!entry)
@@ -95,19 +104,19 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
95 104
96 entry->orig_ops = reg; 105 entry->orig_ops = reg;
97 entry->ops = *reg; 106 entry->ops = *reg;
98 107 entry->next = NULL;
99 hook_list = nf_find_hook_list(net, reg);
100 if (!hook_list) {
101 kfree(entry);
102 return -ENOENT;
103 }
104 108
105 mutex_lock(&nf_hook_mutex); 109 mutex_lock(&nf_hook_mutex);
106 list_for_each_entry(elem, hook_list, list) { 110
107 if (reg->priority < elem->priority) 111 /* Find the spot in the list */
112 while ((p = nf_entry_dereference(*pp)) != NULL) {
113 if (reg->priority < p->orig_ops->priority)
108 break; 114 break;
115 pp = &p->next;
109 } 116 }
110 list_add_rcu(&entry->ops.list, elem->list.prev); 117 rcu_assign_pointer(entry->next, p);
118 rcu_assign_pointer(*pp, entry);
119
111 mutex_unlock(&nf_hook_mutex); 120 mutex_unlock(&nf_hook_mutex);
112#ifdef CONFIG_NETFILTER_INGRESS 121#ifdef CONFIG_NETFILTER_INGRESS
113 if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) 122 if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
@@ -122,24 +131,23 @@ EXPORT_SYMBOL(nf_register_net_hook);
122 131
123void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) 132void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
124{ 133{
125 struct list_head *hook_list; 134 struct nf_hook_entry __rcu **pp;
126 struct nf_hook_entry *entry; 135 struct nf_hook_entry *p;
127 struct nf_hook_ops *elem;
128 136
129 hook_list = nf_find_hook_list(net, reg); 137 pp = nf_hook_entry_head(net, reg);
130 if (!hook_list) 138 if (WARN_ON_ONCE(!pp))
131 return; 139 return;
132 140
133 mutex_lock(&nf_hook_mutex); 141 mutex_lock(&nf_hook_mutex);
134 list_for_each_entry(elem, hook_list, list) { 142 while ((p = nf_entry_dereference(*pp)) != NULL) {
135 entry = container_of(elem, struct nf_hook_entry, ops); 143 if (p->orig_ops == reg) {
136 if (entry->orig_ops == reg) { 144 rcu_assign_pointer(*pp, p->next);
137 list_del_rcu(&entry->ops.list);
138 break; 145 break;
139 } 146 }
147 pp = &p->next;
140 } 148 }
141 mutex_unlock(&nf_hook_mutex); 149 mutex_unlock(&nf_hook_mutex);
142 if (&elem->list == hook_list) { 150 if (!p) {
143 WARN(1, "nf_unregister_net_hook: hook not found!\n"); 151 WARN(1, "nf_unregister_net_hook: hook not found!\n");
144 return; 152 return;
145 } 153 }
@@ -151,10 +159,10 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
151 static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); 159 static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
152#endif 160#endif
153 synchronize_net(); 161 synchronize_net();
154 nf_queue_nf_hook_drop(net, &entry->ops); 162 nf_queue_nf_hook_drop(net, p);
155 /* other cpu might still process nfqueue verdict that used reg */ 163 /* other cpu might still process nfqueue verdict that used reg */
156 synchronize_net(); 164 synchronize_net();
157 kfree(entry); 165 kfree(p);
158} 166}
159EXPORT_SYMBOL(nf_unregister_net_hook); 167EXPORT_SYMBOL(nf_unregister_net_hook);
160 168
@@ -188,19 +196,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks);
188 196
189static LIST_HEAD(nf_hook_list); 197static LIST_HEAD(nf_hook_list);
190 198
191int nf_register_hook(struct nf_hook_ops *reg) 199static int _nf_register_hook(struct nf_hook_ops *reg)
192{ 200{
193 struct net *net, *last; 201 struct net *net, *last;
194 int ret; 202 int ret;
195 203
196 rtnl_lock();
197 for_each_net(net) { 204 for_each_net(net) {
198 ret = nf_register_net_hook(net, reg); 205 ret = nf_register_net_hook(net, reg);
199 if (ret && ret != -ENOENT) 206 if (ret && ret != -ENOENT)
200 goto rollback; 207 goto rollback;
201 } 208 }
202 list_add_tail(&reg->list, &nf_hook_list); 209 list_add_tail(&reg->list, &nf_hook_list);
203 rtnl_unlock();
204 210
205 return 0; 211 return 0;
206rollback: 212rollback:
@@ -210,19 +216,34 @@ rollback:
210 break; 216 break;
211 nf_unregister_net_hook(net, reg); 217 nf_unregister_net_hook(net, reg);
212 } 218 }
219 return ret;
220}
221
222int nf_register_hook(struct nf_hook_ops *reg)
223{
224 int ret;
225
226 rtnl_lock();
227 ret = _nf_register_hook(reg);
213 rtnl_unlock(); 228 rtnl_unlock();
229
214 return ret; 230 return ret;
215} 231}
216EXPORT_SYMBOL(nf_register_hook); 232EXPORT_SYMBOL(nf_register_hook);
217 233
218void nf_unregister_hook(struct nf_hook_ops *reg) 234static void _nf_unregister_hook(struct nf_hook_ops *reg)
219{ 235{
220 struct net *net; 236 struct net *net;
221 237
222 rtnl_lock();
223 list_del(&reg->list); 238 list_del(&reg->list);
224 for_each_net(net) 239 for_each_net(net)
225 nf_unregister_net_hook(net, reg); 240 nf_unregister_net_hook(net, reg);
241}
242
243void nf_unregister_hook(struct nf_hook_ops *reg)
244{
245 rtnl_lock();
246 _nf_unregister_hook(reg);
226 rtnl_unlock(); 247 rtnl_unlock();
227} 248}
228EXPORT_SYMBOL(nf_unregister_hook); 249EXPORT_SYMBOL(nf_unregister_hook);
@@ -246,6 +267,26 @@ err:
246} 267}
247EXPORT_SYMBOL(nf_register_hooks); 268EXPORT_SYMBOL(nf_register_hooks);
248 269
270/* Caller MUST take rtnl_lock() */
271int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
272{
273 unsigned int i;
274 int err = 0;
275
276 for (i = 0; i < n; i++) {
277 err = _nf_register_hook(&reg[i]);
278 if (err)
279 goto err;
280 }
281 return err;
282
283err:
284 if (i > 0)
285 _nf_unregister_hooks(reg, i);
286 return err;
287}
288EXPORT_SYMBOL(_nf_register_hooks);
289
249void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) 290void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
250{ 291{
251 while (n-- > 0) 292 while (n-- > 0)
@@ -253,10 +294,17 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
253} 294}
254EXPORT_SYMBOL(nf_unregister_hooks); 295EXPORT_SYMBOL(nf_unregister_hooks);
255 296
256unsigned int nf_iterate(struct list_head *head, 297/* Caller MUST take rtnl_lock */
257 struct sk_buff *skb, 298void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
299{
300 while (n-- > 0)
301 _nf_unregister_hook(&reg[n]);
302}
303EXPORT_SYMBOL(_nf_unregister_hooks);
304
305unsigned int nf_iterate(struct sk_buff *skb,
258 struct nf_hook_state *state, 306 struct nf_hook_state *state,
259 struct nf_hook_ops **elemp) 307 struct nf_hook_entry **entryp)
260{ 308{
261 unsigned int verdict; 309 unsigned int verdict;
262 310
@@ -264,20 +312,23 @@ unsigned int nf_iterate(struct list_head *head,
264 * The caller must not block between calls to this 312 * The caller must not block between calls to this
265 * function because of risk of continuing from deleted element. 313 * function because of risk of continuing from deleted element.
266 */ 314 */
267 list_for_each_entry_continue_rcu((*elemp), head, list) { 315 while (*entryp) {
268 if (state->thresh > (*elemp)->priority) 316 if (state->thresh > (*entryp)->ops.priority) {
317 *entryp = rcu_dereference((*entryp)->next);
269 continue; 318 continue;
319 }
270 320
271 /* Optimization: we don't need to hold module 321 /* Optimization: we don't need to hold module
272 reference here, since function can't sleep. --RR */ 322 reference here, since function can't sleep. --RR */
273repeat: 323repeat:
274 verdict = (*elemp)->hook((*elemp)->priv, skb, state); 324 verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state);
275 if (verdict != NF_ACCEPT) { 325 if (verdict != NF_ACCEPT) {
276#ifdef CONFIG_NETFILTER_DEBUG 326#ifdef CONFIG_NETFILTER_DEBUG
277 if (unlikely((verdict & NF_VERDICT_MASK) 327 if (unlikely((verdict & NF_VERDICT_MASK)
278 > NF_MAX_VERDICT)) { 328 > NF_MAX_VERDICT)) {
279 NFDEBUG("Evil return from %p(%u).\n", 329 NFDEBUG("Evil return from %p(%u).\n",
280 (*elemp)->hook, state->hook); 330 (*entryp)->ops.hook, state->hook);
331 *entryp = rcu_dereference((*entryp)->next);
281 continue; 332 continue;
282 } 333 }
283#endif 334#endif
@@ -285,25 +336,23 @@ repeat:
285 return verdict; 336 return verdict;
286 goto repeat; 337 goto repeat;
287 } 338 }
339 *entryp = rcu_dereference((*entryp)->next);
288 } 340 }
289 return NF_ACCEPT; 341 return NF_ACCEPT;
290} 342}
291 343
292 344
293/* Returns 1 if okfn() needs to be executed by the caller, 345/* Returns 1 if okfn() needs to be executed by the caller,
294 * -EPERM for NF_DROP, 0 otherwise. */ 346 * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */
295int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) 347int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
296{ 348{
297 struct nf_hook_ops *elem; 349 struct nf_hook_entry *entry;
298 unsigned int verdict; 350 unsigned int verdict;
299 int ret = 0; 351 int ret = 0;
300 352
301 /* We may already have this, but read-locks nest anyway */ 353 entry = rcu_dereference(state->hook_entries);
302 rcu_read_lock();
303
304 elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list);
305next_hook: 354next_hook:
306 verdict = nf_iterate(state->hook_list, skb, state, &elem); 355 verdict = nf_iterate(skb, state, &entry);
307 if (verdict == NF_ACCEPT || verdict == NF_STOP) { 356 if (verdict == NF_ACCEPT || verdict == NF_STOP) {
308 ret = 1; 357 ret = 1;
309 } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { 358 } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
@@ -312,8 +361,10 @@ next_hook:
312 if (ret == 0) 361 if (ret == 0)
313 ret = -EPERM; 362 ret = -EPERM;
314 } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { 363 } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
315 int err = nf_queue(skb, elem, state, 364 int err;
316 verdict >> NF_VERDICT_QBITS); 365
366 RCU_INIT_POINTER(state->hook_entries, entry);
367 err = nf_queue(skb, state, verdict >> NF_VERDICT_QBITS);
317 if (err < 0) { 368 if (err < 0) {
318 if (err == -ESRCH && 369 if (err == -ESRCH &&
319 (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) 370 (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
@@ -321,7 +372,6 @@ next_hook:
321 kfree_skb(skb); 372 kfree_skb(skb);
322 } 373 }
323 } 374 }
324 rcu_read_unlock();
325 return ret; 375 return ret;
326} 376}
327EXPORT_SYMBOL(nf_hook_slow); 377EXPORT_SYMBOL(nf_hook_slow);
@@ -441,7 +491,7 @@ static int __net_init netfilter_net_init(struct net *net)
441 491
442 for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { 492 for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
443 for (h = 0; h < NF_MAX_HOOKS; h++) 493 for (h = 0; h < NF_MAX_HOOKS; h++)
444 INIT_LIST_HEAD(&net->nf.hooks[i][h]); 494 RCU_INIT_POINTER(net->nf.hooks[i][h], NULL);
445 } 495 }
446 496
447#ifdef CONFIG_PROC_FS 497#ifdef CONFIG_PROC_FS
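The core.c changes above replace the per-hook struct list_head with a singly linked chain of nf_hook_entry objects whose head is an RCU pointer: registration walks a pointer-to-pointer until it reaches the first entry with a higher priority and splices the new node in, unregistration unlinks by matching orig_ops, and nf_iterate() simply follows ->next. A small self-contained model of that priority-ordered, pointer-to-pointer list handling is sketched below; plain pointers stand in for the RCU-annotated ones and all names are invented for the sketch.

#include <stdio.h>
#include <stdlib.h>

struct hook_entry {
	int priority;
	struct hook_entry *next;
};

/* Insert a new entry keeping the chain sorted by ascending priority,
 * mirroring the pp/p walk in the new nf_register_net_hook(). */
static int hook_insert(struct hook_entry **head, int priority)
{
	struct hook_entry **pp = head, *p, *entry;

	entry = malloc(sizeof(*entry));
	if (!entry)
		return -1;
	entry->priority = priority;

	while ((p = *pp) != NULL) {
		if (priority < p->priority)
			break;
		pp = &p->next;
	}
	entry->next = p;    /* kernel: rcu_assign_pointer(entry->next, p) */
	*pp = entry;        /* kernel: rcu_assign_pointer(*pp, entry)     */
	return 0;
}

/* Unlink the first entry with a matching priority, like the
 * unregister walk that compares p->orig_ops against reg. */
static void hook_remove(struct hook_entry **head, int priority)
{
	struct hook_entry **pp = head, *p;

	while ((p = *pp) != NULL) {
		if (p->priority == priority) {
			*pp = p->next;
			free(p);
			return;
		}
		pp = &p->next;
	}
}

int main(void)
{
	struct hook_entry *head = NULL, *p;

	hook_insert(&head, 10);
	hook_insert(&head, -100);
	hook_insert(&head, 0);
	hook_remove(&head, 0);

	for (p = head; p; p = p->next)
		printf("%d\n", p->priority);   /* -100, then 10 */
	return 0;
}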
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index f04fd8df210b..fc230d99aa3b 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -281,13 +281,10 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
281 h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple); 281 h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
282 if (h) { 282 if (h) {
283 ct = nf_ct_tuplehash_to_ctrack(h); 283 ct = nf_ct_tuplehash_to_ctrack(h);
284 /* Show what happens instead of calling nf_ct_kill() */ 284 if (nf_ct_kill(ct)) {
285 if (del_timer(&ct->timeout)) { 285 IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple="
286 IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
287 FMT_TUPLE "\n", 286 FMT_TUPLE "\n",
288 __func__, ct, ARG_TUPLE(&tuple)); 287 __func__, ct, ARG_TUPLE(&tuple));
289 if (ct->timeout.function)
290 ct->timeout.function(ct->timeout.data);
291 } else { 288 } else {
292 IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" 289 IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
293 FMT_TUPLE "\n", 290 FMT_TUPLE "\n",
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 9934b0c93c1e..ba6a1d421222 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -72,12 +72,24 @@ EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
72struct hlist_nulls_head *nf_conntrack_hash __read_mostly; 72struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
73EXPORT_SYMBOL_GPL(nf_conntrack_hash); 73EXPORT_SYMBOL_GPL(nf_conntrack_hash);
74 74
75struct conntrack_gc_work {
76 struct delayed_work dwork;
77 u32 last_bucket;
78 bool exiting;
79};
80
75static __read_mostly struct kmem_cache *nf_conntrack_cachep; 81static __read_mostly struct kmem_cache *nf_conntrack_cachep;
76static __read_mostly spinlock_t nf_conntrack_locks_all_lock; 82static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
77static __read_mostly seqcount_t nf_conntrack_generation;
78static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); 83static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
79static __read_mostly bool nf_conntrack_locks_all; 84static __read_mostly bool nf_conntrack_locks_all;
80 85
86#define GC_MAX_BUCKETS_DIV 64u
87#define GC_MAX_BUCKETS 8192u
88#define GC_INTERVAL (5 * HZ)
89#define GC_MAX_EVICTS 256u
90
91static struct conntrack_gc_work conntrack_gc_work;
92
81void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) 93void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
82{ 94{
83 spin_lock(lock); 95 spin_lock(lock);
@@ -164,7 +176,7 @@ unsigned int nf_conntrack_htable_size __read_mostly;
164EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); 176EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
165 177
166unsigned int nf_conntrack_max __read_mostly; 178unsigned int nf_conntrack_max __read_mostly;
167EXPORT_SYMBOL_GPL(nf_conntrack_max); 179seqcount_t nf_conntrack_generation __read_mostly;
168 180
169DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); 181DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
170EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); 182EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
@@ -367,12 +379,10 @@ static void
367destroy_conntrack(struct nf_conntrack *nfct) 379destroy_conntrack(struct nf_conntrack *nfct)
368{ 380{
369 struct nf_conn *ct = (struct nf_conn *)nfct; 381 struct nf_conn *ct = (struct nf_conn *)nfct;
370 struct net *net = nf_ct_net(ct);
371 struct nf_conntrack_l4proto *l4proto; 382 struct nf_conntrack_l4proto *l4proto;
372 383
373 pr_debug("destroy_conntrack(%p)\n", ct); 384 pr_debug("destroy_conntrack(%p)\n", ct);
374 NF_CT_ASSERT(atomic_read(&nfct->use) == 0); 385 NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
375 NF_CT_ASSERT(!timer_pending(&ct->timeout));
376 386
377 if (unlikely(nf_ct_is_template(ct))) { 387 if (unlikely(nf_ct_is_template(ct))) {
378 nf_ct_tmpl_free(ct); 388 nf_ct_tmpl_free(ct);
@@ -395,7 +405,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
395 405
396 nf_ct_del_from_dying_or_unconfirmed_list(ct); 406 nf_ct_del_from_dying_or_unconfirmed_list(ct);
397 407
398 NF_CT_STAT_INC(net, delete);
399 local_bh_enable(); 408 local_bh_enable();
400 409
401 if (ct->master) 410 if (ct->master)
@@ -427,7 +436,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
427 436
428 nf_ct_add_to_dying_list(ct); 437 nf_ct_add_to_dying_list(ct);
429 438
430 NF_CT_STAT_INC(net, delete_list);
431 local_bh_enable(); 439 local_bh_enable();
432} 440}
433 441
@@ -435,35 +443,30 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
435{ 443{
436 struct nf_conn_tstamp *tstamp; 444 struct nf_conn_tstamp *tstamp;
437 445
446 if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
447 return false;
448
438 tstamp = nf_conn_tstamp_find(ct); 449 tstamp = nf_conn_tstamp_find(ct);
439 if (tstamp && tstamp->stop == 0) 450 if (tstamp && tstamp->stop == 0)
440 tstamp->stop = ktime_get_real_ns(); 451 tstamp->stop = ktime_get_real_ns();
441 452
442 if (nf_ct_is_dying(ct))
443 goto delete;
444
445 if (nf_conntrack_event_report(IPCT_DESTROY, ct, 453 if (nf_conntrack_event_report(IPCT_DESTROY, ct,
446 portid, report) < 0) { 454 portid, report) < 0) {
447 /* destroy event was not delivered */ 455 /* destroy event was not delivered. nf_ct_put will
456 * be done by event cache worker on redelivery.
457 */
448 nf_ct_delete_from_lists(ct); 458 nf_ct_delete_from_lists(ct);
449 nf_conntrack_ecache_delayed_work(nf_ct_net(ct)); 459 nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
450 return false; 460 return false;
451 } 461 }
452 462
453 nf_conntrack_ecache_work(nf_ct_net(ct)); 463 nf_conntrack_ecache_work(nf_ct_net(ct));
454 set_bit(IPS_DYING_BIT, &ct->status);
455 delete:
456 nf_ct_delete_from_lists(ct); 464 nf_ct_delete_from_lists(ct);
457 nf_ct_put(ct); 465 nf_ct_put(ct);
458 return true; 466 return true;
459} 467}
460EXPORT_SYMBOL_GPL(nf_ct_delete); 468EXPORT_SYMBOL_GPL(nf_ct_delete);
461 469
462static void death_by_timeout(unsigned long ul_conntrack)
463{
464 nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
465}
466
467static inline bool 470static inline bool
468nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, 471nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
469 const struct nf_conntrack_tuple *tuple, 472 const struct nf_conntrack_tuple *tuple,
@@ -481,22 +484,17 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
481 net_eq(net, nf_ct_net(ct)); 484 net_eq(net, nf_ct_net(ct));
482} 485}
483 486
484/* must be called with rcu read lock held */ 487/* caller must hold rcu readlock and none of the nf_conntrack_locks */
485void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) 488static void nf_ct_gc_expired(struct nf_conn *ct)
486{ 489{
487 struct hlist_nulls_head *hptr; 490 if (!atomic_inc_not_zero(&ct->ct_general.use))
488 unsigned int sequence, hsz; 491 return;
489 492
490 do { 493 if (nf_ct_should_gc(ct))
491 sequence = read_seqcount_begin(&nf_conntrack_generation); 494 nf_ct_kill(ct);
492 hsz = nf_conntrack_htable_size;
493 hptr = nf_conntrack_hash;
494 } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
495 495
496 *hash = hptr; 496 nf_ct_put(ct);
497 *hsize = hsz;
498} 497}
499EXPORT_SYMBOL_GPL(nf_conntrack_get_ht);
500 498
501/* 499/*
502 * Warning : 500 * Warning :
@@ -510,21 +508,26 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
510 struct nf_conntrack_tuple_hash *h; 508 struct nf_conntrack_tuple_hash *h;
511 struct hlist_nulls_head *ct_hash; 509 struct hlist_nulls_head *ct_hash;
512 struct hlist_nulls_node *n; 510 struct hlist_nulls_node *n;
513 unsigned int bucket, sequence; 511 unsigned int bucket, hsize;
514 512
515begin: 513begin:
516 do { 514 nf_conntrack_get_ht(&ct_hash, &hsize);
517 sequence = read_seqcount_begin(&nf_conntrack_generation); 515 bucket = reciprocal_scale(hash, hsize);
518 bucket = scale_hash(hash);
519 ct_hash = nf_conntrack_hash;
520 } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
521 516
522 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { 517 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
523 if (nf_ct_key_equal(h, tuple, zone, net)) { 518 struct nf_conn *ct;
524 NF_CT_STAT_INC_ATOMIC(net, found); 519
525 return h; 520 ct = nf_ct_tuplehash_to_ctrack(h);
521 if (nf_ct_is_expired(ct)) {
522 nf_ct_gc_expired(ct);
523 continue;
526 } 524 }
527 NF_CT_STAT_INC_ATOMIC(net, searched); 525
526 if (nf_ct_is_dying(ct))
527 continue;
528
529 if (nf_ct_key_equal(h, tuple, zone, net))
530 return h;
528 } 531 }
529 /* 532 /*
530 * if the nulls value we got at the end of this lookup is 533 * if the nulls value we got at the end of this lookup is
@@ -618,7 +621,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
618 zone, net)) 621 zone, net))
619 goto out; 622 goto out;
620 623
621 add_timer(&ct->timeout);
622 smp_wmb(); 624 smp_wmb();
623 /* The caller holds a reference to this object */ 625 /* The caller holds a reference to this object */
624 atomic_set(&ct->ct_general.use, 2); 626 atomic_set(&ct->ct_general.use, 2);
@@ -771,8 +773,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
771 /* Timer relative to confirmation time, not original 773 /* Timer relative to confirmation time, not original
772 setting time, otherwise we'd get timer wrap in 774 setting time, otherwise we'd get timer wrap in
773 weird delay cases. */ 775 weird delay cases. */
774 ct->timeout.expires += jiffies; 776 ct->timeout += nfct_time_stamp;
775 add_timer(&ct->timeout);
776 atomic_inc(&ct->ct_general.use); 777 atomic_inc(&ct->ct_general.use);
777 ct->status |= IPS_CONFIRMED; 778 ct->status |= IPS_CONFIRMED;
778 779
@@ -791,7 +792,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
791 */ 792 */
792 __nf_conntrack_hash_insert(ct, hash, reply_hash); 793 __nf_conntrack_hash_insert(ct, hash, reply_hash);
793 nf_conntrack_double_unlock(hash, reply_hash); 794 nf_conntrack_double_unlock(hash, reply_hash);
794 NF_CT_STAT_INC(net, insert);
795 local_bh_enable(); 795 local_bh_enable();
796 796
797 help = nfct_help(ct); 797 help = nfct_help(ct);
@@ -823,29 +823,40 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
823 const struct nf_conntrack_zone *zone; 823 const struct nf_conntrack_zone *zone;
824 struct nf_conntrack_tuple_hash *h; 824 struct nf_conntrack_tuple_hash *h;
825 struct hlist_nulls_head *ct_hash; 825 struct hlist_nulls_head *ct_hash;
826 unsigned int hash, sequence; 826 unsigned int hash, hsize;
827 struct hlist_nulls_node *n; 827 struct hlist_nulls_node *n;
828 struct nf_conn *ct; 828 struct nf_conn *ct;
829 829
830 zone = nf_ct_zone(ignored_conntrack); 830 zone = nf_ct_zone(ignored_conntrack);
831 831
832 rcu_read_lock(); 832 rcu_read_lock();
833 do { 833 begin:
834 sequence = read_seqcount_begin(&nf_conntrack_generation); 834 nf_conntrack_get_ht(&ct_hash, &hsize);
835 hash = hash_conntrack(net, tuple); 835 hash = __hash_conntrack(net, tuple, hsize);
836 ct_hash = nf_conntrack_hash;
837 } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
838 836
839 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { 837 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
840 ct = nf_ct_tuplehash_to_ctrack(h); 838 ct = nf_ct_tuplehash_to_ctrack(h);
841 if (ct != ignored_conntrack && 839
842 nf_ct_key_equal(h, tuple, zone, net)) { 840 if (ct == ignored_conntrack)
841 continue;
842
843 if (nf_ct_is_expired(ct)) {
844 nf_ct_gc_expired(ct);
845 continue;
846 }
847
848 if (nf_ct_key_equal(h, tuple, zone, net)) {
843 NF_CT_STAT_INC_ATOMIC(net, found); 849 NF_CT_STAT_INC_ATOMIC(net, found);
844 rcu_read_unlock(); 850 rcu_read_unlock();
845 return 1; 851 return 1;
846 } 852 }
847 NF_CT_STAT_INC_ATOMIC(net, searched);
848 } 853 }
854
855 if (get_nulls_value(n) != hash) {
856 NF_CT_STAT_INC_ATOMIC(net, search_restart);
857 goto begin;
858 }
859
849 rcu_read_unlock(); 860 rcu_read_unlock();
850 861
851 return 0; 862 return 0;
@@ -867,6 +878,11 @@ static unsigned int early_drop_list(struct net *net,
867 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { 878 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
868 tmp = nf_ct_tuplehash_to_ctrack(h); 879 tmp = nf_ct_tuplehash_to_ctrack(h);
869 880
881 if (nf_ct_is_expired(tmp)) {
882 nf_ct_gc_expired(tmp);
883 continue;
884 }
885
870 if (test_bit(IPS_ASSURED_BIT, &tmp->status) || 886 if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
871 !net_eq(nf_ct_net(tmp), net) || 887 !net_eq(nf_ct_net(tmp), net) ||
872 nf_ct_is_dying(tmp)) 888 nf_ct_is_dying(tmp))
@@ -884,7 +900,6 @@ static unsigned int early_drop_list(struct net *net,
884 */ 900 */
885 if (net_eq(nf_ct_net(tmp), net) && 901 if (net_eq(nf_ct_net(tmp), net) &&
886 nf_ct_is_confirmed(tmp) && 902 nf_ct_is_confirmed(tmp) &&
887 del_timer(&tmp->timeout) &&
888 nf_ct_delete(tmp, 0, 0)) 903 nf_ct_delete(tmp, 0, 0))
889 drops++; 904 drops++;
890 905
@@ -900,14 +915,11 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
900 915
901 for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { 916 for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
902 struct hlist_nulls_head *ct_hash; 917 struct hlist_nulls_head *ct_hash;
903 unsigned hash, sequence, drops; 918 unsigned int hash, hsize, drops;
904 919
905 rcu_read_lock(); 920 rcu_read_lock();
906 do { 921 nf_conntrack_get_ht(&ct_hash, &hsize);
907 sequence = read_seqcount_begin(&nf_conntrack_generation); 922 hash = reciprocal_scale(_hash++, hsize);
908 hash = scale_hash(_hash++);
909 ct_hash = nf_conntrack_hash;
910 } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
911 923
912 drops = early_drop_list(net, &ct_hash[hash]); 924 drops = early_drop_list(net, &ct_hash[hash]);
913 rcu_read_unlock(); 925 rcu_read_unlock();
@@ -921,6 +933,69 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
921 return false; 933 return false;
922} 934}
923 935
936static void gc_worker(struct work_struct *work)
937{
938 unsigned int i, goal, buckets = 0, expired_count = 0;
939 unsigned long next_run = GC_INTERVAL;
940 unsigned int ratio, scanned = 0;
941 struct conntrack_gc_work *gc_work;
942
943 gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
944
945 goal = min(nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS);
946 i = gc_work->last_bucket;
947
948 do {
949 struct nf_conntrack_tuple_hash *h;
950 struct hlist_nulls_head *ct_hash;
951 struct hlist_nulls_node *n;
952 unsigned int hashsz;
953 struct nf_conn *tmp;
954
955 i++;
956 rcu_read_lock();
957
958 nf_conntrack_get_ht(&ct_hash, &hashsz);
959 if (i >= hashsz)
960 i = 0;
961
962 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
963 tmp = nf_ct_tuplehash_to_ctrack(h);
964
965 scanned++;
966 if (nf_ct_is_expired(tmp)) {
967 nf_ct_gc_expired(tmp);
968 expired_count++;
969 continue;
970 }
971 }
972
973 /* could check get_nulls_value() here and restart if ct
974 * was moved to another chain. But given gc is best-effort
975 * we will just continue with next hash slot.
976 */
977 rcu_read_unlock();
978 cond_resched_rcu_qs();
979 } while (++buckets < goal &&
980 expired_count < GC_MAX_EVICTS);
981
982 if (gc_work->exiting)
983 return;
984
985 ratio = scanned ? expired_count * 100 / scanned : 0;
986 if (ratio >= 90)
987 next_run = 0;
988
989 gc_work->last_bucket = i;
990 schedule_delayed_work(&gc_work->dwork, next_run);
991}
992
993static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
994{
995 INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
996 gc_work->exiting = false;
997}
998
924static struct nf_conn * 999static struct nf_conn *
925__nf_conntrack_alloc(struct net *net, 1000__nf_conntrack_alloc(struct net *net,
926 const struct nf_conntrack_zone *zone, 1001 const struct nf_conntrack_zone *zone,
@@ -957,8 +1032,6 @@ __nf_conntrack_alloc(struct net *net,
957 /* save hash for reusing when confirming */ 1032 /* save hash for reusing when confirming */
958 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; 1033 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
959 ct->status = 0; 1034 ct->status = 0;
960 /* Don't set timer yet: wait for confirmation */
961 setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
962 write_pnet(&ct->ct_net, net); 1035 write_pnet(&ct->ct_net, net);
963 memset(&ct->__nfct_init_offset[0], 0, 1036 memset(&ct->__nfct_init_offset[0], 0,
964 offsetof(struct nf_conn, proto) - 1037 offsetof(struct nf_conn, proto) -
@@ -1096,10 +1169,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
1096 } 1169 }
1097 spin_unlock(&nf_conntrack_expect_lock); 1170 spin_unlock(&nf_conntrack_expect_lock);
1098 } 1171 }
1099 if (!exp) { 1172 if (!exp)
1100 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); 1173 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1101 NF_CT_STAT_INC(net, new);
1102 }
1103 1174
1104 /* Now it is inserted into the unconfirmed list, bump refcount */ 1175 /* Now it is inserted into the unconfirmed list, bump refcount */
1105 nf_conntrack_get(&ct->ct_general); 1176 nf_conntrack_get(&ct->ct_general);
@@ -1204,7 +1275,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1204 skb->nfct = NULL; 1275 skb->nfct = NULL;
1205 } 1276 }
1206 1277
1207 /* rcu_read_lock()ed by nf_hook_slow */ 1278 /* rcu_read_lock()ed by nf_hook_thresh */
1208 l3proto = __nf_ct_l3proto_find(pf); 1279 l3proto = __nf_ct_l3proto_find(pf);
1209 ret = l3proto->get_l4proto(skb, skb_network_offset(skb), 1280 ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
1210 &dataoff, &protonum); 1281 &dataoff, &protonum);
@@ -1332,7 +1403,6 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
1332 unsigned long extra_jiffies, 1403 unsigned long extra_jiffies,
1333 int do_acct) 1404 int do_acct)
1334{ 1405{
1335 NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1336 NF_CT_ASSERT(skb); 1406 NF_CT_ASSERT(skb);
1337 1407
1338 /* Only update if this is not a fixed timeout */ 1408 /* Only update if this is not a fixed timeout */
@@ -1340,39 +1410,25 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
1340 goto acct; 1410 goto acct;
1341 1411
1342 /* If not in hash table, timer will not be active yet */ 1412 /* If not in hash table, timer will not be active yet */
1343 if (!nf_ct_is_confirmed(ct)) { 1413 if (nf_ct_is_confirmed(ct))
1344 ct->timeout.expires = extra_jiffies; 1414 extra_jiffies += nfct_time_stamp;
1345 } else {
1346 unsigned long newtime = jiffies + extra_jiffies;
1347
1348 /* Only update the timeout if the new timeout is at least
1349 HZ jiffies from the old timeout. Need del_timer for race
1350 avoidance (may already be dying). */
1351 if (newtime - ct->timeout.expires >= HZ)
1352 mod_timer_pending(&ct->timeout, newtime);
1353 }
1354 1415
1416 ct->timeout = extra_jiffies;
1355acct: 1417acct:
1356 if (do_acct) 1418 if (do_acct)
1357 nf_ct_acct_update(ct, ctinfo, skb->len); 1419 nf_ct_acct_update(ct, ctinfo, skb->len);
1358} 1420}
1359EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); 1421EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
1360 1422
1361bool __nf_ct_kill_acct(struct nf_conn *ct, 1423bool nf_ct_kill_acct(struct nf_conn *ct,
1362 enum ip_conntrack_info ctinfo, 1424 enum ip_conntrack_info ctinfo,
1363 const struct sk_buff *skb, 1425 const struct sk_buff *skb)
1364 int do_acct)
1365{ 1426{
1366 if (do_acct) 1427 nf_ct_acct_update(ct, ctinfo, skb->len);
1367 nf_ct_acct_update(ct, ctinfo, skb->len);
1368 1428
1369 if (del_timer(&ct->timeout)) { 1429 return nf_ct_delete(ct, 0, 0);
1370 ct->timeout.function((unsigned long)ct);
1371 return true;
1372 }
1373 return false;
1374} 1430}
1375EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); 1431EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
1376 1432
1377#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 1433#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1378 1434
@@ -1505,11 +1561,8 @@ void nf_ct_iterate_cleanup(struct net *net,
1505 1561
1506 while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { 1562 while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
1507 /* Time to push up daises... */ 1563 /* Time to push up daises... */
1508 if (del_timer(&ct->timeout))
1509 nf_ct_delete(ct, portid, report);
1510
1511 /* ... else the timer will get him soon. */
1512 1564
1565 nf_ct_delete(ct, portid, report);
1513 nf_ct_put(ct); 1566 nf_ct_put(ct);
1514 cond_resched(); 1567 cond_resched();
1515 } 1568 }
@@ -1545,6 +1598,7 @@ static int untrack_refs(void)
1545 1598
1546void nf_conntrack_cleanup_start(void) 1599void nf_conntrack_cleanup_start(void)
1547{ 1600{
1601 conntrack_gc_work.exiting = true;
1548 RCU_INIT_POINTER(ip_ct_attach, NULL); 1602 RCU_INIT_POINTER(ip_ct_attach, NULL);
1549} 1603}
1550 1604
@@ -1554,6 +1608,7 @@ void nf_conntrack_cleanup_end(void)
1554 while (untrack_refs() > 0) 1608 while (untrack_refs() > 0)
1555 schedule(); 1609 schedule();
1556 1610
1611 cancel_delayed_work_sync(&conntrack_gc_work.dwork);
1557 nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); 1612 nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
1558 1613
1559 nf_conntrack_proto_fini(); 1614 nf_conntrack_proto_fini();
@@ -1828,6 +1883,10 @@ int nf_conntrack_init_start(void)
1828 } 1883 }
1829 /* - and look it like as a confirmed connection */ 1884 /* - and look it like as a confirmed connection */
1830 nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); 1885 nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
1886
1887 conntrack_gc_work_init(&conntrack_gc_work);
1888 schedule_delayed_work(&conntrack_gc_work.dwork, GC_INTERVAL);
1889
1831 return 0; 1890 return 0;
1832 1891
1833err_proto: 1892err_proto:
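The nf_conntrack_core.c changes above drop the per-entry struct timer_list entirely: ct->timeout becomes an absolute timestamp, lookups opportunistically reap expired entries via nf_ct_gc_expired(), a periodic gc_worker sweeps a slice of the hash table, and nf_ct_kill()/nf_ct_delete() take over what death_by_timeout() used to do. The toy model below shows the same scheme using wall-clock seconds where the kernel uses jiffies; every name in it is illustrative, not the kernel API.

#include <stdio.h>
#include <time.h>

struct conn {
	time_t timeout;   /* absolute deadline, like ct->timeout vs. nfct_time_stamp */
	int dying;
};

static int conn_is_expired(const struct conn *c, time_t now)
{
	return now >= c->timeout;
}

/* Refresh on traffic: just store a new absolute deadline. */
static void conn_refresh(struct conn *c, time_t now, int extra)
{
	c->timeout = now + extra;
}

/* Best-effort sweep over a table slice, in the spirit of gc_worker(). */
static unsigned int gc_sweep(struct conn *table, unsigned int n, time_t now)
{
	unsigned int i, reaped = 0;

	for (i = 0; i < n; i++) {
		if (!table[i].dying && conn_is_expired(&table[i], now)) {
			table[i].dying = 1;   /* kernel: nf_ct_kill() */
			reaped++;
		}
	}
	return reaped;
}

int main(void)
{
	struct conn table[3] = { {0, 0}, {0, 0}, {0, 0} };
	time_t now = time(NULL);

	conn_refresh(&table[0], now, 30);
	conn_refresh(&table[1], now, -5);   /* already past its deadline */
	conn_refresh(&table[2], now, 120);

	printf("reaped %u of 3\n", gc_sweep(table, 3, now));
	return 0;
}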
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index d28011b42845..da9df2d56e66 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -49,8 +49,13 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
49 49
50 hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { 50 hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
51 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 51 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
52 struct nf_conntrack_ecache *e;
52 53
53 if (nf_ct_is_dying(ct)) 54 if (!nf_ct_is_confirmed(ct))
55 continue;
56
57 e = nf_ct_ecache_find(ct);
58 if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
54 continue; 59 continue;
55 60
56 if (nf_conntrack_event(IPCT_DESTROY, ct)) { 61 if (nf_conntrack_event(IPCT_DESTROY, ct)) {
@@ -58,8 +63,7 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
58 break; 63 break;
59 } 64 }
60 65
61 /* we've got the event delivered, now it's dying */ 66 e->state = NFCT_ECACHE_DESTROY_SENT;
62 set_bit(IPS_DYING_BIT, &ct->status);
63 refs[evicted] = ct; 67 refs[evicted] = ct;
64 68
65 if (++evicted >= ARRAY_SIZE(refs)) { 69 if (++evicted >= ARRAY_SIZE(refs)) {
@@ -130,7 +134,7 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
130 if (!e) 134 if (!e)
131 goto out_unlock; 135 goto out_unlock;
132 136
133 if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) { 137 if (nf_ct_is_confirmed(ct)) {
134 struct nf_ct_event item = { 138 struct nf_ct_event item = {
135 .ct = ct, 139 .ct = ct,
136 .portid = e->portid ? e->portid : portid, 140 .portid = e->portid ? e->portid : portid,
@@ -150,11 +154,13 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
150 * triggered by a process, we store the PORTID 154 * triggered by a process, we store the PORTID
151 * to include it in the retransmission. 155 * to include it in the retransmission.
152 */ 156 */
153 if (eventmask & (1 << IPCT_DESTROY) && 157 if (eventmask & (1 << IPCT_DESTROY)) {
154 e->portid == 0 && portid != 0) 158 if (e->portid == 0 && portid != 0)
155 e->portid = portid; 159 e->portid = portid;
156 else 160 e->state = NFCT_ECACHE_DESTROY_FAIL;
161 } else {
157 e->missed |= eventmask; 162 e->missed |= eventmask;
163 }
158 } else { 164 } else {
159 e->missed &= ~missed; 165 e->missed &= ~missed;
160 } 166 }
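The ecache change above stops abusing IPS_DYING to remember that a destroy event could not be delivered; instead each conntrack's event cache carries an explicit state (roughly DESTROY_FAIL versus DESTROY_SENT), and the per-cpu worker only retries entries still marked as failed. A compact model of that retry state machine follows; the enum values and function names are made up for illustration.

#include <stdio.h>

enum ecache_state { ECACHE_IDLE, ECACHE_DESTROY_FAIL, ECACHE_DESTROY_SENT };

struct entry {
	enum ecache_state state;
};

/* Deliver a destroy event; return 0 on success, -1 if the listener
 * could not take it (kernel: nf_conntrack_event() failing). */
static int deliver(struct entry *e, int listener_ready)
{
	if (!listener_ready) {
		e->state = ECACHE_DESTROY_FAIL;   /* remember for redelivery */
		return -1;
	}
	e->state = ECACHE_DESTROY_SENT;
	return 0;
}

/* Worker pass: only entries that previously failed are retried. */
static void worker(struct entry *entries, unsigned int n, int listener_ready)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (entries[i].state != ECACHE_DESTROY_FAIL)
			continue;
		deliver(&entries[i], listener_ready);
	}
}

int main(void)
{
	struct entry e[2] = { { ECACHE_IDLE }, { ECACHE_IDLE } };

	deliver(&e[0], 0);            /* fails, marked DESTROY_FAIL */
	deliver(&e[1], 1);            /* delivered, marked SENT     */
	worker(e, 2, 1);              /* retries only e[0]          */
	printf("%d %d\n", e[0].state, e[1].state);   /* 2 2 */
	return 0;
}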
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 43147005bea3..e3ed20060878 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -237,7 +237,7 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
237 } 237 }
238 delim = data[0]; 238 delim = data[0];
239 if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) { 239 if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) {
240 pr_debug("try_eprt: invalid delimitter.\n"); 240 pr_debug("try_eprt: invalid delimiter.\n");
241 return 0; 241 return 0;
242 } 242 }
243 243
@@ -301,8 +301,6 @@ static int find_pattern(const char *data, size_t dlen,
301 size_t i = plen; 301 size_t i = plen;
302 302
303 pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); 303 pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
304 if (dlen == 0)
305 return 0;
306 304
307 if (dlen <= plen) { 305 if (dlen <= plen) {
308 /* Short packet: try for partial? */ 306 /* Short packet: try for partial? */
@@ -311,19 +309,8 @@ static int find_pattern(const char *data, size_t dlen,
311 else return 0; 309 else return 0;
312 } 310 }
313 311
314 if (strncasecmp(data, pattern, plen) != 0) { 312 if (strncasecmp(data, pattern, plen) != 0)
315#if 0
316 size_t i;
317
318 pr_debug("ftp: string mismatch\n");
319 for (i = 0; i < plen; i++) {
320 pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
321 i, data[i], data[i],
322 pattern[i], pattern[i]);
323 }
324#endif
325 return 0; 313 return 0;
326 }
327 314
328 pr_debug("Pattern matches!\n"); 315 pr_debug("Pattern matches!\n");
329 /* Now we've found the constant string, try to skip 316 /* Now we've found the constant string, try to skip
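The find_pattern() cleanup above removes a dead dlen == 0 check and an #if 0 debug block; the behaviour that matters is the short-packet case, where a prefix match is reported as "partial" so the FTP helper can wait for more data. A small standalone version of that prefix test is sketched here (simplified and case-sensitive, with names invented for the sketch).

#include <stdio.h>
#include <string.h>

/* Returns 1 on a full match at the start of data, -1 when data is
 * shorter than the pattern but matches as far as it goes (partial),
 * and 0 otherwise. */
static int match_prefix(const char *data, size_t dlen,
			const char *pattern, size_t plen)
{
	if (dlen <= plen) {
		if (strncmp(data, pattern, dlen) == 0)
			return -1;   /* partial: wait for more bytes */
		return 0;
	}
	return strncmp(data, pattern, plen) == 0;
}

int main(void)
{
	printf("%d\n", match_prefix("PORT 1,2,3", 10, "PORT", 4));  /* 1  */
	printf("%d\n", match_prefix("PO", 2, "PORT", 4));           /* -1 */
	printf("%d\n", match_prefix("QUIT", 4, "PORT", 4));         /* 0  */
	return 0;
}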
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 5c0db5c64734..f65d93639d12 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -736,7 +736,7 @@ static int callforward_do_filter(struct net *net,
736 const struct nf_afinfo *afinfo; 736 const struct nf_afinfo *afinfo;
737 int ret = 0; 737 int ret = 0;
738 738
739 /* rcu_read_lock()ed by nf_hook_slow() */ 739 /* rcu_read_lock()ed by nf_hook_thresh */
740 afinfo = nf_get_afinfo(family); 740 afinfo = nf_get_afinfo(family);
741 if (!afinfo) 741 if (!afinfo)
742 return 0; 742 return 0;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index b989b81ac156..336e21559e01 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -189,7 +189,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
189 struct nf_conntrack_helper *helper = NULL; 189 struct nf_conntrack_helper *helper = NULL;
190 struct nf_conn_help *help; 190 struct nf_conn_help *help;
191 struct net *net = nf_ct_net(ct); 191 struct net *net = nf_ct_net(ct);
192 int ret = 0;
193 192
194 /* We already got a helper explicitly attached. The function 193 /* We already got a helper explicitly attached. The function
195 * nf_conntrack_alter_reply - in case NAT is in use - asks for looking 194 * nf_conntrack_alter_reply - in case NAT is in use - asks for looking
@@ -223,15 +222,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
223 if (helper == NULL) { 222 if (helper == NULL) {
224 if (help) 223 if (help)
225 RCU_INIT_POINTER(help->helper, NULL); 224 RCU_INIT_POINTER(help->helper, NULL);
226 goto out; 225 return 0;
227 } 226 }
228 227
229 if (help == NULL) { 228 if (help == NULL) {
230 help = nf_ct_helper_ext_add(ct, helper, flags); 229 help = nf_ct_helper_ext_add(ct, helper, flags);
231 if (help == NULL) { 230 if (help == NULL)
232 ret = -ENOMEM; 231 return -ENOMEM;
233 goto out;
234 }
235 } else { 232 } else {
236 /* We only allow helper re-assignment of the same sort since 233 /* We only allow helper re-assignment of the same sort since
237 * we cannot reallocate the helper extension area. 234 * we cannot reallocate the helper extension area.
@@ -240,13 +237,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
240 237
241 if (tmp && tmp->help != helper->help) { 238 if (tmp && tmp->help != helper->help) {
242 RCU_INIT_POINTER(help->helper, NULL); 239 RCU_INIT_POINTER(help->helper, NULL);
243 goto out; 240 return 0;
244 } 241 }
245 } 242 }
246 243
247 rcu_assign_pointer(help->helper, helper); 244 rcu_assign_pointer(help->helper, helper);
248out: 245
249 return ret; 246 return 0;
250} 247}
251EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); 248EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
252 249
@@ -349,7 +346,7 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
349 /* Called from the helper function, this call never fails */ 346 /* Called from the helper function, this call never fails */
350 help = nfct_help(ct); 347 help = nfct_help(ct);
351 348
352 /* rcu_read_lock()ed by nf_hook_slow */ 349 /* rcu_read_lock()ed by nf_hook_thresh */
353 helper = rcu_dereference(help->helper); 350 helper = rcu_dereference(help->helper);
354 351
355 nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, 352 nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index fdfc71f416b7..27540455dc62 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -149,10 +149,7 @@ nla_put_failure:
149 149
150static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) 150static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
151{ 151{
152 long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ; 152 long timeout = nf_ct_expires(ct) / HZ;
153
154 if (timeout < 0)
155 timeout = 0;
156 153
157 if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout))) 154 if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout)))
158 goto nla_put_failure; 155 goto nla_put_failure;
@@ -818,14 +815,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
818 struct hlist_nulls_node *n; 815 struct hlist_nulls_node *n;
819 struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); 816 struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
820 u_int8_t l3proto = nfmsg->nfgen_family; 817 u_int8_t l3proto = nfmsg->nfgen_family;
821 int res; 818 struct nf_conn *nf_ct_evict[8];
819 int res, i;
822 spinlock_t *lockp; 820 spinlock_t *lockp;
823 821
824 last = (struct nf_conn *)cb->args[1]; 822 last = (struct nf_conn *)cb->args[1];
823 i = 0;
825 824
826 local_bh_disable(); 825 local_bh_disable();
827 for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { 826 for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
828restart: 827restart:
828 while (i) {
829 i--;
830 if (nf_ct_should_gc(nf_ct_evict[i]))
831 nf_ct_kill(nf_ct_evict[i]);
832 nf_ct_put(nf_ct_evict[i]);
833 }
834
829 lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; 835 lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
830 nf_conntrack_lock(lockp); 836 nf_conntrack_lock(lockp);
831 if (cb->args[0] >= nf_conntrack_htable_size) { 837 if (cb->args[0] >= nf_conntrack_htable_size) {
@@ -837,6 +843,13 @@ restart:
837 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) 843 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
838 continue; 844 continue;
839 ct = nf_ct_tuplehash_to_ctrack(h); 845 ct = nf_ct_tuplehash_to_ctrack(h);
846 if (nf_ct_is_expired(ct)) {
847 if (i < ARRAY_SIZE(nf_ct_evict) &&
848 atomic_inc_not_zero(&ct->ct_general.use))
849 nf_ct_evict[i++] = ct;
850 continue;
851 }
852
840 if (!net_eq(net, nf_ct_net(ct))) 853 if (!net_eq(net, nf_ct_net(ct)))
841 continue; 854 continue;
842 855
@@ -878,6 +891,13 @@ out:
878 if (last) 891 if (last)
879 nf_ct_put(last); 892 nf_ct_put(last);
880 893
894 while (i) {
895 i--;
896 if (nf_ct_should_gc(nf_ct_evict[i]))
897 nf_ct_kill(nf_ct_evict[i]);
898 nf_ct_put(nf_ct_evict[i]);
899 }
900
881 return skb->len; 901 return skb->len;
882} 902}
883 903
@@ -1147,9 +1167,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
1147 } 1167 }
1148 } 1168 }
1149 1169
1150 if (del_timer(&ct->timeout)) 1170 nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
1151 nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
1152
1153 nf_ct_put(ct); 1171 nf_ct_put(ct);
1154 1172
1155 return 0; 1173 return 0;
@@ -1517,11 +1535,10 @@ static int ctnetlink_change_timeout(struct nf_conn *ct,
1517{ 1535{
1518 u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); 1536 u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
1519 1537
1520 if (!del_timer(&ct->timeout)) 1538 ct->timeout = nfct_time_stamp + timeout * HZ;
1521 return -ETIME;
1522 1539
1523 ct->timeout.expires = jiffies + timeout * HZ; 1540 if (test_bit(IPS_DYING_BIT, &ct->status))
1524 add_timer(&ct->timeout); 1541 return -ETIME;
1525 1542
1526 return 0; 1543 return 0;
1527} 1544}
@@ -1719,9 +1736,8 @@ ctnetlink_create_conntrack(struct net *net,
1719 1736
1720 if (!cda[CTA_TIMEOUT]) 1737 if (!cda[CTA_TIMEOUT])
1721 goto err1; 1738 goto err1;
1722 ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
1723 1739
1724 ct->timeout.expires = jiffies + ct->timeout.expires * HZ; 1740 ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
1725 1741
1726 rcu_read_lock(); 1742 rcu_read_lock();
1727 if (cda[CTA_HELP]) { 1743 if (cda[CTA_HELP]) {
@@ -1968,13 +1984,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
1968 nfmsg->version = NFNETLINK_V0; 1984 nfmsg->version = NFNETLINK_V0;
1969 nfmsg->res_id = htons(cpu); 1985 nfmsg->res_id = htons(cpu);
1970 1986
1971 if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) || 1987 if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
1972 nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
1973 nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) ||
1974 nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || 1988 nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
1975 nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || 1989 nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
1976 nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) ||
1977 nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) ||
1978 nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || 1990 nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
1979 nla_put_be32(skb, CTA_STATS_INSERT_FAILED, 1991 nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
1980 htonl(st->insert_failed)) || 1992 htonl(st->insert_failed)) ||
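With ct->timeout now an absolute timestamp, dumping the remaining lifetime in ctnetlink is a simple clamped subtraction (nf_ct_expires()) and changing it is a plain store of a new deadline, returning -ETIME only if the entry is already dying. A userspace analogue of those two operations is shown below, using seconds instead of jiffies; the struct and function names are invented for the sketch.

#include <stdio.h>
#include <time.h>

struct conn {
	time_t timeout;   /* absolute deadline */
	int dying;
};

/* Remaining lifetime, clamped at zero, like nf_ct_expires(). */
static long conn_expires_in(const struct conn *c, time_t now)
{
	long left = (long)(c->timeout - now);

	return left > 0 ? left : 0;
}

/* Set a new deadline; refuse only if the entry is already dying,
 * mirroring ctnetlink_change_timeout(). */
static int conn_change_timeout(struct conn *c, time_t now, long secs)
{
	c->timeout = now + secs;
	if (c->dying)
		return -1;   /* kernel returns -ETIME here */
	return 0;
}

int main(void)
{
	struct conn c = { 0, 0 };
	time_t now = time(NULL);

	conn_change_timeout(&c, now, 30);
	printf("expires in %ld s\n", conn_expires_in(&c, now));
	return 0;
}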
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 5588c7ae1ac2..f60a4755d71e 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -157,8 +157,7 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
157 pr_debug("setting timeout of conntrack %p to 0\n", sibling); 157 pr_debug("setting timeout of conntrack %p to 0\n", sibling);
158 sibling->proto.gre.timeout = 0; 158 sibling->proto.gre.timeout = 0;
159 sibling->proto.gre.stream_timeout = 0; 159 sibling->proto.gre.stream_timeout = 0;
160 if (del_timer(&sibling->timeout)) 160 nf_ct_kill(sibling);
161 sibling->timeout.function((unsigned long)sibling);
162 nf_ct_put(sibling); 161 nf_ct_put(sibling);
163 return 1; 162 return 1;
164 } else { 163 } else {
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index b65d5864b6d9..8d2c7d8c666a 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -159,54 +159,6 @@ static int kill_l4proto(struct nf_conn *i, void *data)
159 nf_ct_l3num(i) == l4proto->l3proto; 159 nf_ct_l3num(i) == l4proto->l3proto;
160} 160}
161 161
162static struct nf_ip_net *nf_ct_l3proto_net(struct net *net,
163 struct nf_conntrack_l3proto *l3proto)
164{
165 if (l3proto->l3proto == PF_INET)
166 return &net->ct.nf_ct_proto;
167 else
168 return NULL;
169}
170
171static int nf_ct_l3proto_register_sysctl(struct net *net,
172 struct nf_conntrack_l3proto *l3proto)
173{
174 int err = 0;
175 struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
176 /* nf_conntrack_l3proto_ipv6 doesn't support sysctl */
177 if (in == NULL)
178 return 0;
179
180#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
181 if (in->ctl_table != NULL) {
182 err = nf_ct_register_sysctl(net,
183 &in->ctl_table_header,
184 l3proto->ctl_table_path,
185 in->ctl_table);
186 if (err < 0) {
187 kfree(in->ctl_table);
188 in->ctl_table = NULL;
189 }
190 }
191#endif
192 return err;
193}
194
195static void nf_ct_l3proto_unregister_sysctl(struct net *net,
196 struct nf_conntrack_l3proto *l3proto)
197{
198 struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
199
200 if (in == NULL)
201 return;
202#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
203 if (in->ctl_table_header != NULL)
204 nf_ct_unregister_sysctl(&in->ctl_table_header,
205 &in->ctl_table,
206 0);
207#endif
208}
209
210int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto) 162int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto)
211{ 163{
212 int ret = 0; 164 int ret = 0;
@@ -241,7 +193,7 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
241int nf_ct_l3proto_pernet_register(struct net *net, 193int nf_ct_l3proto_pernet_register(struct net *net,
242 struct nf_conntrack_l3proto *proto) 194 struct nf_conntrack_l3proto *proto)
243{ 195{
244 int ret = 0; 196 int ret;
245 197
246 if (proto->init_net) { 198 if (proto->init_net) {
247 ret = proto->init_net(net); 199 ret = proto->init_net(net);
@@ -249,7 +201,7 @@ int nf_ct_l3proto_pernet_register(struct net *net,
249 return ret; 201 return ret;
250 } 202 }
251 203
252 return nf_ct_l3proto_register_sysctl(net, proto); 204 return 0;
253} 205}
254EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); 206EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register);
255 207
@@ -272,8 +224,6 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
272void nf_ct_l3proto_pernet_unregister(struct net *net, 224void nf_ct_l3proto_pernet_unregister(struct net *net,
273 struct nf_conntrack_l3proto *proto) 225 struct nf_conntrack_l3proto *proto)
274{ 226{
275 nf_ct_l3proto_unregister_sysctl(net, proto);
276
277 /* Remove all contrack entries for this protocol */ 227 /* Remove all contrack entries for this protocol */
278 nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0); 228 nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0);
279} 229}
@@ -312,26 +262,6 @@ int nf_ct_l4proto_register_sysctl(struct net *net,
312 } 262 }
313 } 263 }
314 } 264 }
315#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
316 if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) {
317 if (err < 0) {
318 nf_ct_kfree_compat_sysctl_table(pn);
319 goto out;
320 }
321 err = nf_ct_register_sysctl(net,
322 &pn->ctl_compat_header,
323 "net/ipv4/netfilter",
324 pn->ctl_compat_table);
325 if (err == 0)
326 goto out;
327
328 nf_ct_kfree_compat_sysctl_table(pn);
329 nf_ct_unregister_sysctl(&pn->ctl_table_header,
330 &pn->ctl_table,
331 pn->users);
332 }
333out:
334#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
335#endif /* CONFIG_SYSCTL */ 265#endif /* CONFIG_SYSCTL */
336 return err; 266 return err;
337} 267}
@@ -346,13 +276,6 @@ void nf_ct_l4proto_unregister_sysctl(struct net *net,
346 nf_ct_unregister_sysctl(&pn->ctl_table_header, 276 nf_ct_unregister_sysctl(&pn->ctl_table_header,
347 &pn->ctl_table, 277 &pn->ctl_table,
348 pn->users); 278 pn->users);
349
350#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
351 if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL)
352 nf_ct_unregister_sysctl(&pn->ctl_compat_header,
353 &pn->ctl_compat_table,
354 0);
355#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
356#endif /* CONFIG_SYSCTL */ 279#endif /* CONFIG_SYSCTL */
357} 280}
358 281
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 399a38fd685a..a45bee52dccc 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -402,7 +402,8 @@ static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 {
 	struct dccp_hdr _hdr, *dh;
 
-	dh = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+	/* Actually only need first 4 bytes to get ports. */
+	dh = skb_header_pointer(skb, dataoff, 4, &_hdr);
 	if (dh == NULL)
 		return false;
 
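
The dccp, sctp, tcp, udp and udplite hunks in this series all shrink the skb_header_pointer() read to four bytes for the same reason: the source and destination ports are the first two 16-bit fields of each of these transport headers, so tuple extraction never needs the full header. A rough sketch of the idea (the helper and struct below are invented for illustration, not part of the patch):

/* Hypothetical illustration: the first 4 bytes of TCP/UDP/UDP-Lite/SCTP/DCCP
 * headers are the 16-bit source and destination ports, so copying
 * sizeof(struct udphdr) or more is unnecessary just to build a tuple. */
struct ports {
	__be16 src;
	__be16 dst;
};

static bool ports_from_skb(const struct sk_buff *skb, unsigned int dataoff,
			   struct nf_conntrack_tuple *tuple)
{
	struct ports _p;
	const struct ports *p;

	p = skb_header_pointer(skb, dataoff, sizeof(_p), &_p);	/* 4 bytes */
	if (p == NULL)
		return false;

	tuple->src.u.all = p->src;
	tuple->dst.u.all = p->dst;
	return true;
}
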
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 86dc752e5349..d5868bad33a7 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -151,17 +151,6 @@ static struct ctl_table generic_sysctl_table[] = {
151 }, 151 },
152 { } 152 { }
153}; 153};
154#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
155static struct ctl_table generic_compat_sysctl_table[] = {
156 {
157 .procname = "ip_conntrack_generic_timeout",
158 .maxlen = sizeof(unsigned int),
159 .mode = 0644,
160 .proc_handler = proc_dointvec_jiffies,
161 },
162 { }
163};
164#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
165#endif /* CONFIG_SYSCTL */ 154#endif /* CONFIG_SYSCTL */
166 155
167static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, 156static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -179,40 +168,14 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
179 return 0; 168 return 0;
180} 169}
181 170
182static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
183 struct nf_generic_net *gn)
184{
185#ifdef CONFIG_SYSCTL
186#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
187 pn->ctl_compat_table = kmemdup(generic_compat_sysctl_table,
188 sizeof(generic_compat_sysctl_table),
189 GFP_KERNEL);
190 if (!pn->ctl_compat_table)
191 return -ENOMEM;
192
193 pn->ctl_compat_table[0].data = &gn->timeout;
194#endif
195#endif
196 return 0;
197}
198
199static int generic_init_net(struct net *net, u_int16_t proto) 171static int generic_init_net(struct net *net, u_int16_t proto)
200{ 172{
201 int ret;
202 struct nf_generic_net *gn = generic_pernet(net); 173 struct nf_generic_net *gn = generic_pernet(net);
203 struct nf_proto_net *pn = &gn->pn; 174 struct nf_proto_net *pn = &gn->pn;
204 175
205 gn->timeout = nf_ct_generic_timeout; 176 gn->timeout = nf_ct_generic_timeout;
206 177
207 ret = generic_kmemdup_compat_sysctl_table(pn, gn); 178 return generic_kmemdup_sysctl_table(pn, gn);
208 if (ret < 0)
209 return ret;
210
211 ret = generic_kmemdup_sysctl_table(pn, gn);
212 if (ret < 0)
213 nf_ct_kfree_compat_sysctl_table(pn);
214
215 return ret;
216} 179}
217 180
218static struct nf_proto_net *generic_get_net_proto(struct net *net) 181static struct nf_proto_net *generic_get_net_proto(struct net *net)
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index a96451a7af20..9a715f88b2f1 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -192,15 +192,15 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
 static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 			     struct net *net, struct nf_conntrack_tuple *tuple)
 {
-	const struct gre_hdr_pptp *pgrehdr;
-	struct gre_hdr_pptp _pgrehdr;
+	const struct pptp_gre_header *pgrehdr;
+	struct pptp_gre_header _pgrehdr;
 	__be16 srckey;
-	const struct gre_hdr *grehdr;
-	struct gre_hdr _grehdr;
+	const struct gre_base_hdr *grehdr;
+	struct gre_base_hdr _grehdr;
 
 	/* first only delinearize old RFC1701 GRE header */
 	grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
-	if (!grehdr || grehdr->version != GRE_VERSION_PPTP) {
+	if (!grehdr || (grehdr->flags & GRE_VERSION) != GRE_VERSION_1) {
 		/* try to behave like "nf_conntrack_proto_generic" */
 		tuple->src.u.all = 0;
 		tuple->dst.u.all = 0;
@@ -212,8 +212,8 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 	if (!pgrehdr)
 		return true;
 
-	if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) {
-		pr_debug("GRE_VERSION_PPTP but unknown proto\n");
+	if (grehdr->protocol != GRE_PROTO_PPP) {
+		pr_debug("Unsupported GRE proto(0x%x)\n", ntohs(grehdr->protocol));
 		return false;
 	}
 
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 1d7ab960a9e6..982ea62606c7 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -161,8 +161,8 @@ static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 	const struct sctphdr *hp;
 	struct sctphdr _hdr;
 
-	/* Actually only need first 8 bytes. */
-	hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+	/* Actually only need first 4 bytes to get ports. */
+	hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
 	if (hp == NULL)
 		return false;
 
@@ -705,54 +705,6 @@ static struct ctl_table sctp_sysctl_table[] = {
705 }, 705 },
706 { } 706 { }
707}; 707};
708
709#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
710static struct ctl_table sctp_compat_sysctl_table[] = {
711 {
712 .procname = "ip_conntrack_sctp_timeout_closed",
713 .maxlen = sizeof(unsigned int),
714 .mode = 0644,
715 .proc_handler = proc_dointvec_jiffies,
716 },
717 {
718 .procname = "ip_conntrack_sctp_timeout_cookie_wait",
719 .maxlen = sizeof(unsigned int),
720 .mode = 0644,
721 .proc_handler = proc_dointvec_jiffies,
722 },
723 {
724 .procname = "ip_conntrack_sctp_timeout_cookie_echoed",
725 .maxlen = sizeof(unsigned int),
726 .mode = 0644,
727 .proc_handler = proc_dointvec_jiffies,
728 },
729 {
730 .procname = "ip_conntrack_sctp_timeout_established",
731 .maxlen = sizeof(unsigned int),
732 .mode = 0644,
733 .proc_handler = proc_dointvec_jiffies,
734 },
735 {
736 .procname = "ip_conntrack_sctp_timeout_shutdown_sent",
737 .maxlen = sizeof(unsigned int),
738 .mode = 0644,
739 .proc_handler = proc_dointvec_jiffies,
740 },
741 {
742 .procname = "ip_conntrack_sctp_timeout_shutdown_recd",
743 .maxlen = sizeof(unsigned int),
744 .mode = 0644,
745 .proc_handler = proc_dointvec_jiffies,
746 },
747 {
748 .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent",
749 .maxlen = sizeof(unsigned int),
750 .mode = 0644,
751 .proc_handler = proc_dointvec_jiffies,
752 },
753 { }
754};
755#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
756#endif 708#endif
757 709
758static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, 710static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -781,32 +733,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
781 return 0; 733 return 0;
782} 734}
783 735
784static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
785 struct sctp_net *sn)
786{
787#ifdef CONFIG_SYSCTL
788#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
789 pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table,
790 sizeof(sctp_compat_sysctl_table),
791 GFP_KERNEL);
792 if (!pn->ctl_compat_table)
793 return -ENOMEM;
794
795 pn->ctl_compat_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
796 pn->ctl_compat_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT];
797 pn->ctl_compat_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED];
798 pn->ctl_compat_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED];
799 pn->ctl_compat_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
800 pn->ctl_compat_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
801 pn->ctl_compat_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
802#endif
803#endif
804 return 0;
805}
806
807static int sctp_init_net(struct net *net, u_int16_t proto) 736static int sctp_init_net(struct net *net, u_int16_t proto)
808{ 737{
809 int ret;
810 struct sctp_net *sn = sctp_pernet(net); 738 struct sctp_net *sn = sctp_pernet(net);
811 struct nf_proto_net *pn = &sn->pn; 739 struct nf_proto_net *pn = &sn->pn;
812 740
@@ -817,18 +745,7 @@ static int sctp_init_net(struct net *net, u_int16_t proto)
817 sn->timeouts[i] = sctp_timeouts[i]; 745 sn->timeouts[i] = sctp_timeouts[i];
818 } 746 }
819 747
820 if (proto == AF_INET) { 748 return sctp_kmemdup_sysctl_table(pn, sn);
821 ret = sctp_kmemdup_compat_sysctl_table(pn, sn);
822 if (ret < 0)
823 return ret;
824
825 ret = sctp_kmemdup_sysctl_table(pn, sn);
826 if (ret < 0)
827 nf_ct_kfree_compat_sysctl_table(pn);
828 } else
829 ret = sctp_kmemdup_sysctl_table(pn, sn);
830
831 return ret;
832} 749}
833 750
834static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { 751static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 70c8381641a7..69f687740c76 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -282,8 +282,8 @@ static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 	const struct tcphdr *hp;
 	struct tcphdr _hdr;
 
-	/* Actually only need first 8 bytes. */
-	hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+	/* Actually only need first 4 bytes to get ports. */
+	hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
 	if (hp == NULL)
 		return false;
 
@@ -1481,90 +1481,6 @@ static struct ctl_table tcp_sysctl_table[] = {
1481 }, 1481 },
1482 { } 1482 { }
1483}; 1483};
1484
1485#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
1486static struct ctl_table tcp_compat_sysctl_table[] = {
1487 {
1488 .procname = "ip_conntrack_tcp_timeout_syn_sent",
1489 .maxlen = sizeof(unsigned int),
1490 .mode = 0644,
1491 .proc_handler = proc_dointvec_jiffies,
1492 },
1493 {
1494 .procname = "ip_conntrack_tcp_timeout_syn_sent2",
1495 .maxlen = sizeof(unsigned int),
1496 .mode = 0644,
1497 .proc_handler = proc_dointvec_jiffies,
1498 },
1499 {
1500 .procname = "ip_conntrack_tcp_timeout_syn_recv",
1501 .maxlen = sizeof(unsigned int),
1502 .mode = 0644,
1503 .proc_handler = proc_dointvec_jiffies,
1504 },
1505 {
1506 .procname = "ip_conntrack_tcp_timeout_established",
1507 .maxlen = sizeof(unsigned int),
1508 .mode = 0644,
1509 .proc_handler = proc_dointvec_jiffies,
1510 },
1511 {
1512 .procname = "ip_conntrack_tcp_timeout_fin_wait",
1513 .maxlen = sizeof(unsigned int),
1514 .mode = 0644,
1515 .proc_handler = proc_dointvec_jiffies,
1516 },
1517 {
1518 .procname = "ip_conntrack_tcp_timeout_close_wait",
1519 .maxlen = sizeof(unsigned int),
1520 .mode = 0644,
1521 .proc_handler = proc_dointvec_jiffies,
1522 },
1523 {
1524 .procname = "ip_conntrack_tcp_timeout_last_ack",
1525 .maxlen = sizeof(unsigned int),
1526 .mode = 0644,
1527 .proc_handler = proc_dointvec_jiffies,
1528 },
1529 {
1530 .procname = "ip_conntrack_tcp_timeout_time_wait",
1531 .maxlen = sizeof(unsigned int),
1532 .mode = 0644,
1533 .proc_handler = proc_dointvec_jiffies,
1534 },
1535 {
1536 .procname = "ip_conntrack_tcp_timeout_close",
1537 .maxlen = sizeof(unsigned int),
1538 .mode = 0644,
1539 .proc_handler = proc_dointvec_jiffies,
1540 },
1541 {
1542 .procname = "ip_conntrack_tcp_timeout_max_retrans",
1543 .maxlen = sizeof(unsigned int),
1544 .mode = 0644,
1545 .proc_handler = proc_dointvec_jiffies,
1546 },
1547 {
1548 .procname = "ip_conntrack_tcp_loose",
1549 .maxlen = sizeof(unsigned int),
1550 .mode = 0644,
1551 .proc_handler = proc_dointvec,
1552 },
1553 {
1554 .procname = "ip_conntrack_tcp_be_liberal",
1555 .maxlen = sizeof(unsigned int),
1556 .mode = 0644,
1557 .proc_handler = proc_dointvec,
1558 },
1559 {
1560 .procname = "ip_conntrack_tcp_max_retrans",
1561 .maxlen = sizeof(unsigned int),
1562 .mode = 0644,
1563 .proc_handler = proc_dointvec,
1564 },
1565 { }
1566};
1567#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
1568#endif /* CONFIG_SYSCTL */ 1484#endif /* CONFIG_SYSCTL */
1569 1485
1570static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, 1486static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -1597,38 +1513,8 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
1597 return 0; 1513 return 0;
1598} 1514}
1599 1515
1600static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
1601 struct nf_tcp_net *tn)
1602{
1603#ifdef CONFIG_SYSCTL
1604#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
1605 pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table,
1606 sizeof(tcp_compat_sysctl_table),
1607 GFP_KERNEL);
1608 if (!pn->ctl_compat_table)
1609 return -ENOMEM;
1610
1611 pn->ctl_compat_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
1612 pn->ctl_compat_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2];
1613 pn->ctl_compat_table[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
1614 pn->ctl_compat_table[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
1615 pn->ctl_compat_table[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
1616 pn->ctl_compat_table[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
1617 pn->ctl_compat_table[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
1618 pn->ctl_compat_table[7].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
1619 pn->ctl_compat_table[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
1620 pn->ctl_compat_table[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];
1621 pn->ctl_compat_table[10].data = &tn->tcp_loose;
1622 pn->ctl_compat_table[11].data = &tn->tcp_be_liberal;
1623 pn->ctl_compat_table[12].data = &tn->tcp_max_retrans;
1624#endif
1625#endif
1626 return 0;
1627}
1628
1629static int tcp_init_net(struct net *net, u_int16_t proto) 1516static int tcp_init_net(struct net *net, u_int16_t proto)
1630{ 1517{
1631 int ret;
1632 struct nf_tcp_net *tn = tcp_pernet(net); 1518 struct nf_tcp_net *tn = tcp_pernet(net);
1633 struct nf_proto_net *pn = &tn->pn; 1519 struct nf_proto_net *pn = &tn->pn;
1634 1520
@@ -1643,18 +1529,7 @@ static int tcp_init_net(struct net *net, u_int16_t proto)
1643 tn->tcp_max_retrans = nf_ct_tcp_max_retrans; 1529 tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
1644 } 1530 }
1645 1531
1646 if (proto == AF_INET) { 1532 return tcp_kmemdup_sysctl_table(pn, tn);
1647 ret = tcp_kmemdup_compat_sysctl_table(pn, tn);
1648 if (ret < 0)
1649 return ret;
1650
1651 ret = tcp_kmemdup_sysctl_table(pn, tn);
1652 if (ret < 0)
1653 nf_ct_kfree_compat_sysctl_table(pn);
1654 } else
1655 ret = tcp_kmemdup_sysctl_table(pn, tn);
1656
1657 return ret;
1658} 1533}
1659 1534
1660static struct nf_proto_net *tcp_get_net_proto(struct net *net) 1535static struct nf_proto_net *tcp_get_net_proto(struct net *net)
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 4fd040575ffe..20f35ed68030 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -44,8 +44,8 @@ static bool udp_pkt_to_tuple(const struct sk_buff *skb,
 	const struct udphdr *hp;
 	struct udphdr _hdr;
 
-	/* Actually only need first 8 bytes. */
-	hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+	/* Actually only need first 4 bytes to get ports. */
+	hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
 	if (hp == NULL)
 		return false;
 
@@ -218,23 +218,6 @@ static struct ctl_table udp_sysctl_table[] = {
218 }, 218 },
219 { } 219 { }
220}; 220};
221#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
222static struct ctl_table udp_compat_sysctl_table[] = {
223 {
224 .procname = "ip_conntrack_udp_timeout",
225 .maxlen = sizeof(unsigned int),
226 .mode = 0644,
227 .proc_handler = proc_dointvec_jiffies,
228 },
229 {
230 .procname = "ip_conntrack_udp_timeout_stream",
231 .maxlen = sizeof(unsigned int),
232 .mode = 0644,
233 .proc_handler = proc_dointvec_jiffies,
234 },
235 { }
236};
237#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
238#endif /* CONFIG_SYSCTL */ 221#endif /* CONFIG_SYSCTL */
239 222
240static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, 223static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -254,27 +237,8 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
254 return 0; 237 return 0;
255} 238}
256 239
257static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
258 struct nf_udp_net *un)
259{
260#ifdef CONFIG_SYSCTL
261#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
262 pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table,
263 sizeof(udp_compat_sysctl_table),
264 GFP_KERNEL);
265 if (!pn->ctl_compat_table)
266 return -ENOMEM;
267
268 pn->ctl_compat_table[0].data = &un->timeouts[UDP_CT_UNREPLIED];
269 pn->ctl_compat_table[1].data = &un->timeouts[UDP_CT_REPLIED];
270#endif
271#endif
272 return 0;
273}
274
275static int udp_init_net(struct net *net, u_int16_t proto) 240static int udp_init_net(struct net *net, u_int16_t proto)
276{ 241{
277 int ret;
278 struct nf_udp_net *un = udp_pernet(net); 242 struct nf_udp_net *un = udp_pernet(net);
279 struct nf_proto_net *pn = &un->pn; 243 struct nf_proto_net *pn = &un->pn;
280 244
@@ -285,18 +249,7 @@ static int udp_init_net(struct net *net, u_int16_t proto)
285 un->timeouts[i] = udp_timeouts[i]; 249 un->timeouts[i] = udp_timeouts[i];
286 } 250 }
287 251
288 if (proto == AF_INET) { 252 return udp_kmemdup_sysctl_table(pn, un);
289 ret = udp_kmemdup_compat_sysctl_table(pn, un);
290 if (ret < 0)
291 return ret;
292
293 ret = udp_kmemdup_sysctl_table(pn, un);
294 if (ret < 0)
295 nf_ct_kfree_compat_sysctl_table(pn);
296 } else
297 ret = udp_kmemdup_sysctl_table(pn, un);
298
299 return ret;
300} 253}
301 254
302static struct nf_proto_net *udp_get_net_proto(struct net *net) 255static struct nf_proto_net *udp_get_net_proto(struct net *net)
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 9d692f5adb94..029206e8dec4 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -54,7 +54,8 @@ static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
 	const struct udphdr *hp;
 	struct udphdr _hdr;
 
-	hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+	/* Actually only need first 4 bytes to get ports. */
+	hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
 	if (hp == NULL)
 		return false;
 
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index dff0f0cc59e4..ef7063eced7c 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -169,7 +169,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
169 s32 seqoff, ackoff; 169 s32 seqoff, ackoff;
170 struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); 170 struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
171 struct nf_ct_seqadj *this_way, *other_way; 171 struct nf_ct_seqadj *this_way, *other_way;
172 int res; 172 int res = 1;
173 173
174 this_way = &seqadj->seq[dir]; 174 this_way = &seqadj->seq[dir];
175 other_way = &seqadj->seq[!dir]; 175 other_way = &seqadj->seq[!dir];
@@ -184,27 +184,31 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
184 else 184 else
185 seqoff = this_way->offset_before; 185 seqoff = this_way->offset_before;
186 186
187 newseq = htonl(ntohl(tcph->seq) + seqoff);
188 inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
189 pr_debug("Adjusting sequence number from %u->%u\n",
190 ntohl(tcph->seq), ntohl(newseq));
191 tcph->seq = newseq;
192
193 if (!tcph->ack)
194 goto out;
195
187 if (after(ntohl(tcph->ack_seq) - other_way->offset_before, 196 if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
188 other_way->correction_pos)) 197 other_way->correction_pos))
189 ackoff = other_way->offset_after; 198 ackoff = other_way->offset_after;
190 else 199 else
191 ackoff = other_way->offset_before; 200 ackoff = other_way->offset_before;
192 201
193 newseq = htonl(ntohl(tcph->seq) + seqoff);
194 newack = htonl(ntohl(tcph->ack_seq) - ackoff); 202 newack = htonl(ntohl(tcph->ack_seq) - ackoff);
195
196 inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
197 inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 203 inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack,
198 false); 204 false);
199 205 pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n",
200 pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
201 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), 206 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
202 ntohl(newack)); 207 ntohl(newack));
203
204 tcph->seq = newseq;
205 tcph->ack_seq = newack; 208 tcph->ack_seq = newack;
206 209
207 res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo); 210 res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo);
211out:
208 spin_unlock_bh(&ct->lock); 212 spin_unlock_bh(&ct->lock);
209 213
210 return res; 214 return res;
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 7d77217de6a3..621b81c7bddc 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -83,9 +83,10 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
 static int iswordc(const char c)
 {
 	if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
-	    (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+	    (c >= '(' && c <= '+') || c == ':' || c == '<' || c == '>' ||
 	    c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
-	    c == '{' || c == '}' || c == '~')
+	    c == '{' || c == '}' || c == '~' || (c >= '-' && c <= '/') ||
+	    c == '\'')
 		return 1;
 	return 0;
 }
@@ -329,13 +330,12 @@ static const char *sip_follow_continuation(const char *dptr, const char *limit)
 static const char *sip_skip_whitespace(const char *dptr, const char *limit)
 {
 	for (; dptr < limit; dptr++) {
-		if (*dptr == ' ')
+		if (*dptr == ' ' || *dptr == '\t')
 			continue;
 		if (*dptr != '\r' && *dptr != '\n')
 			break;
 		dptr = sip_follow_continuation(dptr, limit);
-		if (dptr == NULL)
-			return NULL;
+		break;
 	}
 	return dptr;
 }
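
In effect, iswordc() now rejects ',' (the old '('..'/' range is split around it) and accepts the single quote, and sip_skip_whitespace() also skips tabs; this appears to bring the helper closer to the "word" token definition in RFC 3261, which lists the single quote but not the comma among the allowed characters.
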
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 9f267c3ffb39..5f446cd9f3fd 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -212,6 +212,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
 		return 0;
 
+	if (nf_ct_should_gc(ct)) {
+		nf_ct_kill(ct);
+		goto release;
+	}
+
 	/* we only want to print DIR_ORIGINAL */
 	if (NF_CT_DIRECTION(hash))
 		goto release;
@@ -228,8 +233,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	seq_printf(s, "%-8s %u %-8s %u %ld ",
 		   l3proto->name, nf_ct_l3num(ct),
 		   l4proto->name, nf_ct_protonum(ct),
-		   timer_pending(&ct->timeout)
-		   ? (long)(ct->timeout.expires - jiffies)/HZ : 0);
+		   nf_ct_expires(ct) / HZ);
 
 	if (l4proto->print_conntrack)
 		l4proto->print_conntrack(s, ct);
@@ -353,13 +357,13 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
 			"%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   nr_conntracks,
-		   st->searched,
+		   0,
 		   st->found,
-		   st->new,
+		   0,
 		   st->invalid,
 		   st->ignore,
-		   st->delete,
-		   st->delete_list,
+		   0,
+		   0,
 		   st->insert,
 		   st->insert_failed,
 		   st->drop,
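
These ct_seq_show()/ct_cpu_seq_show() hunks reflect the removal of the per-conntrack struct timer_list in this cycle: entries are now garbage-collected on access, the remaining lifetime is derived from the ct->timeout deadline, and the obsolete per-cpu counters are printed as zeroes so the /proc/net/stat layout stays stable. The replacement helper looks roughly like this (a sketch inferred from context, not copied from the patch):

/* Sketch (assumption): with no per-conntrack timer, the remaining lifetime
 * is the signed distance between the timeout deadline and "now". */
static inline unsigned long nf_ct_expires(const struct nf_conn *ct)
{
	s32 timeout = ct->timeout - nfct_time_stamp;

	return timeout > 0 ? timeout : 0;
}
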
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 065522564ac6..e0adb5959342 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -13,13 +13,13 @@
 
 
 /* core.c */
-unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb,
-			struct nf_hook_state *state, struct nf_hook_ops **elemp);
+unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state,
+			struct nf_hook_entry **entryp);
 
 /* nf_queue.c */
-int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem,
-	     struct nf_hook_state *state, unsigned int queuenum);
-void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops);
+int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
+	     unsigned int queuenum);
+void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry);
 int __init netfilter_queue_init(void);
 
 /* nf_log.c */
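
These prototype changes track the switch from iterating a list of nf_hook_ops to walking a per-netns singly linked chain of nf_hook_entry nodes; queued packets now remember their position as an entry pointer rather than an ops pointer. The node has roughly this shape (shown for orientation only, an assumption rather than a quote of the header):

/* Rough shape of the per-hook list node in this kernel generation
 * (see include/linux/netfilter.h). */
struct nf_hook_entry {
	struct nf_hook_entry __rcu	*next;
	struct nf_hook_ops		ops;
	const struct nf_hook_ops	*orig_ops;
};
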
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index aa5847a16713..3dca90dc24ad 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -39,12 +39,12 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
 	return NULL;
 }
 
-void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
+int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
 {
 	const struct nf_logger *log;
 
-	if (pf == NFPROTO_UNSPEC)
-		return;
+	if (pf == NFPROTO_UNSPEC || pf >= ARRAY_SIZE(net->nf.nf_loggers))
+		return -EOPNOTSUPP;
 
 	mutex_lock(&nf_log_mutex);
 	log = nft_log_dereference(net->nf.nf_loggers[pf]);
@@ -52,6 +52,8 @@ void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
 		rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
 
 	mutex_unlock(&nf_log_mutex);
+
+	return 0;
 }
 EXPORT_SYMBOL(nf_log_set);
 
@@ -420,7 +422,7 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write,
 	char buf[NFLOGGER_NAME_LEN];
 	int r = 0;
 	int tindex = (unsigned long)table->extra1;
-	struct net *net = current->nsproxy->net_ns;
+	struct net *net = table->extra2;
 
 	if (write) {
 		struct ctl_table tmp = *table;
@@ -474,7 +476,6 @@ static int netfilter_log_sysctl_init(struct net *net)
 				 3, "%d", i);
 			nf_log_sysctl_table[i].procname =
 				nf_log_sysctl_fnames[i];
-			nf_log_sysctl_table[i].data = NULL;
 			nf_log_sysctl_table[i].maxlen = NFLOGGER_NAME_LEN;
 			nf_log_sysctl_table[i].mode = 0644;
 			nf_log_sysctl_table[i].proc_handler =
@@ -484,6 +485,9 @@ static int netfilter_log_sysctl_init(struct net *net)
 		}
 	}
 
+	for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
+		table[i].extra2 = net;
+
 	net->nf.nf_log_dir_header = register_net_sysctl(net,
 						"net/netfilter/nf_log",
 						table);
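
nf_log_set() now reports failure (-EOPNOTSUPP for NFPROTO_UNSPEC or an out-of-range family) instead of returning void, so callers are expected to propagate the result from their pernet init paths. A hypothetical caller sketch (module and symbol names invented for illustration):

/* Hypothetical logger module: propagate nf_log_set()'s new return value. */
static struct nf_logger example_logger = {
	.name	= "example_log",
	.type	= NF_LOG_TYPE_LOG,
};

static int __net_init example_log_net_init(struct net *net)
{
	return nf_log_set(net, NFPROTO_IPV4, &example_logger);
}
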
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index a5aa5967b8e1..119fe1cb1ea9 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -77,7 +77,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
 	nf_log_buf_add(m, "SPT=%u DPT=%u ",
 		       ntohs(th->source), ntohs(th->dest));
 	/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
-	if (logflags & XT_LOG_TCPSEQ) {
+	if (logflags & NF_LOG_TCPSEQ) {
 		nf_log_buf_add(m, "SEQ=%u ACK=%u ",
 			       ntohl(th->seq), ntohl(th->ack_seq));
 	}
@@ -107,7 +107,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
 	/* Max length: 11 "URGP=65535 " */
 	nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
 
-	if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
+	if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
 		u_int8_t _opt[60 - sizeof(struct tcphdr)];
 		const u_int8_t *op;
 		unsigned int i;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index ecee105bbada..bbb8f3df79f7 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -566,16 +566,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
 	 * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
 	 * will delete entry from already-freed table.
 	 */
-	if (!del_timer(&ct->timeout))
-		return 1;
-
 	ct->status &= ~IPS_NAT_DONE_MASK;
-
 	rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
 			       nf_nat_bysource_params);
 
-	add_timer(&ct->timeout);
-
 	/* don't delete conntrack. Although that would make things a lot
 	 * simpler, we'd end up flushing all conntracks on nat rmmod.
 	 */
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index b19ad20a705c..96964a0070e1 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -96,14 +96,14 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
96} 96}
97EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); 97EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
98 98
99void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) 99void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry)
100{ 100{
101 const struct nf_queue_handler *qh; 101 const struct nf_queue_handler *qh;
102 102
103 rcu_read_lock(); 103 rcu_read_lock();
104 qh = rcu_dereference(net->nf.queue_handler); 104 qh = rcu_dereference(net->nf.queue_handler);
105 if (qh) 105 if (qh)
106 qh->nf_hook_drop(net, ops); 106 qh->nf_hook_drop(net, entry);
107 rcu_read_unlock(); 107 rcu_read_unlock();
108} 108}
109 109
@@ -112,7 +112,6 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
112 * through nf_reinject(). 112 * through nf_reinject().
113 */ 113 */
114int nf_queue(struct sk_buff *skb, 114int nf_queue(struct sk_buff *skb,
115 struct nf_hook_ops *elem,
116 struct nf_hook_state *state, 115 struct nf_hook_state *state,
117 unsigned int queuenum) 116 unsigned int queuenum)
118{ 117{
@@ -141,7 +140,6 @@ int nf_queue(struct sk_buff *skb,
141 140
142 *entry = (struct nf_queue_entry) { 141 *entry = (struct nf_queue_entry) {
143 .skb = skb, 142 .skb = skb,
144 .elem = elem,
145 .state = *state, 143 .state = *state,
146 .size = sizeof(*entry) + afinfo->route_key_size, 144 .size = sizeof(*entry) + afinfo->route_key_size,
147 }; 145 };
@@ -165,11 +163,15 @@ err:
165 163
166void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) 164void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
167{ 165{
166 struct nf_hook_entry *hook_entry;
168 struct sk_buff *skb = entry->skb; 167 struct sk_buff *skb = entry->skb;
169 struct nf_hook_ops *elem = entry->elem;
170 const struct nf_afinfo *afinfo; 168 const struct nf_afinfo *afinfo;
169 struct nf_hook_ops *elem;
171 int err; 170 int err;
172 171
172 hook_entry = rcu_dereference(entry->state.hook_entries);
173 elem = &hook_entry->ops;
174
173 nf_queue_entry_release_refs(entry); 175 nf_queue_entry_release_refs(entry);
174 176
175 /* Continue traversal iff userspace said ok... */ 177 /* Continue traversal iff userspace said ok... */
@@ -186,8 +188,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
186 188
187 if (verdict == NF_ACCEPT) { 189 if (verdict == NF_ACCEPT) {
188 next_hook: 190 next_hook:
189 verdict = nf_iterate(entry->state.hook_list, 191 verdict = nf_iterate(skb, &entry->state, &hook_entry);
190 skb, &entry->state, &elem);
191 } 192 }
192 193
193 switch (verdict & NF_VERDICT_MASK) { 194 switch (verdict & NF_VERDICT_MASK) {
@@ -198,7 +199,8 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
198 local_bh_enable(); 199 local_bh_enable();
199 break; 200 break;
200 case NF_QUEUE: 201 case NF_QUEUE:
201 err = nf_queue(skb, elem, &entry->state, 202 RCU_INIT_POINTER(entry->state.hook_entries, hook_entry);
203 err = nf_queue(skb, &entry->state,
202 verdict >> NF_VERDICT_QBITS); 204 verdict >> NF_VERDICT_QBITS);
203 if (err < 0) { 205 if (err < 0) {
204 if (err == -ESRCH && 206 if (err == -ESRCH &&
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7e1c876c7608..b70d3ea1430e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1196,6 +1196,83 @@ static void nf_tables_chain_destroy(struct nft_chain *chain)
1196 } 1196 }
1197} 1197}
1198 1198
1199struct nft_chain_hook {
1200 u32 num;
1201 u32 priority;
1202 const struct nf_chain_type *type;
1203 struct net_device *dev;
1204};
1205
1206static int nft_chain_parse_hook(struct net *net,
1207 const struct nlattr * const nla[],
1208 struct nft_af_info *afi,
1209 struct nft_chain_hook *hook, bool create)
1210{
1211 struct nlattr *ha[NFTA_HOOK_MAX + 1];
1212 const struct nf_chain_type *type;
1213 struct net_device *dev;
1214 int err;
1215
1216 err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
1217 nft_hook_policy);
1218 if (err < 0)
1219 return err;
1220
1221 if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
1222 ha[NFTA_HOOK_PRIORITY] == NULL)
1223 return -EINVAL;
1224
1225 hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
1226 if (hook->num >= afi->nhooks)
1227 return -EINVAL;
1228
1229 hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
1230
1231 type = chain_type[afi->family][NFT_CHAIN_T_DEFAULT];
1232 if (nla[NFTA_CHAIN_TYPE]) {
1233 type = nf_tables_chain_type_lookup(afi, nla[NFTA_CHAIN_TYPE],
1234 create);
1235 if (IS_ERR(type))
1236 return PTR_ERR(type);
1237 }
1238 if (!(type->hook_mask & (1 << hook->num)))
1239 return -EOPNOTSUPP;
1240 if (!try_module_get(type->owner))
1241 return -ENOENT;
1242
1243 hook->type = type;
1244
1245 hook->dev = NULL;
1246 if (afi->flags & NFT_AF_NEEDS_DEV) {
1247 char ifname[IFNAMSIZ];
1248
1249 if (!ha[NFTA_HOOK_DEV]) {
1250 module_put(type->owner);
1251 return -EOPNOTSUPP;
1252 }
1253
1254 nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ);
1255 dev = dev_get_by_name(net, ifname);
1256 if (!dev) {
1257 module_put(type->owner);
1258 return -ENOENT;
1259 }
1260 hook->dev = dev;
1261 } else if (ha[NFTA_HOOK_DEV]) {
1262 module_put(type->owner);
1263 return -EOPNOTSUPP;
1264 }
1265
1266 return 0;
1267}
1268
1269static void nft_chain_release_hook(struct nft_chain_hook *hook)
1270{
1271 module_put(hook->type->owner);
1272 if (hook->dev != NULL)
1273 dev_put(hook->dev);
1274}
1275
1199static int nf_tables_newchain(struct net *net, struct sock *nlsk, 1276static int nf_tables_newchain(struct net *net, struct sock *nlsk,
1200 struct sk_buff *skb, const struct nlmsghdr *nlh, 1277 struct sk_buff *skb, const struct nlmsghdr *nlh,
1201 const struct nlattr * const nla[]) 1278 const struct nlattr * const nla[])
@@ -1206,10 +1283,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
1206 struct nft_table *table; 1283 struct nft_table *table;
1207 struct nft_chain *chain; 1284 struct nft_chain *chain;
1208 struct nft_base_chain *basechain = NULL; 1285 struct nft_base_chain *basechain = NULL;
1209 struct nlattr *ha[NFTA_HOOK_MAX + 1];
1210 u8 genmask = nft_genmask_next(net); 1286 u8 genmask = nft_genmask_next(net);
1211 int family = nfmsg->nfgen_family; 1287 int family = nfmsg->nfgen_family;
1212 struct net_device *dev = NULL;
1213 u8 policy = NF_ACCEPT; 1288 u8 policy = NF_ACCEPT;
1214 u64 handle = 0; 1289 u64 handle = 0;
1215 unsigned int i; 1290 unsigned int i;
@@ -1273,6 +1348,37 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
1273 if (nlh->nlmsg_flags & NLM_F_REPLACE) 1348 if (nlh->nlmsg_flags & NLM_F_REPLACE)
1274 return -EOPNOTSUPP; 1349 return -EOPNOTSUPP;
1275 1350
1351 if (nla[NFTA_CHAIN_HOOK]) {
1352 struct nft_base_chain *basechain;
1353 struct nft_chain_hook hook;
1354 struct nf_hook_ops *ops;
1355
1356 if (!(chain->flags & NFT_BASE_CHAIN))
1357 return -EBUSY;
1358
1359 err = nft_chain_parse_hook(net, nla, afi, &hook,
1360 create);
1361 if (err < 0)
1362 return err;
1363
1364 basechain = nft_base_chain(chain);
1365 if (basechain->type != hook.type) {
1366 nft_chain_release_hook(&hook);
1367 return -EBUSY;
1368 }
1369
1370 for (i = 0; i < afi->nops; i++) {
1371 ops = &basechain->ops[i];
1372 if (ops->hooknum != hook.num ||
1373 ops->priority != hook.priority ||
1374 ops->dev != hook.dev) {
1375 nft_chain_release_hook(&hook);
1376 return -EBUSY;
1377 }
1378 }
1379 nft_chain_release_hook(&hook);
1380 }
1381
1276 if (nla[NFTA_CHAIN_HANDLE] && name) { 1382 if (nla[NFTA_CHAIN_HANDLE] && name) {
1277 struct nft_chain *chain2; 1383 struct nft_chain *chain2;
1278 1384
@@ -1320,102 +1426,53 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
1320 return -EOVERFLOW; 1426 return -EOVERFLOW;
1321 1427
1322 if (nla[NFTA_CHAIN_HOOK]) { 1428 if (nla[NFTA_CHAIN_HOOK]) {
1323 const struct nf_chain_type *type; 1429 struct nft_chain_hook hook;
1324 struct nf_hook_ops *ops; 1430 struct nf_hook_ops *ops;
1325 nf_hookfn *hookfn; 1431 nf_hookfn *hookfn;
1326 u32 hooknum, priority;
1327
1328 type = chain_type[family][NFT_CHAIN_T_DEFAULT];
1329 if (nla[NFTA_CHAIN_TYPE]) {
1330 type = nf_tables_chain_type_lookup(afi,
1331 nla[NFTA_CHAIN_TYPE],
1332 create);
1333 if (IS_ERR(type))
1334 return PTR_ERR(type);
1335 }
1336 1432
1337 err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], 1433 err = nft_chain_parse_hook(net, nla, afi, &hook, create);
1338 nft_hook_policy);
1339 if (err < 0) 1434 if (err < 0)
1340 return err; 1435 return err;
1341 if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
1342 ha[NFTA_HOOK_PRIORITY] == NULL)
1343 return -EINVAL;
1344
1345 hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
1346 if (hooknum >= afi->nhooks)
1347 return -EINVAL;
1348 priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
1349
1350 if (!(type->hook_mask & (1 << hooknum)))
1351 return -EOPNOTSUPP;
1352 if (!try_module_get(type->owner))
1353 return -ENOENT;
1354 hookfn = type->hooks[hooknum];
1355
1356 if (afi->flags & NFT_AF_NEEDS_DEV) {
1357 char ifname[IFNAMSIZ];
1358
1359 if (!ha[NFTA_HOOK_DEV]) {
1360 module_put(type->owner);
1361 return -EOPNOTSUPP;
1362 }
1363
1364 nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ);
1365 dev = dev_get_by_name(net, ifname);
1366 if (!dev) {
1367 module_put(type->owner);
1368 return -ENOENT;
1369 }
1370 } else if (ha[NFTA_HOOK_DEV]) {
1371 module_put(type->owner);
1372 return -EOPNOTSUPP;
1373 }
1374 1436
1375 basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); 1437 basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
1376 if (basechain == NULL) { 1438 if (basechain == NULL) {
1377 module_put(type->owner); 1439 nft_chain_release_hook(&hook);
1378 if (dev != NULL)
1379 dev_put(dev);
1380 return -ENOMEM; 1440 return -ENOMEM;
1381 } 1441 }
1382 1442
1383 if (dev != NULL) 1443 if (hook.dev != NULL)
1384 strncpy(basechain->dev_name, dev->name, IFNAMSIZ); 1444 strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ);
1385 1445
1386 if (nla[NFTA_CHAIN_COUNTERS]) { 1446 if (nla[NFTA_CHAIN_COUNTERS]) {
1387 stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); 1447 stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
1388 if (IS_ERR(stats)) { 1448 if (IS_ERR(stats)) {
1389 module_put(type->owner); 1449 nft_chain_release_hook(&hook);
1390 kfree(basechain); 1450 kfree(basechain);
1391 if (dev != NULL)
1392 dev_put(dev);
1393 return PTR_ERR(stats); 1451 return PTR_ERR(stats);
1394 } 1452 }
1395 basechain->stats = stats; 1453 basechain->stats = stats;
1396 } else { 1454 } else {
1397 stats = netdev_alloc_pcpu_stats(struct nft_stats); 1455 stats = netdev_alloc_pcpu_stats(struct nft_stats);
1398 if (stats == NULL) { 1456 if (stats == NULL) {
1399 module_put(type->owner); 1457 nft_chain_release_hook(&hook);
1400 kfree(basechain); 1458 kfree(basechain);
1401 if (dev != NULL)
1402 dev_put(dev);
1403 return -ENOMEM; 1459 return -ENOMEM;
1404 } 1460 }
1405 rcu_assign_pointer(basechain->stats, stats); 1461 rcu_assign_pointer(basechain->stats, stats);
1406 } 1462 }
1407 1463
1408 basechain->type = type; 1464 hookfn = hook.type->hooks[hook.num];
1465 basechain->type = hook.type;
1409 chain = &basechain->chain; 1466 chain = &basechain->chain;
1410 1467
1411 for (i = 0; i < afi->nops; i++) { 1468 for (i = 0; i < afi->nops; i++) {
1412 ops = &basechain->ops[i]; 1469 ops = &basechain->ops[i];
1413 ops->pf = family; 1470 ops->pf = family;
1414 ops->hooknum = hooknum; 1471 ops->hooknum = hook.num;
1415 ops->priority = priority; 1472 ops->priority = hook.priority;
1416 ops->priv = chain; 1473 ops->priv = chain;
1417 ops->hook = afi->hooks[ops->hooknum]; 1474 ops->hook = afi->hooks[ops->hooknum];
1418 ops->dev = dev; 1475 ops->dev = hook.dev;
1419 if (hookfn) 1476 if (hookfn)
1420 ops->hook = hookfn; 1477 ops->hook = hookfn;
1421 if (afi->hook_ops_init) 1478 if (afi->hook_ops_init)
@@ -3426,12 +3483,12 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
3426} 3483}
3427 3484
3428static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, 3485static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3429 const struct nlattr *attr) 3486 const struct nlattr *attr, u32 nlmsg_flags)
3430{ 3487{
3431 struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; 3488 struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
3432 struct nft_data_desc d1, d2; 3489 struct nft_data_desc d1, d2;
3433 struct nft_set_ext_tmpl tmpl; 3490 struct nft_set_ext_tmpl tmpl;
3434 struct nft_set_ext *ext; 3491 struct nft_set_ext *ext, *ext2;
3435 struct nft_set_elem elem; 3492 struct nft_set_elem elem;
3436 struct nft_set_binding *binding; 3493 struct nft_set_binding *binding;
3437 struct nft_userdata *udata; 3494 struct nft_userdata *udata;
@@ -3558,9 +3615,19 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3558 goto err4; 3615 goto err4;
3559 3616
3560 ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; 3617 ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
3561 err = set->ops->insert(ctx->net, set, &elem); 3618 err = set->ops->insert(ctx->net, set, &elem, &ext2);
3562 if (err < 0) 3619 if (err) {
3620 if (err == -EEXIST) {
3621 if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
3622 nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
3623 memcmp(nft_set_ext_data(ext),
3624 nft_set_ext_data(ext2), set->dlen) != 0)
3625 err = -EBUSY;
3626 else if (!(nlmsg_flags & NLM_F_EXCL))
3627 err = 0;
3628 }
3563 goto err5; 3629 goto err5;
3630 }
3564 3631
3565 nft_trans_elem(trans) = elem; 3632 nft_trans_elem(trans) = elem;
3566 list_add_tail(&trans->list, &ctx->net->nft.commit_list); 3633 list_add_tail(&trans->list, &ctx->net->nft.commit_list);
@@ -3616,7 +3683,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
3616 !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) 3683 !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact))
3617 return -ENFILE; 3684 return -ENFILE;
3618 3685
3619 err = nft_add_set_elem(&ctx, set, attr); 3686 err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
3620 if (err < 0) { 3687 if (err < 0) {
3621 atomic_dec(&set->nelems); 3688 atomic_dec(&set->nelems);
3622 break; 3689 break;
@@ -4343,6 +4410,31 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 }
 
 /**
+ * nft_parse_u32_check - fetch u32 attribute and check for maximum value
+ *
+ * @attr: netlink attribute to fetch value from
+ * @max: maximum value to be stored in dest
+ * @dest: pointer to the variable
+ *
+ * Parse, check and store a given u32 netlink attribute into variable.
+ * This function returns -ERANGE if the value goes over maximum value.
+ * Otherwise a 0 is returned and the attribute value is stored in the
+ * destination variable.
+ */
+unsigned int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest)
+{
+	int val;
+
+	val = ntohl(nla_get_be32(attr));
+	if (val > max)
+		return -ERANGE;
+
+	*dest = val;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_parse_u32_check);
+
+/**
  * nft_parse_register - parse a register value from a netlink attribute
  *
  * @attr: netlink attribute
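
The new nft_parse_u32_check() helper centralises the "fetch a be32 attribute, reject values above a bound, store into a u32" pattern that several expressions previously open-coded; the nft_bitwise and nft_byteorder hunks below show the first conversions. A minimal caller sketch (the expression, attribute and struct names are invented for illustration):

/* Hypothetical expression init using the new helper; NFTA_FOO_LEN and
 * struct nft_foo do not exist in the tree, they only illustrate the call. */
static int nft_foo_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
			const struct nlattr * const tb[])
{
	struct nft_foo *priv = nft_expr_priv(expr);
	u32 len;
	int err;

	err = nft_parse_u32_check(tb[NFTA_FOO_LEN], U8_MAX, &len);
	if (err < 0)
		return err;

	priv->len = len;
	return 0;
}
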
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index fb8b5892b5ff..0dd5c695482f 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -34,7 +34,7 @@ static struct nf_loginfo trace_loginfo = {
34 .u = { 34 .u = {
35 .log = { 35 .log = {
36 .level = LOGLEVEL_WARNING, 36 .level = LOGLEVEL_WARNING,
37 .logflags = NF_LOG_MASK, 37 .logflags = NF_LOG_DEFAULT_MASK,
38 }, 38 },
39 }, 39 },
40}; 40};
@@ -93,12 +93,15 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
93 93
94 if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) 94 if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
95 ptr = skb_network_header(skb); 95 ptr = skb_network_header(skb);
96 else 96 else {
97 if (!pkt->tprot_set)
98 return false;
97 ptr = skb_network_header(skb) + pkt->xt.thoff; 99 ptr = skb_network_header(skb) + pkt->xt.thoff;
100 }
98 101
99 ptr += priv->offset; 102 ptr += priv->offset;
100 103
101 if (unlikely(ptr + priv->len >= skb_tail_pointer(skb))) 104 if (unlikely(ptr + priv->len > skb_tail_pointer(skb)))
102 return false; 105 return false;
103 106
104 *dest = 0; 107 *dest = 0;
@@ -260,8 +263,13 @@ int __init nf_tables_core_module_init(void)
260 if (err < 0) 263 if (err < 0)
261 goto err7; 264 goto err7;
262 265
263 return 0; 266 err = nft_range_module_init();
267 if (err < 0)
268 goto err8;
264 269
270 return 0;
271err8:
272 nft_dynset_module_exit();
265err7: 273err7:
266 nft_payload_module_exit(); 274 nft_payload_module_exit();
267err6: 275err6:
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index 6b5f76295d3d..f713cc205669 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -82,7 +82,10 @@ static int __init nf_tables_inet_init(void)
 {
 	int ret;
 
-	nft_register_chain_type(&filter_inet);
+	ret = nft_register_chain_type(&filter_inet);
+	if (ret < 0)
+		return ret;
+
 	ret = register_pernet_subsys(&nf_tables_inet_net_ops);
 	if (ret < 0)
 		nft_unregister_chain_type(&filter_inet);
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 75d696f11045..9e2ae424b640 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -15,78 +15,6 @@
15#include <net/netfilter/nf_tables_ipv4.h> 15#include <net/netfilter/nf_tables_ipv4.h>
16#include <net/netfilter/nf_tables_ipv6.h> 16#include <net/netfilter/nf_tables_ipv6.h>
17 17
18static inline void
19nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
20 struct sk_buff *skb,
21 const struct nf_hook_state *state)
22{
23 struct iphdr *iph, _iph;
24 u32 len, thoff;
25
26 nft_set_pktinfo(pkt, skb, state);
27
28 iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph),
29 &_iph);
30 if (!iph)
31 return;
32
33 if (iph->ihl < 5 || iph->version != 4)
34 return;
35
36 len = ntohs(iph->tot_len);
37 thoff = iph->ihl * 4;
38 if (skb->len < len)
39 return;
40 else if (len < thoff)
41 return;
42
43 pkt->tprot = iph->protocol;
44 pkt->xt.thoff = thoff;
45 pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET;
46}
47
48static inline void
49__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
50 struct sk_buff *skb,
51 const struct nf_hook_state *state)
52{
53#if IS_ENABLED(CONFIG_IPV6)
54 struct ipv6hdr *ip6h, _ip6h;
55 unsigned int thoff = 0;
56 unsigned short frag_off;
57 int protohdr;
58 u32 pkt_len;
59
60 ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h),
61 &_ip6h);
62 if (!ip6h)
63 return;
64
65 if (ip6h->version != 6)
66 return;
67
68 pkt_len = ntohs(ip6h->payload_len);
69 if (pkt_len + sizeof(*ip6h) > skb->len)
70 return;
71
72 protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
73 if (protohdr < 0)
74 return;
75
76 pkt->tprot = protohdr;
77 pkt->xt.thoff = thoff;
78 pkt->xt.fragoff = frag_off;
79#endif
80}
81
82static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
83 struct sk_buff *skb,
84 const struct nf_hook_state *state)
85{
86 nft_set_pktinfo(pkt, skb, state);
87 __nft_netdev_set_pktinfo_ipv6(pkt, skb, state);
88}
89
90static unsigned int 18static unsigned int
91nft_do_chain_netdev(void *priv, struct sk_buff *skb, 19nft_do_chain_netdev(void *priv, struct sk_buff *skb,
92 const struct nf_hook_state *state) 20 const struct nf_hook_state *state)
@@ -95,13 +23,13 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb,
95 23
96 switch (skb->protocol) { 24 switch (skb->protocol) {
97 case htons(ETH_P_IP): 25 case htons(ETH_P_IP):
98 nft_netdev_set_pktinfo_ipv4(&pkt, skb, state); 26 nft_set_pktinfo_ipv4_validate(&pkt, skb, state);
99 break; 27 break;
100 case htons(ETH_P_IPV6): 28 case htons(ETH_P_IPV6):
101 nft_netdev_set_pktinfo_ipv6(&pkt, skb, state); 29 nft_set_pktinfo_ipv6_validate(&pkt, skb, state);
102 break; 30 break;
103 default: 31 default:
104 nft_set_pktinfo(&pkt, skb, state); 32 nft_set_pktinfo_unspec(&pkt, skb, state);
105 break; 33 break;
106 } 34 }
107 35
@@ -221,14 +149,25 @@ static int __init nf_tables_netdev_init(void)
221{ 149{
222 int ret; 150 int ret;
223 151
224 nft_register_chain_type(&nft_filter_chain_netdev); 152 ret = nft_register_chain_type(&nft_filter_chain_netdev);
225 ret = register_pernet_subsys(&nf_tables_netdev_net_ops); 153 if (ret)
226 if (ret < 0) {
227 nft_unregister_chain_type(&nft_filter_chain_netdev);
228 return ret; 154 return ret;
229 } 155
230 register_netdevice_notifier(&nf_tables_netdev_notifier); 156 ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
157 if (ret)
158 goto err1;
159
160 ret = register_netdevice_notifier(&nf_tables_netdev_notifier);
161 if (ret)
162 goto err2;
163
231 return 0; 164 return 0;
165
166err2:
167 unregister_pernet_subsys(&nf_tables_netdev_net_ops);
168err1:
169 nft_unregister_chain_type(&nft_filter_chain_netdev);
170 return ret;
232} 171}
233 172
234static void __exit nf_tables_netdev_exit(void) 173static void __exit nf_tables_netdev_exit(void)
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index fa24a5b398b1..ab695f8e2d29 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -113,20 +113,22 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
113 const struct nft_pktinfo *pkt) 113 const struct nft_pktinfo *pkt)
114{ 114{
115 const struct sk_buff *skb = pkt->skb; 115 const struct sk_buff *skb = pkt->skb;
116 unsigned int len = min_t(unsigned int,
117 pkt->xt.thoff - skb_network_offset(skb),
118 NFT_TRACETYPE_NETWORK_HSIZE);
119 int off = skb_network_offset(skb); 116 int off = skb_network_offset(skb);
117 unsigned int len, nh_end;
120 118
119 nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len;
120 len = min_t(unsigned int, nh_end - skb_network_offset(skb),
121 NFT_TRACETYPE_NETWORK_HSIZE);
121 if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len)) 122 if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len))
122 return -1; 123 return -1;
123 124
124 len = min_t(unsigned int, skb->len - pkt->xt.thoff, 125 if (pkt->tprot_set) {
125 NFT_TRACETYPE_TRANSPORT_HSIZE); 126 len = min_t(unsigned int, skb->len - pkt->xt.thoff,
126 127 NFT_TRACETYPE_TRANSPORT_HSIZE);
127 if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, 128 if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
128 pkt->xt.thoff, len)) 129 pkt->xt.thoff, len))
129 return -1; 130 return -1;
131 }
130 132
131 if (!skb_mac_header_was_set(skb)) 133 if (!skb_mac_header_was_set(skb))
132 return 0; 134 return 0;
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index e924e95fcc7f..3b79f34b5095 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -43,7 +43,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
 	if (help == NULL)
 		return NF_DROP;
 
-	/* rcu_read_lock()ed by nf_hook_slow */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	helper = rcu_dereference(help->helper);
 	if (helper == NULL)
 		return NF_DROP;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 6577db524ef6..eb086a192c5a 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -442,7 +442,9 @@ __build_packet_message(struct nfnl_log_net *log,
 		if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
 				 htonl(indev->ifindex)) ||
 		    /* this is the bridge group "brX" */
-		    /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+		    /* rcu_read_lock()ed by nf_hook_thresh or
+		     * nf_log_packet.
+		     */
 		    nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
 				 htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
 			goto nla_put_failure;
@@ -477,7 +479,9 @@ __build_packet_message(struct nfnl_log_net *log,
 		if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
 				 htonl(outdev->ifindex)) ||
 		    /* this is the bridge group "brX" */
-		    /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+		    /* rcu_read_lock()ed by nf_hook_thresh or
+		     * nf_log_packet.
+		     */
 		    nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
 				 htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
 			goto nla_put_failure;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f49f45081acb..af832c526048 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -740,7 +740,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
740 struct net *net = entry->state.net; 740 struct net *net = entry->state.net;
741 struct nfnl_queue_net *q = nfnl_queue_pernet(net); 741 struct nfnl_queue_net *q = nfnl_queue_pernet(net);
742 742
743 /* rcu_read_lock()ed by nf_hook_slow() */ 743 /* rcu_read_lock()ed by nf_hook_thresh */
744 queue = instance_lookup(q, queuenum); 744 queue = instance_lookup(q, queuenum);
745 if (!queue) 745 if (!queue)
746 return -ESRCH; 746 return -ESRCH;
@@ -917,12 +917,14 @@ static struct notifier_block nfqnl_dev_notifier = {
 	.notifier_call = nfqnl_rcv_dev_event,
 };
 
-static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr)
+static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr)
 {
-	return entry->elem == (struct nf_hook_ops *)ops_ptr;
+	return rcu_access_pointer(entry->state.hook_entries) ==
+	       (struct nf_hook_entry *)entry_ptr;
 }
 
-static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook)
+static void nfqnl_nf_hook_drop(struct net *net,
+			       const struct nf_hook_entry *hook)
 {
 	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
 	int i;
@@ -1522,9 +1524,16 @@ static int __init nfnetlink_queue_init(void)
 		goto cleanup_netlink_notifier;
 	}
 
-	register_netdevice_notifier(&nfqnl_dev_notifier);
+	status = register_netdevice_notifier(&nfqnl_dev_notifier);
+	if (status < 0) {
+		pr_err("nf_queue: failed to register netdevice notifier\n");
+		goto cleanup_netlink_subsys;
+	}
+
 	return status;
 
+cleanup_netlink_subsys:
+	nfnetlink_subsys_unregister(&nfqnl_subsys);
 cleanup_netlink_notifier:
 	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
 	unregister_pernet_subsys(&nfnl_queue_net_ops);
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index d71cc18fa35d..31c15ed2e5fc 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -52,6 +52,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
52{ 52{
53 struct nft_bitwise *priv = nft_expr_priv(expr); 53 struct nft_bitwise *priv = nft_expr_priv(expr);
54 struct nft_data_desc d1, d2; 54 struct nft_data_desc d1, d2;
55 u32 len;
55 int err; 56 int err;
56 57
57 if (tb[NFTA_BITWISE_SREG] == NULL || 58 if (tb[NFTA_BITWISE_SREG] == NULL ||
@@ -61,7 +62,12 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
 	    tb[NFTA_BITWISE_XOR] == NULL)
 		return -EINVAL;
 
-	priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));
+	err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
+	if (err < 0)
+		return err;
+
+	priv->len = len;
+
 	priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]);
 	err = nft_validate_register_load(priv->sreg, priv->len);
 	if (err < 0)
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index b78c28ba465f..ee63d981268d 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -99,6 +99,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
99 const struct nlattr * const tb[]) 99 const struct nlattr * const tb[])
100{ 100{
101 struct nft_byteorder *priv = nft_expr_priv(expr); 101 struct nft_byteorder *priv = nft_expr_priv(expr);
102 u32 size, len;
102 int err; 103 int err;
103 104
104 if (tb[NFTA_BYTEORDER_SREG] == NULL || 105 if (tb[NFTA_BYTEORDER_SREG] == NULL ||
@@ -117,7 +118,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
 		return -EINVAL;
 	}
 
-	priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE]));
+	err = nft_parse_u32_check(tb[NFTA_BYTEORDER_SIZE], U8_MAX, &size);
+	if (err < 0)
+		return err;
+
+	priv->size = size;
+
 	switch (priv->size) {
 	case 2:
 	case 4:
@@ -128,7 +134,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
 	}
 
 	priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]);
-	priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN]));
+	err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len);
+	if (err < 0)
+		return err;
+
+	priv->len = len;
+
 	err = nft_validate_register_load(priv->sreg, priv->len);
 	if (err < 0)
 		return err;
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index e25b35d70e4d..2e53739812b1 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -84,6 +84,9 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
84 if (err < 0) 84 if (err < 0)
85 return err; 85 return err;
86 86
87 if (desc.len > U8_MAX)
88 return -ERANGE;
89
87 priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); 90 priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
88 priv->len = desc.len; 91 priv->len = desc.len;
89 return 0; 92 return 0;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 51e180f2a003..d7b0d171172a 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -128,15 +128,18 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
 		memcpy(dest, &count, sizeof(count));
 		return;
 	}
+	case NFT_CT_L3PROTOCOL:
+		*dest = nf_ct_l3num(ct);
+		return;
+	case NFT_CT_PROTOCOL:
+		*dest = nf_ct_protonum(ct);
+		return;
 	default:
 		break;
 	}
 
 	tuple = &ct->tuplehash[priv->dir].tuple;
 	switch (priv->key) {
-	case NFT_CT_L3PROTOCOL:
-		*dest = nf_ct_l3num(ct);
-		return;
 	case NFT_CT_SRC:
 		memcpy(dest, tuple->src.u3.all,
 		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
@@ -145,9 +148,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
145 memcpy(dest, tuple->dst.u3.all, 148 memcpy(dest, tuple->dst.u3.all,
146 nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); 149 nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
147 return; 150 return;
148 case NFT_CT_PROTOCOL:
149 *dest = nf_ct_protonum(ct);
150 return;
151 case NFT_CT_PROTO_SRC: 151 case NFT_CT_PROTO_SRC:
152 *dest = (__force __u16)tuple->src.u.all; 152 *dest = (__force __u16)tuple->src.u.all;
153 return; 153 return;
@@ -283,8 +283,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 
 	case NFT_CT_L3PROTOCOL:
 	case NFT_CT_PROTOCOL:
-		if (tb[NFTA_CT_DIRECTION] == NULL)
-			return -EINVAL;
+		/* For compatibility, do not report error if NFTA_CT_DIRECTION
+		 * attribute is specified.
+		 */
 		len = sizeof(u8);
 		break;
 	case NFT_CT_SRC:
@@ -363,6 +364,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
363 switch (priv->key) { 364 switch (priv->key) {
364#ifdef CONFIG_NF_CONNTRACK_MARK 365#ifdef CONFIG_NF_CONNTRACK_MARK
365 case NFT_CT_MARK: 366 case NFT_CT_MARK:
367 if (tb[NFTA_CT_DIRECTION])
368 return -EINVAL;
366 len = FIELD_SIZEOF(struct nf_conn, mark); 369 len = FIELD_SIZEOF(struct nf_conn, mark);
367 break; 370 break;
368#endif 371#endif
@@ -432,8 +435,6 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
432 goto nla_put_failure; 435 goto nla_put_failure;
433 436
434 switch (priv->key) { 437 switch (priv->key) {
435 case NFT_CT_L3PROTOCOL:
436 case NFT_CT_PROTOCOL:
437 case NFT_CT_SRC: 438 case NFT_CT_SRC:
438 case NFT_CT_DST: 439 case NFT_CT_DST:
439 case NFT_CT_PROTO_SRC: 440 case NFT_CT_PROTO_SRC:
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 0af26699bf04..e3b83c31da2e 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -22,6 +22,7 @@ struct nft_dynset {
22 enum nft_dynset_ops op:8; 22 enum nft_dynset_ops op:8;
23 enum nft_registers sreg_key:8; 23 enum nft_registers sreg_key:8;
24 enum nft_registers sreg_data:8; 24 enum nft_registers sreg_data:8;
25 bool invert;
25 u64 timeout; 26 u64 timeout;
26 struct nft_expr *expr; 27 struct nft_expr *expr;
27 struct nft_set_binding binding; 28 struct nft_set_binding binding;
@@ -82,10 +83,14 @@ static void nft_dynset_eval(const struct nft_expr *expr,
 
 		if (sexpr != NULL)
 			sexpr->ops->eval(sexpr, regs, pkt);
+
+		if (priv->invert)
+			regs->verdict.code = NFT_BREAK;
 		return;
 	}
 out:
-	regs->verdict.code = NFT_BREAK;
+	if (!priv->invert)
+		regs->verdict.code = NFT_BREAK;
 }
 
 static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
@@ -96,6 +101,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
96 [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, 101 [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 },
97 [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, 102 [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 },
98 [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED }, 103 [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED },
104 [NFTA_DYNSET_FLAGS] = { .type = NLA_U32 },
99}; 105};
100 106
101static int nft_dynset_init(const struct nft_ctx *ctx, 107static int nft_dynset_init(const struct nft_ctx *ctx,
@@ -113,6 +119,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
113 tb[NFTA_DYNSET_SREG_KEY] == NULL) 119 tb[NFTA_DYNSET_SREG_KEY] == NULL)
114 return -EINVAL; 120 return -EINVAL;
115 121
122 if (tb[NFTA_DYNSET_FLAGS]) {
123 u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS]));
124
125 if (flags & ~NFT_DYNSET_F_INV)
126 return -EINVAL;
127 if (flags & NFT_DYNSET_F_INV)
128 priv->invert = true;
129 }
130
116 set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME], 131 set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME],
117 genmask); 132 genmask);
118 if (IS_ERR(set)) { 133 if (IS_ERR(set)) {
@@ -220,6 +235,7 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx,
220static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) 235static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
221{ 236{
222 const struct nft_dynset *priv = nft_expr_priv(expr); 237 const struct nft_dynset *priv = nft_expr_priv(expr);
238 u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0;
223 239
224 if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key)) 240 if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key))
225 goto nla_put_failure; 241 goto nla_put_failure;
@@ -235,6 +251,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
235 goto nla_put_failure; 251 goto nla_put_failure;
236 if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) 252 if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
237 goto nla_put_failure; 253 goto nla_put_failure;
254 if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags)))
255 goto nla_put_failure;
238 return 0; 256 return 0;
239 257
240nla_put_failure: 258nla_put_failure:
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 82c264e40278..a84cf3d66056 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -59,7 +59,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
 			   const struct nlattr * const tb[])
 {
 	struct nft_exthdr *priv = nft_expr_priv(expr);
-	u32 offset, len;
+	u32 offset, len, err;
 
 	if (tb[NFTA_EXTHDR_DREG] == NULL ||
 	    tb[NFTA_EXTHDR_TYPE] == NULL ||
@@ -67,11 +67,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
 	    tb[NFTA_EXTHDR_LEN] == NULL)
 		return -EINVAL;
 
-	offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
-	len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+	err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset);
+	if (err < 0)
+		return err;
 
-	if (offset > U8_MAX || len > U8_MAX)
-		return -ERANGE;
+	err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len);
+	if (err < 0)
+		return err;
 
 	priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
 	priv->offset = offset;
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 564fa7929ed5..09473b415b95 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -1,395 +1,145 @@
1/* 1/*
2 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> 2 * Copyright (c) 2016 Laura Garcia <nevola@gmail.com>
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation. 6 * published by the Free Software Foundation.
7 * 7 *
8 * Development of this code funded by Astaro AG (http://www.astaro.com/)
9 */ 8 */
10 9
11#include <linux/kernel.h> 10#include <linux/kernel.h>
12#include <linux/init.h> 11#include <linux/init.h>
13#include <linux/module.h> 12#include <linux/module.h>
14#include <linux/list.h>
15#include <linux/log2.h>
16#include <linux/jhash.h>
17#include <linux/netlink.h> 13#include <linux/netlink.h>
18#include <linux/workqueue.h>
19#include <linux/rhashtable.h>
20#include <linux/netfilter.h> 14#include <linux/netfilter.h>
21#include <linux/netfilter/nf_tables.h> 15#include <linux/netfilter/nf_tables.h>
22#include <net/netfilter/nf_tables.h> 16#include <net/netfilter/nf_tables.h>
23 17#include <net/netfilter/nf_tables_core.h>
24/* We target a hash table size of 4, element hint is 75% of final size */ 18#include <linux/jhash.h>
25#define NFT_HASH_ELEMENT_HINT 3
26 19
27struct nft_hash { 20struct nft_hash {
28 struct rhashtable ht; 21 enum nft_registers sreg:8;
29 struct delayed_work gc_work; 22 enum nft_registers dreg:8;
30}; 23 u8 len;
31 24 u32 modulus;
32struct nft_hash_elem { 25 u32 seed;
33 struct rhash_head node; 26 u32 offset;
34 struct nft_set_ext ext;
35}; 27};
36 28
37struct nft_hash_cmp_arg { 29static void nft_hash_eval(const struct nft_expr *expr,
38 const struct nft_set *set; 30 struct nft_regs *regs,
39 const u32 *key; 31 const struct nft_pktinfo *pkt)
40 u8 genmask;
41};
42
43static const struct rhashtable_params nft_hash_params;
44
45static inline u32 nft_hash_key(const void *data, u32 len, u32 seed)
46{
47 const struct nft_hash_cmp_arg *arg = data;
48
49 return jhash(arg->key, len, seed);
50}
51
52static inline u32 nft_hash_obj(const void *data, u32 len, u32 seed)
53{
54 const struct nft_hash_elem *he = data;
55
56 return jhash(nft_set_ext_key(&he->ext), len, seed);
57}
58
59static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg,
60 const void *ptr)
61{
62 const struct nft_hash_cmp_arg *x = arg->key;
63 const struct nft_hash_elem *he = ptr;
64
65 if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
66 return 1;
67 if (nft_set_elem_expired(&he->ext))
68 return 1;
69 if (!nft_set_elem_active(&he->ext, x->genmask))
70 return 1;
71 return 0;
72}
73
74static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
75 const u32 *key, const struct nft_set_ext **ext)
76{
77 struct nft_hash *priv = nft_set_priv(set);
78 const struct nft_hash_elem *he;
79 struct nft_hash_cmp_arg arg = {
80 .genmask = nft_genmask_cur(net),
81 .set = set,
82 .key = key,
83 };
84
85 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
86 if (he != NULL)
87 *ext = &he->ext;
88
89 return !!he;
90}
91
92static bool nft_hash_update(struct nft_set *set, const u32 *key,
93 void *(*new)(struct nft_set *,
94 const struct nft_expr *,
95 struct nft_regs *regs),
96 const struct nft_expr *expr,
97 struct nft_regs *regs,
98 const struct nft_set_ext **ext)
99{
100 struct nft_hash *priv = nft_set_priv(set);
101 struct nft_hash_elem *he;
102 struct nft_hash_cmp_arg arg = {
103 .genmask = NFT_GENMASK_ANY,
104 .set = set,
105 .key = key,
106 };
107
108 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
109 if (he != NULL)
110 goto out;
111
112 he = new(set, expr, regs);
113 if (he == NULL)
114 goto err1;
115 if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node,
116 nft_hash_params))
117 goto err2;
118out:
119 *ext = &he->ext;
120 return true;
121
122err2:
123 nft_set_elem_destroy(set, he);
124err1:
125 return false;
126}
127
128static int nft_hash_insert(const struct net *net, const struct nft_set *set,
129 const struct nft_set_elem *elem)
130{
131 struct nft_hash *priv = nft_set_priv(set);
132 struct nft_hash_elem *he = elem->priv;
133 struct nft_hash_cmp_arg arg = {
134 .genmask = nft_genmask_next(net),
135 .set = set,
136 .key = elem->key.val.data,
137 };
138
139 return rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node,
140 nft_hash_params);
141}
142
143static void nft_hash_activate(const struct net *net, const struct nft_set *set,
144 const struct nft_set_elem *elem)
145{
146 struct nft_hash_elem *he = elem->priv;
147
148 nft_set_elem_change_active(net, set, &he->ext);
149 nft_set_elem_clear_busy(&he->ext);
150}
151
152static void *nft_hash_deactivate(const struct net *net,
153 const struct nft_set *set,
154 const struct nft_set_elem *elem)
155{ 32{
156 struct nft_hash *priv = nft_set_priv(set); 33 struct nft_hash *priv = nft_expr_priv(expr);
157 struct nft_hash_elem *he; 34 const void *data = &regs->data[priv->sreg];
158 struct nft_hash_cmp_arg arg = { 35 u32 h;
159 .genmask = nft_genmask_next(net),
160 .set = set,
161 .key = elem->key.val.data,
162 };
163
164 rcu_read_lock();
165 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
166 if (he != NULL) {
167 if (!nft_set_elem_mark_busy(&he->ext) ||
168 !nft_is_active(net, &he->ext))
169 nft_set_elem_change_active(net, set, &he->ext);
170 else
171 he = NULL;
172 }
173 rcu_read_unlock();
174 36
175 return he; 37 h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus);
38 regs->data[priv->dreg] = h + priv->offset;
176} 39}
177 40
178static void nft_hash_remove(const struct nft_set *set, 41static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
179 const struct nft_set_elem *elem) 42 [NFTA_HASH_SREG] = { .type = NLA_U32 },
180{ 43 [NFTA_HASH_DREG] = { .type = NLA_U32 },
181 struct nft_hash *priv = nft_set_priv(set); 44 [NFTA_HASH_LEN] = { .type = NLA_U32 },
182 struct nft_hash_elem *he = elem->priv; 45 [NFTA_HASH_MODULUS] = { .type = NLA_U32 },
183 46 [NFTA_HASH_SEED] = { .type = NLA_U32 },
184 rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); 47};
185}
186 48
187static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, 49static int nft_hash_init(const struct nft_ctx *ctx,
188 struct nft_set_iter *iter) 50 const struct nft_expr *expr,
51 const struct nlattr * const tb[])
189{ 52{
190 struct nft_hash *priv = nft_set_priv(set); 53 struct nft_hash *priv = nft_expr_priv(expr);
191 struct nft_hash_elem *he; 54 u32 len;
192 struct rhashtable_iter hti;
193 struct nft_set_elem elem;
194 int err;
195
196 err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
197 iter->err = err;
198 if (err)
199 return;
200
201 err = rhashtable_walk_start(&hti);
202 if (err && err != -EAGAIN) {
203 iter->err = err;
204 goto out;
205 }
206
207 while ((he = rhashtable_walk_next(&hti))) {
208 if (IS_ERR(he)) {
209 err = PTR_ERR(he);
210 if (err != -EAGAIN) {
211 iter->err = err;
212 goto out;
213 }
214 55
215 continue; 56 if (!tb[NFTA_HASH_SREG] ||
216 } 57 !tb[NFTA_HASH_DREG] ||
58 !tb[NFTA_HASH_LEN] ||
59 !tb[NFTA_HASH_SEED] ||
60 !tb[NFTA_HASH_MODULUS])
61 return -EINVAL;
217 62
218 if (iter->count < iter->skip) 63 if (tb[NFTA_HASH_OFFSET])
219 goto cont; 64 priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
220 if (nft_set_elem_expired(&he->ext))
221 goto cont;
222 if (!nft_set_elem_active(&he->ext, iter->genmask))
223 goto cont;
224 65
225 elem.priv = he; 66 priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
67 priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
226 68
227 iter->err = iter->fn(ctx, set, iter, &elem); 69 len = ntohl(nla_get_be32(tb[NFTA_HASH_LEN]));
228 if (iter->err < 0) 70 if (len == 0 || len > U8_MAX)
229 goto out; 71 return -ERANGE;
230 72
231cont: 73 priv->len = len;
232 iter->count++;
233 }
234 74
235out: 75 priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
236 rhashtable_walk_stop(&hti); 76 if (priv->modulus <= 1)
237 rhashtable_walk_exit(&hti); 77 return -ERANGE;
238}
239
240static void nft_hash_gc(struct work_struct *work)
241{
242 struct nft_set *set;
243 struct nft_hash_elem *he;
244 struct nft_hash *priv;
245 struct nft_set_gc_batch *gcb = NULL;
246 struct rhashtable_iter hti;
247 int err;
248
249 priv = container_of(work, struct nft_hash, gc_work.work);
250 set = nft_set_container_of(priv);
251
252 err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
253 if (err)
254 goto schedule;
255 78
256 err = rhashtable_walk_start(&hti); 79 if (priv->offset + priv->modulus - 1 < priv->offset)
257 if (err && err != -EAGAIN) 80 return -EOVERFLOW;
258 goto out;
259 81
260 while ((he = rhashtable_walk_next(&hti))) { 82 priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
261 if (IS_ERR(he)) {
262 if (PTR_ERR(he) != -EAGAIN)
263 goto out;
264 continue;
265 }
266 83
267 if (!nft_set_elem_expired(&he->ext)) 84 return nft_validate_register_load(priv->sreg, len) &&
268 continue; 85 nft_validate_register_store(ctx, priv->dreg, NULL,
269 if (nft_set_elem_mark_busy(&he->ext)) 86 NFT_DATA_VALUE, sizeof(u32));
270 continue;
271
272 gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
273 if (gcb == NULL)
274 goto out;
275 rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
276 atomic_dec(&set->nelems);
277 nft_set_gc_batch_add(gcb, he);
278 }
279out:
280 rhashtable_walk_stop(&hti);
281 rhashtable_walk_exit(&hti);
282
283 nft_set_gc_batch_complete(gcb);
284schedule:
285 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
286 nft_set_gc_interval(set));
287} 87}
288 88
289static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) 89static int nft_hash_dump(struct sk_buff *skb,
90 const struct nft_expr *expr)
290{ 91{
291 return sizeof(struct nft_hash); 92 const struct nft_hash *priv = nft_expr_priv(expr);
292}
293
294static const struct rhashtable_params nft_hash_params = {
295 .head_offset = offsetof(struct nft_hash_elem, node),
296 .hashfn = nft_hash_key,
297 .obj_hashfn = nft_hash_obj,
298 .obj_cmpfn = nft_hash_cmp,
299 .automatic_shrinking = true,
300};
301 93
302static int nft_hash_init(const struct nft_set *set, 94 if (nft_dump_register(skb, NFTA_HASH_SREG, priv->sreg))
303 const struct nft_set_desc *desc, 95 goto nla_put_failure;
304 const struct nlattr * const tb[]) 96 if (nft_dump_register(skb, NFTA_HASH_DREG, priv->dreg))
305{ 97 goto nla_put_failure;
306 struct nft_hash *priv = nft_set_priv(set); 98 if (nla_put_be32(skb, NFTA_HASH_LEN, htonl(priv->len)))
307 struct rhashtable_params params = nft_hash_params; 99 goto nla_put_failure;
308 int err; 100 if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus)))
309 101 goto nla_put_failure;
310 params.nelem_hint = desc->size ?: NFT_HASH_ELEMENT_HINT; 102 if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
311 params.key_len = set->klen; 103 goto nla_put_failure;
312 104 if (priv->offset != 0)
313 err = rhashtable_init(&priv->ht, &params); 105 if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset)))
314 if (err < 0) 106 goto nla_put_failure;
315 return err;
316
317 INIT_DEFERRABLE_WORK(&priv->gc_work, nft_hash_gc);
318 if (set->flags & NFT_SET_TIMEOUT)
319 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
320 nft_set_gc_interval(set));
321 return 0; 107 return 0;
322}
323 108
324static void nft_hash_elem_destroy(void *ptr, void *arg) 109nla_put_failure:
325{ 110 return -1;
326 nft_set_elem_destroy((const struct nft_set *)arg, ptr);
327} 111}
328 112
329static void nft_hash_destroy(const struct nft_set *set) 113static struct nft_expr_type nft_hash_type;
330{ 114static const struct nft_expr_ops nft_hash_ops = {
331 struct nft_hash *priv = nft_set_priv(set); 115 .type = &nft_hash_type,
332 116 .size = NFT_EXPR_SIZE(sizeof(struct nft_hash)),
333 cancel_delayed_work_sync(&priv->gc_work); 117 .eval = nft_hash_eval,
334 rhashtable_free_and_destroy(&priv->ht, nft_hash_elem_destroy,
335 (void *)set);
336}
337
338static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
339 struct nft_set_estimate *est)
340{
341 unsigned int esize;
342
343 esize = sizeof(struct nft_hash_elem);
344 if (desc->size) {
345 est->size = sizeof(struct nft_hash) +
346 roundup_pow_of_two(desc->size * 4 / 3) *
347 sizeof(struct nft_hash_elem *) +
348 desc->size * esize;
349 } else {
350 /* Resizing happens when the load drops below 30% or goes
351 * above 75%. The average of 52.5% load (approximated by 50%)
352 * is used for the size estimation of the hash buckets,
353 * meaning we calculate two buckets per element.
354 */
355 est->size = esize + 2 * sizeof(struct nft_hash_elem *);
356 }
357
358 est->class = NFT_SET_CLASS_O_1;
359
360 return true;
361}
362
363static struct nft_set_ops nft_hash_ops __read_mostly = {
364 .privsize = nft_hash_privsize,
365 .elemsize = offsetof(struct nft_hash_elem, ext),
366 .estimate = nft_hash_estimate,
367 .init = nft_hash_init, 118 .init = nft_hash_init,
368 .destroy = nft_hash_destroy, 119 .dump = nft_hash_dump,
369 .insert = nft_hash_insert, 120};
370 .activate = nft_hash_activate, 121
371 .deactivate = nft_hash_deactivate, 122static struct nft_expr_type nft_hash_type __read_mostly = {
372 .remove = nft_hash_remove, 123 .name = "hash",
373 .lookup = nft_hash_lookup, 124 .ops = &nft_hash_ops,
374 .update = nft_hash_update, 125 .policy = nft_hash_policy,
375 .walk = nft_hash_walk, 126 .maxattr = NFTA_HASH_MAX,
376 .features = NFT_SET_MAP | NFT_SET_TIMEOUT,
377 .owner = THIS_MODULE, 127 .owner = THIS_MODULE,
378}; 128};
379 129
380static int __init nft_hash_module_init(void) 130static int __init nft_hash_module_init(void)
381{ 131{
382 return nft_register_set(&nft_hash_ops); 132 return nft_register_expr(&nft_hash_type);
383} 133}
384 134
385static void __exit nft_hash_module_exit(void) 135static void __exit nft_hash_module_exit(void)
386{ 136{
387 nft_unregister_set(&nft_hash_ops); 137 nft_unregister_expr(&nft_hash_type);
388} 138}
389 139
390module_init(nft_hash_module_init); 140module_init(nft_hash_module_init);
391module_exit(nft_hash_module_exit); 141module_exit(nft_hash_module_exit);
392 142
393MODULE_LICENSE("GPL"); 143MODULE_LICENSE("GPL");
394MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); 144MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
395MODULE_ALIAS_NFT_SET(); 145MODULE_ALIAS_NFT_EXPR("hash");
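
The new "hash" expression above maps the source register through jhash() and reciprocal_scale() into the range [offset, offset + modulus). A minimal userspace C sketch of that scaling step, assuming a precomputed 32-bit hash value in place of the kernel's jhash():

#include <stdint.h>
#include <stdio.h>

/* Same multiply-shift trick as the kernel's reciprocal_scale():
 * scale a full-range 32-bit value down to [0, ep_ro) without a division.
 */
static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	uint32_t hash = 0x9e3779b9;	/* stand-in for jhash(data, len, seed) */
	uint32_t modulus = 16, offset = 100;

	/* nft_hash_eval() stores this result in the destination register */
	printf("%u\n", reciprocal_scale(hash, modulus) + offset);
	return 0;
}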
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index db3b746858e3..d17018ff54e6 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -53,6 +53,10 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
53 tb[NFTA_IMMEDIATE_DATA]); 53 tb[NFTA_IMMEDIATE_DATA]);
54 if (err < 0) 54 if (err < 0)
55 return err; 55 return err;
56
57 if (desc.len > U8_MAX)
58 return -ERANGE;
59
56 priv->dlen = desc.len; 60 priv->dlen = desc.len;
57 61
58 priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]); 62 priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]);
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 070b98938e02..c6baf412236d 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -145,7 +145,7 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
145 if (err < 0) 145 if (err < 0)
146 return err; 146 return err;
147 147
148 priv->cost = div_u64(priv->limit.nsecs, priv->limit.rate); 148 priv->cost = div64_u64(priv->limit.nsecs, priv->limit.rate);
149 return 0; 149 return 0;
150} 150}
151 151
@@ -170,7 +170,7 @@ static void nft_limit_pkt_bytes_eval(const struct nft_expr *expr,
170 const struct nft_pktinfo *pkt) 170 const struct nft_pktinfo *pkt)
171{ 171{
172 struct nft_limit *priv = nft_expr_priv(expr); 172 struct nft_limit *priv = nft_expr_priv(expr);
173 u64 cost = div_u64(priv->nsecs * pkt->skb->len, priv->rate); 173 u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate);
174 174
175 if (nft_limit_eval(priv, cost)) 175 if (nft_limit_eval(priv, cost))
176 regs->verdict.code = NFT_BREAK; 176 regs->verdict.code = NFT_BREAK;
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 24a73bb26e94..1b01404bb33f 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -58,8 +58,11 @@ static int nft_log_init(const struct nft_ctx *ctx,
 	if (tb[NFTA_LOG_LEVEL] != NULL &&
 	    tb[NFTA_LOG_GROUP] != NULL)
 		return -EINVAL;
-	if (tb[NFTA_LOG_GROUP] != NULL)
+	if (tb[NFTA_LOG_GROUP] != NULL) {
 		li->type = NF_LOG_TYPE_ULOG;
+		if (tb[NFTA_LOG_FLAGS] != NULL)
+			return -EINVAL;
+	}
 
 	nla = tb[NFTA_LOG_PREFIX];
 	if (nla != NULL) {
@@ -87,6 +90,10 @@ static int nft_log_init(const struct nft_ctx *ctx,
87 if (tb[NFTA_LOG_FLAGS] != NULL) { 90 if (tb[NFTA_LOG_FLAGS] != NULL) {
88 li->u.log.logflags = 91 li->u.log.logflags =
89 ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS])); 92 ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS]));
93 if (li->u.log.logflags & ~NF_LOG_MASK) {
94 err = -EINVAL;
95 goto err1;
96 }
90 } 97 }
91 break; 98 break;
92 case NF_LOG_TYPE_ULOG: 99 case NF_LOG_TYPE_ULOG:
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index e164325d1bc0..8166b6994cc7 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -43,7 +43,7 @@ static void nft_lookup_eval(const struct nft_expr *expr,
43 return; 43 return;
44 } 44 }
45 45
46 if (found && set->flags & NFT_SET_MAP) 46 if (set->flags & NFT_SET_MAP)
47 nft_data_copy(&regs->data[priv->dreg], 47 nft_data_copy(&regs->data[priv->dreg],
48 nft_set_ext_data(ext), set->dlen); 48 nft_set_ext_data(ext), set->dlen);
49 49
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 8a6bc7630912..6c1e0246706e 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -52,6 +52,8 @@ void nft_meta_get_eval(const struct nft_expr *expr,
52 *dest = pkt->pf; 52 *dest = pkt->pf;
53 break; 53 break;
54 case NFT_META_L4PROTO: 54 case NFT_META_L4PROTO:
55 if (!pkt->tprot_set)
56 goto err;
55 *dest = pkt->tprot; 57 *dest = pkt->tprot;
56 break; 58 break;
57 case NFT_META_PRIORITY: 59 case NFT_META_PRIORITY:
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
new file mode 100644
index 000000000000..55bc5ab78d4a
--- /dev/null
+++ b/net/netfilter/nft_numgen.c
@@ -0,0 +1,212 @@
1/*
2 * Copyright (c) 2016 Laura Garcia <nevola@gmail.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 */
9
10#include <linux/kernel.h>
11#include <linux/init.h>
12#include <linux/module.h>
13#include <linux/netlink.h>
14#include <linux/netfilter.h>
15#include <linux/netfilter/nf_tables.h>
16#include <linux/static_key.h>
17#include <net/netfilter/nf_tables.h>
18#include <net/netfilter/nf_tables_core.h>
19
20static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
21
22struct nft_ng_inc {
23 enum nft_registers dreg:8;
24 u32 modulus;
25 atomic_t counter;
26 u32 offset;
27};
28
29static void nft_ng_inc_eval(const struct nft_expr *expr,
30 struct nft_regs *regs,
31 const struct nft_pktinfo *pkt)
32{
33 struct nft_ng_inc *priv = nft_expr_priv(expr);
34 u32 nval, oval;
35
36 do {
37 oval = atomic_read(&priv->counter);
38 nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
39 } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
40
41 regs->data[priv->dreg] = nval + priv->offset;
42}
43
44static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
45 [NFTA_NG_DREG] = { .type = NLA_U32 },
46 [NFTA_NG_MODULUS] = { .type = NLA_U32 },
47 [NFTA_NG_TYPE] = { .type = NLA_U32 },
48 [NFTA_NG_OFFSET] = { .type = NLA_U32 },
49};
50
51static int nft_ng_inc_init(const struct nft_ctx *ctx,
52 const struct nft_expr *expr,
53 const struct nlattr * const tb[])
54{
55 struct nft_ng_inc *priv = nft_expr_priv(expr);
56
57 if (tb[NFTA_NG_OFFSET])
58 priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
59
60 priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
61 if (priv->modulus == 0)
62 return -ERANGE;
63
64 if (priv->offset + priv->modulus - 1 < priv->offset)
65 return -EOVERFLOW;
66
67 priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
68 atomic_set(&priv->counter, 0);
69
70 return nft_validate_register_store(ctx, priv->dreg, NULL,
71 NFT_DATA_VALUE, sizeof(u32));
72}
73
74static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
75 u32 modulus, enum nft_ng_types type, u32 offset)
76{
77 if (nft_dump_register(skb, NFTA_NG_DREG, dreg))
78 goto nla_put_failure;
79 if (nla_put_be32(skb, NFTA_NG_MODULUS, htonl(modulus)))
80 goto nla_put_failure;
81 if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type)))
82 goto nla_put_failure;
83 if (nla_put_be32(skb, NFTA_NG_OFFSET, htonl(offset)))
84 goto nla_put_failure;
85
86 return 0;
87
88nla_put_failure:
89 return -1;
90}
91
92static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
93{
94 const struct nft_ng_inc *priv = nft_expr_priv(expr);
95
96 return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL,
97 priv->offset);
98}
99
100struct nft_ng_random {
101 enum nft_registers dreg:8;
102 u32 modulus;
103 u32 offset;
104};
105
106static void nft_ng_random_eval(const struct nft_expr *expr,
107 struct nft_regs *regs,
108 const struct nft_pktinfo *pkt)
109{
110 struct nft_ng_random *priv = nft_expr_priv(expr);
111 struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
112 u32 val;
113
114 val = reciprocal_scale(prandom_u32_state(state), priv->modulus);
115 regs->data[priv->dreg] = val + priv->offset;
116}
117
118static int nft_ng_random_init(const struct nft_ctx *ctx,
119 const struct nft_expr *expr,
120 const struct nlattr * const tb[])
121{
122 struct nft_ng_random *priv = nft_expr_priv(expr);
123
124 if (tb[NFTA_NG_OFFSET])
125 priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
126
127 priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
128 if (priv->modulus == 0)
129 return -ERANGE;
130
131 if (priv->offset + priv->modulus - 1 < priv->offset)
132 return -EOVERFLOW;
133
134 prandom_init_once(&nft_numgen_prandom_state);
135
136 priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
137
138 return nft_validate_register_store(ctx, priv->dreg, NULL,
139 NFT_DATA_VALUE, sizeof(u32));
140}
141
142static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
143{
144 const struct nft_ng_random *priv = nft_expr_priv(expr);
145
146 return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM,
147 priv->offset);
148}
149
150static struct nft_expr_type nft_ng_type;
151static const struct nft_expr_ops nft_ng_inc_ops = {
152 .type = &nft_ng_type,
153 .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
154 .eval = nft_ng_inc_eval,
155 .init = nft_ng_inc_init,
156 .dump = nft_ng_inc_dump,
157};
158
159static const struct nft_expr_ops nft_ng_random_ops = {
160 .type = &nft_ng_type,
161 .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
162 .eval = nft_ng_random_eval,
163 .init = nft_ng_random_init,
164 .dump = nft_ng_random_dump,
165};
166
167static const struct nft_expr_ops *
168nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
169{
170 u32 type;
171
172 if (!tb[NFTA_NG_DREG] ||
173 !tb[NFTA_NG_MODULUS] ||
174 !tb[NFTA_NG_TYPE])
175 return ERR_PTR(-EINVAL);
176
177 type = ntohl(nla_get_be32(tb[NFTA_NG_TYPE]));
178
179 switch (type) {
180 case NFT_NG_INCREMENTAL:
181 return &nft_ng_inc_ops;
182 case NFT_NG_RANDOM:
183 return &nft_ng_random_ops;
184 }
185
186 return ERR_PTR(-EINVAL);
187}
188
189static struct nft_expr_type nft_ng_type __read_mostly = {
190 .name = "numgen",
191 .select_ops = &nft_ng_select_ops,
192 .policy = nft_ng_policy,
193 .maxattr = NFTA_NG_MAX,
194 .owner = THIS_MODULE,
195};
196
197static int __init nft_ng_module_init(void)
198{
199 return nft_register_expr(&nft_ng_type);
200}
201
202static void __exit nft_ng_module_exit(void)
203{
204 nft_unregister_expr(&nft_ng_type);
205}
206
207module_init(nft_ng_module_init);
208module_exit(nft_ng_module_exit);
209
210MODULE_LICENSE("GPL");
211MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
212MODULE_ALIAS_NFT_EXPR("numgen");
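
The incremental generator above wraps a shared counter at modulus without taking a lock: read the counter, compute the wrapped successor, and retry the compare-and-swap if another CPU raced in between. A small userspace sketch of that loop, with C11 atomics standing in for the kernel's atomic_t and the offset handling left out:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t counter;

static uint32_t ng_inc(uint32_t modulus)
{
	uint32_t oval, nval;

	do {
		oval = atomic_load(&counter);
		nval = (oval + 1 < modulus) ? oval + 1 : 0;	/* wrap at modulus */
	} while (!atomic_compare_exchange_weak(&counter, &oval, nval));

	return nval;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		printf("%u\n", ng_inc(3));	/* prints 1 2 0 1 2 */
	return 0;
}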
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 12cd4bf16d17..b2f88617611a 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -92,6 +92,8 @@ static void nft_payload_eval(const struct nft_expr *expr,
92 offset = skb_network_offset(skb); 92 offset = skb_network_offset(skb);
93 break; 93 break;
94 case NFT_PAYLOAD_TRANSPORT_HEADER: 94 case NFT_PAYLOAD_TRANSPORT_HEADER:
95 if (!pkt->tprot_set)
96 goto err;
95 offset = pkt->xt.thoff; 97 offset = pkt->xt.thoff;
96 break; 98 break;
97 default: 99 default:
@@ -184,6 +186,8 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
184 offset = skb_network_offset(skb); 186 offset = skb_network_offset(skb);
185 break; 187 break;
186 case NFT_PAYLOAD_TRANSPORT_HEADER: 188 case NFT_PAYLOAD_TRANSPORT_HEADER:
189 if (!pkt->tprot_set)
190 goto err;
187 offset = pkt->xt.thoff; 191 offset = pkt->xt.thoff;
188 break; 192 break;
189 default: 193 default:
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index 61d216eb7917..393d359a1889 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -22,9 +22,10 @@
 static u32 jhash_initval __read_mostly;
 
 struct nft_queue {
-	u16 queuenum;
-	u16 queues_total;
-	u16 flags;
+	enum nft_registers sreg_qnum:8;
+	u16 queuenum;
+	u16 queues_total;
+	u16 flags;
 };
 
 static void nft_queue_eval(const struct nft_expr *expr,
@@ -54,31 +55,78 @@ static void nft_queue_eval(const struct nft_expr *expr,
54 regs->verdict.code = ret; 55 regs->verdict.code = ret;
55} 56}
56 57
58static void nft_queue_sreg_eval(const struct nft_expr *expr,
59 struct nft_regs *regs,
60 const struct nft_pktinfo *pkt)
61{
62 struct nft_queue *priv = nft_expr_priv(expr);
63 u32 queue, ret;
64
65 queue = regs->data[priv->sreg_qnum];
66
67 ret = NF_QUEUE_NR(queue);
68 if (priv->flags & NFT_QUEUE_FLAG_BYPASS)
69 ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
70
71 regs->verdict.code = ret;
72}
73
57static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { 74static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
58 [NFTA_QUEUE_NUM] = { .type = NLA_U16 }, 75 [NFTA_QUEUE_NUM] = { .type = NLA_U16 },
59 [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 }, 76 [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 },
60 [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 }, 77 [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 },
78 [NFTA_QUEUE_SREG_QNUM] = { .type = NLA_U32 },
61}; 79};
62 80
63static int nft_queue_init(const struct nft_ctx *ctx, 81static int nft_queue_init(const struct nft_ctx *ctx,
64 const struct nft_expr *expr, 82 const struct nft_expr *expr,
65 const struct nlattr * const tb[]) 83 const struct nlattr * const tb[])
66{ 84{
67 struct nft_queue *priv = nft_expr_priv(expr); 85 struct nft_queue *priv = nft_expr_priv(expr);
86 u32 maxid;
68 87
69 if (tb[NFTA_QUEUE_NUM] == NULL)
70 return -EINVAL;
71
72 init_hashrandom(&jhash_initval);
73 priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); 88 priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM]));
74 89
75 if (tb[NFTA_QUEUE_TOTAL] != NULL) 90 if (tb[NFTA_QUEUE_TOTAL])
76 priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); 91 priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL]));
77 if (tb[NFTA_QUEUE_FLAGS] != NULL) { 92 else
93 priv->queues_total = 1;
94
95 if (priv->queues_total == 0)
96 return -EINVAL;
97
98 maxid = priv->queues_total - 1 + priv->queuenum;
99 if (maxid > U16_MAX)
100 return -ERANGE;
101
102 if (tb[NFTA_QUEUE_FLAGS]) {
103 priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
104 if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
105 return -EINVAL;
106 }
107 return 0;
108}
109
110static int nft_queue_sreg_init(const struct nft_ctx *ctx,
111 const struct nft_expr *expr,
112 const struct nlattr * const tb[])
113{
114 struct nft_queue *priv = nft_expr_priv(expr);
115 int err;
116
117 priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]);
118 err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32));
119 if (err < 0)
120 return err;
121
122 if (tb[NFTA_QUEUE_FLAGS]) {
78 priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); 123 priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
79 if (priv->flags & ~NFT_QUEUE_FLAG_MASK) 124 if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
80 return -EINVAL; 125 return -EINVAL;
126 if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT)
127 return -EOPNOTSUPP;
81 } 128 }
129
82 return 0; 130 return 0;
83} 131}
84 132
@@ -97,6 +145,21 @@ nla_put_failure:
97 return -1; 145 return -1;
98} 146}
99 147
148static int
149nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr)
150{
151 const struct nft_queue *priv = nft_expr_priv(expr);
152
153 if (nft_dump_register(skb, NFTA_QUEUE_SREG_QNUM, priv->sreg_qnum) ||
154 nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags)))
155 goto nla_put_failure;
156
157 return 0;
158
159nla_put_failure:
160 return -1;
161}
162
100static struct nft_expr_type nft_queue_type; 163static struct nft_expr_type nft_queue_type;
101static const struct nft_expr_ops nft_queue_ops = { 164static const struct nft_expr_ops nft_queue_ops = {
102 .type = &nft_queue_type, 165 .type = &nft_queue_type,
@@ -106,9 +169,35 @@ static const struct nft_expr_ops nft_queue_ops = {
106 .dump = nft_queue_dump, 169 .dump = nft_queue_dump,
107}; 170};
108 171
172static const struct nft_expr_ops nft_queue_sreg_ops = {
173 .type = &nft_queue_type,
174 .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)),
175 .eval = nft_queue_sreg_eval,
176 .init = nft_queue_sreg_init,
177 .dump = nft_queue_sreg_dump,
178};
179
180static const struct nft_expr_ops *
181nft_queue_select_ops(const struct nft_ctx *ctx,
182 const struct nlattr * const tb[])
183{
184 if (tb[NFTA_QUEUE_NUM] && tb[NFTA_QUEUE_SREG_QNUM])
185 return ERR_PTR(-EINVAL);
186
187 init_hashrandom(&jhash_initval);
188
189 if (tb[NFTA_QUEUE_NUM])
190 return &nft_queue_ops;
191
192 if (tb[NFTA_QUEUE_SREG_QNUM])
193 return &nft_queue_sreg_ops;
194
195 return ERR_PTR(-EINVAL);
196}
197
109static struct nft_expr_type nft_queue_type __read_mostly = { 198static struct nft_expr_type nft_queue_type __read_mostly = {
110 .name = "queue", 199 .name = "queue",
111 .ops = &nft_queue_ops, 200 .select_ops = &nft_queue_select_ops,
112 .policy = nft_queue_policy, 201 .policy = nft_queue_policy,
113 .maxattr = NFTA_QUEUE_MAX, 202 .maxattr = NFTA_QUEUE_MAX,
114 .owner = THIS_MODULE, 203 .owner = THIS_MODULE,
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
new file mode 100644
index 000000000000..c00104c07095
--- /dev/null
+++ b/net/netfilter/nft_quota.c
@@ -0,0 +1,121 @@
1/*
2 * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/atomic.h>
13#include <linux/netlink.h>
14#include <linux/netfilter.h>
15#include <linux/netfilter/nf_tables.h>
16#include <net/netfilter/nf_tables.h>
17
18struct nft_quota {
19 u64 quota;
20 bool invert;
21 atomic64_t remain;
22};
23
24static inline bool nft_overquota(struct nft_quota *priv,
25 const struct nft_pktinfo *pkt)
26{
27 return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0;
28}
29
30static void nft_quota_eval(const struct nft_expr *expr,
31 struct nft_regs *regs,
32 const struct nft_pktinfo *pkt)
33{
34 struct nft_quota *priv = nft_expr_priv(expr);
35
36 if (nft_overquota(priv, pkt) ^ priv->invert)
37 regs->verdict.code = NFT_BREAK;
38}
39
40static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
41 [NFTA_QUOTA_BYTES] = { .type = NLA_U64 },
42 [NFTA_QUOTA_FLAGS] = { .type = NLA_U32 },
43};
44
45static int nft_quota_init(const struct nft_ctx *ctx,
46 const struct nft_expr *expr,
47 const struct nlattr * const tb[])
48{
49 struct nft_quota *priv = nft_expr_priv(expr);
50 u32 flags = 0;
51 u64 quota;
52
53 if (!tb[NFTA_QUOTA_BYTES])
54 return -EINVAL;
55
56 quota = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_BYTES]));
57 if (quota > S64_MAX)
58 return -EOVERFLOW;
59
60 if (tb[NFTA_QUOTA_FLAGS]) {
61 flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS]));
62 if (flags & ~NFT_QUOTA_F_INV)
63 return -EINVAL;
64 }
65
66 priv->quota = quota;
67 priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false;
68 atomic64_set(&priv->remain, quota);
69
70 return 0;
71}
72
73static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
74{
75 const struct nft_quota *priv = nft_expr_priv(expr);
76 u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0;
77
78 if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
79 NFTA_QUOTA_PAD) ||
80 nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags)))
81 goto nla_put_failure;
82 return 0;
83
84nla_put_failure:
85 return -1;
86}
87
88static struct nft_expr_type nft_quota_type;
89static const struct nft_expr_ops nft_quota_ops = {
90 .type = &nft_quota_type,
91 .size = NFT_EXPR_SIZE(sizeof(struct nft_quota)),
92 .eval = nft_quota_eval,
93 .init = nft_quota_init,
94 .dump = nft_quota_dump,
95};
96
97static struct nft_expr_type nft_quota_type __read_mostly = {
98 .name = "quota",
99 .ops = &nft_quota_ops,
100 .policy = nft_quota_policy,
101 .maxattr = NFTA_QUOTA_MAX,
102 .flags = NFT_EXPR_STATEFUL,
103 .owner = THIS_MODULE,
104};
105
106static int __init nft_quota_module_init(void)
107{
108 return nft_register_expr(&nft_quota_type);
109}
110
111static void __exit nft_quota_module_exit(void)
112{
113 nft_unregister_expr(&nft_quota_type);
114}
115
116module_init(nft_quota_module_init);
117module_exit(nft_quota_module_exit);
118
119MODULE_LICENSE("GPL");
120MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
121MODULE_ALIAS_NFT_EXPR("quota");
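
The quota expression above charges each packet's length against an atomic remainder and treats the quota as exceeded once that remainder goes negative; NFT_QUOTA_F_INV inverts which case breaks the rule. A userspace sketch of the same accounting, with C11 atomics standing in for atomic64_t:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic int64_t remain;

static bool overquota(uint32_t pkt_len)
{
	/* atomic64_sub_return() equivalent: subtract, then test the new value */
	return atomic_fetch_sub(&remain, (int64_t)pkt_len) - (int64_t)pkt_len < 0;
}

int main(void)
{
	bool invert = false;

	atomic_store(&remain, 1500);			/* quota of 1500 bytes */
	for (int i = 1; i <= 3; i++) {
		bool brk = overquota(1000) ^ invert;	/* NFT_BREAK when true */
		printf("packet %d: %s\n", i, brk ? "break" : "continue");
	}
	return 0;
}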
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
new file mode 100644
index 000000000000..c6d5358482d1
--- /dev/null
+++ b/net/netfilter/nft_range.c
@@ -0,0 +1,138 @@
1/*
2 * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/netlink.h>
13#include <linux/netfilter.h>
14#include <linux/netfilter/nf_tables.h>
15#include <net/netfilter/nf_tables_core.h>
16#include <net/netfilter/nf_tables.h>
17
18struct nft_range_expr {
19 struct nft_data data_from;
20 struct nft_data data_to;
21 enum nft_registers sreg:8;
22 u8 len;
23 enum nft_range_ops op:8;
24};
25
26static void nft_range_eval(const struct nft_expr *expr,
27 struct nft_regs *regs,
28 const struct nft_pktinfo *pkt)
29{
30 const struct nft_range_expr *priv = nft_expr_priv(expr);
31 bool mismatch;
32 int d1, d2;
33
34 d1 = memcmp(&regs->data[priv->sreg], &priv->data_from, priv->len);
35 d2 = memcmp(&regs->data[priv->sreg], &priv->data_to, priv->len);
36 switch (priv->op) {
37 case NFT_RANGE_EQ:
38 mismatch = (d1 < 0 || d2 > 0);
39 break;
40 case NFT_RANGE_NEQ:
41 mismatch = (d1 >= 0 && d2 <= 0);
42 break;
43 }
44
45 if (mismatch)
46 regs->verdict.code = NFT_BREAK;
47}
48
49static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = {
50 [NFTA_RANGE_SREG] = { .type = NLA_U32 },
51 [NFTA_RANGE_OP] = { .type = NLA_U32 },
52 [NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED },
53 [NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED },
54};
55
56static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
57 const struct nlattr * const tb[])
58{
59 struct nft_range_expr *priv = nft_expr_priv(expr);
60 struct nft_data_desc desc_from, desc_to;
61 int err;
62
63 err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
64 &desc_from, tb[NFTA_RANGE_FROM_DATA]);
65 if (err < 0)
66 return err;
67
68 err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to),
69 &desc_to, tb[NFTA_RANGE_TO_DATA]);
70 if (err < 0)
71 goto err1;
72
73 if (desc_from.len != desc_to.len) {
74 err = -EINVAL;
75 goto err2;
76 }
77
78 priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]);
79 err = nft_validate_register_load(priv->sreg, desc_from.len);
80 if (err < 0)
81 goto err2;
82
83 priv->op = ntohl(nla_get_be32(tb[NFTA_RANGE_OP]));
84 priv->len = desc_from.len;
85 return 0;
86err2:
87 nft_data_uninit(&priv->data_to, desc_to.type);
88err1:
89 nft_data_uninit(&priv->data_from, desc_from.type);
90 return err;
91}
92
93static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr)
94{
95 const struct nft_range_expr *priv = nft_expr_priv(expr);
96
97 if (nft_dump_register(skb, NFTA_RANGE_SREG, priv->sreg))
98 goto nla_put_failure;
99 if (nla_put_be32(skb, NFTA_RANGE_OP, htonl(priv->op)))
100 goto nla_put_failure;
101
102 if (nft_data_dump(skb, NFTA_RANGE_FROM_DATA, &priv->data_from,
103 NFT_DATA_VALUE, priv->len) < 0 ||
104 nft_data_dump(skb, NFTA_RANGE_TO_DATA, &priv->data_to,
105 NFT_DATA_VALUE, priv->len) < 0)
106 goto nla_put_failure;
107 return 0;
108
109nla_put_failure:
110 return -1;
111}
112
113static struct nft_expr_type nft_range_type;
114static const struct nft_expr_ops nft_range_ops = {
115 .type = &nft_range_type,
116 .size = NFT_EXPR_SIZE(sizeof(struct nft_range_expr)),
117 .eval = nft_range_eval,
118 .init = nft_range_init,
119 .dump = nft_range_dump,
120};
121
122static struct nft_expr_type nft_range_type __read_mostly = {
123 .name = "range",
124 .ops = &nft_range_ops,
125 .policy = nft_range_policy,
126 .maxattr = NFTA_RANGE_MAX,
127 .owner = THIS_MODULE,
128};
129
130int __init nft_range_module_init(void)
131{
132 return nft_register_expr(&nft_range_type);
133}
134
135void nft_range_module_exit(void)
136{
137 nft_unregister_expr(&nft_range_type);
138}
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
new file mode 100644
index 000000000000..3794cb2fc788
--- /dev/null
+++ b/net/netfilter/nft_set_hash.c
@@ -0,0 +1,404 @@
1/*
2 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * Development of this code funded by Astaro AG (http://www.astaro.com/)
9 */
10
11#include <linux/kernel.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/list.h>
15#include <linux/log2.h>
16#include <linux/jhash.h>
17#include <linux/netlink.h>
18#include <linux/workqueue.h>
19#include <linux/rhashtable.h>
20#include <linux/netfilter.h>
21#include <linux/netfilter/nf_tables.h>
22#include <net/netfilter/nf_tables.h>
23
24/* We target a hash table size of 4, element hint is 75% of final size */
25#define NFT_HASH_ELEMENT_HINT 3
26
27struct nft_hash {
28 struct rhashtable ht;
29 struct delayed_work gc_work;
30};
31
32struct nft_hash_elem {
33 struct rhash_head node;
34 struct nft_set_ext ext;
35};
36
37struct nft_hash_cmp_arg {
38 const struct nft_set *set;
39 const u32 *key;
40 u8 genmask;
41};
42
43static const struct rhashtable_params nft_hash_params;
44
45static inline u32 nft_hash_key(const void *data, u32 len, u32 seed)
46{
47 const struct nft_hash_cmp_arg *arg = data;
48
49 return jhash(arg->key, len, seed);
50}
51
52static inline u32 nft_hash_obj(const void *data, u32 len, u32 seed)
53{
54 const struct nft_hash_elem *he = data;
55
56 return jhash(nft_set_ext_key(&he->ext), len, seed);
57}
58
59static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg,
60 const void *ptr)
61{
62 const struct nft_hash_cmp_arg *x = arg->key;
63 const struct nft_hash_elem *he = ptr;
64
65 if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
66 return 1;
67 if (nft_set_elem_expired(&he->ext))
68 return 1;
69 if (!nft_set_elem_active(&he->ext, x->genmask))
70 return 1;
71 return 0;
72}
73
74static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
75 const u32 *key, const struct nft_set_ext **ext)
76{
77 struct nft_hash *priv = nft_set_priv(set);
78 const struct nft_hash_elem *he;
79 struct nft_hash_cmp_arg arg = {
80 .genmask = nft_genmask_cur(net),
81 .set = set,
82 .key = key,
83 };
84
85 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
86 if (he != NULL)
87 *ext = &he->ext;
88
89 return !!he;
90}
91
92static bool nft_hash_update(struct nft_set *set, const u32 *key,
93 void *(*new)(struct nft_set *,
94 const struct nft_expr *,
95 struct nft_regs *regs),
96 const struct nft_expr *expr,
97 struct nft_regs *regs,
98 const struct nft_set_ext **ext)
99{
100 struct nft_hash *priv = nft_set_priv(set);
101 struct nft_hash_elem *he;
102 struct nft_hash_cmp_arg arg = {
103 .genmask = NFT_GENMASK_ANY,
104 .set = set,
105 .key = key,
106 };
107
108 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
109 if (he != NULL)
110 goto out;
111
112 he = new(set, expr, regs);
113 if (he == NULL)
114 goto err1;
115 if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node,
116 nft_hash_params))
117 goto err2;
118out:
119 *ext = &he->ext;
120 return true;
121
122err2:
123 nft_set_elem_destroy(set, he);
124err1:
125 return false;
126}
127
128static int nft_hash_insert(const struct net *net, const struct nft_set *set,
129 const struct nft_set_elem *elem,
130 struct nft_set_ext **ext)
131{
132 struct nft_hash *priv = nft_set_priv(set);
133 struct nft_hash_elem *he = elem->priv;
134 struct nft_hash_cmp_arg arg = {
135 .genmask = nft_genmask_next(net),
136 .set = set,
137 .key = elem->key.val.data,
138 };
139 struct nft_hash_elem *prev;
140
141 prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
142 nft_hash_params);
143 if (IS_ERR(prev))
144 return PTR_ERR(prev);
145 if (prev) {
146 *ext = &prev->ext;
147 return -EEXIST;
148 }
149 return 0;
150}
151
152static void nft_hash_activate(const struct net *net, const struct nft_set *set,
153 const struct nft_set_elem *elem)
154{
155 struct nft_hash_elem *he = elem->priv;
156
157 nft_set_elem_change_active(net, set, &he->ext);
158 nft_set_elem_clear_busy(&he->ext);
159}
160
161static void *nft_hash_deactivate(const struct net *net,
162 const struct nft_set *set,
163 const struct nft_set_elem *elem)
164{
165 struct nft_hash *priv = nft_set_priv(set);
166 struct nft_hash_elem *he;
167 struct nft_hash_cmp_arg arg = {
168 .genmask = nft_genmask_next(net),
169 .set = set,
170 .key = elem->key.val.data,
171 };
172
173 rcu_read_lock();
174 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
175 if (he != NULL) {
176 if (!nft_set_elem_mark_busy(&he->ext) ||
177 !nft_is_active(net, &he->ext))
178 nft_set_elem_change_active(net, set, &he->ext);
179 else
180 he = NULL;
181 }
182 rcu_read_unlock();
183
184 return he;
185}
186
187static void nft_hash_remove(const struct nft_set *set,
188 const struct nft_set_elem *elem)
189{
190 struct nft_hash *priv = nft_set_priv(set);
191 struct nft_hash_elem *he = elem->priv;
192
193 rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
194}
195
196static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
197 struct nft_set_iter *iter)
198{
199 struct nft_hash *priv = nft_set_priv(set);
200 struct nft_hash_elem *he;
201 struct rhashtable_iter hti;
202 struct nft_set_elem elem;
203 int err;
204
205 err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
206 iter->err = err;
207 if (err)
208 return;
209
210 err = rhashtable_walk_start(&hti);
211 if (err && err != -EAGAIN) {
212 iter->err = err;
213 goto out;
214 }
215
216 while ((he = rhashtable_walk_next(&hti))) {
217 if (IS_ERR(he)) {
218 err = PTR_ERR(he);
219 if (err != -EAGAIN) {
220 iter->err = err;
221 goto out;
222 }
223
224 continue;
225 }
226
227 if (iter->count < iter->skip)
228 goto cont;
229 if (nft_set_elem_expired(&he->ext))
230 goto cont;
231 if (!nft_set_elem_active(&he->ext, iter->genmask))
232 goto cont;
233
234 elem.priv = he;
235
236 iter->err = iter->fn(ctx, set, iter, &elem);
237 if (iter->err < 0)
238 goto out;
239
240cont:
241 iter->count++;
242 }
243
244out:
245 rhashtable_walk_stop(&hti);
246 rhashtable_walk_exit(&hti);
247}
248
249static void nft_hash_gc(struct work_struct *work)
250{
251 struct nft_set *set;
252 struct nft_hash_elem *he;
253 struct nft_hash *priv;
254 struct nft_set_gc_batch *gcb = NULL;
255 struct rhashtable_iter hti;
256 int err;
257
258 priv = container_of(work, struct nft_hash, gc_work.work);
259 set = nft_set_container_of(priv);
260
261 err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
262 if (err)
263 goto schedule;
264
265 err = rhashtable_walk_start(&hti);
266 if (err && err != -EAGAIN)
267 goto out;
268
269 while ((he = rhashtable_walk_next(&hti))) {
270 if (IS_ERR(he)) {
271 if (PTR_ERR(he) != -EAGAIN)
272 goto out;
273 continue;
274 }
275
276 if (!nft_set_elem_expired(&he->ext))
277 continue;
278 if (nft_set_elem_mark_busy(&he->ext))
279 continue;
280
281 gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
282 if (gcb == NULL)
283 goto out;
284 rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
285 atomic_dec(&set->nelems);
286 nft_set_gc_batch_add(gcb, he);
287 }
288out:
289 rhashtable_walk_stop(&hti);
290 rhashtable_walk_exit(&hti);
291
292 nft_set_gc_batch_complete(gcb);
293schedule:
294 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
295 nft_set_gc_interval(set));
296}
297
298static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
299{
300 return sizeof(struct nft_hash);
301}
302
303static const struct rhashtable_params nft_hash_params = {
304 .head_offset = offsetof(struct nft_hash_elem, node),
305 .hashfn = nft_hash_key,
306 .obj_hashfn = nft_hash_obj,
307 .obj_cmpfn = nft_hash_cmp,
308 .automatic_shrinking = true,
309};
310
311static int nft_hash_init(const struct nft_set *set,
312 const struct nft_set_desc *desc,
313 const struct nlattr * const tb[])
314{
315 struct nft_hash *priv = nft_set_priv(set);
316 struct rhashtable_params params = nft_hash_params;
317 int err;
318
319 params.nelem_hint = desc->size ?: NFT_HASH_ELEMENT_HINT;
320 params.key_len = set->klen;
321
322 err = rhashtable_init(&priv->ht, &params);
323 if (err < 0)
324 return err;
325
326 INIT_DEFERRABLE_WORK(&priv->gc_work, nft_hash_gc);
327 if (set->flags & NFT_SET_TIMEOUT)
328 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
329 nft_set_gc_interval(set));
330 return 0;
331}
332
333static void nft_hash_elem_destroy(void *ptr, void *arg)
334{
335 nft_set_elem_destroy((const struct nft_set *)arg, ptr);
336}
337
338static void nft_hash_destroy(const struct nft_set *set)
339{
340 struct nft_hash *priv = nft_set_priv(set);
341
342 cancel_delayed_work_sync(&priv->gc_work);
343 rhashtable_free_and_destroy(&priv->ht, nft_hash_elem_destroy,
344 (void *)set);
345}
346
347static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
348 struct nft_set_estimate *est)
349{
350 unsigned int esize;
351
352 esize = sizeof(struct nft_hash_elem);
353 if (desc->size) {
354 est->size = sizeof(struct nft_hash) +
355 roundup_pow_of_two(desc->size * 4 / 3) *
356 sizeof(struct nft_hash_elem *) +
357 desc->size * esize;
358 } else {
359 /* Resizing happens when the load drops below 30% or goes
360 * above 75%. The average of 52.5% load (approximated by 50%)
361 * is used for the size estimation of the hash buckets,
362 * meaning we calculate two buckets per element.
363 */
364 est->size = esize + 2 * sizeof(struct nft_hash_elem *);
365 }
366
367 est->class = NFT_SET_CLASS_O_1;
368
369 return true;
370}
371
372static struct nft_set_ops nft_hash_ops __read_mostly = {
373 .privsize = nft_hash_privsize,
374 .elemsize = offsetof(struct nft_hash_elem, ext),
375 .estimate = nft_hash_estimate,
376 .init = nft_hash_init,
377 .destroy = nft_hash_destroy,
378 .insert = nft_hash_insert,
379 .activate = nft_hash_activate,
380 .deactivate = nft_hash_deactivate,
381 .remove = nft_hash_remove,
382 .lookup = nft_hash_lookup,
383 .update = nft_hash_update,
384 .walk = nft_hash_walk,
385 .features = NFT_SET_MAP | NFT_SET_TIMEOUT,
386 .owner = THIS_MODULE,
387};
388
389static int __init nft_hash_module_init(void)
390{
391 return nft_register_set(&nft_hash_ops);
392}
393
394static void __exit nft_hash_module_exit(void)
395{
396 nft_unregister_set(&nft_hash_ops);
397}
398
399module_init(nft_hash_module_init);
400module_exit(nft_hash_module_exit);
401
402MODULE_LICENSE("GPL");
403MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
404MODULE_ALIAS_NFT_SET();
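The memory estimate in nft_hash_estimate() above is plain arithmetic, and a worked example makes the two branches easier to compare. The sketch below is a standalone userspace rendering of the sized case; the struct sizes are illustrative placeholders, not the real kernel layouts, and roundup_pow2() stands in for the kernel's roundup_pow_of_two().

#include <stdio.h>
#include <stdint.h>

/* illustrative stand-ins for the kernel structure sizes */
#define SIZEOF_NFT_HASH       32
#define SIZEOF_NFT_HASH_ELEM  48
#define SIZEOF_BUCKET_PTR      8

static uint64_t roundup_pow2(uint64_t n)
{
	uint64_t p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	uint64_t size = 1000;				/* desc->size */
	uint64_t buckets = roundup_pow2(size * 4 / 3);	/* 1333 -> 2048, the 75% load ceiling */
	uint64_t est = SIZEOF_NFT_HASH +
		       buckets * SIZEOF_BUCKET_PTR +
		       size * SIZEOF_NFT_HASH_ELEM;

	printf("%llu buckets, roughly %llu bytes\n",
	       (unsigned long long)buckets, (unsigned long long)est);
	return 0;
}

For the unsized branch the same logic collapses to two bucket pointers per element, which matches the 50% average load the comment describes.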
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_set_rbtree.c
index ffe9ae062d23..38b5bda242f8 100644
--- a/net/netfilter/nft_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -96,7 +96,8 @@ out:
96} 96}
97 97
98static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, 98static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
99 struct nft_rbtree_elem *new) 99 struct nft_rbtree_elem *new,
100 struct nft_set_ext **ext)
100{ 101{
101 struct nft_rbtree *priv = nft_set_priv(set); 102 struct nft_rbtree *priv = nft_set_priv(set);
102 u8 genmask = nft_genmask_next(net); 103 u8 genmask = nft_genmask_next(net);
@@ -124,8 +125,10 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
124 else if (!nft_rbtree_interval_end(rbe) && 125 else if (!nft_rbtree_interval_end(rbe) &&
125 nft_rbtree_interval_end(new)) 126 nft_rbtree_interval_end(new))
126 p = &parent->rb_right; 127 p = &parent->rb_right;
127 else 128 else {
129 *ext = &rbe->ext;
128 return -EEXIST; 130 return -EEXIST;
131 }
129 } 132 }
130 } 133 }
131 } 134 }
@@ -135,13 +138,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
135} 138}
136 139
137static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, 140static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
138 const struct nft_set_elem *elem) 141 const struct nft_set_elem *elem,
142 struct nft_set_ext **ext)
139{ 143{
140 struct nft_rbtree_elem *rbe = elem->priv; 144 struct nft_rbtree_elem *rbe = elem->priv;
141 int err; 145 int err;
142 146
143 spin_lock_bh(&nft_rbtree_lock); 147 spin_lock_bh(&nft_rbtree_lock);
144 err = __nft_rbtree_insert(net, set, rbe); 148 err = __nft_rbtree_insert(net, set, rbe, ext);
145 spin_unlock_bh(&nft_rbtree_lock); 149 spin_unlock_bh(&nft_rbtree_lock);
146 150
147 return err; 151 return err;
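The nft_set_rbtree change above mirrors the new nft_hash_insert() earlier in this diff: on -EEXIST the insert path now reports which element clashed through the ext pointer instead of only reporting that something clashed. A hedged sketch of how a caller of the set ops might consume that result (an illustration of the calling convention, not the exact nf_tables core code):

	struct nft_set_ext *ext;
	int err;

	err = set->ops->insert(net, set, &elem, &ext);
	if (err == -EEXIST) {
		/* ext points at the extensions of the element already in the
		 * set, so the caller can compare its data or flags before
		 * deciding whether the clash is a real error or a no-op.
		 */
	}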
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 515131f9e021..dbd6c4a12b97 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -24,7 +24,6 @@ static DEFINE_MUTEX(xt_rateest_mutex);
24#define RATEEST_HSIZE 16 24#define RATEEST_HSIZE 16
25static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly; 25static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly;
26static unsigned int jhash_rnd __read_mostly; 26static unsigned int jhash_rnd __read_mostly;
27static bool rnd_inited __read_mostly;
28 27
29static unsigned int xt_rateest_hash(const char *name) 28static unsigned int xt_rateest_hash(const char *name)
30{ 29{
@@ -99,10 +98,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
99 } cfg; 98 } cfg;
100 int ret; 99 int ret;
101 100
102 if (unlikely(!rnd_inited)) { 101 net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
103 get_random_bytes(&jhash_rnd, sizeof(jhash_rnd));
104 rnd_inited = true;
105 }
106 102
107 est = xt_rateest_lookup(info->name); 103 est = xt_rateest_lookup(info->name);
108 if (est) { 104 if (est) {
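The xt_RATEEST hunk above, and the xt_connlimit and xt_recent hunks further down, all make the same substitution: the hand-rolled "seed the hash key once" flag is replaced by net_get_random_once(), which provides the once-only semantics without a separate bool and without racing concurrent callers. A minimal kernel-context sketch of the resulting pattern, with my_jhash_rnd and my_checkentry() as hypothetical names:

	static u32 my_jhash_rnd __read_mostly;

	static int my_checkentry(void)
	{
		/* replaces: if (!rnd_inited) { get_random_bytes(...); rnd_inited = true; } */
		net_get_random_once(&my_jhash_rnd, sizeof(my_jhash_rnd));
		return 0;
	}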
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index e118397254af..872db2d0e2a9 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -110,18 +110,14 @@ tcpmss_mangle_packet(struct sk_buff *skb,
110 if (info->mss == XT_TCPMSS_CLAMP_PMTU) { 110 if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
111 struct net *net = par->net; 111 struct net *net = par->net;
112 unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); 112 unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
113 unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu);
113 114
114 if (dst_mtu(skb_dst(skb)) <= minlen) { 115 if (min_mtu <= minlen) {
115 net_err_ratelimited("unknown or invalid path-MTU (%u)\n", 116 net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
116 dst_mtu(skb_dst(skb))); 117 min_mtu);
117 return -1; 118 return -1;
118 } 119 }
119 if (in_mtu <= minlen) { 120 newmss = min_mtu - minlen;
120 net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
121 in_mtu);
122 return -1;
123 }
124 newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen;
125 } else 121 } else
126 newmss = info->mss; 122 newmss = info->mss;
127 123
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 6e57a3966dc5..0471db4032c5 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -89,6 +89,8 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
89 return -EINVAL; 89 return -EINVAL;
90 90
91 if (info->oif[0]) { 91 if (info->oif[0]) {
92 int ret;
93
92 if (info->oif[sizeof(info->oif)-1] != '\0') 94 if (info->oif[sizeof(info->oif)-1] != '\0')
93 return -EINVAL; 95 return -EINVAL;
94 96
@@ -101,7 +103,11 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
101 priv->notifier.notifier_call = tee_netdev_event; 103 priv->notifier.notifier_call = tee_netdev_event;
102 info->priv = priv; 104 info->priv = priv;
103 105
104 register_netdevice_notifier(&priv->notifier); 106 ret = register_netdevice_notifier(&priv->notifier);
107 if (ret) {
108 kfree(priv);
109 return ret;
110 }
105 } else 111 } else
106 info->priv = NULL; 112 info->priv = NULL;
107 113
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 99bbc829868d..b6dc322593a3 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -366,14 +366,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
366 unsigned int i; 366 unsigned int i;
367 int ret; 367 int ret;
368 368
369 if (unlikely(!connlimit_rnd)) { 369 net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd));
370 u_int32_t rand;
371 370
372 do {
373 get_random_bytes(&rand, sizeof(rand));
374 } while (!rand);
375 cmpxchg(&connlimit_rnd, 0, rand);
376 }
377 ret = nf_ct_l3proto_try_module_get(par->family); 371 ret = nf_ct_l3proto_try_module_get(par->family);
378 if (ret < 0) { 372 if (ret < 0) {
379 pr_info("cannot load conntrack support for " 373 pr_info("cannot load conntrack support for "
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 188404b9b002..a3b8f697cfc5 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -233,10 +233,8 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
233 return false; 233 return false;
234 234
235 if (info->match_flags & XT_CONNTRACK_EXPIRES) { 235 if (info->match_flags & XT_CONNTRACK_EXPIRES) {
236 unsigned long expires = 0; 236 unsigned long expires = nf_ct_expires(ct) / HZ;
237 237
238 if (timer_pending(&ct->timeout))
239 expires = (ct->timeout.expires - jiffies) / HZ;
240 if ((expires >= info->expires_min && 238 if ((expires >= info->expires_min &&
241 expires <= info->expires_max) ^ 239 expires <= info->expires_max) ^
242 !(info->invert_flags & XT_CONNTRACK_EXPIRES)) 240 !(info->invert_flags & XT_CONNTRACK_EXPIRES))
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 178696852bde..2fab0c65aa94 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -56,6 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
56} 56}
57 57
58/* need to declare this at the top */ 58/* need to declare this at the top */
59static const struct file_operations dl_file_ops_v1;
59static const struct file_operations dl_file_ops; 60static const struct file_operations dl_file_ops;
60 61
61/* hash table crap */ 62/* hash table crap */
@@ -86,8 +87,8 @@ struct dsthash_ent {
86 unsigned long expires; /* precalculated expiry time */ 87 unsigned long expires; /* precalculated expiry time */
87 struct { 88 struct {
88 unsigned long prev; /* last modification */ 89 unsigned long prev; /* last modification */
89 u_int32_t credit; 90 u_int64_t credit;
90 u_int32_t credit_cap, cost; 91 u_int64_t credit_cap, cost;
91 } rateinfo; 92 } rateinfo;
92 struct rcu_head rcu; 93 struct rcu_head rcu;
93}; 94};
@@ -98,7 +99,7 @@ struct xt_hashlimit_htable {
98 u_int8_t family; 99 u_int8_t family;
99 bool rnd_initialized; 100 bool rnd_initialized;
100 101
101 struct hashlimit_cfg1 cfg; /* config */ 102 struct hashlimit_cfg2 cfg; /* config */
102 103
103 /* used internally */ 104 /* used internally */
104 spinlock_t lock; /* lock for list_head */ 105 spinlock_t lock; /* lock for list_head */
@@ -114,6 +115,30 @@ struct xt_hashlimit_htable {
114 struct hlist_head hash[0]; /* hashtable itself */ 115 struct hlist_head hash[0]; /* hashtable itself */
115}; 116};
116 117
118static int
119cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
120{
121 if (revision == 1) {
122 struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
123
124 to->mode = cfg->mode;
125 to->avg = cfg->avg;
126 to->burst = cfg->burst;
127 to->size = cfg->size;
128 to->max = cfg->max;
129 to->gc_interval = cfg->gc_interval;
130 to->expire = cfg->expire;
131 to->srcmask = cfg->srcmask;
132 to->dstmask = cfg->dstmask;
133 } else if (revision == 2) {
134 memcpy(to, from, sizeof(struct hashlimit_cfg2));
135 } else {
136 return -EINVAL;
137 }
138
139 return 0;
140}
141
117static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */ 142static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */
118static struct kmem_cache *hashlimit_cachep __read_mostly; 143static struct kmem_cache *hashlimit_cachep __read_mostly;
119 144
@@ -215,16 +240,18 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
215} 240}
216static void htable_gc(struct work_struct *work); 241static void htable_gc(struct work_struct *work);
217 242
218static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, 243static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg,
219 u_int8_t family) 244 const char *name, u_int8_t family,
245 struct xt_hashlimit_htable **out_hinfo,
246 int revision)
220{ 247{
221 struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); 248 struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
222 struct xt_hashlimit_htable *hinfo; 249 struct xt_hashlimit_htable *hinfo;
223 unsigned int size; 250 unsigned int size, i;
224 unsigned int i; 251 int ret;
225 252
226 if (minfo->cfg.size) { 253 if (cfg->size) {
227 size = minfo->cfg.size; 254 size = cfg->size;
228 } else { 255 } else {
229 size = (totalram_pages << PAGE_SHIFT) / 16384 / 256 size = (totalram_pages << PAGE_SHIFT) / 16384 /
230 sizeof(struct list_head); 257 sizeof(struct list_head);
@@ -238,10 +265,14 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
238 sizeof(struct list_head) * size); 265 sizeof(struct list_head) * size);
239 if (hinfo == NULL) 266 if (hinfo == NULL)
240 return -ENOMEM; 267 return -ENOMEM;
241 minfo->hinfo = hinfo; 268 *out_hinfo = hinfo;
242 269
243 /* copy match config into hashtable config */ 270 /* copy match config into hashtable config */
244 memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg)); 271 ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2);
272
273 if (ret)
274 return ret;
275
245 hinfo->cfg.size = size; 276 hinfo->cfg.size = size;
246 if (hinfo->cfg.max == 0) 277 if (hinfo->cfg.max == 0)
247 hinfo->cfg.max = 8 * hinfo->cfg.size; 278 hinfo->cfg.max = 8 * hinfo->cfg.size;
@@ -255,17 +286,18 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
255 hinfo->count = 0; 286 hinfo->count = 0;
256 hinfo->family = family; 287 hinfo->family = family;
257 hinfo->rnd_initialized = false; 288 hinfo->rnd_initialized = false;
258 hinfo->name = kstrdup(minfo->name, GFP_KERNEL); 289 hinfo->name = kstrdup(name, GFP_KERNEL);
259 if (!hinfo->name) { 290 if (!hinfo->name) {
260 vfree(hinfo); 291 vfree(hinfo);
261 return -ENOMEM; 292 return -ENOMEM;
262 } 293 }
263 spin_lock_init(&hinfo->lock); 294 spin_lock_init(&hinfo->lock);
264 295
265 hinfo->pde = proc_create_data(minfo->name, 0, 296 hinfo->pde = proc_create_data(name, 0,
266 (family == NFPROTO_IPV4) ? 297 (family == NFPROTO_IPV4) ?
267 hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, 298 hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
268 &dl_file_ops, hinfo); 299 (revision == 1) ? &dl_file_ops_v1 : &dl_file_ops,
300 hinfo);
269 if (hinfo->pde == NULL) { 301 if (hinfo->pde == NULL) {
270 kfree(hinfo->name); 302 kfree(hinfo->name);
271 vfree(hinfo); 303 vfree(hinfo);
@@ -398,7 +430,8 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
398 (slowest userspace tool allows), which means 430 (slowest userspace tool allows), which means
399 CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie. 431 CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
400*/ 432*/
401#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) 433#define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24))
434#define MAX_CPJ (0xFFFFFFFFFFFFFFFF / (HZ*60*60*24))
402 435
403/* Repeated shift and or gives us all 1s, final shift and add 1 gives 436/* Repeated shift and or gives us all 1s, final shift and add 1 gives
404 * us the power of 2 below the theoretical max, so GCC simply does a 437 * us the power of 2 below the theoretical max, so GCC simply does a
@@ -408,9 +441,12 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
408#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) 441#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
409#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) 442#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
410#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) 443#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
444#define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32))
411#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) 445#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
446#define POW2_BELOW64(x) ((_POW2_BELOW64(x)>>1) + 1)
412 447
413#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) 448#define CREDITS_PER_JIFFY POW2_BELOW64(MAX_CPJ)
449#define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1)
414 450
415/* in byte mode, the lowest possible rate is one packet/second. 451/* in byte mode, the lowest possible rate is one packet/second.
416 * credit_cap is used as a counter that tells us how many times we can 452 * credit_cap is used as a counter that tells us how many times we can
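The POW2_BELOW macros above are a branch-free way of computing the largest power of two that does not exceed a compile-time constant, which is what lets CREDITS_PER_JIFFY reduce to a shift. A small runnable illustration of the 32-bit variant; the two lowest-level helpers are written out here in the same shift-and-or style so the example compiles on its own:

#include <stdio.h>

#define _POW2_BELOW2(x)  ((x)|((x)>>1))
#define _POW2_BELOW4(x)  (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
#define _POW2_BELOW8(x)  (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
#define POW2_BELOW32(x)  ((_POW2_BELOW32(x)>>1) + 1)

int main(void)
{
	/* 1333 -> 2047 (top set bit and everything below it) -> (2047>>1)+1 = 1024,
	 * the largest power of two not exceeding 1333
	 */
	printf("%u\n", POW2_BELOW32(1333u));
	return 0;
}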
@@ -425,14 +461,25 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
425} 461}
426 462
427/* Precision saver. */ 463/* Precision saver. */
428static u32 user2credits(u32 user) 464static u64 user2credits(u64 user, int revision)
429{ 465{
430 /* If multiplying would overflow... */ 466 if (revision == 1) {
431 if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) 467 /* If multiplying would overflow... */
432 /* Divide first. */ 468 if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
433 return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; 469 /* Divide first. */
470 return div64_u64(user, XT_HASHLIMIT_SCALE)
471 * HZ * CREDITS_PER_JIFFY_v1;
472
473 return div64_u64(user * HZ * CREDITS_PER_JIFFY_v1,
474 XT_HASHLIMIT_SCALE);
475 } else {
476 if (user > 0xFFFFFFFFFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
477 return div64_u64(user, XT_HASHLIMIT_SCALE_v2)
478 * HZ * CREDITS_PER_JIFFY;
434 479
435 return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE; 480 return div64_u64(user * HZ * CREDITS_PER_JIFFY,
481 XT_HASHLIMIT_SCALE_v2);
482 }
436} 483}
437 484
438static u32 user2credits_byte(u32 user) 485static u32 user2credits_byte(u32 user)
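user2credits() widens to u64 for the new revision but keeps the original trick: if the multiplication would overflow, divide by the scale first and accept a small precision loss; div64_u64() is used because a 64-by-64 division cannot be written as a plain '/' on 32-bit kernels. A standalone illustration of the overflow the branch guards against; the constants are invented for the demonstration and merely play the roles of the real scale and per-jiffy credit values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t scale = 1000000;	 /* stands in for XT_HASHLIMIT_SCALE_v2 */
	uint64_t per_jiffy = 1ULL << 47; /* stands in for HZ * CREDITS_PER_JIFFY */
	uint64_t user = 1000000000ULL;	 /* a very large configured rate */

	/* multiply-then-divide needs ~77 bits here and silently wraps ... */
	uint64_t wrong = user * per_jiffy / scale;
	/* ... dividing first stays in range, at the cost of the remainder */
	uint64_t right = user / scale * per_jiffy;

	printf("wrong=%llu right=%llu\n",
	       (unsigned long long)wrong, (unsigned long long)right);
	return 0;
}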
@@ -442,10 +489,11 @@ static u32 user2credits_byte(u32 user)
442 return (u32) (us >> 32); 489 return (u32) (us >> 32);
443} 490}
444 491
445static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) 492static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
493 u32 mode, int revision)
446{ 494{
447 unsigned long delta = now - dh->rateinfo.prev; 495 unsigned long delta = now - dh->rateinfo.prev;
448 u32 cap; 496 u64 cap, cpj;
449 497
450 if (delta == 0) 498 if (delta == 0)
451 return; 499 return;
@@ -453,7 +501,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
453 dh->rateinfo.prev = now; 501 dh->rateinfo.prev = now;
454 502
455 if (mode & XT_HASHLIMIT_BYTES) { 503 if (mode & XT_HASHLIMIT_BYTES) {
456 u32 tmp = dh->rateinfo.credit; 504 u64 tmp = dh->rateinfo.credit;
457 dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta; 505 dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta;
458 cap = CREDITS_PER_JIFFY_BYTES * HZ; 506 cap = CREDITS_PER_JIFFY_BYTES * HZ;
459 if (tmp >= dh->rateinfo.credit) {/* overflow */ 507 if (tmp >= dh->rateinfo.credit) {/* overflow */
@@ -461,7 +509,9 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
461 return; 509 return;
462 } 510 }
463 } else { 511 } else {
464 dh->rateinfo.credit += delta * CREDITS_PER_JIFFY; 512 cpj = (revision == 1) ?
513 CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY;
514 dh->rateinfo.credit += delta * cpj;
465 cap = dh->rateinfo.credit_cap; 515 cap = dh->rateinfo.credit_cap;
466 } 516 }
467 if (dh->rateinfo.credit > cap) 517 if (dh->rateinfo.credit > cap)
@@ -469,7 +519,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
469} 519}
470 520
471static void rateinfo_init(struct dsthash_ent *dh, 521static void rateinfo_init(struct dsthash_ent *dh,
472 struct xt_hashlimit_htable *hinfo) 522 struct xt_hashlimit_htable *hinfo, int revision)
473{ 523{
474 dh->rateinfo.prev = jiffies; 524 dh->rateinfo.prev = jiffies;
475 if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { 525 if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
@@ -478,8 +528,8 @@ static void rateinfo_init(struct dsthash_ent *dh,
478 dh->rateinfo.credit_cap = hinfo->cfg.burst; 528 dh->rateinfo.credit_cap = hinfo->cfg.burst;
479 } else { 529 } else {
480 dh->rateinfo.credit = user2credits(hinfo->cfg.avg * 530 dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
481 hinfo->cfg.burst); 531 hinfo->cfg.burst, revision);
482 dh->rateinfo.cost = user2credits(hinfo->cfg.avg); 532 dh->rateinfo.cost = user2credits(hinfo->cfg.avg, revision);
483 dh->rateinfo.credit_cap = dh->rateinfo.credit; 533 dh->rateinfo.credit_cap = dh->rateinfo.credit;
484 } 534 }
485} 535}
@@ -603,15 +653,15 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
603} 653}
604 654
605static bool 655static bool
606hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) 656hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par,
657 struct xt_hashlimit_htable *hinfo,
658 const struct hashlimit_cfg2 *cfg, int revision)
607{ 659{
608 const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
609 struct xt_hashlimit_htable *hinfo = info->hinfo;
610 unsigned long now = jiffies; 660 unsigned long now = jiffies;
611 struct dsthash_ent *dh; 661 struct dsthash_ent *dh;
612 struct dsthash_dst dst; 662 struct dsthash_dst dst;
613 bool race = false; 663 bool race = false;
614 u32 cost; 664 u64 cost;
615 665
616 if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) 666 if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
617 goto hotdrop; 667 goto hotdrop;
@@ -626,18 +676,18 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
626 } else if (race) { 676 } else if (race) {
627 /* Already got an entry, update expiration timeout */ 677 /* Already got an entry, update expiration timeout */
628 dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); 678 dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
629 rateinfo_recalc(dh, now, hinfo->cfg.mode); 679 rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
630 } else { 680 } else {
631 dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); 681 dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
632 rateinfo_init(dh, hinfo); 682 rateinfo_init(dh, hinfo, revision);
633 } 683 }
634 } else { 684 } else {
635 /* update expiration timeout */ 685 /* update expiration timeout */
636 dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); 686 dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
637 rateinfo_recalc(dh, now, hinfo->cfg.mode); 687 rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
638 } 688 }
639 689
640 if (info->cfg.mode & XT_HASHLIMIT_BYTES) 690 if (cfg->mode & XT_HASHLIMIT_BYTES)
641 cost = hashlimit_byte_cost(skb->len, dh); 691 cost = hashlimit_byte_cost(skb->len, dh);
642 else 692 else
643 cost = dh->rateinfo.cost; 693 cost = dh->rateinfo.cost;
@@ -647,84 +697,157 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
647 dh->rateinfo.credit -= cost; 697 dh->rateinfo.credit -= cost;
648 spin_unlock(&dh->lock); 698 spin_unlock(&dh->lock);
649 rcu_read_unlock_bh(); 699 rcu_read_unlock_bh();
650 return !(info->cfg.mode & XT_HASHLIMIT_INVERT); 700 return !(cfg->mode & XT_HASHLIMIT_INVERT);
651 } 701 }
652 702
653 spin_unlock(&dh->lock); 703 spin_unlock(&dh->lock);
654 rcu_read_unlock_bh(); 704 rcu_read_unlock_bh();
655 /* default match is underlimit - so over the limit, we need to invert */ 705 /* default match is underlimit - so over the limit, we need to invert */
656 return info->cfg.mode & XT_HASHLIMIT_INVERT; 706 return cfg->mode & XT_HASHLIMIT_INVERT;
657 707
658 hotdrop: 708 hotdrop:
659 par->hotdrop = true; 709 par->hotdrop = true;
660 return false; 710 return false;
661} 711}
662 712
663static int hashlimit_mt_check(const struct xt_mtchk_param *par) 713static bool
714hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
715{
716 const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
717 struct xt_hashlimit_htable *hinfo = info->hinfo;
718 struct hashlimit_cfg2 cfg = {};
719 int ret;
720
721 ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
722
723 if (ret)
724 return ret;
725
726 return hashlimit_mt_common(skb, par, hinfo, &cfg, 1);
727}
728
729static bool
730hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
731{
732 const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
733 struct xt_hashlimit_htable *hinfo = info->hinfo;
734
735 return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2);
736}
737
738static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
739 struct xt_hashlimit_htable **hinfo,
740 struct hashlimit_cfg2 *cfg,
741 const char *name, int revision)
664{ 742{
665 struct net *net = par->net; 743 struct net *net = par->net;
666 struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
667 int ret; 744 int ret;
668 745
669 if (info->cfg.gc_interval == 0 || info->cfg.expire == 0) 746 if (cfg->gc_interval == 0 || cfg->expire == 0)
670 return -EINVAL;
671 if (info->name[sizeof(info->name)-1] != '\0')
672 return -EINVAL; 747 return -EINVAL;
673 if (par->family == NFPROTO_IPV4) { 748 if (par->family == NFPROTO_IPV4) {
674 if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32) 749 if (cfg->srcmask > 32 || cfg->dstmask > 32)
675 return -EINVAL; 750 return -EINVAL;
676 } else { 751 } else {
677 if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128) 752 if (cfg->srcmask > 128 || cfg->dstmask > 128)
678 return -EINVAL; 753 return -EINVAL;
679 } 754 }
680 755
681 if (info->cfg.mode & ~XT_HASHLIMIT_ALL) { 756 if (cfg->mode & ~XT_HASHLIMIT_ALL) {
682 pr_info("Unknown mode mask %X, kernel too old?\n", 757 pr_info("Unknown mode mask %X, kernel too old?\n",
683 info->cfg.mode); 758 cfg->mode);
684 return -EINVAL; 759 return -EINVAL;
685 } 760 }
686 761
687 /* Check for overflow. */ 762 /* Check for overflow. */
688 if (info->cfg.mode & XT_HASHLIMIT_BYTES) { 763 if (cfg->mode & XT_HASHLIMIT_BYTES) {
689 if (user2credits_byte(info->cfg.avg) == 0) { 764 if (user2credits_byte(cfg->avg) == 0) {
690 pr_info("overflow, rate too high: %u\n", info->cfg.avg); 765 pr_info("overflow, rate too high: %llu\n", cfg->avg);
691 return -EINVAL; 766 return -EINVAL;
692 } 767 }
693 } else if (info->cfg.burst == 0 || 768 } else if (cfg->burst == 0 ||
694 user2credits(info->cfg.avg * info->cfg.burst) < 769 user2credits(cfg->avg * cfg->burst, revision) <
695 user2credits(info->cfg.avg)) { 770 user2credits(cfg->avg, revision)) {
696 pr_info("overflow, try lower: %u/%u\n", 771 pr_info("overflow, try lower: %llu/%llu\n",
697 info->cfg.avg, info->cfg.burst); 772 cfg->avg, cfg->burst);
698 return -ERANGE; 773 return -ERANGE;
699 } 774 }
700 775
701 mutex_lock(&hashlimit_mutex); 776 mutex_lock(&hashlimit_mutex);
702 info->hinfo = htable_find_get(net, info->name, par->family); 777 *hinfo = htable_find_get(net, name, par->family);
703 if (info->hinfo == NULL) { 778 if (*hinfo == NULL) {
704 ret = htable_create(net, info, par->family); 779 ret = htable_create(net, cfg, name, par->family,
780 hinfo, revision);
705 if (ret < 0) { 781 if (ret < 0) {
706 mutex_unlock(&hashlimit_mutex); 782 mutex_unlock(&hashlimit_mutex);
707 return ret; 783 return ret;
708 } 784 }
709 } 785 }
710 mutex_unlock(&hashlimit_mutex); 786 mutex_unlock(&hashlimit_mutex);
787
711 return 0; 788 return 0;
712} 789}
713 790
714static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) 791static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
792{
793 struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
794 struct hashlimit_cfg2 cfg = {};
795 int ret;
796
797 if (info->name[sizeof(info->name) - 1] != '\0')
798 return -EINVAL;
799
800 ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
801
802 if (ret)
803 return ret;
804
805 return hashlimit_mt_check_common(par, &info->hinfo,
806 &cfg, info->name, 1);
807}
808
809static int hashlimit_mt_check(const struct xt_mtchk_param *par)
810{
811 struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
812
813 if (info->name[sizeof(info->name) - 1] != '\0')
814 return -EINVAL;
815
816 return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg,
817 info->name, 2);
818}
819
820static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
715{ 821{
716 const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; 822 const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
717 823
718 htable_put(info->hinfo); 824 htable_put(info->hinfo);
719} 825}
720 826
827static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
828{
829 const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
830
831 htable_put(info->hinfo);
832}
833
721static struct xt_match hashlimit_mt_reg[] __read_mostly = { 834static struct xt_match hashlimit_mt_reg[] __read_mostly = {
722 { 835 {
723 .name = "hashlimit", 836 .name = "hashlimit",
724 .revision = 1, 837 .revision = 1,
725 .family = NFPROTO_IPV4, 838 .family = NFPROTO_IPV4,
726 .match = hashlimit_mt, 839 .match = hashlimit_mt_v1,
727 .matchsize = sizeof(struct xt_hashlimit_mtinfo1), 840 .matchsize = sizeof(struct xt_hashlimit_mtinfo1),
841 .checkentry = hashlimit_mt_check_v1,
842 .destroy = hashlimit_mt_destroy_v1,
843 .me = THIS_MODULE,
844 },
845 {
846 .name = "hashlimit",
847 .revision = 2,
848 .family = NFPROTO_IPV4,
849 .match = hashlimit_mt,
850 .matchsize = sizeof(struct xt_hashlimit_mtinfo2),
728 .checkentry = hashlimit_mt_check, 851 .checkentry = hashlimit_mt_check,
729 .destroy = hashlimit_mt_destroy, 852 .destroy = hashlimit_mt_destroy,
730 .me = THIS_MODULE, 853 .me = THIS_MODULE,
@@ -734,8 +857,18 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
734 .name = "hashlimit", 857 .name = "hashlimit",
735 .revision = 1, 858 .revision = 1,
736 .family = NFPROTO_IPV6, 859 .family = NFPROTO_IPV6,
737 .match = hashlimit_mt, 860 .match = hashlimit_mt_v1,
738 .matchsize = sizeof(struct xt_hashlimit_mtinfo1), 861 .matchsize = sizeof(struct xt_hashlimit_mtinfo1),
862 .checkentry = hashlimit_mt_check_v1,
863 .destroy = hashlimit_mt_destroy_v1,
864 .me = THIS_MODULE,
865 },
866 {
867 .name = "hashlimit",
868 .revision = 2,
869 .family = NFPROTO_IPV6,
870 .match = hashlimit_mt,
871 .matchsize = sizeof(struct xt_hashlimit_mtinfo2),
739 .checkentry = hashlimit_mt_check, 872 .checkentry = hashlimit_mt_check,
740 .destroy = hashlimit_mt_destroy, 873 .destroy = hashlimit_mt_destroy,
741 .me = THIS_MODULE, 874 .me = THIS_MODULE,
@@ -786,18 +919,12 @@ static void dl_seq_stop(struct seq_file *s, void *v)
786 spin_unlock_bh(&htable->lock); 919 spin_unlock_bh(&htable->lock);
787} 920}
788 921
789static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, 922static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
790 struct seq_file *s) 923 struct seq_file *s)
791{ 924{
792 const struct xt_hashlimit_htable *ht = s->private;
793
794 spin_lock(&ent->lock);
795 /* recalculate to show accurate numbers */
796 rateinfo_recalc(ent, jiffies, ht->cfg.mode);
797
798 switch (family) { 925 switch (family) {
799 case NFPROTO_IPV4: 926 case NFPROTO_IPV4:
800 seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n", 927 seq_printf(s, "%ld %pI4:%u->%pI4:%u %llu %llu %llu\n",
801 (long)(ent->expires - jiffies)/HZ, 928 (long)(ent->expires - jiffies)/HZ,
802 &ent->dst.ip.src, 929 &ent->dst.ip.src,
803 ntohs(ent->dst.src_port), 930 ntohs(ent->dst.src_port),
@@ -808,7 +935,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
808 break; 935 break;
809#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) 936#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
810 case NFPROTO_IPV6: 937 case NFPROTO_IPV6:
811 seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n", 938 seq_printf(s, "%ld %pI6:%u->%pI6:%u %llu %llu %llu\n",
812 (long)(ent->expires - jiffies)/HZ, 939 (long)(ent->expires - jiffies)/HZ,
813 &ent->dst.ip6.src, 940 &ent->dst.ip6.src,
814 ntohs(ent->dst.src_port), 941 ntohs(ent->dst.src_port),
@@ -821,10 +948,52 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
821 default: 948 default:
822 BUG(); 949 BUG();
823 } 950 }
951}
952
953static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
954 struct seq_file *s)
955{
956 const struct xt_hashlimit_htable *ht = s->private;
957
958 spin_lock(&ent->lock);
959 /* recalculate to show accurate numbers */
960 rateinfo_recalc(ent, jiffies, ht->cfg.mode, 1);
961
962 dl_seq_print(ent, family, s);
963
964 spin_unlock(&ent->lock);
965 return seq_has_overflowed(s);
966}
967
968static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
969 struct seq_file *s)
970{
971 const struct xt_hashlimit_htable *ht = s->private;
972
973 spin_lock(&ent->lock);
974 /* recalculate to show accurate numbers */
975 rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2);
976
977 dl_seq_print(ent, family, s);
978
824 spin_unlock(&ent->lock); 979 spin_unlock(&ent->lock);
825 return seq_has_overflowed(s); 980 return seq_has_overflowed(s);
826} 981}
827 982
983static int dl_seq_show_v1(struct seq_file *s, void *v)
984{
985 struct xt_hashlimit_htable *htable = s->private;
986 unsigned int *bucket = (unsigned int *)v;
987 struct dsthash_ent *ent;
988
989 if (!hlist_empty(&htable->hash[*bucket])) {
990 hlist_for_each_entry(ent, &htable->hash[*bucket], node)
991 if (dl_seq_real_show_v1(ent, htable->family, s))
992 return -1;
993 }
994 return 0;
995}
996
828static int dl_seq_show(struct seq_file *s, void *v) 997static int dl_seq_show(struct seq_file *s, void *v)
829{ 998{
830 struct xt_hashlimit_htable *htable = s->private; 999 struct xt_hashlimit_htable *htable = s->private;
@@ -839,6 +1008,13 @@ static int dl_seq_show(struct seq_file *s, void *v)
839 return 0; 1008 return 0;
840} 1009}
841 1010
1011static const struct seq_operations dl_seq_ops_v1 = {
1012 .start = dl_seq_start,
1013 .next = dl_seq_next,
1014 .stop = dl_seq_stop,
1015 .show = dl_seq_show_v1
1016};
1017
842static const struct seq_operations dl_seq_ops = { 1018static const struct seq_operations dl_seq_ops = {
843 .start = dl_seq_start, 1019 .start = dl_seq_start,
844 .next = dl_seq_next, 1020 .next = dl_seq_next,
@@ -846,17 +1022,37 @@ static const struct seq_operations dl_seq_ops = {
846 .show = dl_seq_show 1022 .show = dl_seq_show
847}; 1023};
848 1024
1025static int dl_proc_open_v1(struct inode *inode, struct file *file)
1026{
1027 int ret = seq_open(file, &dl_seq_ops_v1);
1028
1029 if (!ret) {
1030 struct seq_file *sf = file->private_data;
1031 sf->private = PDE_DATA(inode);
1032 }
1033 return ret;
1034}
1035
849static int dl_proc_open(struct inode *inode, struct file *file) 1036static int dl_proc_open(struct inode *inode, struct file *file)
850{ 1037{
851 int ret = seq_open(file, &dl_seq_ops); 1038 int ret = seq_open(file, &dl_seq_ops);
852 1039
853 if (!ret) { 1040 if (!ret) {
854 struct seq_file *sf = file->private_data; 1041 struct seq_file *sf = file->private_data;
1042
855 sf->private = PDE_DATA(inode); 1043 sf->private = PDE_DATA(inode);
856 } 1044 }
857 return ret; 1045 return ret;
858} 1046}
859 1047
1048static const struct file_operations dl_file_ops_v1 = {
1049 .owner = THIS_MODULE,
1050 .open = dl_proc_open_v1,
1051 .read = seq_read,
1052 .llseek = seq_lseek,
1053 .release = seq_release
1054};
1055
860static const struct file_operations dl_file_ops = { 1056static const struct file_operations dl_file_ops = {
861 .owner = THIS_MODULE, 1057 .owner = THIS_MODULE,
862 .open = dl_proc_open, 1058 .open = dl_proc_open,
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index 9f4ab00c8050..f679dd4c272a 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -41,7 +41,7 @@ helper_mt(const struct sk_buff *skb, struct xt_action_param *par)
41 if (!master_help) 41 if (!master_help)
42 return ret; 42 return ret;
43 43
44 /* rcu_read_lock()ed by nf_hook_slow */ 44 /* rcu_read_lock()ed by nf_hook_thresh */
45 helper = rcu_dereference(master_help->helper); 45 helper = rcu_dereference(master_help->helper);
46 if (!helper) 46 if (!helper)
47 return ret; 47 return ret;
@@ -65,7 +65,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
65 par->family); 65 par->family);
66 return ret; 66 return ret;
67 } 67 }
68 info->name[29] = '\0'; 68 info->name[sizeof(info->name) - 1] = '\0';
69 return 0; 69 return 0;
70} 70}
71 71
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index e5f18988aee0..bb33598e4530 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -107,8 +107,8 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
107 info->invert & XT_PHYSDEV_OP_BRIDGED) && 107 info->invert & XT_PHYSDEV_OP_BRIDGED) &&
108 par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | 108 par->hook_mask & ((1 << NF_INET_LOCAL_OUT) |
109 (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { 109 (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) {
110 pr_info("using --physdev-out and --physdev-is-out are only" 110 pr_info("using --physdev-out and --physdev-is-out are only "
111 "supported in the FORWARD and POSTROUTING chains with" 111 "supported in the FORWARD and POSTROUTING chains with "
112 "bridged traffic.\n"); 112 "bridged traffic.\n");
113 if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) 113 if (par->hook_mask & (1 << NF_INET_LOCAL_OUT))
114 return -EINVAL; 114 return -EINVAL;
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index d725a27743a1..e3b7a09b103e 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -110,7 +110,6 @@ static const struct file_operations recent_old_fops, recent_mt_fops;
110#endif 110#endif
111 111
112static u_int32_t hash_rnd __read_mostly; 112static u_int32_t hash_rnd __read_mostly;
113static bool hash_rnd_inited __read_mostly;
114 113
115static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr) 114static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr)
116{ 115{
@@ -340,10 +339,8 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
340 int ret = -EINVAL; 339 int ret = -EINVAL;
341 size_t sz; 340 size_t sz;
342 341
343 if (unlikely(!hash_rnd_inited)) { 342 net_get_random_once(&hash_rnd, sizeof(hash_rnd));
344 get_random_bytes(&hash_rnd, sizeof(hash_rnd)); 343
345 hash_rnd_inited = true;
346 }
347 if (info->check_set & ~XT_RECENT_VALID_FLAGS) { 344 if (info->check_set & ~XT_RECENT_VALID_FLAGS) {
348 pr_info("Unsupported user space flags (%08x)\n", 345 pr_info("Unsupported user space flags (%08x)\n",
349 info->check_set); 346 info->check_set);
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index ef36a56a02c6..4dedb96d1a06 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -68,7 +68,7 @@ match_packet(const struct sk_buff *skb,
68 ++i, offset, sch->type, htons(sch->length), 68 ++i, offset, sch->type, htons(sch->length),
69 sch->flags); 69 sch->flags);
70#endif 70#endif
71 offset += WORD_ROUND(ntohs(sch->length)); 71 offset += SCTP_PAD4(ntohs(sch->length));
72 72
73 pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset); 73 pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset);
74 74
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 627f898c05b9..62bea4591054 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1832,7 +1832,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1832 /* Record the max length of recvmsg() calls for future allocations */ 1832 /* Record the max length of recvmsg() calls for future allocations */
1833 nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len); 1833 nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len);
1834 nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len, 1834 nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
1835 16384); 1835 SKB_WITH_OVERHEAD(32768));
1836 1836
1837 copied = data_skb->len; 1837 copied = data_skb->len;
1838 if (len < copied) { 1838 if (len < copied) {
@@ -2083,8 +2083,9 @@ static int netlink_dump(struct sock *sk)
2083 2083
2084 if (alloc_min_size < nlk->max_recvmsg_len) { 2084 if (alloc_min_size < nlk->max_recvmsg_len) {
2085 alloc_size = nlk->max_recvmsg_len; 2085 alloc_size = nlk->max_recvmsg_len;
2086 skb = alloc_skb(alloc_size, GFP_KERNEL | 2086 skb = alloc_skb(alloc_size,
2087 __GFP_NOWARN | __GFP_NORETRY); 2087 (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
2088 __GFP_NOWARN | __GFP_NORETRY);
2088 } 2089 }
2089 if (!skb) { 2090 if (!skb) {
2090 alloc_size = alloc_min_size; 2091 alloc_size = alloc_min_size;
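These two af_netlink hunks work together: the recorded max_recvmsg_len is now clamped with SKB_WITH_OVERHEAD(32768) so that the dump buffer sized from it, shared-info overhead included, fits a 32 KiB allocation, and that opportunistic large allocation is attempted without direct reclaim so it fails fast under memory pressure instead of stalling the dump. Roughly, the allocation then falls back to the guaranteed minimum (a condensed sketch of the surrounding code, not a verbatim quote):

	/* try the large buffer first: no direct reclaim, no warning on failure */
	skb = alloc_skb(nlk->max_recvmsg_len,
			(GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
			__GFP_NOWARN | __GFP_NORETRY);
	if (!skb)
		/* fall back to the minimum size every dump is guaranteed */
		skb = alloc_skb(alloc_min_size, GFP_KERNEL);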
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 8dd836a8dd60..b2f0e986a6f4 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -63,43 +63,74 @@ out_nlmsg_trim:
63static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, 63static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
64 int protocol, int s_num) 64 int protocol, int s_num)
65{ 65{
66 struct rhashtable_iter *hti = (void *)cb->args[2];
66 struct netlink_table *tbl = &nl_table[protocol]; 67 struct netlink_table *tbl = &nl_table[protocol];
67 struct rhashtable *ht = &tbl->hash;
68 const struct bucket_table *htbl = rht_dereference_rcu(ht->tbl, ht);
69 struct net *net = sock_net(skb->sk); 68 struct net *net = sock_net(skb->sk);
70 struct netlink_diag_req *req; 69 struct netlink_diag_req *req;
71 struct netlink_sock *nlsk; 70 struct netlink_sock *nlsk;
72 struct sock *sk; 71 struct sock *sk;
73 int ret = 0, num = 0, i; 72 int num = 2;
73 int ret = 0;
74 74
75 req = nlmsg_data(cb->nlh); 75 req = nlmsg_data(cb->nlh);
76 76
77 for (i = 0; i < htbl->size; i++) { 77 if (s_num > 1)
78 struct rhash_head *pos; 78 goto mc_list;
79 79
80 rht_for_each_entry_rcu(nlsk, pos, htbl, i, node) { 80 num--;
81 sk = (struct sock *)nlsk;
82 81
83 if (!net_eq(sock_net(sk), net)) 82 if (!hti) {
84 continue; 83 hti = kmalloc(sizeof(*hti), GFP_KERNEL);
85 if (num < s_num) { 84 if (!hti)
86 num++; 85 return -ENOMEM;
86
87 cb->args[2] = (long)hti;
88 }
89
90 if (!s_num)
91 rhashtable_walk_enter(&tbl->hash, hti);
92
93 ret = rhashtable_walk_start(hti);
94 if (ret == -EAGAIN)
95 ret = 0;
96 if (ret)
97 goto stop;
98
99 while ((nlsk = rhashtable_walk_next(hti))) {
100 if (IS_ERR(nlsk)) {
101 ret = PTR_ERR(nlsk);
102 if (ret == -EAGAIN) {
103 ret = 0;
87 continue; 104 continue;
88 } 105 }
106 break;
107 }
89 108
90 if (sk_diag_fill(sk, skb, req, 109 sk = (struct sock *)nlsk;
91 NETLINK_CB(cb->skb).portid,
92 cb->nlh->nlmsg_seq,
93 NLM_F_MULTI,
94 sock_i_ino(sk)) < 0) {
95 ret = 1;
96 goto done;
97 }
98 110
99 num++; 111 if (!net_eq(sock_net(sk), net))
112 continue;
113
114 if (sk_diag_fill(sk, skb, req,
115 NETLINK_CB(cb->skb).portid,
116 cb->nlh->nlmsg_seq,
117 NLM_F_MULTI,
118 sock_i_ino(sk)) < 0) {
119 ret = 1;
120 break;
100 } 121 }
101 } 122 }
102 123
124stop:
125 rhashtable_walk_stop(hti);
126 if (ret)
127 goto done;
128
129 rhashtable_walk_exit(hti);
130 num++;
131
132mc_list:
133 read_lock(&nl_table_lock);
103 sk_for_each_bound(sk, &tbl->mc_list) { 134 sk_for_each_bound(sk, &tbl->mc_list) {
104 if (sk_hashed(sk)) 135 if (sk_hashed(sk))
105 continue; 136 continue;
@@ -116,13 +147,14 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
116 NLM_F_MULTI, 147 NLM_F_MULTI,
117 sock_i_ino(sk)) < 0) { 148 sock_i_ino(sk)) < 0) {
118 ret = 1; 149 ret = 1;
119 goto done; 150 break;
120 } 151 }
121 num++; 152 num++;
122 } 153 }
154 read_unlock(&nl_table_lock);
155
123done: 156done:
124 cb->args[0] = num; 157 cb->args[0] = num;
125 cb->args[1] = protocol;
126 158
127 return ret; 159 return ret;
128} 160}
@@ -131,20 +163,20 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
131{ 163{
132 struct netlink_diag_req *req; 164 struct netlink_diag_req *req;
133 int s_num = cb->args[0]; 165 int s_num = cb->args[0];
166 int err = 0;
134 167
135 req = nlmsg_data(cb->nlh); 168 req = nlmsg_data(cb->nlh);
136 169
137 rcu_read_lock();
138 read_lock(&nl_table_lock);
139
140 if (req->sdiag_protocol == NDIAG_PROTO_ALL) { 170 if (req->sdiag_protocol == NDIAG_PROTO_ALL) {
141 int i; 171 int i;
142 172
143 for (i = cb->args[1]; i < MAX_LINKS; i++) { 173 for (i = cb->args[1]; i < MAX_LINKS; i++) {
144 if (__netlink_diag_dump(skb, cb, i, s_num)) 174 err = __netlink_diag_dump(skb, cb, i, s_num);
175 if (err)
145 break; 176 break;
146 s_num = 0; 177 s_num = 0;
147 } 178 }
179 cb->args[1] = i;
148 } else { 180 } else {
149 if (req->sdiag_protocol >= MAX_LINKS) { 181 if (req->sdiag_protocol >= MAX_LINKS) {
150 read_unlock(&nl_table_lock); 182 read_unlock(&nl_table_lock);
@@ -152,13 +184,22 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
152 return -ENOENT; 184 return -ENOENT;
153 } 185 }
154 186
155 __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); 187 err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num);
156 } 188 }
157 189
158 read_unlock(&nl_table_lock); 190 return err < 0 ? err : skb->len;
159 rcu_read_unlock(); 191}
192
193static int netlink_diag_dump_done(struct netlink_callback *cb)
194{
195 struct rhashtable_iter *hti = (void *)cb->args[2];
196
197 if (cb->args[0] == 1)
198 rhashtable_walk_exit(hti);
160 199
161 return skb->len; 200 kfree(hti);
201
202 return 0;
162} 203}
163 204
164static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) 205static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
@@ -172,6 +213,7 @@ static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
172 if (h->nlmsg_flags & NLM_F_DUMP) { 213 if (h->nlmsg_flags & NLM_F_DUMP) {
173 struct netlink_dump_control c = { 214 struct netlink_dump_control c = {
174 .dump = netlink_diag_dump, 215 .dump = netlink_diag_dump,
216 .done = netlink_diag_dump_done,
175 }; 217 };
176 return netlink_dump_start(net->diag_nlsk, skb, h, &c); 218 return netlink_dump_start(net->diag_nlsk, skb, h, &c);
177 } else 219 } else
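The netlink_diag conversion above, like the nft_hash walk and gc functions earlier in this diff, follows the rhashtable resumable-iteration contract: enter the iterator once, start it before each batch, treat -EAGAIN from start or next as "the table resized, keep going", and always stop before sleeping and exit when finished. A condensed kernel-context sketch of that contract; ht, obj_type and visit() are placeholders for whatever table and per-object work applies:

	struct rhashtable_iter hti;
	struct obj_type *obj;
	int err;

	rhashtable_walk_enter(&ht, &hti);	/* pairs with rhashtable_walk_exit() */

	err = rhashtable_walk_start(&hti);
	if (err && err != -EAGAIN)
		goto out;

	while ((obj = rhashtable_walk_next(&hti))) {
		if (IS_ERR(obj)) {
			if (PTR_ERR(obj) == -EAGAIN)
				continue;	/* a resize raced with us; keep walking */
			break;
		}
		visit(obj);
	}
out:
	rhashtable_walk_stop(&hti);		/* safe to sleep again after this */
	rhashtable_walk_exit(&hti);

Storing the iterator in cb->args[2] and freeing it from the new .done callback is what lets the walk resume across multiple netlink dump invocations.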
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index a09132a69869..23cc12639ba7 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -977,7 +977,7 @@ static int genl_ctrl_event(int event, struct genl_family *family,
977 return 0; 977 return 0;
978} 978}
979 979
980static struct genl_ops genl_ctrl_ops[] = { 980static const struct genl_ops genl_ctrl_ops[] = {
981 { 981 {
982 .cmd = CTRL_CMD_GETFAMILY, 982 .cmd = CTRL_CMD_GETFAMILY,
983 .doit = ctrl_getfamily, 983 .doit = ctrl_getfamily,
@@ -986,7 +986,7 @@ static struct genl_ops genl_ctrl_ops[] = {
986 }, 986 },
987}; 987};
988 988
989static struct genl_multicast_group genl_ctrl_groups[] = { 989static const struct genl_multicast_group genl_ctrl_groups[] = {
990 { .name = "notify", }, 990 { .name = "notify", },
991}; 991};
992 992
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 1ecbd7715f6d..4e03f64709bc 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -71,6 +71,8 @@ struct ovs_frag_data {
71static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); 71static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
72 72
73#define DEFERRED_ACTION_FIFO_SIZE 10 73#define DEFERRED_ACTION_FIFO_SIZE 10
74#define OVS_RECURSION_LIMIT 5
75#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
74struct action_fifo { 76struct action_fifo {
75 int head; 77 int head;
76 int tail; 78 int tail;
@@ -78,7 +80,12 @@ struct action_fifo {
78 struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; 80 struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
79}; 81};
80 82
83struct recirc_keys {
84 struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
85};
86
81static struct action_fifo __percpu *action_fifos; 87static struct action_fifo __percpu *action_fifos;
88static struct recirc_keys __percpu *recirc_keys;
82static DEFINE_PER_CPU(int, exec_actions_level); 89static DEFINE_PER_CPU(int, exec_actions_level);
83 90
84static void action_fifo_init(struct action_fifo *fifo) 91static void action_fifo_init(struct action_fifo *fifo)
@@ -153,7 +160,7 @@ static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
153static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, 160static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
154 const struct ovs_action_push_mpls *mpls) 161 const struct ovs_action_push_mpls *mpls)
155{ 162{
156 __be32 *new_mpls_lse; 163 struct mpls_shim_hdr *new_mpls_lse;
157 164
158 /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ 165 /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
159 if (skb->encapsulation) 166 if (skb->encapsulation)
@@ -162,19 +169,23 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
162 if (skb_cow_head(skb, MPLS_HLEN) < 0) 169 if (skb_cow_head(skb, MPLS_HLEN) < 0)
163 return -ENOMEM; 170 return -ENOMEM;
164 171
172 if (!skb->inner_protocol) {
173 skb_set_inner_network_header(skb, skb->mac_len);
174 skb_set_inner_protocol(skb, skb->protocol);
175 }
176
165 skb_push(skb, MPLS_HLEN); 177 skb_push(skb, MPLS_HLEN);
166 memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), 178 memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
167 skb->mac_len); 179 skb->mac_len);
168 skb_reset_mac_header(skb); 180 skb_reset_mac_header(skb);
181 skb_set_network_header(skb, skb->mac_len);
169 182
170 new_mpls_lse = (__be32 *)skb_mpls_header(skb); 183 new_mpls_lse = mpls_hdr(skb);
171 *new_mpls_lse = mpls->mpls_lse; 184 new_mpls_lse->label_stack_entry = mpls->mpls_lse;
172 185
173 skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); 186 skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
174 187
175 update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); 188 update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
176 if (!skb->inner_protocol)
177 skb_set_inner_protocol(skb, skb->protocol);
178 skb->protocol = mpls->mpls_ethertype; 189 skb->protocol = mpls->mpls_ethertype;
179 190
180 invalidate_flow_key(key); 191 invalidate_flow_key(key);
@@ -191,18 +202,19 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
191 if (unlikely(err)) 202 if (unlikely(err))
192 return err; 203 return err;
193 204
194 skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN); 205 skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
195 206
196 memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), 207 memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
197 skb->mac_len); 208 skb->mac_len);
198 209
199 __skb_pull(skb, MPLS_HLEN); 210 __skb_pull(skb, MPLS_HLEN);
200 skb_reset_mac_header(skb); 211 skb_reset_mac_header(skb);
212 skb_set_network_header(skb, skb->mac_len);
201 213
202 /* skb_mpls_header() is used to locate the ethertype 214 /* mpls_hdr() is used to locate the ethertype field correctly in the
203 * field correctly in the presence of VLAN tags. 215 * presence of VLAN tags.
204 */ 216 */
205 hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); 217 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
206 update_ethertype(skb, hdr, ethertype); 218 update_ethertype(skb, hdr, ethertype);
207 if (eth_p_mpls(skb->protocol)) 219 if (eth_p_mpls(skb->protocol))
208 skb->protocol = ethertype; 220 skb->protocol = ethertype;
@@ -214,7 +226,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
214static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, 226static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
215 const __be32 *mpls_lse, const __be32 *mask) 227 const __be32 *mpls_lse, const __be32 *mask)
216{ 228{
217 __be32 *stack; 229 struct mpls_shim_hdr *stack;
218 __be32 lse; 230 __be32 lse;
219 int err; 231 int err;
220 232
@@ -222,16 +234,16 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
222 if (unlikely(err)) 234 if (unlikely(err))
223 return err; 235 return err;
224 236
225 stack = (__be32 *)skb_mpls_header(skb); 237 stack = mpls_hdr(skb);
226 lse = OVS_MASKED(*stack, *mpls_lse, *mask); 238 lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask);
227 if (skb->ip_summed == CHECKSUM_COMPLETE) { 239 if (skb->ip_summed == CHECKSUM_COMPLETE) {
228 __be32 diff[] = { ~(*stack), lse }; 240 __be32 diff[] = { ~(stack->label_stack_entry), lse };
229 241
230 skb->csum = ~csum_partial((char *)diff, sizeof(diff), 242 skb->csum = ~csum_partial((char *)diff, sizeof(diff),
231 ~skb->csum); 243 ~skb->csum);
232 } 244 }
233 245
234 *stack = lse; 246 stack->label_stack_entry = lse;
235 flow_key->mpls.top_lse = lse; 247 flow_key->mpls.top_lse = lse;
236 return 0; 248 return 0;
237} 249}
@@ -241,20 +253,24 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
241 int err; 253 int err;
242 254
243 err = skb_vlan_pop(skb); 255 err = skb_vlan_pop(skb);
244 if (skb_vlan_tag_present(skb)) 256 if (skb_vlan_tag_present(skb)) {
245 invalidate_flow_key(key); 257 invalidate_flow_key(key);
246 else 258 } else {
247 key->eth.tci = 0; 259 key->eth.vlan.tci = 0;
260 key->eth.vlan.tpid = 0;
261 }
248 return err; 262 return err;
249} 263}
250 264
251static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, 265static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
252 const struct ovs_action_push_vlan *vlan) 266 const struct ovs_action_push_vlan *vlan)
253{ 267{
254 if (skb_vlan_tag_present(skb)) 268 if (skb_vlan_tag_present(skb)) {
255 invalidate_flow_key(key); 269 invalidate_flow_key(key);
256 else 270 } else {
257 key->eth.tci = vlan->vlan_tci; 271 key->eth.vlan.tci = vlan->vlan_tci;
272 key->eth.vlan.tpid = vlan->vlan_tpid;
273 }
258 return skb_vlan_push(skb, vlan->vlan_tpid, 274 return skb_vlan_push(skb, vlan->vlan_tpid,
259 ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); 275 ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
260} 276}
@@ -1011,6 +1027,7 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
1011 const struct nlattr *a, int rem) 1027 const struct nlattr *a, int rem)
1012{ 1028{
1013 struct deferred_action *da; 1029 struct deferred_action *da;
1030 int level;
1014 1031
1015 if (!is_flow_key_valid(key)) { 1032 if (!is_flow_key_valid(key)) {
1016 int err; 1033 int err;
@@ -1034,6 +1051,18 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
1034 return 0; 1051 return 0;
1035 } 1052 }
1036 1053
1054 level = this_cpu_read(exec_actions_level);
1055 if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
1056 struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
1057 struct sw_flow_key *recirc_key = &rks->key[level - 1];
1058
1059 *recirc_key = *key;
1060 recirc_key->recirc_id = nla_get_u32(a);
1061 ovs_dp_process_packet(skb, recirc_key);
1062
1063 return 0;
1064 }
1065
1037 da = add_deferred_actions(skb, key, NULL); 1066 da = add_deferred_actions(skb, key, NULL);
1038 if (da) { 1067 if (da) {
1039 da->pkt_key.recirc_id = nla_get_u32(a); 1068 da->pkt_key.recirc_id = nla_get_u32(a);
@@ -1200,11 +1229,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
1200 const struct sw_flow_actions *acts, 1229 const struct sw_flow_actions *acts,
1201 struct sw_flow_key *key) 1230 struct sw_flow_key *key)
1202{ 1231{
1203 static const int ovs_recursion_limit = 5;
1204 int err, level; 1232 int err, level;
1205 1233
1206 level = __this_cpu_inc_return(exec_actions_level); 1234 level = __this_cpu_inc_return(exec_actions_level);
1207 if (unlikely(level > ovs_recursion_limit)) { 1235 if (unlikely(level > OVS_RECURSION_LIMIT)) {
1208 net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", 1236 net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
1209 ovs_dp_name(dp)); 1237 ovs_dp_name(dp));
1210 kfree_skb(skb); 1238 kfree_skb(skb);
@@ -1229,10 +1257,17 @@ int action_fifos_init(void)
1229 if (!action_fifos) 1257 if (!action_fifos)
1230 return -ENOMEM; 1258 return -ENOMEM;
1231 1259
1260 recirc_keys = alloc_percpu(struct recirc_keys);
1261 if (!recirc_keys) {
1262 free_percpu(action_fifos);
1263 return -ENOMEM;
1264 }
1265
1232 return 0; 1266 return 0;
1233} 1267}
1234 1268
1235void action_fifos_exit(void) 1269void action_fifos_exit(void)
1236{ 1270{
1237 free_percpu(action_fifos); 1271 free_percpu(action_fifos);
1272 free_percpu(recirc_keys);
1238} 1273}
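
The actions.c hunks above replace unconditional deferral of recirculated packets with bounded direct recursion: up to OVS_DEFERRED_ACTION_THRESHOLD levels, execute_recirc() reuses a per-CPU sw_flow_key slot from recirc_keys instead of queueing a deferred action, and ovs_execute_actions() caps the depth with OVS_RECURSION_LIMIT. The following userspace C sketch shows only the pattern; RECURSION_LIMIT, DEFERRED_THRESHOLD, struct pkt_key and process()/recirculate() are stand-ins invented for this example, and plain static variables replace the kernel's per-CPU storage.

#include <stdio.h>

#define RECURSION_LIMIT     5   /* stands in for OVS_RECURSION_LIMIT */
#define DEFERRED_THRESHOLD  4   /* stands in for OVS_DEFERRED_ACTION_THRESHOLD */

struct pkt_key { unsigned int recirc_id; };

static int exec_level;                                  /* per-CPU in the kernel */
static struct pkt_key recirc_keys[DEFERRED_THRESHOLD];  /* per-CPU in the kernel */

static void process(struct pkt_key *key);

static void recirculate(const struct pkt_key *key, unsigned int new_id)
{
        if (exec_level <= DEFERRED_THRESHOLD) {
                /* Reuse the slot for this nesting level: no allocation. */
                struct pkt_key *rk = &recirc_keys[exec_level - 1];

                *rk = *key;
                rk->recirc_id = new_id;
                process(rk);
                return;
        }
        /* Beyond the threshold the kernel queues a deferred action instead. */
        printf("would defer recirc_id %u\n", new_id);
}

static void process(struct pkt_key *key)
{
        if (++exec_level > RECURSION_LIMIT) {
                printf("recursion limit reached, dropping\n");
                exec_level--;
                return;
        }
        if (key->recirc_id < 3)                 /* toy termination condition */
                recirculate(key, key->recirc_id + 1);
        exec_level--;
}

int main(void)
{
        struct pkt_key key = { .recirc_id = 0 };

        process(&key);
        return 0;
}

Reusing a fixed slot per nesting level keeps the common case allocation-free; only depths beyond the threshold fall back to the deferred-action FIFO.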
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index e054a748ff25..31045ef44a82 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1367,7 +1367,7 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
1367 if (ct_info->helper) 1367 if (ct_info->helper)
1368 module_put(ct_info->helper->me); 1368 module_put(ct_info->helper->me);
1369 if (ct_info->ct) 1369 if (ct_info->ct)
1370 nf_ct_put(ct_info->ct); 1370 nf_ct_tmpl_free(ct_info->ct);
1371} 1371}
1372 1372
1373void ovs_ct_init(struct net *net) 1373void ovs_ct_init(struct net *net)
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 524c0fd3078e..4d67ea856067 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -928,7 +928,6 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
928 struct sw_flow_mask mask; 928 struct sw_flow_mask mask;
929 struct sk_buff *reply; 929 struct sk_buff *reply;
930 struct datapath *dp; 930 struct datapath *dp;
931 struct sw_flow_key key;
932 struct sw_flow_actions *acts; 931 struct sw_flow_actions *acts;
933 struct sw_flow_match match; 932 struct sw_flow_match match;
934 u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); 933 u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
@@ -956,20 +955,24 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
956 } 955 }
957 956
958 /* Extract key. */ 957 /* Extract key. */
959 ovs_match_init(&match, &key, &mask); 958 ovs_match_init(&match, &new_flow->key, false, &mask);
960 error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], 959 error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
961 a[OVS_FLOW_ATTR_MASK], log); 960 a[OVS_FLOW_ATTR_MASK], log);
962 if (error) 961 if (error)
963 goto err_kfree_flow; 962 goto err_kfree_flow;
964 963
965 ovs_flow_mask_key(&new_flow->key, &key, true, &mask);
966
967 /* Extract flow identifier. */ 964 /* Extract flow identifier. */
968 error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], 965 error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID],
969 &key, log); 966 &new_flow->key, log);
970 if (error) 967 if (error)
971 goto err_kfree_flow; 968 goto err_kfree_flow;
972 969
970 /* unmasked key is needed to match when ufid is not used. */
971 if (ovs_identifier_is_key(&new_flow->id))
972 match.key = new_flow->id.unmasked_key;
973
974 ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask);
975
973 /* Validate actions. */ 976 /* Validate actions. */
974 error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], 977 error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS],
975 &new_flow->key, &acts, log); 978 &new_flow->key, &acts, log);
@@ -996,7 +999,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
996 if (ovs_identifier_is_ufid(&new_flow->id)) 999 if (ovs_identifier_is_ufid(&new_flow->id))
997 flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); 1000 flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
998 if (!flow) 1001 if (!flow)
999 flow = ovs_flow_tbl_lookup(&dp->table, &key); 1002 flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key);
1000 if (likely(!flow)) { 1003 if (likely(!flow)) {
1001 rcu_assign_pointer(new_flow->sf_acts, acts); 1004 rcu_assign_pointer(new_flow->sf_acts, acts);
1002 1005
@@ -1121,7 +1124,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
1121 1124
1122 ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); 1125 ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
1123 if (a[OVS_FLOW_ATTR_KEY]) { 1126 if (a[OVS_FLOW_ATTR_KEY]) {
1124 ovs_match_init(&match, &key, &mask); 1127 ovs_match_init(&match, &key, true, &mask);
1125 error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], 1128 error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
1126 a[OVS_FLOW_ATTR_MASK], log); 1129 a[OVS_FLOW_ATTR_MASK], log);
1127 } else if (!ufid_present) { 1130 } else if (!ufid_present) {
@@ -1238,7 +1241,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
1238 1241
1239 ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); 1242 ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
1240 if (a[OVS_FLOW_ATTR_KEY]) { 1243 if (a[OVS_FLOW_ATTR_KEY]) {
1241 ovs_match_init(&match, &key, NULL); 1244 ovs_match_init(&match, &key, true, NULL);
1242 err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, 1245 err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL,
1243 log); 1246 log);
1244 } else if (!ufid_present) { 1247 } else if (!ufid_present) {
@@ -1297,7 +1300,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
1297 1300
1298 ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); 1301 ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
1299 if (a[OVS_FLOW_ATTR_KEY]) { 1302 if (a[OVS_FLOW_ATTR_KEY]) {
1300 ovs_match_init(&match, &key, NULL); 1303 ovs_match_init(&match, &key, true, NULL);
1301 err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], 1304 err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
1302 NULL, log); 1305 NULL, log);
1303 if (unlikely(err)) 1306 if (unlikely(err))
@@ -2437,3 +2440,7 @@ module_exit(dp_cleanup);
2437 2440
2438MODULE_DESCRIPTION("Open vSwitch switching datapath"); 2441MODULE_DESCRIPTION("Open vSwitch switching datapath");
2439MODULE_LICENSE("GPL"); 2442MODULE_LICENSE("GPL");
2443MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY);
2444MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
2445MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
2446MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
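
The datapath.c changes build the match directly into new_flow->key (ovs_match_init() now takes a reset_key flag) and mask that key in place with ovs_flow_mask_key(), keeping the unmasked key only where it is needed as the flow identifier. The sketch below illustrates what masking a key means here; it is a self-contained userspace example, not the kernel helper, and KEY_LEN and mask_key() are invented for it.

#include <stdio.h>

#define KEY_LEN 8

/* Stand-in for ovs_flow_mask_key(): dst = src & mask, byte by byte. */
static void mask_key(unsigned char *dst, const unsigned char *src,
                     const unsigned char *mask)
{
        int i;

        for (i = 0; i < KEY_LEN; i++)
                dst[i] = src[i] & mask[i];
}

int main(void)
{
        unsigned char key[KEY_LEN]  = { 0x0a, 0x00, 0x00, 0x01, 0x11, 0x22, 0x33, 0x44 };
        unsigned char mask[KEY_LEN] = { 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0x00, 0x00 };
        unsigned char masked[KEY_LEN];
        int i;

        mask_key(masked, key, mask);    /* the masked key is what gets installed */
        for (i = 0; i < KEY_LEN; i++)
                printf("%02x", masked[i]);
        putchar('\n');
        return 0;
}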
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 0ea128eeeab2..22087062bd10 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -29,6 +29,7 @@
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/in.h> 30#include <linux/in.h>
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/cpumask.h>
32#include <linux/if_arp.h> 33#include <linux/if_arp.h>
33#include <linux/ip.h> 34#include <linux/ip.h>
34#include <linux/ipv6.h> 35#include <linux/ipv6.h>
@@ -72,32 +73,33 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
72{ 73{
73 struct flow_stats *stats; 74 struct flow_stats *stats;
74 int node = numa_node_id(); 75 int node = numa_node_id();
76 int cpu = smp_processor_id();
75 int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); 77 int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
76 78
77 stats = rcu_dereference(flow->stats[node]); 79 stats = rcu_dereference(flow->stats[cpu]);
78 80
79 /* Check if already have node-specific stats. */ 81 /* Check if already have CPU-specific stats. */
80 if (likely(stats)) { 82 if (likely(stats)) {
81 spin_lock(&stats->lock); 83 spin_lock(&stats->lock);
82 /* Mark if we write on the pre-allocated stats. */ 84 /* Mark if we write on the pre-allocated stats. */
83 if (node == 0 && unlikely(flow->stats_last_writer != node)) 85 if (cpu == 0 && unlikely(flow->stats_last_writer != cpu))
84 flow->stats_last_writer = node; 86 flow->stats_last_writer = cpu;
85 } else { 87 } else {
86 stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ 88 stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
87 spin_lock(&stats->lock); 89 spin_lock(&stats->lock);
88 90
89 /* If the current NUMA-node is the only writer on the 91 /* If the current CPU is the only writer on the
90 * pre-allocated stats keep using them. 92 * pre-allocated stats keep using them.
91 */ 93 */
92 if (unlikely(flow->stats_last_writer != node)) { 94 if (unlikely(flow->stats_last_writer != cpu)) {
93 /* A previous locker may have already allocated the 95 /* A previous locker may have already allocated the
94 * stats, so we need to check again. If node-specific 96 * stats, so we need to check again. If CPU-specific
95 * stats were already allocated, we update the pre- 97 * stats were already allocated, we update the pre-
96 * allocated stats as we have already locked them. 98 * allocated stats as we have already locked them.
97 */ 99 */
98 if (likely(flow->stats_last_writer != NUMA_NO_NODE) 100 if (likely(flow->stats_last_writer != -1) &&
99 && likely(!rcu_access_pointer(flow->stats[node]))) { 101 likely(!rcu_access_pointer(flow->stats[cpu]))) {
100 /* Try to allocate node-specific stats. */ 102 /* Try to allocate CPU-specific stats. */
101 struct flow_stats *new_stats; 103 struct flow_stats *new_stats;
102 104
103 new_stats = 105 new_stats =
@@ -114,12 +116,12 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
114 new_stats->tcp_flags = tcp_flags; 116 new_stats->tcp_flags = tcp_flags;
115 spin_lock_init(&new_stats->lock); 117 spin_lock_init(&new_stats->lock);
116 118
117 rcu_assign_pointer(flow->stats[node], 119 rcu_assign_pointer(flow->stats[cpu],
118 new_stats); 120 new_stats);
119 goto unlock; 121 goto unlock;
120 } 122 }
121 } 123 }
122 flow->stats_last_writer = node; 124 flow->stats_last_writer = cpu;
123 } 125 }
124 } 126 }
125 127
@@ -136,14 +138,15 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
136 struct ovs_flow_stats *ovs_stats, 138 struct ovs_flow_stats *ovs_stats,
137 unsigned long *used, __be16 *tcp_flags) 139 unsigned long *used, __be16 *tcp_flags)
138{ 140{
139 int node; 141 int cpu;
140 142
141 *used = 0; 143 *used = 0;
142 *tcp_flags = 0; 144 *tcp_flags = 0;
143 memset(ovs_stats, 0, sizeof(*ovs_stats)); 145 memset(ovs_stats, 0, sizeof(*ovs_stats));
144 146
145 for_each_node(node) { 147 /* We open code this to make sure cpu 0 is always considered */
146 struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]); 148 for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
149 struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
147 150
148 if (stats) { 151 if (stats) {
149 /* Local CPU may write on non-local stats, so we must 152 /* Local CPU may write on non-local stats, so we must
@@ -163,10 +166,11 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
163/* Called with ovs_mutex. */ 166/* Called with ovs_mutex. */
164void ovs_flow_stats_clear(struct sw_flow *flow) 167void ovs_flow_stats_clear(struct sw_flow *flow)
165{ 168{
166 int node; 169 int cpu;
167 170
168 for_each_node(node) { 171 /* We open code this to make sure cpu 0 is always considered */
169 struct flow_stats *stats = ovsl_dereference(flow->stats[node]); 172 for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
173 struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
170 174
171 if (stats) { 175 if (stats) {
172 spin_lock_bh(&stats->lock); 176 spin_lock_bh(&stats->lock);
@@ -302,24 +306,57 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
302 sizeof(struct icmp6hdr)); 306 sizeof(struct icmp6hdr));
303} 307}
304 308
305static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) 309/**
310 * Parse vlan tag from vlan header.
311 * Returns ERROR on memory error.
312 * Returns 0 if it encounters a non-vlan or incomplete packet.
313 * Returns 1 after successfully parsing vlan tag.
314 */
315static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh)
306{ 316{
307 struct qtag_prefix { 317 struct vlan_head *vh = (struct vlan_head *)skb->data;
308 __be16 eth_type; /* ETH_P_8021Q */ 318
309 __be16 tci; 319 if (likely(!eth_type_vlan(vh->tpid)))
310 }; 320 return 0;
311 struct qtag_prefix *qp;
312 321
313 if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))) 322 if (unlikely(skb->len < sizeof(struct vlan_head) + sizeof(__be16)))
314 return 0; 323 return 0;
315 324
316 if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + 325 if (unlikely(!pskb_may_pull(skb, sizeof(struct vlan_head) +
317 sizeof(__be16)))) 326 sizeof(__be16))))
318 return -ENOMEM; 327 return -ENOMEM;
319 328
320 qp = (struct qtag_prefix *) skb->data; 329 vh = (struct vlan_head *)skb->data;
321 key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); 330 key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT);
322 __skb_pull(skb, sizeof(struct qtag_prefix)); 331 key_vh->tpid = vh->tpid;
332
333 __skb_pull(skb, sizeof(struct vlan_head));
334 return 1;
335}
336
337static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
338{
339 int res;
340
341 key->eth.vlan.tci = 0;
342 key->eth.vlan.tpid = 0;
343 key->eth.cvlan.tci = 0;
344 key->eth.cvlan.tpid = 0;
345
346 if (skb_vlan_tag_present(skb)) {
347 key->eth.vlan.tci = htons(skb->vlan_tci);
348 key->eth.vlan.tpid = skb->vlan_proto;
349 } else {
350 /* Parse outer vlan tag in the non-accelerated case. */
351 res = parse_vlan_tag(skb, &key->eth.vlan);
352 if (res <= 0)
353 return res;
354 }
355
356 /* Parse inner vlan tag. */
357 res = parse_vlan_tag(skb, &key->eth.cvlan);
358 if (res <= 0)
359 return res;
323 360
324 return 0; 361 return 0;
325} 362}
@@ -480,12 +517,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
480 * update skb->csum here. 517 * update skb->csum here.
481 */ 518 */
482 519
483 key->eth.tci = 0; 520 if (unlikely(parse_vlan(skb, key)))
484 if (skb_vlan_tag_present(skb)) 521 return -ENOMEM;
485 key->eth.tci = htons(skb->vlan_tci);
486 else if (eth->h_proto == htons(ETH_P_8021Q))
487 if (unlikely(parse_vlan(skb, key)))
488 return -ENOMEM;
489 522
490 key->eth.type = parse_ethertype(skb); 523 key->eth.type = parse_ethertype(skb);
491 if (unlikely(key->eth.type == htons(0))) 524 if (unlikely(key->eth.type == htons(0)))
@@ -600,12 +633,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
600 } else if (eth_p_mpls(key->eth.type)) { 633 } else if (eth_p_mpls(key->eth.type)) {
601 size_t stack_len = MPLS_HLEN; 634 size_t stack_len = MPLS_HLEN;
602 635
603 /* In the presence of an MPLS label stack the end of the L2 636 skb_set_inner_network_header(skb, skb->mac_len);
604 * header and the beginning of the L3 header differ.
605 *
606 * Advance network_header to the beginning of the L3
607 * header. mac_len corresponds to the end of the L2 header.
608 */
609 while (1) { 637 while (1) {
610 __be32 lse; 638 __be32 lse;
611 639
@@ -613,12 +641,12 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
613 if (unlikely(error)) 641 if (unlikely(error))
614 return 0; 642 return 0;
615 643
616 memcpy(&lse, skb_network_header(skb), MPLS_HLEN); 644 memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN);
617 645
618 if (stack_len == MPLS_HLEN) 646 if (stack_len == MPLS_HLEN)
619 memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); 647 memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN);
620 648
621 skb_set_network_header(skb, skb->mac_len + stack_len); 649 skb_set_inner_network_header(skb, skb->mac_len + stack_len);
622 if (lse & htonl(MPLS_LS_S_MASK)) 650 if (lse & htonl(MPLS_LS_S_MASK))
623 break; 651 break;
624 652
@@ -734,8 +762,6 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
734{ 762{
735 int err; 763 int err;
736 764
737 memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE);
738
739 /* Extract metadata from netlink attributes. */ 765 /* Extract metadata from netlink attributes. */
740 err = ovs_nla_get_flow_metadata(net, attr, key, log); 766 err = ovs_nla_get_flow_metadata(net, attr, key, log);
741 if (err) 767 if (err)
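
flow.c now parses up to two VLAN tags: parse_vlan() fills key->eth.vlan from either the accelerated tag or the outer in-band tag, then tries key->eth.cvlan for an inner tag. Below is a userspace sketch of that two-level parse on a hand-built 802.1ad + 802.1Q header; struct vlan_head mirrors the definition added to flow.h, but parse_tag(), is_vlan_tpid() and the sample bytes are assumptions made for this example, not kernel code.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct vlan_head {              /* mirrors the struct added to flow.h */
        uint16_t tpid;          /* network byte order */
        uint16_t tci;
};

static int is_vlan_tpid(uint16_t tpid_net)
{
        uint16_t t = ntohs(tpid_net);

        return t == 0x8100 || t == 0x88a8;      /* 802.1Q or 802.1ad */
}

/* Returns 4 if a tag was consumed, 0 for a non-VLAN or truncated header. */
static int parse_tag(const uint8_t *p, size_t len, struct vlan_head *out)
{
        if (len < sizeof(struct vlan_head) + 2) /* tag plus following ethertype */
                return 0;
        memcpy(out, p, sizeof(*out));
        if (!is_vlan_tpid(out->tpid))
                return 0;
        return sizeof(*out);
}

int main(void)
{
        /* bytes after the MAC addresses: S-TAG (VID 100), C-TAG (VID 200), IPv4 */
        const uint8_t hdr[] = {
                0x88, 0xa8, 0x00, 0x64,
                0x81, 0x00, 0x00, 0xc8,
                0x08, 0x00
        };
        const uint8_t *p = hdr;
        size_t len = sizeof(hdr);
        struct vlan_head outer, inner;
        int n;

        n = parse_tag(p, len, &outer);
        if (n) {
                p += n;
                len -= n;
                printf("outer VID %u\n", ntohs(outer.tci) & 0x0fff);
        }
        n = parse_tag(p, len, &inner);
        if (n)
                printf("inner VID %u\n", ntohs(inner.tci) & 0x0fff);
        return 0;
}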
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 03378e75a67c..ae783f5c6695 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -50,6 +50,11 @@ struct ovs_tunnel_info {
50 struct metadata_dst *tun_dst; 50 struct metadata_dst *tun_dst;
51}; 51};
52 52
53struct vlan_head {
54 __be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/
55 __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
56};
57
53#define OVS_SW_FLOW_KEY_METADATA_SIZE \ 58#define OVS_SW_FLOW_KEY_METADATA_SIZE \
54 (offsetof(struct sw_flow_key, recirc_id) + \ 59 (offsetof(struct sw_flow_key, recirc_id) + \
55 FIELD_SIZEOF(struct sw_flow_key, recirc_id)) 60 FIELD_SIZEOF(struct sw_flow_key, recirc_id))
@@ -69,7 +74,8 @@ struct sw_flow_key {
69 struct { 74 struct {
70 u8 src[ETH_ALEN]; /* Ethernet source address. */ 75 u8 src[ETH_ALEN]; /* Ethernet source address. */
71 u8 dst[ETH_ALEN]; /* Ethernet destination address. */ 76 u8 dst[ETH_ALEN]; /* Ethernet destination address. */
72 __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ 77 struct vlan_head vlan;
78 struct vlan_head cvlan;
73 __be16 type; /* Ethernet frame type. */ 79 __be16 type; /* Ethernet frame type. */
74 } eth; 80 } eth;
75 union { 81 union {
@@ -172,14 +178,14 @@ struct sw_flow {
172 struct hlist_node node[2]; 178 struct hlist_node node[2];
173 u32 hash; 179 u32 hash;
174 } flow_table, ufid_table; 180 } flow_table, ufid_table;
175 int stats_last_writer; /* NUMA-node id of the last writer on 181 int stats_last_writer; /* CPU id of the last writer on
176 * 'stats[0]'. 182 * 'stats[0]'.
177 */ 183 */
178 struct sw_flow_key key; 184 struct sw_flow_key key;
179 struct sw_flow_id id; 185 struct sw_flow_id id;
180 struct sw_flow_mask *mask; 186 struct sw_flow_mask *mask;
181 struct sw_flow_actions __rcu *sf_acts; 187 struct sw_flow_actions __rcu *sf_acts;
182 struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one 188 struct flow_stats __rcu *stats[]; /* One for each CPU. First one
183 * is allocated at flow creation time, 189 * is allocated at flow creation time,
184 * the rest are allocated on demand 190 * the rest are allocated on demand
185 * while holding the 'stats[0].lock'. 191 * while holding the 'stats[0].lock'.
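
With stats[] now indexed by CPU rather than NUMA node, the loops in flow.c and flow_table.c are open-coded as for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) so that the preallocated stats[0] is always visited even if CPU 0 is not in the possible mask. A minimal userspace analogue follows, with a plain bitmask standing in for cpu_possible_mask and next_set_bit() for cpumask_next(); both names are invented for the example.

#include <stdio.h>

#define NR_CPU_IDS 8

static unsigned long possible = 0xb4;   /* CPUs 2,4,5,7 possible; CPU 0 not set */

/* Stand-in for cpumask_next(): first set bit strictly after 'after'. */
static int next_set_bit(int after)
{
        int i;

        for (i = after + 1; i < NR_CPU_IDS; i++)
                if (possible & (1UL << i))
                        return i;
        return NR_CPU_IDS;
}

int main(void)
{
        int cpu;

        /* cpu starts at 0 unconditionally, then follows the mask. */
        for (cpu = 0; cpu < NR_CPU_IDS; cpu = next_set_bit(cpu))
                printf("visit stats[%d]\n", cpu);
        return 0;
}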
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index c78a6a1476fb..ae25ded82b3b 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -808,6 +808,167 @@ int ovs_nla_put_tunnel_info(struct sk_buff *skb,
808 ip_tunnel_info_af(tun_info)); 808 ip_tunnel_info_af(tun_info));
809} 809}
810 810
811static int encode_vlan_from_nlattrs(struct sw_flow_match *match,
812 const struct nlattr *a[],
813 bool is_mask, bool inner)
814{
815 __be16 tci = 0;
816 __be16 tpid = 0;
817
818 if (a[OVS_KEY_ATTR_VLAN])
819 tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
820
821 if (a[OVS_KEY_ATTR_ETHERTYPE])
822 tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
823
824 if (likely(!inner)) {
825 SW_FLOW_KEY_PUT(match, eth.vlan.tpid, tpid, is_mask);
826 SW_FLOW_KEY_PUT(match, eth.vlan.tci, tci, is_mask);
827 } else {
828 SW_FLOW_KEY_PUT(match, eth.cvlan.tpid, tpid, is_mask);
829 SW_FLOW_KEY_PUT(match, eth.cvlan.tci, tci, is_mask);
830 }
831 return 0;
832}
833
834static int validate_vlan_from_nlattrs(const struct sw_flow_match *match,
835 u64 key_attrs, bool inner,
836 const struct nlattr **a, bool log)
837{
838 __be16 tci = 0;
839
840 if (!((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
841 (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
842 eth_type_vlan(nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE])))) {
843 /* Not a VLAN. */
844 return 0;
845 }
846
847 if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
848 (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
849 OVS_NLERR(log, "Invalid %s frame", (inner) ? "C-VLAN" : "VLAN");
850 return -EINVAL;
851 }
852
853 if (a[OVS_KEY_ATTR_VLAN])
854 tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
855
856 if (!(tci & htons(VLAN_TAG_PRESENT))) {
857 if (tci) {
858 OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.",
859 (inner) ? "C-VLAN" : "VLAN");
860 return -EINVAL;
861 } else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) {
862 /* Corner case for truncated VLAN header. */
863 OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.",
864 (inner) ? "C-VLAN" : "VLAN");
865 return -EINVAL;
866 }
867 }
868
869 return 1;
870}
871
872static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match,
873 u64 key_attrs, bool inner,
874 const struct nlattr **a, bool log)
875{
876 __be16 tci = 0;
877 __be16 tpid = 0;
878 bool encap_valid = !!(match->key->eth.vlan.tci &
879 htons(VLAN_TAG_PRESENT));
880 bool i_encap_valid = !!(match->key->eth.cvlan.tci &
881 htons(VLAN_TAG_PRESENT));
882
883 if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) {
884 /* Not a VLAN. */
885 return 0;
886 }
887
888 if ((!inner && !encap_valid) || (inner && !i_encap_valid)) {
889 OVS_NLERR(log, "Encap mask attribute is set for non-%s frame.",
890 (inner) ? "C-VLAN" : "VLAN");
891 return -EINVAL;
892 }
893
894 if (a[OVS_KEY_ATTR_VLAN])
895 tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
896
897 if (a[OVS_KEY_ATTR_ETHERTYPE])
898 tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
899
900 if (tpid != htons(0xffff)) {
901 OVS_NLERR(log, "Must have an exact match on %s TPID (mask=%x).",
902 (inner) ? "C-VLAN" : "VLAN", ntohs(tpid));
903 return -EINVAL;
904 }
905 if (!(tci & htons(VLAN_TAG_PRESENT))) {
906 OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_TAG_PRESENT bit.",
907 (inner) ? "C-VLAN" : "VLAN");
908 return -EINVAL;
909 }
910
911 return 1;
912}
913
914static int __parse_vlan_from_nlattrs(struct sw_flow_match *match,
915 u64 *key_attrs, bool inner,
916 const struct nlattr **a, bool is_mask,
917 bool log)
918{
919 int err;
920 const struct nlattr *encap;
921
922 if (!is_mask)
923 err = validate_vlan_from_nlattrs(match, *key_attrs, inner,
924 a, log);
925 else
926 err = validate_vlan_mask_from_nlattrs(match, *key_attrs, inner,
927 a, log);
928 if (err <= 0)
929 return err;
930
931 err = encode_vlan_from_nlattrs(match, a, is_mask, inner);
932 if (err)
933 return err;
934
935 *key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
936 *key_attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
937 *key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
938
939 encap = a[OVS_KEY_ATTR_ENCAP];
940
941 if (!is_mask)
942 err = parse_flow_nlattrs(encap, a, key_attrs, log);
943 else
944 err = parse_flow_mask_nlattrs(encap, a, key_attrs, log);
945
946 return err;
947}
948
949static int parse_vlan_from_nlattrs(struct sw_flow_match *match,
950 u64 *key_attrs, const struct nlattr **a,
951 bool is_mask, bool log)
952{
953 int err;
954 bool encap_valid = false;
955
956 err = __parse_vlan_from_nlattrs(match, key_attrs, false, a,
957 is_mask, log);
958 if (err)
959 return err;
960
961 encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_TAG_PRESENT));
962 if (encap_valid) {
963 err = __parse_vlan_from_nlattrs(match, key_attrs, true, a,
964 is_mask, log);
965 if (err)
966 return err;
967 }
968
969 return 0;
970}
971
811static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, 972static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
812 u64 *attrs, const struct nlattr **a, 973 u64 *attrs, const struct nlattr **a,
813 bool is_mask, bool log) 974 bool is_mask, bool log)
@@ -923,20 +1084,11 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
923 } 1084 }
924 1085
925 if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { 1086 if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
926 __be16 tci; 1087 /* VLAN attribute is always parsed before getting here since it
927 1088 * may occur multiple times.
928 tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); 1089 */
929 if (!(tci & htons(VLAN_TAG_PRESENT))) { 1090 OVS_NLERR(log, "VLAN attribute unexpected.");
930 if (is_mask) 1091 return -EINVAL;
931 OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.");
932 else
933 OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set.");
934
935 return -EINVAL;
936 }
937
938 SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
939 attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
940 } 1092 }
941 1093
942 if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { 1094 if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
@@ -1182,49 +1334,18 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match,
1182 bool log) 1334 bool log)
1183{ 1335{
1184 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; 1336 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
1185 const struct nlattr *encap;
1186 struct nlattr *newmask = NULL; 1337 struct nlattr *newmask = NULL;
1187 u64 key_attrs = 0; 1338 u64 key_attrs = 0;
1188 u64 mask_attrs = 0; 1339 u64 mask_attrs = 0;
1189 bool encap_valid = false;
1190 int err; 1340 int err;
1191 1341
1192 err = parse_flow_nlattrs(nla_key, a, &key_attrs, log); 1342 err = parse_flow_nlattrs(nla_key, a, &key_attrs, log);
1193 if (err) 1343 if (err)
1194 return err; 1344 return err;
1195 1345
1196 if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && 1346 err = parse_vlan_from_nlattrs(match, &key_attrs, a, false, log);
1197 (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && 1347 if (err)
1198 (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { 1348 return err;
1199 __be16 tci;
1200
1201 if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
1202 (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
1203 OVS_NLERR(log, "Invalid Vlan frame.");
1204 return -EINVAL;
1205 }
1206
1207 key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
1208 tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
1209 encap = a[OVS_KEY_ATTR_ENCAP];
1210 key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
1211 encap_valid = true;
1212
1213 if (tci & htons(VLAN_TAG_PRESENT)) {
1214 err = parse_flow_nlattrs(encap, a, &key_attrs, log);
1215 if (err)
1216 return err;
1217 } else if (!tci) {
1218 /* Corner case for truncated 802.1Q header. */
1219 if (nla_len(encap)) {
1220 OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute.");
1221 return -EINVAL;
1222 }
1223 } else {
1224 OVS_NLERR(log, "Encap attr is set for non-VLAN frame");
1225 return -EINVAL;
1226 }
1227 }
1228 1349
1229 err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); 1350 err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log);
1230 if (err) 1351 if (err)
@@ -1265,46 +1386,12 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match,
1265 goto free_newmask; 1386 goto free_newmask;
1266 1387
1267 /* Always match on tci. */ 1388 /* Always match on tci. */
1268 SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); 1389 SW_FLOW_KEY_PUT(match, eth.vlan.tci, htons(0xffff), true);
1269 1390 SW_FLOW_KEY_PUT(match, eth.cvlan.tci, htons(0xffff), true);
1270 if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) {
1271 __be16 eth_type = 0;
1272 __be16 tci = 0;
1273
1274 if (!encap_valid) {
1275 OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame.");
1276 err = -EINVAL;
1277 goto free_newmask;
1278 }
1279
1280 mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
1281 if (a[OVS_KEY_ATTR_ETHERTYPE])
1282 eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
1283
1284 if (eth_type == htons(0xffff)) {
1285 mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
1286 encap = a[OVS_KEY_ATTR_ENCAP];
1287 err = parse_flow_mask_nlattrs(encap, a,
1288 &mask_attrs, log);
1289 if (err)
1290 goto free_newmask;
1291 } else {
1292 OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).",
1293 ntohs(eth_type));
1294 err = -EINVAL;
1295 goto free_newmask;
1296 }
1297
1298 if (a[OVS_KEY_ATTR_VLAN])
1299 tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
1300 1391
1301 if (!(tci & htons(VLAN_TAG_PRESENT))) { 1392 err = parse_vlan_from_nlattrs(match, &mask_attrs, a, true, log);
1302 OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).", 1393 if (err)
1303 ntohs(tci)); 1394 goto free_newmask;
1304 err = -EINVAL;
1305 goto free_newmask;
1306 }
1307 }
1308 1395
1309 err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, 1396 err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true,
1310 log); 1397 log);
@@ -1410,12 +1497,25 @@ int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr,
1410 return metadata_from_nlattrs(net, &match, &attrs, a, false, log); 1497 return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
1411} 1498}
1412 1499
1500static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh,
1501 bool is_mask)
1502{
1503 __be16 eth_type = !is_mask ? vh->tpid : htons(0xffff);
1504
1505 if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
1506 nla_put_be16(skb, OVS_KEY_ATTR_VLAN, vh->tci))
1507 return -EMSGSIZE;
1508 return 0;
1509}
1510
1413static int __ovs_nla_put_key(const struct sw_flow_key *swkey, 1511static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
1414 const struct sw_flow_key *output, bool is_mask, 1512 const struct sw_flow_key *output, bool is_mask,
1415 struct sk_buff *skb) 1513 struct sk_buff *skb)
1416{ 1514{
1417 struct ovs_key_ethernet *eth_key; 1515 struct ovs_key_ethernet *eth_key;
1418 struct nlattr *nla, *encap; 1516 struct nlattr *nla;
1517 struct nlattr *encap = NULL;
1518 struct nlattr *in_encap = NULL;
1419 1519
1420 if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) 1520 if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id))
1421 goto nla_put_failure; 1521 goto nla_put_failure;
@@ -1464,17 +1564,21 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
1464 ether_addr_copy(eth_key->eth_src, output->eth.src); 1564 ether_addr_copy(eth_key->eth_src, output->eth.src);
1465 ether_addr_copy(eth_key->eth_dst, output->eth.dst); 1565 ether_addr_copy(eth_key->eth_dst, output->eth.dst);
1466 1566
1467 if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { 1567 if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) {
1468 __be16 eth_type; 1568 if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask))
1469 eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
1470 if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
1471 nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
1472 goto nla_put_failure; 1569 goto nla_put_failure;
1473 encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); 1570 encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
1474 if (!swkey->eth.tci) 1571 if (!swkey->eth.vlan.tci)
1475 goto unencap; 1572 goto unencap;
1476 } else 1573
1477 encap = NULL; 1574 if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) {
1575 if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask))
1576 goto nla_put_failure;
1577 in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
1578 if (!swkey->eth.cvlan.tci)
1579 goto unencap;
1580 }
1581 }
1478 1582
1479 if (swkey->eth.type == htons(ETH_P_802_2)) { 1583 if (swkey->eth.type == htons(ETH_P_802_2)) {
1480 /* 1584 /*
@@ -1493,6 +1597,14 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
1493 if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) 1597 if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
1494 goto nla_put_failure; 1598 goto nla_put_failure;
1495 1599
1600 if (eth_type_vlan(swkey->eth.type)) {
1601 /* There are 3 VLAN tags, we don't know anything about the rest
1602 * of the packet, so truncate here.
1603 */
1604 WARN_ON_ONCE(!(encap && in_encap));
1605 goto unencap;
1606 }
1607
1496 if (swkey->eth.type == htons(ETH_P_IP)) { 1608 if (swkey->eth.type == htons(ETH_P_IP)) {
1497 struct ovs_key_ipv4 *ipv4_key; 1609 struct ovs_key_ipv4 *ipv4_key;
1498 1610
@@ -1619,6 +1731,8 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
1619 } 1731 }
1620 1732
1621unencap: 1733unencap:
1734 if (in_encap)
1735 nla_nest_end(skb, in_encap);
1622 if (encap) 1736 if (encap)
1623 nla_nest_end(skb, encap); 1737 nla_nest_end(skb, encap);
1624 1738
@@ -1882,13 +1996,15 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
1882 1996
1883void ovs_match_init(struct sw_flow_match *match, 1997void ovs_match_init(struct sw_flow_match *match,
1884 struct sw_flow_key *key, 1998 struct sw_flow_key *key,
1999 bool reset_key,
1885 struct sw_flow_mask *mask) 2000 struct sw_flow_mask *mask)
1886{ 2001{
1887 memset(match, 0, sizeof(*match)); 2002 memset(match, 0, sizeof(*match));
1888 match->key = key; 2003 match->key = key;
1889 match->mask = mask; 2004 match->mask = mask;
1890 2005
1891 memset(key, 0, sizeof(*key)); 2006 if (reset_key)
2007 memset(key, 0, sizeof(*key));
1892 2008
1893 if (mask) { 2009 if (mask) {
1894 memset(&mask->key, 0, sizeof(mask->key)); 2010 memset(&mask->key, 0, sizeof(mask->key));
@@ -1935,7 +2051,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
1935 struct nlattr *a; 2051 struct nlattr *a;
1936 int err = 0, start, opts_type; 2052 int err = 0, start, opts_type;
1937 2053
1938 ovs_match_init(&match, &key, NULL); 2054 ovs_match_init(&match, &key, true, NULL);
1939 opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); 2055 opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log);
1940 if (opts_type < 0) 2056 if (opts_type < 0)
1941 return opts_type; 2057 return opts_type;
@@ -2283,7 +2399,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
2283 2399
2284 case OVS_ACTION_ATTR_PUSH_VLAN: 2400 case OVS_ACTION_ATTR_PUSH_VLAN:
2285 vlan = nla_data(a); 2401 vlan = nla_data(a);
2286 if (vlan->vlan_tpid != htons(ETH_P_8021Q)) 2402 if (!eth_type_vlan(vlan->vlan_tpid))
2287 return -EINVAL; 2403 return -EINVAL;
2288 if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) 2404 if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
2289 return -EINVAL; 2405 return -EINVAL;
@@ -2388,7 +2504,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
2388 2504
2389 (*sfa)->orig_len = nla_len(attr); 2505 (*sfa)->orig_len = nla_len(attr);
2390 err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, 2506 err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type,
2391 key->eth.tci, log); 2507 key->eth.vlan.tci, log);
2392 if (err) 2508 if (err)
2393 ovs_nla_free_flow_actions(*sfa); 2509 ovs_nla_free_flow_actions(*sfa);
2394 2510
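
The flow_netlink.c rework moves all VLAN handling into parse_vlan_from_nlattrs() and its helpers, which enforce the same rules the old inline code did: a key TCI must carry VLAN_TAG_PRESENT unless the tag is wholly absent, and a mask must pin both the TPID and the present bit exactly. A compact userspace sketch of just those checks follows; validate_key_tci() and validate_mask_tci() are invented names, not the kernel functions.

#include <stdio.h>
#include <stdint.h>

#define VLAN_TAG_PRESENT 0x1000         /* bit 12 of the TCI in this representation */

static int validate_key_tci(uint16_t tci)
{
        if (!(tci & VLAN_TAG_PRESENT) && tci) {
                fprintf(stderr, "TCI lacks VLAN_TAG_PRESENT\n");
                return -1;
        }
        return 0;
}

static int validate_mask_tci(uint16_t tci_mask, uint16_t tpid_mask)
{
        if (tpid_mask != 0xffff) {
                fprintf(stderr, "TPID must be matched exactly\n");
                return -1;
        }
        if (!(tci_mask & VLAN_TAG_PRESENT)) {
                fprintf(stderr, "mask must pin VLAN_TAG_PRESENT\n");
                return -1;
        }
        return 0;
}

int main(void)
{
        printf("key  0x1064: %d\n", validate_key_tci(0x1064));             /* ok  */
        printf("key  0x0064: %d\n", validate_key_tci(0x0064));             /* bad */
        printf("mask ffff/ffff: %d\n", validate_mask_tci(0xffff, 0xffff)); /* ok  */
        return 0;
}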
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 47dd142eca1c..45f9769e5aac 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -41,7 +41,8 @@ size_t ovs_tun_key_attr_size(void);
41size_t ovs_key_attr_size(void); 41size_t ovs_key_attr_size(void);
42 42
43void ovs_match_init(struct sw_flow_match *match, 43void ovs_match_init(struct sw_flow_match *match,
44 struct sw_flow_key *key, struct sw_flow_mask *mask); 44 struct sw_flow_key *key, bool reset_key,
45 struct sw_flow_mask *mask);
45 46
46int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, 47int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
47 int attr, bool is_mask, struct sk_buff *); 48 int attr, bool is_mask, struct sk_buff *);
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index d073fff82fdb..ea7a8073fa02 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -32,6 +32,7 @@
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/in.h> 33#include <linux/in.h>
34#include <linux/rcupdate.h> 34#include <linux/rcupdate.h>
35#include <linux/cpumask.h>
35#include <linux/if_arp.h> 36#include <linux/if_arp.h>
36#include <linux/ip.h> 37#include <linux/ip.h>
37#include <linux/ipv6.h> 38#include <linux/ipv6.h>
@@ -79,17 +80,12 @@ struct sw_flow *ovs_flow_alloc(void)
79{ 80{
80 struct sw_flow *flow; 81 struct sw_flow *flow;
81 struct flow_stats *stats; 82 struct flow_stats *stats;
82 int node;
83 83
84 flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); 84 flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL);
85 if (!flow) 85 if (!flow)
86 return ERR_PTR(-ENOMEM); 86 return ERR_PTR(-ENOMEM);
87 87
88 flow->sf_acts = NULL; 88 flow->stats_last_writer = -1;
89 flow->mask = NULL;
90 flow->id.unmasked_key = NULL;
91 flow->id.ufid_len = 0;
92 flow->stats_last_writer = NUMA_NO_NODE;
93 89
94 /* Initialize the default stat node. */ 90 /* Initialize the default stat node. */
95 stats = kmem_cache_alloc_node(flow_stats_cache, 91 stats = kmem_cache_alloc_node(flow_stats_cache,
@@ -102,10 +98,6 @@ struct sw_flow *ovs_flow_alloc(void)
102 98
103 RCU_INIT_POINTER(flow->stats[0], stats); 99 RCU_INIT_POINTER(flow->stats[0], stats);
104 100
105 for_each_node(node)
106 if (node != 0)
107 RCU_INIT_POINTER(flow->stats[node], NULL);
108
109 return flow; 101 return flow;
110err: 102err:
111 kmem_cache_free(flow_cache, flow); 103 kmem_cache_free(flow_cache, flow);
@@ -142,16 +134,17 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
142 134
143static void flow_free(struct sw_flow *flow) 135static void flow_free(struct sw_flow *flow)
144{ 136{
145 int node; 137 int cpu;
146 138
147 if (ovs_identifier_is_key(&flow->id)) 139 if (ovs_identifier_is_key(&flow->id))
148 kfree(flow->id.unmasked_key); 140 kfree(flow->id.unmasked_key);
149 if (flow->sf_acts) 141 if (flow->sf_acts)
150 ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); 142 ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
151 for_each_node(node) 143 /* We open code this to make sure cpu 0 is always considered */
152 if (flow->stats[node]) 144 for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask))
145 if (flow->stats[cpu])
153 kmem_cache_free(flow_stats_cache, 146 kmem_cache_free(flow_stats_cache,
154 (struct flow_stats __force *)flow->stats[node]); 147 (struct flow_stats __force *)flow->stats[cpu]);
155 kmem_cache_free(flow_cache, flow); 148 kmem_cache_free(flow_cache, flow);
156} 149}
157 150
@@ -756,7 +749,7 @@ int ovs_flow_init(void)
756 BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); 749 BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
757 750
758 flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) 751 flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
759 + (nr_node_ids 752 + (nr_cpu_ids
760 * sizeof(struct flow_stats *)), 753 * sizeof(struct flow_stats *)),
761 0, 0, NULL); 754 0, 0, NULL);
762 if (flow_cache == NULL) 755 if (flow_cache == NULL)
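
flow_table.c now sizes the sw_flow cache by nr_cpu_ids instead of nr_node_ids and switches to kmem_cache_zalloc(), so only stats_last_writer (now -1) and the preallocated stats[0] need explicit initialisation. Below is a userspace sketch of allocating a structure that ends in a runtime-sized array of per-CPU stat pointers; struct flow, struct stats and NR_CPUS are invented for the example.

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4                       /* stands in for nr_cpu_ids */

struct stats { unsigned long packets, bytes; };

struct flow {
        int stats_last_writer;          /* -1 until someone writes */
        struct stats *stats[];          /* one slot per possible CPU */
};

int main(void)
{
        size_t sz = sizeof(struct flow) + NR_CPUS * sizeof(struct stats *);
        struct flow *f = calloc(1, sz); /* zeroed, like kmem_cache_zalloc() */

        if (!f)
                return 1;
        f->stats_last_writer = -1;
        f->stats[0] = calloc(1, sizeof(struct stats)); /* preallocated slot */

        printf("flow object is %zu bytes for %d CPUs\n", sz, NR_CPUS);
        free(f->stats[0]);
        free(f);
        return 0;
}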
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 95c36147a6e1..e7da29021b38 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -176,7 +176,7 @@ static void do_setup(struct net_device *netdev)
176 176
177 netdev->vlan_features = netdev->features; 177 netdev->vlan_features = netdev->features;
178 netdev->hw_enc_features = netdev->features; 178 netdev->hw_enc_features = netdev->features;
179 netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; 179 netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
180 netdev->hw_features = netdev->features & ~NETIF_F_LLTX; 180 netdev->hw_features = netdev->features & ~NETIF_F_LLTX;
181 181
182 eth_hw_addr_random(netdev); 182 eth_hw_addr_random(netdev);
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 6b21fd068d87..7387418ac514 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -485,9 +485,15 @@ static unsigned int packet_length(const struct sk_buff *skb)
485{ 485{
486 unsigned int length = skb->len - ETH_HLEN; 486 unsigned int length = skb->len - ETH_HLEN;
487 487
488 if (skb->protocol == htons(ETH_P_8021Q)) 488 if (!skb_vlan_tag_present(skb) &&
489 eth_type_vlan(skb->protocol))
489 length -= VLAN_HLEN; 490 length -= VLAN_HLEN;
490 491
492 /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow
493 * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none
494 * account for 802.1ad. e.g. is_skb_forwardable().
495 */
496
491 return length; 497 return length;
492} 498}
493 499
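
packet_length() in vport.c now subtracts VLAN_HLEN only when a single tag is still in-line (not hardware-accelerated), and deliberately does not subtract for additional 802.1ad tags, matching what most drivers accept on top of the MTU. A trivial userspace sketch of that length calculation; payload_len() is invented for the example.

#include <stdio.h>

#define ETH_HLEN  14
#define VLAN_HLEN  4

static unsigned int payload_len(unsigned int frame_len, int inline_vlan_tag)
{
        unsigned int len = frame_len - ETH_HLEN;

        if (inline_vlan_tag)
                len -= VLAN_HLEN;       /* one tag only, as in packet_length() */
        return len;
}

int main(void)
{
        printf("%u\n", payload_len(1518, 1));   /* tagged frame   -> 1500 */
        printf("%u\n", payload_len(1514, 0));   /* untagged frame -> 1500 */
        return 0;
}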
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 33a4697d5539..11db0d619c00 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3952,6 +3952,7 @@ static int packet_notifier(struct notifier_block *this,
3952 } 3952 }
3953 if (msg == NETDEV_UNREGISTER) { 3953 if (msg == NETDEV_UNREGISTER) {
3954 packet_cached_dev_reset(po); 3954 packet_cached_dev_reset(po);
3955 fanout_release(sk);
3955 po->ifindex = -1; 3956 po->ifindex = -1;
3956 if (po->prot_hook.dev) 3957 if (po->prot_hook.dev)
3957 dev_put(po->prot_hook.dev); 3958 dev_put(po->prot_hook.dev);
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 7eaf887e46f8..5680d90b0b77 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -160,7 +160,7 @@ static void rds_ib_add_one(struct ib_device *device)
160 rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; 160 rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
161 161
162 rds_ibdev->dev = device; 162 rds_ibdev->dev = device;
163 rds_ibdev->pd = ib_alloc_pd(device); 163 rds_ibdev->pd = ib_alloc_pd(device, 0);
164 if (IS_ERR(rds_ibdev->pd)) { 164 if (IS_ERR(rds_ibdev->pd)) {
165 rds_ibdev->pd = NULL; 165 rds_ibdev->pd = NULL;
166 goto put_dev; 166 goto put_dev;
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 046f7508c06b..45ac8e8e58f4 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -333,6 +333,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp);
333void rds_ib_state_change(struct sock *sk); 333void rds_ib_state_change(struct sock *sk);
334int rds_ib_listen_init(void); 334int rds_ib_listen_init(void);
335void rds_ib_listen_stop(void); 335void rds_ib_listen_stop(void);
336__printf(2, 3)
336void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); 337void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
337int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 338int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
338 struct rdma_cm_event *event); 339 struct rdma_cm_event *event);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index b2d17f0fafa8..fd0bccb2f9f9 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -688,6 +688,7 @@ void __rds_conn_error(struct rds_connection *conn, const char *, ...);
688#define rds_conn_error(conn, fmt...) \ 688#define rds_conn_error(conn, fmt...) \
689 __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) 689 __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
690 690
691__printf(2, 3)
691void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...); 692void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
692#define rds_conn_path_error(cp, fmt...) \ 693#define rds_conn_path_error(cp, fmt...) \
693 __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt) 694 __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt)
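
The __printf(2, 3) annotations added to rds/ib.h and rds/rds.h let the compiler check the format string of the connection-error helpers against their variadic arguments. A self-contained sketch of the same idea follows, spelling out the underlying GCC attribute since the kernel's __printf macro is not available in userspace; struct conn and conn_error() are invented here.

#include <stdarg.h>
#include <stdio.h>

#define __printf(a, b) __attribute__((format(printf, a, b)))

struct conn { int id; };

__printf(2, 3)
static void conn_error(struct conn *c, const char *fmt, ...)
{
        va_list ap;

        fprintf(stderr, "conn %d: ", c->id);
        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
}

int main(void)
{
        struct conn c = { .id = 7 };

        conn_error(&c, "status %d\n", 42);
        /* conn_error(&c, "status %d\n", "oops");  <- now caught at compile time */
        return 0;
}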
diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig
index 784c53163b7b..86f8853a038c 100644
--- a/net/rxrpc/Kconfig
+++ b/net/rxrpc/Kconfig
@@ -19,6 +19,20 @@ config AF_RXRPC
19 19
20 See Documentation/networking/rxrpc.txt. 20 See Documentation/networking/rxrpc.txt.
21 21
22config AF_RXRPC_IPV6
23 bool "IPv6 support for RxRPC"
24 depends on (IPV6 = m && AF_RXRPC = m) || (IPV6 = y && AF_RXRPC)
25 help
26 Say Y here to allow AF_RXRPC to use IPV6 UDP as well as IPV4 UDP as
27 its network transport.
28
29config AF_RXRPC_INJECT_LOSS
30 bool "Inject packet loss into RxRPC packet stream"
31 depends on AF_RXRPC
32 help
33 Say Y here to inject packet loss by discarding some received and some
34 transmitted packets.
35
22 36
23config AF_RXRPC_DEBUG 37config AF_RXRPC_DEBUG
24 bool "RxRPC dynamic debugging" 38 bool "RxRPC dynamic debugging"
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index 10f3f48a16a8..8fc6ea347182 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -22,6 +22,7 @@ af-rxrpc-y := \
22 peer_object.o \ 22 peer_object.o \
23 recvmsg.o \ 23 recvmsg.o \
24 security.o \ 24 security.o \
25 sendmsg.o \
25 skbuff.o \ 26 skbuff.o \
26 utils.o 27 utils.o
27 28
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 88effadd4b16..2d59c9be40e1 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -16,12 +16,14 @@
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/skbuff.h> 18#include <linux/skbuff.h>
19#include <linux/random.h>
19#include <linux/poll.h> 20#include <linux/poll.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/key-type.h> 22#include <linux/key-type.h>
22#include <net/net_namespace.h> 23#include <net/net_namespace.h>
23#include <net/sock.h> 24#include <net/sock.h>
24#include <net/af_rxrpc.h> 25#include <net/af_rxrpc.h>
26#define CREATE_TRACE_POINTS
25#include "ar-internal.h" 27#include "ar-internal.h"
26 28
27MODULE_DESCRIPTION("RxRPC network protocol"); 29MODULE_DESCRIPTION("RxRPC network protocol");
@@ -43,7 +45,7 @@ u32 rxrpc_epoch;
43atomic_t rxrpc_debug_id; 45atomic_t rxrpc_debug_id;
44 46
45/* count of skbs currently in use */ 47/* count of skbs currently in use */
46atomic_t rxrpc_n_skbs; 48atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
47 49
48struct workqueue_struct *rxrpc_workqueue; 50struct workqueue_struct *rxrpc_workqueue;
49 51
@@ -104,19 +106,25 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
104 case AF_INET: 106 case AF_INET:
105 if (srx->transport_len < sizeof(struct sockaddr_in)) 107 if (srx->transport_len < sizeof(struct sockaddr_in))
106 return -EINVAL; 108 return -EINVAL;
107 _debug("INET: %x @ %pI4",
108 ntohs(srx->transport.sin.sin_port),
109 &srx->transport.sin.sin_addr);
110 tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad); 109 tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad);
111 break; 110 break;
112 111
112#ifdef CONFIG_AF_RXRPC_IPV6
113 case AF_INET6: 113 case AF_INET6:
114 if (srx->transport_len < sizeof(struct sockaddr_in6))
115 return -EINVAL;
116 tail = offsetof(struct sockaddr_rxrpc, transport) +
117 sizeof(struct sockaddr_in6);
118 break;
119#endif
120
114 default: 121 default:
115 return -EAFNOSUPPORT; 122 return -EAFNOSUPPORT;
116 } 123 }
117 124
118 if (tail < len) 125 if (tail < len)
119 memset((void *)srx + tail, 0, len - tail); 126 memset((void *)srx + tail, 0, len - tail);
127 _debug("INET: %pISp", &srx->transport);
120 return 0; 128 return 0;
121} 129}
122 130
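
The hunk above extends rxrpc_validate_address() to accept AF_INET6 as well: it computes a per-family 'tail' offset and zeroes everything past it, so the unused part of the transport union is well defined before it is compared or copied. A userspace sketch of that tail-zeroing follows, with struct srx and validate() as simplified stand-ins for struct sockaddr_rxrpc and the kernel function.

#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <netinet/in.h>

struct srx {                            /* simplified sockaddr_rxrpc stand-in */
        unsigned short  service;
        union {
                struct sockaddr_in  sin;
                struct sockaddr_in6 sin6;
        } transport;
};

static int validate(struct srx *srx, int family, size_t len)
{
        size_t tail;

        switch (family) {
        case AF_INET:
                tail = offsetof(struct srx, transport) + sizeof(struct sockaddr_in);
                break;
        case AF_INET6:
                tail = offsetof(struct srx, transport) + sizeof(struct sockaddr_in6);
                break;
        default:
                return -1;
        }
        if (tail < len)
                memset((char *)srx + tail, 0, len - tail);
        return 0;
}

int main(void)
{
        struct srx s;

        memset(&s, 0xaa, sizeof(s));    /* simulate an uninitialised tail */
        s.transport.sin.sin_family = AF_INET;
        validate(&s, AF_INET, sizeof(s));
        printf("last byte is now %#x\n", ((unsigned char *)&s)[sizeof(s) - 1]);
        return 0;
}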
@@ -128,7 +136,8 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
128 struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; 136 struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr;
129 struct sock *sk = sock->sk; 137 struct sock *sk = sock->sk;
130 struct rxrpc_local *local; 138 struct rxrpc_local *local;
131 struct rxrpc_sock *rx = rxrpc_sk(sk), *prx; 139 struct rxrpc_sock *rx = rxrpc_sk(sk);
140 u16 service_id = srx->srx_service;
132 int ret; 141 int ret;
133 142
134 _enter("%p,%p,%d", rx, saddr, len); 143 _enter("%p,%p,%d", rx, saddr, len);
@@ -152,16 +161,13 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
152 goto error_unlock; 161 goto error_unlock;
153 } 162 }
154 163
155 if (rx->srx.srx_service) { 164 if (service_id) {
156 write_lock_bh(&local->services_lock); 165 write_lock(&local->services_lock);
157 list_for_each_entry(prx, &local->services, listen_link) { 166 if (rcu_access_pointer(local->service))
158 if (prx->srx.srx_service == rx->srx.srx_service) 167 goto service_in_use;
159 goto service_in_use;
160 }
161
162 rx->local = local; 168 rx->local = local;
163 list_add_tail(&rx->listen_link, &local->services); 169 rcu_assign_pointer(local->service, rx);
164 write_unlock_bh(&local->services_lock); 170 write_unlock(&local->services_lock);
165 171
166 rx->sk.sk_state = RXRPC_SERVER_BOUND; 172 rx->sk.sk_state = RXRPC_SERVER_BOUND;
167 } else { 173 } else {
@@ -174,7 +180,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
174 return 0; 180 return 0;
175 181
176service_in_use: 182service_in_use:
177 write_unlock_bh(&local->services_lock); 183 write_unlock(&local->services_lock);
178 rxrpc_put_local(local); 184 rxrpc_put_local(local);
179 ret = -EADDRINUSE; 185 ret = -EADDRINUSE;
180error_unlock: 186error_unlock:
@@ -191,7 +197,7 @@ static int rxrpc_listen(struct socket *sock, int backlog)
191{ 197{
192 struct sock *sk = sock->sk; 198 struct sock *sk = sock->sk;
193 struct rxrpc_sock *rx = rxrpc_sk(sk); 199 struct rxrpc_sock *rx = rxrpc_sk(sk);
194 unsigned int max; 200 unsigned int max, old;
195 int ret; 201 int ret;
196 202
197 _enter("%p,%d", rx, backlog); 203 _enter("%p,%d", rx, backlog);
@@ -210,9 +216,13 @@ static int rxrpc_listen(struct socket *sock, int backlog)
210 backlog = max; 216 backlog = max;
211 else if (backlog < 0 || backlog > max) 217 else if (backlog < 0 || backlog > max)
212 break; 218 break;
219 old = sk->sk_max_ack_backlog;
213 sk->sk_max_ack_backlog = backlog; 220 sk->sk_max_ack_backlog = backlog;
214 rx->sk.sk_state = RXRPC_SERVER_LISTENING; 221 ret = rxrpc_service_prealloc(rx, GFP_KERNEL);
215 ret = 0; 222 if (ret == 0)
223 rx->sk.sk_state = RXRPC_SERVER_LISTENING;
224 else
225 sk->sk_max_ack_backlog = old;
216 break; 226 break;
217 default: 227 default:
218 ret = -EBUSY; 228 ret = -EBUSY;
@@ -230,6 +240,8 @@ static int rxrpc_listen(struct socket *sock, int backlog)
230 * @srx: The address of the peer to contact 240 * @srx: The address of the peer to contact
231 * @key: The security context to use (defaults to socket setting) 241 * @key: The security context to use (defaults to socket setting)
232 * @user_call_ID: The ID to use 242 * @user_call_ID: The ID to use
243 * @gfp: The allocation constraints
244 * @notify_rx: Where to send notifications instead of socket queue
233 * 245 *
234 * Allow a kernel service to begin a call on the nominated socket. This just 246 * Allow a kernel service to begin a call on the nominated socket. This just
235 * sets up all the internal tracking structures and allocates connection and 247 * sets up all the internal tracking structures and allocates connection and
@@ -242,7 +254,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
242 struct sockaddr_rxrpc *srx, 254 struct sockaddr_rxrpc *srx,
243 struct key *key, 255 struct key *key,
244 unsigned long user_call_ID, 256 unsigned long user_call_ID,
245 gfp_t gfp) 257 gfp_t gfp,
258 rxrpc_notify_rx_t notify_rx)
246{ 259{
247 struct rxrpc_conn_parameters cp; 260 struct rxrpc_conn_parameters cp;
248 struct rxrpc_call *call; 261 struct rxrpc_call *call;
@@ -269,6 +282,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
269 cp.exclusive = false; 282 cp.exclusive = false;
270 cp.service_id = srx->srx_service; 283 cp.service_id = srx->srx_service;
271 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp); 284 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp);
285 if (!IS_ERR(call))
286 call->notify_rx = notify_rx;
272 287
273 release_sock(&rx->sk); 288 release_sock(&rx->sk);
274 _leave(" = %p", call); 289 _leave(" = %p", call);
@@ -278,40 +293,39 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call);
278 293
279/** 294/**
280 * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using 295 * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using
296 * @sock: The socket the call is on
281 * @call: The call to end 297 * @call: The call to end
282 * 298 *
283 * Allow a kernel service to end a call it was using. The call must be 299 * Allow a kernel service to end a call it was using. The call must be
284 * complete before this is called (the call should be aborted if necessary). 300 * complete before this is called (the call should be aborted if necessary).
285 */ 301 */
286void rxrpc_kernel_end_call(struct rxrpc_call *call) 302void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call)
287{ 303{
288 _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); 304 _enter("%d{%d}", call->debug_id, atomic_read(&call->usage));
289 rxrpc_remove_user_ID(call->socket, call); 305 rxrpc_release_call(rxrpc_sk(sock->sk), call);
290 rxrpc_put_call(call); 306 rxrpc_put_call(call, rxrpc_call_put_kernel);
291} 307}
292EXPORT_SYMBOL(rxrpc_kernel_end_call); 308EXPORT_SYMBOL(rxrpc_kernel_end_call);
293 309
294/** 310/**
295 * rxrpc_kernel_intercept_rx_messages - Intercept received RxRPC messages 311 * rxrpc_kernel_new_call_notification - Get notifications of new calls
296 * @sock: The socket to intercept received messages on 312 * @sock: The socket to intercept received messages on
297 * @interceptor: The function to pass the messages to 313 * @notify_new_call: Function to be called when new calls appear
314 * @discard_new_call: Function to discard preallocated calls
298 * 315 *
299 * Allow a kernel service to intercept messages heading for the Rx queue on an 316 * Allow a kernel service to be given notifications about new calls.
300 * RxRPC socket. They get passed to the specified function instead.
301 * @interceptor should free the socket buffers it is given. @interceptor is
302 * called with the socket receive queue spinlock held and softirqs disabled -
303 * this ensures that the messages will be delivered in the right order.
304 */ 317 */
305void rxrpc_kernel_intercept_rx_messages(struct socket *sock, 318void rxrpc_kernel_new_call_notification(
306 rxrpc_interceptor_t interceptor) 319 struct socket *sock,
320 rxrpc_notify_new_call_t notify_new_call,
321 rxrpc_discard_new_call_t discard_new_call)
307{ 322{
308 struct rxrpc_sock *rx = rxrpc_sk(sock->sk); 323 struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
309 324
310 _enter(""); 325 rx->notify_new_call = notify_new_call;
311 rx->interceptor = interceptor; 326 rx->discard_new_call = discard_new_call;
312} 327}
313 328EXPORT_SYMBOL(rxrpc_kernel_new_call_notification);
314EXPORT_SYMBOL(rxrpc_kernel_intercept_rx_messages);
315 329
316/* 330/*
317 * connect an RxRPC socket 331 * connect an RxRPC socket
@@ -391,6 +405,23 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
391 405
392 switch (rx->sk.sk_state) { 406 switch (rx->sk.sk_state) {
393 case RXRPC_UNBOUND: 407 case RXRPC_UNBOUND:
408 rx->srx.srx_family = AF_RXRPC;
409 rx->srx.srx_service = 0;
410 rx->srx.transport_type = SOCK_DGRAM;
411 rx->srx.transport.family = rx->family;
412 switch (rx->family) {
413 case AF_INET:
414 rx->srx.transport_len = sizeof(struct sockaddr_in);
415 break;
416#ifdef CONFIG_AF_RXRPC_IPV6
417 case AF_INET6:
418 rx->srx.transport_len = sizeof(struct sockaddr_in6);
419 break;
420#endif
421 default:
422 ret = -EAFNOSUPPORT;
423 goto error_unlock;
424 }
394 local = rxrpc_lookup_local(&rx->srx); 425 local = rxrpc_lookup_local(&rx->srx);
395 if (IS_ERR(local)) { 426 if (IS_ERR(local)) {
396 ret = PTR_ERR(local); 427 ret = PTR_ERR(local);
@@ -505,15 +536,16 @@ error:
505static unsigned int rxrpc_poll(struct file *file, struct socket *sock, 536static unsigned int rxrpc_poll(struct file *file, struct socket *sock,
506 poll_table *wait) 537 poll_table *wait)
507{ 538{
508 unsigned int mask;
509 struct sock *sk = sock->sk; 539 struct sock *sk = sock->sk;
540 struct rxrpc_sock *rx = rxrpc_sk(sk);
541 unsigned int mask;
510 542
511 sock_poll_wait(file, sk_sleep(sk), wait); 543 sock_poll_wait(file, sk_sleep(sk), wait);
512 mask = 0; 544 mask = 0;
513 545
514 /* the socket is readable if there are any messages waiting on the Rx 546 /* the socket is readable if there are any messages waiting on the Rx
515 * queue */ 547 * queue */
516 if (!skb_queue_empty(&sk->sk_receive_queue)) 548 if (!list_empty(&rx->recvmsg_q))
517 mask |= POLLIN | POLLRDNORM; 549 mask |= POLLIN | POLLRDNORM;
518 550
519 /* the socket is writable if there is space to add new data to the 551 /* the socket is writable if there is space to add new data to the
@@ -540,7 +572,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
540 return -EAFNOSUPPORT; 572 return -EAFNOSUPPORT;
541 573
542 /* we support transport protocol UDP/UDP6 only */ 574 /* we support transport protocol UDP/UDP6 only */
543 if (protocol != PF_INET) 575 if (protocol != PF_INET &&
576 IS_ENABLED(CONFIG_AF_RXRPC_IPV6) && protocol != PF_INET6)
544 return -EPROTONOSUPPORT; 577 return -EPROTONOSUPPORT;
545 578
546 if (sock->type != SOCK_DGRAM) 579 if (sock->type != SOCK_DGRAM)
@@ -554,6 +587,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
554 return -ENOMEM; 587 return -ENOMEM;
555 588
556 sock_init_data(sock, sk); 589 sock_init_data(sock, sk);
590 sock_set_flag(sk, SOCK_RCU_FREE);
557 sk->sk_state = RXRPC_UNBOUND; 591 sk->sk_state = RXRPC_UNBOUND;
558 sk->sk_write_space = rxrpc_write_space; 592 sk->sk_write_space = rxrpc_write_space;
559 sk->sk_max_ack_backlog = 0; 593 sk->sk_max_ack_backlog = 0;
@@ -563,9 +597,11 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
563 rx->family = protocol; 597 rx->family = protocol;
564 rx->calls = RB_ROOT; 598 rx->calls = RB_ROOT;
565 599
566 INIT_LIST_HEAD(&rx->listen_link); 600 spin_lock_init(&rx->incoming_lock);
567 INIT_LIST_HEAD(&rx->secureq); 601 INIT_LIST_HEAD(&rx->sock_calls);
568 INIT_LIST_HEAD(&rx->acceptq); 602 INIT_LIST_HEAD(&rx->to_be_accepted);
603 INIT_LIST_HEAD(&rx->recvmsg_q);
604 rwlock_init(&rx->recvmsg_lock);
569 rwlock_init(&rx->call_lock); 605 rwlock_init(&rx->call_lock);
570 memset(&rx->srx, 0, sizeof(rx->srx)); 606 memset(&rx->srx, 0, sizeof(rx->srx));
571 607
@@ -574,6 +610,39 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
574} 610}
575 611
576/* 612/*
613 * Kill all the calls on a socket and shut it down.
614 */
615static int rxrpc_shutdown(struct socket *sock, int flags)
616{
617 struct sock *sk = sock->sk;
618 struct rxrpc_sock *rx = rxrpc_sk(sk);
619 int ret = 0;
620
621 _enter("%p,%d", sk, flags);
622
623 if (flags != SHUT_RDWR)
624 return -EOPNOTSUPP;
625 if (sk->sk_state == RXRPC_CLOSE)
626 return -ESHUTDOWN;
627
628 lock_sock(sk);
629
630 spin_lock_bh(&sk->sk_receive_queue.lock);
631 if (sk->sk_state < RXRPC_CLOSE) {
632 sk->sk_state = RXRPC_CLOSE;
633 sk->sk_shutdown = SHUTDOWN_MASK;
634 } else {
635 ret = -ESHUTDOWN;
636 }
637 spin_unlock_bh(&sk->sk_receive_queue.lock);
638
639 rxrpc_discard_prealloc(rx);
640
641 release_sock(sk);
642 return ret;
643}
644
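A small userspace sketch of the semantics rxrpc_shutdown gives once it is wired into the ops table further down: only a full SHUT_RDWR is honoured, and shutting down an already-closed socket reports ESHUTDOWN.

/* Userspace view (sketch): AF_RXRPC only accepts a full shutdown. */
#include <sys/socket.h>
#include <errno.h>
#include <stdio.h>

static void my_shut_down_rxrpc(int fd)
{
	if (shutdown(fd, SHUT_RD) == -1 && errno == EOPNOTSUPP)
		fprintf(stderr, "partial shutdown is not supported\n");

	if (shutdown(fd, SHUT_RDWR) == -1 && errno == ESHUTDOWN)
		fprintf(stderr, "socket was already closed\n");
}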
645/*
577 * RxRPC socket destructor 646 * RxRPC socket destructor
578 */ 647 */
579static void rxrpc_sock_destructor(struct sock *sk) 648static void rxrpc_sock_destructor(struct sock *sk)
@@ -609,15 +678,14 @@ static int rxrpc_release_sock(struct sock *sk)
609 sk->sk_state = RXRPC_CLOSE; 678 sk->sk_state = RXRPC_CLOSE;
610 spin_unlock_bh(&sk->sk_receive_queue.lock); 679 spin_unlock_bh(&sk->sk_receive_queue.lock);
611 680
612 ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1); 681 if (rx->local && rcu_access_pointer(rx->local->service) == rx) {
613 682 write_lock(&rx->local->services_lock);
614 if (!list_empty(&rx->listen_link)) { 683 rcu_assign_pointer(rx->local->service, NULL);
615 write_lock_bh(&rx->local->services_lock); 684 write_unlock(&rx->local->services_lock);
616 list_del(&rx->listen_link);
617 write_unlock_bh(&rx->local->services_lock);
618 } 685 }
619 686
620 /* try to flush out this socket */ 687 /* try to flush out this socket */
688 rxrpc_discard_prealloc(rx);
621 rxrpc_release_calls_on_socket(rx); 689 rxrpc_release_calls_on_socket(rx);
622 flush_workqueue(rxrpc_workqueue); 690 flush_workqueue(rxrpc_workqueue);
623 rxrpc_purge_queue(&sk->sk_receive_queue); 691 rxrpc_purge_queue(&sk->sk_receive_queue);
@@ -666,7 +734,7 @@ static const struct proto_ops rxrpc_rpc_ops = {
666 .poll = rxrpc_poll, 734 .poll = rxrpc_poll,
667 .ioctl = sock_no_ioctl, 735 .ioctl = sock_no_ioctl,
668 .listen = rxrpc_listen, 736 .listen = rxrpc_listen,
669 .shutdown = sock_no_shutdown, 737 .shutdown = rxrpc_shutdown,
670 .setsockopt = rxrpc_setsockopt, 738 .setsockopt = rxrpc_setsockopt,
671 .getsockopt = sock_no_getsockopt, 739 .getsockopt = sock_no_getsockopt,
672 .sendmsg = rxrpc_sendmsg, 740 .sendmsg = rxrpc_sendmsg,
@@ -697,7 +765,13 @@ static int __init af_rxrpc_init(void)
697 765
698 BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); 766 BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb));
699 767
700 rxrpc_epoch = get_seconds(); 768 get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch));
769 rxrpc_epoch |= RXRPC_RANDOM_EPOCH;
770 get_random_bytes(&rxrpc_client_conn_ids.cur,
771 sizeof(rxrpc_client_conn_ids.cur));
772 rxrpc_client_conn_ids.cur &= 0x3fffffff;
773 if (rxrpc_client_conn_ids.cur == 0)
774 rxrpc_client_conn_ids.cur = 1;
701 775
702 ret = -ENOMEM; 776 ret = -ENOMEM;
703 rxrpc_call_jar = kmem_cache_create( 777 rxrpc_call_jar = kmem_cache_create(
@@ -788,7 +862,8 @@ static void __exit af_rxrpc_exit(void)
788 proto_unregister(&rxrpc_proto); 862 proto_unregister(&rxrpc_proto);
789 rxrpc_destroy_all_calls(); 863 rxrpc_destroy_all_calls();
790 rxrpc_destroy_all_connections(); 864 rxrpc_destroy_all_connections();
791 ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); 865 ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0);
866 ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0);
792 rxrpc_destroy_all_locals(); 867 rxrpc_destroy_all_locals();
793 868
794 remove_proc_entry("rxrpc_conns", init_net.proc_net); 869 remove_proc_entry("rxrpc_conns", init_net.proc_net);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index ff83fb1ddd47..f60e35576526 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -35,11 +35,23 @@ struct rxrpc_crypt {
35#define rxrpc_queue_delayed_work(WS,D) \ 35#define rxrpc_queue_delayed_work(WS,D) \
36 queue_delayed_work(rxrpc_workqueue, (WS), (D)) 36 queue_delayed_work(rxrpc_workqueue, (WS), (D))
37 37
38#define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor)
39
40struct rxrpc_connection; 38struct rxrpc_connection;
41 39
42/* 40/*
41 * Mark applied to socket buffers.
42 */
43enum rxrpc_skb_mark {
44 RXRPC_SKB_MARK_DATA, /* data message */
45 RXRPC_SKB_MARK_FINAL_ACK, /* final ACK received message */
46 RXRPC_SKB_MARK_BUSY, /* server busy message */
47 RXRPC_SKB_MARK_REMOTE_ABORT, /* remote abort message */
48 RXRPC_SKB_MARK_LOCAL_ABORT, /* local abort message */
49 RXRPC_SKB_MARK_NET_ERROR, /* network error message */
50 RXRPC_SKB_MARK_LOCAL_ERROR, /* local error message */
 51	RXRPC_SKB_MARK_NEW_CALL,	/* new call notification message */

52};
53
54/*
43 * sk_state for RxRPC sockets 55 * sk_state for RxRPC sockets
44 */ 56 */
45enum { 57enum {
@@ -52,19 +64,44 @@ enum {
52}; 64};
53 65
54/* 66/*
67 * Service backlog preallocation.
68 *
69 * This contains circular buffers of preallocated peers, connections and calls
70 * for incoming service calls and their head and tail pointers. This allows
71 * calls to be set up in the data_ready handler, thereby avoiding the need to
72 * shuffle packets around so much.
73 */
74struct rxrpc_backlog {
75 unsigned short peer_backlog_head;
76 unsigned short peer_backlog_tail;
77 unsigned short conn_backlog_head;
78 unsigned short conn_backlog_tail;
79 unsigned short call_backlog_head;
80 unsigned short call_backlog_tail;
81#define RXRPC_BACKLOG_MAX 32
82 struct rxrpc_peer *peer_backlog[RXRPC_BACKLOG_MAX];
83 struct rxrpc_connection *conn_backlog[RXRPC_BACKLOG_MAX];
84 struct rxrpc_call *call_backlog[RXRPC_BACKLOG_MAX];
85};
86
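The head/tail pairs above are ordinary power-of-two ring cursors; a rough sketch, assuming the circ_buf helpers (call_accept.c gains <linux/circ_buf.h> later in this patch), of how ring occupancy could be read - the my_ helpers are illustrative, not part of the patch:

#include <linux/circ_buf.h>

/* Number of preallocated calls currently sitting in the ring. */
static unsigned int my_call_backlog_count(const struct rxrpc_backlog *b)
{
	return CIRC_CNT(b->call_backlog_head, b->call_backlog_tail,
			RXRPC_BACKLOG_MAX);
}

/* True if another call can be preallocated without overrunning the ring. */
static bool my_call_backlog_has_space(const struct rxrpc_backlog *b)
{
	return CIRC_SPACE(b->call_backlog_head, b->call_backlog_tail,
			  RXRPC_BACKLOG_MAX) > 0;
}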
87/*
55 * RxRPC socket definition 88 * RxRPC socket definition
56 */ 89 */
57struct rxrpc_sock { 90struct rxrpc_sock {
58 /* WARNING: sk has to be the first member */ 91 /* WARNING: sk has to be the first member */
59 struct sock sk; 92 struct sock sk;
60 rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */ 93 rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */
94 rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */
61 struct rxrpc_local *local; /* local endpoint */ 95 struct rxrpc_local *local; /* local endpoint */
62 struct list_head listen_link; /* link in the local endpoint's listen list */ 96 struct rxrpc_backlog *backlog; /* Preallocation for services */
63 struct list_head secureq; /* calls awaiting connection security clearance */ 97 spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */
64 struct list_head acceptq; /* calls awaiting acceptance */ 98 struct list_head sock_calls; /* List of calls owned by this socket */
99 struct list_head to_be_accepted; /* calls awaiting acceptance */
100 struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */
101 rwlock_t recvmsg_lock; /* Lock for recvmsg_q */
65 struct key *key; /* security for this socket */ 102 struct key *key; /* security for this socket */
66 struct key *securities; /* list of server security descriptors */ 103 struct key *securities; /* list of server security descriptors */
67 struct rb_root calls; /* outstanding calls on this socket */ 104 struct rb_root calls; /* User ID -> call mapping */
68 unsigned long flags; 105 unsigned long flags;
69#define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ 106#define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */
70 rwlock_t call_lock; /* lock for calls */ 107 rwlock_t call_lock; /* lock for calls */
@@ -103,13 +140,11 @@ struct rxrpc_host_header {
103 * - max 48 bytes (struct sk_buff::cb) 140 * - max 48 bytes (struct sk_buff::cb)
104 */ 141 */
105struct rxrpc_skb_priv { 142struct rxrpc_skb_priv {
106 struct rxrpc_call *call; /* call with which associated */
107 unsigned long resend_at; /* time in jiffies at which to resend */
108 union { 143 union {
109 unsigned int offset; /* offset into buffer of next read */ 144 u8 nr_jumbo; /* Number of jumbo subpackets */
145 };
146 union {
110 int remain; /* amount of space remaining for next write */ 147 int remain; /* amount of space remaining for next write */
111 u32 error; /* network error code */
112 bool need_resend; /* T if needs resending */
113 }; 148 };
114 149
115 struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ 150 struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
@@ -117,13 +152,6 @@ struct rxrpc_skb_priv {
117 152
118#define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb) 153#define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb)
119 154
120enum rxrpc_command {
121 RXRPC_CMD_SEND_DATA, /* send data message */
122 RXRPC_CMD_SEND_ABORT, /* request abort generation */
123 RXRPC_CMD_ACCEPT, /* [server] accept incoming call */
124 RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */
125};
126
127/* 155/*
128 * RxRPC security module interface 156 * RxRPC security module interface
129 */ 157 */
@@ -150,7 +178,12 @@ struct rxrpc_security {
150 void *); 178 void *);
151 179
152 /* verify the security on a received packet */ 180 /* verify the security on a received packet */
153 int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, u32 *); 181 int (*verify_packet)(struct rxrpc_call *, struct sk_buff *,
182 unsigned int, unsigned int, rxrpc_seq_t, u16);
183
184 /* Locate the data in a received packet that has been verified. */
185 void (*locate_data)(struct rxrpc_call *, struct sk_buff *,
186 unsigned int *, unsigned int *);
154 187
155 /* issue a challenge */ 188 /* issue a challenge */
156 int (*issue_challenge)(struct rxrpc_connection *); 189 int (*issue_challenge)(struct rxrpc_connection *);
@@ -180,9 +213,8 @@ struct rxrpc_local {
180 struct list_head link; 213 struct list_head link;
181 struct socket *socket; /* my UDP socket */ 214 struct socket *socket; /* my UDP socket */
182 struct work_struct processor; 215 struct work_struct processor;
183 struct list_head services; /* services listening on this endpoint */ 216 struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */
184 struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ 217 struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */
185 struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */
186 struct sk_buff_head reject_queue; /* packets awaiting rejection */ 218 struct sk_buff_head reject_queue; /* packets awaiting rejection */
187 struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ 219 struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */
188 struct rb_root client_conns; /* Client connections by socket params */ 220 struct rb_root client_conns; /* Client connections by socket params */
@@ -220,10 +252,12 @@ struct rxrpc_peer {
220 252
221 /* calculated RTT cache */ 253 /* calculated RTT cache */
222#define RXRPC_RTT_CACHE_SIZE 32 254#define RXRPC_RTT_CACHE_SIZE 32
223 suseconds_t rtt; /* current RTT estimate (in uS) */ 255 ktime_t rtt_last_req; /* Time of last RTT request */
224 unsigned int rtt_point; /* next entry at which to insert */ 256 u64 rtt; /* Current RTT estimate (in nS) */
225 unsigned int rtt_usage; /* amount of cache actually used */ 257 u64 rtt_sum; /* Sum of cache contents */
226 suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */ 258 u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */
259 u8 rtt_cursor; /* next entry at which to insert */
260 u8 rtt_usage; /* amount of cache actually used */
227}; 261};
228 262
229/* 263/*
@@ -255,6 +289,9 @@ enum rxrpc_conn_flag {
255 RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */ 289 RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */
256 RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */ 290 RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */
257 RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */ 291 RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */
292 RXRPC_CONN_EXPOSED, /* Conn has extra ref for exposure */
293 RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */
294 RXRPC_CONN_COUNTED, /* Counted by rxrpc_nr_client_conns */
258}; 295};
259 296
260/* 297/*
@@ -265,17 +302,29 @@ enum rxrpc_conn_event {
265}; 302};
266 303
267/* 304/*
305 * The connection cache state.
306 */
307enum rxrpc_conn_cache_state {
308 RXRPC_CONN_CLIENT_INACTIVE, /* Conn is not yet listed */
309 RXRPC_CONN_CLIENT_WAITING, /* Conn is on wait list, waiting for capacity */
310 RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */
311 RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */
312 RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */
313 RXRPC_CONN__NR_CACHE_STATES
314};
315
316/*
268 * The connection protocol state. 317 * The connection protocol state.
269 */ 318 */
270enum rxrpc_conn_proto_state { 319enum rxrpc_conn_proto_state {
271 RXRPC_CONN_UNUSED, /* Connection not yet attempted */ 320 RXRPC_CONN_UNUSED, /* Connection not yet attempted */
272 RXRPC_CONN_CLIENT, /* Client connection */ 321 RXRPC_CONN_CLIENT, /* Client connection */
322 RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */
273 RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ 323 RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */
274 RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ 324 RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */
275 RXRPC_CONN_SERVICE, /* Service secured connection */ 325 RXRPC_CONN_SERVICE, /* Service secured connection */
276 RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */ 326 RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */
277 RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */ 327 RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */
278 RXRPC_CONN_NETWORK_ERROR, /* Conn terminated by network error */
279 RXRPC_CONN__NR_STATES 328 RXRPC_CONN__NR_STATES
280}; 329};
281 330
@@ -288,23 +337,33 @@ struct rxrpc_connection {
288 struct rxrpc_conn_proto proto; 337 struct rxrpc_conn_proto proto;
289 struct rxrpc_conn_parameters params; 338 struct rxrpc_conn_parameters params;
290 339
291 spinlock_t channel_lock; 340 atomic_t usage;
341 struct rcu_head rcu;
342 struct list_head cache_link;
292 343
344 spinlock_t channel_lock;
345 unsigned char active_chans; /* Mask of active channels */
346#define RXRPC_ACTIVE_CHANS_MASK ((1 << RXRPC_MAXCALLS) - 1)
347 struct list_head waiting_calls; /* Calls waiting for channels */
293 struct rxrpc_channel { 348 struct rxrpc_channel {
294 struct rxrpc_call __rcu *call; /* Active call */ 349 struct rxrpc_call __rcu *call; /* Active call */
295 u32 call_id; /* ID of current call */ 350 u32 call_id; /* ID of current call */
296 u32 call_counter; /* Call ID counter */ 351 u32 call_counter; /* Call ID counter */
297 u32 last_call; /* ID of last call */ 352 u32 last_call; /* ID of last call */
298 u32 last_result; /* Result of last call (0/abort) */ 353 u8 last_type; /* Type of last packet */
354 u16 last_service_id;
355 union {
356 u32 last_seq;
357 u32 last_abort;
358 };
299 } channels[RXRPC_MAXCALLS]; 359 } channels[RXRPC_MAXCALLS];
300 wait_queue_head_t channel_wq; /* queue to wait for channel to become available */
301 360
302 struct rcu_head rcu;
303 struct work_struct processor; /* connection event processor */ 361 struct work_struct processor; /* connection event processor */
304 union { 362 union {
305 struct rb_node client_node; /* Node in local->client_conns */ 363 struct rb_node client_node; /* Node in local->client_conns */
306 struct rb_node service_node; /* Node in peer->service_conns */ 364 struct rb_node service_node; /* Node in peer->service_conns */
307 }; 365 };
366 struct list_head proc_link; /* link in procfs list */
308 struct list_head link; /* link in master connection list */ 367 struct list_head link; /* link in master connection list */
309 struct sk_buff_head rx_queue; /* received conn-level packets */ 368 struct sk_buff_head rx_queue; /* received conn-level packets */
310 const struct rxrpc_security *security; /* applied security module */ 369 const struct rxrpc_security *security; /* applied security module */
@@ -313,21 +372,18 @@ struct rxrpc_connection {
313 struct rxrpc_crypt csum_iv; /* packet checksum base */ 372 struct rxrpc_crypt csum_iv; /* packet checksum base */
314 unsigned long flags; 373 unsigned long flags;
315 unsigned long events; 374 unsigned long events;
316 unsigned long put_time; /* Time at which last put */ 375 unsigned long idle_timestamp; /* Time at which last became idle */
317 spinlock_t state_lock; /* state-change lock */ 376 spinlock_t state_lock; /* state-change lock */
318 atomic_t usage; 377 enum rxrpc_conn_cache_state cache_state;
319 enum rxrpc_conn_proto_state state : 8; /* current state of connection */ 378 enum rxrpc_conn_proto_state state; /* current state of connection */
320 u32 local_abort; /* local abort code */ 379 u32 local_abort; /* local abort code */
321 u32 remote_abort; /* remote abort code */ 380 u32 remote_abort; /* remote abort code */
322 int error; /* local error incurred */
323 int debug_id; /* debug ID for printks */ 381 int debug_id; /* debug ID for printks */
324 atomic_t serial; /* packet serial number counter */ 382 atomic_t serial; /* packet serial number counter */
325 atomic_t hi_serial; /* highest serial number received */ 383 unsigned int hi_serial; /* highest serial number received */
326 atomic_t avail_chans; /* number of channels available */ 384 u32 security_nonce; /* response re-use preventer */
327 u8 size_align; /* data size alignment (for security) */ 385 u8 size_align; /* data size alignment (for security) */
328 u8 header_size; /* rxrpc + security header size */
329 u8 security_size; /* security header size */ 386 u8 security_size; /* security header size */
330 u32 security_nonce; /* response re-use preventer */
331 u8 security_ix; /* security type */ 387 u8 security_ix; /* security type */
332 u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ 388 u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
333}; 389};
@@ -337,37 +393,25 @@ struct rxrpc_connection {
337 */ 393 */
338enum rxrpc_call_flag { 394enum rxrpc_call_flag {
339 RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */ 395 RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */
340 RXRPC_CALL_TERMINAL_MSG, /* call has given the socket its final message */
341 RXRPC_CALL_RCVD_LAST, /* all packets received */
342 RXRPC_CALL_RUN_RTIMER, /* Tx resend timer started */
343 RXRPC_CALL_TX_SOFT_ACK, /* sent some soft ACKs */
344 RXRPC_CALL_PROC_BUSY, /* the processor is busy */
345 RXRPC_CALL_INIT_ACCEPT, /* acceptance was initiated */
346 RXRPC_CALL_HAS_USERID, /* has a user ID attached */ 396 RXRPC_CALL_HAS_USERID, /* has a user ID attached */
347 RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */ 397 RXRPC_CALL_IS_SERVICE, /* Call is service call */
398 RXRPC_CALL_EXPOSED, /* The call was exposed to the world */
399 RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */
400 RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */
401 RXRPC_CALL_SEND_PING, /* A ping will need to be sent */
402 RXRPC_CALL_PINGING, /* Ping in process */
403 RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
348}; 404};
349 405
350/* 406/*
351 * Events that can be raised on a call. 407 * Events that can be raised on a call.
352 */ 408 */
353enum rxrpc_call_event { 409enum rxrpc_call_event {
354 RXRPC_CALL_EV_RCVD_ACKALL, /* ACKALL or reply received */
355 RXRPC_CALL_EV_RCVD_BUSY, /* busy packet received */
356 RXRPC_CALL_EV_RCVD_ABORT, /* abort packet received */
357 RXRPC_CALL_EV_RCVD_ERROR, /* network error received */
358 RXRPC_CALL_EV_ACK_FINAL, /* need to generate final ACK (and release call) */
359 RXRPC_CALL_EV_ACK, /* need to generate ACK */ 410 RXRPC_CALL_EV_ACK, /* need to generate ACK */
360 RXRPC_CALL_EV_REJECT_BUSY, /* need to generate busy message */
361 RXRPC_CALL_EV_ABORT, /* need to generate abort */ 411 RXRPC_CALL_EV_ABORT, /* need to generate abort */
362 RXRPC_CALL_EV_CONN_ABORT, /* local connection abort generated */ 412 RXRPC_CALL_EV_TIMER, /* Timer expired */
363 RXRPC_CALL_EV_RESEND_TIMER, /* Tx resend timer expired */
364 RXRPC_CALL_EV_RESEND, /* Tx resend required */ 413 RXRPC_CALL_EV_RESEND, /* Tx resend required */
365 RXRPC_CALL_EV_DRAIN_RX_OOS, /* drain the Rx out of sequence queue */ 414 RXRPC_CALL_EV_PING, /* Ping send required */
366 RXRPC_CALL_EV_LIFE_TIMER, /* call's lifetimer ran out */
367 RXRPC_CALL_EV_ACCEPTED, /* incoming call accepted by userspace app */
368 RXRPC_CALL_EV_SECURED, /* incoming call's connection is now secure */
369 RXRPC_CALL_EV_POST_ACCEPT, /* need to post an "accept?" message to the app */
370 RXRPC_CALL_EV_RELEASE, /* need to release the call's resources */
371}; 415};
372 416
373/* 417/*
@@ -379,20 +423,38 @@ enum rxrpc_call_state {
379 RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ 423 RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */
380 RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ 424 RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */
381 RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ 425 RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */
382 RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ 426 RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */
383 RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ 427 RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */
384 RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ 428 RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */
385 RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ 429 RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */
386 RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ 430 RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */
387 RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ 431 RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */
388 RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ 432 RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */
389 RXRPC_CALL_COMPLETE, /* - call completed */ 433 RXRPC_CALL_COMPLETE, /* - call complete */
390 RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */ 434 NR__RXRPC_CALL_STATES
435};
436
437/*
438 * Call completion condition (state == RXRPC_CALL_COMPLETE).
439 */
440enum rxrpc_call_completion {
441 RXRPC_CALL_SUCCEEDED, /* - Normal termination */
391 RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ 442 RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */
392 RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ 443 RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */
444 RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */
393 RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ 445 RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */
394 RXRPC_CALL_DEAD, /* - call is dead */ 446 NR__RXRPC_CALL_COMPLETIONS
395 NR__RXRPC_CALL_STATES 447};
448
449/*
450 * Call Tx congestion management modes.
451 */
452enum rxrpc_congest_mode {
453 RXRPC_CALL_SLOW_START,
454 RXRPC_CALL_CONGEST_AVOIDANCE,
455 RXRPC_CALL_PACKET_LOSS,
456 RXRPC_CALL_FAST_RETRANSMIT,
457 NR__RXRPC_CONGEST_MODES
396}; 458};
397 459
398/* 460/*
@@ -402,92 +464,335 @@ enum rxrpc_call_state {
402struct rxrpc_call { 464struct rxrpc_call {
403 struct rcu_head rcu; 465 struct rcu_head rcu;
404 struct rxrpc_connection *conn; /* connection carrying call */ 466 struct rxrpc_connection *conn; /* connection carrying call */
405 struct rxrpc_sock *socket; /* socket responsible */ 467 struct rxrpc_peer *peer; /* Peer record for remote address */
406 struct timer_list lifetimer; /* lifetime remaining on call */ 468 struct rxrpc_sock __rcu *socket; /* socket responsible */
407 struct timer_list deadspan; /* reap timer for re-ACK'ing, etc */ 469 ktime_t ack_at; /* When deferred ACK needs to happen */
408 struct timer_list ack_timer; /* ACK generation timer */ 470 ktime_t resend_at; /* When next resend needs to happen */
409 struct timer_list resend_timer; /* Tx resend timer */ 471 ktime_t ping_at; /* When next to send a ping */
410 struct work_struct destroyer; /* call destroyer */ 472 ktime_t expire_at; /* When the call times out */
411 struct work_struct processor; /* packet processor and ACK generator */ 473 struct timer_list timer; /* Combined event timer */
474 struct work_struct processor; /* Event processor */
475 rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */
412 struct list_head link; /* link in master call list */ 476 struct list_head link; /* link in master call list */
477 struct list_head chan_wait_link; /* Link in conn->waiting_calls */
413 struct hlist_node error_link; /* link in error distribution list */ 478 struct hlist_node error_link; /* link in error distribution list */
 414	struct list_head	accept_link;	/* calls awaiting acceptance */ 479	struct list_head	accept_link;	/* Link in rx->to_be_accepted */
415 struct rb_node sock_node; /* node in socket call tree */ 480 struct list_head recvmsg_link; /* Link in rx->recvmsg_q */
416 struct sk_buff_head rx_queue; /* received packets */ 481 struct list_head sock_link; /* Link in rx->sock_calls */
417 struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ 482 struct rb_node sock_node; /* Node in rx->calls */
418 struct sk_buff *tx_pending; /* Tx socket buffer being filled */ 483 struct sk_buff *tx_pending; /* Tx socket buffer being filled */
419 wait_queue_head_t tx_waitq; /* wait for Tx window space to become available */ 484 wait_queue_head_t waitq; /* Wait queue for channel or Tx */
420 __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ 485 __be32 crypto_buf[2]; /* Temporary packet crypto buffer */
421 unsigned long user_call_ID; /* user-defined call ID */ 486 unsigned long user_call_ID; /* user-defined call ID */
422 unsigned long creation_jif; /* time of call creation */
423 unsigned long flags; 487 unsigned long flags;
424 unsigned long events; 488 unsigned long events;
425 spinlock_t lock; 489 spinlock_t lock;
426 rwlock_t state_lock; /* lock for state transition */ 490 rwlock_t state_lock; /* lock for state transition */
427 atomic_t usage; 491 u32 abort_code; /* Local/remote abort code */
428 atomic_t skb_count; /* Outstanding packets on this call */
429 atomic_t sequence; /* Tx data packet sequence counter */
430 u32 local_abort; /* local abort code */
431 u32 remote_abort; /* remote abort code */
432 int error_report; /* Network error (ICMP/local transport) */
433 int error; /* Local error incurred */ 492 int error; /* Local error incurred */
434 enum rxrpc_call_state state : 8; /* current state of call */ 493 enum rxrpc_call_state state; /* current state of call */
494 enum rxrpc_call_completion completion; /* Call completion condition */
495 atomic_t usage;
496 u16 service_id; /* service ID */
497 u8 security_ix; /* Security type */
498 u32 call_id; /* call ID on connection */
499 u32 cid; /* connection ID plus channel index */
435 int debug_id; /* debug ID for printks */ 500 int debug_id; /* debug ID for printks */
436 u8 channel; /* connection channel occupied by this call */ 501 unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
437 502 unsigned short rx_pkt_len; /* Current recvmsg packet len */
438 /* transmission-phase ACK management */ 503
439 u8 acks_head; /* offset into window of first entry */ 504 /* Rx/Tx circular buffer, depending on phase.
440 u8 acks_tail; /* offset into window of last entry */ 505 *
441 u8 acks_winsz; /* size of un-ACK'd window */ 506 * In the Rx phase, packets are annotated with 0 or the number of the
442 u8 acks_unacked; /* lowest unacked packet in last ACK received */ 507 * segment of a jumbo packet each buffer refers to. There can be up to
443 int acks_latest; /* serial number of latest ACK received */ 508 * 47 segments in a maximum-size UDP packet.
444 rxrpc_seq_t acks_hard; /* highest definitively ACK'd msg seq */ 509 *
445 unsigned long *acks_window; /* sent packet window 510 * In the Tx phase, packets are annotated with which buffers have been
446 * - elements are pointers with LSB set if ACK'd 511 * acked.
512 */
513#define RXRPC_RXTX_BUFF_SIZE 64
514#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1)
515#define RXRPC_INIT_RX_WINDOW_SIZE 32
516 struct sk_buff **rxtx_buffer;
517 u8 *rxtx_annotations;
518#define RXRPC_TX_ANNO_ACK 0
519#define RXRPC_TX_ANNO_UNACK 1
520#define RXRPC_TX_ANNO_NAK 2
521#define RXRPC_TX_ANNO_RETRANS 3
522#define RXRPC_TX_ANNO_MASK 0x03
523#define RXRPC_TX_ANNO_LAST 0x04
524#define RXRPC_TX_ANNO_RESENT 0x08
525
526#define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */
527#define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */
528#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */
529 rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but
530 * not hard-ACK'd packet follows this.
531 */
532 rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */
533
534 /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS
535 * is fixed, we keep these numbers in terms of segments (ie. DATA
536 * packets) rather than bytes.
537 */
538#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN
539 u8 cong_cwnd; /* Congestion window size */
540 u8 cong_extra; /* Extra to send for congestion management */
541 u8 cong_ssthresh; /* Slow-start threshold */
542 enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */
543 u8 cong_dup_acks; /* Count of ACKs showing missing packets */
544 u8 cong_cumul_acks; /* Cumulative ACK count */
545 ktime_t cong_tstamp; /* Last time cwnd was changed */
546
547 rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not
548 * consumed packet follows this.
447 */ 549 */
550 rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */
551 rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */
552 u8 rx_winsize; /* Size of Rx window */
553 u8 tx_winsize; /* Maximum size of Tx window */
554 bool tx_phase; /* T if transmission phase, F if receive phase */
555 u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */
448 556
449 /* receive-phase ACK management */ 557 /* receive-phase ACK management */
450 rxrpc_seq_t rx_data_expect; /* next data seq ID expected to be received */
451 rxrpc_seq_t rx_data_post; /* next data seq ID expected to be posted */
452 rxrpc_seq_t rx_data_recv; /* last data seq ID encountered by recvmsg */
453 rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */
454 rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */
455 rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */
456 rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */
457 u8 ackr_reason; /* reason to ACK */ 558 u8 ackr_reason; /* reason to ACK */
559 u16 ackr_skew; /* skew on packet being ACK'd */
458 rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ 560 rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */
459 atomic_t ackr_not_idle; /* number of packets in Rx queue */ 561 rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */
562 rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */
563 rxrpc_seq_t ackr_seen; /* Highest packet shown seen */
460 564
461 /* received packet records, 1 bit per record */ 565 /* ping management */
462#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) 566 rxrpc_serial_t ping_serial; /* Last ping sent */
463 unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; 567 ktime_t ping_time; /* Time last ping sent */
464 568
465 u8 in_clientflag; /* Copy of conn->in_clientflag */ 569 /* transmission-phase ACK management */
466 struct rxrpc_local *local; /* Local endpoint. */ 570 ktime_t acks_latest_ts; /* Timestamp of latest ACK received */
467 u32 call_id; /* call ID on connection */ 571 rxrpc_serial_t acks_latest; /* serial number of latest ACK received */
468 u32 cid; /* connection ID plus channel index */ 572 rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
469 u32 epoch; /* epoch of this connection */
470 u16 service_id; /* service ID */
471}; 573};
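The fixed-size Rx/Tx ring described in the struct above is indexed by masking the sequence number with RXRPC_RXTX_BUFF_MASK; a short illustrative sketch of that indexing (helper names are made up):

/* Map a wrapping sequence number onto one of the 64 ring slots. */
static inline unsigned int my_rxtx_slot(rxrpc_seq_t seq)
{
	return seq & RXRPC_RXTX_BUFF_MASK;
}

/* Example: check whether a transmitted DATA packet has been soft-ACK'd. */
static inline bool my_tx_packet_is_acked(const struct rxrpc_call *call,
					 rxrpc_seq_t seq)
{
	u8 anno = call->rxtx_annotations[my_rxtx_slot(seq)];

	return (anno & RXRPC_TX_ANNO_MASK) == RXRPC_TX_ANNO_ACK;
}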
472 574
473/* 575/*
474 * locally abort an RxRPC call 576 * Summary of a new ACK and the changes it made to the Tx buffer packet states.
475 */ 577 */
476static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code) 578struct rxrpc_ack_summary {
477{ 579 u8 ack_reason;
478 write_lock_bh(&call->state_lock); 580 u8 nr_acks; /* Number of ACKs in packet */
479 if (call->state < RXRPC_CALL_COMPLETE) { 581 u8 nr_nacks; /* Number of NACKs in packet */
480 call->local_abort = abort_code; 582 u8 nr_new_acks; /* Number of new ACKs in packet */
481 call->state = RXRPC_CALL_LOCALLY_ABORTED; 583 u8 nr_new_nacks; /* Number of new NACKs in packet */
482 set_bit(RXRPC_CALL_EV_ABORT, &call->events); 584 u8 nr_rot_new_acks; /* Number of rotated new ACKs */
483 } 585 bool new_low_nack; /* T if new low NACK found */
484 write_unlock_bh(&call->state_lock); 586 bool retrans_timeo; /* T if reTx due to timeout happened */
485} 587 u8 flight_size; /* Number of unreceived transmissions */
588 /* Place to stash values for tracing */
589 enum rxrpc_congest_mode mode:8;
590 u8 cwnd;
591 u8 ssthresh;
592 u8 dup_acks;
593 u8 cumulative_acks;
594};
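A simplified sketch of the RFC 5681-style growth the cong_* fields and this ACK summary describe, counted in whole segments as the struct comments say; this is an illustration only, not the update logic the patch installs elsewhere in the series:

/* Simplified, illustrative only: window kept in segments and capped at the
 * ring size; PACKET_LOSS and FAST_RETRANSMIT are not handled here.
 */
static void my_cwnd_on_new_acks(struct rxrpc_call *call,
				const struct rxrpc_ack_summary *summary)
{
	unsigned int cwnd = call->cong_cwnd;

	if (call->cong_mode == RXRPC_CALL_SLOW_START) {
		/* Exponential growth: one segment per newly acked segment. */
		cwnd += summary->nr_new_acks;
		if (cwnd >= call->cong_ssthresh)
			call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
	} else if (call->cong_mode == RXRPC_CALL_CONGEST_AVOIDANCE) {
		/* Linear growth: about one segment per window's worth of ACKs. */
		if (summary->cumulative_acks >= cwnd)
			cwnd += 1;
	}

	call->cong_cwnd = min_t(unsigned int, cwnd, RXRPC_RXTX_BUFF_SIZE - 1);
}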
595
596enum rxrpc_skb_trace {
597 rxrpc_skb_rx_cleaned,
598 rxrpc_skb_rx_freed,
599 rxrpc_skb_rx_got,
600 rxrpc_skb_rx_lost,
601 rxrpc_skb_rx_received,
602 rxrpc_skb_rx_rotated,
603 rxrpc_skb_rx_purged,
604 rxrpc_skb_rx_seen,
605 rxrpc_skb_tx_cleaned,
606 rxrpc_skb_tx_freed,
607 rxrpc_skb_tx_got,
608 rxrpc_skb_tx_new,
609 rxrpc_skb_tx_rotated,
610 rxrpc_skb_tx_seen,
611 rxrpc_skb__nr_trace
612};
613
614extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7];
615
616enum rxrpc_conn_trace {
617 rxrpc_conn_new_client,
618 rxrpc_conn_new_service,
619 rxrpc_conn_queued,
620 rxrpc_conn_seen,
621 rxrpc_conn_got,
622 rxrpc_conn_put_client,
623 rxrpc_conn_put_service,
624 rxrpc_conn__nr_trace
625};
626
627extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4];
628
629enum rxrpc_client_trace {
630 rxrpc_client_activate_chans,
631 rxrpc_client_alloc,
632 rxrpc_client_chan_activate,
633 rxrpc_client_chan_disconnect,
634 rxrpc_client_chan_pass,
635 rxrpc_client_chan_unstarted,
636 rxrpc_client_cleanup,
637 rxrpc_client_count,
638 rxrpc_client_discard,
639 rxrpc_client_duplicate,
640 rxrpc_client_exposed,
641 rxrpc_client_replace,
642 rxrpc_client_to_active,
643 rxrpc_client_to_culled,
644 rxrpc_client_to_idle,
645 rxrpc_client_to_inactive,
646 rxrpc_client_to_waiting,
647 rxrpc_client_uncount,
648 rxrpc_client__nr_trace
649};
650
651extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7];
652extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5];
653
654enum rxrpc_call_trace {
655 rxrpc_call_new_client,
656 rxrpc_call_new_service,
657 rxrpc_call_queued,
658 rxrpc_call_queued_ref,
659 rxrpc_call_seen,
660 rxrpc_call_connected,
661 rxrpc_call_release,
662 rxrpc_call_got,
663 rxrpc_call_got_userid,
664 rxrpc_call_got_kernel,
665 rxrpc_call_put,
666 rxrpc_call_put_userid,
667 rxrpc_call_put_kernel,
668 rxrpc_call_put_noqueue,
669 rxrpc_call_error,
670 rxrpc_call__nr_trace
671};
672
673extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4];
674
675enum rxrpc_transmit_trace {
676 rxrpc_transmit_wait,
677 rxrpc_transmit_queue,
678 rxrpc_transmit_queue_last,
679 rxrpc_transmit_rotate,
680 rxrpc_transmit_rotate_last,
681 rxrpc_transmit_await_reply,
682 rxrpc_transmit_end,
683 rxrpc_transmit__nr_trace
684};
685
686extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4];
687
688enum rxrpc_receive_trace {
689 rxrpc_receive_incoming,
690 rxrpc_receive_queue,
691 rxrpc_receive_queue_last,
692 rxrpc_receive_front,
693 rxrpc_receive_rotate,
694 rxrpc_receive_end,
695 rxrpc_receive__nr_trace
696};
697
698extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4];
699
700enum rxrpc_recvmsg_trace {
701 rxrpc_recvmsg_enter,
702 rxrpc_recvmsg_wait,
703 rxrpc_recvmsg_dequeue,
704 rxrpc_recvmsg_hole,
705 rxrpc_recvmsg_next,
706 rxrpc_recvmsg_cont,
707 rxrpc_recvmsg_full,
708 rxrpc_recvmsg_data_return,
709 rxrpc_recvmsg_terminal,
710 rxrpc_recvmsg_to_be_accepted,
711 rxrpc_recvmsg_return,
712 rxrpc_recvmsg__nr_trace
713};
714
715extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5];
716
717enum rxrpc_rtt_tx_trace {
718 rxrpc_rtt_tx_ping,
719 rxrpc_rtt_tx_data,
720 rxrpc_rtt_tx__nr_trace
721};
722
723extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5];
724
725enum rxrpc_rtt_rx_trace {
726 rxrpc_rtt_rx_ping_response,
727 rxrpc_rtt_rx_requested_ack,
728 rxrpc_rtt_rx__nr_trace
729};
730
731extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5];
732
733enum rxrpc_timer_trace {
734 rxrpc_timer_begin,
735 rxrpc_timer_init_for_reply,
736 rxrpc_timer_init_for_send_reply,
737 rxrpc_timer_expired,
738 rxrpc_timer_set_for_ack,
739 rxrpc_timer_set_for_ping,
740 rxrpc_timer_set_for_resend,
741 rxrpc_timer_set_for_send,
742 rxrpc_timer__nr_trace
743};
744
745extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8];
746
747enum rxrpc_propose_ack_trace {
748 rxrpc_propose_ack_client_tx_end,
749 rxrpc_propose_ack_input_data,
750 rxrpc_propose_ack_ping_for_lost_ack,
751 rxrpc_propose_ack_ping_for_lost_reply,
752 rxrpc_propose_ack_ping_for_params,
753 rxrpc_propose_ack_processing_op,
754 rxrpc_propose_ack_respond_to_ack,
755 rxrpc_propose_ack_respond_to_ping,
756 rxrpc_propose_ack_retry_tx,
757 rxrpc_propose_ack_rotate_rx,
758 rxrpc_propose_ack_terminal_ack,
759 rxrpc_propose_ack__nr_trace
760};
761
762enum rxrpc_propose_ack_outcome {
763 rxrpc_propose_ack_use,
764 rxrpc_propose_ack_update,
765 rxrpc_propose_ack_subsume,
766 rxrpc_propose_ack__nr_outcomes
767};
768
769extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8];
770extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes];
771
772enum rxrpc_congest_change {
773 rxrpc_cong_begin_retransmission,
774 rxrpc_cong_cleared_nacks,
775 rxrpc_cong_new_low_nack,
776 rxrpc_cong_no_change,
777 rxrpc_cong_progress,
778 rxrpc_cong_retransmit_again,
779 rxrpc_cong_rtt_window_end,
780 rxrpc_cong_saw_nack,
781 rxrpc_congest__nr_change
782};
783
784extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10];
785extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9];
786
787extern const char *const rxrpc_pkts[];
788extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4];
789
790#include <trace/events/rxrpc.h>
486 791
487/* 792/*
488 * af_rxrpc.c 793 * af_rxrpc.c
489 */ 794 */
490extern atomic_t rxrpc_n_skbs; 795extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
491extern u32 rxrpc_epoch; 796extern u32 rxrpc_epoch;
492extern atomic_t rxrpc_debug_id; 797extern atomic_t rxrpc_debug_id;
493extern struct workqueue_struct *rxrpc_workqueue; 798extern struct workqueue_struct *rxrpc_workqueue;
@@ -495,70 +800,179 @@ extern struct workqueue_struct *rxrpc_workqueue;
495/* 800/*
496 * call_accept.c 801 * call_accept.c
497 */ 802 */
803int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t);
804void rxrpc_discard_prealloc(struct rxrpc_sock *);
805struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *,
806 struct rxrpc_connection *,
807 struct sk_buff *);
498void rxrpc_accept_incoming_calls(struct rxrpc_local *); 808void rxrpc_accept_incoming_calls(struct rxrpc_local *);
499struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long); 809struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long,
810 rxrpc_notify_rx_t);
500int rxrpc_reject_call(struct rxrpc_sock *); 811int rxrpc_reject_call(struct rxrpc_sock *);
501 812
502/* 813/*
503 * call_event.c 814 * call_event.c
504 */ 815 */
505void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); 816void __rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
506void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); 817void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
818void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool,
819 enum rxrpc_propose_ack_trace);
507void rxrpc_process_call(struct work_struct *); 820void rxrpc_process_call(struct work_struct *);
508 821
509/* 822/*
510 * call_object.c 823 * call_object.c
511 */ 824 */
825extern const char *const rxrpc_call_states[];
826extern const char *const rxrpc_call_completions[];
512extern unsigned int rxrpc_max_call_lifetime; 827extern unsigned int rxrpc_max_call_lifetime;
513extern unsigned int rxrpc_dead_call_expiry;
514extern struct kmem_cache *rxrpc_call_jar; 828extern struct kmem_cache *rxrpc_call_jar;
515extern struct list_head rxrpc_calls; 829extern struct list_head rxrpc_calls;
516extern rwlock_t rxrpc_call_lock; 830extern rwlock_t rxrpc_call_lock;
517 831
518struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); 832struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long);
833struct rxrpc_call *rxrpc_alloc_call(gfp_t);
519struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, 834struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
520 struct rxrpc_conn_parameters *, 835 struct rxrpc_conn_parameters *,
521 struct sockaddr_rxrpc *, 836 struct sockaddr_rxrpc *,
522 unsigned long, gfp_t); 837 unsigned long, gfp_t);
523struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, 838void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *,
524 struct rxrpc_connection *, 839 struct sk_buff *);
525 struct sk_buff *); 840void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *);
526void rxrpc_release_call(struct rxrpc_call *);
527void rxrpc_release_calls_on_socket(struct rxrpc_sock *); 841void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
528void __rxrpc_put_call(struct rxrpc_call *); 842bool __rxrpc_queue_call(struct rxrpc_call *);
843bool rxrpc_queue_call(struct rxrpc_call *);
844void rxrpc_see_call(struct rxrpc_call *);
845void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace);
846void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace);
847void rxrpc_cleanup_call(struct rxrpc_call *);
529void __exit rxrpc_destroy_all_calls(void); 848void __exit rxrpc_destroy_all_calls(void);
530 849
850static inline bool rxrpc_is_service_call(const struct rxrpc_call *call)
851{
852 return test_bit(RXRPC_CALL_IS_SERVICE, &call->flags);
853}
854
855static inline bool rxrpc_is_client_call(const struct rxrpc_call *call)
856{
857 return !rxrpc_is_service_call(call);
858}
859
860/*
861 * Transition a call to the complete state.
862 */
863static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call,
864 enum rxrpc_call_completion compl,
865 u32 abort_code,
866 int error)
867{
868 if (call->state < RXRPC_CALL_COMPLETE) {
869 call->abort_code = abort_code;
870 call->error = error;
 871		call->completion = compl;
872 call->state = RXRPC_CALL_COMPLETE;
873 wake_up(&call->waitq);
874 return true;
875 }
876 return false;
877}
878
879static inline bool rxrpc_set_call_completion(struct rxrpc_call *call,
880 enum rxrpc_call_completion compl,
881 u32 abort_code,
882 int error)
883{
884 bool ret;
885
886 write_lock_bh(&call->state_lock);
887 ret = __rxrpc_set_call_completion(call, compl, abort_code, error);
888 write_unlock_bh(&call->state_lock);
889 return ret;
890}
891
892/*
893 * Record that a call successfully completed.
894 */
895static inline bool __rxrpc_call_completed(struct rxrpc_call *call)
896{
897 return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0);
898}
899
900static inline bool rxrpc_call_completed(struct rxrpc_call *call)
901{
902 bool ret;
903
904 write_lock_bh(&call->state_lock);
905 ret = __rxrpc_call_completed(call);
906 write_unlock_bh(&call->state_lock);
907 return ret;
908}
909
910/*
911 * Record that a call is locally aborted.
912 */
913static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call,
914 rxrpc_seq_t seq,
915 u32 abort_code, int error)
916{
917 trace_rxrpc_abort(why, call->cid, call->call_id, seq,
918 abort_code, error);
919 return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED,
920 abort_code, error);
921}
922
923static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call,
924 rxrpc_seq_t seq, u32 abort_code, int error)
925{
926 bool ret;
927
928 write_lock_bh(&call->state_lock);
929 ret = __rxrpc_abort_call(why, call, seq, abort_code, error);
930 write_unlock_bh(&call->state_lock);
931 return ret;
932}
933
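Two hedged usage sketches of the completion helpers above; the abort code, reason string and my_ names are illustrative rather than taken from the patch:

/* A purely local failure ends the call without sending an abort. */
static void my_fail_call_oom(struct rxrpc_call *call)
{
	if (rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, -ENOMEM))
		rxrpc_notify_socket(call);	/* let recvmsg report the error */
}

/* A protocol violation aborts the call; the ABORT packet goes out separately. */
static void my_abort_call_bad_packet(struct rxrpc_call *call, rxrpc_seq_t seq)
{
	if (rxrpc_abort_call("BAD", call, seq, RX_PROTOCOL_ERROR, -EPROTO))
		rxrpc_send_abort_packet(call);
}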
531/* 934/*
532 * conn_client.c 935 * conn_client.c
533 */ 936 */
937extern unsigned int rxrpc_max_client_connections;
938extern unsigned int rxrpc_reap_client_connections;
939extern unsigned int rxrpc_conn_idle_client_expiry;
940extern unsigned int rxrpc_conn_idle_client_fast_expiry;
534extern struct idr rxrpc_client_conn_ids; 941extern struct idr rxrpc_client_conn_ids;
535 942
536void rxrpc_destroy_client_conn_ids(void); 943void rxrpc_destroy_client_conn_ids(void);
537int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, 944int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *,
538 struct sockaddr_rxrpc *, gfp_t); 945 struct sockaddr_rxrpc *, gfp_t);
539void rxrpc_unpublish_client_conn(struct rxrpc_connection *); 946void rxrpc_expose_client_call(struct rxrpc_call *);
947void rxrpc_disconnect_client_call(struct rxrpc_call *);
948void rxrpc_put_client_conn(struct rxrpc_connection *);
949void __exit rxrpc_destroy_all_client_connections(void);
540 950
541/* 951/*
542 * conn_event.c 952 * conn_event.c
543 */ 953 */
544void rxrpc_process_connection(struct work_struct *); 954void rxrpc_process_connection(struct work_struct *);
545void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *);
546void rxrpc_reject_packets(struct rxrpc_local *);
547 955
548/* 956/*
549 * conn_object.c 957 * conn_object.c
550 */ 958 */
551extern unsigned int rxrpc_connection_expiry; 959extern unsigned int rxrpc_connection_expiry;
552extern struct list_head rxrpc_connections; 960extern struct list_head rxrpc_connections;
961extern struct list_head rxrpc_connection_proc_list;
553extern rwlock_t rxrpc_connection_lock; 962extern rwlock_t rxrpc_connection_lock;
554 963
555int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); 964int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *);
556struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); 965struct rxrpc_connection *rxrpc_alloc_connection(gfp_t);
557struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, 966struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *,
558 struct sk_buff *); 967 struct sk_buff *);
559void __rxrpc_disconnect_call(struct rxrpc_call *); 968void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *);
560void rxrpc_disconnect_call(struct rxrpc_call *); 969void rxrpc_disconnect_call(struct rxrpc_call *);
561void rxrpc_put_connection(struct rxrpc_connection *); 970void rxrpc_kill_connection(struct rxrpc_connection *);
971bool rxrpc_queue_conn(struct rxrpc_connection *);
972void rxrpc_see_connection(struct rxrpc_connection *);
973void rxrpc_get_connection(struct rxrpc_connection *);
974struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *);
975void rxrpc_put_service_conn(struct rxrpc_connection *);
562void __exit rxrpc_destroy_all_connections(void); 976void __exit rxrpc_destroy_all_connections(void);
563 977
564static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) 978static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn)
@@ -571,24 +985,15 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn)
571 return !rxrpc_conn_is_client(conn); 985 return !rxrpc_conn_is_client(conn);
572} 986}
573 987
574static inline void rxrpc_get_connection(struct rxrpc_connection *conn) 988static inline void rxrpc_put_connection(struct rxrpc_connection *conn)
575{ 989{
576 atomic_inc(&conn->usage); 990 if (!conn)
577} 991 return;
578
579static inline
580struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
581{
582 return atomic_inc_not_zero(&conn->usage) ? conn : NULL;
583}
584 992
585static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) 993 if (rxrpc_conn_is_client(conn))
586{ 994 rxrpc_put_client_conn(conn);
587 if (!rxrpc_get_connection_maybe(conn)) 995 else
588 return false; 996 rxrpc_put_service_conn(conn);
589 if (!rxrpc_queue_work(&conn->processor))
590 rxrpc_put_connection(conn);
591 return true;
592} 997}
593 998
594/* 999/*
@@ -596,17 +1001,14 @@ static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn)
596 */ 1001 */
597struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, 1002struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *,
598 struct sk_buff *); 1003 struct sk_buff *);
599struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, 1004struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t);
600 struct sockaddr_rxrpc *, 1005void rxrpc_new_incoming_connection(struct rxrpc_connection *, struct sk_buff *);
601 struct sk_buff *);
602void rxrpc_unpublish_service_conn(struct rxrpc_connection *); 1006void rxrpc_unpublish_service_conn(struct rxrpc_connection *);
603 1007
604/* 1008/*
605 * input.c 1009 * input.c
606 */ 1010 */
607void rxrpc_data_ready(struct sock *); 1011void rxrpc_data_ready(struct sock *);
608int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool);
609void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *);
610 1012
611/* 1013/*
612 * insecure.c 1014 * insecure.c
@@ -668,25 +1070,25 @@ extern unsigned int rxrpc_idle_ack_delay;
668extern unsigned int rxrpc_rx_window_size; 1070extern unsigned int rxrpc_rx_window_size;
669extern unsigned int rxrpc_rx_mtu; 1071extern unsigned int rxrpc_rx_mtu;
670extern unsigned int rxrpc_rx_jumbo_max; 1072extern unsigned int rxrpc_rx_jumbo_max;
1073extern unsigned int rxrpc_resend_timeout;
671 1074
672extern const char *const rxrpc_pkts[];
673extern const s8 rxrpc_ack_priority[]; 1075extern const s8 rxrpc_ack_priority[];
674 1076
675extern const char *rxrpc_acks(u8 reason);
676
677/* 1077/*
678 * output.c 1078 * output.c
679 */ 1079 */
680extern unsigned int rxrpc_resend_timeout; 1080int rxrpc_send_ack_packet(struct rxrpc_call *, bool);
681 1081int rxrpc_send_abort_packet(struct rxrpc_call *);
682int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *); 1082int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool);
683int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); 1083void rxrpc_reject_packets(struct rxrpc_local *);
684 1084
685/* 1085/*
686 * peer_event.c 1086 * peer_event.c
687 */ 1087 */
688void rxrpc_error_report(struct sock *); 1088void rxrpc_error_report(struct sock *);
689void rxrpc_peer_error_distributor(struct work_struct *); 1089void rxrpc_peer_error_distributor(struct work_struct *);
1090void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace,
1091 rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
690 1092
691/* 1093/*
692 * peer_object.c 1094 * peer_object.c
@@ -696,10 +1098,13 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *,
696struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, 1098struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *,
697 struct sockaddr_rxrpc *, gfp_t); 1099 struct sockaddr_rxrpc *, gfp_t);
698struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); 1100struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
1101struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *,
1102 struct rxrpc_peer *);
699 1103
700static inline void rxrpc_get_peer(struct rxrpc_peer *peer) 1104static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer)
701{ 1105{
702 atomic_inc(&peer->usage); 1106 atomic_inc(&peer->usage);
1107 return peer;
703} 1108}
704 1109
705static inline 1110static inline
@@ -718,14 +1123,13 @@ static inline void rxrpc_put_peer(struct rxrpc_peer *peer)
718/* 1123/*
719 * proc.c 1124 * proc.c
720 */ 1125 */
721extern const char *const rxrpc_call_states[];
722extern const struct file_operations rxrpc_call_seq_fops; 1126extern const struct file_operations rxrpc_call_seq_fops;
723extern const struct file_operations rxrpc_connection_seq_fops; 1127extern const struct file_operations rxrpc_connection_seq_fops;
724 1128
725/* 1129/*
726 * recvmsg.c 1130 * recvmsg.c
727 */ 1131 */
728void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *); 1132void rxrpc_notify_socket(struct rxrpc_call *);
729int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); 1133int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);
730 1134
731/* 1135/*
@@ -744,9 +1148,21 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *);
744int rxrpc_init_server_conn_security(struct rxrpc_connection *); 1148int rxrpc_init_server_conn_security(struct rxrpc_connection *);
745 1149
746/* 1150/*
1151 * sendmsg.c
1152 */
1153int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
1154
1155/*
747 * skbuff.c 1156 * skbuff.c
748 */ 1157 */
1158void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *);
749void rxrpc_packet_destructor(struct sk_buff *); 1159void rxrpc_packet_destructor(struct sk_buff *);
1160void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace);
1161void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace);
1162void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace);
1163void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace);
1164void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace);
1165void rxrpc_purge_queue(struct sk_buff_head *);
750 1166
751/* 1167/*
752 * sysctl.c 1168 * sysctl.c
@@ -764,6 +1180,23 @@ static inline void rxrpc_sysctl_exit(void) {}
764 */ 1180 */
765int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); 1181int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *);
766 1182
1183static inline bool before(u32 seq1, u32 seq2)
1184{
1185 return (s32)(seq1 - seq2) < 0;
1186}
1187static inline bool before_eq(u32 seq1, u32 seq2)
1188{
1189 return (s32)(seq1 - seq2) <= 0;
1190}
1191static inline bool after(u32 seq1, u32 seq2)
1192{
1193 return (s32)(seq1 - seq2) > 0;
1194}
1195static inline bool after_eq(u32 seq1, u32 seq2)
1196{
1197 return (s32)(seq1 - seq2) >= 0;
1198}
1199
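These helpers treat sequence numbers as points on a 32-bit circle; a tiny demonstration of why the signed subtraction matters near wrap-around:

/* 0xfffffffe was sent before 1 even though it is numerically larger;
 * the signed difference makes the comparison come out right across wrap.
 */
static void my_seq_compare_demo(void)
{
	u32 old_seq = 0xfffffffe, new_seq = 1;

	WARN_ON(!before(old_seq, new_seq));	/* (s32)(old - new) == -3 */
	WARN_ON(!after(new_seq, old_seq));	/* (s32)(new - old) ==  3 */
	WARN_ON(before(old_seq, old_seq));	/* equal is neither before nor after */
	WARN_ON(!before_eq(old_seq, old_seq));
}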
767/* 1200/*
768 * debug tracing 1201 * debug tracing
769 */ 1202 */
@@ -846,11 +1279,12 @@ do { \
846 1279
847#define ASSERTCMP(X, OP, Y) \ 1280#define ASSERTCMP(X, OP, Y) \
848do { \ 1281do { \
849 unsigned long _x = (unsigned long)(X); \ 1282 __typeof__(X) _x = (X); \
850 unsigned long _y = (unsigned long)(Y); \ 1283 __typeof__(Y) _y = (__typeof__(X))(Y); \
851 if (unlikely(!(_x OP _y))) { \ 1284 if (unlikely(!(_x OP _y))) { \
852 pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ 1285 pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
853 _x, _x, #OP, _y, _y); \ 1286 (unsigned long)_x, (unsigned long)_x, #OP, \
1287 (unsigned long)_y, (unsigned long)_y); \
854 BUG(); \ 1288 BUG(); \
855 } \ 1289 } \
856} while (0) 1290} while (0)
@@ -865,11 +1299,12 @@ do { \
865 1299
866#define ASSERTIFCMP(C, X, OP, Y) \ 1300#define ASSERTIFCMP(C, X, OP, Y) \
867do { \ 1301do { \
868 unsigned long _x = (unsigned long)(X); \ 1302 __typeof__(X) _x = (X); \
869 unsigned long _y = (unsigned long)(Y); \ 1303 __typeof__(Y) _y = (__typeof__(X))(Y); \
870 if (unlikely((C) && !(_x OP _y))) { \ 1304 if (unlikely((C) && !(_x OP _y))) { \
871 pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ 1305 pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
872 _x, _x, #OP, _y, _y); \ 1306 (unsigned long)_x, (unsigned long)_x, #OP, \
1307 (unsigned long)_y, (unsigned long)_y); \
873 BUG(); \ 1308 BUG(); \
874 } \ 1309 } \
875} while (0) 1310} while (0)
@@ -893,54 +1328,3 @@ do { \
893} while (0) 1328} while (0)
894 1329
895#endif /* __KDEBUGALL */ 1330#endif /* __KDEBUGALL */
896
897/*
898 * socket buffer accounting / leak finding
899 */
900static inline void __rxrpc_new_skb(struct sk_buff *skb, const char *fn)
901{
902 //_net("new skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs));
903 //atomic_inc(&rxrpc_n_skbs);
904}
905
906#define rxrpc_new_skb(skb) __rxrpc_new_skb((skb), __func__)
907
908static inline void __rxrpc_kill_skb(struct sk_buff *skb, const char *fn)
909{
910 //_net("kill skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs));
911 //atomic_dec(&rxrpc_n_skbs);
912}
913
914#define rxrpc_kill_skb(skb) __rxrpc_kill_skb((skb), __func__)
915
916static inline void __rxrpc_free_skb(struct sk_buff *skb, const char *fn)
917{
918 if (skb) {
919 CHECK_SLAB_OKAY(&skb->users);
920 //_net("free skb %p %s [%d]",
921 // skb, fn, atomic_read(&rxrpc_n_skbs));
922 //atomic_dec(&rxrpc_n_skbs);
923 kfree_skb(skb);
924 }
925}
926
927#define rxrpc_free_skb(skb) __rxrpc_free_skb((skb), __func__)
928
929static inline void rxrpc_purge_queue(struct sk_buff_head *list)
930{
931 struct sk_buff *skb;
932 while ((skb = skb_dequeue((list))) != NULL)
933 rxrpc_free_skb(skb);
934}
935
936#define rxrpc_get_call(CALL) \
937do { \
938 CHECK_SLAB_OKAY(&(CALL)->usage); \
939 if (atomic_inc_return(&(CALL)->usage) == 1) \
940 BUG(); \
941} while (0)
942
943#define rxrpc_put_call(CALL) \
944do { \
945 __rxrpc_put_call(CALL); \
946} while (0)
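
(Aside, not part of the patch: the before()/before_eq()/after()/after_eq() helpers added to ar-internal.h above compare 32-bit rxrpc sequence numbers modulo 2^32 by casting the unsigned difference to a signed value, the same trick TCP uses for its sequence space. A minimal stand-alone sketch of the idea follows; the names seq_before and main and the <stdint.h> types are illustrative substitutes for the kernel's u32/s32 and are not part of the commit.)

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* True if a precedes b in a 32-bit sequence space, even across wrap-around. */
static inline bool seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

int main(void)
{
	/* 0xfffffffe still precedes 0x00000001 after the counter wraps. */
	printf("%d\n", seq_before(0xfffffffeU, 0x00000001U));	/* prints 1 */
	printf("%d\n", seq_before(5, 3));			/* prints 0 */
	return 0;
}
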
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 9bae21e66d65..832d854c2d5c 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -20,265 +20,409 @@
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/icmp.h> 21#include <linux/icmp.h>
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/circ_buf.h>
23#include <net/sock.h> 24#include <net/sock.h>
24#include <net/af_rxrpc.h> 25#include <net/af_rxrpc.h>
25#include <net/ip.h> 26#include <net/ip.h>
26#include "ar-internal.h" 27#include "ar-internal.h"
27 28
28/* 29/*
29 * generate a connection-level abort 30 * Preallocate a single service call, connection and peer and, if possible,
31 * give them a user ID and attach the user's side of the ID to them.
30 */ 32 */
31static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, 33static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
32 struct rxrpc_wire_header *whdr) 34 struct rxrpc_backlog *b,
35 rxrpc_notify_rx_t notify_rx,
36 rxrpc_user_attach_call_t user_attach_call,
37 unsigned long user_call_ID, gfp_t gfp)
33{ 38{
34 struct msghdr msg; 39 const void *here = __builtin_return_address(0);
35 struct kvec iov[1]; 40 struct rxrpc_call *call;
36 size_t len; 41 int max, tmp;
37 int ret; 42 unsigned int size = RXRPC_BACKLOG_MAX;
43 unsigned int head, tail, call_head, call_tail;
44
45 max = rx->sk.sk_max_ack_backlog;
46 tmp = rx->sk.sk_ack_backlog;
47 if (tmp >= max) {
48 _leave(" = -ENOBUFS [full %u]", max);
49 return -ENOBUFS;
50 }
51 max -= tmp;
52
53 /* We don't need more conns and peers than we have calls, but on the
54 * other hand, we shouldn't ever use more peers than conns or conns
55 * than calls.
56 */
57 call_head = b->call_backlog_head;
58 call_tail = READ_ONCE(b->call_backlog_tail);
59 tmp = CIRC_CNT(call_head, call_tail, size);
60 if (tmp >= max) {
61 _leave(" = -ENOBUFS [enough %u]", tmp);
62 return -ENOBUFS;
63 }
64 max = tmp + 1;
65
66 head = b->peer_backlog_head;
67 tail = READ_ONCE(b->peer_backlog_tail);
68 if (CIRC_CNT(head, tail, size) < max) {
69 struct rxrpc_peer *peer = rxrpc_alloc_peer(rx->local, gfp);
70 if (!peer)
71 return -ENOMEM;
72 b->peer_backlog[head] = peer;
73 smp_store_release(&b->peer_backlog_head,
74 (head + 1) & (size - 1));
75 }
38 76
39 _enter("%d,,", local->debug_id); 77 head = b->conn_backlog_head;
78 tail = READ_ONCE(b->conn_backlog_tail);
79 if (CIRC_CNT(head, tail, size) < max) {
80 struct rxrpc_connection *conn;
40 81
41 whdr->type = RXRPC_PACKET_TYPE_BUSY; 82 conn = rxrpc_prealloc_service_connection(gfp);
42 whdr->serial = htonl(1); 83 if (!conn)
84 return -ENOMEM;
85 b->conn_backlog[head] = conn;
86 smp_store_release(&b->conn_backlog_head,
87 (head + 1) & (size - 1));
43 88
44 msg.msg_name = &srx->transport.sin; 89 trace_rxrpc_conn(conn, rxrpc_conn_new_service,
45 msg.msg_namelen = sizeof(srx->transport.sin); 90 atomic_read(&conn->usage), here);
46 msg.msg_control = NULL; 91 }
47 msg.msg_controllen = 0;
48 msg.msg_flags = 0;
49 92
50 iov[0].iov_base = whdr; 93 /* Now it gets complicated, because calls get registered with the
51 iov[0].iov_len = sizeof(*whdr); 94 * socket here, particularly if a user ID is preassigned by the user.
95 */
96 call = rxrpc_alloc_call(gfp);
97 if (!call)
98 return -ENOMEM;
99 call->flags |= (1 << RXRPC_CALL_IS_SERVICE);
100 call->state = RXRPC_CALL_SERVER_PREALLOC;
52 101
53 len = iov[0].iov_len; 102 trace_rxrpc_call(call, rxrpc_call_new_service,
103 atomic_read(&call->usage),
104 here, (const void *)user_call_ID);
54 105
55 _proto("Tx BUSY %%1"); 106 write_lock(&rx->call_lock);
107 if (user_attach_call) {
108 struct rxrpc_call *xcall;
109 struct rb_node *parent, **pp;
110
111 /* Check the user ID isn't already in use */
112 pp = &rx->calls.rb_node;
113 parent = NULL;
114 while (*pp) {
115 parent = *pp;
116 xcall = rb_entry(parent, struct rxrpc_call, sock_node);
117 if (user_call_ID < call->user_call_ID)
118 pp = &(*pp)->rb_left;
119 else if (user_call_ID > call->user_call_ID)
120 pp = &(*pp)->rb_right;
121 else
122 goto id_in_use;
123 }
56 124
57 ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); 125 call->user_call_ID = user_call_ID;
58 if (ret < 0) { 126 call->notify_rx = notify_rx;
59 _leave(" = -EAGAIN [sendmsg failed: %d]", ret); 127 rxrpc_get_call(call, rxrpc_call_got_kernel);
60 return -EAGAIN; 128 user_attach_call(call, user_call_ID);
129 rxrpc_get_call(call, rxrpc_call_got_userid);
130 rb_link_node(&call->sock_node, parent, pp);
131 rb_insert_color(&call->sock_node, &rx->calls);
132 set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
61 } 133 }
62 134
63 _leave(" = 0"); 135 list_add(&call->sock_link, &rx->sock_calls);
136
137 write_unlock(&rx->call_lock);
138
139 write_lock(&rxrpc_call_lock);
140 list_add_tail(&call->link, &rxrpc_calls);
141 write_unlock(&rxrpc_call_lock);
142
143 b->call_backlog[call_head] = call;
144 smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1));
145 _leave(" = 0 [%d -> %lx]", call->debug_id, user_call_ID);
64 return 0; 146 return 0;
147
148id_in_use:
149 write_unlock(&rx->call_lock);
150 rxrpc_cleanup_call(call);
151 _leave(" = -EBADSLT");
152 return -EBADSLT;
65} 153}
66 154
67/* 155/*
68 * accept an incoming call that needs peer, transport and/or connection setting 156 * Preallocate sufficient service connections, calls and peers to cover the
69 * up 157 * entire backlog of a socket. When a new call comes in, if we don't have
158 * sufficient of each available, the call gets rejected as busy or ignored.
159 *
160 * The backlog is replenished when a connection is accepted or rejected.
70 */ 161 */
71static int rxrpc_accept_incoming_call(struct rxrpc_local *local, 162int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp)
72 struct rxrpc_sock *rx,
73 struct sk_buff *skb,
74 struct sockaddr_rxrpc *srx)
75{ 163{
76 struct rxrpc_connection *conn; 164 struct rxrpc_backlog *b = rx->backlog;
77 struct rxrpc_skb_priv *sp, *nsp;
78 struct rxrpc_call *call;
79 struct sk_buff *notification;
80 int ret;
81 165
82 _enter(""); 166 if (!b) {
167 b = kzalloc(sizeof(struct rxrpc_backlog), gfp);
168 if (!b)
169 return -ENOMEM;
170 rx->backlog = b;
171 }
172
173 if (rx->discard_new_call)
174 return 0;
175
176 while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp) == 0)
177 ;
83 178
84 sp = rxrpc_skb(skb); 179 return 0;
180}
85 181
86 /* get a notification message to send to the server app */ 182/*
87 notification = alloc_skb(0, GFP_NOFS); 183 * Discard the preallocation on a service.
88 if (!notification) { 184 */
89 _debug("no memory"); 185void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
90 ret = -ENOMEM; 186{
91 goto error_nofree; 187 struct rxrpc_backlog *b = rx->backlog;
188 unsigned int size = RXRPC_BACKLOG_MAX, head, tail;
189
190 if (!b)
191 return;
192 rx->backlog = NULL;
193
194 /* Make sure that there aren't any incoming calls in progress before we
195 * clear the preallocation buffers.
196 */
197 spin_lock_bh(&rx->incoming_lock);
198 spin_unlock_bh(&rx->incoming_lock);
199
200 head = b->peer_backlog_head;
201 tail = b->peer_backlog_tail;
202 while (CIRC_CNT(head, tail, size) > 0) {
203 struct rxrpc_peer *peer = b->peer_backlog[tail];
204 kfree(peer);
205 tail = (tail + 1) & (size - 1);
92 } 206 }
93 rxrpc_new_skb(notification); 207
94 notification->mark = RXRPC_SKB_MARK_NEW_CALL; 208 head = b->conn_backlog_head;
95 209 tail = b->conn_backlog_tail;
96 conn = rxrpc_incoming_connection(local, srx, skb); 210 while (CIRC_CNT(head, tail, size) > 0) {
97 if (IS_ERR(conn)) { 211 struct rxrpc_connection *conn = b->conn_backlog[tail];
98 _debug("no conn"); 212 write_lock(&rxrpc_connection_lock);
99 ret = PTR_ERR(conn); 213 list_del(&conn->link);
100 goto error; 214 list_del(&conn->proc_link);
215 write_unlock(&rxrpc_connection_lock);
216 kfree(conn);
217 tail = (tail + 1) & (size - 1);
101 } 218 }
102 219
103 call = rxrpc_incoming_call(rx, conn, skb); 220 head = b->call_backlog_head;
104 rxrpc_put_connection(conn); 221 tail = b->call_backlog_tail;
105 if (IS_ERR(call)) { 222 while (CIRC_CNT(head, tail, size) > 0) {
106 _debug("no call"); 223 struct rxrpc_call *call = b->call_backlog[tail];
107 ret = PTR_ERR(call); 224 if (rx->discard_new_call) {
108 goto error; 225 _debug("discard %lx", call->user_call_ID);
226 rx->discard_new_call(call, call->user_call_ID);
227 rxrpc_put_call(call, rxrpc_call_put_kernel);
228 }
229 rxrpc_call_completed(call);
230 rxrpc_release_call(rx, call);
231 rxrpc_put_call(call, rxrpc_call_put);
232 tail = (tail + 1) & (size - 1);
109 } 233 }
110 234
111 /* attach the call to the socket */ 235 kfree(b);
112 read_lock_bh(&local->services_lock); 236}
113 if (rx->sk.sk_state == RXRPC_CLOSE)
114 goto invalid_service;
115 237
116 write_lock(&rx->call_lock); 238/*
117 if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { 239 * Allocate a new incoming call from the prealloc pool, along with a connection
118 rxrpc_get_call(call); 240 * and a peer as necessary.
119 241 */
120 spin_lock(&call->conn->state_lock); 242static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
121 if (sp->hdr.securityIndex > 0 && 243 struct rxrpc_local *local,
122 call->conn->state == RXRPC_CONN_SERVICE_UNSECURED) { 244 struct rxrpc_connection *conn,
123 _debug("await conn sec"); 245 struct sk_buff *skb)
124 list_add_tail(&call->accept_link, &rx->secureq); 246{
125 call->conn->state = RXRPC_CONN_SERVICE_CHALLENGING; 247 struct rxrpc_backlog *b = rx->backlog;
126 set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); 248 struct rxrpc_peer *peer, *xpeer;
127 rxrpc_queue_conn(call->conn); 249 struct rxrpc_call *call;
128 } else { 250 unsigned short call_head, conn_head, peer_head;
129 _debug("conn ready"); 251 unsigned short call_tail, conn_tail, peer_tail;
130 call->state = RXRPC_CALL_SERVER_ACCEPTING; 252 unsigned short call_count, conn_count;
131 list_add_tail(&call->accept_link, &rx->acceptq); 253
132 rxrpc_get_call(call); 254 /* #calls >= #conns >= #peers must hold true. */
133 atomic_inc(&call->skb_count); 255 call_head = smp_load_acquire(&b->call_backlog_head);
134 nsp = rxrpc_skb(notification); 256 call_tail = b->call_backlog_tail;
135 nsp->call = call; 257 call_count = CIRC_CNT(call_head, call_tail, RXRPC_BACKLOG_MAX);
136 258 conn_head = smp_load_acquire(&b->conn_backlog_head);
137 ASSERTCMP(atomic_read(&call->usage), >=, 3); 259 conn_tail = b->conn_backlog_tail;
138 260 conn_count = CIRC_CNT(conn_head, conn_tail, RXRPC_BACKLOG_MAX);
139 _debug("notify"); 261 ASSERTCMP(conn_count, >=, call_count);
140 spin_lock(&call->lock); 262 peer_head = smp_load_acquire(&b->peer_backlog_head);
141 ret = rxrpc_queue_rcv_skb(call, notification, true, 263 peer_tail = b->peer_backlog_tail;
142 false); 264 ASSERTCMP(CIRC_CNT(peer_head, peer_tail, RXRPC_BACKLOG_MAX), >=,
143 spin_unlock(&call->lock); 265 conn_count);
144 notification = NULL; 266
145 BUG_ON(ret < 0); 267 if (call_count == 0)
268 return NULL;
269
270 if (!conn) {
271 /* No connection. We're going to need a peer to start off
272 * with. If one doesn't yet exist, use a spare from the
273 * preallocation set. We dump the address into the spare in
274 * anticipation - and to save on stack space.
275 */
276 xpeer = b->peer_backlog[peer_tail];
277 if (rxrpc_extract_addr_from_skb(&xpeer->srx, skb) < 0)
278 return NULL;
279
280 peer = rxrpc_lookup_incoming_peer(local, xpeer);
281 if (peer == xpeer) {
282 b->peer_backlog[peer_tail] = NULL;
283 smp_store_release(&b->peer_backlog_tail,
284 (peer_tail + 1) &
285 (RXRPC_BACKLOG_MAX - 1));
146 } 286 }
147 spin_unlock(&call->conn->state_lock);
148 287
149 _debug("queued"); 288 /* Now allocate and set up the connection */
289 conn = b->conn_backlog[conn_tail];
290 b->conn_backlog[conn_tail] = NULL;
291 smp_store_release(&b->conn_backlog_tail,
292 (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1));
293 rxrpc_get_local(local);
294 conn->params.local = local;
295 conn->params.peer = peer;
296 rxrpc_see_connection(conn);
297 rxrpc_new_incoming_connection(conn, skb);
298 } else {
299 rxrpc_get_connection(conn);
150 } 300 }
151 write_unlock(&rx->call_lock);
152 301
153 _debug("process"); 302 /* And now we can allocate and set up a new call */
154 rxrpc_fast_process_packet(call, skb); 303 call = b->call_backlog[call_tail];
304 b->call_backlog[call_tail] = NULL;
305 smp_store_release(&b->call_backlog_tail,
306 (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1));
155 307
156 _debug("done"); 308 rxrpc_see_call(call);
157 read_unlock_bh(&local->services_lock); 309 call->conn = conn;
158 rxrpc_free_skb(notification); 310 call->peer = rxrpc_get_peer(conn->params.peer);
159 rxrpc_put_call(call); 311 return call;
160 _leave(" = 0");
161 return 0;
162
163invalid_service:
164 _debug("invalid");
165 read_unlock_bh(&local->services_lock);
166
167 read_lock_bh(&call->state_lock);
168 if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
169 !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
170 rxrpc_get_call(call);
171 rxrpc_queue_call(call);
172 }
173 read_unlock_bh(&call->state_lock);
174 rxrpc_put_call(call);
175 ret = -ECONNREFUSED;
176error:
177 rxrpc_free_skb(notification);
178error_nofree:
179 _leave(" = %d", ret);
180 return ret;
181} 312}
182 313
183/* 314/*
184 * accept incoming calls that need peer, transport and/or connection setting up 315 * Set up a new incoming call. Called in BH context with the RCU read lock
185 * - the packets we get are all incoming client DATA packets that have seq == 1 316 * held.
317 *
318 * If this is for a kernel service, when we allocate the call, it will have
319 * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the
320 * retainer ref obtained from the backlog buffer. Prealloc calls for userspace
321 * services only have the ref from the backlog buffer. We want to pass this
322 * ref to non-BH context to dispose of.
323 *
324 * If we want to report an error, we mark the skb with the packet type and
325 * abort code and return NULL.
186 */ 326 */
187void rxrpc_accept_incoming_calls(struct rxrpc_local *local) 327struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
328 struct rxrpc_connection *conn,
329 struct sk_buff *skb)
188{ 330{
189 struct rxrpc_skb_priv *sp; 331 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
190 struct sockaddr_rxrpc srx;
191 struct rxrpc_sock *rx; 332 struct rxrpc_sock *rx;
192 struct rxrpc_wire_header whdr; 333 struct rxrpc_call *call;
193 struct sk_buff *skb; 334 u16 service_id = sp->hdr.serviceId;
194 int ret;
195 335
196 _enter("%d", local->debug_id); 336 _enter("");
197 337
198 skb = skb_dequeue(&local->accept_queue); 338 /* Get the socket providing the service */
199 if (!skb) { 339 rx = rcu_dereference(local->service);
200 _leave("\n"); 340 if (rx && service_id == rx->srx.srx_service)
201 return; 341 goto found_service;
342
343 trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
344 RX_INVALID_OPERATION, EOPNOTSUPP);
345 skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
346 skb->priority = RX_INVALID_OPERATION;
347 _leave(" = NULL [service]");
348 return NULL;
349
350found_service:
351 spin_lock(&rx->incoming_lock);
352 if (rx->sk.sk_state == RXRPC_CLOSE) {
353 trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber,
354 sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN);
355 skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
356 skb->priority = RX_INVALID_OPERATION;
357 _leave(" = NULL [close]");
358 call = NULL;
359 goto out;
202 } 360 }
203 361
204 _net("incoming call skb %p", skb); 362 call = rxrpc_alloc_incoming_call(rx, local, conn, skb);
205 363 if (!call) {
206 sp = rxrpc_skb(skb); 364 skb->mark = RXRPC_SKB_MARK_BUSY;
207 365 _leave(" = NULL [busy]");
208 /* Set up a response packet header in case we need it */ 366 call = NULL;
209 whdr.epoch = htonl(sp->hdr.epoch); 367 goto out;
210 whdr.cid = htonl(sp->hdr.cid);
211 whdr.callNumber = htonl(sp->hdr.callNumber);
212 whdr.seq = htonl(sp->hdr.seq);
213 whdr.serial = 0;
214 whdr.flags = 0;
215 whdr.type = 0;
216 whdr.userStatus = 0;
217 whdr.securityIndex = sp->hdr.securityIndex;
218 whdr._rsvd = 0;
219 whdr.serviceId = htons(sp->hdr.serviceId);
220
221 if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
222 goto drop;
223
224 /* get the socket providing the service */
225 read_lock_bh(&local->services_lock);
226 list_for_each_entry(rx, &local->services, listen_link) {
227 if (rx->srx.srx_service == sp->hdr.serviceId &&
228 rx->sk.sk_state != RXRPC_CLOSE)
229 goto found_service;
230 } 368 }
231 read_unlock_bh(&local->services_lock);
232 goto invalid_service;
233 369
234found_service: 370 trace_rxrpc_receive(call, rxrpc_receive_incoming,
235 _debug("found service %hd", rx->srx.srx_service); 371 sp->hdr.serial, sp->hdr.seq);
236 if (sk_acceptq_is_full(&rx->sk)) 372
237 goto backlog_full; 373 /* Make the call live. */
238 sk_acceptq_added(&rx->sk); 374 rxrpc_incoming_call(rx, call, skb);
239 sock_hold(&rx->sk); 375 conn = call->conn;
240 read_unlock_bh(&local->services_lock); 376
241 377 if (rx->notify_new_call)
242 ret = rxrpc_accept_incoming_call(local, rx, skb, &srx); 378 rx->notify_new_call(&rx->sk, call, call->user_call_ID);
243 if (ret < 0) 379 else
244 sk_acceptq_removed(&rx->sk); 380 sk_acceptq_added(&rx->sk);
245 sock_put(&rx->sk); 381
246 switch (ret) { 382 spin_lock(&conn->state_lock);
247 case -ECONNRESET: /* old calls are ignored */ 383 switch (conn->state) {
248 case -ECONNABORTED: /* aborted calls are reaborted or ignored */ 384 case RXRPC_CONN_SERVICE_UNSECURED:
249 case 0: 385 conn->state = RXRPC_CONN_SERVICE_CHALLENGING;
250 return; 386 set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events);
251 case -ECONNREFUSED: 387 rxrpc_queue_conn(call->conn);
252 goto invalid_service; 388 break;
253 case -EBUSY: 389
254 goto busy; 390 case RXRPC_CONN_SERVICE:
255 case -EKEYREJECTED: 391 write_lock(&call->state_lock);
256 goto security_mismatch; 392 if (rx->discard_new_call)
393 call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
394 else
395 call->state = RXRPC_CALL_SERVER_ACCEPTING;
396 write_unlock(&call->state_lock);
397 break;
398
399 case RXRPC_CONN_REMOTELY_ABORTED:
400 rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
401 conn->remote_abort, ECONNABORTED);
402 break;
403 case RXRPC_CONN_LOCALLY_ABORTED:
404 rxrpc_abort_call("CON", call, sp->hdr.seq,
405 conn->local_abort, ECONNABORTED);
406 break;
257 default: 407 default:
258 BUG(); 408 BUG();
259 } 409 }
410 spin_unlock(&conn->state_lock);
260 411
261backlog_full: 412 if (call->state == RXRPC_CALL_SERVER_ACCEPTING)
262 read_unlock_bh(&local->services_lock); 413 rxrpc_notify_socket(call);
263busy:
264 rxrpc_busy(local, &srx, &whdr);
265 rxrpc_free_skb(skb);
266 return;
267 414
268drop: 415 /* We have to discard the prealloc queue's ref here and rely on a
269 rxrpc_free_skb(skb); 416 * combination of the RCU read lock and refs held either by the socket
270 return; 417 * (recvmsg queue, to-be-accepted queue or user ID tree) or the kernel
418 * service to prevent the call from being deallocated too early.
419 */
420 rxrpc_put_call(call, rxrpc_call_put);
271 421
272invalid_service: 422 _leave(" = %p{%d}", call, call->debug_id);
273 skb->priority = RX_INVALID_OPERATION; 423out:
274 rxrpc_reject_packet(local, skb); 424 spin_unlock(&rx->incoming_lock);
275 return; 425 return call;
276
277 /* can't change connection security type mid-flow */
278security_mismatch:
279 skb->priority = RX_PROTOCOL_ERROR;
280 rxrpc_reject_packet(local, skb);
281 return;
282} 426}
283 427
284/* 428/*
@@ -286,7 +430,8 @@ security_mismatch:
286 * - assign the user call ID to the call at the front of the queue 430 * - assign the user call ID to the call at the front of the queue
287 */ 431 */
288struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, 432struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
289 unsigned long user_call_ID) 433 unsigned long user_call_ID,
434 rxrpc_notify_rx_t notify_rx)
290{ 435{
291 struct rxrpc_call *call; 436 struct rxrpc_call *call;
292 struct rb_node *parent, **pp; 437 struct rb_node *parent, **pp;
@@ -298,12 +443,13 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
298 443
299 write_lock(&rx->call_lock); 444 write_lock(&rx->call_lock);
300 445
301 ret = -ENODATA; 446 if (list_empty(&rx->to_be_accepted)) {
302 if (list_empty(&rx->acceptq)) 447 write_unlock(&rx->call_lock);
303 goto out; 448 kleave(" = -ENODATA [empty]");
449 return ERR_PTR(-ENODATA);
450 }
304 451
305 /* check the user ID isn't already in use */ 452 /* check the user ID isn't already in use */
306 ret = -EBADSLT;
307 pp = &rx->calls.rb_node; 453 pp = &rx->calls.rb_node;
308 parent = NULL; 454 parent = NULL;
309 while (*pp) { 455 while (*pp) {
@@ -315,62 +461,59 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
315 else if (user_call_ID > call->user_call_ID) 461 else if (user_call_ID > call->user_call_ID)
316 pp = &(*pp)->rb_right; 462 pp = &(*pp)->rb_right;
317 else 463 else
318 goto out; 464 goto id_in_use;
319 } 465 }
320 466
321 /* dequeue the first call and check it's still valid */ 467 /* Dequeue the first call and check it's still valid. We gain
322 call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); 468 * responsibility for the queue's reference.
469 */
470 call = list_entry(rx->to_be_accepted.next,
471 struct rxrpc_call, accept_link);
323 list_del_init(&call->accept_link); 472 list_del_init(&call->accept_link);
324 sk_acceptq_removed(&rx->sk); 473 sk_acceptq_removed(&rx->sk);
474 rxrpc_see_call(call);
325 475
326 write_lock_bh(&call->state_lock); 476 write_lock_bh(&call->state_lock);
327 switch (call->state) { 477 switch (call->state) {
328 case RXRPC_CALL_SERVER_ACCEPTING: 478 case RXRPC_CALL_SERVER_ACCEPTING:
329 call->state = RXRPC_CALL_SERVER_RECV_REQUEST; 479 call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
330 break; 480 break;
331 case RXRPC_CALL_REMOTELY_ABORTED: 481 case RXRPC_CALL_COMPLETE:
332 case RXRPC_CALL_LOCALLY_ABORTED: 482 ret = call->error;
333 ret = -ECONNABORTED;
334 goto out_release;
335 case RXRPC_CALL_NETWORK_ERROR:
336 ret = call->conn->error;
337 goto out_release; 483 goto out_release;
338 case RXRPC_CALL_DEAD:
339 ret = -ETIME;
340 goto out_discard;
341 default: 484 default:
342 BUG(); 485 BUG();
343 } 486 }
344 487
345 /* formalise the acceptance */ 488 /* formalise the acceptance */
489 call->notify_rx = notify_rx;
346 call->user_call_ID = user_call_ID; 490 call->user_call_ID = user_call_ID;
491 rxrpc_get_call(call, rxrpc_call_got_userid);
347 rb_link_node(&call->sock_node, parent, pp); 492 rb_link_node(&call->sock_node, parent, pp);
348 rb_insert_color(&call->sock_node, &rx->calls); 493 rb_insert_color(&call->sock_node, &rx->calls);
349 if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) 494 if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags))
350 BUG(); 495 BUG();
351 if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events))
352 BUG();
353 rxrpc_queue_call(call);
354 496
355 rxrpc_get_call(call);
356 write_unlock_bh(&call->state_lock); 497 write_unlock_bh(&call->state_lock);
357 write_unlock(&rx->call_lock); 498 write_unlock(&rx->call_lock);
499 rxrpc_notify_socket(call);
500 rxrpc_service_prealloc(rx, GFP_KERNEL);
358 _leave(" = %p{%d}", call, call->debug_id); 501 _leave(" = %p{%d}", call, call->debug_id);
359 return call; 502 return call;
360 503
361 /* if the call is already dying or dead, then we leave the socket's ref
362 * on it to be released by rxrpc_dead_call_expired() as induced by
363 * rxrpc_release_call() */
364out_release: 504out_release:
365 _debug("release %p", call); 505 _debug("release %p", call);
366 if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
367 !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
368 rxrpc_queue_call(call);
369out_discard:
370 write_unlock_bh(&call->state_lock); 506 write_unlock_bh(&call->state_lock);
371 _debug("discard %p", call);
372out:
373 write_unlock(&rx->call_lock); 507 write_unlock(&rx->call_lock);
508 rxrpc_release_call(rx, call);
509 rxrpc_put_call(call, rxrpc_call_put);
510 goto out;
511
512id_in_use:
513 ret = -EBADSLT;
514 write_unlock(&rx->call_lock);
515out:
516 rxrpc_service_prealloc(rx, GFP_KERNEL);
374 _leave(" = %d", ret); 517 _leave(" = %d", ret);
375 return ERR_PTR(ret); 518 return ERR_PTR(ret);
376} 519}
@@ -382,6 +525,7 @@ out:
382int rxrpc_reject_call(struct rxrpc_sock *rx) 525int rxrpc_reject_call(struct rxrpc_sock *rx)
383{ 526{
384 struct rxrpc_call *call; 527 struct rxrpc_call *call;
528 bool abort = false;
385 int ret; 529 int ret;
386 530
387 _enter(""); 531 _enter("");
@@ -390,88 +534,73 @@ int rxrpc_reject_call(struct rxrpc_sock *rx)
390 534
391 write_lock(&rx->call_lock); 535 write_lock(&rx->call_lock);
392 536
393 ret = -ENODATA; 537 if (list_empty(&rx->to_be_accepted)) {
394 if (list_empty(&rx->acceptq)) 538 write_unlock(&rx->call_lock);
395 goto out; 539 return -ENODATA;
540 }
396 541
397 /* dequeue the first call and check it's still valid */ 542 /* Dequeue the first call and check it's still valid. We gain
398 call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); 543 * responsibility for the queue's reference.
544 */
545 call = list_entry(rx->to_be_accepted.next,
546 struct rxrpc_call, accept_link);
399 list_del_init(&call->accept_link); 547 list_del_init(&call->accept_link);
400 sk_acceptq_removed(&rx->sk); 548 sk_acceptq_removed(&rx->sk);
549 rxrpc_see_call(call);
401 550
402 write_lock_bh(&call->state_lock); 551 write_lock_bh(&call->state_lock);
403 switch (call->state) { 552 switch (call->state) {
404 case RXRPC_CALL_SERVER_ACCEPTING: 553 case RXRPC_CALL_SERVER_ACCEPTING:
405 call->state = RXRPC_CALL_SERVER_BUSY; 554 __rxrpc_abort_call("REJ", call, 1, RX_USER_ABORT, ECONNABORTED);
406 if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) 555 abort = true;
407 rxrpc_queue_call(call); 556 /* fall through */
408 ret = 0; 557 case RXRPC_CALL_COMPLETE:
409 goto out_release; 558 ret = call->error;
410 case RXRPC_CALL_REMOTELY_ABORTED:
411 case RXRPC_CALL_LOCALLY_ABORTED:
412 ret = -ECONNABORTED;
413 goto out_release;
414 case RXRPC_CALL_NETWORK_ERROR:
415 ret = call->conn->error;
416 goto out_release;
417 case RXRPC_CALL_DEAD:
418 ret = -ETIME;
419 goto out_discard; 559 goto out_discard;
420 default: 560 default:
421 BUG(); 561 BUG();
422 } 562 }
423 563
424 /* if the call is already dying or dead, then we leave the socket's ref
425 * on it to be released by rxrpc_dead_call_expired() as induced by
426 * rxrpc_release_call() */
427out_release:
428 _debug("release %p", call);
429 if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
430 !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
431 rxrpc_queue_call(call);
432out_discard: 564out_discard:
433 write_unlock_bh(&call->state_lock); 565 write_unlock_bh(&call->state_lock);
434 _debug("discard %p", call);
435out:
436 write_unlock(&rx->call_lock); 566 write_unlock(&rx->call_lock);
567 if (abort) {
568 rxrpc_send_abort_packet(call);
569 rxrpc_release_call(rx, call);
570 rxrpc_put_call(call, rxrpc_call_put);
571 }
572 rxrpc_service_prealloc(rx, GFP_KERNEL);
437 _leave(" = %d", ret); 573 _leave(" = %d", ret);
438 return ret; 574 return ret;
439} 575}
440 576
441/** 577/*
442 * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call 578 * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls
443 * @sock: The socket on which the impending call is waiting 579 * @sock: The socket on which to preallocate
444 * @user_call_ID: The tag to attach to the call 580 * @notify_rx: Event notification function for the call
581 * @user_attach_call: Func to attach call to user_call_ID
582 * @user_call_ID: The tag to attach to the preallocated call
583 * @gfp: The allocation conditions.
445 * 584 *
446 * Allow a kernel service to accept an incoming call, assuming the incoming 585 * Charge up the socket with preallocated calls, each with a user ID. A
447 * call is still valid. 586 * function should be provided to effect the attachment from the user's side.
448 */ 587 * The user is given a ref to hold on the call.
449struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock,
450 unsigned long user_call_ID)
451{
452 struct rxrpc_call *call;
453
454 _enter(",%lx", user_call_ID);
455 call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID);
456 _leave(" = %p", call);
457 return call;
458}
459EXPORT_SYMBOL(rxrpc_kernel_accept_call);
460
461/**
462 * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call
463 * @sock: The socket on which the impending call is waiting
464 * 588 *
465 * Allow a kernel service to reject an incoming call with a BUSY message, 589 * Note that the call may be come connected before this function returns.
466 * assuming the incoming call is still valid.
467 */ 590 */
468int rxrpc_kernel_reject_call(struct socket *sock) 591int rxrpc_kernel_charge_accept(struct socket *sock,
592 rxrpc_notify_rx_t notify_rx,
593 rxrpc_user_attach_call_t user_attach_call,
594 unsigned long user_call_ID, gfp_t gfp)
469{ 595{
470 int ret; 596 struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
597 struct rxrpc_backlog *b = rx->backlog;
471 598
472 _enter(""); 599 if (sock->sk->sk_state == RXRPC_CLOSE)
473 ret = rxrpc_reject_call(rxrpc_sk(sock->sk)); 600 return -ESHUTDOWN;
474 _leave(" = %d", ret); 601
475 return ret; 602 return rxrpc_service_prealloc_one(rx, b, notify_rx,
603 user_attach_call, user_call_ID,
604 gfp);
476} 605}
477EXPORT_SYMBOL(rxrpc_kernel_reject_call); 606EXPORT_SYMBOL(rxrpc_kernel_charge_accept);
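
(Aside, illustration only: the preallocation code in call_accept.c above keeps spare peers, connections and calls in fixed-size rings whose size, RXRPC_BACKLOG_MAX, is a power of two; it counts occupancy with CIRC_CNT() from <linux/circ_buf.h>, wraps indices by masking with (size - 1), and publishes new entries with smp_store_release() paired with smp_load_acquire() on the consumer side. The sketch below shows the same single-producer/single-consumer pattern in stand-alone C11; struct ring, ring_put, ring_get and SLOTS are hypothetical names, and it uses free-running counters rather than the kernel's pre-masked indices.)

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define SLOTS 4U			/* ring capacity; must be a power of two */
#define MASK  (SLOTS - 1)

struct ring {
	void *slot[SLOTS];
	_Atomic unsigned int head;	/* only the producer writes this */
	_Atomic unsigned int tail;	/* only the consumer writes this */
};

/* Producer: stash one pointer, or report the ring full. */
static bool ring_put(struct ring *r, void *p)
{
	unsigned int head = atomic_load_explicit(&r->head, memory_order_relaxed);
	unsigned int tail = atomic_load_explicit(&r->tail, memory_order_acquire);

	if (head - tail >= SLOTS)
		return false;			/* full */
	r->slot[head & MASK] = p;
	/* Make the filled slot visible before the new head index. */
	atomic_store_explicit(&r->head, head + 1, memory_order_release);
	return true;
}

/* Consumer: take the oldest pointer, or NULL if the ring is empty. */
static void *ring_get(struct ring *r)
{
	unsigned int tail = atomic_load_explicit(&r->tail, memory_order_relaxed);
	unsigned int head = atomic_load_explicit(&r->head, memory_order_acquire);
	void *p;

	if (head == tail)
		return NULL;			/* empty */
	p = r->slot[tail & MASK];
	atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
	return p;
}

int main(void)
{
	static struct ring r;
	int x = 42;

	if (!ring_put(&r, &x))
		return 1;
	return ring_get(&r) == &x ? 0 : 1;	/* exit 0 on success */
}
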
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index e60cf65c2232..97a17ada4431 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -22,1281 +22,402 @@
22#include "ar-internal.h" 22#include "ar-internal.h"
23 23
24/* 24/*
25 * propose an ACK be sent 25 * Set the timer
26 */ 26 */
27void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, 27void __rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
28 u32 serial, bool immediate) 28 ktime_t now)
29{ 29{
30 unsigned long expiry; 30 unsigned long t_j, now_j = jiffies;
31 s8 prior = rxrpc_ack_priority[ack_reason]; 31 ktime_t t;
32 32 bool queue = false;
33 ASSERTCMP(prior, >, 0); 33
34 34 if (call->state < RXRPC_CALL_COMPLETE) {
35 _enter("{%d},%s,%%%x,%u", 35 t = call->expire_at;
36 call->debug_id, rxrpc_acks(ack_reason), serial, immediate); 36 if (!ktime_after(t, now)) {
37 trace_rxrpc_timer(call, why, now, now_j);
38 queue = true;
39 goto out;
40 }
37 41
38 if (prior < rxrpc_ack_priority[call->ackr_reason]) { 42 if (!ktime_after(call->resend_at, now)) {
39 if (immediate) 43 call->resend_at = call->expire_at;
40 goto cancel_timer; 44 if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
41 return; 45 queue = true;
42 } 46 } else if (ktime_before(call->resend_at, t)) {
47 t = call->resend_at;
48 }
43 49
44 /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial 50 if (!ktime_after(call->ack_at, now)) {
45 * numbers */ 51 call->ack_at = call->expire_at;
46 if (prior == rxrpc_ack_priority[call->ackr_reason]) { 52 if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
47 if (prior <= 4) 53 queue = true;
48 call->ackr_serial = serial; 54 } else if (ktime_before(call->ack_at, t)) {
49 if (immediate) 55 t = call->ack_at;
50 goto cancel_timer; 56 }
51 return;
52 }
53 57
54 call->ackr_reason = ack_reason; 58 if (!ktime_after(call->ping_at, now)) {
55 call->ackr_serial = serial; 59 call->ping_at = call->expire_at;
60 if (!test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
61 queue = true;
62 } else if (ktime_before(call->ping_at, t)) {
63 t = call->ping_at;
64 }
56 65
57 switch (ack_reason) { 66 t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now)));
58 case RXRPC_ACK_DELAY: 67 t_j += jiffies;
59 _debug("run delay timer");
60 expiry = rxrpc_soft_ack_delay;
61 goto run_timer;
62 68
63 case RXRPC_ACK_IDLE: 69 /* We have to make sure that the calculated jiffies value falls
64 if (!immediate) { 70 * at or after the nsec value, or we may loop ceaselessly
65 _debug("run defer timer"); 71 * because the timer times out, but we haven't reached the nsec
66 expiry = rxrpc_idle_ack_delay; 72 * timeout yet.
67 goto run_timer; 73 */
68 } 74 t_j++;
69 goto cancel_timer;
70 75
71 case RXRPC_ACK_REQUESTED: 76 if (call->timer.expires != t_j || !timer_pending(&call->timer)) {
72 expiry = rxrpc_requested_ack_delay; 77 mod_timer(&call->timer, t_j);
73 if (!expiry) 78 trace_rxrpc_timer(call, why, now, now_j);
74 goto cancel_timer;
75 if (!immediate || serial == 1) {
76 _debug("run defer timer");
77 goto run_timer;
78 } 79 }
79
80 default:
81 _debug("immediate ACK");
82 goto cancel_timer;
83 } 80 }
84 81
85run_timer: 82out:
86 expiry += jiffies; 83 if (queue)
87 if (!timer_pending(&call->ack_timer) ||
88 time_after(call->ack_timer.expires, expiry))
89 mod_timer(&call->ack_timer, expiry);
90 return;
91
92cancel_timer:
93 _debug("cancel timer %%%u", serial);
94 try_to_del_timer_sync(&call->ack_timer);
95 read_lock_bh(&call->state_lock);
96 if (call->state <= RXRPC_CALL_COMPLETE &&
97 !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
98 rxrpc_queue_call(call); 84 rxrpc_queue_call(call);
99 read_unlock_bh(&call->state_lock);
100}
101
102/*
103 * propose an ACK be sent, locking the call structure
104 */
105void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
106 u32 serial, bool immediate)
107{
108 s8 prior = rxrpc_ack_priority[ack_reason];
109
110 if (prior > rxrpc_ack_priority[call->ackr_reason]) {
111 spin_lock_bh(&call->lock);
112 __rxrpc_propose_ACK(call, ack_reason, serial, immediate);
113 spin_unlock_bh(&call->lock);
114 }
115} 85}
116 86
117/* 87/*
118 * set the resend timer 88 * Set the timer
119 */ 89 */
120static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, 90void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
121 unsigned long resend_at) 91 ktime_t now)
122{ 92{
123 read_lock_bh(&call->state_lock); 93 read_lock_bh(&call->state_lock);
124 if (call->state >= RXRPC_CALL_COMPLETE) 94 __rxrpc_set_timer(call, why, now);
125 resend = 0;
126
127 if (resend & 1) {
128 _debug("SET RESEND");
129 set_bit(RXRPC_CALL_EV_RESEND, &call->events);
130 }
131
132 if (resend & 2) {
133 _debug("MODIFY RESEND TIMER");
134 set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
135 mod_timer(&call->resend_timer, resend_at);
136 } else {
137 _debug("KILL RESEND TIMER");
138 del_timer_sync(&call->resend_timer);
139 clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
140 clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
141 }
142 read_unlock_bh(&call->state_lock); 95 read_unlock_bh(&call->state_lock);
143} 96}
144 97
145/* 98/*
146 * resend packets 99 * Propose a PING ACK be sent.
147 */ 100 */
148static void rxrpc_resend(struct rxrpc_call *call) 101static void rxrpc_propose_ping(struct rxrpc_call *call,
102 bool immediate, bool background)
149{ 103{
150 struct rxrpc_wire_header *whdr; 104 if (immediate) {
151 struct rxrpc_skb_priv *sp; 105 if (background &&
152 struct sk_buff *txb; 106 !test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
153 unsigned long *p_txb, resend_at; 107 rxrpc_queue_call(call);
154 bool stop; 108 } else {
155 int loop; 109 ktime_t now = ktime_get_real();
156 u8 resend; 110 ktime_t ping_at = ktime_add_ms(now, rxrpc_idle_ack_delay);
157
158 _enter("{%d,%d,%d,%d},",
159 call->acks_hard, call->acks_unacked,
160 atomic_read(&call->sequence),
161 CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
162
163 stop = false;
164 resend = 0;
165 resend_at = 0;
166
167 for (loop = call->acks_tail;
168 loop != call->acks_head || stop;
169 loop = (loop + 1) & (call->acks_winsz - 1)
170 ) {
171 p_txb = call->acks_window + loop;
172 smp_read_barrier_depends();
173 if (*p_txb & 1)
174 continue;
175
176 txb = (struct sk_buff *) *p_txb;
177 sp = rxrpc_skb(txb);
178
179 if (sp->need_resend) {
180 sp->need_resend = false;
181
182 /* each Tx packet has a new serial number */
183 sp->hdr.serial = atomic_inc_return(&call->conn->serial);
184
185 whdr = (struct rxrpc_wire_header *)txb->head;
186 whdr->serial = htonl(sp->hdr.serial);
187
188 _proto("Tx DATA %%%u { #%d }",
189 sp->hdr.serial, sp->hdr.seq);
190 if (rxrpc_send_data_packet(call->conn, txb) < 0) {
191 stop = true;
192 sp->resend_at = jiffies + 3;
193 } else {
194 sp->resend_at =
195 jiffies + rxrpc_resend_timeout;
196 }
197 }
198
199 if (time_after_eq(jiffies + 1, sp->resend_at)) {
200 sp->need_resend = true;
201 resend |= 1;
202 } else if (resend & 2) {
203 if (time_before(sp->resend_at, resend_at))
204 resend_at = sp->resend_at;
205 } else {
206 resend_at = sp->resend_at;
207 resend |= 2;
208 }
209 }
210
211 rxrpc_set_resend(call, resend, resend_at);
212 _leave("");
213}
214
215/*
216 * handle resend timer expiry
217 */
218static void rxrpc_resend_timer(struct rxrpc_call *call)
219{
220 struct rxrpc_skb_priv *sp;
221 struct sk_buff *txb;
222 unsigned long *p_txb, resend_at;
223 int loop;
224 u8 resend;
225
226 _enter("%d,%d,%d",
227 call->acks_tail, call->acks_unacked, call->acks_head);
228
229 if (call->state >= RXRPC_CALL_COMPLETE)
230 return;
231
232 resend = 0;
233 resend_at = 0;
234
235 for (loop = call->acks_unacked;
236 loop != call->acks_head;
237 loop = (loop + 1) & (call->acks_winsz - 1)
238 ) {
239 p_txb = call->acks_window + loop;
240 smp_read_barrier_depends();
241 txb = (struct sk_buff *) (*p_txb & ~1);
242 sp = rxrpc_skb(txb);
243
244 ASSERT(!(*p_txb & 1));
245 111
246 if (sp->need_resend) { 112 if (ktime_before(ping_at, call->ping_at)) {
247 ; 113 call->ping_at = ping_at;
248 } else if (time_after_eq(jiffies + 1, sp->resend_at)) { 114 rxrpc_set_timer(call, rxrpc_timer_set_for_ping, now);
249 sp->need_resend = true;
250 resend |= 1;
251 } else if (resend & 2) {
252 if (time_before(sp->resend_at, resend_at))
253 resend_at = sp->resend_at;
254 } else {
255 resend_at = sp->resend_at;
256 resend |= 2;
257 } 115 }
258 } 116 }
259
260 rxrpc_set_resend(call, resend, resend_at);
261 _leave("");
262} 117}
263 118
264/* 119/*
265 * process soft ACKs of our transmitted packets 120 * propose an ACK be sent
266 * - these indicate packets the peer has or has not received, but hasn't yet
267 * given to the consumer, and so can still be discarded and re-requested
268 */ 121 */
269static int rxrpc_process_soft_ACKs(struct rxrpc_call *call, 122static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
270 struct rxrpc_ackpacket *ack, 123 u16 skew, u32 serial, bool immediate,
271 struct sk_buff *skb) 124 bool background,
125 enum rxrpc_propose_ack_trace why)
272{ 126{
273 struct rxrpc_skb_priv *sp; 127 enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use;
274 struct sk_buff *txb; 128 unsigned int expiry = rxrpc_soft_ack_delay;
275 unsigned long *p_txb, resend_at; 129 ktime_t now, ack_at;
276 int loop; 130 s8 prior = rxrpc_ack_priority[ack_reason];
277 u8 sacks[RXRPC_MAXACKS], resend;
278
279 _enter("{%d,%d},{%d},",
280 call->acks_hard,
281 CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz),
282 ack->nAcks);
283
284 if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0)
285 goto protocol_error;
286
287 resend = 0;
288 resend_at = 0;
289 for (loop = 0; loop < ack->nAcks; loop++) {
290 p_txb = call->acks_window;
291 p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1);
292 smp_read_barrier_depends();
293 txb = (struct sk_buff *) (*p_txb & ~1);
294 sp = rxrpc_skb(txb);
295
296 switch (sacks[loop]) {
297 case RXRPC_ACK_TYPE_ACK:
298 sp->need_resend = false;
299 *p_txb |= 1;
300 break;
301 case RXRPC_ACK_TYPE_NACK:
302 sp->need_resend = true;
303 *p_txb &= ~1;
304 resend = 1;
305 break;
306 default:
307 _debug("Unsupported ACK type %d", sacks[loop]);
308 goto protocol_error;
309 }
310 }
311
312 smp_mb();
313 call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1);
314
315 /* anything not explicitly ACK'd is implicitly NACK'd, but may just not
316 * have been received or processed yet by the far end */
317 for (loop = call->acks_unacked;
318 loop != call->acks_head;
319 loop = (loop + 1) & (call->acks_winsz - 1)
320 ) {
321 p_txb = call->acks_window + loop;
322 smp_read_barrier_depends();
323 txb = (struct sk_buff *) (*p_txb & ~1);
324 sp = rxrpc_skb(txb);
325 131
326 if (*p_txb & 1) { 132 /* Pings are handled specially because we don't want to accidentally
327 /* packet must have been discarded */ 133 * lose a ping response by subsuming it into a ping.
328 sp->need_resend = true; 134 */
329 *p_txb &= ~1; 135 if (ack_reason == RXRPC_ACK_PING) {
330 resend |= 1; 136 rxrpc_propose_ping(call, immediate, background);
331 } else if (sp->need_resend) { 137 goto trace;
332 ; 138 }
333 } else if (time_after_eq(jiffies + 1, sp->resend_at)) { 139
334 sp->need_resend = true; 140 /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial
335 resend |= 1; 141 * numbers, but we don't alter the timeout.
336 } else if (resend & 2) { 142 */
337 if (time_before(sp->resend_at, resend_at)) 143 _debug("prior %u %u vs %u %u",
338 resend_at = sp->resend_at; 144 ack_reason, prior,
339 } else { 145 call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]);
340 resend_at = sp->resend_at; 146 if (ack_reason == call->ackr_reason) {
341 resend |= 2; 147 if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) {
148 outcome = rxrpc_propose_ack_update;
149 call->ackr_serial = serial;
150 call->ackr_skew = skew;
342 } 151 }
152 if (!immediate)
153 goto trace;
154 } else if (prior > rxrpc_ack_priority[call->ackr_reason]) {
155 call->ackr_reason = ack_reason;
156 call->ackr_serial = serial;
157 call->ackr_skew = skew;
158 } else {
159 outcome = rxrpc_propose_ack_subsume;
343 } 160 }
344 161
345 rxrpc_set_resend(call, resend, resend_at); 162 switch (ack_reason) {
346 _leave(" = 0"); 163 case RXRPC_ACK_REQUESTED:
347 return 0; 164 if (rxrpc_requested_ack_delay < expiry)
348 165 expiry = rxrpc_requested_ack_delay;
349protocol_error: 166 if (serial == 1)
350 _leave(" = -EPROTO"); 167 immediate = false;
351 return -EPROTO; 168 break;
352}
353
354/*
355 * discard hard-ACK'd packets from the Tx window
356 */
357static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard)
358{
359 unsigned long _skb;
360 int tail = call->acks_tail, old_tail;
361 int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz);
362 169
363 _enter("{%u,%u},%u", call->acks_hard, win, hard); 170 case RXRPC_ACK_DELAY:
171 if (rxrpc_soft_ack_delay < expiry)
172 expiry = rxrpc_soft_ack_delay;
173 break;
364 174
365 ASSERTCMP(hard - call->acks_hard, <=, win); 175 case RXRPC_ACK_IDLE:
176 if (rxrpc_idle_ack_delay < expiry)
177 expiry = rxrpc_idle_ack_delay;
178 break;
366 179
367 while (call->acks_hard < hard) { 180 default:
368 smp_read_barrier_depends(); 181 immediate = true;
369 _skb = call->acks_window[tail] & ~1; 182 break;
370 rxrpc_free_skb((struct sk_buff *) _skb);
371 old_tail = tail;
372 tail = (tail + 1) & (call->acks_winsz - 1);
373 call->acks_tail = tail;
374 if (call->acks_unacked == old_tail)
375 call->acks_unacked = tail;
376 call->acks_hard++;
377 } 183 }
378 184
379 wake_up(&call->tx_waitq); 185 if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) {
380} 186 _debug("already scheduled");
381 187 } else if (immediate || expiry == 0) {
382/* 188 _debug("immediate ACK %lx", call->events);
383 * clear the Tx window in the event of a failure 189 if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events) &&
384 */ 190 background)
385static void rxrpc_clear_tx_window(struct rxrpc_call *call) 191 rxrpc_queue_call(call);
386{ 192 } else {
387 rxrpc_rotate_tx_window(call, atomic_read(&call->sequence)); 193 now = ktime_get_real();
388} 194 ack_at = ktime_add_ms(now, expiry);
389 195 if (ktime_before(ack_at, call->ack_at)) {
390/* 196 call->ack_at = ack_at;
391 * drain the out of sequence received packet queue into the packet Rx queue 197 rxrpc_set_timer(call, rxrpc_timer_set_for_ack, now);
392 */
393static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
394{
395 struct rxrpc_skb_priv *sp;
396 struct sk_buff *skb;
397 bool terminal;
398 int ret;
399
400 _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos);
401
402 spin_lock_bh(&call->lock);
403
404 ret = -ECONNRESET;
405 if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
406 goto socket_unavailable;
407
408 skb = skb_dequeue(&call->rx_oos_queue);
409 if (skb) {
410 sp = rxrpc_skb(skb);
411
412 _debug("drain OOS packet %d [%d]",
413 sp->hdr.seq, call->rx_first_oos);
414
415 if (sp->hdr.seq != call->rx_first_oos) {
416 skb_queue_head(&call->rx_oos_queue, skb);
417 call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
418 _debug("requeue %p {%u}", skb, call->rx_first_oos);
419 } else {
420 skb->mark = RXRPC_SKB_MARK_DATA;
421 terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) &&
422 !(sp->hdr.flags & RXRPC_CLIENT_INITIATED));
423 ret = rxrpc_queue_rcv_skb(call, skb, true, terminal);
424 BUG_ON(ret < 0);
425 _debug("drain #%u", call->rx_data_post);
426 call->rx_data_post++;
427
428 /* find out what the next packet is */
429 skb = skb_peek(&call->rx_oos_queue);
430 if (skb)
431 call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
432 else
433 call->rx_first_oos = 0;
434 _debug("peek %p {%u}", skb, call->rx_first_oos);
435 } 198 }
436 } 199 }
437 200
438 ret = 0; 201trace:
439socket_unavailable: 202 trace_rxrpc_propose_ack(call, why, ack_reason, serial, immediate,
440 spin_unlock_bh(&call->lock); 203 background, outcome);
441 _leave(" = %d", ret);
442 return ret;
443} 204}
444 205
445/* 206/*
446 * insert an out of sequence packet into the buffer 207 * propose an ACK be sent, locking the call structure
447 */ 208 */
448static void rxrpc_insert_oos_packet(struct rxrpc_call *call, 209void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
449 struct sk_buff *skb) 210 u16 skew, u32 serial, bool immediate, bool background,
211 enum rxrpc_propose_ack_trace why)
450{ 212{
451 struct rxrpc_skb_priv *sp, *psp;
452 struct sk_buff *p;
453 u32 seq;
454
455 sp = rxrpc_skb(skb);
456 seq = sp->hdr.seq;
457 _enter(",,{%u}", seq);
458
459 skb->destructor = rxrpc_packet_destructor;
460 ASSERTCMP(sp->call, ==, NULL);
461 sp->call = call;
462 rxrpc_get_call(call);
463 atomic_inc(&call->skb_count);
464
465 /* insert into the buffer in sequence order */
466 spin_lock_bh(&call->lock); 213 spin_lock_bh(&call->lock);
467 214 __rxrpc_propose_ACK(call, ack_reason, skew, serial,
468 skb_queue_walk(&call->rx_oos_queue, p) { 215 immediate, background, why);
469 psp = rxrpc_skb(p);
470 if (psp->hdr.seq > seq) {
471 _debug("insert oos #%u before #%u", seq, psp->hdr.seq);
472 skb_insert(p, skb, &call->rx_oos_queue);
473 goto inserted;
474 }
475 }
476
477 _debug("append oos #%u", seq);
478 skb_queue_tail(&call->rx_oos_queue, skb);
479inserted:
480
481 /* we might now have a new front to the queue */
482 if (call->rx_first_oos == 0 || seq < call->rx_first_oos)
483 call->rx_first_oos = seq;
484
485 read_lock(&call->state_lock);
486 if (call->state < RXRPC_CALL_COMPLETE &&
487 call->rx_data_post == call->rx_first_oos) {
488 _debug("drain rx oos now");
489 set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
490 }
491 read_unlock(&call->state_lock);
492
493 spin_unlock_bh(&call->lock); 216 spin_unlock_bh(&call->lock);
494 _leave(" [stored #%u]", call->rx_first_oos);
495} 217}
496 218
497/* 219/*
498 * clear the Tx window on final ACK reception 220 * Handle congestion being detected by the retransmit timeout.
499 */ 221 */
500static void rxrpc_zap_tx_window(struct rxrpc_call *call) 222static void rxrpc_congestion_timeout(struct rxrpc_call *call)
501{ 223{
502 struct rxrpc_skb_priv *sp; 224 set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags);
503 struct sk_buff *skb;
504 unsigned long _skb, *acks_window;
505 u8 winsz = call->acks_winsz;
506 int tail;
507
508 acks_window = call->acks_window;
509 call->acks_window = NULL;
510
511 while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) {
512 tail = call->acks_tail;
513 smp_read_barrier_depends();
514 _skb = acks_window[tail] & ~1;
515 smp_mb();
516 call->acks_tail = (call->acks_tail + 1) & (winsz - 1);
517
518 skb = (struct sk_buff *) _skb;
519 sp = rxrpc_skb(skb);
520 _debug("+++ clear Tx %u", sp->hdr.seq);
521 rxrpc_free_skb(skb);
522 }
523
524 kfree(acks_window);
525}
526
527/*
528 * process the extra information that may be appended to an ACK packet
529 */
530static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
531 unsigned int latest, int nAcks)
532{
533 struct rxrpc_ackinfo ackinfo;
534 struct rxrpc_peer *peer;
535 unsigned int mtu;
536
537 if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) {
538 _leave(" [no ackinfo]");
539 return;
540 }
541
542 _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
543 latest,
544 ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU),
545 ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max));
546
547 mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU));
548
549 peer = call->conn->params.peer;
550 if (mtu < peer->maxdata) {
551 spin_lock_bh(&peer->lock);
552 peer->maxdata = mtu;
553 peer->mtu = mtu + peer->hdrsize;
554 spin_unlock_bh(&peer->lock);
555 _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
556 }
557} 225}
558 226
559/* 227/*
560 * process packets in the reception queue 228 * Perform retransmission of NAK'd and unack'd packets.
561 */ 229 */
562static int rxrpc_process_rx_queue(struct rxrpc_call *call, 230static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
563 u32 *_abort_code)
564{ 231{
565 struct rxrpc_ackpacket ack;
566 struct rxrpc_skb_priv *sp; 232 struct rxrpc_skb_priv *sp;
567 struct sk_buff *skb; 233 struct sk_buff *skb;
568 bool post_ACK; 234 rxrpc_seq_t cursor, seq, top;
569 int latest; 235 ktime_t max_age, oldest, ack_ts;
570 u32 hard, tx; 236 int ix;
237 u8 annotation, anno_type, retrans = 0, unacked = 0;
571 238
572 _enter(""); 239 _enter("{%d,%d}", call->tx_hard_ack, call->tx_top);
573 240
574process_further: 241 max_age = ktime_sub_ms(now, rxrpc_resend_timeout);
575 skb = skb_dequeue(&call->rx_queue);
576 if (!skb)
577 return -EAGAIN;
578 242
579 _net("deferred skb %p", skb); 243 spin_lock_bh(&call->lock);
580
581 sp = rxrpc_skb(skb);
582
583 _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state);
584
585 post_ACK = false;
586
587 switch (sp->hdr.type) {
588 /* data packets that wind up here have been received out of
589 * order, need security processing or are jumbo packets */
590 case RXRPC_PACKET_TYPE_DATA:
591 _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
592
593 /* secured packets must be verified and possibly decrypted */
594 if (call->conn->security->verify_packet(call, skb,
595 _abort_code) < 0)
596 goto protocol_error;
597
598 rxrpc_insert_oos_packet(call, skb);
599 goto process_further;
600
601 /* partial ACK to process */
602 case RXRPC_PACKET_TYPE_ACK:
603 if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) {
604 _debug("extraction failure");
605 goto protocol_error;
606 }
607 if (!skb_pull(skb, sizeof(ack)))
608 BUG();
609
610 latest = sp->hdr.serial;
611 hard = ntohl(ack.firstPacket);
612 tx = atomic_read(&call->sequence);
613
614 _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
615 latest,
616 ntohs(ack.maxSkew),
617 hard,
618 ntohl(ack.previousPacket),
619 ntohl(ack.serial),
620 rxrpc_acks(ack.reason),
621 ack.nAcks);
622
623 rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks);
624
625 if (ack.reason == RXRPC_ACK_PING) {
626 _proto("Rx ACK %%%u PING Request", latest);
627 rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
628 sp->hdr.serial, true);
629 }
630
631 /* discard any out-of-order or duplicate ACKs */
632 if (latest - call->acks_latest <= 0) {
633 _debug("discard ACK %d <= %d",
634 latest, call->acks_latest);
635 goto discard;
636 }
637 call->acks_latest = latest;
638
639 if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
640 call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY &&
641 call->state != RXRPC_CALL_SERVER_SEND_REPLY &&
642 call->state != RXRPC_CALL_SERVER_AWAIT_ACK)
643 goto discard;
644
645 _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state);
646
647 if (hard > 0) {
648 if (hard - 1 > tx) {
649 _debug("hard-ACK'd packet %d not transmitted"
650 " (%d top)",
651 hard - 1, tx);
652 goto protocol_error;
653 }
654 244
655 if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || 245 cursor = call->tx_hard_ack;
656 call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && 246 top = call->tx_top;
657 hard > tx) { 247 ASSERT(before_eq(cursor, top));
658 call->acks_hard = tx; 248 if (cursor == top)
659 goto all_acked; 249 goto out_unlock;
660 } 250
251 /* Scan the packet list without dropping the lock and decide which of
252 * the packets in the Tx buffer we're going to resend and what the new
253 * resend timeout will be.
254 */
255 oldest = now;
256 for (seq = cursor + 1; before_eq(seq, top); seq++) {
257 ix = seq & RXRPC_RXTX_BUFF_MASK;
258 annotation = call->rxtx_annotations[ix];
259 anno_type = annotation & RXRPC_TX_ANNO_MASK;
260 annotation &= ~RXRPC_TX_ANNO_MASK;
261 if (anno_type == RXRPC_TX_ANNO_ACK)
262 continue;
661 263
662 smp_rmb(); 264 skb = call->rxtx_buffer[ix];
663 rxrpc_rotate_tx_window(call, hard - 1); 265 rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
664 } 266 sp = rxrpc_skb(skb);
665 267
666 if (ack.nAcks > 0) { 268 if (anno_type == RXRPC_TX_ANNO_UNACK) {
667 if (hard - 1 + ack.nAcks > tx) { 269 if (ktime_after(skb->tstamp, max_age)) {
668 _debug("soft-ACK'd packet %d+%d not" 270 if (ktime_before(skb->tstamp, oldest))
669 " transmitted (%d top)", 271 oldest = skb->tstamp;
670 hard - 1, ack.nAcks, tx); 272 continue;
671 goto protocol_error;
672 } 273 }
673 274 if (!(annotation & RXRPC_TX_ANNO_RESENT))
674 if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0) 275 unacked++;
675 goto protocol_error;
676 } 276 }
677 goto discard;
678
679 /* complete ACK to process */
680 case RXRPC_PACKET_TYPE_ACKALL:
681 goto all_acked;
682
683 /* abort and busy are handled elsewhere */
684 case RXRPC_PACKET_TYPE_BUSY:
685 case RXRPC_PACKET_TYPE_ABORT:
686 BUG();
687
688 /* connection level events - also handled elsewhere */
689 case RXRPC_PACKET_TYPE_CHALLENGE:
690 case RXRPC_PACKET_TYPE_RESPONSE:
691 case RXRPC_PACKET_TYPE_DEBUG:
692 BUG();
693 }
694
695 /* if we've had a hard ACK that covers all the packets we've sent, then
696 * that ends that phase of the operation */
697all_acked:
698 write_lock_bh(&call->state_lock);
699 _debug("ack all %d", call->state);
700 277
701 switch (call->state) { 278 /* Okay, we need to retransmit a packet. */
702 case RXRPC_CALL_CLIENT_AWAIT_REPLY: 279 call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation;
703 call->state = RXRPC_CALL_CLIENT_RECV_REPLY; 280 retrans++;
704 break; 281 trace_rxrpc_retransmit(call, seq, annotation | anno_type,
705 case RXRPC_CALL_SERVER_AWAIT_ACK: 282 ktime_to_ns(ktime_sub(skb->tstamp, max_age)));
706 _debug("srv complete");
707 call->state = RXRPC_CALL_COMPLETE;
708 post_ACK = true;
709 break;
710 case RXRPC_CALL_CLIENT_SEND_REQUEST:
711 case RXRPC_CALL_SERVER_RECV_REQUEST:
712 goto protocol_error_unlock; /* can't occur yet */
713 default:
714 write_unlock_bh(&call->state_lock);
715 goto discard; /* assume packet left over from earlier phase */
716 } 283 }
717 284
718 write_unlock_bh(&call->state_lock); 285 call->resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout);
719
720 /* if all the packets we sent are hard-ACK'd, then we can discard
721 * whatever we've got left */
722 _debug("clear Tx %d",
723 CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
724
725 del_timer_sync(&call->resend_timer);
726 clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
727 clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
728 286
729 if (call->acks_window) 287 if (unacked)
730 rxrpc_zap_tx_window(call); 288 rxrpc_congestion_timeout(call);
731 289
732 if (post_ACK) { 290 /* If there was nothing that needed retransmission then it's likely
733 /* post the final ACK message for userspace to pick up */ 291 * that an ACK got lost somewhere. Send a ping to find out instead of
734 _debug("post ACK"); 292 * retransmitting data.
735 skb->mark = RXRPC_SKB_MARK_FINAL_ACK; 293 */
736 sp->call = call; 294 if (!retrans) {
737 rxrpc_get_call(call); 295 rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
738 atomic_inc(&call->skb_count);
739 spin_lock_bh(&call->lock);
740 if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0)
741 BUG();
742 spin_unlock_bh(&call->lock); 296 spin_unlock_bh(&call->lock);
743 goto process_further; 297 ack_ts = ktime_sub(now, call->acks_latest_ts);
744 } 298 if (ktime_to_ns(ack_ts) < call->peer->rtt)
745 299 goto out;
746 discard: 300 rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false,
747 rxrpc_free_skb(skb); 301 rxrpc_propose_ack_ping_for_lost_ack);
748 goto process_further; 302 rxrpc_send_ack_packet(call, true);
749 303 goto out;
750 protocol_error_unlock: 304 }
751 write_unlock_bh(&call->state_lock); 305
752 protocol_error: 306 /* Now go through the Tx window and perform the retransmissions. We
753 rxrpc_free_skb(skb); 307 * have to drop the lock for each send. If an ACK comes in whilst the
754 _leave(" = -EPROTO"); 308 * lock is dropped, it may clear some of the retransmission markers for
755 return -EPROTO; 309 * packets that it soft-ACKs.
756} 310 */
757 311 for (seq = cursor + 1; before_eq(seq, top); seq++) {
758/* 312 ix = seq & RXRPC_RXTX_BUFF_MASK;
759 * post a message to the socket Rx queue for recvmsg() to pick up 313 annotation = call->rxtx_annotations[ix];
760 */ 314 anno_type = annotation & RXRPC_TX_ANNO_MASK;
761static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, 315 if (anno_type != RXRPC_TX_ANNO_RETRANS)
762 bool fatal) 316 continue;
763{
764 struct rxrpc_skb_priv *sp;
765 struct sk_buff *skb;
766 int ret;
767
768 _enter("{%d,%lx},%u,%u,%d",
769 call->debug_id, call->flags, mark, error, fatal);
770
771 /* remove timers and things for fatal messages */
772 if (fatal) {
773 del_timer_sync(&call->resend_timer);
774 del_timer_sync(&call->ack_timer);
775 clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
776 }
777 317
778 if (mark != RXRPC_SKB_MARK_NEW_CALL && 318 skb = call->rxtx_buffer[ix];
779 !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { 319 rxrpc_get_skb(skb, rxrpc_skb_tx_got);
780 _leave("[no userid]"); 320 spin_unlock_bh(&call->lock);
781 return 0;
782 }
783 321
784 if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { 322 if (rxrpc_send_data_packet(call, skb, true) < 0) {
785 skb = alloc_skb(0, GFP_NOFS); 323 rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
786 if (!skb) 324 return;
787 return -ENOMEM; 325 }
788 326
789 rxrpc_new_skb(skb); 327 if (rxrpc_is_client_call(call))
328 rxrpc_expose_client_call(call);
790 329
791 skb->mark = mark; 330 rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
331 spin_lock_bh(&call->lock);
792 332
793 sp = rxrpc_skb(skb); 333 /* We need to clear the retransmit state, but there are two
794 memset(sp, 0, sizeof(*sp)); 334 * things we need to be aware of: A new ACK/NAK might have been
795 sp->error = error; 335 * received and the packet might have been hard-ACK'd (in which
796 sp->call = call; 336 * case it will no longer be in the buffer).
797 rxrpc_get_call(call); 337 */
798 atomic_inc(&call->skb_count); 338 if (after(seq, call->tx_hard_ack)) {
339 annotation = call->rxtx_annotations[ix];
340 anno_type = annotation & RXRPC_TX_ANNO_MASK;
341 if (anno_type == RXRPC_TX_ANNO_RETRANS ||
342 anno_type == RXRPC_TX_ANNO_NAK) {
343 annotation &= ~RXRPC_TX_ANNO_MASK;
344 annotation |= RXRPC_TX_ANNO_UNACK;
345 }
346 annotation |= RXRPC_TX_ANNO_RESENT;
347 call->rxtx_annotations[ix] = annotation;
348 }
799 349
800 spin_lock_bh(&call->lock); 350 if (after(call->tx_hard_ack, seq))
801 ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); 351 seq = call->tx_hard_ack;
802 spin_unlock_bh(&call->lock);
803 BUG_ON(ret < 0);
804 } 352 }
805 353
806 return 0; 354 out_unlock:
355 spin_unlock_bh(&call->lock);
356 out:
357 _leave("");
807} 358}
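Aside on the new rxrpc_resend() shown above (right-hand column): when its scan finds nothing to retransmit, it assumes an ACK was lost and pings the peer instead, but only if the last ACK it saw is older than the peer's measured RTT. A minimal sketch of that decision follows; the function and parameter names are illustrative only, and int64_t nanoseconds stand in for the kernel's ktime_t.

/* Sketch of the "ping instead of retransmit" decision described in the
 * comments above.  Names and types are assumptions for the example.
 */
#include <stdbool.h>
#include <stdint.h>

static bool should_ping_for_lost_ack(int retrans, int64_t now_ns,
				     int64_t last_ack_ns, int64_t peer_rtt_ns)
{
	if (retrans > 0)
		return false;		/* data is being resent anyway */
	if (now_ns - last_ack_ns < peer_rtt_ns)
		return false;		/* an ACK may still be in flight */
	return true;			/* probe the peer with a ping ACK */
}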
808 359
809/* 360/*
810 * handle background processing of incoming call packets and ACK / abort 361 * Handle retransmission and deferred ACK/abort generation.
811 * generation
812 */ 362 */
813void rxrpc_process_call(struct work_struct *work) 363void rxrpc_process_call(struct work_struct *work)
814{ 364{
815 struct rxrpc_call *call = 365 struct rxrpc_call *call =
816 container_of(work, struct rxrpc_call, processor); 366 container_of(work, struct rxrpc_call, processor);
817 struct rxrpc_wire_header whdr; 367 ktime_t now;
818 struct rxrpc_ackpacket ack;
819 struct rxrpc_ackinfo ackinfo;
820 struct msghdr msg;
821 struct kvec iov[5];
822 enum rxrpc_call_event genbit;
823 unsigned long bits;
824 __be32 data, pad;
825 size_t len;
826 int loop, nbit, ioc, ret, mtu;
827 u32 serial, abort_code = RX_PROTOCOL_ERROR;
828 u8 *acks = NULL;
829
830 //printk("\n--------------------\n");
831 _enter("{%d,%s,%lx} [%lu]",
832 call->debug_id, rxrpc_call_states[call->state], call->events,
833 (jiffies - call->creation_jif) / (HZ / 10));
834
835 if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) {
836 _debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX");
837 return;
838 }
839
840 if (!call->conn)
841 goto skip_msg_init;
842
843 /* there's a good chance we're going to have to send a message, so set
844 * one up in advance */
845 msg.msg_name = &call->conn->params.peer->srx.transport;
846 msg.msg_namelen = call->conn->params.peer->srx.transport_len;
847 msg.msg_control = NULL;
848 msg.msg_controllen = 0;
849 msg.msg_flags = 0;
850
851 whdr.epoch = htonl(call->conn->proto.epoch);
852 whdr.cid = htonl(call->cid);
853 whdr.callNumber = htonl(call->call_id);
854 whdr.seq = 0;
855 whdr.type = RXRPC_PACKET_TYPE_ACK;
856 whdr.flags = call->conn->out_clientflag;
857 whdr.userStatus = 0;
858 whdr.securityIndex = call->conn->security_ix;
859 whdr._rsvd = 0;
860 whdr.serviceId = htons(call->service_id);
861
862 memset(iov, 0, sizeof(iov));
863 iov[0].iov_base = &whdr;
864 iov[0].iov_len = sizeof(whdr);
865 skip_msg_init:
866
867 /* deal with events of a final nature */
868 if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) {
869 enum rxrpc_skb_mark mark;
870 int error;
871 368
872 clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); 369 rxrpc_see_call(call);
873 clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
874 clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
875
876 error = call->error_report;
877 if (error < RXRPC_LOCAL_ERROR_OFFSET) {
878 mark = RXRPC_SKB_MARK_NET_ERROR;
879 _debug("post net error %d", error);
880 } else {
881 mark = RXRPC_SKB_MARK_LOCAL_ERROR;
882 error -= RXRPC_LOCAL_ERROR_OFFSET;
883 _debug("post net local error %d", error);
884 }
885
886 if (rxrpc_post_message(call, mark, error, true) < 0)
887 goto no_mem;
888 clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
889 goto kill_ACKs;
890 }
891
892 if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) {
893 ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
894
895 clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
896 clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
897
898 _debug("post conn abort");
899
900 if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
901 call->conn->error, true) < 0)
902 goto no_mem;
903 clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
904 goto kill_ACKs;
905 }
906 370
907 if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) { 371 //printk("\n--------------------\n");
908 whdr.type = RXRPC_PACKET_TYPE_BUSY; 372 _enter("{%d,%s,%lx}",
909 genbit = RXRPC_CALL_EV_REJECT_BUSY; 373 call->debug_id, rxrpc_call_states[call->state], call->events);
910 goto send_message;
911 }
912
913 if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
914 ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
915
916 if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
917 ECONNABORTED, true) < 0)
918 goto no_mem;
919 whdr.type = RXRPC_PACKET_TYPE_ABORT;
920 data = htonl(call->local_abort);
921 iov[1].iov_base = &data;
922 iov[1].iov_len = sizeof(data);
923 genbit = RXRPC_CALL_EV_ABORT;
924 goto send_message;
925 }
926
927 if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) {
928 genbit = RXRPC_CALL_EV_ACK_FINAL;
929
930 ack.bufferSpace = htons(8);
931 ack.maxSkew = 0;
932 ack.serial = 0;
933 ack.reason = RXRPC_ACK_IDLE;
934 ack.nAcks = 0;
935 call->ackr_reason = 0;
936
937 spin_lock_bh(&call->lock);
938 ack.serial = htonl(call->ackr_serial);
939 ack.previousPacket = htonl(call->ackr_prev_seq);
940 ack.firstPacket = htonl(call->rx_data_eaten + 1);
941 spin_unlock_bh(&call->lock);
942
943 pad = 0;
944
945 iov[1].iov_base = &ack;
946 iov[1].iov_len = sizeof(ack);
947 iov[2].iov_base = &pad;
948 iov[2].iov_len = 3;
949 iov[3].iov_base = &ackinfo;
950 iov[3].iov_len = sizeof(ackinfo);
951 goto send_ACK;
952 }
953
954 if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) |
955 (1 << RXRPC_CALL_EV_RCVD_ABORT))
956 ) {
957 u32 mark;
958
959 if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events))
960 mark = RXRPC_SKB_MARK_REMOTE_ABORT;
961 else
962 mark = RXRPC_SKB_MARK_BUSY;
963
964 _debug("post abort/busy");
965 rxrpc_clear_tx_window(call);
966 if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0)
967 goto no_mem;
968
969 clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
970 clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
971 goto kill_ACKs;
972 }
973
974 if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) {
975 _debug("do implicit ackall");
976 rxrpc_clear_tx_window(call);
977 }
978
979 if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) {
980 write_lock_bh(&call->state_lock);
981 if (call->state <= RXRPC_CALL_COMPLETE) {
982 call->state = RXRPC_CALL_LOCALLY_ABORTED;
983 call->local_abort = RX_CALL_TIMEOUT;
984 set_bit(RXRPC_CALL_EV_ABORT, &call->events);
985 }
986 write_unlock_bh(&call->state_lock);
987
988 _debug("post timeout");
989 if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
990 ETIME, true) < 0)
991 goto no_mem;
992 374
993 clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); 375 recheck_state:
994 goto kill_ACKs; 376 if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
377 rxrpc_send_abort_packet(call);
378 goto recheck_state;
995 } 379 }
996 380
997 /* deal with assorted inbound messages */ 381 if (call->state == RXRPC_CALL_COMPLETE) {
998 if (!skb_queue_empty(&call->rx_queue)) { 382 del_timer_sync(&call->timer);
999 switch (rxrpc_process_rx_queue(call, &abort_code)) { 383 rxrpc_notify_socket(call);
1000 case 0: 384 goto out_put;
1001 case -EAGAIN:
1002 break;
1003 case -ENOMEM:
1004 goto no_mem;
1005 case -EKEYEXPIRED:
1006 case -EKEYREJECTED:
1007 case -EPROTO:
1008 rxrpc_abort_call(call, abort_code);
1009 goto kill_ACKs;
1010 }
1011 } 385 }
1012 386
1013 /* handle resending */ 387 now = ktime_get_real();
1014 if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) 388 if (ktime_before(call->expire_at, now)) {
1015 rxrpc_resend_timer(call); 389 rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME);
1016 if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) 390 set_bit(RXRPC_CALL_EV_ABORT, &call->events);
1017 rxrpc_resend(call); 391 goto recheck_state;
1018
1019 /* consider sending an ordinary ACK */
1020 if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) {
1021 _debug("send ACK: window: %d - %d { %lx }",
1022 call->rx_data_eaten, call->ackr_win_top,
1023 call->ackr_window[0]);
1024
1025 if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST &&
1026 call->ackr_reason != RXRPC_ACK_PING_RESPONSE) {
1027 /* ACK by sending reply DATA packet in this state */
1028 clear_bit(RXRPC_CALL_EV_ACK, &call->events);
1029 goto maybe_reschedule;
1030 }
1031
1032 genbit = RXRPC_CALL_EV_ACK;
1033
1034 acks = kzalloc(call->ackr_win_top - call->rx_data_eaten,
1035 GFP_NOFS);
1036 if (!acks)
1037 goto no_mem;
1038
1039 //hdr.flags = RXRPC_SLOW_START_OK;
1040 ack.bufferSpace = htons(8);
1041 ack.maxSkew = 0;
1042
1043 spin_lock_bh(&call->lock);
1044 ack.reason = call->ackr_reason;
1045 ack.serial = htonl(call->ackr_serial);
1046 ack.previousPacket = htonl(call->ackr_prev_seq);
1047 ack.firstPacket = htonl(call->rx_data_eaten + 1);
1048
1049 ack.nAcks = 0;
1050 for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
1051 nbit = loop * BITS_PER_LONG;
1052 for (bits = call->ackr_window[loop]; bits; bits >>= 1
1053 ) {
1054 _debug("- l=%d n=%d b=%lx", loop, nbit, bits);
1055 if (bits & 1) {
1056 acks[nbit] = RXRPC_ACK_TYPE_ACK;
1057 ack.nAcks = nbit + 1;
1058 }
1059 nbit++;
1060 }
1061 }
1062 call->ackr_reason = 0;
1063 spin_unlock_bh(&call->lock);
1064
1065 pad = 0;
1066
1067 iov[1].iov_base = &ack;
1068 iov[1].iov_len = sizeof(ack);
1069 iov[2].iov_base = acks;
1070 iov[2].iov_len = ack.nAcks;
1071 iov[3].iov_base = &pad;
1072 iov[3].iov_len = 3;
1073 iov[4].iov_base = &ackinfo;
1074 iov[4].iov_len = sizeof(ackinfo);
1075
1076 switch (ack.reason) {
1077 case RXRPC_ACK_REQUESTED:
1078 case RXRPC_ACK_DUPLICATE:
1079 case RXRPC_ACK_OUT_OF_SEQUENCE:
1080 case RXRPC_ACK_EXCEEDS_WINDOW:
1081 case RXRPC_ACK_NOSPACE:
1082 case RXRPC_ACK_PING:
1083 case RXRPC_ACK_PING_RESPONSE:
1084 goto send_ACK_with_skew;
1085 case RXRPC_ACK_DELAY:
1086 case RXRPC_ACK_IDLE:
1087 goto send_ACK;
1088 }
1089 } 392 }
1090 393
1091 /* handle completion of security negotiations on an incoming 394 if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events)) {
1092 * connection */ 395 if (call->ackr_reason) {
1093 if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) { 396 rxrpc_send_ack_packet(call, false);
1094 _debug("secured"); 397 goto recheck_state;
1095 spin_lock_bh(&call->lock);
1096
1097 if (call->state == RXRPC_CALL_SERVER_SECURING) {
1098 _debug("securing");
1099 write_lock(&call->socket->call_lock);
1100 if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
1101 !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
1102 _debug("not released");
1103 call->state = RXRPC_CALL_SERVER_ACCEPTING;
1104 list_move_tail(&call->accept_link,
1105 &call->socket->acceptq);
1106 }
1107 write_unlock(&call->socket->call_lock);
1108 read_lock(&call->state_lock);
1109 if (call->state < RXRPC_CALL_COMPLETE)
1110 set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
1111 read_unlock(&call->state_lock);
1112 } 398 }
1113
1114 spin_unlock_bh(&call->lock);
1115 if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events))
1116 goto maybe_reschedule;
1117 } 399 }
1118 400
1119 /* post a notification of an acceptable connection to the app */ 401 if (test_and_clear_bit(RXRPC_CALL_EV_PING, &call->events)) {
1120 if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) { 402 rxrpc_send_ack_packet(call, true);
1121 _debug("post accept"); 403 goto recheck_state;
1122 if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL,
1123 0, false) < 0)
1124 goto no_mem;
1125 clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
1126 goto maybe_reschedule;
1127 } 404 }
1128 405
1129 /* handle incoming call acceptance */ 406 if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) {
1130 if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) { 407 rxrpc_resend(call, now);
1131 _debug("accepted"); 408 goto recheck_state;
1132 ASSERTCMP(call->rx_data_post, ==, 0);
1133 call->rx_data_post = 1;
1134 read_lock_bh(&call->state_lock);
1135 if (call->state < RXRPC_CALL_COMPLETE)
1136 set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
1137 read_unlock_bh(&call->state_lock);
1138 } 409 }
1139 410
1140 /* drain the out of sequence received packet queue into the packet Rx 411 rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
1141 * queue */
1142 if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) {
1143 while (call->rx_data_post == call->rx_first_oos)
1144 if (rxrpc_drain_rx_oos_queue(call) < 0)
1145 break;
1146 goto maybe_reschedule;
1147 }
1148
1149 if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
1150 rxrpc_release_call(call);
1151 clear_bit(RXRPC_CALL_EV_RELEASE, &call->events);
1152 }
1153 412
1154 /* other events may have been raised since we started checking */ 413 /* other events may have been raised since we started checking */
1155 goto maybe_reschedule; 414 if (call->events && call->state < RXRPC_CALL_COMPLETE) {
1156 415 __rxrpc_queue_call(call);
1157 send_ACK_with_skew:
1158 ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) -
1159 ntohl(ack.serial));
1160 send_ACK:
1161 mtu = call->conn->params.peer->if_mtu;
1162 mtu -= call->conn->params.peer->hdrsize;
1163 ackinfo.maxMTU = htonl(mtu);
1164 ackinfo.rwind = htonl(rxrpc_rx_window_size);
1165
1166 /* permit the peer to send us jumbo packets if it wants to */
1167 ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
1168 ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max);
1169
1170 serial = atomic_inc_return(&call->conn->serial);
1171 whdr.serial = htonl(serial);
1172 _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
1173 serial,
1174 ntohs(ack.maxSkew),
1175 ntohl(ack.firstPacket),
1176 ntohl(ack.previousPacket),
1177 ntohl(ack.serial),
1178 rxrpc_acks(ack.reason),
1179 ack.nAcks);
1180
1181 del_timer_sync(&call->ack_timer);
1182 if (ack.nAcks > 0)
1183 set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags);
1184 goto send_message_2;
1185
1186 send_message:
1187 _debug("send message");
1188
1189 serial = atomic_inc_return(&call->conn->serial);
1190 whdr.serial = htonl(serial);
1191 _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial);
1192 send_message_2:
1193
1194 len = iov[0].iov_len;
1195 ioc = 1;
1196 if (iov[4].iov_len) {
1197 ioc = 5;
1198 len += iov[4].iov_len;
1199 len += iov[3].iov_len;
1200 len += iov[2].iov_len;
1201 len += iov[1].iov_len;
1202 } else if (iov[3].iov_len) {
1203 ioc = 4;
1204 len += iov[3].iov_len;
1205 len += iov[2].iov_len;
1206 len += iov[1].iov_len;
1207 } else if (iov[2].iov_len) {
1208 ioc = 3;
1209 len += iov[2].iov_len;
1210 len += iov[1].iov_len;
1211 } else if (iov[1].iov_len) {
1212 ioc = 2;
1213 len += iov[1].iov_len;
1214 }
1215
1216 ret = kernel_sendmsg(call->conn->params.local->socket,
1217 &msg, iov, ioc, len);
1218 if (ret < 0) {
1219 _debug("sendmsg failed: %d", ret);
1220 read_lock_bh(&call->state_lock);
1221 if (call->state < RXRPC_CALL_DEAD)
1222 rxrpc_queue_call(call);
1223 read_unlock_bh(&call->state_lock);
1224 goto error;
1225 }
1226
1227 switch (genbit) {
1228 case RXRPC_CALL_EV_ABORT:
1229 clear_bit(genbit, &call->events);
1230 clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
1231 goto kill_ACKs;
1232
1233 case RXRPC_CALL_EV_ACK_FINAL:
1234 write_lock_bh(&call->state_lock);
1235 if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK)
1236 call->state = RXRPC_CALL_COMPLETE;
1237 write_unlock_bh(&call->state_lock);
1238 goto kill_ACKs;
1239
1240 default:
1241 clear_bit(genbit, &call->events);
1242 switch (call->state) {
1243 case RXRPC_CALL_CLIENT_AWAIT_REPLY:
1244 case RXRPC_CALL_CLIENT_RECV_REPLY:
1245 case RXRPC_CALL_SERVER_RECV_REQUEST:
1246 case RXRPC_CALL_SERVER_ACK_REQUEST:
1247 _debug("start ACK timer");
1248 rxrpc_propose_ACK(call, RXRPC_ACK_DELAY,
1249 call->ackr_serial, false);
1250 default:
1251 break;
1252 }
1253 goto maybe_reschedule;
1254 }
1255
1256 kill_ACKs:
1257 del_timer_sync(&call->ack_timer);
1258 if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events))
1259 rxrpc_put_call(call);
1260 clear_bit(RXRPC_CALL_EV_ACK, &call->events);
1261
1262 maybe_reschedule:
1263 if (call->events || !skb_queue_empty(&call->rx_queue)) {
1264 read_lock_bh(&call->state_lock);
1265 if (call->state < RXRPC_CALL_DEAD)
1266 rxrpc_queue_call(call);
1267 read_unlock_bh(&call->state_lock);
1268 }
1269
1270 /* don't leave aborted connections on the accept queue */
1271 if (call->state >= RXRPC_CALL_COMPLETE &&
1272 !list_empty(&call->accept_link)) {
1273 _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }",
1274 call, call->events, call->flags, call->conn->proto.cid);
1275
1276 read_lock_bh(&call->state_lock);
1277 if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
1278 !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
1279 rxrpc_queue_call(call);
1280 read_unlock_bh(&call->state_lock);
1281 }
1282
1283 error:
1284 clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags);
1285 kfree(acks);
1286
1287 /* because we don't want two CPUs both processing the work item for one
1288 * call at the same time, we use a flag to note when it's busy; however
1289 * this means there's a race between clearing the flag and setting the
1290 * work pending bit and the work item being processed again */
1291 if (call->events && !work_pending(&call->processor)) {
1292 _debug("jumpstart %x", call->conn->proto.cid);
1293 rxrpc_queue_call(call);
1294 } 417 }
1295 418
419 out_put:
420 rxrpc_put_call(call, rxrpc_call_put);
421 out:
1296 _leave(""); 422 _leave("");
1297 return;
1298
1299 no_mem:
1300 _debug("out of memory");
1301 goto maybe_reschedule;
1302} 423}
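The rewritten call_event.c above replaces the old skb-queue and acks_window bookkeeping with a fixed ring of Tx slots whose per-packet state lives in a small annotation value (acked, unacked, nak'd, retransmit-pending, plus a resent flag), which is what the resend scan walks. As a rough illustration of that scheme, here is a self-contained userspace sketch in C; the ring size, bit values, field and function names are assumptions made for the example — the real definitions are in ar-internal.h, which is not part of this diff.

#include <stdint.h>
#include <stdio.h>

#define RING_SIZE	64			/* assumed power-of-two ring size */
#define RING_MASK	(RING_SIZE - 1)

#define ANNO_MASK	0x03			/* low bits: per-packet state */
#define ANNO_UNACK	0x00			/* transmitted, not yet acked */
#define ANNO_ACK	0x01			/* soft- or hard-acked */
#define ANNO_NAK	0x02			/* negatively acknowledged */
#define ANNO_RETRANS	0x03			/* queued for retransmission */
#define ANNO_RESENT	0x04			/* flag: already resent once */

struct tx_ring {
	uint8_t		anno[RING_SIZE];	/* per-slot annotation */
	int64_t		tstamp[RING_SIZE];	/* send time, ns */
	unsigned int	hard_ack;		/* highest contiguously acked seq */
	unsigned int	top;			/* highest seq queued for Tx */
};

/* Walk (hard_ack, top], skip acked slots and recently sent unacked slots,
 * and mark everything else (including NAK'd slots) for retransmission;
 * returns the number of packets marked.
 */
static int scan_for_resend(struct tx_ring *r, int64_t max_age)
{
	unsigned int seq, ix;
	int retrans = 0;

	for (seq = r->hard_ack + 1; seq <= r->top; seq++) {
		ix = seq & RING_MASK;
		if ((r->anno[ix] & ANNO_MASK) == ANNO_ACK)
			continue;		/* nothing to do for acked data */
		if ((r->anno[ix] & ANNO_MASK) == ANNO_UNACK &&
		    r->tstamp[ix] > max_age)
			continue;		/* too recent to give up on yet */
		r->anno[ix] = (r->anno[ix] & ~ANNO_MASK) | ANNO_RETRANS;
		retrans++;
	}
	return retrans;
}

int main(void)
{
	struct tx_ring r = { .hard_ack = 10, .top = 13 };

	r.anno[11 & RING_MASK] = ANNO_ACK;	/* seq 11 already acked */
	r.tstamp[12 & RING_MASK] = 100;		/* seq 12 sent long ago */
	r.tstamp[13 & RING_MASK] = 900;		/* seq 13 sent recently */

	printf("marked %d packet(s) for resend\n", scan_for_resend(&r, 500));
	return 0;
}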
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index ae057e0740f3..4353a29f3b57 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -19,23 +19,13 @@
19#include <net/af_rxrpc.h> 19#include <net/af_rxrpc.h>
20#include "ar-internal.h" 20#include "ar-internal.h"
21 21
22/*
23 * Maximum lifetime of a call (in jiffies).
24 */
25unsigned int rxrpc_max_call_lifetime = 60 * HZ;
26
27/*
28 * Time till dead call expires after last use (in jiffies).
29 */
30unsigned int rxrpc_dead_call_expiry = 2 * HZ;
31
32const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { 22const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
33 [RXRPC_CALL_UNINITIALISED] = "Uninit", 23 [RXRPC_CALL_UNINITIALISED] = "Uninit ",
34 [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn", 24 [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn",
35 [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", 25 [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq",
36 [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", 26 [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl",
37 [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", 27 [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl",
38 [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", 28 [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc",
39 [RXRPC_CALL_SERVER_SECURING] = "SvSecure", 29 [RXRPC_CALL_SERVER_SECURING] = "SvSecure",
40 [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", 30 [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept",
41 [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", 31 [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq",
@@ -43,22 +33,47 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
43 [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", 33 [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl",
44 [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK", 34 [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK",
45 [RXRPC_CALL_COMPLETE] = "Complete", 35 [RXRPC_CALL_COMPLETE] = "Complete",
46 [RXRPC_CALL_SERVER_BUSY] = "SvBusy ", 36};
37
38const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
39 [RXRPC_CALL_SUCCEEDED] = "Complete",
47 [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", 40 [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort",
48 [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", 41 [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort",
42 [RXRPC_CALL_LOCAL_ERROR] = "LocError",
49 [RXRPC_CALL_NETWORK_ERROR] = "NetError", 43 [RXRPC_CALL_NETWORK_ERROR] = "NetError",
50 [RXRPC_CALL_DEAD] = "Dead ", 44};
45
46const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = {
47 [rxrpc_call_new_client] = "NWc",
48 [rxrpc_call_new_service] = "NWs",
49 [rxrpc_call_queued] = "QUE",
50 [rxrpc_call_queued_ref] = "QUR",
51 [rxrpc_call_connected] = "CON",
52 [rxrpc_call_release] = "RLS",
53 [rxrpc_call_seen] = "SEE",
54 [rxrpc_call_got] = "GOT",
55 [rxrpc_call_got_userid] = "Gus",
56 [rxrpc_call_got_kernel] = "Gke",
57 [rxrpc_call_put] = "PUT",
58 [rxrpc_call_put_userid] = "Pus",
59 [rxrpc_call_put_kernel] = "Pke",
60 [rxrpc_call_put_noqueue] = "PNQ",
61 [rxrpc_call_error] = "*E*",
51}; 62};
52 63
53struct kmem_cache *rxrpc_call_jar; 64struct kmem_cache *rxrpc_call_jar;
54LIST_HEAD(rxrpc_calls); 65LIST_HEAD(rxrpc_calls);
55DEFINE_RWLOCK(rxrpc_call_lock); 66DEFINE_RWLOCK(rxrpc_call_lock);
56 67
57static void rxrpc_destroy_call(struct work_struct *work); 68static void rxrpc_call_timer_expired(unsigned long _call)
58static void rxrpc_call_life_expired(unsigned long _call); 69{
59static void rxrpc_dead_call_expired(unsigned long _call); 70 struct rxrpc_call *call = (struct rxrpc_call *)_call;
60static void rxrpc_ack_time_expired(unsigned long _call); 71
61static void rxrpc_resend_time_expired(unsigned long _call); 72 _enter("%d", call->debug_id);
73
74 if (call->state < RXRPC_CALL_COMPLETE)
75 rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real());
76}
62 77
63/* 78/*
64 * find an extant server call 79 * find an extant server call
@@ -91,7 +106,7 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx,
91 return NULL; 106 return NULL;
92 107
93 found_extant_call: 108 found_extant_call:
94 rxrpc_get_call(call); 109 rxrpc_get_call(call, rxrpc_call_got);
95 read_unlock(&rx->call_lock); 110 read_unlock(&rx->call_lock);
96 _leave(" = %p [%d]", call, atomic_read(&call->usage)); 111 _leave(" = %p [%d]", call, atomic_read(&call->usage));
97 return call; 112 return call;
@@ -100,7 +115,7 @@ found_extant_call:
100/* 115/*
101 * allocate a new call 116 * allocate a new call
102 */ 117 */
103static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) 118struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
104{ 119{
105 struct rxrpc_call *call; 120 struct rxrpc_call *call;
106 121
@@ -108,29 +123,25 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
108 if (!call) 123 if (!call)
109 return NULL; 124 return NULL;
110 125
111 call->acks_winsz = 16; 126 call->rxtx_buffer = kcalloc(RXRPC_RXTX_BUFF_SIZE,
112 call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long), 127 sizeof(struct sk_buff *),
113 gfp); 128 gfp);
114 if (!call->acks_window) { 129 if (!call->rxtx_buffer)
115 kmem_cache_free(rxrpc_call_jar, call); 130 goto nomem;
116 return NULL; 131
117 } 132 call->rxtx_annotations = kcalloc(RXRPC_RXTX_BUFF_SIZE, sizeof(u8), gfp);
133 if (!call->rxtx_annotations)
134 goto nomem_2;
118 135
119 setup_timer(&call->lifetimer, &rxrpc_call_life_expired, 136 setup_timer(&call->timer, rxrpc_call_timer_expired,
120 (unsigned long) call); 137 (unsigned long)call);
121 setup_timer(&call->deadspan, &rxrpc_dead_call_expired,
122 (unsigned long) call);
123 setup_timer(&call->ack_timer, &rxrpc_ack_time_expired,
124 (unsigned long) call);
125 setup_timer(&call->resend_timer, &rxrpc_resend_time_expired,
126 (unsigned long) call);
127 INIT_WORK(&call->destroyer, &rxrpc_destroy_call);
128 INIT_WORK(&call->processor, &rxrpc_process_call); 138 INIT_WORK(&call->processor, &rxrpc_process_call);
129 INIT_LIST_HEAD(&call->link); 139 INIT_LIST_HEAD(&call->link);
140 INIT_LIST_HEAD(&call->chan_wait_link);
130 INIT_LIST_HEAD(&call->accept_link); 141 INIT_LIST_HEAD(&call->accept_link);
131 skb_queue_head_init(&call->rx_queue); 142 INIT_LIST_HEAD(&call->recvmsg_link);
132 skb_queue_head_init(&call->rx_oos_queue); 143 INIT_LIST_HEAD(&call->sock_link);
133 init_waitqueue_head(&call->tx_waitq); 144 init_waitqueue_head(&call->waitq);
134 spin_lock_init(&call->lock); 145 spin_lock_init(&call->lock);
135 rwlock_init(&call->state_lock); 146 rwlock_init(&call->state_lock);
136 atomic_set(&call->usage, 1); 147 atomic_set(&call->usage, 1);
@@ -138,70 +149,66 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
138 149
139 memset(&call->sock_node, 0xed, sizeof(call->sock_node)); 150 memset(&call->sock_node, 0xed, sizeof(call->sock_node));
140 151
141 call->rx_data_expect = 1; 152 /* Leave space in the ring to handle a maxed-out jumbo packet */
142 call->rx_data_eaten = 0; 153 call->rx_winsize = rxrpc_rx_window_size;
143 call->rx_first_oos = 0; 154 call->tx_winsize = 16;
144 call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size; 155 call->rx_expect_next = 1;
145 call->creation_jif = jiffies; 156
157 if (RXRPC_TX_SMSS > 2190)
158 call->cong_cwnd = 2;
159 else if (RXRPC_TX_SMSS > 1095)
160 call->cong_cwnd = 3;
161 else
162 call->cong_cwnd = 4;
163 call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1;
146 return call; 164 return call;
165
166 nomem_2:
167 kfree(call->rxtx_buffer);
168 nomem:
169 kmem_cache_free(rxrpc_call_jar, call);
170 return NULL;
147} 171}
148 172
149/* 173/*
150 * Allocate a new client call. 174 * Allocate a new client call.
151 */ 175 */
152static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, 176static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx,
153 struct sockaddr_rxrpc *srx,
154 gfp_t gfp) 177 gfp_t gfp)
155{ 178{
156 struct rxrpc_call *call; 179 struct rxrpc_call *call;
180 ktime_t now;
157 181
158 _enter(""); 182 _enter("");
159 183
160 ASSERT(rx->local != NULL);
161
162 call = rxrpc_alloc_call(gfp); 184 call = rxrpc_alloc_call(gfp);
163 if (!call) 185 if (!call)
164 return ERR_PTR(-ENOMEM); 186 return ERR_PTR(-ENOMEM);
165 call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; 187 call->state = RXRPC_CALL_CLIENT_AWAIT_CONN;
166
167 sock_hold(&rx->sk);
168 call->socket = rx;
169 call->rx_data_post = 1;
170
171 call->local = rx->local;
172 call->service_id = srx->srx_service; 188 call->service_id = srx->srx_service;
173 call->in_clientflag = 0; 189 call->tx_phase = true;
190 now = ktime_get_real();
191 call->acks_latest_ts = now;
192 call->cong_tstamp = now;
174 193
175 _leave(" = %p", call); 194 _leave(" = %p", call);
176 return call; 195 return call;
177} 196}
178 197
179/* 198/*
180 * Begin client call. 199 * Initiate the call ack/resend/expiry timer.
181 */ 200 */
182static int rxrpc_begin_client_call(struct rxrpc_call *call, 201static void rxrpc_start_call_timer(struct rxrpc_call *call)
183 struct rxrpc_conn_parameters *cp,
184 struct sockaddr_rxrpc *srx,
185 gfp_t gfp)
186{ 202{
187 int ret; 203 ktime_t now = ktime_get_real(), expire_at;
188 204
189 /* Set up or get a connection record and set the protocol parameters, 205 expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime);
190 * including channel number and call ID. 206 call->expire_at = expire_at;
191 */ 207 call->ack_at = expire_at;
192 ret = rxrpc_connect_call(call, cp, srx, gfp); 208 call->ping_at = expire_at;
193 if (ret < 0) 209 call->resend_at = expire_at;
194 return ret; 210 call->timer.expires = jiffies + LONG_MAX / 2;
195 211 rxrpc_set_timer(call, rxrpc_timer_begin, now);
196 call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
197
198 spin_lock(&call->conn->params.peer->lock);
199 hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets);
200 spin_unlock(&call->conn->params.peer->lock);
201
202 call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime;
203 add_timer(&call->lifetimer);
204 return 0;
205} 212}
206 213
207/* 214/*
@@ -216,20 +223,21 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
216{ 223{
217 struct rxrpc_call *call, *xcall; 224 struct rxrpc_call *call, *xcall;
218 struct rb_node *parent, **pp; 225 struct rb_node *parent, **pp;
226 const void *here = __builtin_return_address(0);
219 int ret; 227 int ret;
220 228
221 _enter("%p,%lx", rx, user_call_ID); 229 _enter("%p,%lx", rx, user_call_ID);
222 230
223 call = rxrpc_alloc_client_call(rx, srx, gfp); 231 call = rxrpc_alloc_client_call(srx, gfp);
224 if (IS_ERR(call)) { 232 if (IS_ERR(call)) {
225 _leave(" = %ld", PTR_ERR(call)); 233 _leave(" = %ld", PTR_ERR(call));
226 return call; 234 return call;
227 } 235 }
228 236
229 /* Publish the call, even though it is incompletely set up as yet */ 237 trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage),
230 call->user_call_ID = user_call_ID; 238 here, (const void *)user_call_ID);
231 __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
232 239
240 /* Publish the call, even though it is incompletely set up as yet */
233 write_lock(&rx->call_lock); 241 write_lock(&rx->call_lock);
234 242
235 pp = &rx->calls.rb_node; 243 pp = &rx->calls.rb_node;
@@ -243,369 +251,285 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
243 else if (user_call_ID > xcall->user_call_ID) 251 else if (user_call_ID > xcall->user_call_ID)
244 pp = &(*pp)->rb_right; 252 pp = &(*pp)->rb_right;
245 else 253 else
246 goto found_user_ID_now_present; 254 goto error_dup_user_ID;
247 } 255 }
248 256
249 rxrpc_get_call(call); 257 rcu_assign_pointer(call->socket, rx);
250 258 call->user_call_ID = user_call_ID;
259 __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
260 rxrpc_get_call(call, rxrpc_call_got_userid);
251 rb_link_node(&call->sock_node, parent, pp); 261 rb_link_node(&call->sock_node, parent, pp);
252 rb_insert_color(&call->sock_node, &rx->calls); 262 rb_insert_color(&call->sock_node, &rx->calls);
263 list_add(&call->sock_link, &rx->sock_calls);
264
253 write_unlock(&rx->call_lock); 265 write_unlock(&rx->call_lock);
254 266
255 write_lock_bh(&rxrpc_call_lock); 267 write_lock(&rxrpc_call_lock);
256 list_add_tail(&call->link, &rxrpc_calls); 268 list_add_tail(&call->link, &rxrpc_calls);
257 write_unlock_bh(&rxrpc_call_lock); 269 write_unlock(&rxrpc_call_lock);
258 270
259 ret = rxrpc_begin_client_call(call, cp, srx, gfp); 271 /* Set up or get a connection record and set the protocol parameters,
272 * including channel number and call ID.
273 */
274 ret = rxrpc_connect_call(call, cp, srx, gfp);
260 if (ret < 0) 275 if (ret < 0)
261 goto error; 276 goto error;
262 277
263 _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); 278 trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage),
279 here, ERR_PTR(ret));
264 280
265 _leave(" = %p [new]", call); 281 spin_lock_bh(&call->conn->params.peer->lock);
266 return call; 282 hlist_add_head(&call->error_link,
283 &call->conn->params.peer->error_targets);
284 spin_unlock_bh(&call->conn->params.peer->lock);
267 285
268error: 286 rxrpc_start_call_timer(call);
269 write_lock(&rx->call_lock);
270 rb_erase(&call->sock_node, &rx->calls);
271 write_unlock(&rx->call_lock);
272 rxrpc_put_call(call);
273 287
274 write_lock_bh(&rxrpc_call_lock); 288 _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id);
275 list_del_init(&call->link);
276 write_unlock_bh(&rxrpc_call_lock);
277 289
278 set_bit(RXRPC_CALL_RELEASED, &call->flags); 290 _leave(" = %p [new]", call);
279 call->state = RXRPC_CALL_DEAD; 291 return call;
280 rxrpc_put_call(call);
281 _leave(" = %d", ret);
282 return ERR_PTR(ret);
283 292
284 /* We unexpectedly found the user ID in the list after taking 293 /* We unexpectedly found the user ID in the list after taking
285 * the call_lock. This shouldn't happen unless the user races 294 * the call_lock. This shouldn't happen unless the user races
286 * with itself and tries to add the same user ID twice at the 295 * with itself and tries to add the same user ID twice at the
287 * same time in different threads. 296 * same time in different threads.
288 */ 297 */
289 found_user_ID_now_present: 298 error_dup_user_ID:
290 write_unlock(&rx->call_lock); 299 write_unlock(&rx->call_lock);
291 set_bit(RXRPC_CALL_RELEASED, &call->flags); 300 ret = -EEXIST;
292 call->state = RXRPC_CALL_DEAD; 301
293 rxrpc_put_call(call); 302error:
294 _leave(" = -EEXIST [%p]", call); 303 __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
295 return ERR_PTR(-EEXIST); 304 RX_CALL_DEAD, ret);
305 trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage),
306 here, ERR_PTR(ret));
307 rxrpc_release_call(rx, call);
308 rxrpc_put_call(call, rxrpc_call_put);
309 _leave(" = %d", ret);
310 return ERR_PTR(ret);
296} 311}
297 312
298/* 313/*
299 * set up an incoming call 314 * Set up an incoming call. call->conn points to the connection.
300 * - called in process context with IRQs enabled 315 * This is called in BH context and isn't allowed to fail.
301 */ 316 */
302struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, 317void rxrpc_incoming_call(struct rxrpc_sock *rx,
303 struct rxrpc_connection *conn, 318 struct rxrpc_call *call,
304 struct sk_buff *skb) 319 struct sk_buff *skb)
305{ 320{
321 struct rxrpc_connection *conn = call->conn;
306 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 322 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
307 struct rxrpc_call *call, *candidate; 323 u32 chan;
308 u32 call_id, chan; 324
309 325 _enter(",%d", call->conn->debug_id);
310 _enter(",%d", conn->debug_id); 326
311 327 rcu_assign_pointer(call->socket, rx);
312 ASSERT(rx != NULL); 328 call->call_id = sp->hdr.callNumber;
313 329 call->service_id = sp->hdr.serviceId;
314 candidate = rxrpc_alloc_call(GFP_NOIO); 330 call->cid = sp->hdr.cid;
315 if (!candidate) 331 call->state = RXRPC_CALL_SERVER_ACCEPTING;
316 return ERR_PTR(-EBUSY); 332 if (sp->hdr.securityIndex > 0)
317 333 call->state = RXRPC_CALL_SERVER_SECURING;
318 chan = sp->hdr.cid & RXRPC_CHANNELMASK; 334 call->cong_tstamp = skb->tstamp;
319 candidate->socket = rx; 335
320 candidate->conn = conn; 336 /* Set the channel for this call. We don't get channel_lock as we're
321 candidate->cid = sp->hdr.cid; 337 * only defending against the data_ready handler (which we're called
322 candidate->call_id = sp->hdr.callNumber; 338 * from) and the RESPONSE packet parser (which is only really
323 candidate->channel = chan; 339 * interested in call_counter and can cope with a disagreement with the
324 candidate->rx_data_post = 0; 340 * call pointer).
325 candidate->state = RXRPC_CALL_SERVER_ACCEPTING;
326 if (conn->security_ix > 0)
327 candidate->state = RXRPC_CALL_SERVER_SECURING;
328
329 spin_lock(&conn->channel_lock);
330
331 /* set the channel for this call */
332 call = rcu_dereference_protected(conn->channels[chan].call,
333 lockdep_is_held(&conn->channel_lock));
334
335 _debug("channel[%u] is %p", candidate->channel, call);
336 if (call && call->call_id == sp->hdr.callNumber) {
337 /* already set; must've been a duplicate packet */
338 _debug("extant call [%d]", call->state);
339 ASSERTCMP(call->conn, ==, conn);
340
341 read_lock(&call->state_lock);
342 switch (call->state) {
343 case RXRPC_CALL_LOCALLY_ABORTED:
344 if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
345 rxrpc_queue_call(call);
346 case RXRPC_CALL_REMOTELY_ABORTED:
347 read_unlock(&call->state_lock);
348 goto aborted_call;
349 default:
350 rxrpc_get_call(call);
351 read_unlock(&call->state_lock);
352 goto extant_call;
353 }
354 }
355
356 if (call) {
357 /* it seems the channel is still in use from the previous call
358 * - ditch the old binding if its call is now complete */
359 _debug("CALL: %u { %s }",
360 call->debug_id, rxrpc_call_states[call->state]);
361
362 if (call->state >= RXRPC_CALL_COMPLETE) {
363 __rxrpc_disconnect_call(call);
364 } else {
365 spin_unlock(&conn->channel_lock);
366 kmem_cache_free(rxrpc_call_jar, candidate);
367 _leave(" = -EBUSY");
368 return ERR_PTR(-EBUSY);
369 }
370 }
371
372 /* check the call number isn't duplicate */
373 _debug("check dup");
374 call_id = sp->hdr.callNumber;
375
376 /* We just ignore calls prior to the current call ID. Terminated calls
377 * are handled via the connection.
378 */ 341 */
379 if (call_id <= conn->channels[chan].call_counter) 342 chan = sp->hdr.cid & RXRPC_CHANNELMASK;
380 goto old_call; /* TODO: Just drop packet */ 343 conn->channels[chan].call_counter = call->call_id;
381 344 conn->channels[chan].call_id = call->call_id;
382 /* make the call available */
383 _debug("new call");
384 call = candidate;
385 candidate = NULL;
386 conn->channels[chan].call_counter = call_id;
387 rcu_assign_pointer(conn->channels[chan].call, call); 345 rcu_assign_pointer(conn->channels[chan].call, call);
388 sock_hold(&rx->sk);
389 rxrpc_get_connection(conn);
390 spin_unlock(&conn->channel_lock);
391 346
392 spin_lock(&conn->params.peer->lock); 347 spin_lock(&conn->params.peer->lock);
393 hlist_add_head(&call->error_link, &conn->params.peer->error_targets); 348 hlist_add_head(&call->error_link, &conn->params.peer->error_targets);
394 spin_unlock(&conn->params.peer->lock); 349 spin_unlock(&conn->params.peer->lock);
395 350
396 write_lock_bh(&rxrpc_call_lock); 351 _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id);
397 list_add_tail(&call->link, &rxrpc_calls);
398 write_unlock_bh(&rxrpc_call_lock);
399 352
400 call->local = conn->params.local; 353 rxrpc_start_call_timer(call);
401 call->epoch = conn->proto.epoch; 354 _leave("");
402 call->service_id = conn->params.service_id; 355}
403 call->in_clientflag = RXRPC_CLIENT_INITIATED;
404 356
405 _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); 357/*
358 * Queue a call's work processor, getting a ref to pass to the work queue.
359 */
360bool rxrpc_queue_call(struct rxrpc_call *call)
361{
362 const void *here = __builtin_return_address(0);
363 int n = __atomic_add_unless(&call->usage, 1, 0);
364 if (n == 0)
365 return false;
366 if (rxrpc_queue_work(&call->processor))
367 trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL);
368 else
369 rxrpc_put_call(call, rxrpc_call_put_noqueue);
370 return true;
371}
406 372
407 call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; 373/*
408 add_timer(&call->lifetimer); 374 * Queue a call's work processor, passing the callers ref to the work queue.
409 _leave(" = %p {%d} [new]", call, call->debug_id); 375 */
410 return call; 376bool __rxrpc_queue_call(struct rxrpc_call *call)
377{
378 const void *here = __builtin_return_address(0);
379 int n = atomic_read(&call->usage);
380 ASSERTCMP(n, >=, 1);
381 if (rxrpc_queue_work(&call->processor))
382 trace_rxrpc_call(call, rxrpc_call_queued_ref, n, here, NULL);
383 else
384 rxrpc_put_call(call, rxrpc_call_put_noqueue);
385 return true;
386}
411 387
412 extant_call: 388 /*
413 spin_unlock(&conn->channel_lock); 389 * Note the re-emergence of a call.
414 kmem_cache_free(rxrpc_call_jar, candidate); 390 */
415 _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1); 391void rxrpc_see_call(struct rxrpc_call *call)
416 return call; 392{
393 const void *here = __builtin_return_address(0);
394 if (call) {
395 int n = atomic_read(&call->usage);
417 396
418 aborted_call: 397 trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL);
419 spin_unlock(&conn->channel_lock); 398 }
420 kmem_cache_free(rxrpc_call_jar, candidate);
421 _leave(" = -ECONNABORTED");
422 return ERR_PTR(-ECONNABORTED);
423
424 old_call:
425 spin_unlock(&conn->channel_lock);
426 kmem_cache_free(rxrpc_call_jar, candidate);
427 _leave(" = -ECONNRESET [old]");
428 return ERR_PTR(-ECONNRESET);
429} 399}
430 400
431/* 401/*
432 * detach a call from a socket and set up for release 402 * Note the addition of a ref on a call.
433 */ 403 */
434void rxrpc_release_call(struct rxrpc_call *call) 404void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
435{ 405{
406 const void *here = __builtin_return_address(0);
407 int n = atomic_inc_return(&call->usage);
408
409 trace_rxrpc_call(call, op, n, here, NULL);
410}
411
412/*
413 * Detach a call from its owning socket.
414 */
415void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
416{
417 const void *here = __builtin_return_address(0);
436 struct rxrpc_connection *conn = call->conn; 418 struct rxrpc_connection *conn = call->conn;
437 struct rxrpc_sock *rx = call->socket; 419 bool put = false;
420 int i;
438 421
439 _enter("{%d,%d,%d,%d}", 422 _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage));
440 call->debug_id, atomic_read(&call->usage), 423
441 atomic_read(&call->ackr_not_idle), 424 trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage),
442 call->rx_first_oos); 425 here, (const void *)call->flags);
426
427 ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
443 428
444 spin_lock_bh(&call->lock); 429 spin_lock_bh(&call->lock);
445 if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) 430 if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags))
446 BUG(); 431 BUG();
447 spin_unlock_bh(&call->lock); 432 spin_unlock_bh(&call->lock);
448 433
449 /* dissociate from the socket 434 del_timer_sync(&call->timer);
450 * - the socket's ref on the call is passed to the death timer
451 */
452 _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
453 435
454 spin_lock(&conn->params.peer->lock); 436 /* Make sure we don't get any more notifications */
455 hlist_del_init(&call->error_link); 437 write_lock_bh(&rx->recvmsg_lock);
456 spin_unlock(&conn->params.peer->lock);
457 438
458 write_lock_bh(&rx->call_lock); 439 if (!list_empty(&call->recvmsg_link)) {
459 if (!list_empty(&call->accept_link)) {
460 _debug("unlinking once-pending call %p { e=%lx f=%lx }", 440 _debug("unlinking once-pending call %p { e=%lx f=%lx }",
461 call, call->events, call->flags); 441 call, call->events, call->flags);
462 ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); 442 list_del(&call->recvmsg_link);
463 list_del_init(&call->accept_link); 443 put = true;
464 sk_acceptq_removed(&rx->sk);
465 } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
466 rb_erase(&call->sock_node, &rx->calls);
467 memset(&call->sock_node, 0xdd, sizeof(call->sock_node));
468 clear_bit(RXRPC_CALL_HAS_USERID, &call->flags);
469 } 444 }
470 write_unlock_bh(&rx->call_lock);
471 445
472 /* free up the channel for reuse */ 446 /* list_empty() must return false in rxrpc_notify_socket() */
473 write_lock_bh(&call->state_lock); 447 call->recvmsg_link.next = NULL;
448 call->recvmsg_link.prev = NULL;
474 449
475 if (call->state < RXRPC_CALL_COMPLETE && 450 write_unlock_bh(&rx->recvmsg_lock);
476 call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { 451 if (put)
477 _debug("+++ ABORTING STATE %d +++\n", call->state); 452 rxrpc_put_call(call, rxrpc_call_put);
478 call->state = RXRPC_CALL_LOCALLY_ABORTED;
479 call->local_abort = RX_CALL_DEAD;
480 }
481 write_unlock_bh(&call->state_lock);
482 453
483 rxrpc_disconnect_call(call); 454 write_lock(&rx->call_lock);
484 455
485 /* clean up the Rx queue */ 456 if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
486 if (!skb_queue_empty(&call->rx_queue) || 457 rb_erase(&call->sock_node, &rx->calls);
487 !skb_queue_empty(&call->rx_oos_queue)) { 458 memset(&call->sock_node, 0xdd, sizeof(call->sock_node));
488 struct rxrpc_skb_priv *sp; 459 rxrpc_put_call(call, rxrpc_call_put_userid);
489 struct sk_buff *skb; 460 }
490 461
491 _debug("purge Rx queues"); 462 list_del(&call->sock_link);
463 write_unlock(&rx->call_lock);
492 464
493 spin_lock_bh(&call->lock); 465 _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
494 while ((skb = skb_dequeue(&call->rx_queue)) ||
495 (skb = skb_dequeue(&call->rx_oos_queue))) {
496 spin_unlock_bh(&call->lock);
497 466
498 sp = rxrpc_skb(skb); 467 if (conn)
499 _debug("- zap %s %%%u #%u", 468 rxrpc_disconnect_call(call);
500 rxrpc_pkts[sp->hdr.type],
501 sp->hdr.serial, sp->hdr.seq);
502 rxrpc_free_skb(skb);
503 spin_lock_bh(&call->lock);
504 }
505 spin_unlock_bh(&call->lock);
506 469
507 ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE); 470 for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) {
471 rxrpc_free_skb(call->rxtx_buffer[i],
472 (call->tx_phase ? rxrpc_skb_tx_cleaned :
473 rxrpc_skb_rx_cleaned));
474 call->rxtx_buffer[i] = NULL;
508 } 475 }
509 476
510 del_timer_sync(&call->resend_timer);
511 del_timer_sync(&call->ack_timer);
512 del_timer_sync(&call->lifetimer);
513 call->deadspan.expires = jiffies + rxrpc_dead_call_expiry;
514 add_timer(&call->deadspan);
515
516 _leave(""); 477 _leave("");
517} 478}
518 479
519/* 480/*
520 * handle a dead call being ready for reaping
521 */
522static void rxrpc_dead_call_expired(unsigned long _call)
523{
524 struct rxrpc_call *call = (struct rxrpc_call *) _call;
525
526 _enter("{%d}", call->debug_id);
527
528 write_lock_bh(&call->state_lock);
529 call->state = RXRPC_CALL_DEAD;
530 write_unlock_bh(&call->state_lock);
531 rxrpc_put_call(call);
532}
533
534/*
535 * mark a call as to be released, aborting it if it's still in progress
536 * - called with softirqs disabled
537 */
538static void rxrpc_mark_call_released(struct rxrpc_call *call)
539{
540 bool sched;
541
542 write_lock(&call->state_lock);
543 if (call->state < RXRPC_CALL_DEAD) {
544 sched = false;
545 if (call->state < RXRPC_CALL_COMPLETE) {
546 _debug("abort call %p", call);
547 call->state = RXRPC_CALL_LOCALLY_ABORTED;
548 call->local_abort = RX_CALL_DEAD;
549 if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
550 sched = true;
551 }
552 if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
553 sched = true;
554 if (sched)
555 rxrpc_queue_call(call);
556 }
557 write_unlock(&call->state_lock);
558}
559
560/*
561 * release all the calls associated with a socket 481 * release all the calls associated with a socket
562 */ 482 */
563void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) 483void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
564{ 484{
565 struct rxrpc_call *call; 485 struct rxrpc_call *call;
566 struct rb_node *p;
567 486
568 _enter("%p", rx); 487 _enter("%p", rx);
569 488
570 read_lock_bh(&rx->call_lock); 489 while (!list_empty(&rx->to_be_accepted)) {
571 490 call = list_entry(rx->to_be_accepted.next,
572 /* mark all the calls as no longer wanting incoming packets */ 491 struct rxrpc_call, accept_link);
573 for (p = rb_first(&rx->calls); p; p = rb_next(p)) { 492 list_del(&call->accept_link);
574 call = rb_entry(p, struct rxrpc_call, sock_node); 493 rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET);
575 rxrpc_mark_call_released(call); 494 rxrpc_put_call(call, rxrpc_call_put);
576 }
577
578 /* kill the not-yet-accepted incoming calls */
579 list_for_each_entry(call, &rx->secureq, accept_link) {
580 rxrpc_mark_call_released(call);
581 } 495 }
582 496
583 list_for_each_entry(call, &rx->acceptq, accept_link) { 497 while (!list_empty(&rx->sock_calls)) {
584 rxrpc_mark_call_released(call); 498 call = list_entry(rx->sock_calls.next,
499 struct rxrpc_call, sock_link);
500 rxrpc_get_call(call, rxrpc_call_got);
501 rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET);
502 rxrpc_send_abort_packet(call);
503 rxrpc_release_call(rx, call);
504 rxrpc_put_call(call, rxrpc_call_put);
585 } 505 }
586 506
587 read_unlock_bh(&rx->call_lock);
588 _leave(""); 507 _leave("");
589} 508}
590 509
591/* 510/*
592 * release a call 511 * release a call
593 */ 512 */
594void __rxrpc_put_call(struct rxrpc_call *call) 513void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
595{ 514{
515 const void *here = __builtin_return_address(0);
516 int n;
517
596 ASSERT(call != NULL); 518 ASSERT(call != NULL);
597 519
598 _enter("%p{u=%d}", call, atomic_read(&call->usage)); 520 n = atomic_dec_return(&call->usage);
521 trace_rxrpc_call(call, op, n, here, NULL);
522 ASSERTCMP(n, >=, 0);
523 if (n == 0) {
524 _debug("call %d dead", call->debug_id);
525 ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
599 526
600 ASSERTCMP(atomic_read(&call->usage), >, 0); 527 write_lock(&rxrpc_call_lock);
528 list_del_init(&call->link);
529 write_unlock(&rxrpc_call_lock);
601 530
602 if (atomic_dec_and_test(&call->usage)) { 531 rxrpc_cleanup_call(call);
603 _debug("call %d dead", call->debug_id);
604 WARN_ON(atomic_read(&call->skb_count) != 0);
605 ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
606 rxrpc_queue_work(&call->destroyer);
607 } 532 }
608 _leave("");
609} 533}
610 534
611/* 535/*
@@ -615,187 +539,70 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
615{ 539{
616 struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); 540 struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
617 541
618 rxrpc_purge_queue(&call->rx_queue); 542 rxrpc_put_peer(call->peer);
543 kfree(call->rxtx_buffer);
544 kfree(call->rxtx_annotations);
619 kmem_cache_free(rxrpc_call_jar, call); 545 kmem_cache_free(rxrpc_call_jar, call);
620} 546}
621 547
622/* 548/*
623 * clean up a call 549 * clean up a call
624 */ 550 */
625static void rxrpc_cleanup_call(struct rxrpc_call *call) 551void rxrpc_cleanup_call(struct rxrpc_call *call)
626{ 552{
627 _net("DESTROY CALL %d", call->debug_id); 553 int i;
628 554
629 ASSERT(call->socket); 555 _net("DESTROY CALL %d", call->debug_id);
630 556
631 memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); 557 memset(&call->sock_node, 0xcd, sizeof(call->sock_node));
632 558
633 del_timer_sync(&call->lifetimer); 559 del_timer_sync(&call->timer);
634 del_timer_sync(&call->deadspan);
635 del_timer_sync(&call->ack_timer);
636 del_timer_sync(&call->resend_timer);
637 560
561 ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
638 ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); 562 ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
639 ASSERTCMP(call->events, ==, 0);
640 if (work_pending(&call->processor)) {
641 _debug("defer destroy");
642 rxrpc_queue_work(&call->destroyer);
643 return;
644 }
645
646 ASSERTCMP(call->conn, ==, NULL); 563 ASSERTCMP(call->conn, ==, NULL);
647 564
648 if (call->acks_window) { 565 /* Clean up the Rx/Tx buffer */
649 _debug("kill Tx window %d", 566 for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++)
650 CIRC_CNT(call->acks_head, call->acks_tail, 567 rxrpc_free_skb(call->rxtx_buffer[i],
651 call->acks_winsz)); 568 (call->tx_phase ? rxrpc_skb_tx_cleaned :
652 smp_mb(); 569 rxrpc_skb_rx_cleaned));
653 while (CIRC_CNT(call->acks_head, call->acks_tail,
654 call->acks_winsz) > 0) {
655 struct rxrpc_skb_priv *sp;
656 unsigned long _skb;
657
658 _skb = call->acks_window[call->acks_tail] & ~1;
659 sp = rxrpc_skb((struct sk_buff *)_skb);
660 _debug("+++ clear Tx %u", sp->hdr.seq);
661 rxrpc_free_skb((struct sk_buff *)_skb);
662 call->acks_tail =
663 (call->acks_tail + 1) & (call->acks_winsz - 1);
664 }
665
666 kfree(call->acks_window);
667 }
668 570
669 rxrpc_free_skb(call->tx_pending); 571 rxrpc_free_skb(call->tx_pending, rxrpc_skb_tx_cleaned);
670 572
671 rxrpc_purge_queue(&call->rx_queue);
672 ASSERT(skb_queue_empty(&call->rx_oos_queue));
673 sock_put(&call->socket->sk);
674 call_rcu(&call->rcu, rxrpc_rcu_destroy_call); 573 call_rcu(&call->rcu, rxrpc_rcu_destroy_call);
675} 574}
676 575
677/* 576/*
678 * destroy a call 577 * Make sure that all calls are gone.
679 */
680static void rxrpc_destroy_call(struct work_struct *work)
681{
682 struct rxrpc_call *call =
683 container_of(work, struct rxrpc_call, destroyer);
684
685 _enter("%p{%d,%d,%p}",
686 call, atomic_read(&call->usage), call->channel, call->conn);
687
688 ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
689
690 write_lock_bh(&rxrpc_call_lock);
691 list_del_init(&call->link);
692 write_unlock_bh(&rxrpc_call_lock);
693
694 rxrpc_cleanup_call(call);
695 _leave("");
696}
697
698/*
699 * preemptively destroy all the call records from a transport endpoint rather
700 * than waiting for them to time out
701 */ 578 */
702void __exit rxrpc_destroy_all_calls(void) 579void __exit rxrpc_destroy_all_calls(void)
703{ 580{
704 struct rxrpc_call *call; 581 struct rxrpc_call *call;
705 582
706 _enter(""); 583 _enter("");
707 write_lock_bh(&rxrpc_call_lock); 584
585 if (list_empty(&rxrpc_calls))
586 return;
587
588 write_lock(&rxrpc_call_lock);
708 589
709 while (!list_empty(&rxrpc_calls)) { 590 while (!list_empty(&rxrpc_calls)) {
710 call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); 591 call = list_entry(rxrpc_calls.next, struct rxrpc_call, link);
711 _debug("Zapping call %p", call); 592 _debug("Zapping call %p", call);
712 593
594 rxrpc_see_call(call);
713 list_del_init(&call->link); 595 list_del_init(&call->link);
714 596
715 switch (atomic_read(&call->usage)) { 597 pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n",
716 case 0: 598 call, atomic_read(&call->usage),
717 ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); 599 rxrpc_call_states[call->state],
718 break; 600 call->flags, call->events);
719 case 1:
720 if (del_timer_sync(&call->deadspan) != 0 &&
721 call->state != RXRPC_CALL_DEAD)
722 rxrpc_dead_call_expired((unsigned long) call);
723 if (call->state != RXRPC_CALL_DEAD)
724 break;
725 default:
726 pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n",
727 call, atomic_read(&call->usage),
728 atomic_read(&call->ackr_not_idle),
729 rxrpc_call_states[call->state],
730 call->flags, call->events);
731 if (!skb_queue_empty(&call->rx_queue))
732 pr_err("Rx queue occupied\n");
733 if (!skb_queue_empty(&call->rx_oos_queue))
734 pr_err("OOS queue occupied\n");
735 break;
736 }
737
738 write_unlock_bh(&rxrpc_call_lock);
739 cond_resched();
740 write_lock_bh(&rxrpc_call_lock);
741 }
742
743 write_unlock_bh(&rxrpc_call_lock);
744 _leave("");
745}
746
747/*
748 * handle call lifetime being exceeded
749 */
750static void rxrpc_call_life_expired(unsigned long _call)
751{
752 struct rxrpc_call *call = (struct rxrpc_call *) _call;
753 601
754 if (call->state >= RXRPC_CALL_COMPLETE) 602 write_unlock(&rxrpc_call_lock);
755 return; 603 cond_resched();
756 604 write_lock(&rxrpc_call_lock);
757 _enter("{%d}", call->debug_id);
758 read_lock_bh(&call->state_lock);
759 if (call->state < RXRPC_CALL_COMPLETE) {
760 set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events);
761 rxrpc_queue_call(call);
762 } 605 }
763 read_unlock_bh(&call->state_lock);
764}
765
766/*
767 * handle resend timer expiry
768 * - may not take call->state_lock as this can deadlock against del_timer_sync()
769 */
770static void rxrpc_resend_time_expired(unsigned long _call)
771{
772 struct rxrpc_call *call = (struct rxrpc_call *) _call;
773
774 _enter("{%d}", call->debug_id);
775
776 if (call->state >= RXRPC_CALL_COMPLETE)
777 return;
778
779 clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
780 if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
781 rxrpc_queue_call(call);
782}
783
784/*
785 * handle ACK timer expiry
786 */
787static void rxrpc_ack_time_expired(unsigned long _call)
788{
789 struct rxrpc_call *call = (struct rxrpc_call *) _call;
790
791 _enter("{%d}", call->debug_id);
792
793 if (call->state >= RXRPC_CALL_COMPLETE)
794 return;
795 606
796 read_lock_bh(&call->state_lock); 607 write_unlock(&rxrpc_call_lock);
797 if (call->state < RXRPC_CALL_COMPLETE &&
798 !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
799 rxrpc_queue_call(call);
800 read_unlock_bh(&call->state_lock);
801} 608}
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 9e91f27b0d0f..60ef9605167e 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -7,6 +7,68 @@
7 * modify it under the terms of the GNU General Public Licence 7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 *
11 *
12 * Client connections need to be cached for a little while after they've made a
13 * call so as to handle retransmitted DATA packets in case the server didn't
14 * receive the final ACK or terminating ABORT we sent it.
15 *
16 * Client connections can be in one of a number of cache states:
17 *
18 * (1) INACTIVE - The connection is not held in any list and may not have been
19 * exposed to the world. If it has been previously exposed, it was
20 * discarded from the idle list after expiring.
21 *
22 * (2) WAITING - The connection is waiting for the number of client conns to
23 * drop below the maximum capacity. Calls may be in progress upon it from
24 * when it was active and got culled.
25 *
26 * The connection is on the rxrpc_waiting_client_conns list which is kept
27 * in to-be-granted order. Culled conns with waiters go to the back of
28 * the queue just like new conns.
29 *
30 * (3) ACTIVE - The connection has at least one call in progress upon it, it
31 * may freely grant available channels to new calls and calls may be
32 * waiting on it for channels to become available.
33 *
34 * The connection is on the rxrpc_active_client_conns list which is kept
35 * in activation order for culling purposes.
36 *
 37 * rxrpc_nr_active_client_conns is also kept incremented while the conn is on this list.
38 *
39 * (4) CULLED - The connection got summarily culled to try and free up
40 * capacity. Calls currently in progress on the connection are allowed to
41 * continue, but new calls will have to wait. There can be no waiters in
42 * this state - the conn would have to go to the WAITING state instead.
43 *
44 * (5) IDLE - The connection has no calls in progress upon it and must have
45 * been exposed to the world (ie. the EXPOSED flag must be set). When it
46 * expires, the EXPOSED flag is cleared and the connection transitions to
47 * the INACTIVE state.
48 *
49 * The connection is on the rxrpc_idle_client_conns list which is kept in
50 * order of how soon they'll expire.
51 *
52 * There are flags of relevance to the cache:
53 *
54 * (1) EXPOSED - The connection ID got exposed to the world. If this flag is
55 * set, an extra ref is added to the connection preventing it from being
56 * reaped when it has no calls outstanding. This flag is cleared and the
57 * ref dropped when a conn is discarded from the idle list.
58 *
59 * This allows us to move terminal call state retransmission to the
60 * connection and to discard the call immediately we think it is done
 61 * with. It also gives us a chance to reuse the connection.
62 *
63 * (2) DONT_REUSE - The connection should be discarded as soon as possible and
64 * should not be reused. This is set when an exclusive connection is used
65 * or a call ID counter overflows.
66 *
67 * The caching state may only be changed if the cache lock is held.
68 *
69 * There are two idle client connection expiry durations. If the total number
70 * of connections is below the reap threshold, we use the normal duration; if
71 * it's above, we use the fast duration.
10 */ 72 */
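As a reading aid for the cache states documented above, here is a minimal, self-contained sketch in plain C; the enum and helper below are invented for illustration and are not rxrpc's own definitions (those live in ar-internal.h and in the rxrpc_conn_cache_states[] table added later in this file).

#include <stdio.h>

enum conn_cache_state {		/* mirrors the five states described above */
	CONN_INACTIVE,		/* on no list; may never have been exposed */
	CONN_WAITING,		/* queued until the active count drops below the cap */
	CONN_ACTIVE,		/* has calls in progress; may grant channels */
	CONN_CULLED,		/* evicted to free capacity; no waiters allowed */
	CONN_IDLE,		/* no calls; EXPOSED set; awaiting expiry */
};

static const char *conn_cache_state_name(enum conn_cache_state s)
{
	static const char *names[] = { "Inac", "Wait", "Actv", "Cull", "Idle" };
	return names[s];
}

int main(void)
{
	/* an idle conn that expires drops back to inactive */
	printf("%s -> %s\n", conn_cache_state_name(CONN_IDLE),
	       conn_cache_state_name(CONN_INACTIVE));
	return 0;
}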
11 73
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -16,27 +78,50 @@
16#include <linux/timer.h> 78#include <linux/timer.h>
17#include "ar-internal.h" 79#include "ar-internal.h"
18 80
81__read_mostly unsigned int rxrpc_max_client_connections = 1000;
82__read_mostly unsigned int rxrpc_reap_client_connections = 900;
83__read_mostly unsigned int rxrpc_conn_idle_client_expiry = 2 * 60 * HZ;
84__read_mostly unsigned int rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
85
86static unsigned int rxrpc_nr_client_conns;
87static unsigned int rxrpc_nr_active_client_conns;
88static __read_mostly bool rxrpc_kill_all_client_conns;
89
90static DEFINE_SPINLOCK(rxrpc_client_conn_cache_lock);
91static DEFINE_SPINLOCK(rxrpc_client_conn_discard_mutex);
92static LIST_HEAD(rxrpc_waiting_client_conns);
93static LIST_HEAD(rxrpc_active_client_conns);
94static LIST_HEAD(rxrpc_idle_client_conns);
95
19/* 96/*
20 * We use machine-unique IDs for our client connections. 97 * We use machine-unique IDs for our client connections.
21 */ 98 */
22DEFINE_IDR(rxrpc_client_conn_ids); 99DEFINE_IDR(rxrpc_client_conn_ids);
23static DEFINE_SPINLOCK(rxrpc_conn_id_lock); 100static DEFINE_SPINLOCK(rxrpc_conn_id_lock);
24 101
102static void rxrpc_cull_active_client_conns(void);
103static void rxrpc_discard_expired_client_conns(struct work_struct *);
104
105static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap,
106 rxrpc_discard_expired_client_conns);
107
108const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = {
109 [RXRPC_CONN_CLIENT_INACTIVE] = "Inac",
110 [RXRPC_CONN_CLIENT_WAITING] = "Wait",
111 [RXRPC_CONN_CLIENT_ACTIVE] = "Actv",
112 [RXRPC_CONN_CLIENT_CULLED] = "Cull",
113 [RXRPC_CONN_CLIENT_IDLE] = "Idle",
114};
115
25/* 116/*
26 * Get a connection ID and epoch for a client connection from the global pool. 117 * Get a connection ID and epoch for a client connection from the global pool.
27 * The connection struct pointer is then recorded in the idr radix tree. The 118 * The connection struct pointer is then recorded in the idr radix tree. The
28 * epoch is changed if this wraps. 119 * epoch doesn't change until the client is rebooted (or, at least, unless the
29 * 120 * module is unloaded).
30 * TODO: The IDR tree gets very expensive on memory if the connection IDs are
31 * widely scattered throughout the number space, so we shall need to retire
32 * connections that have, say, an ID more than four times the maximum number of
33 * client conns away from the current allocation point to try and keep the IDs
34 * concentrated. We will also need to retire connections from an old epoch.
35 */ 121 */
36static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, 122static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn,
37 gfp_t gfp) 123 gfp_t gfp)
38{ 124{
39 u32 epoch;
40 int id; 125 int id;
41 126
42 _enter(""); 127 _enter("");
@@ -44,34 +129,18 @@ static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn,
44 idr_preload(gfp); 129 idr_preload(gfp);
45 spin_lock(&rxrpc_conn_id_lock); 130 spin_lock(&rxrpc_conn_id_lock);
46 131
47 epoch = rxrpc_epoch; 132 id = idr_alloc_cyclic(&rxrpc_client_conn_ids, conn,
48 133 1, 0x40000000, GFP_NOWAIT);
49 /* We could use idr_alloc_cyclic() here, but we really need to know 134 if (id < 0)
50 * when the thing wraps so that we can advance the epoch. 135 goto error;
51 */
52 if (rxrpc_client_conn_ids.cur == 0)
53 rxrpc_client_conn_ids.cur = 1;
54 id = idr_alloc(&rxrpc_client_conn_ids, conn,
55 rxrpc_client_conn_ids.cur, 0x40000000, GFP_NOWAIT);
56 if (id < 0) {
57 if (id != -ENOSPC)
58 goto error;
59 id = idr_alloc(&rxrpc_client_conn_ids, conn,
60 1, 0x40000000, GFP_NOWAIT);
61 if (id < 0)
62 goto error;
63 epoch++;
64 rxrpc_epoch = epoch;
65 }
66 rxrpc_client_conn_ids.cur = id + 1;
67 136
68 spin_unlock(&rxrpc_conn_id_lock); 137 spin_unlock(&rxrpc_conn_id_lock);
69 idr_preload_end(); 138 idr_preload_end();
70 139
71 conn->proto.epoch = epoch; 140 conn->proto.epoch = rxrpc_epoch;
72 conn->proto.cid = id << RXRPC_CIDSHIFT; 141 conn->proto.cid = id << RXRPC_CIDSHIFT;
73 set_bit(RXRPC_CONN_HAS_IDR, &conn->flags); 142 set_bit(RXRPC_CONN_HAS_IDR, &conn->flags);
74 _leave(" [CID %x:%x]", epoch, conn->proto.cid); 143 _leave(" [CID %x]", conn->proto.cid);
75 return 0; 144 return 0;
76 145
77error: 146error:
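The switch to idr_alloc_cyclic() above drops the hand-rolled wrap handling; a rough plain-C sketch of the cyclic-cursor idea (function and variable names invented for the example, not taken from the diff) might look like this:

#include <stdbool.h>

/* Walk a cursor forward through [min, max], wrapping once, and hand out the
 * first free slot; -1 stands in for the kernel's -ENOSPC.
 */
static unsigned int id_cursor = 1;

static int alloc_cyclic_id(bool (*in_use)(unsigned int),
			   unsigned int min, unsigned int max)
{
	unsigned int tries;

	for (tries = 0; tries <= max - min; tries++) {
		unsigned int id = id_cursor;

		id_cursor = (id_cursor >= max) ? min : id_cursor + 1;
		if (!in_use(id))
			return (int)id;	/* caller records its pointer under this ID */
	}
	return -1;
}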
@@ -114,8 +183,7 @@ void rxrpc_destroy_client_conn_ids(void)
114} 183}
115 184
116/* 185/*
117 * Allocate a client connection. The caller must take care to clear any 186 * Allocate a client connection.
118 * padding bytes in *cp.
119 */ 187 */
120static struct rxrpc_connection * 188static struct rxrpc_connection *
121rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) 189rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
@@ -131,6 +199,10 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
131 return ERR_PTR(-ENOMEM); 199 return ERR_PTR(-ENOMEM);
132 } 200 }
133 201
202 atomic_set(&conn->usage, 1);
203 if (cp->exclusive)
204 __set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
205
134 conn->params = *cp; 206 conn->params = *cp;
135 conn->out_clientflag = RXRPC_CLIENT_INITIATED; 207 conn->out_clientflag = RXRPC_CLIENT_INITIATED;
136 conn->state = RXRPC_CONN_CLIENT; 208 conn->state = RXRPC_CONN_CLIENT;
@@ -148,7 +220,7 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
148 goto error_2; 220 goto error_2;
149 221
150 write_lock(&rxrpc_connection_lock); 222 write_lock(&rxrpc_connection_lock);
151 list_add_tail(&conn->link, &rxrpc_connections); 223 list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list);
152 write_unlock(&rxrpc_connection_lock); 224 write_unlock(&rxrpc_connection_lock);
153 225
154 /* We steal the caller's peer ref. */ 226 /* We steal the caller's peer ref. */
@@ -156,6 +228,9 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
156 rxrpc_get_local(conn->params.local); 228 rxrpc_get_local(conn->params.local);
157 key_get(conn->params.key); 229 key_get(conn->params.key);
158 230
231 trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage),
232 __builtin_return_address(0));
233 trace_rxrpc_client(conn, -1, rxrpc_client_alloc);
159 _leave(" = %p", conn); 234 _leave(" = %p", conn);
160 return conn; 235 return conn;
161 236
@@ -170,32 +245,68 @@ error_0:
170} 245}
171 246
172/* 247/*
173 * find a connection for a call 248 * Determine if a connection may be reused.
174 * - called in process context with IRQs enabled
175 */ 249 */
176int rxrpc_connect_call(struct rxrpc_call *call, 250static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
177 struct rxrpc_conn_parameters *cp, 251{
178 struct sockaddr_rxrpc *srx, 252 int id_cursor, id, distance, limit;
179 gfp_t gfp) 253
254 if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags))
255 goto dont_reuse;
256
257 if (conn->proto.epoch != rxrpc_epoch)
258 goto mark_dont_reuse;
259
260 /* The IDR tree gets very expensive on memory if the connection IDs are
261 * widely scattered throughout the number space, so we shall want to
262 * kill off connections that, say, have an ID more than about four
263 * times the maximum number of client conns away from the current
264 * allocation point to try and keep the IDs concentrated.
265 */
266 id_cursor = READ_ONCE(rxrpc_client_conn_ids.cur);
267 id = conn->proto.cid >> RXRPC_CIDSHIFT;
268 distance = id - id_cursor;
269 if (distance < 0)
270 distance = -distance;
271 limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4;
272 if (distance > limit)
273 goto mark_dont_reuse;
274
275 return true;
276
277mark_dont_reuse:
278 set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
279dont_reuse:
280 return false;
281}
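To make the distance check above concrete, here is a small worked example in plain C; the numbers are assumptions (rxrpc_max_client_connections defaults to 1000 earlier in this file, and IDR_SIZE is taken to be 256 purely for the arithmetic):

#include <stdio.h>

static unsigned int round_up_to(unsigned int n, unsigned int step)
{
	return ((n + step - 1) / step) * step;
}

int main(void)
{
	unsigned int limit = round_up_to(1000, 256) * 4;	/* 1024 * 4 = 4096 */
	int id_cursor = 150000, id = 144000;			/* sample CID indices */
	int distance = id - id_cursor;

	if (distance < 0)
		distance = -distance;
	printf("distance=%d limit=%u -> %s\n", distance, limit,
	       distance > limit ? "mark DONT_REUSE" : "may reuse");
	return 0;
}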
282
283/*
284 * Create or find a client connection to use for a call.
285 *
286 * If we return with a connection, the call will be on its waiting list. It's
287 * left to the caller to assign a channel and wake up the call.
288 */
289static int rxrpc_get_client_conn(struct rxrpc_call *call,
290 struct rxrpc_conn_parameters *cp,
291 struct sockaddr_rxrpc *srx,
292 gfp_t gfp)
180{ 293{
181 struct rxrpc_connection *conn, *candidate = NULL; 294 struct rxrpc_connection *conn, *candidate = NULL;
182 struct rxrpc_local *local = cp->local; 295 struct rxrpc_local *local = cp->local;
183 struct rb_node *p, **pp, *parent; 296 struct rb_node *p, **pp, *parent;
184 long diff; 297 long diff;
185 int chan; 298 int ret = -ENOMEM;
186
187 DECLARE_WAITQUEUE(myself, current);
188 299
189 _enter("{%d,%lx},", call->debug_id, call->user_call_ID); 300 _enter("{%d,%lx},", call->debug_id, call->user_call_ID);
190 301
191 cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); 302 cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp);
192 if (!cp->peer) 303 if (!cp->peer)
193 return -ENOMEM; 304 goto error;
194 305
306 /* If the connection is not meant to be exclusive, search the available
307 * connections to see if the connection we want to use already exists.
308 */
195 if (!cp->exclusive) { 309 if (!cp->exclusive) {
196 /* Search for a existing client connection unless this is going
197 * to be a connection that's used exclusively for a single call.
198 */
199 _debug("search 1"); 310 _debug("search 1");
200 spin_lock(&local->client_conns_lock); 311 spin_lock(&local->client_conns_lock);
201 p = local->client_conns.rb_node; 312 p = local->client_conns.rb_node;
@@ -206,39 +317,56 @@ int rxrpc_connect_call(struct rxrpc_call *call,
206 diff = (cmp(peer) ?: 317 diff = (cmp(peer) ?:
207 cmp(key) ?: 318 cmp(key) ?:
208 cmp(security_level)); 319 cmp(security_level));
209 if (diff < 0)
210 p = p->rb_left;
211 else if (diff > 0)
212 p = p->rb_right;
213 else
214 goto found_extant_conn;
 320#undef cmp
 321 if (diff < 0) {
 322 p = p->rb_left;
 323 } else if (diff > 0) {
 324 p = p->rb_right;
 325 } else {
 326 if (rxrpc_may_reuse_conn(conn) &&
327 rxrpc_get_connection_maybe(conn))
328 goto found_extant_conn;
329 /* The connection needs replacing. It's better
330 * to effect that when we have something to
331 * replace it with so that we don't have to
332 * rebalance the tree twice.
333 */
334 break;
335 }
215 } 336 }
216 spin_unlock(&local->client_conns_lock); 337 spin_unlock(&local->client_conns_lock);
217 } 338 }
218 339
219 /* We didn't find a connection or we want an exclusive one. */ 340 /* There wasn't a connection yet or we need an exclusive connection.
220 _debug("get new conn"); 341 * We need to create a candidate and then potentially redo the search
342 * in case we're racing with another thread also trying to connect on a
343 * shareable connection.
344 */
345 _debug("new conn");
221 candidate = rxrpc_alloc_client_connection(cp, gfp); 346 candidate = rxrpc_alloc_client_connection(cp, gfp);
222 if (!candidate) {
223 _leave(" = -ENOMEM");
224 return -ENOMEM;
 347 if (IS_ERR(candidate)) {
 348 ret = PTR_ERR(candidate);
 349 goto error_peer;
225 } 350 }
226 351
352 /* Add the call to the new connection's waiting list in case we're
353 * going to have to wait for the connection to come live. It's our
354 * connection, so we want first dibs on the channel slots. We would
355 * normally have to take channel_lock but we do this before anyone else
356 * can see the connection.
357 */
358 list_add_tail(&call->chan_wait_link, &candidate->waiting_calls);
359
227 if (cp->exclusive) { 360 if (cp->exclusive) {
228 /* Assign the call on an exclusive connection to channel 0 and
229 * don't add the connection to the endpoint's shareable conn
230 * lookup tree.
231 */
232 _debug("exclusive chan 0");
233 conn = candidate;
234 atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1);
235 spin_lock(&conn->channel_lock);
236 chan = 0;
237 goto found_channel;
 361 call->conn = candidate;
 362 call->security_ix = candidate->security_ix;
 363 _leave(" = 0 [exclusive %d]", candidate->debug_id);
 364 return 0;
238 } 365 }
239 366
240 /* We need to redo the search before attempting to add a new connection 367 /* Publish the new connection for userspace to find. We need to redo
241 * lest we race with someone else adding a conflicting instance. 368 * the search before doing this lest we race with someone else adding a
369 * conflicting instance.
242 */ 370 */
243 _debug("search 2"); 371 _debug("search 2");
244 spin_lock(&local->client_conns_lock); 372 spin_lock(&local->client_conns_lock);
@@ -249,124 +377,711 @@ int rxrpc_connect_call(struct rxrpc_call *call,
249 parent = *pp; 377 parent = *pp;
250 conn = rb_entry(parent, struct rxrpc_connection, client_node); 378 conn = rb_entry(parent, struct rxrpc_connection, client_node);
251 379
380#define cmp(X) ((long)conn->params.X - (long)candidate->params.X)
252 diff = (cmp(peer) ?: 381 diff = (cmp(peer) ?:
253 cmp(key) ?: 382 cmp(key) ?:
254 cmp(security_level)); 383 cmp(security_level));
255 if (diff < 0)
256 pp = &(*pp)->rb_left;
257 else if (diff > 0)
258 pp = &(*pp)->rb_right;
259 else
260 goto found_extant_conn;
 384#undef cmp
 385 if (diff < 0) {
 386 pp = &(*pp)->rb_left;
 387 } else if (diff > 0) {
 388 pp = &(*pp)->rb_right;
 389 } else {
 390 if (rxrpc_may_reuse_conn(conn) &&
391 rxrpc_get_connection_maybe(conn))
392 goto found_extant_conn;
393 /* The old connection is from an outdated epoch. */
394 _debug("replace conn");
395 clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags);
396 rb_replace_node(&conn->client_node,
397 &candidate->client_node,
398 &local->client_conns);
399 trace_rxrpc_client(conn, -1, rxrpc_client_replace);
400 goto candidate_published;
401 }
261 } 402 }
262 403
263 /* The second search also failed; simply add the new connection with
264 * the new call in channel 0. Note that we need to take the channel
265 * lock before dropping the client conn lock.
266 */
267 _debug("new conn"); 404 _debug("new conn");
268 set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
269 rb_link_node(&candidate->client_node, parent, pp); 405 rb_link_node(&candidate->client_node, parent, pp);
270 rb_insert_color(&candidate->client_node, &local->client_conns); 406 rb_insert_color(&candidate->client_node, &local->client_conns);
271attached:
272 conn = candidate;
273 candidate = NULL;
274 407
275 atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); 408candidate_published:
276 spin_lock(&conn->channel_lock); 409 set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
410 call->conn = candidate;
411 call->security_ix = candidate->security_ix;
277 spin_unlock(&local->client_conns_lock); 412 spin_unlock(&local->client_conns_lock);
278 chan = 0; 413 _leave(" = 0 [new %d]", candidate->debug_id);
414 return 0;
279 415
280found_channel: 416 /* We come here if we found a suitable connection already in existence.
281 _debug("found chan"); 417 * Discard any candidate we may have allocated, and try to get a
282 call->conn = conn; 418 * channel on this one.
283 call->channel = chan; 419 */
284 call->epoch = conn->proto.epoch; 420found_extant_conn:
285 call->cid = conn->proto.cid | chan; 421 _debug("found conn");
286 call->call_id = ++conn->channels[chan].call_counter; 422 spin_unlock(&local->client_conns_lock);
287 conn->channels[chan].call_id = call->call_id;
288 rcu_assign_pointer(conn->channels[chan].call, call);
289 423
290 _net("CONNECT call %d on conn %d", call->debug_id, conn->debug_id); 424 if (candidate) {
425 trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate);
426 rxrpc_put_connection(candidate);
427 candidate = NULL;
428 }
291 429
430 spin_lock(&conn->channel_lock);
431 call->conn = conn;
432 call->security_ix = conn->security_ix;
433 list_add(&call->chan_wait_link, &conn->waiting_calls);
292 spin_unlock(&conn->channel_lock); 434 spin_unlock(&conn->channel_lock);
435 _leave(" = 0 [extant %d]", conn->debug_id);
436 return 0;
437
438error_peer:
293 rxrpc_put_peer(cp->peer); 439 rxrpc_put_peer(cp->peer);
294 cp->peer = NULL; 440 cp->peer = NULL;
295 _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
296 return 0;
 441error:
 442 _leave(" = %d", ret);
443 return ret;
444}
445
446/*
447 * Activate a connection.
448 */
449static void rxrpc_activate_conn(struct rxrpc_connection *conn)
450{
451 trace_rxrpc_client(conn, -1, rxrpc_client_to_active);
452 conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE;
453 rxrpc_nr_active_client_conns++;
454 list_move_tail(&conn->cache_link, &rxrpc_active_client_conns);
455}
456
457/*
458 * Attempt to animate a connection for a new call.
459 *
460 * If it's not exclusive, the connection is in the endpoint tree, and we're in
461 * the conn's list of those waiting to grab a channel. There is, however, a
462 * limit on the number of live connections allowed at any one time, so we may
463 * have to wait for capacity to become available.
464 *
465 * Note that a connection on the waiting queue might *also* have active
466 * channels if it has been culled to make space and then re-requested by a new
467 * call.
468 */
469static void rxrpc_animate_client_conn(struct rxrpc_connection *conn)
470{
471 unsigned int nr_conns;
472
473 _enter("%d,%d", conn->debug_id, conn->cache_state);
474
475 if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE)
476 goto out;
477
478 spin_lock(&rxrpc_client_conn_cache_lock);
479
480 nr_conns = rxrpc_nr_client_conns;
481 if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
482 trace_rxrpc_client(conn, -1, rxrpc_client_count);
483 rxrpc_nr_client_conns = nr_conns + 1;
484 }
485
486 switch (conn->cache_state) {
487 case RXRPC_CONN_CLIENT_ACTIVE:
488 case RXRPC_CONN_CLIENT_WAITING:
489 break;
490
491 case RXRPC_CONN_CLIENT_INACTIVE:
492 case RXRPC_CONN_CLIENT_CULLED:
493 case RXRPC_CONN_CLIENT_IDLE:
494 if (nr_conns >= rxrpc_max_client_connections)
495 goto wait_for_capacity;
496 goto activate_conn;
497
498 default:
499 BUG();
500 }
501
502out_unlock:
503 spin_unlock(&rxrpc_client_conn_cache_lock);
504out:
505 _leave(" [%d]", conn->cache_state);
506 return;
507
508activate_conn:
509 _debug("activate");
510 rxrpc_activate_conn(conn);
511 goto out_unlock;
512
513wait_for_capacity:
514 _debug("wait");
515 trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
516 conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
517 list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns);
518 goto out_unlock;
519}
520
521/*
522 * Deactivate a channel.
523 */
524static void rxrpc_deactivate_one_channel(struct rxrpc_connection *conn,
525 unsigned int channel)
526{
527 struct rxrpc_channel *chan = &conn->channels[channel];
528
529 rcu_assign_pointer(chan->call, NULL);
530 conn->active_chans &= ~(1 << channel);
531}
532
533/*
534 * Assign a channel to the call at the front of the queue and wake the call up.
535 * We don't increment the callNumber counter until this number has been exposed
536 * to the world.
537 */
538static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
539 unsigned int channel)
540{
541 struct rxrpc_channel *chan = &conn->channels[channel];
542 struct rxrpc_call *call = list_entry(conn->waiting_calls.next,
543 struct rxrpc_call, chan_wait_link);
544 u32 call_id = chan->call_counter + 1;
545
546 trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate);
547
548 write_lock_bh(&call->state_lock);
549 call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
550 write_unlock_bh(&call->state_lock);
551
552 rxrpc_see_call(call);
553 list_del_init(&call->chan_wait_link);
554 conn->active_chans |= 1 << channel;
555 call->peer = rxrpc_get_peer(conn->params.peer);
556 call->cid = conn->proto.cid | channel;
557 call->call_id = call_id;
558
559 _net("CONNECT call %08x:%08x as call %d on conn %d",
560 call->cid, call->call_id, call->debug_id, conn->debug_id);
297 561
298 /* We found a potentially suitable connection already in existence. If
299 * we can reuse it (ie. its usage count hasn't been reduced to 0 by the
300 * reaper), discard any candidate we may have allocated, and try to get
301 * a channel on this one, otherwise we have to replace it.
 562 /* Paired with the read barrier in rxrpc_wait_for_channel(). This
 563 * orders cid and epoch in the connection wrt to call_id without the
 564 * need to take the channel_lock.
 565 *
566 * We provisionally assign a callNumber at this point, but we don't
567 * confirm it until the call is about to be exposed.
568 *
569 * TODO: Pair with a barrier in the data_ready handler when that looks
570 * at the call ID through a connection channel.
302 */ 571 */
303found_extant_conn:
304 _debug("found conn");
305 if (!rxrpc_get_connection_maybe(conn)) {
306 set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
307 rb_replace_node(&conn->client_node,
308 &candidate->client_node,
309 &local->client_conns);
310 clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags);
311 goto attached;
 572 smp_wmb();
 573 chan->call_id = call_id;
 574 rcu_assign_pointer(chan->call, call);
 575 wake_up(&call->waitq);
 576}
 577
 578/*
 579 * Assign channels and callNumbers to waiting calls with channel_lock
 580 * held by caller.
581 */
582static void rxrpc_activate_channels_locked(struct rxrpc_connection *conn)
583{
584 u8 avail, mask;
585
586 switch (conn->cache_state) {
587 case RXRPC_CONN_CLIENT_ACTIVE:
588 mask = RXRPC_ACTIVE_CHANS_MASK;
589 break;
590 default:
591 return;
312 } 592 }
313 593
314 spin_unlock(&local->client_conns_lock); 594 while (!list_empty(&conn->waiting_calls) &&
595 (avail = ~conn->active_chans,
596 avail &= mask,
597 avail != 0))
598 rxrpc_activate_one_channel(conn, __ffs(avail));
599}
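The channel grant above is plain bit arithmetic; a tiny standalone example (assuming four channels per connection, i.e. a 0xf mask) shows the pattern:

#include <stdio.h>

int main(void)
{
	unsigned int active_chans = 0x5;	/* channels 0 and 2 already in use */
	unsigned int mask = 0xf;		/* assumed: four channels per connection */
	unsigned int avail = ~active_chans & mask;	/* 0xa: channels 1 and 3 free */

	while (avail) {
		int channel = __builtin_ctz(avail);	/* lowest free bit, like __ffs() */

		active_chans |= 1u << channel;
		avail = ~active_chans & mask;
		printf("granted channel %d, active mask now %#x\n",
		       channel, active_chans);
	}
	return 0;
}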
600
601/*
602 * Assign channels and callNumbers to waiting calls.
603 */
604static void rxrpc_activate_channels(struct rxrpc_connection *conn)
605{
606 _enter("%d", conn->debug_id);
315 607
316 rxrpc_put_connection(candidate); 608 trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans);
609
610 if (conn->active_chans == RXRPC_ACTIVE_CHANS_MASK)
611 return;
612
613 spin_lock(&conn->channel_lock);
614 rxrpc_activate_channels_locked(conn);
615 spin_unlock(&conn->channel_lock);
616 _leave("");
617}
618
619/*
620 * Wait for a callNumber and a channel to be granted to a call.
621 */
622static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp)
623{
624 int ret = 0;
625
626 _enter("%d", call->debug_id);
627
628 if (!call->call_id) {
629 DECLARE_WAITQUEUE(myself, current);
317 630
318 if (!atomic_add_unless(&conn->avail_chans, -1, 0)) {
319 if (!gfpflags_allow_blocking(gfp)) { 631 if (!gfpflags_allow_blocking(gfp)) {
320 rxrpc_put_connection(conn);
321 _leave(" = -EAGAIN");
322 return -EAGAIN;
 632 ret = -EAGAIN;
 633 goto out;
323 } 634 }
324 635
325 add_wait_queue(&conn->channel_wq, &myself); 636 add_wait_queue_exclusive(&call->waitq, &myself);
326 for (;;) { 637 for (;;) {
327 set_current_state(TASK_INTERRUPTIBLE); 638 set_current_state(TASK_INTERRUPTIBLE);
328 if (atomic_add_unless(&conn->avail_chans, -1, 0)) 639 if (call->call_id)
640 break;
641 if (signal_pending(current)) {
642 ret = -ERESTARTSYS;
329 break; 643 break;
330 if (signal_pending(current))
331 goto interrupted;
 644 }
332 schedule(); 645 schedule();
333 } 646 }
334 remove_wait_queue(&conn->channel_wq, &myself);
 647 remove_wait_queue(&call->waitq, &myself);
335 __set_current_state(TASK_RUNNING); 648 __set_current_state(TASK_RUNNING);
336 } 649 }
337 650
338 /* The connection allegedly now has a free channel and we can now 651 /* Paired with the write barrier in rxrpc_activate_one_channel(). */
339 * attach the call to it. 652 smp_rmb();
340 */ 653
654out:
655 _leave(" = %d", ret);
656 return ret;
657}
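For readers less familiar with the smp_wmb()/smp_rmb() pairing used above, the same publish/consume idea can be written with userspace C11 atomics; this is only an analogue with invented names (chan_setup, call_id stand in for the channel fields), not the kernel code:

#include <stdatomic.h>
#include <stdio.h>

static int chan_setup;			/* stands in for chan->call, cid, etc. */
static _Atomic unsigned int call_id;	/* stands in for chan->call_id */

static void activate_channel(void)	/* producer side (cf. the smp_wmb()) */
{
	chan_setup = 42;		/* publish the channel state first... */
	atomic_store_explicit(&call_id, 7, memory_order_release);
}

static void wait_for_channel(void)	/* consumer side (cf. the smp_rmb()) */
{
	if (atomic_load_explicit(&call_id, memory_order_acquire))
		printf("call %u may rely on chan_setup=%d\n", 7u, chan_setup);
}

int main(void)
{
	activate_channel();
	wait_for_channel();
	return 0;
}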
658
659/*
660 * find a connection for a call
661 * - called in process context with IRQs enabled
662 */
663int rxrpc_connect_call(struct rxrpc_call *call,
664 struct rxrpc_conn_parameters *cp,
665 struct sockaddr_rxrpc *srx,
666 gfp_t gfp)
667{
668 int ret;
669
670 _enter("{%d,%lx},", call->debug_id, call->user_call_ID);
671
672 rxrpc_discard_expired_client_conns(NULL);
673 rxrpc_cull_active_client_conns();
674
675 ret = rxrpc_get_client_conn(call, cp, srx, gfp);
676 if (ret < 0)
677 return ret;
678
679 rxrpc_animate_client_conn(call->conn);
680 rxrpc_activate_channels(call->conn);
681
682 ret = rxrpc_wait_for_channel(call, gfp);
683 if (ret < 0)
684 rxrpc_disconnect_client_call(call);
685
686 _leave(" = %d", ret);
687 return ret;
688}
689
690/*
691 * Note that a connection is about to be exposed to the world. Once it is
692 * exposed, we maintain an extra ref on it that stops it from being summarily
693 * discarded before it's (a) had a chance to deal with retransmission and (b)
694 * had a chance at re-use (the per-connection security negotiation is
695 * expensive).
696 */
697static void rxrpc_expose_client_conn(struct rxrpc_connection *conn,
698 unsigned int channel)
699{
700 if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
701 trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
702 rxrpc_get_connection(conn);
703 }
704}
705
706/*
707 * Note that a call, and thus a connection, is about to be exposed to the
708 * world.
709 */
710void rxrpc_expose_client_call(struct rxrpc_call *call)
711{
712 unsigned int channel = call->cid & RXRPC_CHANNELMASK;
713 struct rxrpc_connection *conn = call->conn;
714 struct rxrpc_channel *chan = &conn->channels[channel];
715
716 if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
717 /* Mark the call ID as being used. If the callNumber counter
718 * exceeds ~2 billion, we kill the connection after its
719 * outstanding calls have finished so that the counter doesn't
720 * wrap.
721 */
722 chan->call_counter++;
723 if (chan->call_counter >= INT_MAX)
724 set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
725 rxrpc_expose_client_conn(conn, channel);
726 }
727}
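A quick standalone illustration of the counter guard described above; the INT_MAX cut-off comes from this hunk, everything else is made up for the example:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned int call_counter = INT_MAX - 1;	/* pretend the channel is ancient */
	bool dont_reuse = false;

	call_counter++;				/* the call now being exposed */
	if (call_counter >= INT_MAX)
		dont_reuse = true;		/* cf. RXRPC_CONN_DONT_REUSE */
	printf("call_counter=%u dont_reuse=%d\n", call_counter, dont_reuse);
	return 0;
}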
728
729/*
730 * Disconnect a client call.
731 */
732void rxrpc_disconnect_client_call(struct rxrpc_call *call)
733{
734 unsigned int channel = call->cid & RXRPC_CHANNELMASK;
735 struct rxrpc_connection *conn = call->conn;
736 struct rxrpc_channel *chan = &conn->channels[channel];
737
738 trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
739 call->conn = NULL;
740
341 spin_lock(&conn->channel_lock); 741 spin_lock(&conn->channel_lock);
342 742
343 for (chan = 0; chan < RXRPC_MAXCALLS; chan++)
344 if (!conn->channels[chan].call)
345 goto found_channel;
346 BUG();
 743 /* Calls that have never actually been assigned a channel can simply be
 744 * discarded. If the conn didn't get used either, it will follow
 745 * immediately unless someone else grabs it in the meantime.
 746 */
747 if (!list_empty(&call->chan_wait_link)) {
748 _debug("call is waiting");
749 ASSERTCMP(call->call_id, ==, 0);
750 ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags));
751 list_del_init(&call->chan_wait_link);
752
753 trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted);
754
755 /* We must deactivate or idle the connection if it's now
756 * waiting for nothing.
757 */
758 spin_lock(&rxrpc_client_conn_cache_lock);
759 if (conn->cache_state == RXRPC_CONN_CLIENT_WAITING &&
760 list_empty(&conn->waiting_calls) &&
761 !conn->active_chans)
762 goto idle_connection;
763 goto out;
764 }
765
766 ASSERTCMP(rcu_access_pointer(chan->call), ==, call);
767
768 /* If a client call was exposed to the world, we save the result for
769 * retransmission.
770 *
771 * We use a barrier here so that the call number and abort code can be
772 * read without needing to take a lock.
773 *
774 * TODO: Make the incoming packet handler check this and handle
775 * terminal retransmission without requiring access to the call.
776 */
777 if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
778 _debug("exposed %u,%u", call->call_id, call->abort_code);
779 __rxrpc_disconnect_call(conn, call);
780 }
781
782 /* See if we can pass the channel directly to another call. */
783 if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE &&
784 !list_empty(&conn->waiting_calls)) {
785 trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
786 rxrpc_activate_one_channel(conn, channel);
787 goto out_2;
788 }
789
790 /* Things are more complex and we need the cache lock. We might be
791 * able to simply idle the conn or it might now be lurking on the wait
792 * list. It might even get moved back to the active list whilst we're
793 * waiting for the lock.
794 */
795 spin_lock(&rxrpc_client_conn_cache_lock);
796
797 switch (conn->cache_state) {
798 case RXRPC_CONN_CLIENT_ACTIVE:
799 if (list_empty(&conn->waiting_calls)) {
800 rxrpc_deactivate_one_channel(conn, channel);
801 if (!conn->active_chans) {
802 rxrpc_nr_active_client_conns--;
803 goto idle_connection;
804 }
805 goto out;
806 }
807
808 trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
809 rxrpc_activate_one_channel(conn, channel);
810 goto out;
347 811
348interrupted:
349 remove_wait_queue(&conn->channel_wq, &myself);
350 __set_current_state(TASK_RUNNING);
 812 case RXRPC_CONN_CLIENT_CULLED:
 813 rxrpc_deactivate_one_channel(conn, channel);
 814 ASSERT(list_empty(&conn->waiting_calls));
815 if (!conn->active_chans)
816 goto idle_connection;
817 goto out;
818
819 case RXRPC_CONN_CLIENT_WAITING:
820 rxrpc_deactivate_one_channel(conn, channel);
821 goto out;
822
823 default:
824 BUG();
825 }
826
827out:
828 spin_unlock(&rxrpc_client_conn_cache_lock);
829out_2:
830 spin_unlock(&conn->channel_lock);
351 rxrpc_put_connection(conn); 831 rxrpc_put_connection(conn);
352 rxrpc_put_peer(cp->peer);
353 cp->peer = NULL;
354 _leave(" = -ERESTARTSYS");
355 return -ERESTARTSYS;
 832 _leave("");
 833 return;
 834
 835idle_connection:
836 /* As no channels remain active, the connection gets deactivated
837 * immediately or moved to the idle list for a short while.
838 */
839 if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
840 trace_rxrpc_client(conn, channel, rxrpc_client_to_idle);
841 conn->idle_timestamp = jiffies;
842 conn->cache_state = RXRPC_CONN_CLIENT_IDLE;
843 list_move_tail(&conn->cache_link, &rxrpc_idle_client_conns);
844 if (rxrpc_idle_client_conns.next == &conn->cache_link &&
845 !rxrpc_kill_all_client_conns)
846 queue_delayed_work(rxrpc_workqueue,
847 &rxrpc_client_conn_reap,
848 rxrpc_conn_idle_client_expiry);
849 } else {
850 trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive);
851 conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
852 list_del_init(&conn->cache_link);
853 }
854 goto out;
356} 855}
357 856
358/* 857/*
359 * Remove a client connection from the local endpoint's tree, thereby removing 858 * Clean up a dead client connection.
360 * it as a target for reuse for new client calls.
361 */ 859 */
362void rxrpc_unpublish_client_conn(struct rxrpc_connection *conn)
 860static struct rxrpc_connection *
861rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
363{ 862{
863 struct rxrpc_connection *next = NULL;
364 struct rxrpc_local *local = conn->params.local; 864 struct rxrpc_local *local = conn->params.local;
865 unsigned int nr_conns;
365 866
366 spin_lock(&local->client_conns_lock);
367 if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags))
368 rb_erase(&conn->client_node, &local->client_conns);
369 spin_unlock(&local->client_conns_lock);
 867 trace_rxrpc_client(conn, -1, rxrpc_client_cleanup);
 868
 869 if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) {
 870 spin_lock(&local->client_conns_lock);
871 if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS,
872 &conn->flags))
873 rb_erase(&conn->client_node, &local->client_conns);
874 spin_unlock(&local->client_conns_lock);
875 }
370 876
371 rxrpc_put_client_connection_id(conn); 877 rxrpc_put_client_connection_id(conn);
878
879 ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE);
880
881 if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
882 trace_rxrpc_client(conn, -1, rxrpc_client_uncount);
883 spin_lock(&rxrpc_client_conn_cache_lock);
884 nr_conns = --rxrpc_nr_client_conns;
885
886 if (nr_conns < rxrpc_max_client_connections &&
887 !list_empty(&rxrpc_waiting_client_conns)) {
888 next = list_entry(rxrpc_waiting_client_conns.next,
889 struct rxrpc_connection, cache_link);
890 rxrpc_get_connection(next);
891 rxrpc_activate_conn(next);
892 }
893
894 spin_unlock(&rxrpc_client_conn_cache_lock);
895 }
896
897 rxrpc_kill_connection(conn);
898 if (next)
899 rxrpc_activate_channels(next);
900
901 /* We need to get rid of the temporary ref we took upon next, but we
902 * can't call rxrpc_put_connection() recursively.
903 */
904 return next;
905}
906
907/*
908 * Clean up a dead client connections.
909 */
910void rxrpc_put_client_conn(struct rxrpc_connection *conn)
911{
912 const void *here = __builtin_return_address(0);
913 int n;
914
915 do {
916 n = atomic_dec_return(&conn->usage);
917 trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here);
918 if (n > 0)
919 return;
920 ASSERTCMP(n, >=, 0);
921
922 conn = rxrpc_put_one_client_conn(conn);
923 } while (conn);
924}
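The do/while above avoids recursing when dropping the last ref hands back another connection to put; the shape of that pattern, as a self-contained sketch with invented names, is:

#include <stdio.h>
#include <stdlib.h>

struct obj {
	int refs;
	struct obj *handoff;	/* e.g. the next waiting conn we just activated */
};

/* Drop one ref; if the object dies, return whatever it handed us to put next. */
static struct obj *put_one(struct obj *o)
{
	struct obj *next = NULL;

	if (--o->refs == 0) {
		next = o->handoff;
		free(o);
	}
	return next;
}

static void put_chain(struct obj *o)
{
	do {
		o = put_one(o);
	} while (o);		/* loop instead of recursing */
}

int main(void)
{
	struct obj *b = calloc(1, sizeof(*b));
	struct obj *a = calloc(1, sizeof(*a));

	a->refs = 1;
	a->handoff = b;
	b->refs = 1;
	put_chain(a);		/* frees a, then the handed-off b */
	printf("chain released\n");
	return 0;
}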
925
926/*
927 * Kill the longest-active client connections to make room for new ones.
928 */
929static void rxrpc_cull_active_client_conns(void)
930{
931 struct rxrpc_connection *conn;
932 unsigned int nr_conns = rxrpc_nr_client_conns;
933 unsigned int nr_active, limit;
934
935 _enter("");
936
937 ASSERTCMP(nr_conns, >=, 0);
938 if (nr_conns < rxrpc_max_client_connections) {
939 _leave(" [ok]");
940 return;
941 }
942 limit = rxrpc_reap_client_connections;
943
944 spin_lock(&rxrpc_client_conn_cache_lock);
945 nr_active = rxrpc_nr_active_client_conns;
946
947 while (nr_active > limit) {
948 ASSERT(!list_empty(&rxrpc_active_client_conns));
949 conn = list_entry(rxrpc_active_client_conns.next,
950 struct rxrpc_connection, cache_link);
951 ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE);
952
953 if (list_empty(&conn->waiting_calls)) {
954 trace_rxrpc_client(conn, -1, rxrpc_client_to_culled);
955 conn->cache_state = RXRPC_CONN_CLIENT_CULLED;
956 list_del_init(&conn->cache_link);
957 } else {
958 trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
959 conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
960 list_move_tail(&conn->cache_link,
961 &rxrpc_waiting_client_conns);
962 }
963
964 nr_active--;
965 }
966
967 rxrpc_nr_active_client_conns = nr_active;
968 spin_unlock(&rxrpc_client_conn_cache_lock);
969 ASSERTCMP(nr_active, >=, 0);
970 _leave(" [culled]");
971}
972
973/*
974 * Discard expired client connections from the idle list. Each conn in the
975 * idle list has been exposed and holds an extra ref because of that.
976 *
977 * This may be called from conn setup or from a work item so cannot be
978 * considered non-reentrant.
979 */
980static void rxrpc_discard_expired_client_conns(struct work_struct *work)
981{
982 struct rxrpc_connection *conn;
983 unsigned long expiry, conn_expires_at, now;
984 unsigned int nr_conns;
985 bool did_discard = false;
986
987 _enter("%c", work ? 'w' : 'n');
988
989 if (list_empty(&rxrpc_idle_client_conns)) {
990 _leave(" [empty]");
991 return;
992 }
993
994 /* Don't double up on the discarding */
995 if (!spin_trylock(&rxrpc_client_conn_discard_mutex)) {
996 _leave(" [already]");
997 return;
998 }
999
1000 /* We keep an estimate of what the number of conns ought to be after
1001 * we've discarded some so that we don't overdo the discarding.
1002 */
1003 nr_conns = rxrpc_nr_client_conns;
1004
1005next:
1006 spin_lock(&rxrpc_client_conn_cache_lock);
1007
1008 if (list_empty(&rxrpc_idle_client_conns))
1009 goto out;
1010
1011 conn = list_entry(rxrpc_idle_client_conns.next,
1012 struct rxrpc_connection, cache_link);
1013 ASSERT(test_bit(RXRPC_CONN_EXPOSED, &conn->flags));
1014
1015 if (!rxrpc_kill_all_client_conns) {
1016 /* If the number of connections is over the reap limit, we
1017 * expedite discard by reducing the expiry timeout. We must,
1018 * however, have at least a short grace period to be able to do
1019 * final-ACK or ABORT retransmission.
1020 */
1021 expiry = rxrpc_conn_idle_client_expiry;
1022 if (nr_conns > rxrpc_reap_client_connections)
1023 expiry = rxrpc_conn_idle_client_fast_expiry;
1024
1025 conn_expires_at = conn->idle_timestamp + expiry;
1026
1027 now = READ_ONCE(jiffies);
1028 if (time_after(conn_expires_at, now))
1029 goto not_yet_expired;
1030 }
1031
1032 trace_rxrpc_client(conn, -1, rxrpc_client_discard);
1033 if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags))
1034 BUG();
1035 conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
1036 list_del_init(&conn->cache_link);
1037
1038 spin_unlock(&rxrpc_client_conn_cache_lock);
1039
1040 /* When we cleared the EXPOSED flag, we took on responsibility for the
1041 * reference that the flag had on the usage count. We deal with that here.
1042 * If someone re-sets the flag and re-gets the ref, that's fine.
1043 */
1044 rxrpc_put_connection(conn);
1045 did_discard = true;
1046 nr_conns--;
1047 goto next;
1048
1049not_yet_expired:
1050 /* The connection at the front of the queue hasn't yet expired, so
1051 * schedule the work item for that point if we discarded something.
1052 *
1053 * We don't worry if the work item is already scheduled - it can look
1054 * after rescheduling itself at a later time. We could cancel it, but
1055 * then things get messier.
1056 */
1057 _debug("not yet");
1058 if (!rxrpc_kill_all_client_conns)
1059 queue_delayed_work(rxrpc_workqueue,
1060 &rxrpc_client_conn_reap,
1061 conn_expires_at - now);
1062
1063out:
1064 spin_unlock(&rxrpc_client_conn_cache_lock);
1065 spin_unlock(&rxrpc_client_conn_discard_mutex);
1066 _leave("");
1067}
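A small worked example of the two expiry durations used above; the thresholds match the defaults set near the top of this file (2*60*HZ normal, 2*HZ fast, reap threshold 900), while HZ and the timestamps are assumptions:

#include <stdio.h>

#define HZ 250	/* assumed for the example */

int main(void)
{
	unsigned int nr_conns = 950;	/* over the reap threshold of 900 */
	unsigned long expiry = (nr_conns > 900) ? 2 * HZ : 2 * 60 * HZ;
	unsigned long idle_timestamp = 100000, now = 100300;
	unsigned long expires_at = idle_timestamp + expiry;

	if (now >= expires_at)
		printf("discard the idle conn now\n");
	else
		printf("reschedule the reaper in %lu jiffies\n", expires_at - now);
	return 0;
}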
1068
1069/*
1070 * Preemptively destroy all the client connection records rather than waiting
1071 * for them to time out
1072 */
1073void __exit rxrpc_destroy_all_client_connections(void)
1074{
1075 _enter("");
1076
1077 spin_lock(&rxrpc_client_conn_cache_lock);
1078 rxrpc_kill_all_client_conns = true;
1079 spin_unlock(&rxrpc_client_conn_cache_lock);
1080
1081 cancel_delayed_work(&rxrpc_client_conn_reap);
1082
1083 if (!queue_delayed_work(rxrpc_workqueue, &rxrpc_client_conn_reap, 0))
1084 _debug("destroy: queue failed");
1085
1086 _leave("");
372} 1087}
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index cee0f35bc1cf..3f9d8d7ec632 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -15,20 +15,128 @@
15#include <linux/net.h> 15#include <linux/net.h>
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <linux/errqueue.h> 17#include <linux/errqueue.h>
18#include <linux/udp.h>
19#include <linux/in.h>
20#include <linux/in6.h>
21#include <linux/icmp.h>
22#include <net/sock.h> 18#include <net/sock.h>
23#include <net/af_rxrpc.h> 19#include <net/af_rxrpc.h>
24#include <net/ip.h> 20#include <net/ip.h>
25#include "ar-internal.h" 21#include "ar-internal.h"
26 22
27/* 23/*
24 * Retransmit terminal ACK or ABORT of the previous call.
25 */
26static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
27 struct sk_buff *skb)
28{
29 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
30 struct rxrpc_channel *chan;
31 struct msghdr msg;
32 struct kvec iov;
33 struct {
34 struct rxrpc_wire_header whdr;
35 union {
36 struct {
37 __be32 code;
38 } abort;
39 struct {
40 struct rxrpc_ackpacket ack;
41 u8 padding[3];
42 struct rxrpc_ackinfo info;
43 };
44 };
45 } __attribute__((packed)) pkt;
46 size_t len;
47 u32 serial, mtu, call_id;
48
49 _enter("%d", conn->debug_id);
50
51 chan = &conn->channels[sp->hdr.cid & RXRPC_CHANNELMASK];
52
53 /* If the last call got moved on whilst we were waiting to run, just
54 * ignore this packet.
55 */
56 call_id = READ_ONCE(chan->last_call);
57 /* Sync with __rxrpc_disconnect_call() */
58 smp_rmb();
59 if (call_id != sp->hdr.callNumber)
60 return;
61
62 msg.msg_name = &conn->params.peer->srx.transport;
63 msg.msg_namelen = conn->params.peer->srx.transport_len;
64 msg.msg_control = NULL;
65 msg.msg_controllen = 0;
66 msg.msg_flags = 0;
67
68 pkt.whdr.epoch = htonl(sp->hdr.epoch);
69 pkt.whdr.cid = htonl(sp->hdr.cid);
70 pkt.whdr.callNumber = htonl(sp->hdr.callNumber);
71 pkt.whdr.seq = 0;
72 pkt.whdr.type = chan->last_type;
73 pkt.whdr.flags = conn->out_clientflag;
74 pkt.whdr.userStatus = 0;
75 pkt.whdr.securityIndex = conn->security_ix;
76 pkt.whdr._rsvd = 0;
77 pkt.whdr.serviceId = htons(chan->last_service_id);
78
79 len = sizeof(pkt.whdr);
80 switch (chan->last_type) {
81 case RXRPC_PACKET_TYPE_ABORT:
82 pkt.abort.code = htonl(chan->last_abort);
83 len += sizeof(pkt.abort);
84 break;
85
86 case RXRPC_PACKET_TYPE_ACK:
87 mtu = conn->params.peer->if_mtu;
88 mtu -= conn->params.peer->hdrsize;
89 pkt.ack.bufferSpace = 0;
90 pkt.ack.maxSkew = htons(skb->priority);
91 pkt.ack.firstPacket = htonl(chan->last_seq);
92 pkt.ack.previousPacket = htonl(chan->last_seq - 1);
93 pkt.ack.serial = htonl(sp->hdr.serial);
94 pkt.ack.reason = RXRPC_ACK_DUPLICATE;
95 pkt.ack.nAcks = 0;
96 pkt.info.rxMTU = htonl(rxrpc_rx_mtu);
97 pkt.info.maxMTU = htonl(mtu);
98 pkt.info.rwind = htonl(rxrpc_rx_window_size);
99 pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max);
100 pkt.whdr.flags |= RXRPC_SLOW_START_OK;
101 len += sizeof(pkt.ack) + sizeof(pkt.info);
102 break;
103 }
104
105 /* Resync with __rxrpc_disconnect_call() and check that the last call
106 * didn't get advanced whilst we were filling out the packets.
107 */
108 smp_rmb();
109 if (READ_ONCE(chan->last_call) != call_id)
110 return;
111
112 iov.iov_base = &pkt;
113 iov.iov_len = len;
114
115 serial = atomic_inc_return(&conn->serial);
116 pkt.whdr.serial = htonl(serial);
117
118 switch (chan->last_type) {
119 case RXRPC_PACKET_TYPE_ABORT:
120 _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort);
121 break;
122 case RXRPC_PACKET_TYPE_ACK:
123 trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0,
124 RXRPC_ACK_DUPLICATE, 0);
125 _proto("Tx ACK %%%u [re]", serial);
126 break;
127 }
128
129 kernel_sendmsg(conn->params.local->socket, &msg, &iov, 1, len);
130 _leave("");
131 return;
132}
133
134/*
28 * pass a connection-level abort onto all calls on that connection 135 * pass a connection-level abort onto all calls on that connection
29 */ 136 */
30static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, 137static void rxrpc_abort_calls(struct rxrpc_connection *conn,
31 u32 abort_code) 138 enum rxrpc_call_completion compl,
139 u32 abort_code, int error)
32{ 140{
33 struct rxrpc_call *call; 141 struct rxrpc_call *call;
34 int i; 142 int i;
@@ -41,19 +149,15 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
41 call = rcu_dereference_protected( 149 call = rcu_dereference_protected(
42 conn->channels[i].call, 150 conn->channels[i].call,
43 lockdep_is_held(&conn->channel_lock)); 151 lockdep_is_held(&conn->channel_lock));
44 write_lock_bh(&call->state_lock);
45 if (call->state <= RXRPC_CALL_COMPLETE) {
46 call->state = state;
47 if (state == RXRPC_CALL_LOCALLY_ABORTED) {
48 call->local_abort = conn->local_abort;
49 set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
50 } else {
51 call->remote_abort = conn->remote_abort;
52 set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
53 }
54 rxrpc_queue_call(call);
 152 if (call) {
 153 if (compl == RXRPC_CALL_LOCALLY_ABORTED)
 154 trace_rxrpc_abort("CON", call->cid,
 155 call->call_id, 0,
 156 abort_code, error);
 157 if (rxrpc_set_call_completion(call, compl,
 158 abort_code, error))
 159 rxrpc_notify_socket(call);
55 } 160 }
56 write_unlock_bh(&call->state_lock);
57 } 161 }
58 162
59 spin_unlock(&conn->channel_lock); 163 spin_unlock(&conn->channel_lock);
@@ -78,17 +182,16 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
78 182
79 /* generate a connection-level abort */ 183 /* generate a connection-level abort */
80 spin_lock_bh(&conn->state_lock); 184 spin_lock_bh(&conn->state_lock);
81 if (conn->state < RXRPC_CONN_REMOTELY_ABORTED) {
82 conn->state = RXRPC_CONN_LOCALLY_ABORTED;
83 conn->error = error;
84 spin_unlock_bh(&conn->state_lock);
85 } else {
 185 if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
86 spin_unlock_bh(&conn->state_lock); 186 spin_unlock_bh(&conn->state_lock);
87 _leave(" = 0 [already dead]"); 187 _leave(" = 0 [already dead]");
88 return 0; 188 return 0;
89 } 189 }
90 190
91 rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code);
 191 conn->state = RXRPC_CONN_LOCALLY_ABORTED;
192 spin_unlock_bh(&conn->state_lock);
193
194 rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code, error);
92 195
93 msg.msg_name = &conn->params.peer->srx.transport; 196 msg.msg_name = &conn->params.peer->srx.transport;
94 msg.msg_namelen = conn->params.peer->srx.transport_len; 197 msg.msg_namelen = conn->params.peer->srx.transport_len;
@@ -132,17 +235,18 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
132 235
133/* 236/*
134 * mark a call as being on a now-secured channel 237 * mark a call as being on a now-secured channel
135 * - must be called with softirqs disabled 238 * - must be called with BH's disabled.
136 */ 239 */
137static void rxrpc_call_is_secure(struct rxrpc_call *call) 240static void rxrpc_call_is_secure(struct rxrpc_call *call)
138{ 241{
139 _enter("%p", call); 242 _enter("%p", call);
140 if (call) { 243 if (call) {
141 read_lock(&call->state_lock);
142 if (call->state < RXRPC_CALL_COMPLETE &&
143 !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events))
144 rxrpc_queue_call(call);
145 read_unlock(&call->state_lock);
 244 write_lock_bh(&call->state_lock);
 245 if (call->state == RXRPC_CALL_SERVER_SECURING) {
 246 call->state = RXRPC_CALL_SERVER_ACCEPTING;
 247 rxrpc_notify_socket(call);
 248 }
249 write_unlock_bh(&call->state_lock);
146 } 250 }
147} 251}
148 252
@@ -159,22 +263,28 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
159 int loop, ret; 263 int loop, ret;
160 264
161 if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { 265 if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
162 kleave(" = -ECONNABORTED [%u]", conn->state); 266 _leave(" = -ECONNABORTED [%u]", conn->state);
163 return -ECONNABORTED; 267 return -ECONNABORTED;
164 } 268 }
165 269
166 _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); 270 _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial);
167 271
168 switch (sp->hdr.type) { 272 switch (sp->hdr.type) {
273 case RXRPC_PACKET_TYPE_DATA:
274 case RXRPC_PACKET_TYPE_ACK:
275 rxrpc_conn_retransmit_call(conn, skb);
276 return 0;
277
169 case RXRPC_PACKET_TYPE_ABORT: 278 case RXRPC_PACKET_TYPE_ABORT:
170 if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
 279 if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
280 &wtmp, sizeof(wtmp)) < 0)
171 return -EPROTO; 281 return -EPROTO;
172 abort_code = ntohl(wtmp); 282 abort_code = ntohl(wtmp);
173 _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); 283 _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code);
174 284
175 conn->state = RXRPC_CONN_REMOTELY_ABORTED; 285 conn->state = RXRPC_CONN_REMOTELY_ABORTED;
176 rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, 286 rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED,
177 abort_code);
 287 abort_code, ECONNABORTED);
178 return -ECONNABORTED; 288 return -ECONNABORTED;
179 289
180 case RXRPC_PACKET_TYPE_CHALLENGE: 290 case RXRPC_PACKET_TYPE_CHALLENGE:
@@ -199,14 +309,16 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
199 309
200 if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { 310 if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) {
201 conn->state = RXRPC_CONN_SERVICE; 311 conn->state = RXRPC_CONN_SERVICE;
312 spin_unlock(&conn->state_lock);
202 for (loop = 0; loop < RXRPC_MAXCALLS; loop++) 313 for (loop = 0; loop < RXRPC_MAXCALLS; loop++)
203 rxrpc_call_is_secure( 314 rxrpc_call_is_secure(
204 rcu_dereference_protected( 315 rcu_dereference_protected(
205 conn->channels[loop].call, 316 conn->channels[loop].call,
206 lockdep_is_held(&conn->channel_lock))); 317 lockdep_is_held(&conn->channel_lock)));
318 } else {
319 spin_unlock(&conn->state_lock);
207 } 320 }
208 321
209 spin_unlock(&conn->state_lock);
210 spin_unlock(&conn->channel_lock); 322 spin_unlock(&conn->channel_lock);
211 return 0; 323 return 0;
212 324
@@ -269,7 +381,7 @@ void rxrpc_process_connection(struct work_struct *work)
269 u32 abort_code = RX_PROTOCOL_ERROR; 381 u32 abort_code = RX_PROTOCOL_ERROR;
270 int ret; 382 int ret;
271 383
272 _enter("{%d}", conn->debug_id); 384 rxrpc_see_connection(conn);
273 385
274 if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) 386 if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events))
275 rxrpc_secure_connection(conn); 387 rxrpc_secure_connection(conn);
@@ -277,6 +389,7 @@ void rxrpc_process_connection(struct work_struct *work)
277 /* go through the conn-level event packets, releasing the ref on this 389 /* go through the conn-level event packets, releasing the ref on this
278 * connection that each one has when we've finished with it */ 390 * connection that each one has when we've finished with it */
279 while ((skb = skb_dequeue(&conn->rx_queue))) { 391 while ((skb = skb_dequeue(&conn->rx_queue))) {
392 rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
280 ret = rxrpc_process_event(conn, skb, &abort_code); 393 ret = rxrpc_process_event(conn, skb, &abort_code);
281 switch (ret) { 394 switch (ret) {
282 case -EPROTO: 395 case -EPROTO:
@@ -287,7 +400,7 @@ void rxrpc_process_connection(struct work_struct *work)
287 goto requeue_and_leave; 400 goto requeue_and_leave;
288 case -ECONNABORTED: 401 case -ECONNABORTED:
289 default: 402 default:
290 rxrpc_free_skb(skb); 403 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
291 break; 404 break;
292 } 405 }
293 } 406 }
@@ -304,91 +417,7 @@ requeue_and_leave:
304protocol_error: 417protocol_error:
305 if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) 418 if (rxrpc_abort_connection(conn, -ret, abort_code) < 0)
306 goto requeue_and_leave; 419 goto requeue_and_leave;
307 rxrpc_free_skb(skb); 420 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
308 _leave(" [EPROTO]"); 421 _leave(" [EPROTO]");
309 goto out; 422 goto out;
310} 423}
311
312/*
313 * put a packet up for transport-level abort
314 */
315void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
316{
317 CHECK_SLAB_OKAY(&local->usage);
318
319 skb_queue_tail(&local->reject_queue, skb);
320 rxrpc_queue_local(local);
321}
322
323/*
324 * reject packets through the local endpoint
325 */
326void rxrpc_reject_packets(struct rxrpc_local *local)
327{
328 union {
329 struct sockaddr sa;
330 struct sockaddr_in sin;
331 } sa;
332 struct rxrpc_skb_priv *sp;
333 struct rxrpc_wire_header whdr;
334 struct sk_buff *skb;
335 struct msghdr msg;
336 struct kvec iov[2];
337 size_t size;
338 __be32 code;
339
340 _enter("%d", local->debug_id);
341
342 iov[0].iov_base = &whdr;
343 iov[0].iov_len = sizeof(whdr);
344 iov[1].iov_base = &code;
345 iov[1].iov_len = sizeof(code);
346 size = sizeof(whdr) + sizeof(code);
347
348 msg.msg_name = &sa;
349 msg.msg_control = NULL;
350 msg.msg_controllen = 0;
351 msg.msg_flags = 0;
352
353 memset(&sa, 0, sizeof(sa));
354 sa.sa.sa_family = local->srx.transport.family;
355 switch (sa.sa.sa_family) {
356 case AF_INET:
357 msg.msg_namelen = sizeof(sa.sin);
358 break;
359 default:
360 msg.msg_namelen = 0;
361 break;
362 }
363
364 memset(&whdr, 0, sizeof(whdr));
365 whdr.type = RXRPC_PACKET_TYPE_ABORT;
366
367 while ((skb = skb_dequeue(&local->reject_queue))) {
368 sp = rxrpc_skb(skb);
369 switch (sa.sa.sa_family) {
370 case AF_INET:
371 sa.sin.sin_port = udp_hdr(skb)->source;
372 sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
373 code = htonl(skb->priority);
374
375 whdr.epoch = htonl(sp->hdr.epoch);
376 whdr.cid = htonl(sp->hdr.cid);
377 whdr.callNumber = htonl(sp->hdr.callNumber);
378 whdr.serviceId = htons(sp->hdr.serviceId);
379 whdr.flags = sp->hdr.flags;
380 whdr.flags ^= RXRPC_CLIENT_INITIATED;
381 whdr.flags &= RXRPC_CLIENT_INITIATED;
382
383 kernel_sendmsg(local->socket, &msg, iov, 2, size);
384 break;
385
386 default:
387 break;
388 }
389
390 rxrpc_free_skb(skb);
391 }
392
393 _leave("");
394}
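The reject path above answers an unroutable packet with an ABORT built from that packet's own header: the connection identifiers are echoed back, and the flags are reduced to a single direction bit by inverting RXRPC_CLIENT_INITIATED and then masking everything else off, so the reply travels the opposite way with no stray flags. A minimal standalone sketch of just that flag handling (the flag's bit value is assumed here purely for illustration):

#include <stdio.h>
#include <stdint.h>

#define RXRPC_CLIENT_INITIATED 0x01	/* assumed bit value, for illustration */

/* Derive the header flags for an ABORT sent back at a packet's originator. */
static uint8_t reject_flags(uint8_t rx_flags)
{
	uint8_t flags = rx_flags;

	flags ^= RXRPC_CLIENT_INITIATED;	/* reverse the direction bit */
	flags &= RXRPC_CLIENT_INITIATED;	/* drop every other flag */
	return flags;
}

int main(void)
{
	/* A client-initiated packet gets a server-originated ABORT and vice
	 * versa; request/last/jumbo bits never leak into the reply. */
	printf("%#x -> %#x\n", 0x0b, reject_flags(0x0b));
	printf("%#x -> %#x\n", 0x0a, reject_flags(0x0a));
	return 0;
}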
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 896d84493a05..e1e83af47866 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -1,6 +1,6 @@
1/* RxRPC virtual connection handler 1/* RxRPC virtual connection handler, common bits.
2 * 2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -15,8 +15,6 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/skbuff.h> 17#include <linux/skbuff.h>
18#include <net/sock.h>
19#include <net/af_rxrpc.h>
20#include "ar-internal.h" 18#include "ar-internal.h"
21 19
22/* 20/*
@@ -27,9 +25,12 @@ unsigned int rxrpc_connection_expiry = 10 * 60;
27static void rxrpc_connection_reaper(struct work_struct *work); 25static void rxrpc_connection_reaper(struct work_struct *work);
28 26
29LIST_HEAD(rxrpc_connections); 27LIST_HEAD(rxrpc_connections);
28LIST_HEAD(rxrpc_connection_proc_list);
30DEFINE_RWLOCK(rxrpc_connection_lock); 29DEFINE_RWLOCK(rxrpc_connection_lock);
31static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); 30static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper);
32 31
32static void rxrpc_destroy_connection(struct rcu_head *);
33
33/* 34/*
34 * allocate a new connection 35 * allocate a new connection
35 */ 36 */
@@ -41,21 +42,18 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
41 42
42 conn = kzalloc(sizeof(struct rxrpc_connection), gfp); 43 conn = kzalloc(sizeof(struct rxrpc_connection), gfp);
43 if (conn) { 44 if (conn) {
45 INIT_LIST_HEAD(&conn->cache_link);
44 spin_lock_init(&conn->channel_lock); 46 spin_lock_init(&conn->channel_lock);
45 init_waitqueue_head(&conn->channel_wq); 47 INIT_LIST_HEAD(&conn->waiting_calls);
46 INIT_WORK(&conn->processor, &rxrpc_process_connection); 48 INIT_WORK(&conn->processor, &rxrpc_process_connection);
49 INIT_LIST_HEAD(&conn->proc_link);
47 INIT_LIST_HEAD(&conn->link); 50 INIT_LIST_HEAD(&conn->link);
48 skb_queue_head_init(&conn->rx_queue); 51 skb_queue_head_init(&conn->rx_queue);
49 conn->security = &rxrpc_no_security; 52 conn->security = &rxrpc_no_security;
50 spin_lock_init(&conn->state_lock); 53 spin_lock_init(&conn->state_lock);
51 /* We maintain an extra ref on the connection whilst it is
52 * on the rxrpc_connections list.
53 */
54 atomic_set(&conn->usage, 2);
55 conn->debug_id = atomic_inc_return(&rxrpc_debug_id); 54 conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
56 atomic_set(&conn->avail_chans, RXRPC_MAXCALLS);
57 conn->size_align = 4; 55 conn->size_align = 4;
58 conn->header_size = sizeof(struct rxrpc_wire_header); 56 conn->idle_timestamp = jiffies;
59 } 57 }
60 58
61 _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0); 59 _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0);
@@ -135,6 +133,16 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local,
135 srx.transport.sin.sin_addr.s_addr) 133 srx.transport.sin.sin_addr.s_addr)
136 goto not_found; 134 goto not_found;
137 break; 135 break;
136#ifdef CONFIG_AF_RXRPC_IPV6
137 case AF_INET6:
138 if (peer->srx.transport.sin6.sin6_port !=
139 srx.transport.sin6.sin6_port ||
140 memcmp(&peer->srx.transport.sin6.sin6_addr,
141 &srx.transport.sin6.sin6_addr,
142 sizeof(struct in6_addr)) != 0)
143 goto not_found;
144 break;
145#endif
138 default: 146 default:
139 BUG(); 147 BUG();
140 } 148 }
@@ -153,25 +161,32 @@ not_found:
153 * terminates. The caller must hold the channel_lock and must release the 161 * terminates. The caller must hold the channel_lock and must release the
154 * call's ref on the connection. 162 * call's ref on the connection.
155 */ 163 */
156void __rxrpc_disconnect_call(struct rxrpc_call *call) 164void __rxrpc_disconnect_call(struct rxrpc_connection *conn,
165 struct rxrpc_call *call)
157{ 166{
158 struct rxrpc_connection *conn = call->conn; 167 struct rxrpc_channel *chan =
159 struct rxrpc_channel *chan = &conn->channels[call->channel]; 168 &conn->channels[call->cid & RXRPC_CHANNELMASK];
160 169
161 _enter("%d,%d", conn->debug_id, call->channel); 170 _enter("%d,%x", conn->debug_id, call->cid);
162 171
163 if (rcu_access_pointer(chan->call) == call) { 172 if (rcu_access_pointer(chan->call) == call) {
164 /* Save the result of the call so that we can repeat it if necessary 173 /* Save the result of the call so that we can repeat it if necessary
165 * through the channel, whilst disposing of the actual call record. 174 * through the channel, whilst disposing of the actual call record.
166 */ 175 */
167 chan->last_result = call->local_abort; 176 chan->last_service_id = call->service_id;
177 if (call->abort_code) {
178 chan->last_abort = call->abort_code;
179 chan->last_type = RXRPC_PACKET_TYPE_ABORT;
180 } else {
181 chan->last_seq = call->rx_hard_ack;
182 chan->last_type = RXRPC_PACKET_TYPE_ACK;
183 }
184 /* Sync with rxrpc_conn_retransmit(). */
168 smp_wmb(); 185 smp_wmb();
169 chan->last_call = chan->call_id; 186 chan->last_call = chan->call_id;
170 chan->call_id = chan->call_counter; 187 chan->call_id = chan->call_counter;
171 188
172 rcu_assign_pointer(chan->call, NULL); 189 rcu_assign_pointer(chan->call, NULL);
173 atomic_inc(&conn->avail_chans);
174 wake_up(&conn->channel_wq);
175 } 190 }
176 191
177 _leave(""); 192 _leave("");
@@ -185,34 +200,122 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
185{ 200{
186 struct rxrpc_connection *conn = call->conn; 201 struct rxrpc_connection *conn = call->conn;
187 202
203 spin_lock_bh(&conn->params.peer->lock);
204 hlist_del_init(&call->error_link);
205 spin_unlock_bh(&conn->params.peer->lock);
206
207 if (rxrpc_is_client_call(call))
208 return rxrpc_disconnect_client_call(call);
209
188 spin_lock(&conn->channel_lock); 210 spin_lock(&conn->channel_lock);
189 __rxrpc_disconnect_call(call); 211 __rxrpc_disconnect_call(conn, call);
190 spin_unlock(&conn->channel_lock); 212 spin_unlock(&conn->channel_lock);
191 213
192 call->conn = NULL; 214 call->conn = NULL;
215 conn->idle_timestamp = jiffies;
193 rxrpc_put_connection(conn); 216 rxrpc_put_connection(conn);
194} 217}
195 218
196/* 219/*
197 * release a virtual connection 220 * Kill off a connection.
198 */ 221 */
199void rxrpc_put_connection(struct rxrpc_connection *conn) 222void rxrpc_kill_connection(struct rxrpc_connection *conn)
200{ 223{
201 if (!conn) 224 ASSERT(!rcu_access_pointer(conn->channels[0].call) &&
202 return; 225 !rcu_access_pointer(conn->channels[1].call) &&
226 !rcu_access_pointer(conn->channels[2].call) &&
227 !rcu_access_pointer(conn->channels[3].call));
228 ASSERT(list_empty(&conn->cache_link));
203 229
204 _enter("%p{u=%d,d=%d}", 230 write_lock(&rxrpc_connection_lock);
205 conn, atomic_read(&conn->usage), conn->debug_id); 231 list_del_init(&conn->proc_link);
232 write_unlock(&rxrpc_connection_lock);
206 233
207 ASSERTCMP(atomic_read(&conn->usage), >, 1); 234 /* Drain the Rx queue. Note that even though we've unpublished, an
235 * incoming packet could still be being added to our Rx queue, so we
236 * will need to drain it again in the RCU cleanup handler.
237 */
238 rxrpc_purge_queue(&conn->rx_queue);
208 239
209 conn->put_time = ktime_get_seconds(); 240 /* Leave final destruction to RCU. The connection processor work item
210 if (atomic_dec_return(&conn->usage) == 1) { 241 * must carry a ref on the connection to prevent us getting here whilst
211 _debug("zombie"); 242 * it is queued or running.
212 rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); 243 */
244 call_rcu(&conn->rcu, rxrpc_destroy_connection);
245}
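rxrpc_kill_connection() follows the usual RCU teardown order: unpublish the object from every list that lookups walk, flush any queue that might still point at it, and only then let call_rcu() free it once every reader that found it before unpublication has finished. A kernel-style sketch of that pattern on a hypothetical object (the struct, list and lock here are illustrative, not rxrpc's own):

#include <linux/kernel.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	struct list_head link;	/* on foo_list, protected by foo_lock */
	struct rcu_head rcu;
};

static LIST_HEAD(foo_list);
static DEFINE_SPINLOCK(foo_lock);

static void foo_free_rcu(struct rcu_head *rcu)
{
	/* Runs after a grace period: no RCU reader can still see the object. */
	kfree(container_of(rcu, struct foo, rcu));
}

static void foo_kill(struct foo *foo)
{
	/* 1. Unpublish so that new lookups can no longer find it. */
	spin_lock(&foo_lock);
	list_del_rcu(&foo->link);
	spin_unlock(&foo_lock);

	/* 2. Defer the free until readers that already found it are done. */
	call_rcu(&foo->rcu, foo_free_rcu);
}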
246
247/*
248 * Queue a connection's work processor, getting a ref to pass to the work
249 * queue.
250 */
251bool rxrpc_queue_conn(struct rxrpc_connection *conn)
252{
253 const void *here = __builtin_return_address(0);
254 int n = __atomic_add_unless(&conn->usage, 1, 0);
255 if (n == 0)
256 return false;
257 if (rxrpc_queue_work(&conn->processor))
258 trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here);
259 else
260 rxrpc_put_connection(conn);
261 return true;
262}
263
264/*
265 * Note the re-emergence of a connection.
266 */
267void rxrpc_see_connection(struct rxrpc_connection *conn)
268{
269 const void *here = __builtin_return_address(0);
270 if (conn) {
271 int n = atomic_read(&conn->usage);
272
273 trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here);
274 }
275}
276
277/*
278 * Get a ref on a connection.
279 */
280void rxrpc_get_connection(struct rxrpc_connection *conn)
281{
282 const void *here = __builtin_return_address(0);
283 int n = atomic_inc_return(&conn->usage);
284
285 trace_rxrpc_conn(conn, rxrpc_conn_got, n, here);
286}
287
288/*
289 * Try to get a ref on a connection.
290 */
291struct rxrpc_connection *
292rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
293{
294 const void *here = __builtin_return_address(0);
295
296 if (conn) {
297 int n = __atomic_add_unless(&conn->usage, 1, 0);
298 if (n > 0)
299 trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here);
300 else
301 conn = NULL;
213 } 302 }
303 return conn;
304}
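The _maybe variant depends on __atomic_add_unless(): the usage count is bumped only if it is not already zero, so a connection that has started dying can never be brought back to life by a late lookup. A standalone sketch of the same conditional-increment idiom in C11 atomics (names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Add `a` to *v unless *v equals `unless`; return the old value.  This is
 * the behaviour the code above relies on from __atomic_add_unless(). */
static int add_unless(atomic_int *v, int a, int unless)
{
	int old = atomic_load(v);

	while (old != unless) {
		if (atomic_compare_exchange_weak(v, &old, old + a))
			break;
	}
	return old;
}

struct obj { atomic_int usage; };

/* Take a reference only if the object is still live (usage > 0). */
static bool get_maybe(struct obj *obj)
{
	return add_unless(&obj->usage, 1, 0) != 0;
}

int main(void)
{
	struct obj live = { 2 }, dying = { 0 };

	printf("live:  %d, usage now %d\n", get_maybe(&live),
	       atomic_load(&live.usage));
	printf("dying: %d, usage now %d\n", get_maybe(&dying),
	       atomic_load(&dying.usage));
	return 0;
}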
214 305
215 _leave(""); 306/*
307 * Release a service connection
308 */
309void rxrpc_put_service_conn(struct rxrpc_connection *conn)
310{
311 const void *here = __builtin_return_address(0);
312 int n;
313
314 n = atomic_dec_return(&conn->usage);
315 trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here);
316 ASSERTCMP(n, >=, 0);
317 if (n == 0)
318 rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
216} 319}
217 320
218/* 321/*
@@ -242,19 +345,19 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu)
242} 345}
243 346
244/* 347/*
245 * reap dead connections 348 * reap dead service connections
246 */ 349 */
247static void rxrpc_connection_reaper(struct work_struct *work) 350static void rxrpc_connection_reaper(struct work_struct *work)
248{ 351{
249 struct rxrpc_connection *conn, *_p; 352 struct rxrpc_connection *conn, *_p;
250 unsigned long reap_older_than, earliest, put_time, now; 353 unsigned long reap_older_than, earliest, idle_timestamp, now;
251 354
252 LIST_HEAD(graveyard); 355 LIST_HEAD(graveyard);
253 356
254 _enter(""); 357 _enter("");
255 358
256 now = ktime_get_seconds(); 359 now = jiffies;
257 reap_older_than = now - rxrpc_connection_expiry; 360 reap_older_than = now - rxrpc_connection_expiry * HZ;
258 earliest = ULONG_MAX; 361 earliest = ULONG_MAX;
259 362
260 write_lock(&rxrpc_connection_lock); 363 write_lock(&rxrpc_connection_lock);
@@ -262,11 +365,17 @@ static void rxrpc_connection_reaper(struct work_struct *work)
262 ASSERTCMP(atomic_read(&conn->usage), >, 0); 365 ASSERTCMP(atomic_read(&conn->usage), >, 0);
263 if (likely(atomic_read(&conn->usage) > 1)) 366 if (likely(atomic_read(&conn->usage) > 1))
264 continue; 367 continue;
368 if (conn->state == RXRPC_CONN_SERVICE_PREALLOC)
369 continue;
370
371 idle_timestamp = READ_ONCE(conn->idle_timestamp);
372 _debug("reap CONN %d { u=%d,t=%ld }",
373 conn->debug_id, atomic_read(&conn->usage),
374 (long)reap_older_than - (long)idle_timestamp);
265 375
266 put_time = READ_ONCE(conn->put_time); 376 if (time_after(idle_timestamp, reap_older_than)) {
267 if (time_after(put_time, reap_older_than)) { 377 if (time_before(idle_timestamp, earliest))
268 if (time_before(put_time, earliest)) 378 earliest = idle_timestamp;
269 earliest = put_time;
270 continue; 379 continue;
271 } 380 }
272 381
@@ -277,7 +386,7 @@ static void rxrpc_connection_reaper(struct work_struct *work)
277 continue; 386 continue;
278 387
279 if (rxrpc_conn_is_client(conn)) 388 if (rxrpc_conn_is_client(conn))
280 rxrpc_unpublish_client_conn(conn); 389 BUG();
281 else 390 else
282 rxrpc_unpublish_service_conn(conn); 391 rxrpc_unpublish_service_conn(conn);
283 392
@@ -287,9 +396,9 @@ static void rxrpc_connection_reaper(struct work_struct *work)
287 396
288 if (earliest != ULONG_MAX) { 397 if (earliest != ULONG_MAX) {
289 _debug("reschedule reaper %ld", (long) earliest - now); 398 _debug("reschedule reaper %ld", (long) earliest - now);
290 ASSERTCMP(earliest, >, now); 399 ASSERT(time_after(earliest, now));
291 rxrpc_queue_delayed_work(&rxrpc_connection_reap, 400 rxrpc_queue_delayed_work(&rxrpc_connection_reap,
292 (earliest - now) * HZ); 401 earliest - now);
293 } 402 }
294 403
295 while (!list_empty(&graveyard)) { 404 while (!list_empty(&graveyard)) {
@@ -298,16 +407,15 @@ static void rxrpc_connection_reaper(struct work_struct *work)
298 list_del_init(&conn->link); 407 list_del_init(&conn->link);
299 408
300 ASSERTCMP(atomic_read(&conn->usage), ==, 0); 409 ASSERTCMP(atomic_read(&conn->usage), ==, 0);
301 skb_queue_purge(&conn->rx_queue); 410 rxrpc_kill_connection(conn);
302 call_rcu(&conn->rcu, rxrpc_destroy_connection);
303 } 411 }
304 412
305 _leave(""); 413 _leave("");
306} 414}
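The reaper works entirely in jiffies: a connection becomes a candidate once nothing but the list's own reference is left, it is actually reaped once its idle_timestamp has fallen behind now minus the expiry period, and the earliest surviving timestamp tells the reaper when to schedule itself next. A standalone model of that bookkeeping using the wrap-safe comparison trick behind time_after() (constants and the reschedule calculation are simplified for illustration):

#include <limits.h>
#include <stdio.h>

/* Wrap-safe "a is after b", as the kernel's time_after() does it. */
#define time_after(a, b)	((long)((b) - (a)) < 0)
#define time_before(a, b)	time_after(b, a)

#define HZ	100UL
#define EXPIRY	(10 * 60)	/* ten minutes, as above */

struct conn { unsigned long idle_timestamp; int usage; };

static void reap(struct conn *conns, int n, unsigned long now)
{
	unsigned long reap_older_than = now - EXPIRY * HZ;
	unsigned long earliest = ULONG_MAX;

	for (int i = 0; i < n; i++) {
		if (conns[i].usage > 1)
			continue;	/* still in use */
		if (time_after(conns[i].idle_timestamp, reap_older_than)) {
			if (time_before(conns[i].idle_timestamp, earliest))
				earliest = conns[i].idle_timestamp;
			continue;	/* idle, but not for long enough yet */
		}
		printf("reap conn %d\n", i);
	}
	if (earliest != ULONG_MAX)
		printf("run again in %ld ticks\n",
		       (long)(earliest + EXPIRY * HZ - now));
}

int main(void)
{
	unsigned long now = 1000000;
	struct conn conns[] = {
		{ now - 70 * 60 * HZ, 1 },	/* idle 70 min: reap */
		{ now -  1 * 60 * HZ, 1 },	/* idle 1 min: keep */
		{ now - 99 * 60 * HZ, 3 },	/* old but still in use: keep */
	};

	reap(conns, 3, now);
	return 0;
}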
307 415
308/* 416/*
309 * preemptively destroy all the connection records rather than waiting for them 417 * preemptively destroy all the service connection records rather than
310 * to time out 418 * waiting for them to time out
311 */ 419 */
312void __exit rxrpc_destroy_all_connections(void) 420void __exit rxrpc_destroy_all_connections(void)
313{ 421{
@@ -316,6 +424,8 @@ void __exit rxrpc_destroy_all_connections(void)
316 424
317 _enter(""); 425 _enter("");
318 426
427 rxrpc_destroy_all_client_connections();
428
319 rxrpc_connection_expiry = 0; 429 rxrpc_connection_expiry = 0;
320 cancel_delayed_work(&rxrpc_connection_reap); 430 cancel_delayed_work(&rxrpc_connection_reap);
321 rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); 431 rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
@@ -330,6 +440,8 @@ void __exit rxrpc_destroy_all_connections(void)
330 write_unlock(&rxrpc_connection_lock); 440 write_unlock(&rxrpc_connection_lock);
331 BUG_ON(leak); 441 BUG_ON(leak);
332 442
443 ASSERT(list_empty(&rxrpc_connection_proc_list));
444
333 /* Make sure the local and peer records pinned by any dying connections 445 /* Make sure the local and peer records pinned by any dying connections
334 * are released. 446 * are released.
335 */ 447 */
diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c
index fd9027ccba8f..eef551f40dc2 100644
--- a/net/rxrpc/conn_service.c
+++ b/net/rxrpc/conn_service.c
@@ -65,9 +65,8 @@ done:
65 * Insert a service connection into a peer's tree, thereby making it a target 65 * Insert a service connection into a peer's tree, thereby making it a target
66 * for incoming packets. 66 * for incoming packets.
67 */ 67 */
68static struct rxrpc_connection * 68static void rxrpc_publish_service_conn(struct rxrpc_peer *peer,
69rxrpc_publish_service_conn(struct rxrpc_peer *peer, 69 struct rxrpc_connection *conn)
70 struct rxrpc_connection *conn)
71{ 70{
72 struct rxrpc_connection *cursor = NULL; 71 struct rxrpc_connection *cursor = NULL;
73 struct rxrpc_conn_proto k = conn->proto; 72 struct rxrpc_conn_proto k = conn->proto;
@@ -96,7 +95,7 @@ conn_published:
96 set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags); 95 set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags);
97 write_sequnlock_bh(&peer->service_conn_lock); 96 write_sequnlock_bh(&peer->service_conn_lock);
98 _leave(" = %d [new]", conn->debug_id); 97 _leave(" = %d [new]", conn->debug_id);
99 return conn; 98 return;
100 99
101found_extant_conn: 100found_extant_conn:
102 if (atomic_read(&cursor->usage) == 0) 101 if (atomic_read(&cursor->usage) == 0)
@@ -119,100 +118,58 @@ replace_old_connection:
119} 118}
120 119
121/* 120/*
122 * get a record of an incoming connection 121 * Preallocate a service connection. The connection is placed on the proc and
122 * reap lists so that we don't have to get the lock from BH context.
123 */ 123 */
124struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, 124struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp)
125 struct sockaddr_rxrpc *srx,
126 struct sk_buff *skb)
127{ 125{
128 struct rxrpc_connection *conn; 126 struct rxrpc_connection *conn = rxrpc_alloc_connection(gfp);
129 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
130 struct rxrpc_peer *peer;
131 const char *new = "old";
132
133 _enter("");
134 127
135 peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); 128 if (conn) {
136 if (!peer) { 129 /* We maintain an extra ref on the connection whilst it is on
137 _debug("no peer"); 130 * the rxrpc_connections list.
138 return ERR_PTR(-EBUSY); 131 */
139 } 132 conn->state = RXRPC_CONN_SERVICE_PREALLOC;
133 atomic_set(&conn->usage, 2);
140 134
141 ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED); 135 write_lock(&rxrpc_connection_lock);
142 136 list_add_tail(&conn->link, &rxrpc_connections);
143 rcu_read_lock(); 137 list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list);
144 peer = rxrpc_lookup_peer_rcu(local, srx); 138 write_unlock(&rxrpc_connection_lock);
145 if (peer) {
146 conn = rxrpc_find_service_conn_rcu(peer, skb);
147 if (conn) {
148 if (sp->hdr.securityIndex != conn->security_ix)
149 goto security_mismatch_rcu;
150 if (rxrpc_get_connection_maybe(conn))
151 goto found_extant_connection_rcu;
152
153 /* The conn has expired but we can't remove it without
154 * the appropriate lock, so we attempt to replace it
155 * when we have a new candidate.
156 */
157 }
158 139
159 if (!rxrpc_get_peer_maybe(peer)) 140 trace_rxrpc_conn(conn, rxrpc_conn_new_service,
160 peer = NULL; 141 atomic_read(&conn->usage),
142 __builtin_return_address(0));
161 } 143 }
162 rcu_read_unlock();
163 144
164 if (!peer) { 145 return conn;
165 peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); 146}
166 if (!peer)
167 goto enomem;
168 }
169 147
170 /* We don't have a matching record yet. */ 148/*
171 conn = rxrpc_alloc_connection(GFP_NOIO); 149 * Set up an incoming connection. This is called in BH context with the RCU
172 if (!conn) 150 * read lock held.
173 goto enomem_peer; 151 */
152void rxrpc_new_incoming_connection(struct rxrpc_connection *conn,
153 struct sk_buff *skb)
154{
155 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
156
157 _enter("");
174 158
175 conn->proto.epoch = sp->hdr.epoch; 159 conn->proto.epoch = sp->hdr.epoch;
176 conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; 160 conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK;
177 conn->params.local = local;
178 conn->params.peer = peer;
179 conn->params.service_id = sp->hdr.serviceId; 161 conn->params.service_id = sp->hdr.serviceId;
180 conn->security_ix = sp->hdr.securityIndex; 162 conn->security_ix = sp->hdr.securityIndex;
181 conn->out_clientflag = 0; 163 conn->out_clientflag = 0;
182 conn->state = RXRPC_CONN_SERVICE; 164 if (conn->security_ix)
183 if (conn->params.service_id)
184 conn->state = RXRPC_CONN_SERVICE_UNSECURED; 165 conn->state = RXRPC_CONN_SERVICE_UNSECURED;
185 166 else
186 rxrpc_get_local(local); 167 conn->state = RXRPC_CONN_SERVICE;
187
188 write_lock(&rxrpc_connection_lock);
189 list_add_tail(&conn->link, &rxrpc_connections);
190 write_unlock(&rxrpc_connection_lock);
191 168
192 /* Make the connection a target for incoming packets. */ 169 /* Make the connection a target for incoming packets. */
193 rxrpc_publish_service_conn(peer, conn); 170 rxrpc_publish_service_conn(conn->params.peer, conn);
194
195 new = "new";
196
197success:
198 _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid);
199 _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
200 return conn;
201
202found_extant_connection_rcu:
203 rcu_read_unlock();
204 goto success;
205
206security_mismatch_rcu:
207 rcu_read_unlock();
208 _leave(" = -EKEYREJECTED");
209 return ERR_PTR(-EKEYREJECTED);
210 171
211enomem_peer: 172 _net("CONNECTION new %d {%x}", conn->debug_id, conn->proto.cid);
212 rxrpc_put_peer(peer);
213enomem:
214 _leave(" = -ENOMEM");
215 return ERR_PTR(-ENOMEM);
216} 173}
217 174
218/* 175/*
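The preallocation scheme above splits connection setup across contexts: everything that might sleep or contend for the global lock (the allocation, the list insertions, the extra list reference) happens ahead of time in process context, so that when a first packet arrives in softirq context only per-connection fields need filling in. A kernel-style sketch of that split on a hypothetical object (the type, list and lock are illustrative only):

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct svc_obj {
	struct list_head link;	/* on svc_pool, protected by svc_pool_lock */
	u32 service_id;
};

static LIST_HEAD(svc_pool);
static DEFINE_SPINLOCK(svc_pool_lock);

/* Process context: sleeping allocations and lock contention are fine here. */
static int svc_prealloc(gfp_t gfp)
{
	struct svc_obj *obj = kzalloc(sizeof(*obj), gfp);

	if (!obj)
		return -ENOMEM;
	spin_lock(&svc_pool_lock);
	list_add_tail(&obj->link, &svc_pool);
	spin_unlock(&svc_pool_lock);
	return 0;
}

/* BH context: no allocation and no global lock - just fill in the fields
 * of an object that was prepared in advance. */
static void svc_init_incoming(struct svc_obj *obj, u32 service_id)
{
	obj->service_id = service_id;
}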
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 70bb77818dea..44fb8d893c7d 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1,6 +1,6 @@
1/* RxRPC packet reception 1/* RxRPC packet reception
2 * 2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -27,550 +27,948 @@
27#include <net/net_namespace.h> 27#include <net/net_namespace.h>
28#include "ar-internal.h" 28#include "ar-internal.h"
29 29
30static void rxrpc_proto_abort(const char *why,
31 struct rxrpc_call *call, rxrpc_seq_t seq)
32{
33 if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, EBADMSG)) {
34 set_bit(RXRPC_CALL_EV_ABORT, &call->events);
35 rxrpc_queue_call(call);
36 }
37}
38
30/* 39/*
31 * queue a packet for recvmsg to pass to userspace 40 * Do TCP-style congestion management [RFC 5681].
32 * - the caller must hold a lock on call->lock
33 * - must not be called with interrupts disabled (sk_filter() disables BH's)
34 * - eats the packet whether successful or not
35 * - there must be just one reference to the packet, which the caller passes to
36 * this function
37 */ 41 */
38int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, 42static void rxrpc_congestion_management(struct rxrpc_call *call,
39 bool force, bool terminal) 43 struct sk_buff *skb,
44 struct rxrpc_ack_summary *summary,
45 rxrpc_serial_t acked_serial)
40{ 46{
41 struct rxrpc_skb_priv *sp; 47 enum rxrpc_congest_change change = rxrpc_cong_no_change;
42 struct rxrpc_sock *rx = call->socket; 48 unsigned int cumulative_acks = call->cong_cumul_acks;
43 struct sock *sk; 49 unsigned int cwnd = call->cong_cwnd;
44 int ret; 50 bool resend = false;
51
52 summary->flight_size =
53 (call->tx_top - call->tx_hard_ack) - summary->nr_acks;
54
55 if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) {
56 summary->retrans_timeo = true;
57 call->cong_ssthresh = max_t(unsigned int,
58 summary->flight_size / 2, 2);
59 cwnd = 1;
60 if (cwnd >= call->cong_ssthresh &&
61 call->cong_mode == RXRPC_CALL_SLOW_START) {
62 call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
63 call->cong_tstamp = skb->tstamp;
64 cumulative_acks = 0;
65 }
66 }
45 67
46 _enter(",,%d,%d", force, terminal); 68 cumulative_acks += summary->nr_new_acks;
69 cumulative_acks += summary->nr_rot_new_acks;
70 if (cumulative_acks > 255)
71 cumulative_acks = 255;
72
73 summary->mode = call->cong_mode;
74 summary->cwnd = call->cong_cwnd;
75 summary->ssthresh = call->cong_ssthresh;
76 summary->cumulative_acks = cumulative_acks;
77 summary->dup_acks = call->cong_dup_acks;
78
79 switch (call->cong_mode) {
80 case RXRPC_CALL_SLOW_START:
81 if (summary->nr_nacks > 0)
82 goto packet_loss_detected;
83 if (summary->cumulative_acks > 0)
84 cwnd += 1;
85 if (cwnd >= call->cong_ssthresh) {
86 call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
87 call->cong_tstamp = skb->tstamp;
88 }
89 goto out;
47 90
48 ASSERT(!irqs_disabled()); 91 case RXRPC_CALL_CONGEST_AVOIDANCE:
92 if (summary->nr_nacks > 0)
93 goto packet_loss_detected;
49 94
50 sp = rxrpc_skb(skb); 95 /* We analyse the number of packets that get ACK'd per RTT
51 ASSERTCMP(sp->call, ==, call); 96 * period and increase the window if we managed to fill it.
52 97 */
53 /* if we've already posted the terminal message for a call, then we 98 if (call->peer->rtt_usage == 0)
54 * don't post any more */
55 if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
56 _debug("already terminated");
57 ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE);
58 rxrpc_free_skb(skb);
59 return 0;
60 }
61
62 sk = &rx->sk;
63
64 if (!force) {
65 /* cast skb->rcvbuf to unsigned... It's pointless, but
66 * reduces number of warnings when compiling with -W
67 * --ANK */
68// ret = -ENOBUFS;
69// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
70// (unsigned int) sk->sk_rcvbuf)
71// goto out;
72
73 ret = sk_filter(sk, skb);
74 if (ret < 0)
75 goto out; 99 goto out;
76 } 100 if (ktime_before(skb->tstamp,
101 ktime_add_ns(call->cong_tstamp,
102 call->peer->rtt)))
103 goto out_no_clear_ca;
104 change = rxrpc_cong_rtt_window_end;
105 call->cong_tstamp = skb->tstamp;
106 if (cumulative_acks >= cwnd)
107 cwnd++;
108 goto out;
77 109
78 spin_lock_bh(&sk->sk_receive_queue.lock); 110 case RXRPC_CALL_PACKET_LOSS:
79 if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) && 111 if (summary->nr_nacks == 0)
80 !test_bit(RXRPC_CALL_RELEASED, &call->flags) && 112 goto resume_normality;
81 call->socket->sk.sk_state != RXRPC_CLOSE) {
82 skb->destructor = rxrpc_packet_destructor;
83 skb->dev = NULL;
84 skb->sk = sk;
85 atomic_add(skb->truesize, &sk->sk_rmem_alloc);
86 113
87 if (terminal) { 114 if (summary->new_low_nack) {
88 _debug("<<<< TERMINAL MESSAGE >>>>"); 115 change = rxrpc_cong_new_low_nack;
89 set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags); 116 call->cong_dup_acks = 1;
117 if (call->cong_extra > 1)
118 call->cong_extra = 1;
119 goto send_extra_data;
90 } 120 }
91 121
92 /* allow interception by a kernel service */ 122 call->cong_dup_acks++;
93 if (rx->interceptor) { 123 if (call->cong_dup_acks < 3)
94 rx->interceptor(sk, call->user_call_ID, skb); 124 goto send_extra_data;
95 spin_unlock_bh(&sk->sk_receive_queue.lock); 125
96 } else { 126 change = rxrpc_cong_begin_retransmission;
97 _net("post skb %p", skb); 127 call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT;
98 __skb_queue_tail(&sk->sk_receive_queue, skb); 128 call->cong_ssthresh = max_t(unsigned int,
99 spin_unlock_bh(&sk->sk_receive_queue.lock); 129 summary->flight_size / 2, 2);
130 cwnd = call->cong_ssthresh + 3;
131 call->cong_extra = 0;
132 call->cong_dup_acks = 0;
133 resend = true;
134 goto out;
100 135
101 if (!sock_flag(sk, SOCK_DEAD)) 136 case RXRPC_CALL_FAST_RETRANSMIT:
102 sk->sk_data_ready(sk); 137 if (!summary->new_low_nack) {
138 if (summary->nr_new_acks == 0)
139 cwnd += 1;
140 call->cong_dup_acks++;
141 if (call->cong_dup_acks == 2) {
142 change = rxrpc_cong_retransmit_again;
143 call->cong_dup_acks = 0;
144 resend = true;
145 }
146 } else {
147 change = rxrpc_cong_progress;
148 cwnd = call->cong_ssthresh;
149 if (summary->nr_nacks == 0)
150 goto resume_normality;
103 } 151 }
104 skb = NULL; 152 goto out;
105 } else { 153
106 spin_unlock_bh(&sk->sk_receive_queue.lock); 154 default:
155 BUG();
156 goto out;
107 } 157 }
108 ret = 0;
109 158
159resume_normality:
160 change = rxrpc_cong_cleared_nacks;
161 call->cong_dup_acks = 0;
162 call->cong_extra = 0;
163 call->cong_tstamp = skb->tstamp;
164 if (cwnd < call->cong_ssthresh)
165 call->cong_mode = RXRPC_CALL_SLOW_START;
166 else
167 call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
110out: 168out:
111 rxrpc_free_skb(skb); 169 cumulative_acks = 0;
170out_no_clear_ca:
171 if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1)
172 cwnd = RXRPC_RXTX_BUFF_SIZE - 1;
173 call->cong_cwnd = cwnd;
174 call->cong_cumul_acks = cumulative_acks;
175 trace_rxrpc_congest(call, summary, acked_serial, change);
176 if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
177 rxrpc_queue_call(call);
178 return;
179
180packet_loss_detected:
181 change = rxrpc_cong_saw_nack;
182 call->cong_mode = RXRPC_CALL_PACKET_LOSS;
183 call->cong_dup_acks = 0;
184 goto send_extra_data;
112 185
113 _leave(" = %d", ret); 186send_extra_data:
114 return ret; 187 /* Send some previously unsent DATA if we have some to advance the ACK
188 * state.
189 */
190 if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
191 RXRPC_TX_ANNO_LAST ||
192 summary->nr_acks != call->tx_top - call->tx_hard_ack) {
193 call->cong_extra++;
194 wake_up(&call->waitq);
195 }
196 goto out_no_clear_ca;
115} 197}
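The congestion logic above is the classic RFC 5681 shape: grow the window by one packet per ACK'd batch while in slow start, by one packet per filled window in congestion avoidance, and on loss halve the ssthresh estimate and continue from an inflated fast-retransmit window. A deliberately simplified standalone model of those transitions (it omits the RTT gating, duplicate-ACK counting and the separate PACKET_LOSS state handled above):

#include <stdio.h>

enum mode { SLOW_START, CONGEST_AVOIDANCE };

struct cong {
	enum mode mode;
	unsigned int cwnd;
	unsigned int ssthresh;
	unsigned int cumul_acks;
};

/* Feed one batch of newly ACK'd packets into the state machine. */
static void on_acks(struct cong *c, unsigned int new_acks, int saw_nack,
		    unsigned int flight_size)
{
	if (saw_nack) {
		/* Loss: halve the estimate, retransmit from an inflated window. */
		c->ssthresh = flight_size / 2 > 2 ? flight_size / 2 : 2;
		c->cwnd = c->ssthresh + 3;
		c->mode = CONGEST_AVOIDANCE;
		c->cumul_acks = 0;
		return;
	}

	c->cumul_acks += new_acks;
	if (c->mode == SLOW_START) {
		c->cwnd++;			/* roughly exponential growth */
		if (c->cwnd >= c->ssthresh)
			c->mode = CONGEST_AVOIDANCE;
	} else if (c->cumul_acks >= c->cwnd) {
		c->cwnd++;			/* one increment per window */
		c->cumul_acks = 0;
	}
}

int main(void)
{
	struct cong c = { SLOW_START, 1, 8, 0 };

	for (int i = 0; i < 12; i++) {
		on_acks(&c, c.cwnd, i == 7, c.cwnd);
		printf("round %2d: mode=%d cwnd=%u ssthresh=%u\n",
		       i, c.mode, c.cwnd, c.ssthresh);
	}
	return 0;
}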
116 198
117/* 199/*
118 * process a DATA packet, posting the packet to the appropriate queue 200 * Ping the other end to fill our RTT cache and to retrieve the rwind
119 * - eats the packet if successful 201 * and MTU parameters.
120 */ 202 */
121static int rxrpc_fast_process_data(struct rxrpc_call *call, 203static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb,
122 struct sk_buff *skb, u32 seq) 204 int skew)
123{ 205{
124 struct rxrpc_skb_priv *sp; 206 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
125 bool terminal; 207 ktime_t now = skb->tstamp;
126 int ret, ackbit, ack;
127 u32 serial;
128 u8 flags;
129 208
130 _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq); 209 if (call->peer->rtt_usage < 3 ||
210 ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now))
211 rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial,
212 true, true,
213 rxrpc_propose_ack_ping_for_params);
214}
131 215
132 sp = rxrpc_skb(skb); 216/*
133 ASSERTCMP(sp->call, ==, NULL); 217 * Apply a hard ACK by advancing the Tx window.
134 flags = sp->hdr.flags; 218 */
135 serial = sp->hdr.serial; 219static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
220 struct rxrpc_ack_summary *summary)
221{
222 struct sk_buff *skb, *list = NULL;
223 int ix;
224 u8 annotation;
225
226 if (call->acks_lowest_nak == call->tx_hard_ack) {
227 call->acks_lowest_nak = to;
228 } else if (before_eq(call->acks_lowest_nak, to)) {
229 summary->new_low_nack = true;
230 call->acks_lowest_nak = to;
231 }
136 232
137 spin_lock(&call->lock); 233 spin_lock(&call->lock);
138 234
139 if (call->state > RXRPC_CALL_COMPLETE) 235 while (before(call->tx_hard_ack, to)) {
140 goto discard; 236 call->tx_hard_ack++;
237 ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK;
238 skb = call->rxtx_buffer[ix];
239 annotation = call->rxtx_annotations[ix];
240 rxrpc_see_skb(skb, rxrpc_skb_tx_rotated);
241 call->rxtx_buffer[ix] = NULL;
242 call->rxtx_annotations[ix] = 0;
243 skb->next = list;
244 list = skb;
245
246 if (annotation & RXRPC_TX_ANNO_LAST)
247 set_bit(RXRPC_CALL_TX_LAST, &call->flags);
248 if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK)
249 summary->nr_rot_new_acks++;
250 }
141 251
142 ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post); 252 spin_unlock(&call->lock);
143 ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv);
144 ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten);
145 253
146 if (seq < call->rx_data_post) { 254 trace_rxrpc_transmit(call, (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ?
147 _debug("dup #%u [-%u]", seq, call->rx_data_post); 255 rxrpc_transmit_rotate_last :
148 ack = RXRPC_ACK_DUPLICATE; 256 rxrpc_transmit_rotate));
149 ret = -ENOBUFS; 257 wake_up(&call->waitq);
150 goto discard_and_ack;
151 }
152 258
153 /* we may already have the packet in the out of sequence queue */ 259 while (list) {
154 ackbit = seq - (call->rx_data_eaten + 1); 260 skb = list;
155 ASSERTCMP(ackbit, >=, 0); 261 list = skb->next;
156 if (__test_and_set_bit(ackbit, call->ackr_window)) { 262 skb->next = NULL;
157 _debug("dup oos #%u [%u,%u]", 263 rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
158 seq, call->rx_data_eaten, call->rx_data_post);
159 ack = RXRPC_ACK_DUPLICATE;
160 goto discard_and_ack;
161 } 264 }
265}
162 266
163 if (seq >= call->ackr_win_top) { 267/*
164 _debug("exceed #%u [%u]", seq, call->ackr_win_top); 268 * End the transmission phase of a call.
165 __clear_bit(ackbit, call->ackr_window); 269 *
166 ack = RXRPC_ACK_EXCEEDS_WINDOW; 270 * This occurs when we get an ACKALL packet, the first DATA packet of a reply,
167 goto discard_and_ack; 271 * or a final ACK packet.
168 } 272 */
273static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
274 const char *abort_why)
275{
169 276
170 if (seq == call->rx_data_expect) { 277 ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags));
171 clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags);
172 call->rx_data_expect++;
173 } else if (seq > call->rx_data_expect) {
174 _debug("oos #%u [%u]", seq, call->rx_data_expect);
175 call->rx_data_expect = seq + 1;
176 if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) {
177 ack = RXRPC_ACK_OUT_OF_SEQUENCE;
178 goto enqueue_and_ack;
179 }
180 goto enqueue_packet;
181 }
182 278
183 if (seq != call->rx_data_post) { 279 write_lock(&call->state_lock);
184 _debug("ahead #%u [%u]", seq, call->rx_data_post);
185 goto enqueue_packet;
186 }
187 280
188 if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags)) 281 switch (call->state) {
189 goto protocol_error; 282 case RXRPC_CALL_CLIENT_SEND_REQUEST:
283 case RXRPC_CALL_CLIENT_AWAIT_REPLY:
284 if (reply_begun)
285 call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
286 else
287 call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
288 break;
190 289
191 /* if the packet need security things doing to it, then it goes down 290 case RXRPC_CALL_SERVER_AWAIT_ACK:
192 * the slow path */ 291 __rxrpc_call_completed(call);
193 if (call->conn->security_ix) 292 rxrpc_notify_socket(call);
194 goto enqueue_packet; 293 break;
195 294
196 sp->call = call; 295 default:
197 rxrpc_get_call(call); 296 goto bad_state;
198 atomic_inc(&call->skb_count);
199 terminal = ((flags & RXRPC_LAST_PACKET) &&
200 !(flags & RXRPC_CLIENT_INITIATED));
201 ret = rxrpc_queue_rcv_skb(call, skb, false, terminal);
202 if (ret < 0) {
203 if (ret == -ENOMEM || ret == -ENOBUFS) {
204 __clear_bit(ackbit, call->ackr_window);
205 ack = RXRPC_ACK_NOSPACE;
206 goto discard_and_ack;
207 }
208 goto out;
209 } 297 }
210 298
211 skb = NULL; 299 write_unlock(&call->state_lock);
212 sp = NULL; 300 if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) {
213 301 rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, true,
214 _debug("post #%u", seq); 302 rxrpc_propose_ack_client_tx_end);
215 ASSERTCMP(call->rx_data_post, ==, seq); 303 trace_rxrpc_transmit(call, rxrpc_transmit_await_reply);
216 call->rx_data_post++; 304 } else {
305 trace_rxrpc_transmit(call, rxrpc_transmit_end);
306 }
307 _leave(" = ok");
308 return true;
309
310bad_state:
311 write_unlock(&call->state_lock);
312 kdebug("end_tx %s", rxrpc_call_states[call->state]);
313 rxrpc_proto_abort(abort_why, call, call->tx_top);
314 return false;
315}
217 316
218 if (flags & RXRPC_LAST_PACKET) 317/*
219 set_bit(RXRPC_CALL_RCVD_LAST, &call->flags); 318 * Begin the reply reception phase of a call.
319 */
320static bool rxrpc_receiving_reply(struct rxrpc_call *call)
321{
322 struct rxrpc_ack_summary summary = { 0 };
323 rxrpc_seq_t top = READ_ONCE(call->tx_top);
324
325 if (call->ackr_reason) {
326 spin_lock_bh(&call->lock);
327 call->ackr_reason = 0;
328 call->resend_at = call->expire_at;
329 call->ack_at = call->expire_at;
330 spin_unlock_bh(&call->lock);
331 rxrpc_set_timer(call, rxrpc_timer_init_for_reply,
332 ktime_get_real());
333 }
220 334
221 /* if we've reached an out of sequence packet then we need to drain 335 if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags))
222 * that queue into the socket Rx queue now */ 336 rxrpc_rotate_tx_window(call, top, &summary);
223 if (call->rx_data_post == call->rx_first_oos) { 337 if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
224 _debug("drain rx oos now"); 338 rxrpc_proto_abort("TXL", call, top);
225 read_lock(&call->state_lock); 339 return false;
226 if (call->state < RXRPC_CALL_COMPLETE &&
227 !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events))
228 rxrpc_queue_call(call);
229 read_unlock(&call->state_lock);
230 } 340 }
341 if (!rxrpc_end_tx_phase(call, true, "ETD"))
342 return false;
343 call->tx_phase = false;
344 return true;
345}
231 346
232 spin_unlock(&call->lock); 347/*
233 atomic_inc(&call->ackr_not_idle); 348 * Scan a jumbo packet to validate its structure and to work out how many
234 rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false); 349 * subpackets it contains.
235 _leave(" = 0 [posted]"); 350 *
236 return 0; 351 * A jumbo packet is a collection of consecutive packets glued together with
352 * little headers between that indicate how to change the initial header for
353 * each subpacket.
354 *
355 * RXRPC_JUMBO_PACKET must be set on all but the last subpacket - and all but
356 * the last are RXRPC_JUMBO_DATALEN in size. The last subpacket may be of any
357 * size.
358 */
359static bool rxrpc_validate_jumbo(struct sk_buff *skb)
360{
361 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
362 unsigned int offset = sizeof(struct rxrpc_wire_header);
363 unsigned int len = skb->len;
364 int nr_jumbo = 1;
365 u8 flags = sp->hdr.flags;
237 366
238protocol_error: 367 do {
239 ret = -EBADMSG; 368 nr_jumbo++;
240out: 369 if (len - offset < RXRPC_JUMBO_SUBPKTLEN)
241 spin_unlock(&call->lock); 370 goto protocol_error;
242 _leave(" = %d", ret); 371 if (flags & RXRPC_LAST_PACKET)
243 return ret; 372 goto protocol_error;
373 offset += RXRPC_JUMBO_DATALEN;
374 if (skb_copy_bits(skb, offset, &flags, 1) < 0)
375 goto protocol_error;
376 offset += sizeof(struct rxrpc_jumbo_header);
377 } while (flags & RXRPC_JUMBO_PACKET);
244 378
245discard_and_ack: 379 sp->nr_jumbo = nr_jumbo;
246 _debug("discard and ACK packet %p", skb); 380 return true;
247 __rxrpc_propose_ACK(call, ack, serial, true);
248discard:
249 spin_unlock(&call->lock);
250 rxrpc_free_skb(skb);
251 _leave(" = 0 [discarded]");
252 return 0;
253 381
254enqueue_and_ack: 382protocol_error:
255 __rxrpc_propose_ACK(call, ack, serial, true); 383 return false;
256enqueue_packet:
257 _net("defer skb %p", skb);
258 spin_unlock(&call->lock);
259 skb_queue_tail(&call->rx_queue, skb);
260 atomic_inc(&call->ackr_not_idle);
261 read_lock(&call->state_lock);
262 if (call->state < RXRPC_CALL_DEAD)
263 rxrpc_queue_call(call);
264 read_unlock(&call->state_lock);
265 _leave(" = 0 [queued]");
266 return 0;
267} 384}
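A jumbo packet is therefore self-describing: each non-final subpacket carries exactly RXRPC_JUMBO_DATALEN bytes of data followed by a small header whose flags byte says whether yet another subpacket follows, and the LAST flag may only appear on the tail. A standalone sketch of that scan (the flag bits, data length and header size are assumed values, for illustration only):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define JUMBO_FLAG	0x20	/* assumed: "another subpacket follows" */
#define LAST_FLAG	0x04	/* assumed: "last packet of the call" */
#define JUMBO_DATALEN	1412	/* data carried by each non-final subpacket */
#define JUMBO_HDRLEN	4	/* little header between subpackets */

/* Count the subpackets in a jumbo payload, or return -1 if malformed.
 * `flags` is the flags byte of the main wire header, `len` the payload
 * length that follows it. */
static int count_subpackets(const uint8_t *buf, size_t len, uint8_t flags)
{
	size_t offset = 0;
	int n = 1;

	while (flags & JUMBO_FLAG) {
		n++;
		if (len - offset < JUMBO_DATALEN + JUMBO_HDRLEN)
			return -1;	/* truncated subpacket */
		if (flags & LAST_FLAG)
			return -1;	/* LAST may only be set on the tail */
		offset += JUMBO_DATALEN;
		flags = buf[offset];	/* flags byte of the next little header */
		offset += JUMBO_HDRLEN;
	}
	return n;
}

int main(void)
{
	uint8_t pkt[2 * (JUMBO_DATALEN + JUMBO_HDRLEN)];

	memset(pkt, 0, sizeof(pkt));
	pkt[JUMBO_DATALEN] = JUMBO_FLAG;	/* subpacket 2 says a third follows */
	printf("subpackets: %d\n",
	       count_subpackets(pkt, sizeof(pkt), JUMBO_FLAG));
	return 0;
}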
268 385
269/* 386/*
270 * assume an implicit ACKALL of the transmission phase of a client socket upon 387 * Handle reception of a duplicate packet.
271 * reception of the first reply packet 388 *
389 * We have to take care to avoid an attack here whereby we're given a series of
390 * jumbograms, each with a sequence number one before the preceding one and
391 * filled up to maximum UDP size. If they never send us the first packet in
392 * the sequence, they can cause us to have to hold on to around 2MiB of kernel
393 * space until the call times out.
394 *
395 * We limit the space usage by only accepting three duplicate jumbo packets per
396 * call. After that, we tell the other side we're no longer accepting jumbos
397 * (that information is encoded in the ACK packet).
272 */ 398 */
273static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) 399static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq,
400 u8 annotation, bool *_jumbo_bad)
274{ 401{
275 write_lock_bh(&call->state_lock); 402 /* Discard normal packets that are duplicates. */
276 403 if (annotation == 0)
277 switch (call->state) { 404 return;
278 case RXRPC_CALL_CLIENT_AWAIT_REPLY:
279 call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
280 call->acks_latest = serial;
281
282 _debug("implicit ACKALL %%%u", call->acks_latest);
283 set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events);
284 write_unlock_bh(&call->state_lock);
285
286 if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
287 clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
288 clear_bit(RXRPC_CALL_EV_RESEND, &call->events);
289 clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
290 }
291 break;
292 405
293 default: 406 /* Skip jumbo subpackets that are duplicates. When we've had three or
294 write_unlock_bh(&call->state_lock); 407 * more partially duplicate jumbo packets, we refuse to take any more
295 break; 408 * jumbos for this call.
409 */
410 if (!*_jumbo_bad) {
411 call->nr_jumbo_bad++;
412 *_jumbo_bad = true;
296 } 413 }
297} 414}
298 415
299/* 416/*
300 * post an incoming packet to the nominated call to deal with 417 * Process a DATA packet, adding the packet to the Rx ring.
301 * - must get rid of the sk_buff, either by freeing it or by queuing it
302 */ 418 */
303void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) 419static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
420 u16 skew)
304{ 421{
305 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 422 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
306 __be32 wtmp; 423 unsigned int offset = sizeof(struct rxrpc_wire_header);
307 u32 hi_serial, abort_code; 424 unsigned int ix;
425 rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0;
426 rxrpc_seq_t seq = sp->hdr.seq, hard_ack;
427 bool immediate_ack = false, jumbo_bad = false, queued;
428 u16 len;
429 u8 ack = 0, flags, annotation = 0;
308 430
309 _enter("%p,%p", call, skb); 431 _enter("{%u,%u},{%u,%u}",
432 call->rx_hard_ack, call->rx_top, skb->len, seq);
310 433
311 ASSERT(!irqs_disabled()); 434 _proto("Rx DATA %%%u { #%u f=%02x }",
435 sp->hdr.serial, seq, sp->hdr.flags);
436
437 if (call->state >= RXRPC_CALL_COMPLETE)
438 return;
439
440 /* Received data implicitly ACKs all of the request packets we sent
441 * when we're acting as a client.
442 */
443 if ((call->state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
444 call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
445 !rxrpc_receiving_reply(call))
446 return;
447
448 call->ackr_prev_seq = seq;
312 449
313#if 0 // INJECT RX ERROR 450 hard_ack = READ_ONCE(call->rx_hard_ack);
314 if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { 451 if (after(seq, hard_ack + call->rx_winsize)) {
315 static int skip = 0; 452 ack = RXRPC_ACK_EXCEEDS_WINDOW;
316 if (++skip == 3) { 453 ack_serial = serial;
317 printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n"); 454 goto ack;
318 skip = 0; 455 }
319 goto free_packet; 456
457 flags = sp->hdr.flags;
458 if (flags & RXRPC_JUMBO_PACKET) {
459 if (call->nr_jumbo_bad > 3) {
460 ack = RXRPC_ACK_NOSPACE;
461 ack_serial = serial;
462 goto ack;
320 } 463 }
464 annotation = 1;
321 } 465 }
322#endif
323 466
324 /* track the latest serial number on this connection for ACK packet 467next_subpacket:
325 * information */ 468 queued = false;
326 hi_serial = atomic_read(&call->conn->hi_serial); 469 ix = seq & RXRPC_RXTX_BUFF_MASK;
327 while (sp->hdr.serial > hi_serial) 470 len = skb->len;
328 hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial, 471 if (flags & RXRPC_JUMBO_PACKET)
329 sp->hdr.serial); 472 len = RXRPC_JUMBO_DATALEN;
473
474 if (flags & RXRPC_LAST_PACKET) {
475 if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
476 seq != call->rx_top)
477 return rxrpc_proto_abort("LSN", call, seq);
478 } else {
479 if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
480 after_eq(seq, call->rx_top))
481 return rxrpc_proto_abort("LSA", call, seq);
482 }
330 483
331 /* request ACK generation for any ACK or DATA packet that requests 484 if (before_eq(seq, hard_ack)) {
332 * it */ 485 ack = RXRPC_ACK_DUPLICATE;
333 if (sp->hdr.flags & RXRPC_REQUEST_ACK) { 486 ack_serial = serial;
334 _proto("ACK Requested on %%%u", sp->hdr.serial); 487 goto skip;
335 rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false);
336 } 488 }
337 489
338 switch (sp->hdr.type) { 490 if (flags & RXRPC_REQUEST_ACK && !ack) {
339 case RXRPC_PACKET_TYPE_ABORT: 491 ack = RXRPC_ACK_REQUESTED;
340 _debug("abort"); 492 ack_serial = serial;
493 }
341 494
342 if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) 495 if (call->rxtx_buffer[ix]) {
343 goto protocol_error; 496 rxrpc_input_dup_data(call, seq, annotation, &jumbo_bad);
497 if (ack != RXRPC_ACK_DUPLICATE) {
498 ack = RXRPC_ACK_DUPLICATE;
499 ack_serial = serial;
500 }
501 immediate_ack = true;
502 goto skip;
503 }
344 504
 345 abort_code = ntohl(wtmp); 505 /* Queue the packet. We use a couple of memory barriers here as we need

346 _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); 506 * to make sure that rx_top is perceived to be set after the buffer
507 * pointer and that the buffer pointer is set after the annotation and
508 * the skb data.
509 *
510 * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window()
511 * and also rxrpc_fill_out_ack().
512 */
513 rxrpc_get_skb(skb, rxrpc_skb_rx_got);
514 call->rxtx_annotations[ix] = annotation;
515 smp_wmb();
516 call->rxtx_buffer[ix] = skb;
517 if (after(seq, call->rx_top)) {
518 smp_store_release(&call->rx_top, seq);
519 } else if (before(seq, call->rx_top)) {
520 /* Send an immediate ACK if we fill in a hole */
521 if (!ack) {
522 ack = RXRPC_ACK_DELAY;
523 ack_serial = serial;
524 }
525 immediate_ack = true;
526 }
527 if (flags & RXRPC_LAST_PACKET) {
528 set_bit(RXRPC_CALL_RX_LAST, &call->flags);
529 trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq);
530 } else {
531 trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq);
532 }
533 queued = true;
347 534
348 write_lock_bh(&call->state_lock); 535 if (after_eq(seq, call->rx_expect_next)) {
349 if (call->state < RXRPC_CALL_COMPLETE) { 536 if (after(seq, call->rx_expect_next)) {
350 call->state = RXRPC_CALL_REMOTELY_ABORTED; 537 _net("OOS %u > %u", seq, call->rx_expect_next);
351 call->remote_abort = abort_code; 538 ack = RXRPC_ACK_OUT_OF_SEQUENCE;
352 set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); 539 ack_serial = serial;
353 rxrpc_queue_call(call);
354 } 540 }
355 goto free_packet_unlock; 541 call->rx_expect_next = seq + 1;
542 }
356 543
357 case RXRPC_PACKET_TYPE_BUSY: 544skip:
358 _proto("Rx BUSY %%%u", sp->hdr.serial); 545 offset += len;
546 if (flags & RXRPC_JUMBO_PACKET) {
547 if (skb_copy_bits(skb, offset, &flags, 1) < 0)
548 return rxrpc_proto_abort("XJF", call, seq);
549 offset += sizeof(struct rxrpc_jumbo_header);
550 seq++;
551 serial++;
552 annotation++;
553 if (flags & RXRPC_JUMBO_PACKET)
554 annotation |= RXRPC_RX_ANNO_JLAST;
555 if (after(seq, hard_ack + call->rx_winsize)) {
556 ack = RXRPC_ACK_EXCEEDS_WINDOW;
557 ack_serial = serial;
558 if (!jumbo_bad) {
559 call->nr_jumbo_bad++;
560 jumbo_bad = true;
561 }
562 goto ack;
563 }
359 564
360 if (rxrpc_conn_is_service(call->conn)) 565 _proto("Rx DATA Jumbo %%%u", serial);
361 goto protocol_error; 566 goto next_subpacket;
567 }
362 568
363 write_lock_bh(&call->state_lock); 569 if (queued && flags & RXRPC_LAST_PACKET && !ack) {
364 switch (call->state) { 570 ack = RXRPC_ACK_DELAY;
365 case RXRPC_CALL_CLIENT_SEND_REQUEST: 571 ack_serial = serial;
366 call->state = RXRPC_CALL_SERVER_BUSY; 572 }
367 set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
368 rxrpc_queue_call(call);
369 case RXRPC_CALL_SERVER_BUSY:
370 goto free_packet_unlock;
371 default:
372 goto protocol_error_locked;
373 }
374 573
375 default: 574ack:
376 _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); 575 if (ack)
377 goto protocol_error; 576 rxrpc_propose_ACK(call, ack, skew, ack_serial,
577 immediate_ack, true,
578 rxrpc_propose_ack_input_data);
378 579
379 case RXRPC_PACKET_TYPE_DATA: 580 if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1)
380 _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); 581 rxrpc_notify_socket(call);
582 _leave(" [queued]");
583}
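The receive path above indexes a fixed ring by sequence number: each DATA packet lands in slot seq & RXRPC_RXTX_BUFF_MASK, anything at or below the hard-ACK point or already occupying its slot is reported as a duplicate, and anything beyond hard_ack + rx_winsize is rejected as exceeding the window. A standalone sketch of that indexing with wrap-safe sequence comparisons (types and names are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE	64U		/* must be a power of two */
#define RING_MASK	(RING_SIZE - 1)

/* Wrap-safe "a comes after b" for 32-bit sequence numbers. */
static bool seq_after(uint32_t a, uint32_t b) { return (int32_t)(a - b) > 0; }

struct rx_ring {
	const void *slot[RING_SIZE];
	uint32_t hard_ack;	/* everything up to here has been consumed */
	uint32_t top;		/* highest sequence queued so far */
	uint32_t winsize;	/* how far past hard_ack we will accept */
};

enum verdict { QUEUED, DUPLICATE, EXCEEDS_WINDOW };

static enum verdict rx_ring_add(struct rx_ring *r, uint32_t seq, const void *pkt)
{
	unsigned int ix = seq & RING_MASK;

	if (seq_after(seq, r->hard_ack + r->winsize))
		return EXCEEDS_WINDOW;		/* peer overran our window */
	if (!seq_after(seq, r->hard_ack) || r->slot[ix])
		return DUPLICATE;		/* already consumed or queued */

	r->slot[ix] = pkt;
	if (seq_after(seq, r->top))
		r->top = seq;
	return QUEUED;
}

int main(void)
{
	struct rx_ring r = { .hard_ack = 99, .top = 99, .winsize = 32 };
	int pkt;

	printf("%d %d %d\n",
	       rx_ring_add(&r, 100, &pkt),	/* QUEUED */
	       rx_ring_add(&r, 100, &pkt),	/* DUPLICATE */
	       rx_ring_add(&r, 200, &pkt));	/* EXCEEDS_WINDOW */
	return 0;
}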
381 584
382 if (sp->hdr.seq == 0) 585/*
383 goto protocol_error; 586 * Process a requested ACK.
587 */
588static void rxrpc_input_requested_ack(struct rxrpc_call *call,
589 ktime_t resp_time,
590 rxrpc_serial_t orig_serial,
591 rxrpc_serial_t ack_serial)
592{
593 struct rxrpc_skb_priv *sp;
594 struct sk_buff *skb;
595 ktime_t sent_at;
596 int ix;
597
598 for (ix = 0; ix < RXRPC_RXTX_BUFF_SIZE; ix++) {
599 skb = call->rxtx_buffer[ix];
600 if (!skb)
601 continue;
602
603 sp = rxrpc_skb(skb);
604 if (sp->hdr.serial != orig_serial)
605 continue;
606 smp_rmb();
607 sent_at = skb->tstamp;
608 goto found;
609 }
610 return;
384 611
385 call->ackr_prev_seq = sp->hdr.seq; 612found:
613 rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_requested_ack,
614 orig_serial, ack_serial, sent_at, resp_time);
615}
386 616
387 /* received data implicitly ACKs all of the request packets we 617/*
388 * sent when we're acting as a client */ 618 * Process a ping response.
389 if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) 619 */
390 rxrpc_assume_implicit_ackall(call, sp->hdr.serial); 620static void rxrpc_input_ping_response(struct rxrpc_call *call,
621 ktime_t resp_time,
622 rxrpc_serial_t orig_serial,
623 rxrpc_serial_t ack_serial)
624{
625 rxrpc_serial_t ping_serial;
626 ktime_t ping_time;
391 627
392 switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) { 628 ping_time = call->ping_time;
393 case 0: 629 smp_rmb();
394 skb = NULL; 630 ping_serial = call->ping_serial;
395 goto done;
396 631
397 default: 632 if (!test_bit(RXRPC_CALL_PINGING, &call->flags) ||
398 BUG(); 633 before(orig_serial, ping_serial))
634 return;
635 clear_bit(RXRPC_CALL_PINGING, &call->flags);
636 if (after(orig_serial, ping_serial))
637 return;
399 638
400 /* data packet received beyond the last packet */ 639 rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_ping_response,
401 case -EBADMSG: 640 orig_serial, ack_serial, ping_time, resp_time);
402 goto protocol_error; 641}
403 }
404 642
405 case RXRPC_PACKET_TYPE_ACKALL: 643/*
406 case RXRPC_PACKET_TYPE_ACK: 644 * Process the extra information that may be appended to an ACK packet
407 /* ACK processing is done in process context */ 645 */
408 read_lock_bh(&call->state_lock); 646static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
409 if (call->state < RXRPC_CALL_DEAD) { 647 struct rxrpc_ackinfo *ackinfo)
410 skb_queue_tail(&call->rx_queue, skb); 648{
411 rxrpc_queue_call(call); 649 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
412 skb = NULL; 650 struct rxrpc_peer *peer;
413 } 651 unsigned int mtu;
414 read_unlock_bh(&call->state_lock); 652 u32 rwind = ntohl(ackinfo->rwind);
415 goto free_packet; 653
654 _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
655 sp->hdr.serial,
656 ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU),
657 rwind, ntohl(ackinfo->jumbo_max));
658
659 if (rwind > RXRPC_RXTX_BUFF_SIZE - 1)
660 rwind = RXRPC_RXTX_BUFF_SIZE - 1;
661 call->tx_winsize = rwind;
662 if (call->cong_ssthresh > rwind)
663 call->cong_ssthresh = rwind;
664
665 mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU));
666
667 peer = call->peer;
668 if (mtu < peer->maxdata) {
669 spin_lock_bh(&peer->lock);
670 peer->maxdata = mtu;
671 peer->mtu = mtu + peer->hdrsize;
672 spin_unlock_bh(&peer->lock);
673 _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
416 } 674 }
675}
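The trailing ACK info block lets the peer steer the sender: the advertised receive window is clamped to what the local ring can actually buffer, and the MTU is only ever lowered, taking the smaller of the two figures the peer reports. A standalone sketch of that folding (constants and field names are illustrative):

#include <stdio.h>

#define RXTX_BUFF_SIZE	64U	/* local ring size; the window may not exceed it */

struct tx_params { unsigned int winsize, mtu; };

/* Fold an ACK's trailing info block into our transmit parameters. */
static void apply_ackinfo(struct tx_params *p, unsigned int rwind,
			  unsigned int rx_mtu, unsigned int max_mtu)
{
	if (rwind > RXTX_BUFF_SIZE - 1)
		rwind = RXTX_BUFF_SIZE - 1;	/* never exceed our own buffer */
	p->winsize = rwind;

	unsigned int mtu = rx_mtu < max_mtu ? rx_mtu : max_mtu;

	if (mtu < p->mtu)
		p->mtu = mtu;			/* the MTU only ever shrinks */
}

int main(void)
{
	struct tx_params p = { .winsize = 16, .mtu = 1444 };

	apply_ackinfo(&p, 255, 1344, 65535);
	printf("winsize=%u mtu=%u\n", p.winsize, p.mtu);
	return 0;
}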
417 676
418protocol_error: 677/*
419 _debug("protocol error"); 678 * Process individual soft ACKs.
420 write_lock_bh(&call->state_lock); 679 *
421protocol_error_locked: 680 * Each ACK in the array corresponds to one packet and can be either an ACK or
 422 if (call->state <= RXRPC_CALL_COMPLETE) { 681 * a NAK. If we find an explicitly NAK'd packet we resend it immediately;
423 call->state = RXRPC_CALL_LOCALLY_ABORTED; 682 * packets that lie beyond the end of the ACK list are scheduled for resend by
424 call->local_abort = RX_PROTOCOL_ERROR; 683 * the timer on the basis that the peer might just not have processed them at
425 set_bit(RXRPC_CALL_EV_ABORT, &call->events); 684 * the time the ACK was sent.
426 rxrpc_queue_call(call); 685 */
686static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks,
687 rxrpc_seq_t seq, int nr_acks,
688 struct rxrpc_ack_summary *summary)
689{
690 int ix;
691 u8 annotation, anno_type;
692
693 for (; nr_acks > 0; nr_acks--, seq++) {
694 ix = seq & RXRPC_RXTX_BUFF_MASK;
695 annotation = call->rxtx_annotations[ix];
696 anno_type = annotation & RXRPC_TX_ANNO_MASK;
697 annotation &= ~RXRPC_TX_ANNO_MASK;
698 switch (*acks++) {
699 case RXRPC_ACK_TYPE_ACK:
700 summary->nr_acks++;
701 if (anno_type == RXRPC_TX_ANNO_ACK)
702 continue;
703 summary->nr_new_acks++;
704 call->rxtx_annotations[ix] =
705 RXRPC_TX_ANNO_ACK | annotation;
706 break;
707 case RXRPC_ACK_TYPE_NACK:
708 if (!summary->nr_nacks &&
709 call->acks_lowest_nak != seq) {
710 call->acks_lowest_nak = seq;
711 summary->new_low_nack = true;
712 }
713 summary->nr_nacks++;
714 if (anno_type == RXRPC_TX_ANNO_NAK)
715 continue;
716 summary->nr_new_nacks++;
717 if (anno_type == RXRPC_TX_ANNO_RETRANS)
718 continue;
719 call->rxtx_annotations[ix] =
720 RXRPC_TX_ANNO_NAK | annotation;
721 break;
722 default:
723 return rxrpc_proto_abort("SFT", call, 0);
724 }
427 } 725 }
428free_packet_unlock:
429 write_unlock_bh(&call->state_lock);
430free_packet:
431 rxrpc_free_skb(skb);
432done:
433 _leave("");
434} 726}
435 727
436/* 728/*
437 * split up a jumbo data packet 729 * Process an ACK packet.
730 *
731 * ack.firstPacket is the sequence number of the first soft-ACK'd/NAK'd packet
732 * in the ACK array. Anything before that is hard-ACK'd and may be discarded.
733 *
734 * A hard-ACK means that a packet has been processed and may be discarded; a
735 * soft-ACK means that the packet may be discarded and retransmission
736 * requested. A phase is complete when all packets are hard-ACK'd.
438 */ 737 */
439static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, 738static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
440 struct sk_buff *jumbo) 739 u16 skew)
441{ 740{
442 struct rxrpc_jumbo_header jhdr; 741 struct rxrpc_ack_summary summary = { 0 };
443 struct rxrpc_skb_priv *sp; 742 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
444 struct sk_buff *part; 743 union {
744 struct rxrpc_ackpacket ack;
745 struct rxrpc_ackinfo info;
746 u8 acks[RXRPC_MAXACKS];
747 } buf;
748 rxrpc_serial_t acked_serial;
749 rxrpc_seq_t first_soft_ack, hard_ack;
750 int nr_acks, offset, ioffset;
751
752 _enter("");
753
754 offset = sizeof(struct rxrpc_wire_header);
755 if (skb_copy_bits(skb, offset, &buf.ack, sizeof(buf.ack)) < 0) {
756 _debug("extraction failure");
757 return rxrpc_proto_abort("XAK", call, 0);
758 }
759 offset += sizeof(buf.ack);
760
761 acked_serial = ntohl(buf.ack.serial);
762 first_soft_ack = ntohl(buf.ack.firstPacket);
763 hard_ack = first_soft_ack - 1;
764 nr_acks = buf.ack.nAcks;
765 summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
766 buf.ack.reason : RXRPC_ACK__INVALID);
767
768 trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks);
769
770 _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
771 sp->hdr.serial,
772 ntohs(buf.ack.maxSkew),
773 first_soft_ack,
774 ntohl(buf.ack.previousPacket),
775 acked_serial,
776 rxrpc_ack_names[summary.ack_reason],
777 buf.ack.nAcks);
778
779 if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
780 rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
781 sp->hdr.serial);
782 if (buf.ack.reason == RXRPC_ACK_REQUESTED)
783 rxrpc_input_requested_ack(call, skb->tstamp, acked_serial,
784 sp->hdr.serial);
785
786 if (buf.ack.reason == RXRPC_ACK_PING) {
787 _proto("Rx ACK %%%u PING Request", sp->hdr.serial);
788 rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
789 skew, sp->hdr.serial, true, true,
790 rxrpc_propose_ack_respond_to_ping);
791 } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
792 rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED,
793 skew, sp->hdr.serial, true, true,
794 rxrpc_propose_ack_respond_to_ack);
795 }
445
446	_enter(",{%u,%u}", jumbo->data_len, jumbo->len);
447
448	sp = rxrpc_skb(jumbo);
449
450	do {
451		sp->hdr.flags &= ~RXRPC_JUMBO_PACKET;
452
453		/* make a clone to represent the first subpacket in what's left
454		 * of the jumbo packet */
455		part = skb_clone(jumbo, GFP_ATOMIC);
456		if (!part) {
457			/* simply ditch the tail in the event of ENOMEM */
458			pskb_trim(jumbo, RXRPC_JUMBO_DATALEN);
459			break;
460		}
461		rxrpc_new_skb(part);
462
463		pskb_trim(part, RXRPC_JUMBO_DATALEN);
464
465		if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN))
466			goto protocol_error;
467
468		if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0)
469			goto protocol_error;
470		if (!pskb_pull(jumbo, sizeof(jhdr)))
471			BUG();
472
473		sp->hdr.seq += 1;
474		sp->hdr.serial += 1;
475		sp->hdr.flags = jhdr.flags;
476		sp->hdr._rsvd = ntohs(jhdr._rsvd);
477
478		_proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1);
479
480		rxrpc_fast_process_packet(call, part);
481		part = NULL;
482
483	} while (sp->hdr.flags & RXRPC_JUMBO_PACKET);
484
485	rxrpc_fast_process_packet(call, jumbo);
486	_leave("");
487	return;
488
489protocol_error:
490	_debug("protocol error");
491	rxrpc_free_skb(part);
492	rxrpc_free_skb(jumbo);
493	write_lock_bh(&call->state_lock);
494	if (call->state <= RXRPC_CALL_COMPLETE) {
495		call->state = RXRPC_CALL_LOCALLY_ABORTED;
496		call->local_abort = RX_PROTOCOL_ERROR;
497		set_bit(RXRPC_CALL_EV_ABORT, &call->events);
498		rxrpc_queue_call(call);
499	}
500	write_unlock_bh(&call->state_lock);
501	_leave("");
502}
796
797	ioffset = offset + nr_acks + 3;
798	if (skb->len >= ioffset + sizeof(buf.info)) {
799		if (skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0)
800			return rxrpc_proto_abort("XAI", call, 0);
801		rxrpc_input_ackinfo(call, skb, &buf.info);
802	}
803
804	if (first_soft_ack == 0)
805		return rxrpc_proto_abort("AK0", call, 0);
806
807	/* Ignore ACKs unless we are or have just been transmitting. */
808	switch (call->state) {
809	case RXRPC_CALL_CLIENT_SEND_REQUEST:
810	case RXRPC_CALL_CLIENT_AWAIT_REPLY:
811	case RXRPC_CALL_SERVER_SEND_REPLY:
812	case RXRPC_CALL_SERVER_AWAIT_ACK:
813		break;
814	default:
815		return;
816	}
817
818	/* Discard any out-of-order or duplicate ACKs. */
819	if (before_eq(sp->hdr.serial, call->acks_latest)) {
820		_debug("discard ACK %d <= %d",
821		       sp->hdr.serial, call->acks_latest);
822		return;
823	}
824	call->acks_latest_ts = skb->tstamp;
825	call->acks_latest = sp->hdr.serial;
826
827	if (before(hard_ack, call->tx_hard_ack) ||
828	    after(hard_ack, call->tx_top))
829		return rxrpc_proto_abort("AKW", call, 0);
830	if (nr_acks > call->tx_top - hard_ack)
831		return rxrpc_proto_abort("AKN", call, 0);
832
833	if (after(hard_ack, call->tx_hard_ack))
834		rxrpc_rotate_tx_window(call, hard_ack, &summary);
835
836	if (nr_acks > 0) {
837		if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0)
838			return rxrpc_proto_abort("XSA", call, 0);
839		rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks,
840				      &summary);
841	}
842
843	if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
844		rxrpc_end_tx_phase(call, false, "ETA");
845		return;
846	}
847
848	if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
849	    RXRPC_TX_ANNO_LAST &&
850	    summary.nr_acks == call->tx_top - hard_ack &&
851	    rxrpc_is_client_call(call))
852		rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial,
853				  false, true,
854				  rxrpc_propose_ack_ping_for_lost_reply);
855
856	return rxrpc_congestion_management(call, skb, &summary, acked_serial);
857}
858
859/*
860 * Process an ACKALL packet.
861 */
862static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb)
863{
864	struct rxrpc_ack_summary summary = { 0 };
865	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
866
867	_proto("Rx ACKALL %%%u", sp->hdr.serial);
868
869	rxrpc_rotate_tx_window(call, call->tx_top, &summary);
870	if (test_bit(RXRPC_CALL_TX_LAST, &call->flags))
871		rxrpc_end_tx_phase(call, false, "ETL");
872}
873
874/*
875 * Process an ABORT packet.
876 */
877static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb)
878{
879	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
880	__be32 wtmp;
881	u32 abort_code = RX_CALL_DEAD;
882
883	_enter("");
884
885	if (skb->len >= 4 &&
886	    skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
887			  &wtmp, sizeof(wtmp)) >= 0)
888		abort_code = ntohl(wtmp);
889
890	_proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code);
891
892	if (rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
893				      abort_code, ECONNABORTED))
894		rxrpc_notify_socket(call);
895}
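The out-of-order ACK check above relies on wrapping serial/sequence comparisons (before_eq(), before(), after()). For reference, a minimal stand-alone sketch of that comparison style, not the kernel's implementation and with an invented helper name:

/* Illustrative only: wrap-safe comparison of 32-bit serial numbers. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool serial_before_eq(uint32_t a, uint32_t b)
{
	/* The signed difference copes with wrap-around at 2^32. */
	return (int32_t)(a - b) <= 0;
}

int main(void)
{
	uint32_t latest = 0xfffffffe;	/* last ACK serial seen */

	/* An older or duplicate ACK would be discarded... */
	printf("%d\n", serial_before_eq(0xfffffffd, latest));	/* 1 */
	/* ...but a serial that wrapped past zero still counts as newer. */
	printf("%d\n", serial_before_eq(0x00000002, latest));	/* 0 */
	return 0;
}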
503 896
504/*
505 * post an incoming packet to the appropriate call/socket to deal with
506 * - must get rid of the sk_buff, either by freeing it or by queuing it
507 */
508static void rxrpc_post_packet_to_call(struct rxrpc_call *call,
509				      struct sk_buff *skb)
510{
511	struct rxrpc_skb_priv *sp;
512
513	_enter("%p,%p", call, skb);
514
515	sp = rxrpc_skb(skb);
897/*
898 * Process an incoming call packet.
899 */
900static void rxrpc_input_call_packet(struct rxrpc_call *call,
901				    struct sk_buff *skb, u16 skew)
902{
903	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
904
905	_enter("%p,%p", call, skb);
906
907	switch (sp->hdr.type) {
908 case RXRPC_PACKET_TYPE_DATA:
909 rxrpc_input_data(call, skb, skew);
910 break;
911
912 case RXRPC_PACKET_TYPE_ACK:
913 rxrpc_input_ack(call, skb, skew);
914 break;
516
517	_debug("extant call [%d]", call->state);
915
916	case RXRPC_PACKET_TYPE_BUSY:
917 _proto("Rx BUSY %%%u", sp->hdr.serial);
918
919 /* Just ignore BUSY packets from the server; the retry and
920 * lifespan timers will take care of business. BUSY packets
921 * from the client don't make sense.
922 */
923 break;
924
925 case RXRPC_PACKET_TYPE_ABORT:
926 rxrpc_input_abort(call, skb);
927 break;
518
519	read_lock(&call->state_lock);
928
929	case RXRPC_PACKET_TYPE_ACKALL:
930 rxrpc_input_ackall(call, skb);
931 break;
932
933 default:
934 _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
935 break;
936 }
937
938 _leave("");
939}
940
941/*
942 * Handle a new call on a channel implicitly completing the preceding call on
943 * that channel.
944 *
945 * TODO: If callNumber > call_id + 1, renegotiate security.
946 */
947static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
948 struct rxrpc_call *call)
949{
520	switch (call->state) {
521	case RXRPC_CALL_LOCALLY_ABORTED:
522		if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
523			rxrpc_queue_call(call);
524			goto free_unlock;
525		}
526	case RXRPC_CALL_REMOTELY_ABORTED:
527	case RXRPC_CALL_NETWORK_ERROR:
528	case RXRPC_CALL_DEAD:
529		goto dead_call;
530	case RXRPC_CALL_COMPLETE:
531	case RXRPC_CALL_CLIENT_FINAL_ACK:
532		/* complete server call */
533		if (rxrpc_conn_is_service(call->conn))
534			goto dead_call;
535		/* resend last packet of a completed call */
536		_debug("final ack again");
537		rxrpc_get_call(call);
538		set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
539		rxrpc_queue_call(call);
540		goto free_unlock;
541	default:
542		break;
543	}
950	switch (call->state) {
951	case RXRPC_CALL_SERVER_AWAIT_ACK:
952		rxrpc_call_completed(call);
953		break;
954	case RXRPC_CALL_COMPLETE:
955		break;
956	default:
957		if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, ESHUTDOWN)) {
958			set_bit(RXRPC_CALL_EV_ABORT, &call->events);
959			rxrpc_queue_call(call);
960		}
961		break;
962	}
544
545	read_unlock(&call->state_lock);
546	rxrpc_get_call(call);
547
548	if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
549	    sp->hdr.flags & RXRPC_JUMBO_PACKET)
550		rxrpc_process_jumbo_packet(call, skb);
551	else
552		rxrpc_fast_process_packet(call, skb);
553
554	rxrpc_put_call(call);
555	goto done;
556
557dead_call:
558	if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
559		skb->priority = RX_CALL_DEAD;
560		rxrpc_reject_packet(call->conn->params.local, skb);
561		goto unlock;
562	}
563free_unlock:
564	rxrpc_free_skb(skb);
565unlock:
566	read_unlock(&call->state_lock);
567done:
568	_leave("");
569}
963
964	__rxrpc_disconnect_call(conn, call);
965	rxrpc_notify_socket(call);
966}
570 967
571/* 968/*
572 * post connection-level events to the connection 969 * post connection-level events to the connection
573 * - this includes challenges, responses and some aborts
970 * - this includes challenges, responses, some aborts and call terminal packet
971 *   retransmission.
574 */ 972 */
575static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, 973static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
576 struct sk_buff *skb) 974 struct sk_buff *skb)
@@ -595,6 +993,17 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local,
595} 993}
596 994
597/* 995/*
996 * put a packet up for transport-level abort
997 */
998static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
999{
1000 CHECK_SLAB_OKAY(&local->usage);
1001
1002 skb_queue_tail(&local->reject_queue, skb);
1003 rxrpc_queue_local(local);
1004}
1005
1006/*
598 * Extract the wire header from a packet and translate the byte order. 1007 * Extract the wire header from a packet and translate the byte order.
599 */ 1008 */
600static noinline 1009static noinline
@@ -605,8 +1014,6 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
605 /* dig out the RxRPC connection details */ 1014 /* dig out the RxRPC connection details */
606 if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) 1015 if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0)
607 return -EBADMSG; 1016 return -EBADMSG;
608 if (!pskb_pull(skb, sizeof(whdr)))
609 BUG();
610 1017
611 memset(sp, 0, sizeof(*sp)); 1018 memset(sp, 0, sizeof(*sp));
612 sp->hdr.epoch = ntohl(whdr.epoch); 1019 sp->hdr.epoch = ntohl(whdr.epoch);
@@ -631,19 +1038,22 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
631 * shut down and the local endpoint from going away, thus sk_user_data will not 1038 * shut down and the local endpoint from going away, thus sk_user_data will not
632 * be cleared until this function returns. 1039 * be cleared until this function returns.
633 */ 1040 */
634void rxrpc_data_ready(struct sock *sk) 1041void rxrpc_data_ready(struct sock *udp_sk)
635{ 1042{
636 struct rxrpc_connection *conn; 1043 struct rxrpc_connection *conn;
1044 struct rxrpc_channel *chan;
1045 struct rxrpc_call *call;
637 struct rxrpc_skb_priv *sp; 1046 struct rxrpc_skb_priv *sp;
638 struct rxrpc_local *local = sk->sk_user_data; 1047 struct rxrpc_local *local = udp_sk->sk_user_data;
639 struct sk_buff *skb; 1048 struct sk_buff *skb;
640 int ret; 1049 unsigned int channel;
1050 int ret, skew;
641 1051
642 _enter("%p", sk); 1052 _enter("%p", udp_sk);
643 1053
644 ASSERT(!irqs_disabled()); 1054 ASSERT(!irqs_disabled());
645 1055
646 skb = skb_recv_datagram(sk, 0, 1, &ret); 1056 skb = skb_recv_datagram(udp_sk, 0, 1, &ret);
647 if (!skb) { 1057 if (!skb) {
648 if (ret == -EAGAIN) 1058 if (ret == -EAGAIN)
649 return; 1059 return;
@@ -651,13 +1061,13 @@ void rxrpc_data_ready(struct sock *sk)
651 return; 1061 return;
652 } 1062 }
653 1063
654 rxrpc_new_skb(skb); 1064 rxrpc_new_skb(skb, rxrpc_skb_rx_received);
655 1065
656 _net("recv skb %p", skb); 1066 _net("recv skb %p", skb);
657 1067
658 /* we'll probably need to checksum it (didn't call sock_recvmsg) */ 1068 /* we'll probably need to checksum it (didn't call sock_recvmsg) */
659 if (skb_checksum_complete(skb)) { 1069 if (skb_checksum_complete(skb)) {
660 rxrpc_free_skb(skb); 1070 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
661 __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); 1071 __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0);
662 _leave(" [CSUM failed]"); 1072 _leave(" [CSUM failed]");
663 return; 1073 return;
@@ -671,13 +1081,21 @@ void rxrpc_data_ready(struct sock *sk)
671 skb_orphan(skb); 1081 skb_orphan(skb);
672 sp = rxrpc_skb(skb); 1082 sp = rxrpc_skb(skb);
673 1083
674 _net("Rx UDP packet from %08x:%04hu",
675 ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source));
676
677 /* dig out the RxRPC connection details */ 1084 /* dig out the RxRPC connection details */
678 if (rxrpc_extract_header(sp, skb) < 0) 1085 if (rxrpc_extract_header(sp, skb) < 0)
679 goto bad_message; 1086 goto bad_message;
680 1087
1088 if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
1089 static int lose;
1090 if ((lose++ & 7) == 7) {
1091 trace_rxrpc_rx_lose(sp);
1092 rxrpc_lose_skb(skb, rxrpc_skb_rx_lost);
1093 return;
1094 }
1095 }
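The block above is the debug loss-injection gate: a static counter drops every eighth received packet when the option is compiled in. A stand-alone sketch of the same one-in-eight pattern, with invented names (the kernel gates this on CONFIG_AF_RXRPC_INJECT_LOSS):

/* Illustrative only: counter-based packet-loss injection. */
#include <stdbool.h>
#include <stdio.h>

static bool should_drop(void)
{
	static int lose;

	return (lose++ & 7) == 7;	/* true once per eight calls */
}

int main(void)
{
	for (int i = 0; i < 16; i++)
		printf("%d%s", should_drop(), i == 15 ? "\n" : " ");
	/* prints seven 0s then a 1, twice over */
	return 0;
}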
1096
1097 trace_rxrpc_rx_packet(sp);
1098
681 _net("Rx RxRPC %s ep=%x call=%x:%x", 1099 _net("Rx RxRPC %s ep=%x call=%x:%x",
682 sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", 1100 sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
683 sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber); 1101 sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber);
@@ -688,70 +1106,135 @@ void rxrpc_data_ready(struct sock *sk)
688 goto bad_message; 1106 goto bad_message;
689 } 1107 }
690 1108
691	if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) {
692		rxrpc_post_packet_to_local(local, skb);
693		goto out;
694	}
695
696	if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
697	    (sp->hdr.callNumber == 0 || sp->hdr.seq == 0))
698		goto bad_message;
1109	switch (sp->hdr.type) {
1110	case RXRPC_PACKET_TYPE_VERSION:
1111		rxrpc_post_packet_to_local(local, skb);
1112		goto out;
1113
1114	case RXRPC_PACKET_TYPE_BUSY:
1115		if (sp->hdr.flags & RXRPC_CLIENT_INITIATED)
1116			goto discard;
1117
1118 case RXRPC_PACKET_TYPE_DATA:
1119 if (sp->hdr.callNumber == 0)
1120 goto bad_message;
1121 if (sp->hdr.flags & RXRPC_JUMBO_PACKET &&
1122 !rxrpc_validate_jumbo(skb))
1123 goto bad_message;
1124 break;
1125 }
699 1126
700 rcu_read_lock(); 1127 rcu_read_lock();
701 1128
702 conn = rxrpc_find_connection_rcu(local, skb); 1129 conn = rxrpc_find_connection_rcu(local, skb);
703	if (!conn)
704		goto cant_route_call;
1130	if (conn) {
1131		if (sp->hdr.securityIndex != conn->security_ix)
1132 goto wrong_security;
1133
1134 if (sp->hdr.callNumber == 0) {
1135 /* Connection-level packet */
1136 _debug("CONN %p {%d}", conn, conn->debug_id);
1137 rxrpc_post_packet_to_conn(conn, skb);
1138 goto out_unlock;
1139 }
1140
1141 /* Note the serial number skew here */
1142 skew = (int)sp->hdr.serial - (int)conn->hi_serial;
1143 if (skew >= 0) {
1144 if (skew > 0)
1145 conn->hi_serial = sp->hdr.serial;
1146 } else {
1147 skew = -skew;
1148 skew = min(skew, 65535);
1149 }
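The skew computed above is how far this packet's serial lags behind the connection's high-water mark, clamped so it fits the u16 argument of rxrpc_input_call_packet(). A minimal sketch of that calculation, assuming plain 32-bit serials (helper name invented, not the kernel's code):

/* Illustrative only: serial skew against a per-connection high-water mark. */
#include <stdint.h>
#include <stdio.h>

static int calc_skew(uint32_t serial, uint32_t *hi_serial)
{
	int skew = (int)serial - (int)*hi_serial;

	if (skew >= 0) {
		if (skew > 0)
			*hi_serial = serial;	/* new high-water mark */
	} else {
		skew = -skew;
		if (skew > 65535)		/* must fit a u16 argument */
			skew = 65535;
	}
	return skew;
}

int main(void)
{
	uint32_t hi = 100;

	printf("%d\n", calc_skew(105, &hi));	/* 5, hi becomes 105 */
	printf("%d\n", calc_skew(90, &hi));	/* 15: packet arrived late */
	return 0;
}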
705 1150
706 if (sp->hdr.callNumber == 0) {
707 /* Connection-level packet */
708 _debug("CONN %p {%d}", conn, conn->debug_id);
709 rxrpc_post_packet_to_conn(conn, skb);
710 } else {
711 /* Call-bound packets are routed by connection channel. */ 1151 /* Call-bound packets are routed by connection channel. */
712 unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK; 1152 channel = sp->hdr.cid & RXRPC_CHANNELMASK;
713 struct rxrpc_channel *chan = &conn->channels[channel]; 1153 chan = &conn->channels[channel];
714 struct rxrpc_call *call = rcu_dereference(chan->call); 1154
1155 /* Ignore really old calls */
1156 if (sp->hdr.callNumber < chan->last_call)
1157 goto discard_unlock;
1158
1159 if (sp->hdr.callNumber == chan->last_call) {
1160 /* For the previous service call, if completed successfully, we
1161 * discard all further packets.
1162 */
1163 if (rxrpc_conn_is_service(conn) &&
1164 (chan->last_type == RXRPC_PACKET_TYPE_ACK ||
1165 sp->hdr.type == RXRPC_PACKET_TYPE_ABORT))
1166 goto discard_unlock;
1167
1168 /* But otherwise we need to retransmit the final packet from
1169 * data cached in the connection record.
1170 */
1171 rxrpc_post_packet_to_conn(conn, skb);
1172 goto out_unlock;
1173 }
715 1174
716	if (!call || atomic_read(&call->usage) == 0)
717		goto cant_route_call;
1175	call = rcu_dereference(chan->call);
1176
1177 if (sp->hdr.callNumber > chan->call_id) {
1178 if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) {
1179 rcu_read_unlock();
1180 goto reject_packet;
1181 }
1182 if (call)
1183 rxrpc_input_implicit_end_call(conn, call);
1184 call = NULL;
1185 }
1186 } else {
1187 skew = 0;
1188 call = NULL;
1189 }
718 1190
719	rxrpc_post_packet_to_call(call, skb);
1191	if (!call || atomic_read(&call->usage) == 0) {
1192 if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) ||
1193 sp->hdr.callNumber == 0 ||
1194 sp->hdr.type != RXRPC_PACKET_TYPE_DATA)
1195 goto bad_message_unlock;
1196 if (sp->hdr.seq != 1)
1197 goto discard_unlock;
1198 call = rxrpc_new_incoming_call(local, conn, skb);
1199 if (!call) {
1200 rcu_read_unlock();
1201 goto reject_packet;
1202 }
1203 rxrpc_send_ping(call, skb, skew);
720 } 1204 }
721 1205
1206 rxrpc_input_call_packet(call, skb, skew);
1207 goto discard_unlock;
1208
1209discard_unlock:
722 rcu_read_unlock(); 1210 rcu_read_unlock();
1211discard:
1212 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
723out: 1213out:
1214 trace_rxrpc_rx_done(0, 0);
724 return; 1215 return;
725 1216
726cant_route_call:
727	rcu_read_unlock();
728
729	_debug("can't route call");
730	if (sp->hdr.flags & RXRPC_CLIENT_INITIATED &&
731	    sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
732		if (sp->hdr.seq == 1) {
733			_debug("first packet");
734			skb_queue_tail(&local->accept_queue, skb);
735			rxrpc_queue_work(&local->processor);
736			_leave(" [incoming]");
737			return;
738		}
739		skb->priority = RX_INVALID_OPERATION;
740	} else {
741		skb->priority = RX_CALL_DEAD;
742	}
743
744	if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
745		_debug("reject type %d",sp->hdr.type);
746		rxrpc_reject_packet(local, skb);
747	} else {
748		rxrpc_free_skb(skb);
749	}
750	_leave(" [no call]");
751	return;
752
1217out_unlock:
1218	rcu_read_unlock();
1219	goto out;
1220
1221wrong_security:
1222	rcu_read_unlock();
1223	trace_rxrpc_abort("SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
1224			  RXKADINCONSISTENCY, EBADMSG);
1225	skb->priority = RXKADINCONSISTENCY;
1226	goto post_abort;
1227
1228bad_message_unlock:
1229 rcu_read_unlock();
753bad_message: 1230bad_message:
1231 trace_rxrpc_abort("BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
1232 RX_PROTOCOL_ERROR, EBADMSG);
754 skb->priority = RX_PROTOCOL_ERROR; 1233 skb->priority = RX_PROTOCOL_ERROR;
1234post_abort:
1235 skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
1236reject_packet:
1237 trace_rxrpc_rx_done(skb->mark, skb->priority);
755 rxrpc_reject_packet(local, skb); 1238 rxrpc_reject_packet(local, skb);
756 _leave(" [badmsg]"); 1239 _leave(" [badmsg]");
757} 1240}
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index c21ad213b337..7d4375e557e6 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -23,31 +23,36 @@ static int none_prime_packet_security(struct rxrpc_connection *conn)
23} 23}
24 24
25static int none_secure_packet(struct rxrpc_call *call, 25static int none_secure_packet(struct rxrpc_call *call,
26 struct sk_buff *skb, 26 struct sk_buff *skb,
27 size_t data_size, 27 size_t data_size,
28 void *sechdr) 28 void *sechdr)
29{ 29{
30 return 0; 30 return 0;
31} 31}
32 32
33static int none_verify_packet(struct rxrpc_call *call,
34			      struct sk_buff *skb,
35			      u32 *_abort_code)
33static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
34			      unsigned int offset, unsigned int len,
35			      rxrpc_seq_t seq, u16 expected_cksum)
36{ 36{
37 return 0; 37 return 0;
38} 38}
39 39
40static void none_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
41 unsigned int *_offset, unsigned int *_len)
42{
43}
44
40static int none_respond_to_challenge(struct rxrpc_connection *conn, 45static int none_respond_to_challenge(struct rxrpc_connection *conn,
41 struct sk_buff *skb, 46 struct sk_buff *skb,
42 u32 *_abort_code) 47 u32 *_abort_code)
43{ 48{
44 *_abort_code = RX_PROTOCOL_ERROR; 49 *_abort_code = RX_PROTOCOL_ERROR;
45 return -EPROTO; 50 return -EPROTO;
46} 51}
47 52
48static int none_verify_response(struct rxrpc_connection *conn, 53static int none_verify_response(struct rxrpc_connection *conn,
49 struct sk_buff *skb, 54 struct sk_buff *skb,
50 u32 *_abort_code) 55 u32 *_abort_code)
51{ 56{
52 *_abort_code = RX_PROTOCOL_ERROR; 57 *_abort_code = RX_PROTOCOL_ERROR;
53 return -EPROTO; 58 return -EPROTO;
@@ -78,6 +83,7 @@ const struct rxrpc_security rxrpc_no_security = {
78 .prime_packet_security = none_prime_packet_security, 83 .prime_packet_security = none_prime_packet_security,
79 .secure_packet = none_secure_packet, 84 .secure_packet = none_secure_packet,
80 .verify_packet = none_verify_packet, 85 .verify_packet = none_verify_packet,
86 .locate_data = none_locate_data,
81 .respond_to_challenge = none_respond_to_challenge, 87 .respond_to_challenge = none_respond_to_challenge,
82 .verify_response = none_verify_response, 88 .verify_response = none_verify_response,
83 .clear = none_clear, 89 .clear = none_clear,
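The no-security module above follows the usual constant ops-table pattern: callers always go through the function pointers and the null backend simply succeeds or refuses. A stand-alone sketch of that pattern with invented names, not this file's API:

/* Illustrative only: a constant table of operations with no-op backends. */
#include <stdio.h>

struct sec_ops {
	int (*secure)(const char *buf, unsigned int len);
	int (*verify)(const char *buf, unsigned int len);
};

static int none_secure(const char *buf, unsigned int len)
{
	(void)buf; (void)len;
	return 0;			/* nothing to do, always succeeds */
}

static int none_verify(const char *buf, unsigned int len)
{
	(void)buf; (void)len;
	return 0;
}

static const struct sec_ops no_security = {
	.secure = none_secure,
	.verify = none_verify,
};

int main(void)
{
	/* Callers never care which backend is installed. */
	printf("secure=%d verify=%d\n",
	       no_security.secure("x", 1), no_security.verify("x", 1));
	return 0;
}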
diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c
index 31a3f86ef2f6..540d3955c1bc 100644
--- a/net/rxrpc/local_event.c
+++ b/net/rxrpc/local_event.c
@@ -15,8 +15,6 @@
15#include <linux/net.h> 15#include <linux/net.h>
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/udp.h>
19#include <linux/ip.h>
20#include <net/sock.h> 18#include <net/sock.h>
21#include <net/af_rxrpc.h> 19#include <net/af_rxrpc.h>
22#include <generated/utsrelease.h> 20#include <generated/utsrelease.h>
@@ -33,7 +31,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
33{ 31{
34 struct rxrpc_wire_header whdr; 32 struct rxrpc_wire_header whdr;
35 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 33 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
36 struct sockaddr_in sin; 34 struct sockaddr_rxrpc srx;
37 struct msghdr msg; 35 struct msghdr msg;
38 struct kvec iov[2]; 36 struct kvec iov[2];
39 size_t len; 37 size_t len;
@@ -41,12 +39,11 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
41 39
42 _enter(""); 40 _enter("");
43 41
44	sin.sin_family = AF_INET;
45	sin.sin_port = udp_hdr(skb)->source;
46	sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
47
48	msg.msg_name = &sin;
49	msg.msg_namelen = sizeof(sin);
42	if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
43		return;
44
45	msg.msg_name = &srx.transport;
46	msg.msg_namelen = srx.transport_len;
50 msg.msg_control = NULL; 47 msg.msg_control = NULL;
51 msg.msg_controllen = 0; 48 msg.msg_controllen = 0;
52 msg.msg_flags = 0; 49 msg.msg_flags = 0;
@@ -93,11 +90,13 @@ void rxrpc_process_local_events(struct rxrpc_local *local)
93 if (skb) { 90 if (skb) {
94 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 91 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
95 92
93 rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
96 _debug("{%d},{%u}", local->debug_id, sp->hdr.type); 94 _debug("{%d},{%u}", local->debug_id, sp->hdr.type);
97 95
98 switch (sp->hdr.type) { 96 switch (sp->hdr.type) {
99 case RXRPC_PACKET_TYPE_VERSION: 97 case RXRPC_PACKET_TYPE_VERSION:
100		if (skb_copy_bits(skb, 0, &v, 1) < 0)
98		if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
99				  &v, 1) < 0)
101 return; 100 return;
102 _proto("Rx VERSION { %02x }", v); 101 _proto("Rx VERSION { %02x }", v);
103 if (v == 0) 102 if (v == 0)
@@ -109,7 +108,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local)
109 break; 108 break;
110 } 109 }
111 110
112 rxrpc_free_skb(skb); 111 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
113 } 112 }
114 113
115 _leave(""); 114 _leave("");
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index a753796fbe8f..ff4864d550b8 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -58,6 +58,17 @@ static long rxrpc_local_cmp_key(const struct rxrpc_local *local,
58 memcmp(&local->srx.transport.sin.sin_addr, 58 memcmp(&local->srx.transport.sin.sin_addr,
59 &srx->transport.sin.sin_addr, 59 &srx->transport.sin.sin_addr,
60 sizeof(struct in_addr)); 60 sizeof(struct in_addr));
61#ifdef CONFIG_AF_RXRPC_IPV6
62 case AF_INET6:
63 /* If the choice of UDP6 port is left up to the transport, then
64 * the endpoint record doesn't match.
65 */
66 return ((u16 __force)local->srx.transport.sin6.sin6_port -
67 (u16 __force)srx->transport.sin6.sin6_port) ?:
68 memcmp(&local->srx.transport.sin6.sin6_addr,
69 &srx->transport.sin6.sin6_addr,
70 sizeof(struct in6_addr));
71#endif
61 default: 72 default:
62 BUG(); 73 BUG();
63 } 74 }
@@ -75,9 +86,7 @@ static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx)
75 atomic_set(&local->usage, 1); 86 atomic_set(&local->usage, 1);
76 INIT_LIST_HEAD(&local->link); 87 INIT_LIST_HEAD(&local->link);
77 INIT_WORK(&local->processor, rxrpc_local_processor); 88 INIT_WORK(&local->processor, rxrpc_local_processor);
78 INIT_LIST_HEAD(&local->services);
79 init_rwsem(&local->defrag_sem); 89 init_rwsem(&local->defrag_sem);
80 skb_queue_head_init(&local->accept_queue);
81 skb_queue_head_init(&local->reject_queue); 90 skb_queue_head_init(&local->reject_queue);
82 skb_queue_head_init(&local->event_queue); 91 skb_queue_head_init(&local->event_queue);
83 local->client_conns = RB_ROOT; 92 local->client_conns = RB_ROOT;
@@ -101,11 +110,12 @@ static int rxrpc_open_socket(struct rxrpc_local *local)
101 struct sock *sock; 110 struct sock *sock;
102 int ret, opt; 111 int ret, opt;
103 112
104	_enter("%p{%d}", local, local->srx.transport_type);
105
106	/* create a socket to represent the local endpoint */
107	ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type,
108			       IPPROTO_UDP, &local->socket);
113	_enter("%p{%d,%d}",
114	       local, local->srx.transport_type, local->srx.transport.family);
115
116	/* create a socket to represent the local endpoint */
117	ret = sock_create_kern(&init_net, local->srx.transport.family,
118			       local->srx.transport_type, 0, &local->socket);
109 if (ret < 0) { 119 if (ret < 0) {
110 _leave(" = %d [socket]", ret); 120 _leave(" = %d [socket]", ret);
111 return ret; 121 return ret;
@@ -170,18 +180,8 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx)
170 long diff; 180 long diff;
171 int ret; 181 int ret;
172 182
173	if (srx->transport.family == AF_INET) {
174		_enter("{%d,%u,%pI4+%hu}",
175		       srx->transport_type,
176		       srx->transport.family,
177		       &srx->transport.sin.sin_addr,
178		       ntohs(srx->transport.sin.sin_port));
179	} else {
180		_enter("{%d,%u}",
181		       srx->transport_type,
182		       srx->transport.family);
183		return ERR_PTR(-EAFNOSUPPORT);
184	}
183	_enter("{%d,%d,%pISp}",
184	       srx->transport_type, srx->transport.family, &srx->transport);
185 185
186 mutex_lock(&rxrpc_local_mutex); 186 mutex_lock(&rxrpc_local_mutex);
187 187
@@ -234,13 +234,8 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx)
234found: 234found:
235 mutex_unlock(&rxrpc_local_mutex); 235 mutex_unlock(&rxrpc_local_mutex);
236 236
237	_net("LOCAL %s %d {%d,%u,%pI4+%hu}",
238	     age,
239	     local->debug_id,
240	     local->srx.transport_type,
241	     local->srx.transport.family,
242	     &local->srx.transport.sin.sin_addr,
243	     ntohs(local->srx.transport.sin.sin_port));
237	_net("LOCAL %s %d {%pISp}",
238	     age, local->debug_id, &local->srx.transport);
244 239
245 _leave(" = %p", local); 240 _leave(" = %p", local);
246 return local; 241 return local;
@@ -296,7 +291,7 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local)
296 mutex_unlock(&rxrpc_local_mutex); 291 mutex_unlock(&rxrpc_local_mutex);
297 292
298 ASSERT(RB_EMPTY_ROOT(&local->client_conns)); 293 ASSERT(RB_EMPTY_ROOT(&local->client_conns));
299 ASSERT(list_empty(&local->services)); 294 ASSERT(!local->service);
300 295
301 if (socket) { 296 if (socket) {
302 local->socket = NULL; 297 local->socket = NULL;
@@ -308,7 +303,6 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local)
308 /* At this point, there should be no more packets coming in to the 303 /* At this point, there should be no more packets coming in to the
309 * local endpoint. 304 * local endpoint.
310 */ 305 */
311 rxrpc_purge_queue(&local->accept_queue);
312 rxrpc_purge_queue(&local->reject_queue); 306 rxrpc_purge_queue(&local->reject_queue);
313 rxrpc_purge_queue(&local->event_queue); 307 rxrpc_purge_queue(&local->event_queue);
314 308
@@ -332,11 +326,6 @@ static void rxrpc_local_processor(struct work_struct *work)
332 if (atomic_read(&local->usage) == 0) 326 if (atomic_read(&local->usage) == 0)
333 return rxrpc_local_destroyer(local); 327 return rxrpc_local_destroyer(local);
334 328
335 if (!skb_queue_empty(&local->accept_queue)) {
336 rxrpc_accept_incoming_calls(local);
337 again = true;
338 }
339
340 if (!skb_queue_empty(&local->reject_queue)) { 329 if (!skb_queue_empty(&local->reject_queue)) {
341 rxrpc_reject_packets(local); 330 rxrpc_reject_packets(local);
342 again = true; 331 again = true;
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index bdc5e42fe600..6dee55fad2d3 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -21,28 +21,33 @@
21unsigned int rxrpc_max_backlog __read_mostly = 10; 21unsigned int rxrpc_max_backlog __read_mostly = 10;
22 22
23/* 23/*
24 * Maximum lifetime of a call (in mx).
25 */
26unsigned int rxrpc_max_call_lifetime = 60 * 1000;
27
28/*
24 * How long to wait before scheduling ACK generation after seeing a 29 * How long to wait before scheduling ACK generation after seeing a
25 * packet with RXRPC_REQUEST_ACK set (in jiffies). 30 * packet with RXRPC_REQUEST_ACK set (in ms).
26 */ 31 */
27unsigned int rxrpc_requested_ack_delay = 1; 32unsigned int rxrpc_requested_ack_delay = 1;
28 33
29/* 34/*
30 * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). 35 * How long to wait before scheduling an ACK with subtype DELAY (in ms).
31 * 36 *
32 * We use this when we've received new data packets. If those packets aren't 37 * We use this when we've received new data packets. If those packets aren't
33 * all consumed within this time we will send a DELAY ACK if an ACK was not 38 * all consumed within this time we will send a DELAY ACK if an ACK was not
34 * requested to let the sender know it doesn't need to resend. 39 * requested to let the sender know it doesn't need to resend.
35 */ 40 */
36unsigned int rxrpc_soft_ack_delay = 1 * HZ; 41unsigned int rxrpc_soft_ack_delay = 1 * 1000;
37 42
38/* 43/*
39 * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). 44 * How long to wait before scheduling an ACK with subtype IDLE (in ms).
40 * 45 *
41 * We use this when we've consumed some previously soft-ACK'd packets when 46 * We use this when we've consumed some previously soft-ACK'd packets when
42 * further packets aren't immediately received to decide when to send an IDLE 47 * further packets aren't immediately received to decide when to send an IDLE
43 * ACK let the other end know that it can free up its Tx buffer space. 48 * ACK let the other end know that it can free up its Tx buffer space.
44 */ 49 */
45unsigned int rxrpc_idle_ack_delay = 0.5 * HZ; 50unsigned int rxrpc_idle_ack_delay = 0.5 * 1000;
46 51
47/* 52/*
48 * Receive window size in packets. This indicates the maximum number of 53 * Receive window size in packets. This indicates the maximum number of
@@ -50,7 +55,10 @@ unsigned int rxrpc_idle_ack_delay = 0.5 * HZ;
50 * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further 55 * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further
51 * packets. 56 * packets.
52 */ 57 */
53unsigned int rxrpc_rx_window_size = 32; 58unsigned int rxrpc_rx_window_size = RXRPC_INIT_RX_WINDOW_SIZE;
59#if (RXRPC_RXTX_BUFF_SIZE - 1) < RXRPC_INIT_RX_WINDOW_SIZE
60#error Need to reduce RXRPC_INIT_RX_WINDOW_SIZE
61#endif
54 62
55/* 63/*
56 * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet 64 * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
@@ -64,6 +72,11 @@ unsigned int rxrpc_rx_mtu = 5692;
64 */ 72 */
65unsigned int rxrpc_rx_jumbo_max = 4; 73unsigned int rxrpc_rx_jumbo_max = 4;
66 74
75/*
76 * Time till packet resend (in milliseconds).
77 */
78unsigned int rxrpc_resend_timeout = 4 * 1000;
79
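With these tunables now held in milliseconds rather than jiffies, a caller converts at the point where a timer is armed. A hedged kernel-style fragment, assuming <linux/jiffies.h> and an invented helper name; it is not part of this patch:

/* Illustrative only: convert an ms tunable when scheduling a timer. */
unsigned long rxrpc_resend_expiry(unsigned int resend_timeout_ms)
{
	/* msecs_to_jiffies() rounds up, so short timeouts are never lost. */
	return jiffies + msecs_to_jiffies(resend_timeout_ms);
}
/* e.g. mod_timer(&call->timer, rxrpc_resend_expiry(rxrpc_resend_timeout)); */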
67const char *const rxrpc_pkts[] = { 80const char *const rxrpc_pkts[] = {
68 "?00", 81 "?00",
69 "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", 82 "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
@@ -75,21 +88,154 @@ const s8 rxrpc_ack_priority[] = {
75 [RXRPC_ACK_DELAY] = 1, 88 [RXRPC_ACK_DELAY] = 1,
76 [RXRPC_ACK_REQUESTED] = 2, 89 [RXRPC_ACK_REQUESTED] = 2,
77 [RXRPC_ACK_IDLE] = 3, 90 [RXRPC_ACK_IDLE] = 3,
78 [RXRPC_ACK_PING_RESPONSE] = 4, 91 [RXRPC_ACK_DUPLICATE] = 4,
79 [RXRPC_ACK_DUPLICATE] = 5, 92 [RXRPC_ACK_OUT_OF_SEQUENCE] = 5,
80 [RXRPC_ACK_OUT_OF_SEQUENCE] = 6, 93 [RXRPC_ACK_EXCEEDS_WINDOW] = 6,
81 [RXRPC_ACK_EXCEEDS_WINDOW] = 7, 94 [RXRPC_ACK_NOSPACE] = 7,
82 [RXRPC_ACK_NOSPACE] = 8, 95 [RXRPC_ACK_PING_RESPONSE] = 8,
83}; 96};
84 97
85const char *rxrpc_acks(u8 reason)
86{
87	static const char *const str[] = {
88		"---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
89		"IDL", "-?-"
90	};
91
92	if (reason >= ARRAY_SIZE(str))
93		reason = ARRAY_SIZE(str) - 1;
94	return str[reason];
95}
98const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
99	"---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
100	"IDL", "-?-"
101};
102
103const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = {
104	[rxrpc_skb_rx_cleaned]		= "Rx CLN",
105	[rxrpc_skb_rx_freed]		= "Rx FRE",
106	[rxrpc_skb_rx_got]		= "Rx GOT",
107	[rxrpc_skb_rx_lost]		= "Rx *L*",
108	[rxrpc_skb_rx_received]		= "Rx RCV",
109 [rxrpc_skb_rx_purged] = "Rx PUR",
110 [rxrpc_skb_rx_rotated] = "Rx ROT",
111 [rxrpc_skb_rx_seen] = "Rx SEE",
112 [rxrpc_skb_tx_cleaned] = "Tx CLN",
113 [rxrpc_skb_tx_freed] = "Tx FRE",
114 [rxrpc_skb_tx_got] = "Tx GOT",
115 [rxrpc_skb_tx_new] = "Tx NEW",
116 [rxrpc_skb_tx_rotated] = "Tx ROT",
117 [rxrpc_skb_tx_seen] = "Tx SEE",
118};
119
120const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = {
121 [rxrpc_conn_new_client] = "NWc",
122 [rxrpc_conn_new_service] = "NWs",
123 [rxrpc_conn_queued] = "QUE",
124 [rxrpc_conn_seen] = "SEE",
125 [rxrpc_conn_got] = "GOT",
126 [rxrpc_conn_put_client] = "PTc",
127 [rxrpc_conn_put_service] = "PTs",
128};
129
130const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = {
131 [rxrpc_client_activate_chans] = "Activa",
132 [rxrpc_client_alloc] = "Alloc ",
133 [rxrpc_client_chan_activate] = "ChActv",
134 [rxrpc_client_chan_disconnect] = "ChDisc",
135 [rxrpc_client_chan_pass] = "ChPass",
136 [rxrpc_client_chan_unstarted] = "ChUnst",
137 [rxrpc_client_cleanup] = "Clean ",
138 [rxrpc_client_count] = "Count ",
139 [rxrpc_client_discard] = "Discar",
140 [rxrpc_client_duplicate] = "Duplic",
141 [rxrpc_client_exposed] = "Expose",
142 [rxrpc_client_replace] = "Replac",
143 [rxrpc_client_to_active] = "->Actv",
144 [rxrpc_client_to_culled] = "->Cull",
145 [rxrpc_client_to_idle] = "->Idle",
146 [rxrpc_client_to_inactive] = "->Inac",
147 [rxrpc_client_to_waiting] = "->Wait",
148 [rxrpc_client_uncount] = "Uncoun",
149};
150
151const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = {
152 [rxrpc_transmit_wait] = "WAI",
153 [rxrpc_transmit_queue] = "QUE",
154 [rxrpc_transmit_queue_last] = "QLS",
155 [rxrpc_transmit_rotate] = "ROT",
156 [rxrpc_transmit_rotate_last] = "RLS",
157 [rxrpc_transmit_await_reply] = "AWR",
158 [rxrpc_transmit_end] = "END",
159};
160
161const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = {
162 [rxrpc_receive_incoming] = "INC",
163 [rxrpc_receive_queue] = "QUE",
164 [rxrpc_receive_queue_last] = "QLS",
165 [rxrpc_receive_front] = "FRN",
166 [rxrpc_receive_rotate] = "ROT",
167 [rxrpc_receive_end] = "END",
168};
169
170const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = {
171 [rxrpc_recvmsg_enter] = "ENTR",
172 [rxrpc_recvmsg_wait] = "WAIT",
173 [rxrpc_recvmsg_dequeue] = "DEQU",
174 [rxrpc_recvmsg_hole] = "HOLE",
175 [rxrpc_recvmsg_next] = "NEXT",
176 [rxrpc_recvmsg_cont] = "CONT",
177 [rxrpc_recvmsg_full] = "FULL",
178 [rxrpc_recvmsg_data_return] = "DATA",
179 [rxrpc_recvmsg_terminal] = "TERM",
180 [rxrpc_recvmsg_to_be_accepted] = "TBAC",
181 [rxrpc_recvmsg_return] = "RETN",
182};
183
184const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = {
185 [rxrpc_rtt_tx_ping] = "PING",
186 [rxrpc_rtt_tx_data] = "DATA",
187};
188
189const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = {
190 [rxrpc_rtt_rx_ping_response] = "PONG",
191 [rxrpc_rtt_rx_requested_ack] = "RACK",
192};
193
194const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = {
195 [rxrpc_timer_begin] = "Begin ",
196 [rxrpc_timer_expired] = "*EXPR*",
197 [rxrpc_timer_init_for_reply] = "IniRpl",
198 [rxrpc_timer_init_for_send_reply] = "SndRpl",
199 [rxrpc_timer_set_for_ack] = "SetAck",
200 [rxrpc_timer_set_for_ping] = "SetPng",
201 [rxrpc_timer_set_for_send] = "SetTx ",
202 [rxrpc_timer_set_for_resend] = "SetRTx",
203};
204
205const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = {
206 [rxrpc_propose_ack_client_tx_end] = "ClTxEnd",
207 [rxrpc_propose_ack_input_data] = "DataIn ",
208 [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck",
209 [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl",
210 [rxrpc_propose_ack_ping_for_params] = "Params ",
211 [rxrpc_propose_ack_processing_op] = "ProcOp ",
212 [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack",
213 [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png",
214 [rxrpc_propose_ack_retry_tx] = "RetryTx",
215 [rxrpc_propose_ack_rotate_rx] = "RxAck ",
216 [rxrpc_propose_ack_terminal_ack] = "ClTerm ",
217};
218
219const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = {
220 [rxrpc_propose_ack_use] = "",
221 [rxrpc_propose_ack_update] = " Update",
222 [rxrpc_propose_ack_subsume] = " Subsume",
223};
224
225const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = {
226 [RXRPC_CALL_SLOW_START] = "SlowStart",
227 [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid",
228 [RXRPC_CALL_PACKET_LOSS] = "PktLoss ",
229 [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ",
230};
231
232const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = {
233 [rxrpc_cong_begin_retransmission] = " Retrans",
234 [rxrpc_cong_cleared_nacks] = " Cleared",
235 [rxrpc_cong_new_low_nack] = " NewLowN",
236 [rxrpc_cong_no_change] = "",
237 [rxrpc_cong_progress] = " Progres",
238 [rxrpc_cong_retransmit_again] = " ReTxAgn",
239 [rxrpc_cong_rtt_window_end] = " RttWinE",
240 [rxrpc_cong_saw_nack] = " SawNack",
241};
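The tables above map enum values to short fixed-width labels with designated initialisers, so a tracepoint can print a name with a plain array index. A stand-alone sketch of the idiom with invented names:

/* Illustrative only: an enum-indexed string table for trace output. */
#include <stdio.h>

enum cong_mode {
	SLOW_START, CONG_AVOIDANCE, PACKET_LOSS, FAST_RETRANSMIT, NR_MODES
};

static const char mode_names[NR_MODES][10] = {
	[SLOW_START]		= "SlowStart",
	[CONG_AVOIDANCE]	= "CongAvoid",
	[PACKET_LOSS]		= "PktLoss ",
	[FAST_RETRANSMIT]	= "FastReTx ",
};

int main(void)
{
	enum cong_mode mode = CONG_AVOIDANCE;

	printf("mode=%s\n", mode_names[mode]);	/* prints "CongAvoid" */
	return 0;
}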
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index f4bda06b7d2d..5dab1ff3a6c2 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -14,336 +14,351 @@
14#include <linux/net.h> 14#include <linux/net.h>
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <linux/circ_buf.h>
18#include <linux/export.h> 17#include <linux/export.h>
19#include <net/sock.h> 18#include <net/sock.h>
20#include <net/af_rxrpc.h> 19#include <net/af_rxrpc.h>
21#include "ar-internal.h" 20#include "ar-internal.h"
22 21
23/*
24 * Time till packet resend (in jiffies).
25 */
26unsigned int rxrpc_resend_timeout = 4 * HZ;
27
28static int rxrpc_send_data(struct rxrpc_sock *rx,
29			   struct rxrpc_call *call,
30			   struct msghdr *msg, size_t len);
22struct rxrpc_ack_buffer {
23	struct rxrpc_wire_header whdr;
24	struct rxrpc_ackpacket ack;
25	u8 acks[255];
26	u8 pad[3];
27	struct rxrpc_ackinfo ackinfo;
28};
31
32/*
33 * extract control messages from the sendmsg() control buffer
34 */
35static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
36 unsigned long *user_call_ID,
37 enum rxrpc_command *command,
38 u32 *abort_code,
39 bool *_exclusive)
40{
41 struct cmsghdr *cmsg;
42 bool got_user_ID = false;
43 int len;
44
45 *command = RXRPC_CMD_SEND_DATA;
46
47 if (msg->msg_controllen == 0)
48 return -EINVAL;
49
50 for_each_cmsghdr(cmsg, msg) {
51 if (!CMSG_OK(msg, cmsg))
52 return -EINVAL;
53
54 len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
55 _debug("CMSG %d, %d, %d",
56 cmsg->cmsg_level, cmsg->cmsg_type, len);
57
58 if (cmsg->cmsg_level != SOL_RXRPC)
59 continue;
60
61 switch (cmsg->cmsg_type) {
62 case RXRPC_USER_CALL_ID:
63 if (msg->msg_flags & MSG_CMSG_COMPAT) {
64 if (len != sizeof(u32))
65 return -EINVAL;
66 *user_call_ID = *(u32 *) CMSG_DATA(cmsg);
67 } else {
68 if (len != sizeof(unsigned long))
69 return -EINVAL;
70 *user_call_ID = *(unsigned long *)
71 CMSG_DATA(cmsg);
72 }
73 _debug("User Call ID %lx", *user_call_ID);
74 got_user_ID = true;
75 break;
76
77 case RXRPC_ABORT:
78 if (*command != RXRPC_CMD_SEND_DATA)
79 return -EINVAL;
80 *command = RXRPC_CMD_SEND_ABORT;
81 if (len != sizeof(*abort_code))
82 return -EINVAL;
83 *abort_code = *(unsigned int *) CMSG_DATA(cmsg);
84 _debug("Abort %x", *abort_code);
85 if (*abort_code == 0)
86 return -EINVAL;
87 break;
88
89 case RXRPC_ACCEPT:
90 if (*command != RXRPC_CMD_SEND_DATA)
91 return -EINVAL;
92 *command = RXRPC_CMD_ACCEPT;
93 if (len != 0)
94 return -EINVAL;
95 break;
96
97 case RXRPC_EXCLUSIVE_CALL:
98 *_exclusive = true;
99 if (len != 0)
100 return -EINVAL;
101 break;
102 default:
103 return -EINVAL;
104 }
105 }
106 29
107 if (!got_user_ID) 30struct rxrpc_abort_buffer {
108 return -EINVAL; 31 struct rxrpc_wire_header whdr;
109 _leave(" = 0"); 32 __be32 abort_code;
110 return 0; 33};
111}
112 34
113/* 35/*
114 * abort a call, sending an ABORT packet to the peer 36 * Fill out an ACK packet.
115 */ 37 */
116static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) 38static size_t rxrpc_fill_out_ack(struct rxrpc_call *call,
39 struct rxrpc_ack_buffer *pkt,
40 rxrpc_seq_t *_hard_ack,
41 rxrpc_seq_t *_top,
42 u8 reason)
117{ 43{
118 write_lock_bh(&call->state_lock); 44 rxrpc_serial_t serial;
119 45 rxrpc_seq_t hard_ack, top, seq;
120 if (call->state <= RXRPC_CALL_COMPLETE) { 46 int ix;
121 call->state = RXRPC_CALL_LOCALLY_ABORTED; 47 u32 mtu, jmax;
122 call->local_abort = abort_code; 48 u8 *ackp = pkt->acks;
123 set_bit(RXRPC_CALL_EV_ABORT, &call->events); 49
124 del_timer_sync(&call->resend_timer); 50 /* Barrier against rxrpc_input_data(). */
125 del_timer_sync(&call->ack_timer); 51 serial = call->ackr_serial;
126 clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); 52 hard_ack = READ_ONCE(call->rx_hard_ack);
127 clear_bit(RXRPC_CALL_EV_ACK, &call->events); 53 top = smp_load_acquire(&call->rx_top);
128 clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); 54 *_hard_ack = hard_ack;
129 rxrpc_queue_call(call); 55 *_top = top;
56
57 pkt->ack.bufferSpace = htons(8);
58 pkt->ack.maxSkew = htons(call->ackr_skew);
59 pkt->ack.firstPacket = htonl(hard_ack + 1);
60 pkt->ack.previousPacket = htonl(call->ackr_prev_seq);
61 pkt->ack.serial = htonl(serial);
62 pkt->ack.reason = reason;
63 pkt->ack.nAcks = top - hard_ack;
64
65 if (reason == RXRPC_ACK_PING)
66 pkt->whdr.flags |= RXRPC_REQUEST_ACK;
67
68 if (after(top, hard_ack)) {
69 seq = hard_ack + 1;
70 do {
71 ix = seq & RXRPC_RXTX_BUFF_MASK;
72 if (call->rxtx_buffer[ix])
73 *ackp++ = RXRPC_ACK_TYPE_ACK;
74 else
75 *ackp++ = RXRPC_ACK_TYPE_NACK;
76 seq++;
77 } while (before_eq(seq, top));
130 } 78 }
131 79
132 write_unlock_bh(&call->state_lock); 80 mtu = call->conn->params.peer->if_mtu;
133} 81 mtu -= call->conn->params.peer->hdrsize;
134 82 jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max;
135/* 83 pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
136 * Create a new client call for sendmsg(). 84 pkt->ackinfo.maxMTU = htonl(mtu);
137 */ 85 pkt->ackinfo.rwind = htonl(call->rx_winsize);
138static struct rxrpc_call * 86 pkt->ackinfo.jumbo_max = htonl(jmax);
139rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, 87
140 unsigned long user_call_ID, bool exclusive) 88 *ackp++ = 0;
141{ 89 *ackp++ = 0;
142 struct rxrpc_conn_parameters cp; 90 *ackp++ = 0;
143 struct rxrpc_call *call; 91 return top - hard_ack + 3;
144 struct key *key;
145
146 DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name);
147
148 _enter("");
149
150 if (!msg->msg_name)
151 return ERR_PTR(-EDESTADDRREQ);
152
153 key = rx->key;
154 if (key && !rx->key->payload.data[0])
155 key = NULL;
156
157 memset(&cp, 0, sizeof(cp));
158 cp.local = rx->local;
159 cp.key = rx->key;
160 cp.security_level = rx->min_sec_level;
161 cp.exclusive = rx->exclusive | exclusive;
162 cp.service_id = srx->srx_service;
163 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL);
164
165 _leave(" = %p\n", call);
166 return call;
167} 92}
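rxrpc_fill_out_ack() above emits one soft-ACK byte per packet between hard_ack + 1 and top: ACK where the ring-buffer slot is occupied, NACK where there is a hole. A stand-alone sketch of that run generation over a small ring (names and sizes invented, no wrap-around handling):

/* Illustrative only: building a soft-ACK run from a receive ring. */
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8			/* must be a power of two */
#define RING_MASK (RING_SIZE - 1)
#define ACK  1				/* stands in for RXRPC_ACK_TYPE_ACK */
#define NACK 0				/* stands in for RXRPC_ACK_TYPE_NACK */

static size_t fill_soft_acks(const void *ring[], uint32_t hard_ack,
			     uint32_t top, uint8_t *acks)
{
	size_t n = 0;

	for (uint32_t seq = hard_ack + 1; seq <= top; seq++)
		acks[n++] = ring[seq & RING_MASK] ? ACK : NACK;
	return n;
}

int main(void)
{
	int pkt = 1;
	const void *ring[RING_SIZE] = { [1] = &pkt, [3] = &pkt };	/* seq 2 missing */
	uint8_t acks[RING_SIZE];
	size_t n = fill_soft_acks(ring, 0, 3, acks);

	for (size_t i = 0; i < n; i++)
		printf("%u", acks[i]);		/* prints "101" */
	printf("\n");
	return 0;
}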
168 93
169/* 94/*
170 * send a message forming part of a client call through an RxRPC socket 95 * Send an ACK call packet.
171 * - caller holds the socket locked
172 * - the socket may be either a client socket or a server socket
173 */ 96 */
174int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) 97int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping)
175{ 98{
176 enum rxrpc_command cmd; 99 struct rxrpc_connection *conn = NULL;
177 struct rxrpc_call *call; 100 struct rxrpc_ack_buffer *pkt;
178 unsigned long user_call_ID = 0; 101 struct msghdr msg;
179 bool exclusive = false; 102 struct kvec iov[2];
180 u32 abort_code = 0; 103 rxrpc_serial_t serial;
104 rxrpc_seq_t hard_ack, top;
105 size_t len, n;
181 int ret; 106 int ret;
182 107 u8 reason;
183 _enter(""); 108
184 109 spin_lock_bh(&call->lock);
185 ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code, 110 if (call->conn)
186 &exclusive); 111 conn = rxrpc_get_connection_maybe(call->conn);
187 if (ret < 0) 112 spin_unlock_bh(&call->lock);
188 return ret; 113 if (!conn)
189 114 return -ECONNRESET;
190 if (cmd == RXRPC_CMD_ACCEPT) { 115
191 if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) 116 pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
192 return -EINVAL; 117 if (!pkt) {
193 call = rxrpc_accept_call(rx, user_call_ID); 118 rxrpc_put_connection(conn);
194 if (IS_ERR(call)) 119 return -ENOMEM;
195 return PTR_ERR(call);
196 rxrpc_put_call(call);
197 return 0;
198 } 120 }
199 121
200 call = rxrpc_find_call_by_user_ID(rx, user_call_ID); 122 msg.msg_name = &call->peer->srx.transport;
201 if (!call) { 123 msg.msg_namelen = call->peer->srx.transport_len;
202 if (cmd != RXRPC_CMD_SEND_DATA) 124 msg.msg_control = NULL;
203 return -EBADSLT; 125 msg.msg_controllen = 0;
204 call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, 126 msg.msg_flags = 0;
205 exclusive); 127
206 if (IS_ERR(call)) 128 pkt->whdr.epoch = htonl(conn->proto.epoch);
207 return PTR_ERR(call); 129 pkt->whdr.cid = htonl(call->cid);
130 pkt->whdr.callNumber = htonl(call->call_id);
131 pkt->whdr.seq = 0;
132 pkt->whdr.type = RXRPC_PACKET_TYPE_ACK;
133 pkt->whdr.flags = RXRPC_SLOW_START_OK | conn->out_clientflag;
134 pkt->whdr.userStatus = 0;
135 pkt->whdr.securityIndex = call->security_ix;
136 pkt->whdr._rsvd = 0;
137 pkt->whdr.serviceId = htons(call->service_id);
138
139 spin_lock_bh(&call->lock);
140 if (ping) {
141 reason = RXRPC_ACK_PING;
142 } else {
143 reason = call->ackr_reason;
144 if (!call->ackr_reason) {
145 spin_unlock_bh(&call->lock);
146 ret = 0;
147 goto out;
148 }
149 call->ackr_reason = 0;
150 }
151 n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top, reason);
152
153 spin_unlock_bh(&call->lock);
154
155 iov[0].iov_base = pkt;
156 iov[0].iov_len = sizeof(pkt->whdr) + sizeof(pkt->ack) + n;
157 iov[1].iov_base = &pkt->ackinfo;
158 iov[1].iov_len = sizeof(pkt->ackinfo);
159 len = iov[0].iov_len + iov[1].iov_len;
160
161 serial = atomic_inc_return(&conn->serial);
162 pkt->whdr.serial = htonl(serial);
163 trace_rxrpc_tx_ack(call, serial,
164 ntohl(pkt->ack.firstPacket),
165 ntohl(pkt->ack.serial),
166 pkt->ack.reason, pkt->ack.nAcks);
167
168 if (ping) {
169 call->ping_serial = serial;
170 smp_wmb();
171 /* We need to stick a time in before we send the packet in case
172 * the reply gets back before kernel_sendmsg() completes - but
173 * asking UDP to send the packet can take a relatively long
174 * time, so we update the time after, on the assumption that
175 * the packet transmission is more likely to happen towards the
176 * end of the kernel_sendmsg() call.
177 */
178 call->ping_time = ktime_get_real();
179 set_bit(RXRPC_CALL_PINGING, &call->flags);
180 trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial);
208 } 181 }
209 182
210 _debug("CALL %d USR %lx ST %d on CONN %p", 183 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
211 call->debug_id, call->user_call_ID, call->state, call->conn); 184 if (ping)
212 185 call->ping_time = ktime_get_real();
213 if (call->state >= RXRPC_CALL_COMPLETE) { 186
214 /* it's too late for this call */ 187 if (call->state < RXRPC_CALL_COMPLETE) {
215 ret = -ECONNRESET; 188 if (ret < 0) {
216 } else if (cmd == RXRPC_CMD_SEND_ABORT) { 189 if (ping)
217 rxrpc_send_abort(call, abort_code); 190 clear_bit(RXRPC_CALL_PINGING, &call->flags);
218 ret = 0; 191 rxrpc_propose_ACK(call, pkt->ack.reason,
219 } else if (cmd != RXRPC_CMD_SEND_DATA) { 192 ntohs(pkt->ack.maxSkew),
220 ret = -EINVAL; 193 ntohl(pkt->ack.serial),
221 } else if (!call->in_clientflag && 194 true, true,
222 call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { 195 rxrpc_propose_ack_retry_tx);
223 /* request phase complete for this client call */ 196 } else {
224 ret = -EPROTO; 197 spin_lock_bh(&call->lock);
225 } else if (call->in_clientflag && 198 if (after(hard_ack, call->ackr_consumed))
226 call->state != RXRPC_CALL_SERVER_ACK_REQUEST && 199 call->ackr_consumed = hard_ack;
227 call->state != RXRPC_CALL_SERVER_SEND_REPLY) { 200 if (after(top, call->ackr_seen))
228 /* Reply phase not begun or not complete for service call. */ 201 call->ackr_seen = top;
229 ret = -EPROTO; 202 spin_unlock_bh(&call->lock);
230 } else { 203 }
231 ret = rxrpc_send_data(rx, call, msg, len);
232 } 204 }
233 205
234 rxrpc_put_call(call); 206out:
235 _leave(" = %d", ret); 207 rxrpc_put_connection(conn);
208 kfree(pkt);
236 return ret; 209 return ret;
237} 210}
238 211
239/** 212/*
240 * rxrpc_kernel_send_data - Allow a kernel service to send data on a call 213 * Send an ABORT call packet.
241 * @call: The call to send data through
242 * @msg: The data to send
243 * @len: The amount of data to send
244 *
245 * Allow a kernel service to send data on a call. The call must be in an state
246 * appropriate to sending data. No control data should be supplied in @msg,
247 * nor should an address be supplied. MSG_MORE should be flagged if there's
248 * more data to come, otherwise this data will end the transmission phase.
249 */ 214 */
250int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, 215int rxrpc_send_abort_packet(struct rxrpc_call *call)
251 size_t len)
252{ 216{
217 struct rxrpc_connection *conn = NULL;
218 struct rxrpc_abort_buffer pkt;
219 struct msghdr msg;
220 struct kvec iov[1];
221 rxrpc_serial_t serial;
253 int ret; 222 int ret;
254 223
255 _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); 224 spin_lock_bh(&call->lock);
256 225 if (call->conn)
257 ASSERTCMP(msg->msg_name, ==, NULL); 226 conn = rxrpc_get_connection_maybe(call->conn);
258 ASSERTCMP(msg->msg_control, ==, NULL); 227 spin_unlock_bh(&call->lock);
259 228 if (!conn)
260 lock_sock(&call->socket->sk); 229 return -ECONNRESET;
261
262 _debug("CALL %d USR %lx ST %d on CONN %p",
263 call->debug_id, call->user_call_ID, call->state, call->conn);
264 230
265 if (call->state >= RXRPC_CALL_COMPLETE) { 231 msg.msg_name = &call->peer->srx.transport;
266 ret = -ESHUTDOWN; /* it's too late for this call */ 232 msg.msg_namelen = call->peer->srx.transport_len;
267 } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && 233 msg.msg_control = NULL;
268 call->state != RXRPC_CALL_SERVER_ACK_REQUEST && 234 msg.msg_controllen = 0;
269 call->state != RXRPC_CALL_SERVER_SEND_REPLY) { 235 msg.msg_flags = 0;
270 ret = -EPROTO; /* request phase complete for this client call */ 236
271 } else { 237 pkt.whdr.epoch = htonl(conn->proto.epoch);
272 ret = rxrpc_send_data(call->socket, call, msg, len); 238 pkt.whdr.cid = htonl(call->cid);
273 } 239 pkt.whdr.callNumber = htonl(call->call_id);
274 240 pkt.whdr.seq = 0;
275 release_sock(&call->socket->sk); 241 pkt.whdr.type = RXRPC_PACKET_TYPE_ABORT;
276 _leave(" = %d", ret); 242 pkt.whdr.flags = conn->out_clientflag;
243 pkt.whdr.userStatus = 0;
244 pkt.whdr.securityIndex = call->security_ix;
245 pkt.whdr._rsvd = 0;
246 pkt.whdr.serviceId = htons(call->service_id);
247 pkt.abort_code = htonl(call->abort_code);
248
249 iov[0].iov_base = &pkt;
250 iov[0].iov_len = sizeof(pkt);
251
252 serial = atomic_inc_return(&conn->serial);
253 pkt.whdr.serial = htonl(serial);
254
255 ret = kernel_sendmsg(conn->params.local->socket,
256 &msg, iov, 1, sizeof(pkt));
257
258 rxrpc_put_connection(conn);
277 return ret; 259 return ret;
278} 260}
279 261
280EXPORT_SYMBOL(rxrpc_kernel_send_data);
281
282/**
283 * rxrpc_kernel_abort_call - Allow a kernel service to abort a call
284 * @call: The call to be aborted
285 * @abort_code: The abort code to stick into the ABORT packet
286 *
287 * Allow a kernel service to abort a call, if it's still in an abortable state.
288 */
289void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code)
290{
291 _enter("{%d},%d", call->debug_id, abort_code);
292
293 lock_sock(&call->socket->sk);
294
295 _debug("CALL %d USR %lx ST %d on CONN %p",
296 call->debug_id, call->user_call_ID, call->state, call->conn);
297
298 if (call->state < RXRPC_CALL_COMPLETE)
299 rxrpc_send_abort(call, abort_code);
300
301 release_sock(&call->socket->sk);
302 _leave("");
303}
304
305EXPORT_SYMBOL(rxrpc_kernel_abort_call);
306
307/* 262/*
308 * send a packet through the transport endpoint 263 * send a packet through the transport endpoint
309 */ 264 */
310int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb) 265int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
266 bool retrans)
311{ 267{
312 struct kvec iov[1]; 268 struct rxrpc_connection *conn = call->conn;
269 struct rxrpc_wire_header whdr;
270 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
313 struct msghdr msg; 271 struct msghdr msg;
272 struct kvec iov[2];
273 rxrpc_serial_t serial;
274 size_t len;
275 bool lost = false;
314 int ret, opt; 276 int ret, opt;
315 277
316 _enter(",{%d}", skb->len); 278 _enter(",{%d}", skb->len);
317 279
318 iov[0].iov_base = skb->head; 280 /* Each transmission of a Tx packet needs a new serial number */
319 iov[0].iov_len = skb->len; 281 serial = atomic_inc_return(&conn->serial);
282
283 whdr.epoch = htonl(conn->proto.epoch);
284 whdr.cid = htonl(call->cid);
285 whdr.callNumber = htonl(call->call_id);
286 whdr.seq = htonl(sp->hdr.seq);
287 whdr.serial = htonl(serial);
288 whdr.type = RXRPC_PACKET_TYPE_DATA;
289 whdr.flags = sp->hdr.flags;
290 whdr.userStatus = 0;
291 whdr.securityIndex = call->security_ix;
292 whdr._rsvd = htons(sp->hdr._rsvd);
293 whdr.serviceId = htons(call->service_id);
320 294
321 msg.msg_name = &conn->params.peer->srx.transport; 295 iov[0].iov_base = &whdr;
322 msg.msg_namelen = conn->params.peer->srx.transport_len; 296 iov[0].iov_len = sizeof(whdr);
297 iov[1].iov_base = skb->head;
298 iov[1].iov_len = skb->len;
299 len = iov[0].iov_len + iov[1].iov_len;
300
301 msg.msg_name = &call->peer->srx.transport;
302 msg.msg_namelen = call->peer->srx.transport_len;
323 msg.msg_control = NULL; 303 msg.msg_control = NULL;
324 msg.msg_controllen = 0; 304 msg.msg_controllen = 0;
325 msg.msg_flags = 0; 305 msg.msg_flags = 0;
326 306
327 /* send the packet with the don't fragment bit set if we currently 307 /* If our RTT cache needs working on, request an ACK. Also request
328 * think it's small enough */ 308 * ACKs if a DATA packet appears to have been lost.
329 if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) { 309 */
330 down_read(&conn->params.local->defrag_sem); 310 if (!(sp->hdr.flags & RXRPC_LAST_PACKET) &&
331 /* send the packet by UDP 311 (retrans ||
332 * - returns -EMSGSIZE if UDP would have to fragment the packet 312 call->cong_mode == RXRPC_CALL_SLOW_START ||
333 * to go out of the interface 313 (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) ||
334 * - in which case, we'll have processed the ICMP error 314 ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
335 * message and update the peer record 315 ktime_get_real())))
336 */ 316 whdr.flags |= RXRPC_REQUEST_ACK;
337 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, 317
338 iov[0].iov_len); 318 if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
319 static int lose;
320 if ((lose++ & 7) == 7) {
321 ret = 0;
322 lost = true;
323 goto done;
324 }
325 }
339 326
340 up_read(&conn->params.local->defrag_sem); 327 _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq);
341 if (ret == -EMSGSIZE)
342 goto send_fragmentable;
343 328
344 _leave(" = %d [%u]", ret, conn->params.peer->maxdata); 329 /* send the packet with the don't fragment bit set if we currently
345 return ret; 330 * think it's small enough */
331 if (iov[1].iov_len >= call->peer->maxdata)
332 goto send_fragmentable;
333
334 down_read(&conn->params.local->defrag_sem);
335 /* send the packet by UDP
336 * - returns -EMSGSIZE if UDP would have to fragment the packet
337 * to go out of the interface
338 * - in which case, we'll have processed the ICMP error
339 * message and update the peer record
340 */
341 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
342
343 up_read(&conn->params.local->defrag_sem);
344 if (ret == -EMSGSIZE)
345 goto send_fragmentable;
346
347done:
348 trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags,
349 retrans, lost);
350 if (ret >= 0) {
351 ktime_t now = ktime_get_real();
352 skb->tstamp = now;
353 smp_wmb();
354 sp->hdr.serial = serial;
355 if (whdr.flags & RXRPC_REQUEST_ACK) {
356 call->peer->rtt_last_req = now;
357 trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial);
358 }
346 } 359 }
360 _leave(" = %d [%u]", ret, call->peer->maxdata);
361 return ret;
347 362
348send_fragmentable: 363send_fragmentable:
349 /* attempt to send this message with fragmentation enabled */ 364 /* attempt to send this message with fragmentation enabled */
@@ -358,8 +373,8 @@ send_fragmentable:
358 SOL_IP, IP_MTU_DISCOVER, 373 SOL_IP, IP_MTU_DISCOVER,
359 (char *)&opt, sizeof(opt)); 374 (char *)&opt, sizeof(opt));
360 if (ret == 0) { 375 if (ret == 0) {
361 ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, 376 ret = kernel_sendmsg(conn->params.local->socket, &msg,
362 iov[0].iov_len); 377 iov, 2, len);
363 378
364 opt = IP_PMTUDISC_DO; 379 opt = IP_PMTUDISC_DO;
365 kernel_setsockopt(conn->params.local->socket, SOL_IP, 380 kernel_setsockopt(conn->params.local->socket, SOL_IP,
@@ -367,355 +382,82 @@ send_fragmentable:
367 (char *)&opt, sizeof(opt)); 382 (char *)&opt, sizeof(opt));
368 } 383 }
369 break; 384 break;
370 }
371
372 up_write(&conn->params.local->defrag_sem);
373 _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata);
374 return ret;
375}
376 385
377/* 386#ifdef CONFIG_AF_RXRPC_IPV6
378 * wait for space to appear in the transmit/ACK window 387 case AF_INET6:
379 * - caller holds the socket locked 388 opt = IPV6_PMTUDISC_DONT;
380 */ 389 ret = kernel_setsockopt(conn->params.local->socket,
381static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, 390 SOL_IPV6, IPV6_MTU_DISCOVER,
382 struct rxrpc_call *call, 391 (char *)&opt, sizeof(opt));
383 long *timeo) 392 if (ret == 0) {
384{ 393 ret = kernel_sendmsg(conn->params.local->socket, &msg,
385 DECLARE_WAITQUEUE(myself, current); 394 iov, 1, iov[0].iov_len);
386 int ret;
387
388 _enter(",{%d},%ld",
389 CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
390 call->acks_winsz),
391 *timeo);
392
393 add_wait_queue(&call->tx_waitq, &myself);
394
395 for (;;) {
396 set_current_state(TASK_INTERRUPTIBLE);
397 ret = 0;
398 if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
399 call->acks_winsz) > 0)
400 break;
401 if (signal_pending(current)) {
402 ret = sock_intr_errno(*timeo);
403 break;
404 }
405
406 release_sock(&rx->sk);
407 *timeo = schedule_timeout(*timeo);
408 lock_sock(&rx->sk);
409 }
410
411 remove_wait_queue(&call->tx_waitq, &myself);
412 set_current_state(TASK_RUNNING);
413 _leave(" = %d", ret);
414 return ret;
415}
416
417/*
418 * attempt to schedule an instant Tx resend
419 */
420static inline void rxrpc_instant_resend(struct rxrpc_call *call)
421{
422 read_lock_bh(&call->state_lock);
423 if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
424 clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
425 if (call->state < RXRPC_CALL_COMPLETE &&
426 !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
427 rxrpc_queue_call(call);
428 }
429 read_unlock_bh(&call->state_lock);
430}
431
432/*
433 * queue a packet for transmission, set the resend timer and attempt
434 * to send the packet immediately
435 */
436static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
437 bool last)
438{
439 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
440 int ret;
441 395
442 _net("queue skb %p [%d]", skb, call->acks_head); 396 opt = IPV6_PMTUDISC_DO;
443 397 kernel_setsockopt(conn->params.local->socket,
444 ASSERT(call->acks_window != NULL); 398 SOL_IPV6, IPV6_MTU_DISCOVER,
445 call->acks_window[call->acks_head] = (unsigned long) skb; 399 (char *)&opt, sizeof(opt));
446 smp_wmb();
447 call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1);
448
449 if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
450 _debug("________awaiting reply/ACK__________");
451 write_lock_bh(&call->state_lock);
452 switch (call->state) {
453 case RXRPC_CALL_CLIENT_SEND_REQUEST:
454 call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
455 break;
456 case RXRPC_CALL_SERVER_ACK_REQUEST:
457 call->state = RXRPC_CALL_SERVER_SEND_REPLY;
458 if (!last)
459 break;
460 case RXRPC_CALL_SERVER_SEND_REPLY:
461 call->state = RXRPC_CALL_SERVER_AWAIT_ACK;
462 break;
463 default:
464 break;
465 } 400 }
466 write_unlock_bh(&call->state_lock); 401 break;
467 } 402#endif
468
469 _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
470
471 sp->need_resend = false;
472 sp->resend_at = jiffies + rxrpc_resend_timeout;
473 if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) {
474 _debug("run timer");
475 call->resend_timer.expires = sp->resend_at;
476 add_timer(&call->resend_timer);
477 }
478
479 /* attempt to cancel the rx-ACK timer, deferring reply transmission if
480 * we're ACK'ing the request phase of an incoming call */
481 ret = -EAGAIN;
482 if (try_to_del_timer_sync(&call->ack_timer) >= 0) {
483 /* the packet may be freed by rxrpc_process_call() before this
484 * returns */
485 ret = rxrpc_send_data_packet(call->conn, skb);
486 _net("sent skb %p", skb);
487 } else {
488 _debug("failed to delete ACK timer");
489 }
490
491 if (ret < 0) {
492 _debug("need instant resend %d", ret);
493 sp->need_resend = true;
494 rxrpc_instant_resend(call);
495 } 403 }
496 404
497 _leave(""); 405 up_write(&conn->params.local->defrag_sem);
498} 406 goto done;
499
500/*
501 * Convert a host-endian header into a network-endian header.
502 */
503static void rxrpc_insert_header(struct sk_buff *skb)
504{
505 struct rxrpc_wire_header whdr;
506 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
507
508 whdr.epoch = htonl(sp->hdr.epoch);
509 whdr.cid = htonl(sp->hdr.cid);
510 whdr.callNumber = htonl(sp->hdr.callNumber);
511 whdr.seq = htonl(sp->hdr.seq);
512 whdr.serial = htonl(sp->hdr.serial);
513 whdr.type = sp->hdr.type;
514 whdr.flags = sp->hdr.flags;
515 whdr.userStatus = sp->hdr.userStatus;
516 whdr.securityIndex = sp->hdr.securityIndex;
517 whdr._rsvd = htons(sp->hdr._rsvd);
518 whdr.serviceId = htons(sp->hdr.serviceId);
519
520 memcpy(skb->head, &whdr, sizeof(whdr));
521} 407}
522 408
523/* 409/*
524 * send data through a socket 410 * reject packets through the local endpoint
525 * - must be called in process context
526 * - caller holds the socket locked
527 */ 411 */
528static int rxrpc_send_data(struct rxrpc_sock *rx, 412void rxrpc_reject_packets(struct rxrpc_local *local)
529 struct rxrpc_call *call,
530 struct msghdr *msg, size_t len)
531{ 413{
414 struct sockaddr_rxrpc srx;
532 struct rxrpc_skb_priv *sp; 415 struct rxrpc_skb_priv *sp;
416 struct rxrpc_wire_header whdr;
533 struct sk_buff *skb; 417 struct sk_buff *skb;
534 struct sock *sk = &rx->sk; 418 struct msghdr msg;
535 long timeo; 419 struct kvec iov[2];
536 bool more; 420 size_t size;
537 int ret, copied; 421 __be32 code;
538
539 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
540
541 /* this should be in poll */
542 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
543
544 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
545 return -EPIPE;
546
547 more = msg->msg_flags & MSG_MORE;
548
549 skb = call->tx_pending;
550 call->tx_pending = NULL;
551
552 copied = 0;
553 do {
554 if (!skb) {
555 size_t size, chunk, max, space;
556
557 _debug("alloc");
558
559 if (CIRC_SPACE(call->acks_head,
560 ACCESS_ONCE(call->acks_tail),
561 call->acks_winsz) <= 0) {
562 ret = -EAGAIN;
563 if (msg->msg_flags & MSG_DONTWAIT)
564 goto maybe_error;
565 ret = rxrpc_wait_for_tx_window(rx, call,
566 &timeo);
567 if (ret < 0)
568 goto maybe_error;
569 }
570
571 max = call->conn->params.peer->maxdata;
572 max -= call->conn->security_size;
573 max &= ~(call->conn->size_align - 1UL);
574
575 chunk = max;
576 if (chunk > msg_data_left(msg) && !more)
577 chunk = msg_data_left(msg);
578
579 space = chunk + call->conn->size_align;
580 space &= ~(call->conn->size_align - 1UL);
581
582 size = space + call->conn->header_size;
583
584 _debug("SIZE: %zu/%zu/%zu", chunk, space, size);
585
586 /* create a buffer that we can retain until it's ACK'd */
587 skb = sock_alloc_send_skb(
588 sk, size, msg->msg_flags & MSG_DONTWAIT, &ret);
589 if (!skb)
590 goto maybe_error;
591
592 rxrpc_new_skb(skb);
593
594 _debug("ALLOC SEND %p", skb);
595 422
596 ASSERTCMP(skb->mark, ==, 0); 423 _enter("%d", local->debug_id);
597 424
598 _debug("HS: %u", call->conn->header_size); 425 iov[0].iov_base = &whdr;
599 skb_reserve(skb, call->conn->header_size); 426 iov[0].iov_len = sizeof(whdr);
600 skb->len += call->conn->header_size; 427 iov[1].iov_base = &code;
428 iov[1].iov_len = sizeof(code);
429 size = sizeof(whdr) + sizeof(code);
601 430
602 sp = rxrpc_skb(skb); 431 msg.msg_name = &srx.transport;
603 sp->remain = chunk; 432 msg.msg_control = NULL;
604 if (sp->remain > skb_tailroom(skb)) 433 msg.msg_controllen = 0;
605 sp->remain = skb_tailroom(skb); 434 msg.msg_flags = 0;
606 435
607 _net("skb: hr %d, tr %d, hl %d, rm %d", 436 memset(&whdr, 0, sizeof(whdr));
608 skb_headroom(skb), 437 whdr.type = RXRPC_PACKET_TYPE_ABORT;
609 skb_tailroom(skb),
610 skb_headlen(skb),
611 sp->remain);
612 438
613 skb->ip_summed = CHECKSUM_UNNECESSARY; 439 while ((skb = skb_dequeue(&local->reject_queue))) {
614 } 440 rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
615
616 _debug("append");
617 sp = rxrpc_skb(skb); 441 sp = rxrpc_skb(skb);
618 442
619 /* append next segment of data to the current buffer */ 443 if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) {
620 if (msg_data_left(msg) > 0) { 444 msg.msg_namelen = srx.transport_len;
621 int copy = skb_tailroom(skb);
622 ASSERTCMP(copy, >, 0);
623 if (copy > msg_data_left(msg))
624 copy = msg_data_left(msg);
625 if (copy > sp->remain)
626 copy = sp->remain;
627
628 _debug("add");
629 ret = skb_add_data(skb, &msg->msg_iter, copy);
630 _debug("added");
631 if (ret < 0)
632 goto efault;
633 sp->remain -= copy;
634 skb->mark += copy;
635 copied += copy;
636 }
637 445
638 /* check for the far side aborting the call or a network error 446 code = htonl(skb->priority);
639 * occurring */
640 if (call->state > RXRPC_CALL_COMPLETE)
641 goto call_aborted;
642
643 /* add the packet to the send queue if it's now full */
644 if (sp->remain <= 0 ||
645 (msg_data_left(msg) == 0 && !more)) {
646 struct rxrpc_connection *conn = call->conn;
647 uint32_t seq;
648 size_t pad;
649
650 /* pad out if we're using security */
651 if (conn->security_ix) {
652 pad = conn->security_size + skb->mark;
653 pad = conn->size_align - pad;
654 pad &= conn->size_align - 1;
655 _debug("pad %zu", pad);
656 if (pad)
657 memset(skb_put(skb, pad), 0, pad);
658 }
659
660 seq = atomic_inc_return(&call->sequence);
661
662 sp->hdr.epoch = conn->proto.epoch;
663 sp->hdr.cid = call->cid;
664 sp->hdr.callNumber = call->call_id;
665 sp->hdr.seq = seq;
666 sp->hdr.serial = atomic_inc_return(&conn->serial);
667 sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
668 sp->hdr.userStatus = 0;
669 sp->hdr.securityIndex = conn->security_ix;
670 sp->hdr._rsvd = 0;
671 sp->hdr.serviceId = call->service_id;
672
673 sp->hdr.flags = conn->out_clientflag;
674 if (msg_data_left(msg) == 0 && !more)
675 sp->hdr.flags |= RXRPC_LAST_PACKET;
676 else if (CIRC_SPACE(call->acks_head,
677 ACCESS_ONCE(call->acks_tail),
678 call->acks_winsz) > 1)
679 sp->hdr.flags |= RXRPC_MORE_PACKETS;
680 if (more && seq & 1)
681 sp->hdr.flags |= RXRPC_REQUEST_ACK;
682
683 ret = conn->security->secure_packet(
684 call, skb, skb->mark,
685 skb->head + sizeof(struct rxrpc_wire_header));
686 if (ret < 0)
687 goto out;
688
689 rxrpc_insert_header(skb);
690 rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
691 skb = NULL;
692 }
693 } while (msg_data_left(msg) > 0);
694 447
695success: 448 whdr.epoch = htonl(sp->hdr.epoch);
696 ret = copied; 449 whdr.cid = htonl(sp->hdr.cid);
697out: 450 whdr.callNumber = htonl(sp->hdr.callNumber);
698 call->tx_pending = skb; 451 whdr.serviceId = htons(sp->hdr.serviceId);
699 _leave(" = %d", ret); 452 whdr.flags = sp->hdr.flags;
700 return ret; 453 whdr.flags ^= RXRPC_CLIENT_INITIATED;
454 whdr.flags &= RXRPC_CLIENT_INITIATED;
701 455
702call_aborted: 456 kernel_sendmsg(local->socket, &msg, iov, 2, size);
703 rxrpc_free_skb(skb); 457 }
704 if (call->state == RXRPC_CALL_NETWORK_ERROR)
705 ret = call->error_report < RXRPC_LOCAL_ERROR_OFFSET ?
706 call->error_report :
707 call->error_report - RXRPC_LOCAL_ERROR_OFFSET;
708 else
709 ret = -ECONNABORTED;
710 _leave(" = %d", ret);
711 return ret;
712 458
713maybe_error: 459 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
714 if (copied) 460 }
715 goto success;
716 goto out;
717 461
718efault: 462 _leave("");
719 ret = -EFAULT;
720 goto out;
721} 463}
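The rewritten transmit path above sends the wire header and the payload as one datagram through a two-element kvec, and the send_fragmentable branch briefly switches IP_MTU_DISCOVER off so the stack may fragment a packet that exceeds the peer's maxdata. A userspace sketch of that pattern with sendmsg(), offered as an illustration rather than the kernel code itself:

#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Send header + payload as one UDP datagram; optionally allow the stack
 * to fragment it by clearing path-MTU discovery for this transmission. */
static ssize_t send_two_part(int fd, const struct sockaddr_in *dst,
			     const void *hdr, size_t hdrlen,
			     const void *data, size_t datalen,
			     int allow_frag)
{
	struct iovec iov[2] = {
		{ .iov_base = (void *)hdr,  .iov_len = hdrlen  },
		{ .iov_base = (void *)data, .iov_len = datalen },
	};
	struct msghdr msg = {
		.msg_name    = (void *)dst,
		.msg_namelen = sizeof(*dst),
		.msg_iov     = iov,
		.msg_iovlen  = 2,
	};
	int opt = allow_frag ? IP_PMTUDISC_DONT : IP_PMTUDISC_DO;
	ssize_t ret;

	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &opt, sizeof(opt));
	ret = sendmsg(fd, &msg, 0);

	/* Restore strict path-MTU discovery afterwards, as the patch does. */
	opt = IP_PMTUDISC_DO;
	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &opt, sizeof(opt));
	return ret;
}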
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 8940674b5e08..bf13b8470c9a 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -66,6 +66,32 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
66 } 66 }
67 break; 67 break;
68 68
69#ifdef CONFIG_AF_RXRPC_IPV6
70 case AF_INET6:
71 srx.transport.sin6.sin6_port = serr->port;
72 srx.transport_len = sizeof(struct sockaddr_in6);
73 switch (serr->ee.ee_origin) {
74 case SO_EE_ORIGIN_ICMP6:
75 _net("Rx ICMP6");
76 memcpy(&srx.transport.sin6.sin6_addr,
77 skb_network_header(skb) + serr->addr_offset,
78 sizeof(struct in6_addr));
79 break;
80 case SO_EE_ORIGIN_ICMP:
81 _net("Rx ICMP on v6 sock");
82 memcpy(srx.transport.sin6.sin6_addr.s6_addr + 12,
83 skb_network_header(skb) + serr->addr_offset,
84 sizeof(struct in_addr));
85 break;
86 default:
87 memcpy(&srx.transport.sin6.sin6_addr,
88 &ipv6_hdr(skb)->saddr,
89 sizeof(struct in6_addr));
90 break;
91 }
92 break;
93#endif
94
69 default: 95 default:
70 BUG(); 96 BUG();
71 } 97 }
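The hunk above lets the ICMP error handler cope with IPv6 sockets: an ICMPv6 error carries a full in6_addr, while a plain ICMP error seen on a v6 socket has its IPv4 source copied into the last four bytes of the in6_addr, which is where the v4-mapped form (::ffff:a.b.c.d) keeps it. A self-contained sketch of building a fully mapped address (illustrative only):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct in_addr v4;
	struct in6_addr v6 = IN6ADDR_ANY_INIT;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.1", &v4);

	/* ::ffff:0:0/96 prefix, then the IPv4 address in the low 32 bits. */
	v6.s6_addr[10] = 0xff;
	v6.s6_addr[11] = 0xff;
	memcpy(v6.s6_addr + 12, &v4, sizeof(v4));

	printf("%s\n", inet_ntop(AF_INET6, &v6, buf, sizeof(buf)));
	return 0;
}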
@@ -129,22 +155,21 @@ void rxrpc_error_report(struct sock *sk)
129 _leave("UDP socket errqueue empty"); 155 _leave("UDP socket errqueue empty");
130 return; 156 return;
131 } 157 }
158 rxrpc_new_skb(skb, rxrpc_skb_rx_received);
132 serr = SKB_EXT_ERR(skb); 159 serr = SKB_EXT_ERR(skb);
133 if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { 160 if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
134 _leave("UDP empty message"); 161 _leave("UDP empty message");
135 kfree_skb(skb); 162 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
136 return; 163 return;
137 } 164 }
138 165
139 rxrpc_new_skb(skb);
140
141 rcu_read_lock(); 166 rcu_read_lock();
142 peer = rxrpc_lookup_peer_icmp_rcu(local, skb); 167 peer = rxrpc_lookup_peer_icmp_rcu(local, skb);
143 if (peer && !rxrpc_get_peer_maybe(peer)) 168 if (peer && !rxrpc_get_peer_maybe(peer))
144 peer = NULL; 169 peer = NULL;
145 if (!peer) { 170 if (!peer) {
146 rcu_read_unlock(); 171 rcu_read_unlock();
147 rxrpc_free_skb(skb); 172 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
148 _leave(" [no peer]"); 173 _leave(" [no peer]");
149 return; 174 return;
150 } 175 }
@@ -154,7 +179,7 @@ void rxrpc_error_report(struct sock *sk)
154 serr->ee.ee_code == ICMP_FRAG_NEEDED)) { 179 serr->ee.ee_code == ICMP_FRAG_NEEDED)) {
155 rxrpc_adjust_mtu(peer, serr); 180 rxrpc_adjust_mtu(peer, serr);
156 rcu_read_unlock(); 181 rcu_read_unlock();
157 rxrpc_free_skb(skb); 182 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
158 rxrpc_put_peer(peer); 183 rxrpc_put_peer(peer);
159 _leave(" [MTU update]"); 184 _leave(" [MTU update]");
160 return; 185 return;
@@ -162,7 +187,7 @@ void rxrpc_error_report(struct sock *sk)
162 187
163 rxrpc_store_error(peer, serr); 188 rxrpc_store_error(peer, serr);
164 rcu_read_unlock(); 189 rcu_read_unlock();
165 rxrpc_free_skb(skb); 190 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
166 191
167 /* The ref we obtained is passed off to the work item */ 192 /* The ref we obtained is passed off to the work item */
168 rxrpc_queue_work(&peer->error_distributor); 193 rxrpc_queue_work(&peer->error_distributor);
@@ -248,13 +273,20 @@ void rxrpc_peer_error_distributor(struct work_struct *work)
248 struct rxrpc_peer *peer = 273 struct rxrpc_peer *peer =
249 container_of(work, struct rxrpc_peer, error_distributor); 274 container_of(work, struct rxrpc_peer, error_distributor);
250 struct rxrpc_call *call; 275 struct rxrpc_call *call;
251 int error_report; 276 enum rxrpc_call_completion compl;
277 int error;
252 278
253 _enter(""); 279 _enter("");
254 280
255 error_report = READ_ONCE(peer->error_report); 281 error = READ_ONCE(peer->error_report);
282 if (error < RXRPC_LOCAL_ERROR_OFFSET) {
283 compl = RXRPC_CALL_NETWORK_ERROR;
284 } else {
285 compl = RXRPC_CALL_LOCAL_ERROR;
286 error -= RXRPC_LOCAL_ERROR_OFFSET;
287 }
256 288
257 _debug("ISSUE ERROR %d", error_report); 289 _debug("ISSUE ERROR %s %d", rxrpc_call_completions[compl], error);
258 290
259 spin_lock_bh(&peer->lock); 291 spin_lock_bh(&peer->lock);
260 292
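The error distributor above now splits peer->error_report into a call-completion type plus a plain errno: values below RXRPC_LOCAL_ERROR_OFFSET are treated as network errors, anything at or above it as a local error with the offset stripped. A minimal sketch of that split; the offset constant here is a stand-in, not necessarily the kernel's value:

/* Classify a combined error report into (completion type, errno). */
#define LOCAL_ERROR_OFFSET 0x10000	/* stand-in for RXRPC_LOCAL_ERROR_OFFSET */

enum completion { COMPL_NETWORK_ERROR, COMPL_LOCAL_ERROR };

static enum completion classify_error(int report, int *errnop)
{
	if (report < LOCAL_ERROR_OFFSET) {
		*errnop = report;
		return COMPL_NETWORK_ERROR;
	}
	*errnop = report - LOCAL_ERROR_OFFSET;
	return COMPL_LOCAL_ERROR;
}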
@@ -262,16 +294,10 @@ void rxrpc_peer_error_distributor(struct work_struct *work)
262 call = hlist_entry(peer->error_targets.first, 294 call = hlist_entry(peer->error_targets.first,
263 struct rxrpc_call, error_link); 295 struct rxrpc_call, error_link);
264 hlist_del_init(&call->error_link); 296 hlist_del_init(&call->error_link);
297 rxrpc_see_call(call);
265 298
266 write_lock(&call->state_lock); 299 if (rxrpc_set_call_completion(call, compl, 0, error))
267 if (call->state != RXRPC_CALL_COMPLETE && 300 rxrpc_notify_socket(call);
268 call->state < RXRPC_CALL_NETWORK_ERROR) {
269 call->error_report = error_report;
270 call->state = RXRPC_CALL_NETWORK_ERROR;
271 set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
272 rxrpc_queue_call(call);
273 }
274 write_unlock(&call->state_lock);
275 } 301 }
276 302
277 spin_unlock_bh(&peer->lock); 303 spin_unlock_bh(&peer->lock);
@@ -279,3 +305,44 @@ void rxrpc_peer_error_distributor(struct work_struct *work)
279 rxrpc_put_peer(peer); 305 rxrpc_put_peer(peer);
280 _leave(""); 306 _leave("");
281} 307}
308
309/*
310 * Add RTT information to cache. This is called in softirq mode and has
311 * exclusive access to the peer RTT data.
312 */
313void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
314 rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
315 ktime_t send_time, ktime_t resp_time)
316{
317 struct rxrpc_peer *peer = call->peer;
318 s64 rtt;
319 u64 sum = peer->rtt_sum, avg;
320 u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage;
321
322 rtt = ktime_to_ns(ktime_sub(resp_time, send_time));
323 if (rtt < 0)
324 return;
325
326 /* Replace the oldest datum in the RTT buffer */
327 sum -= peer->rtt_cache[cursor];
328 sum += rtt;
329 peer->rtt_cache[cursor] = rtt;
330 peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1);
331 peer->rtt_sum = sum;
332 if (usage < RXRPC_RTT_CACHE_SIZE) {
333 usage++;
334 peer->rtt_usage = usage;
335 }
336
337 /* Now recalculate the average */
338 if (usage == RXRPC_RTT_CACHE_SIZE) {
339 avg = sum / RXRPC_RTT_CACHE_SIZE;
340 } else {
341 avg = sum;
342 do_div(avg, usage);
343 }
344
345 peer->rtt = avg;
346 trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt,
347 usage, avg);
348}
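rxrpc_peer_add_rtt() above keeps a small power-of-two ring of recent RTT samples together with a running sum, so each new measurement updates the average in constant time. A standalone sketch of the same scheme (illustration, not the kernel structure):

#include <stdint.h>

#define RTT_CACHE_SIZE 8	/* must stay a power of two for the mask below */

struct rtt_estimator {
	uint64_t cache[RTT_CACHE_SIZE];	/* recent samples, in nanoseconds */
	uint64_t sum;			/* sum of the samples in use */
	uint64_t avg;			/* current average */
	unsigned int cursor;		/* next slot to overwrite */
	unsigned int usage;		/* slots holding real data */
};

static void rtt_add_sample(struct rtt_estimator *e, uint64_t rtt_ns)
{
	/* Replace the oldest datum and keep the running sum in step. */
	e->sum -= e->cache[e->cursor];
	e->sum += rtt_ns;
	e->cache[e->cursor] = rtt_ns;
	e->cursor = (e->cursor + 1) & (RTT_CACHE_SIZE - 1);
	if (e->usage < RTT_CACHE_SIZE)
		e->usage++;

	/* Average over however many samples are actually held. */
	e->avg = e->sum / e->usage;
}

int main(void)
{
	struct rtt_estimator e = { { 0 } };

	rtt_add_sample(&e, 1000000);	/* 1 ms */
	rtt_add_sample(&e, 3000000);	/* 3 ms: avg is now 2 ms */
	return 0;
}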
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 538e9831c699..941b724d523b 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -16,12 +16,14 @@
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <linux/udp.h> 17#include <linux/udp.h>
18#include <linux/in.h> 18#include <linux/in.h>
19#include <linux/in6.h>
19#include <linux/slab.h> 20#include <linux/slab.h>
20#include <linux/hashtable.h> 21#include <linux/hashtable.h>
21#include <net/sock.h> 22#include <net/sock.h>
22#include <net/af_rxrpc.h> 23#include <net/af_rxrpc.h>
23#include <net/ip.h> 24#include <net/ip.h>
24#include <net/route.h> 25#include <net/route.h>
26#include <net/ip6_route.h>
25#include "ar-internal.h" 27#include "ar-internal.h"
26 28
27static DEFINE_HASHTABLE(rxrpc_peer_hash, 10); 29static DEFINE_HASHTABLE(rxrpc_peer_hash, 10);
@@ -50,6 +52,13 @@ static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local,
50 size = sizeof(srx->transport.sin.sin_addr); 52 size = sizeof(srx->transport.sin.sin_addr);
51 p = (u16 *)&srx->transport.sin.sin_addr; 53 p = (u16 *)&srx->transport.sin.sin_addr;
52 break; 54 break;
55#ifdef CONFIG_AF_RXRPC_IPV6
56 case AF_INET6:
57 hash_key += (u16 __force)srx->transport.sin.sin_port;
58 size = sizeof(srx->transport.sin6.sin6_addr);
59 p = (u16 *)&srx->transport.sin6.sin6_addr;
60 break;
61#endif
53 default: 62 default:
54 WARN(1, "AF_RXRPC: Unsupported transport address family\n"); 63 WARN(1, "AF_RXRPC: Unsupported transport address family\n");
55 return 0; 64 return 0;
@@ -93,6 +102,14 @@ static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer,
93 memcmp(&peer->srx.transport.sin.sin_addr, 102 memcmp(&peer->srx.transport.sin.sin_addr,
94 &srx->transport.sin.sin_addr, 103 &srx->transport.sin.sin_addr,
95 sizeof(struct in_addr)); 104 sizeof(struct in_addr));
105#ifdef CONFIG_AF_RXRPC_IPV6
106 case AF_INET6:
107 return ((u16 __force)peer->srx.transport.sin6.sin6_port -
108 (u16 __force)srx->transport.sin6.sin6_port) ?:
109 memcmp(&peer->srx.transport.sin6.sin6_addr,
110 &srx->transport.sin6.sin6_addr,
111 sizeof(struct in6_addr));
112#endif
96 default: 113 default:
97 BUG(); 114 BUG();
98 } 115 }
@@ -130,17 +147,7 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local,
130 147
131 peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); 148 peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
132 if (peer) { 149 if (peer) {
133 switch (srx->transport.family) { 150 _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport);
134 case AF_INET:
135 _net("PEER %d {%d,%u,%pI4+%hu}",
136 peer->debug_id,
137 peer->srx.transport_type,
138 peer->srx.transport.family,
139 &peer->srx.transport.sin.sin_addr,
140 ntohs(peer->srx.transport.sin.sin_port));
141 break;
142 }
143
144 _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); 151 _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage));
145 } 152 }
146 return peer; 153 return peer;
@@ -152,22 +159,53 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local,
152 */ 159 */
153static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) 160static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
154{ 161{
162 struct dst_entry *dst;
155 struct rtable *rt; 163 struct rtable *rt;
156 struct flowi4 fl4; 164 struct flowi fl;
165 struct flowi4 *fl4 = &fl.u.ip4;
166#ifdef CONFIG_AF_RXRPC_IPV6
167 struct flowi6 *fl6 = &fl.u.ip6;
168#endif
157 169
158 peer->if_mtu = 1500; 170 peer->if_mtu = 1500;
159 171
160 rt = ip_route_output_ports(&init_net, &fl4, NULL, 172 memset(&fl, 0, sizeof(fl));
161 peer->srx.transport.sin.sin_addr.s_addr, 0, 173 switch (peer->srx.transport.family) {
162 htons(7000), htons(7001), 174 case AF_INET:
163 IPPROTO_UDP, 0, 0); 175 rt = ip_route_output_ports(
164 if (IS_ERR(rt)) { 176 &init_net, fl4, NULL,
165 _leave(" [route err %ld]", PTR_ERR(rt)); 177 peer->srx.transport.sin.sin_addr.s_addr, 0,
166 return; 178 htons(7000), htons(7001), IPPROTO_UDP, 0, 0);
179 if (IS_ERR(rt)) {
180 _leave(" [route err %ld]", PTR_ERR(rt));
181 return;
182 }
183 dst = &rt->dst;
184 break;
185
186#ifdef CONFIG_AF_RXRPC_IPV6
187 case AF_INET6:
188 fl6->flowi6_iif = LOOPBACK_IFINDEX;
189 fl6->flowi6_scope = RT_SCOPE_UNIVERSE;
190 fl6->flowi6_proto = IPPROTO_UDP;
191 memcpy(&fl6->daddr, &peer->srx.transport.sin6.sin6_addr,
192 sizeof(struct in6_addr));
193 fl6->fl6_dport = htons(7001);
194 fl6->fl6_sport = htons(7000);
195 dst = ip6_route_output(&init_net, NULL, fl6);
196 if (IS_ERR(dst)) {
197 _leave(" [route err %ld]", PTR_ERR(dst));
198 return;
199 }
200 break;
201#endif
202
203 default:
204 BUG();
167 } 205 }
168 206
169 peer->if_mtu = dst_mtu(&rt->dst); 207 peer->if_mtu = dst_mtu(dst);
170 dst_release(&rt->dst); 208 dst_release(dst);
171 209
172 _leave(" [if_mtu %u]", peer->if_mtu); 210 _leave(" [if_mtu %u]", peer->if_mtu);
173} 211}
@@ -199,6 +237,41 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
199} 237}
200 238
201/* 239/*
240 * Initialise peer record.
241 */
242static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key)
243{
244 peer->hash_key = hash_key;
245 rxrpc_assess_MTU_size(peer);
246 peer->mtu = peer->if_mtu;
247 peer->rtt_last_req = ktime_get_real();
248
249 switch (peer->srx.transport.family) {
250 case AF_INET:
251 peer->hdrsize = sizeof(struct iphdr);
252 break;
253#ifdef CONFIG_AF_RXRPC_IPV6
254 case AF_INET6:
255 peer->hdrsize = sizeof(struct ipv6hdr);
256 break;
257#endif
258 default:
259 BUG();
260 }
261
262 switch (peer->srx.transport_type) {
263 case SOCK_DGRAM:
264 peer->hdrsize += sizeof(struct udphdr);
265 break;
266 default:
267 BUG();
268 }
269
270 peer->hdrsize += sizeof(struct rxrpc_wire_header);
271 peer->maxdata = peer->mtu - peer->hdrsize;
272}
273
274/*
202 * Set up a new peer. 275 * Set up a new peer.
203 */ 276 */
204static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, 277static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
@@ -212,31 +285,40 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
212 285
213 peer = rxrpc_alloc_peer(local, gfp); 286 peer = rxrpc_alloc_peer(local, gfp);
214 if (peer) { 287 if (peer) {
215 peer->hash_key = hash_key;
216 memcpy(&peer->srx, srx, sizeof(*srx)); 288 memcpy(&peer->srx, srx, sizeof(*srx));
289 rxrpc_init_peer(peer, hash_key);
290 }
217 291
218 rxrpc_assess_MTU_size(peer); 292 _leave(" = %p", peer);
219 peer->mtu = peer->if_mtu; 293 return peer;
220 294}
221 if (srx->transport.family == AF_INET) { 295
222 peer->hdrsize = sizeof(struct iphdr); 296/*
223 switch (srx->transport_type) { 297 * Set up a new incoming peer. The address is prestored in the preallocated
224 case SOCK_DGRAM: 298 * peer.
225 peer->hdrsize += sizeof(struct udphdr); 299 */
226 break; 300struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local,
227 default: 301 struct rxrpc_peer *prealloc)
228 BUG(); 302{
229 break; 303 struct rxrpc_peer *peer;
230 } 304 unsigned long hash_key;
231 } else { 305
232 BUG(); 306 hash_key = rxrpc_peer_hash_key(local, &prealloc->srx);
233 } 307 prealloc->local = local;
308 rxrpc_init_peer(prealloc, hash_key);
234 309
235 peer->hdrsize += sizeof(struct rxrpc_wire_header); 310 spin_lock(&rxrpc_peer_hash_lock);
236 peer->maxdata = peer->mtu - peer->hdrsize; 311
312 /* Need to check that we aren't racing with someone else */
313 peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key);
314 if (peer && !rxrpc_get_peer_maybe(peer))
315 peer = NULL;
316 if (!peer) {
317 peer = prealloc;
318 hash_add_rcu(rxrpc_peer_hash, &peer->hash_link, hash_key);
237 } 319 }
238 320
239 _leave(" = %p", peer); 321 spin_unlock(&rxrpc_peer_hash_lock);
240 return peer; 322 return peer;
241} 323}
242 324
@@ -249,11 +331,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
249 struct rxrpc_peer *peer, *candidate; 331 struct rxrpc_peer *peer, *candidate;
250 unsigned long hash_key = rxrpc_peer_hash_key(local, srx); 332 unsigned long hash_key = rxrpc_peer_hash_key(local, srx);
251 333
252 _enter("{%d,%d,%pI4+%hu}", 334 _enter("{%pISp}", &srx->transport);
253 srx->transport_type,
254 srx->transport_len,
255 &srx->transport.sin.sin_addr,
256 ntohs(srx->transport.sin.sin_port));
257 335
258 /* search the peer list first */ 336 /* search the peer list first */
259 rcu_read_lock(); 337 rcu_read_lock();
@@ -272,7 +350,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
272 return NULL; 350 return NULL;
273 } 351 }
274 352
275 spin_lock(&rxrpc_peer_hash_lock); 353 spin_lock_bh(&rxrpc_peer_hash_lock);
276 354
277 /* Need to check that we aren't racing with someone else */ 355 /* Need to check that we aren't racing with someone else */
278 peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); 356 peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
@@ -282,7 +360,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
282 hash_add_rcu(rxrpc_peer_hash, 360 hash_add_rcu(rxrpc_peer_hash,
283 &candidate->hash_link, hash_key); 361 &candidate->hash_link, hash_key);
284 362
285 spin_unlock(&rxrpc_peer_hash_lock); 363 spin_unlock_bh(&rxrpc_peer_hash_lock);
286 364
287 if (peer) 365 if (peer)
288 kfree(candidate); 366 kfree(candidate);
@@ -290,11 +368,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
290 peer = candidate; 368 peer = candidate;
291 } 369 }
292 370
293 _net("PEER %d {%d,%pI4+%hu}", 371 _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport);
294 peer->debug_id,
295 peer->srx.transport_type,
296 &peer->srx.transport.sin.sin_addr,
297 ntohs(peer->srx.transport.sin.sin_port));
298 372
299 _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); 373 _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage));
300 return peer; 374 return peer;
@@ -307,9 +381,24 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer)
307{ 381{
308 ASSERT(hlist_empty(&peer->error_targets)); 382 ASSERT(hlist_empty(&peer->error_targets));
309 383
310 spin_lock(&rxrpc_peer_hash_lock); 384 spin_lock_bh(&rxrpc_peer_hash_lock);
311 hash_del_rcu(&peer->hash_link); 385 hash_del_rcu(&peer->hash_link);
312 spin_unlock(&rxrpc_peer_hash_lock); 386 spin_unlock_bh(&rxrpc_peer_hash_lock);
313 387
314 kfree_rcu(peer, rcu); 388 kfree_rcu(peer, rcu);
315} 389}
390
391/**
392 * rxrpc_kernel_get_peer - Get the peer address of a call
393 * @sock: The socket on which the call is in progress.
394 * @call: The call to query
395 * @_srx: Where to place the result
396 *
397 * Get the address of the remote peer in a call.
398 */
399void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call,
400 struct sockaddr_rxrpc *_srx)
401{
402 *_srx = call->peer->srx;
403}
404EXPORT_SYMBOL(rxrpc_kernel_get_peer);
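rxrpc_init_peer() above derives the peer's maxdata, the largest DATA payload that fits in one packet, by subtracting the IP, UDP and rxrpc wire headers from the interface MTU. A sketch with the header sizes written out as plain constants (20/40, 8 and 28 bytes are used here purely for illustration):

#include <stdio.h>

enum family { FAM_INET, FAM_INET6 };

/* Usable payload per packet = MTU minus the per-family header overhead. */
static unsigned int rx_maxdata(enum family fam, unsigned int mtu)
{
	unsigned int hdrsize;

	hdrsize  = (fam == FAM_INET6) ? 40 : 20;	/* IPv6 vs IPv4 header */
	hdrsize += 8;					/* UDP header */
	hdrsize += 28;					/* rxrpc wire header */
	return mtu - hdrsize;
}

int main(void)
{
	printf("IPv4 maxdata at MTU 1500: %u\n", rx_maxdata(FAM_INET, 1500));
	printf("IPv6 maxdata at MTU 1500: %u\n", rx_maxdata(FAM_INET6, 1500));
	return 0;
}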
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index ced5f07444e5..65cd980767fa 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -17,12 +17,12 @@
17static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { 17static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = {
18 [RXRPC_CONN_UNUSED] = "Unused ", 18 [RXRPC_CONN_UNUSED] = "Unused ",
19 [RXRPC_CONN_CLIENT] = "Client ", 19 [RXRPC_CONN_CLIENT] = "Client ",
20 [RXRPC_CONN_SERVICE_PREALLOC] = "SvPrealc",
20 [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ", 21 [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ",
21 [RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ", 22 [RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ",
22 [RXRPC_CONN_SERVICE] = "SvSecure", 23 [RXRPC_CONN_SERVICE] = "SvSecure",
23 [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort", 24 [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort",
24 [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort", 25 [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort",
25 [RXRPC_CONN_NETWORK_ERROR] = "NetError",
26}; 26};
27 27
28/* 28/*
@@ -30,6 +30,7 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = {
30 */ 30 */
31static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) 31static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos)
32{ 32{
33 rcu_read_lock();
33 read_lock(&rxrpc_call_lock); 34 read_lock(&rxrpc_call_lock);
34 return seq_list_start_head(&rxrpc_calls, *_pos); 35 return seq_list_start_head(&rxrpc_calls, *_pos);
35} 36}
@@ -42,17 +43,21 @@ static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos)
42static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) 43static void rxrpc_call_seq_stop(struct seq_file *seq, void *v)
43{ 44{
44 read_unlock(&rxrpc_call_lock); 45 read_unlock(&rxrpc_call_lock);
46 rcu_read_unlock();
45} 47}
46 48
47static int rxrpc_call_seq_show(struct seq_file *seq, void *v) 49static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
48{ 50{
49 struct rxrpc_connection *conn; 51 struct rxrpc_local *local;
52 struct rxrpc_sock *rx;
53 struct rxrpc_peer *peer;
50 struct rxrpc_call *call; 54 struct rxrpc_call *call;
51 char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; 55 char lbuff[50], rbuff[50];
52 56
53 if (v == &rxrpc_calls) { 57 if (v == &rxrpc_calls) {
54 seq_puts(seq, 58 seq_puts(seq,
55 "Proto Local Remote " 59 "Proto Local "
60 " Remote "
56 " SvID ConnID CallID End Use State Abort " 61 " SvID ConnID CallID End Use State Abort "
57 " UserID\n"); 62 " UserID\n");
58 return 0; 63 return 0;
@@ -60,30 +65,35 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
60 65
61 call = list_entry(v, struct rxrpc_call, link); 66 call = list_entry(v, struct rxrpc_call, link);
62 67
63 sprintf(lbuff, "%pI4:%u", 68 rx = rcu_dereference(call->socket);
64 &call->local->srx.transport.sin.sin_addr, 69 if (rx) {
65 ntohs(call->local->srx.transport.sin.sin_port)); 70 local = READ_ONCE(rx->local);
71 if (local)
72 sprintf(lbuff, "%pISpc", &local->srx.transport);
73 else
74 strcpy(lbuff, "no_local");
75 } else {
76 strcpy(lbuff, "no_socket");
77 }
66 78
67 conn = call->conn; 79 peer = call->peer;
68 if (conn) 80 if (peer)
69 sprintf(rbuff, "%pI4:%u", 81 sprintf(rbuff, "%pISpc", &peer->srx.transport);
70 &conn->params.peer->srx.transport.sin.sin_addr,
71 ntohs(conn->params.peer->srx.transport.sin.sin_port));
72 else 82 else
73 strcpy(rbuff, "no_connection"); 83 strcpy(rbuff, "no_connection");
74 84
75 seq_printf(seq, 85 seq_printf(seq,
76 "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" 86 "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
77 " %-8.8s %08x %lx\n", 87 " %-8.8s %08x %lx\n",
78 lbuff, 88 lbuff,
79 rbuff, 89 rbuff,
80 call->service_id, 90 call->service_id,
81 call->cid, 91 call->cid,
82 call->call_id, 92 call->call_id,
83 call->in_clientflag ? "Svc" : "Clt", 93 rxrpc_is_service_call(call) ? "Svc" : "Clt",
84 atomic_read(&call->usage), 94 atomic_read(&call->usage),
85 rxrpc_call_states[call->state], 95 rxrpc_call_states[call->state],
86 call->remote_abort ?: call->local_abort, 96 call->abort_code,
87 call->user_call_ID); 97 call->user_call_ID);
88 98
89 return 0; 99 return 0;
@@ -115,13 +125,13 @@ const struct file_operations rxrpc_call_seq_fops = {
115static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos) 125static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos)
116{ 126{
117 read_lock(&rxrpc_connection_lock); 127 read_lock(&rxrpc_connection_lock);
118 return seq_list_start_head(&rxrpc_connections, *_pos); 128 return seq_list_start_head(&rxrpc_connection_proc_list, *_pos);
119} 129}
120 130
121static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v, 131static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v,
122 loff_t *pos) 132 loff_t *pos)
123{ 133{
124 return seq_list_next(v, &rxrpc_connections, pos); 134 return seq_list_next(v, &rxrpc_connection_proc_list, pos);
125} 135}
126 136
127static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) 137static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v)
@@ -132,29 +142,31 @@ static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v)
132static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) 142static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
133{ 143{
134 struct rxrpc_connection *conn; 144 struct rxrpc_connection *conn;
135 char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; 145 char lbuff[50], rbuff[50];
136 146
137 if (v == &rxrpc_connections) { 147 if (v == &rxrpc_connection_proc_list) {
138 seq_puts(seq, 148 seq_puts(seq,
139 "Proto Local Remote " 149 "Proto Local "
150 " Remote "
140 " SvID ConnID End Use State Key " 151 " SvID ConnID End Use State Key "
141 " Serial ISerial\n" 152 " Serial ISerial\n"
142 ); 153 );
143 return 0; 154 return 0;
144 } 155 }
145 156
146 conn = list_entry(v, struct rxrpc_connection, link); 157 conn = list_entry(v, struct rxrpc_connection, proc_link);
147 158 if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) {
148 sprintf(lbuff, "%pI4:%u", 159 strcpy(lbuff, "no_local");
149 &conn->params.local->srx.transport.sin.sin_addr, 160 strcpy(rbuff, "no_connection");
150 ntohs(conn->params.local->srx.transport.sin.sin_port)); 161 goto print;
162 }
151 163
152 sprintf(rbuff, "%pI4:%u", 164 sprintf(lbuff, "%pISpc", &conn->params.local->srx.transport);
153 &conn->params.peer->srx.transport.sin.sin_addr,
154 ntohs(conn->params.peer->srx.transport.sin.sin_port));
155 165
166 sprintf(rbuff, "%pISpc", &conn->params.peer->srx.transport);
167print:
156 seq_printf(seq, 168 seq_printf(seq,
157 "UDP %-22.22s %-22.22s %4x %08x %s %3u" 169 "UDP %-47.47s %-47.47s %4x %08x %s %3u"
158 " %s %08x %08x %08x\n", 170 " %s %08x %08x %08x\n",
159 lbuff, 171 lbuff,
160 rbuff, 172 rbuff,
@@ -165,7 +177,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
165 rxrpc_conn_states[conn->state], 177 rxrpc_conn_states[conn->state],
166 key_serial(conn->params.key), 178 key_serial(conn->params.key),
167 atomic_read(&conn->serial), 179 atomic_read(&conn->serial),
168 atomic_read(&conn->hi_serial)); 180 conn->hi_serial);
169 181
170 return 0; 182 return 0;
171} 183}
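The /proc formatting above replaces hand-rolled "%pI4:%u" output with the "%pISpc" specifier, which prints a struct sockaddr of either family with the port attached, and widens the address columns to 47 characters so IPv6 endpoints fit. A userspace approximation of the same formatting, using inet_ntop() in place of the kernel's printk extension (illustration only):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

/* Render an IPv4 or IPv6 endpoint, with port, into one buffer. */
static void format_endpoint(const struct sockaddr *sa, char *buf, size_t len)
{
	char addr[INET6_ADDRSTRLEN];

	if (sa->sa_family == AF_INET) {
		const struct sockaddr_in *v4 = (const void *)sa;

		inet_ntop(AF_INET, &v4->sin_addr, addr, sizeof(addr));
		snprintf(buf, len, "%s:%u", addr, ntohs(v4->sin_port));
	} else {
		const struct sockaddr_in6 *v6 = (const void *)sa;

		inet_ntop(AF_INET6, &v6->sin6_addr, addr, sizeof(addr));
		snprintf(buf, len, "[%s]:%u", addr, ntohs(v6->sin6_port));
	}
}

int main(void)
{
	struct sockaddr_in v4 = { .sin_family = AF_INET, .sin_port = htons(7001) };
	char line[64];

	inet_pton(AF_INET, "192.168.0.1", &v4.sin_addr);
	format_endpoint((const struct sockaddr *)&v4, line, sizeof(line));
	printf("UDP   %-47.47s ...\n", line);	/* matches the widened column */
	return 0;
}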
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 9ed66d533002..c29362d50a92 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -19,399 +19,649 @@
19#include "ar-internal.h" 19#include "ar-internal.h"
20 20
21/* 21/*
22 * removal a call's user ID from the socket tree to make the user ID available 22 * Post a call for attention by the socket or kernel service. Further
23 * again and so that it won't be seen again in association with that call 23 * notifications are suppressed by putting recvmsg_link on a dummy queue.
24 */ 24 */
25void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call) 25void rxrpc_notify_socket(struct rxrpc_call *call)
26{ 26{
27 _debug("RELEASE CALL %d", call->debug_id); 27 struct rxrpc_sock *rx;
28 struct sock *sk;
29
30 _enter("%d", call->debug_id);
31
32 if (!list_empty(&call->recvmsg_link))
33 return;
34
35 rcu_read_lock();
36
37 rx = rcu_dereference(call->socket);
38 sk = &rx->sk;
39 if (rx && sk->sk_state < RXRPC_CLOSE) {
40 if (call->notify_rx) {
41 call->notify_rx(sk, call, call->user_call_ID);
42 } else {
43 write_lock_bh(&rx->recvmsg_lock);
44 if (list_empty(&call->recvmsg_link)) {
45 rxrpc_get_call(call, rxrpc_call_got);
46 list_add_tail(&call->recvmsg_link, &rx->recvmsg_q);
47 }
48 write_unlock_bh(&rx->recvmsg_lock);
28 49
29 if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { 50 if (!sock_flag(sk, SOCK_DEAD)) {
30 write_lock_bh(&rx->call_lock); 51 _debug("call %ps", sk->sk_data_ready);
31 rb_erase(&call->sock_node, &call->socket->calls); 52 sk->sk_data_ready(sk);
32 clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); 53 }
33 write_unlock_bh(&rx->call_lock); 54 }
34 } 55 }
35 56
36 read_lock_bh(&call->state_lock); 57 rcu_read_unlock();
37 if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && 58 _leave("");
38 !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
39 rxrpc_queue_call(call);
40 read_unlock_bh(&call->state_lock);
41} 59}
42 60
43/* 61/*
44 * receive a message from an RxRPC socket 62 * Pass a call terminating message to userspace.
45 * - we need to be careful about two or more threads calling recvmsg
46 * simultaneously
47 */ 63 */
48int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, 64static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg)
49 int flags)
50{ 65{
51 struct rxrpc_skb_priv *sp; 66 u32 tmp = 0;
52 struct rxrpc_call *call = NULL, *continue_call = NULL; 67 int ret;
53 struct rxrpc_sock *rx = rxrpc_sk(sock->sk); 68
54 struct sk_buff *skb; 69 switch (call->completion) {
55 long timeo; 70 case RXRPC_CALL_SUCCEEDED:
56 int copy, ret, ullen, offset, copied = 0; 71 ret = 0;
57 u32 abort_code; 72 if (rxrpc_is_service_call(call))
73 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &tmp);
74 break;
75 case RXRPC_CALL_REMOTELY_ABORTED:
76 tmp = call->abort_code;
77 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp);
78 break;
79 case RXRPC_CALL_LOCALLY_ABORTED:
80 tmp = call->abort_code;
81 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp);
82 break;
83 case RXRPC_CALL_NETWORK_ERROR:
84 tmp = call->error;
85 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &tmp);
86 break;
87 case RXRPC_CALL_LOCAL_ERROR:
88 tmp = call->error;
89 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp);
90 break;
91 default:
92 pr_err("Invalid terminal call state %u\n", call->state);
93 BUG();
94 break;
95 }
58 96
59 DEFINE_WAIT(wait); 97 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack,
98 call->rx_pkt_offset, call->rx_pkt_len, ret);
99 return ret;
100}
60 101
61 _enter(",,,%zu,%d", len, flags); 102/*
103 * Pass back notification of a new call. The call is added to the
104 * to-be-accepted list. This means that the next call to be accepted might not
105 * be the last call seen awaiting acceptance, but unless we leave this on the
106 * front of the queue and block all other messages until someone gives us a
107 * user_ID for it, there's not a lot we can do.
108 */
109static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx,
110 struct rxrpc_call *call,
111 struct msghdr *msg, int flags)
112{
113 int tmp = 0, ret;
62 114
63 if (flags & (MSG_OOB | MSG_TRUNC)) 115 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &tmp);
64 return -EOPNOTSUPP;
65 116
66 ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long); 117 if (ret == 0 && !(flags & MSG_PEEK)) {
118 _debug("to be accepted");
119 write_lock_bh(&rx->recvmsg_lock);
120 list_del_init(&call->recvmsg_link);
121 write_unlock_bh(&rx->recvmsg_lock);
67 122
68 timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); 123 rxrpc_get_call(call, rxrpc_call_got);
69 msg->msg_flags |= MSG_MORE; 124 write_lock(&rx->call_lock);
125 list_add_tail(&call->accept_link, &rx->to_be_accepted);
126 write_unlock(&rx->call_lock);
127 }
70 128
71 lock_sock(&rx->sk); 129 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_to_be_accepted, 1, 0, 0, ret);
130 return ret;
131}
72 132
73 for (;;) { 133/*
74 /* return immediately if a client socket has no outstanding 134 * End the packet reception phase.
75 * calls */ 135 */
76 if (RB_EMPTY_ROOT(&rx->calls)) { 136static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
77 if (copied) 137{
78 goto out; 138 _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]);
79 if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
80 release_sock(&rx->sk);
81 if (continue_call)
82 rxrpc_put_call(continue_call);
83 return -ENODATA;
84 }
85 }
86 139
87 /* get the next message on the Rx queue */ 140 trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top);
88 skb = skb_peek(&rx->sk.sk_receive_queue); 141 ASSERTCMP(call->rx_hard_ack, ==, call->rx_top);
89 if (!skb) {
90 /* nothing remains on the queue */
91 if (copied &&
92 (flags & MSG_PEEK || timeo == 0))
93 goto out;
94 142
95 /* wait for a message to turn up */ 143 if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
96 release_sock(&rx->sk); 144 rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false,
97 prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, 145 rxrpc_propose_ack_terminal_ack);
98 TASK_INTERRUPTIBLE); 146 rxrpc_send_ack_packet(call, false);
99 ret = sock_error(&rx->sk); 147 }
100 if (ret)
101 goto wait_error;
102
103 if (skb_queue_empty(&rx->sk.sk_receive_queue)) {
104 if (signal_pending(current))
105 goto wait_interrupted;
106 timeo = schedule_timeout(timeo);
107 }
108 finish_wait(sk_sleep(&rx->sk), &wait);
109 lock_sock(&rx->sk);
110 continue;
111 }
112 148
113 peek_next_packet: 149 write_lock_bh(&call->state_lock);
114 sp = rxrpc_skb(skb);
115 call = sp->call;
116 ASSERT(call != NULL);
117 150
118 _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]); 151 switch (call->state) {
152 case RXRPC_CALL_CLIENT_RECV_REPLY:
153 __rxrpc_call_completed(call);
154 write_unlock_bh(&call->state_lock);
155 break;
119 156
120 /* make sure we wait for the state to be updated in this call */ 157 case RXRPC_CALL_SERVER_RECV_REQUEST:
121 spin_lock_bh(&call->lock); 158 call->tx_phase = true;
122 spin_unlock_bh(&call->lock); 159 call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
160 call->ack_at = call->expire_at;
161 write_unlock_bh(&call->state_lock);
162 rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, false, true,
163 rxrpc_propose_ack_processing_op);
164 break;
165 default:
166 write_unlock_bh(&call->state_lock);
167 break;
168 }
169}
170
171/*
172 * Discard a packet we've used up and advance the Rx window by one.
173 */
174static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
175{
176 struct rxrpc_skb_priv *sp;
177 struct sk_buff *skb;
178 rxrpc_serial_t serial;
179 rxrpc_seq_t hard_ack, top;
180 u8 flags;
181 int ix;
182
183 _enter("%d", call->debug_id);
184
185 hard_ack = call->rx_hard_ack;
186 top = smp_load_acquire(&call->rx_top);
187 ASSERT(before(hard_ack, top));
188
189 hard_ack++;
190 ix = hard_ack & RXRPC_RXTX_BUFF_MASK;
191 skb = call->rxtx_buffer[ix];
192 rxrpc_see_skb(skb, rxrpc_skb_rx_rotated);
193 sp = rxrpc_skb(skb);
194 flags = sp->hdr.flags;
195 serial = sp->hdr.serial;
196 if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO)
197 serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1;
198
199 call->rxtx_buffer[ix] = NULL;
200 call->rxtx_annotations[ix] = 0;
201 /* Barrier against rxrpc_input_data(). */
202 smp_store_release(&call->rx_hard_ack, hard_ack);
203
204 rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
205
206 _debug("%u,%u,%02x", hard_ack, top, flags);
207 trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack);
208 if (flags & RXRPC_LAST_PACKET) {
209 rxrpc_end_rx_phase(call, serial);
210 } else {
211 /* Check to see if there's an ACK that needs sending. */
212 if (after_eq(hard_ack, call->ackr_consumed + 2) ||
213 after_eq(top, call->ackr_seen + 2) ||
214 (hard_ack == top && after(hard_ack, call->ackr_consumed)))
215 rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial,
216 true, false,
217 rxrpc_propose_ack_rotate_rx);
218 if (call->ackr_reason)
219 rxrpc_send_ack_packet(call, false);
220 }
221}
123 222
124 if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { 223/*
125 _debug("packet from released call"); 224 * Decrypt and verify a (sub)packet. The packet's length may be changed due to
126 if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) 225 * padding, but if this is the case, the packet length will be resident in the
127 BUG(); 226 * socket buffer. Note that we can't modify the master skb info as the skb may
128 rxrpc_free_skb(skb); 227 * be the home to multiple subpackets.
129 continue; 228 */
229static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
230 u8 annotation,
231 unsigned int offset, unsigned int len)
232{
233 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
234 rxrpc_seq_t seq = sp->hdr.seq;
235 u16 cksum = sp->hdr.cksum;
236
237 _enter("");
238
239 /* For all but the head jumbo subpacket, the security checksum is in a
240 * jumbo header immediately prior to the data.
241 */
242 if ((annotation & RXRPC_RX_ANNO_JUMBO) > 1) {
243 __be16 tmp;
244 if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0)
245 BUG();
246 cksum = ntohs(tmp);
247 seq += (annotation & RXRPC_RX_ANNO_JUMBO) - 1;
248 }
249
250 return call->conn->security->verify_packet(call, skb, offset, len,
251 seq, cksum);
252}
253
254/*
255 * Locate the data within a packet. This is complicated by:
256 *
257 * (1) An skb may contain a jumbo packet - so we have to find the appropriate
258 * subpacket.
259 *
260 * (2) The (sub)packets may be encrypted and, if so, the encrypted portion
261 * contains an extra header which includes the true length of the data,
262 * excluding any encrypted padding.
263 */
264static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
265 u8 *_annotation,
266 unsigned int *_offset, unsigned int *_len)
267{
268 unsigned int offset = sizeof(struct rxrpc_wire_header);
269 unsigned int len = *_len;
270 int ret;
271 u8 annotation = *_annotation;
272
273 /* Locate the subpacket */
274 len = skb->len - offset;
275 if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) {
276 offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) *
277 RXRPC_JUMBO_SUBPKTLEN);
278 len = (annotation & RXRPC_RX_ANNO_JLAST) ?
279 skb->len - offset : RXRPC_JUMBO_SUBPKTLEN;
280 }
281
282 if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) {
283 ret = rxrpc_verify_packet(call, skb, annotation, offset, len);
284 if (ret < 0)
285 return ret;
286 *_annotation |= RXRPC_RX_ANNO_VERIFIED;
287 }
288
289 *_offset = offset;
290 *_len = len;
291 call->conn->security->locate_data(call, skb, _offset, _len);
292 return 0;
293}
294
295/*
296 * Deliver messages to a call. This keeps processing packets until the buffer
297 * is filled and we find either more DATA (returns 0) or the end of the DATA
298 * (returns 1). If more packets are required, it returns -EAGAIN.
299 */
300static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
301 struct msghdr *msg, struct iov_iter *iter,
302 size_t len, int flags, size_t *_offset)
303{
304 struct rxrpc_skb_priv *sp;
305 struct sk_buff *skb;
306 rxrpc_seq_t hard_ack, top, seq;
307 size_t remain;
308 bool last;
309 unsigned int rx_pkt_offset, rx_pkt_len;
310 int ix, copy, ret = -EAGAIN, ret2;
311
312 rx_pkt_offset = call->rx_pkt_offset;
313 rx_pkt_len = call->rx_pkt_len;
314
315 if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) {
316 seq = call->rx_hard_ack;
317 ret = 1;
318 goto done;
319 }
320
321 /* Barriers against rxrpc_input_data(). */
322 hard_ack = call->rx_hard_ack;
323 top = smp_load_acquire(&call->rx_top);
324 for (seq = hard_ack + 1; before_eq(seq, top); seq++) {
325 ix = seq & RXRPC_RXTX_BUFF_MASK;
326 skb = call->rxtx_buffer[ix];
327 if (!skb) {
328 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq,
329 rx_pkt_offset, rx_pkt_len, 0);
330 break;
130 } 331 }
332 smp_rmb();
333 rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
334 sp = rxrpc_skb(skb);
131 335
132 /* determine whether to continue last data receive */ 336 if (!(flags & MSG_PEEK))
133 if (continue_call) { 337 trace_rxrpc_receive(call, rxrpc_receive_front,
134 _debug("maybe cont"); 338 sp->hdr.serial, seq);
135 if (call != continue_call || 339
136 skb->mark != RXRPC_SKB_MARK_DATA) { 340 if (msg)
137 release_sock(&rx->sk); 341 sock_recv_timestamp(msg, sock->sk, skb);
138 rxrpc_put_call(continue_call); 342
139 _leave(" = %d [noncont]", copied); 343 if (rx_pkt_offset == 0) {
140 return copied; 344 ret2 = rxrpc_locate_data(call, skb,
345 &call->rxtx_annotations[ix],
346 &rx_pkt_offset, &rx_pkt_len);
347 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq,
348 rx_pkt_offset, rx_pkt_len, ret2);
349 if (ret2 < 0) {
350 ret = ret2;
351 goto out;
141 } 352 }
353 } else {
354 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq,
355 rx_pkt_offset, rx_pkt_len, 0);
142 } 356 }
143 357
144 rxrpc_get_call(call); 358 /* We have to handle short, empty and used-up DATA packets. */
145 359 remain = len - *_offset;
146 /* copy the peer address and timestamp */ 360 copy = rx_pkt_len;
147 if (!continue_call) { 361 if (copy > remain)
148 if (msg->msg_name) { 362 copy = remain;
149 size_t len = 363 if (copy > 0) {
150 sizeof(call->conn->params.peer->srx); 364 ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter,
151 memcpy(msg->msg_name, 365 copy);
152 &call->conn->params.peer->srx, len); 366 if (ret2 < 0) {
153 msg->msg_namelen = len; 367 ret = ret2;
368 goto out;
154 } 369 }
155 sock_recv_timestamp(msg, &rx->sk, skb);
156 }
157 370
158 /* receive the message */ 371 /* handle piecemeal consumption of data packets */
159 if (skb->mark != RXRPC_SKB_MARK_DATA) 372 rx_pkt_offset += copy;
160 goto receive_non_data_message; 373 rx_pkt_len -= copy;
374 *_offset += copy;
375 }
161 376
162 _debug("recvmsg DATA #%u { %d, %d }", 377 if (rx_pkt_len > 0) {
163 sp->hdr.seq, skb->len, sp->offset); 378 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq,
379 rx_pkt_offset, rx_pkt_len, 0);
380 ASSERTCMP(*_offset, ==, len);
381 ret = 0;
382 break;
383 }
164 384
165 if (!continue_call) { 385 /* The whole packet has been transferred. */
166 /* only set the control data once per recvmsg() */ 386 last = sp->hdr.flags & RXRPC_LAST_PACKET;
167 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, 387 if (!(flags & MSG_PEEK))
168 ullen, &call->user_call_ID); 388 rxrpc_rotate_rx_window(call);
169 if (ret < 0) 389 rx_pkt_offset = 0;
170 goto copy_error; 390 rx_pkt_len = 0;
171 ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); 391
392 if (last) {
393 ASSERTCMP(seq, ==, READ_ONCE(call->rx_top));
394 ret = 1;
395 goto out;
172 } 396 }
397 }
173 398
174 ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); 399out:
175 ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); 400 if (!(flags & MSG_PEEK)) {
176 call->rx_data_recv = sp->hdr.seq; 401 call->rx_pkt_offset = rx_pkt_offset;
402 call->rx_pkt_len = rx_pkt_len;
403 }
404done:
405 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq,
406 rx_pkt_offset, rx_pkt_len, ret);
407 return ret;
408}
177 409
178 ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); 410/*
411 * Receive a message from an RxRPC socket
412 * - we need to be careful about two or more threads calling recvmsg
413 * simultaneously
414 */
415int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
416 int flags)
417{
418 struct rxrpc_call *call;
419 struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
420 struct list_head *l;
421 size_t copied = 0;
422 long timeo;
423 int ret;
179 424
180 offset = sp->offset; 425 DEFINE_WAIT(wait);
181 copy = skb->len - offset;
182 if (copy > len - copied)
183 copy = len - copied;
184 426
185 ret = skb_copy_datagram_msg(skb, offset, msg, copy); 427 trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0);
186 428
187 if (ret < 0) 429 if (flags & (MSG_OOB | MSG_TRUNC))
188 goto copy_error; 430 return -EOPNOTSUPP;
189 431
190 /* handle piecemeal consumption of data packets */ 432 timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT);
191 _debug("copied %d+%d", copy, copied);
192 433
193 offset += copy; 434try_again:
194 copied += copy; 435 lock_sock(&rx->sk);
195 436
196 if (!(flags & MSG_PEEK)) 437 /* Return immediately if a client socket has no outstanding calls */
197 sp->offset = offset; 438 if (RB_EMPTY_ROOT(&rx->calls) &&
439 list_empty(&rx->recvmsg_q) &&
440 rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
441 release_sock(&rx->sk);
442 return -ENODATA;
443 }
198 444
199 if (sp->offset < skb->len) { 445 if (list_empty(&rx->recvmsg_q)) {
200 _debug("buffer full"); 446 ret = -EWOULDBLOCK;
201 ASSERTCMP(copied, ==, len); 447 if (timeo == 0) {
202 break; 448 call = NULL;
449 goto error_no_call;
203 } 450 }
204 451
205 /* we transferred the whole data packet */ 452 release_sock(&rx->sk);
206 if (!(flags & MSG_PEEK)) 453
207 rxrpc_kernel_data_consumed(call, skb); 454 /* Wait for something to happen */
208 455 prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait,
209 if (sp->hdr.flags & RXRPC_LAST_PACKET) { 456 TASK_INTERRUPTIBLE);
210 _debug("last"); 457 ret = sock_error(&rx->sk);
211 if (rxrpc_conn_is_client(call->conn)) { 458 if (ret)
212 /* last byte of reply received */ 459 goto wait_error;
213 ret = copied; 460
214 goto terminal_message; 461 if (list_empty(&rx->recvmsg_q)) {
215 } 462 if (signal_pending(current))
216 463 goto wait_interrupted;
217 /* last bit of request received */ 464 trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait,
218 if (!(flags & MSG_PEEK)) { 465 0, 0, 0, 0);
219 _debug("eat packet"); 466 timeo = schedule_timeout(timeo);
220 if (skb_dequeue(&rx->sk.sk_receive_queue) !=
221 skb)
222 BUG();
223 rxrpc_free_skb(skb);
224 }
225 msg->msg_flags &= ~MSG_MORE;
226 break;
227 } 467 }
468 finish_wait(sk_sleep(&rx->sk), &wait);
469 goto try_again;
470 }
228 471
229 /* move on to the next data message */ 472 /* Find the next call and dequeue it if we're not just peeking. If we
230 _debug("next"); 473 * do dequeue it, that comes with a ref that we will need to release.
231 if (!continue_call) 474 */
232 continue_call = sp->call; 475 write_lock_bh(&rx->recvmsg_lock);
233 else 476 l = rx->recvmsg_q.next;
234 rxrpc_put_call(call); 477 call = list_entry(l, struct rxrpc_call, recvmsg_link);
235 call = NULL; 478 if (!(flags & MSG_PEEK))
236 479 list_del_init(&call->recvmsg_link);
237 if (flags & MSG_PEEK) { 480 else
238 _debug("peek next"); 481 rxrpc_get_call(call, rxrpc_call_got);
239 skb = skb->next; 482 write_unlock_bh(&rx->recvmsg_lock);
240 if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue) 483
241 break; 484 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0);
242 goto peek_next_packet; 485
243 } 486 if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
487 BUG();
244 488
245 _debug("eat packet"); 489 if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
246 if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) 490 if (flags & MSG_CMSG_COMPAT) {
247 BUG(); 491 unsigned int id32 = call->user_call_ID;
248 rxrpc_free_skb(skb);
249 }
250 492
251 /* end of non-terminal data packet reception for the moment */ 493 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
252 _debug("end rcv data"); 494 sizeof(unsigned int), &id32);
253out: 495 } else {
254 release_sock(&rx->sk); 496 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
255 if (call) 497 sizeof(unsigned long),
256 rxrpc_put_call(call); 498 &call->user_call_ID);
257 if (continue_call)
258 rxrpc_put_call(continue_call);
259 _leave(" = %d [data]", copied);
260 return copied;
261
262 /* handle non-DATA messages such as aborts, incoming connections and
263 * final ACKs */
264receive_non_data_message:
265 _debug("non-data");
266
267 if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) {
268 _debug("RECV NEW CALL");
269 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code);
270 if (ret < 0)
271 goto copy_error;
272 if (!(flags & MSG_PEEK)) {
273 if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
274 BUG();
275 rxrpc_free_skb(skb);
276 } 499 }
277 goto out; 500 if (ret < 0)
501 goto error;
278 } 502 }
279 503
280 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, 504 if (msg->msg_name) {
281 ullen, &call->user_call_ID); 505 size_t len = sizeof(call->conn->params.peer->srx);
282 if (ret < 0) 506 memcpy(msg->msg_name, &call->conn->params.peer->srx, len);
283 goto copy_error; 507 msg->msg_namelen = len;
284 ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); 508 }
285 509
286 switch (skb->mark) { 510 switch (call->state) {
287 case RXRPC_SKB_MARK_DATA: 511 case RXRPC_CALL_SERVER_ACCEPTING:
288 BUG(); 512 ret = rxrpc_recvmsg_new_call(rx, call, msg, flags);
289 case RXRPC_SKB_MARK_FINAL_ACK:
290 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code);
291 break;
292 case RXRPC_SKB_MARK_BUSY:
293 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code);
294 break; 513 break;
295 case RXRPC_SKB_MARK_REMOTE_ABORT: 514 case RXRPC_CALL_CLIENT_RECV_REPLY:
296 abort_code = call->remote_abort; 515 case RXRPC_CALL_SERVER_RECV_REQUEST:
297 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); 516 case RXRPC_CALL_SERVER_ACK_REQUEST:
298 break; 517 ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len,
299 case RXRPC_SKB_MARK_LOCAL_ABORT: 518 flags, &copied);
300 abort_code = call->local_abort; 519 if (ret == -EAGAIN)
301 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); 520 ret = 0;
302 break; 521
303 case RXRPC_SKB_MARK_NET_ERROR: 522 if (after(call->rx_top, call->rx_hard_ack) &&
304 _debug("RECV NET ERROR %d", sp->error); 523 call->rxtx_buffer[(call->rx_hard_ack + 1) & RXRPC_RXTX_BUFF_MASK])
305 abort_code = sp->error; 524 rxrpc_notify_socket(call);
306 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code);
307 break;
308 case RXRPC_SKB_MARK_LOCAL_ERROR:
309 _debug("RECV LOCAL ERROR %d", sp->error);
310 abort_code = sp->error;
311 ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4,
312 &abort_code);
313 break; 525 break;
314 default: 526 default:
315 pr_err("Unknown packet mark %u\n", skb->mark); 527 ret = 0;
316 BUG();
317 break; 528 break;
318 } 529 }
319 530
320 if (ret < 0) 531 if (ret < 0)
321 goto copy_error; 532 goto error;
322
323terminal_message:
324 _debug("terminal");
325 msg->msg_flags &= ~MSG_MORE;
326 msg->msg_flags |= MSG_EOR;
327 533
328 if (!(flags & MSG_PEEK)) { 534 if (call->state == RXRPC_CALL_COMPLETE) {
329 _net("free terminal skb %p", skb); 535 ret = rxrpc_recvmsg_term(call, msg);
330 if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) 536 if (ret < 0)
331 BUG(); 537 goto error;
332 rxrpc_free_skb(skb); 538 if (!(flags & MSG_PEEK))
333 rxrpc_remove_user_ID(rx, call); 539 rxrpc_release_call(rx, call);
540 msg->msg_flags |= MSG_EOR;
541 ret = 1;
334 } 542 }
335 543
336 release_sock(&rx->sk); 544 if (ret == 0)
337 rxrpc_put_call(call); 545 msg->msg_flags |= MSG_MORE;
338 if (continue_call) 546 else
339 rxrpc_put_call(continue_call); 547 msg->msg_flags &= ~MSG_MORE;
340 _leave(" = %d", ret); 548 ret = copied;
341 return ret;
342 549
343copy_error: 550error:
344 _debug("copy error"); 551 rxrpc_put_call(call, rxrpc_call_put);
552error_no_call:
345 release_sock(&rx->sk); 553 release_sock(&rx->sk);
346 rxrpc_put_call(call); 554 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
347 if (continue_call)
348 rxrpc_put_call(continue_call);
349 _leave(" = %d", ret);
350 return ret; 555 return ret;
351 556
352wait_interrupted: 557wait_interrupted:
353 ret = sock_intr_errno(timeo); 558 ret = sock_intr_errno(timeo);
354wait_error: 559wait_error:
355 finish_wait(sk_sleep(&rx->sk), &wait); 560 finish_wait(sk_sleep(&rx->sk), &wait);
356 if (continue_call) 561 call = NULL;
357 rxrpc_put_call(continue_call); 562 goto error_no_call;
358 if (copied)
359 copied = ret;
360 _leave(" = %d [waitfail %d]", copied, ret);
361 return copied;
362
363} 563}
364 564
365/** 565/**
366 * rxrpc_kernel_is_data_last - Determine if data message is last one 566 * rxrpc_kernel_recv_data - Allow a kernel service to receive data/info
367 * @skb: Message holding data 567 * @sock: The socket that the call exists on
568 * @call: The call to send data through
569 * @buf: The buffer to receive into
570 * @size: The size of the buffer, including data already read
571 * @_offset: The running offset into the buffer.
572 * @want_more: True if more data is expected to be read
573 * @_abort: Where the abort code is stored if -ECONNABORTED is returned
368 * 574 *
369 * Determine if data message is last one for the parent call. 575 * Allow a kernel service to receive data and pick up information about the
576 * state of a call. Returns 0 if we got what was asked for and there's more
577 * available, 1 if we got what was asked for and we're at the end of the data
578 * and -EAGAIN if we need more data.
579 *
580 * Note that we may return -EAGAIN to drain empty packets at the end of the
581 * data, even if we've already copied over the requested data.
582 *
583 * This function adds the amount it transfers to *_offset, so this should be
584 * precleared as appropriate. Note that the amount remaining in the buffer is
585 * taken to be size - *_offset.
586 *
587 * *_abort should also be initialised to 0.
370 */ 588 */
371bool rxrpc_kernel_is_data_last(struct sk_buff *skb) 589int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
590 void *buf, size_t size, size_t *_offset,
591 bool want_more, u32 *_abort)
372{ 592{
373 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 593 struct iov_iter iter;
594 struct kvec iov;
595 int ret;
374 596
375 ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA); 597 _enter("{%d,%s},%zu/%zu,%d",
598 call->debug_id, rxrpc_call_states[call->state],
599 *_offset, size, want_more);
376 600
377 return sp->hdr.flags & RXRPC_LAST_PACKET; 601 ASSERTCMP(*_offset, <=, size);
378} 602 ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING);
379 603
380EXPORT_SYMBOL(rxrpc_kernel_is_data_last); 604 iov.iov_base = buf + *_offset;
605 iov.iov_len = size - *_offset;
606 iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset);
381 607
382/** 608 lock_sock(sock->sk);
383 * rxrpc_kernel_get_abort_code - Get the abort code from an RxRPC abort message 609
384 * @skb: Message indicating an abort 610 switch (call->state) {
385 * 611 case RXRPC_CALL_CLIENT_RECV_REPLY:
386 * Get the abort code from an RxRPC abort message. 612 case RXRPC_CALL_SERVER_RECV_REQUEST:
387 */ 613 case RXRPC_CALL_SERVER_ACK_REQUEST:
388u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb) 614 ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0,
389{ 615 _offset);
390 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 616 if (ret < 0)
617 goto out;
618
619 /* We can only reach here with a partially full buffer if we
620 * have reached the end of the data. We must otherwise have a
621 * full buffer or have been given -EAGAIN.
622 */
623 if (ret == 1) {
624 if (*_offset < size)
625 goto short_data;
626 if (!want_more)
627 goto read_phase_complete;
628 ret = 0;
629 goto out;
630 }
631
632 if (!want_more)
633 goto excess_data;
634 goto out;
635
636 case RXRPC_CALL_COMPLETE:
637 goto call_complete;
391 638
392 switch (skb->mark) {
393 case RXRPC_SKB_MARK_REMOTE_ABORT:
394 return sp->call->remote_abort;
395 case RXRPC_SKB_MARK_LOCAL_ABORT:
396 return sp->call->local_abort;
397 default: 639 default:
398 BUG(); 640 ret = -EINPROGRESS;
641 goto out;
399 } 642 }
400}
401
402EXPORT_SYMBOL(rxrpc_kernel_get_abort_code);
403 643
404/** 644read_phase_complete:
405 * rxrpc_kernel_get_error - Get the error number from an RxRPC error message 645 ret = 1;
406 * @skb: Message indicating an error 646out:
407 * 647 release_sock(sock->sk);
408 * Get the error number from an RxRPC error message. 648 _leave(" = %d [%zu,%d]", ret, *_offset, *_abort);
409 */ 649 return ret;
410int rxrpc_kernel_get_error_number(struct sk_buff *skb)
411{
412 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
413 650
414 return sp->error; 651short_data:
652 ret = -EBADMSG;
653 goto out;
654excess_data:
655 ret = -EMSGSIZE;
656 goto out;
657call_complete:
658 *_abort = call->abort_code;
659 ret = -call->error;
660 if (call->completion == RXRPC_CALL_SUCCEEDED) {
661 ret = 1;
662 if (size > 0)
663 ret = -ECONNRESET;
664 }
665 goto out;
415} 666}
416 667EXPORT_SYMBOL(rxrpc_kernel_recv_data);
417EXPORT_SYMBOL(rxrpc_kernel_get_error_number);
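A brief usage sketch of the new rxrpc_kernel_recv_data() interface documented above; this is illustrative only and not part of the change itself, and the helper name, the error handling and the assumption that the declaration is available via <net/af_rxrpc.h> are all assumptions:

#include <linux/socket.h>
#include <net/af_rxrpc.h>

/* Hypothetical helper: pull up to reply_len bytes of reply from a call.
 * Returns 0 once the whole reply has been received, -EAGAIN if more data
 * has yet to arrive, or a negative error (for -ECONNABORTED the remote
 * abort code is left in *abort_code).
 */
static int example_kernel_read_reply(struct socket *sock,
                                     struct rxrpc_call *call,
                                     void *reply, size_t reply_len,
                                     u32 *abort_code)
{
        size_t offset = 0;      /* running offset: must start precleared */
        int ret;

        *abort_code = 0;        /* must likewise be initialised to 0 */

        /* want_more == false: reply_len bytes is all we expect */
        ret = rxrpc_kernel_recv_data(sock, call, reply, reply_len,
                                     &offset, false, abort_code);
        if (ret == 1)           /* got the lot and hit the end of the data */
                return 0;
        return ret;             /* -EAGAIN and other errors passed back as-is */
}

On -EAGAIN a caller would normally wait for rxrpc to notify the socket again rather than retrying immediately.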
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 63afa9e9cc08..4374e7b9c7bf 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -80,12 +80,10 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
80 case RXRPC_SECURITY_AUTH: 80 case RXRPC_SECURITY_AUTH:
81 conn->size_align = 8; 81 conn->size_align = 8;
82 conn->security_size = sizeof(struct rxkad_level1_hdr); 82 conn->security_size = sizeof(struct rxkad_level1_hdr);
83 conn->header_size += sizeof(struct rxkad_level1_hdr);
84 break; 83 break;
85 case RXRPC_SECURITY_ENCRYPT: 84 case RXRPC_SECURITY_ENCRYPT:
86 conn->size_align = 8; 85 conn->size_align = 8;
87 conn->security_size = sizeof(struct rxkad_level2_hdr); 86 conn->security_size = sizeof(struct rxkad_level2_hdr);
88 conn->header_size += sizeof(struct rxkad_level2_hdr);
89 break; 87 break;
90 default: 88 default:
91 ret = -EKEYREJECTED; 89 ret = -EKEYREJECTED;
@@ -161,7 +159,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
161 159
162 _enter(""); 160 _enter("");
163 161
164 check = sp->hdr.seq ^ sp->hdr.callNumber; 162 check = sp->hdr.seq ^ call->call_id;
165 data_size |= (u32)check << 16; 163 data_size |= (u32)check << 16;
166 164
167 hdr.data_size = htonl(data_size); 165 hdr.data_size = htonl(data_size);
@@ -205,7 +203,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
205 203
206 _enter(""); 204 _enter("");
207 205
208 check = sp->hdr.seq ^ sp->hdr.callNumber; 206 check = sp->hdr.seq ^ call->call_id;
209 207
210 rxkhdr.data_size = htonl(data_size | (u32)check << 16); 208 rxkhdr.data_size = htonl(data_size | (u32)check << 16);
211 rxkhdr.checksum = 0; 209 rxkhdr.checksum = 0;
@@ -275,9 +273,9 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
275 memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); 273 memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
276 274
277 /* calculate the security checksum */ 275 /* calculate the security checksum */
278 x = call->channel << (32 - RXRPC_CIDSHIFT); 276 x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
279 x |= sp->hdr.seq & 0x3fffffff; 277 x |= sp->hdr.seq & 0x3fffffff;
280 call->crypto_buf[0] = htonl(sp->hdr.callNumber); 278 call->crypto_buf[0] = htonl(call->call_id);
281 call->crypto_buf[1] = htonl(x); 279 call->crypto_buf[1] = htonl(x);
282 280
283 sg_init_one(&sg, call->crypto_buf, 8); 281 sg_init_one(&sg, call->crypto_buf, 8);
@@ -316,12 +314,11 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
316/* 314/*
317 * decrypt partial encryption on a packet (level 1 security) 315 * decrypt partial encryption on a packet (level 1 security)
318 */ 316 */
319static int rxkad_verify_packet_auth(const struct rxrpc_call *call, 317static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
320 struct sk_buff *skb, 318 unsigned int offset, unsigned int len,
321 u32 *_abort_code) 319 rxrpc_seq_t seq)
322{ 320{
323 struct rxkad_level1_hdr sechdr; 321 struct rxkad_level1_hdr sechdr;
324 struct rxrpc_skb_priv *sp;
325 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); 322 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
326 struct rxrpc_crypt iv; 323 struct rxrpc_crypt iv;
327 struct scatterlist sg[16]; 324 struct scatterlist sg[16];
@@ -332,15 +329,20 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
332 329
333 _enter(""); 330 _enter("");
334 331
335 sp = rxrpc_skb(skb); 332 if (len < 8) {
333 rxrpc_abort_call("V1H", call, seq, RXKADSEALEDINCON, EPROTO);
334 goto protocol_error;
335 }
336 336
337 /* we want to decrypt the skbuff in-place */ 337 /* Decrypt the skbuff in-place. TODO: We really want to decrypt
338 * directly into the target buffer.
339 */
338 nsg = skb_cow_data(skb, 0, &trailer); 340 nsg = skb_cow_data(skb, 0, &trailer);
339 if (nsg < 0 || nsg > 16) 341 if (nsg < 0 || nsg > 16)
340 goto nomem; 342 goto nomem;
341 343
342 sg_init_table(sg, nsg); 344 sg_init_table(sg, nsg);
343 skb_to_sgvec(skb, sg, 0, 8); 345 skb_to_sgvec(skb, sg, offset, 8);
344 346
345 /* start the decryption afresh */ 347 /* start the decryption afresh */
346 memset(&iv, 0, sizeof(iv)); 348 memset(&iv, 0, sizeof(iv));
@@ -351,35 +353,35 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
351 crypto_skcipher_decrypt(req); 353 crypto_skcipher_decrypt(req);
352 skcipher_request_zero(req); 354 skcipher_request_zero(req);
353 355
354 /* remove the decrypted packet length */ 356 /* Extract the decrypted packet length */
355 if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) 357 if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) {
356 goto datalen_error; 358 rxrpc_abort_call("XV1", call, seq, RXKADDATALEN, EPROTO);
357 if (!skb_pull(skb, sizeof(sechdr))) 359 goto protocol_error;
358 BUG(); 360 }
361 offset += sizeof(sechdr);
362 len -= sizeof(sechdr);
359 363
360 buf = ntohl(sechdr.data_size); 364 buf = ntohl(sechdr.data_size);
361 data_size = buf & 0xffff; 365 data_size = buf & 0xffff;
362 366
363 check = buf >> 16; 367 check = buf >> 16;
364 check ^= sp->hdr.seq ^ sp->hdr.callNumber; 368 check ^= seq ^ call->call_id;
365 check &= 0xffff; 369 check &= 0xffff;
366 if (check != 0) { 370 if (check != 0) {
367 *_abort_code = RXKADSEALEDINCON; 371 rxrpc_abort_call("V1C", call, seq, RXKADSEALEDINCON, EPROTO);
368 goto protocol_error; 372 goto protocol_error;
369 } 373 }
370 374
371 /* shorten the packet to remove the padding */ 375 if (data_size > len) {
372 if (data_size > skb->len) 376 rxrpc_abort_call("V1L", call, seq, RXKADDATALEN, EPROTO);
373 goto datalen_error; 377 goto protocol_error;
374 else if (data_size < skb->len) 378 }
375 skb->len = data_size;
376 379
377 _leave(" = 0 [dlen=%x]", data_size); 380 _leave(" = 0 [dlen=%x]", data_size);
378 return 0; 381 return 0;
379 382
380datalen_error:
381 *_abort_code = RXKADDATALEN;
382protocol_error: 383protocol_error:
384 rxrpc_send_abort_packet(call);
383 _leave(" = -EPROTO"); 385 _leave(" = -EPROTO");
384 return -EPROTO; 386 return -EPROTO;
385 387
@@ -391,13 +393,12 @@ nomem:
391/* 393/*
392 * wholly decrypt a packet (level 2 security) 394 * wholly decrypt a packet (level 2 security)
393 */ 395 */
394static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, 396static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
395 struct sk_buff *skb, 397 unsigned int offset, unsigned int len,
396 u32 *_abort_code) 398 rxrpc_seq_t seq)
397{ 399{
398 const struct rxrpc_key_token *token; 400 const struct rxrpc_key_token *token;
399 struct rxkad_level2_hdr sechdr; 401 struct rxkad_level2_hdr sechdr;
400 struct rxrpc_skb_priv *sp;
401 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); 402 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
402 struct rxrpc_crypt iv; 403 struct rxrpc_crypt iv;
403 struct scatterlist _sg[4], *sg; 404 struct scatterlist _sg[4], *sg;
@@ -408,9 +409,14 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
408 409
409 _enter(",{%d}", skb->len); 410 _enter(",{%d}", skb->len);
410 411
411 sp = rxrpc_skb(skb); 412 if (len < 8) {
413 rxrpc_abort_call("V2H", call, seq, RXKADSEALEDINCON, EPROTO);
414 goto protocol_error;
415 }
412 416
413 /* we want to decrypt the skbuff in-place */ 417 /* Decrypt the skbuff in-place. TODO: We really want to decrypt
418 * directly into the target buffer.
419 */
414 nsg = skb_cow_data(skb, 0, &trailer); 420 nsg = skb_cow_data(skb, 0, &trailer);
415 if (nsg < 0) 421 if (nsg < 0)
416 goto nomem; 422 goto nomem;
@@ -423,7 +429,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
423 } 429 }
424 430
425 sg_init_table(sg, nsg); 431 sg_init_table(sg, nsg);
426 skb_to_sgvec(skb, sg, 0, skb->len); 432 skb_to_sgvec(skb, sg, offset, len);
427 433
428 /* decrypt from the session key */ 434 /* decrypt from the session key */
429 token = call->conn->params.key->payload.data[0]; 435 token = call->conn->params.key->payload.data[0];
@@ -431,41 +437,41 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
431 437
432 skcipher_request_set_tfm(req, call->conn->cipher); 438 skcipher_request_set_tfm(req, call->conn->cipher);
433 skcipher_request_set_callback(req, 0, NULL, NULL); 439 skcipher_request_set_callback(req, 0, NULL, NULL);
434 skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x); 440 skcipher_request_set_crypt(req, sg, sg, len, iv.x);
435 crypto_skcipher_decrypt(req); 441 crypto_skcipher_decrypt(req);
436 skcipher_request_zero(req); 442 skcipher_request_zero(req);
437 if (sg != _sg) 443 if (sg != _sg)
438 kfree(sg); 444 kfree(sg);
439 445
440 /* remove the decrypted packet length */ 446 /* Extract the decrypted packet length */
441 if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) 447 if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) {
442 goto datalen_error; 448 rxrpc_abort_call("XV2", call, seq, RXKADDATALEN, EPROTO);
443 if (!skb_pull(skb, sizeof(sechdr))) 449 goto protocol_error;
444 BUG(); 450 }
451 offset += sizeof(sechdr);
452 len -= sizeof(sechdr);
445 453
446 buf = ntohl(sechdr.data_size); 454 buf = ntohl(sechdr.data_size);
447 data_size = buf & 0xffff; 455 data_size = buf & 0xffff;
448 456
449 check = buf >> 16; 457 check = buf >> 16;
450 check ^= sp->hdr.seq ^ sp->hdr.callNumber; 458 check ^= seq ^ call->call_id;
451 check &= 0xffff; 459 check &= 0xffff;
452 if (check != 0) { 460 if (check != 0) {
453 *_abort_code = RXKADSEALEDINCON; 461 rxrpc_abort_call("V2C", call, seq, RXKADSEALEDINCON, EPROTO);
454 goto protocol_error; 462 goto protocol_error;
455 } 463 }
456 464
457 /* shorten the packet to remove the padding */ 465 if (data_size > len) {
458 if (data_size > skb->len) 466 rxrpc_abort_call("V2L", call, seq, RXKADDATALEN, EPROTO);
459 goto datalen_error; 467 goto protocol_error;
460 else if (data_size < skb->len) 468 }
461 skb->len = data_size;
462 469
463 _leave(" = 0 [dlen=%x]", data_size); 470 _leave(" = 0 [dlen=%x]", data_size);
464 return 0; 471 return 0;
465 472
466datalen_error:
467 *_abort_code = RXKADDATALEN;
468protocol_error: 473protocol_error:
474 rxrpc_send_abort_packet(call);
469 _leave(" = -EPROTO"); 475 _leave(" = -EPROTO");
470 return -EPROTO; 476 return -EPROTO;
471 477
@@ -475,40 +481,31 @@ nomem:
475} 481}
476 482
477/* 483/*
478 * verify the security on a received packet 484 * Verify the security on a received packet or subpacket (if part of a
485 * jumbo packet).
479 */ 486 */
480static int rxkad_verify_packet(struct rxrpc_call *call, 487static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
481 struct sk_buff *skb, 488 unsigned int offset, unsigned int len,
482 u32 *_abort_code) 489 rxrpc_seq_t seq, u16 expected_cksum)
483{ 490{
484 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); 491 SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
485 struct rxrpc_skb_priv *sp;
486 struct rxrpc_crypt iv; 492 struct rxrpc_crypt iv;
487 struct scatterlist sg; 493 struct scatterlist sg;
488 u16 cksum; 494 u16 cksum;
489 u32 x, y; 495 u32 x, y;
490 int ret;
491
492 sp = rxrpc_skb(skb);
493 496
494 _enter("{%d{%x}},{#%u}", 497 _enter("{%d{%x}},{#%u}",
495 call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq); 498 call->debug_id, key_serial(call->conn->params.key), seq);
496 499
497 if (!call->conn->cipher) 500 if (!call->conn->cipher)
498 return 0; 501 return 0;
499 502
500 if (sp->hdr.securityIndex != RXRPC_SECURITY_RXKAD) {
501 *_abort_code = RXKADINCONSISTENCY;
502 _leave(" = -EPROTO [not rxkad]");
503 return -EPROTO;
504 }
505
506 /* continue encrypting from where we left off */ 503 /* continue encrypting from where we left off */
507 memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); 504 memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
508 505
509 /* validate the security checksum */ 506 /* validate the security checksum */
510 x = call->channel << (32 - RXRPC_CIDSHIFT); 507 x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
511 x |= sp->hdr.seq & 0x3fffffff; 508 x |= seq & 0x3fffffff;
512 call->crypto_buf[0] = htonl(call->call_id); 509 call->crypto_buf[0] = htonl(call->call_id);
513 call->crypto_buf[1] = htonl(x); 510 call->crypto_buf[1] = htonl(x);
514 511
@@ -524,29 +521,69 @@ static int rxkad_verify_packet(struct rxrpc_call *call,
524 if (cksum == 0) 521 if (cksum == 0)
525 cksum = 1; /* zero checksums are not permitted */ 522 cksum = 1; /* zero checksums are not permitted */
526 523
527 if (sp->hdr.cksum != cksum) { 524 if (cksum != expected_cksum) {
528 *_abort_code = RXKADSEALEDINCON; 525 rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO);
526 rxrpc_send_abort_packet(call);
529 _leave(" = -EPROTO [csum failed]"); 527 _leave(" = -EPROTO [csum failed]");
530 return -EPROTO; 528 return -EPROTO;
531 } 529 }
532 530
533 switch (call->conn->params.security_level) { 531 switch (call->conn->params.security_level) {
534 case RXRPC_SECURITY_PLAIN: 532 case RXRPC_SECURITY_PLAIN:
535 ret = 0; 533 return 0;
536 break;
537 case RXRPC_SECURITY_AUTH: 534 case RXRPC_SECURITY_AUTH:
538 ret = rxkad_verify_packet_auth(call, skb, _abort_code); 535 return rxkad_verify_packet_1(call, skb, offset, len, seq);
539 break;
540 case RXRPC_SECURITY_ENCRYPT: 536 case RXRPC_SECURITY_ENCRYPT:
541 ret = rxkad_verify_packet_encrypt(call, skb, _abort_code); 537 return rxkad_verify_packet_2(call, skb, offset, len, seq);
542 break;
543 default: 538 default:
544 ret = -ENOANO; 539 return -ENOANO;
545 break;
546 } 540 }
541}
547 542
548 _leave(" = %d", ret); 543/*
549 return ret; 544 * Locate the data contained in a packet that was partially encrypted.
545 */
546static void rxkad_locate_data_1(struct rxrpc_call *call, struct sk_buff *skb,
547 unsigned int *_offset, unsigned int *_len)
548{
549 struct rxkad_level1_hdr sechdr;
550
551 if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0)
552 BUG();
553 *_offset += sizeof(sechdr);
554 *_len = ntohl(sechdr.data_size) & 0xffff;
555}
556
557/*
558 * Locate the data contained in a packet that was completely encrypted.
559 */
560static void rxkad_locate_data_2(struct rxrpc_call *call, struct sk_buff *skb,
561 unsigned int *_offset, unsigned int *_len)
562{
563 struct rxkad_level2_hdr sechdr;
564
565 if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0)
566 BUG();
567 *_offset += sizeof(sechdr);
568 *_len = ntohl(sechdr.data_size) & 0xffff;
569}
570
571/*
572 * Locate the data contained in an already decrypted packet.
573 */
574static void rxkad_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
575 unsigned int *_offset, unsigned int *_len)
576{
577 switch (call->conn->params.security_level) {
578 case RXRPC_SECURITY_AUTH:
579 rxkad_locate_data_1(call, skb, _offset, _len);
580 return;
581 case RXRPC_SECURITY_ENCRYPT:
582 rxkad_locate_data_2(call, skb, _offset, _len);
583 return;
584 default:
585 return;
586 }
550} 587}
551 588
552/* 589/*
@@ -716,7 +753,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
716 struct rxkad_challenge challenge; 753 struct rxkad_challenge challenge;
717 struct rxkad_response resp 754 struct rxkad_response resp
718 __attribute__((aligned(8))); /* must be aligned for crypto */ 755 __attribute__((aligned(8))); /* must be aligned for crypto */
719 struct rxrpc_skb_priv *sp; 756 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
720 u32 version, nonce, min_level, abort_code; 757 u32 version, nonce, min_level, abort_code;
721 int ret; 758 int ret;
722 759
@@ -734,8 +771,8 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
734 } 771 }
735 772
736 abort_code = RXKADPACKETSHORT; 773 abort_code = RXKADPACKETSHORT;
737 sp = rxrpc_skb(skb); 774 if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
738 if (skb_copy_bits(skb, 0, &challenge, sizeof(challenge)) < 0) 775 &challenge, sizeof(challenge)) < 0)
739 goto protocol_error; 776 goto protocol_error;
740 777
741 version = ntohl(challenge.version); 778 version = ntohl(challenge.version);
@@ -981,7 +1018,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
981{ 1018{
982 struct rxkad_response response 1019 struct rxkad_response response
983 __attribute__((aligned(8))); /* must be aligned for crypto */ 1020 __attribute__((aligned(8))); /* must be aligned for crypto */
984 struct rxrpc_skb_priv *sp; 1021 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
985 struct rxrpc_crypt session_key; 1022 struct rxrpc_crypt session_key;
986 time_t expiry; 1023 time_t expiry;
987 void *ticket; 1024 void *ticket;
@@ -992,7 +1029,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
992 _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); 1029 _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
993 1030
994 abort_code = RXKADPACKETSHORT; 1031 abort_code = RXKADPACKETSHORT;
995 if (skb_copy_bits(skb, 0, &response, sizeof(response)) < 0) 1032 if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
1033 &response, sizeof(response)) < 0)
996 goto protocol_error; 1034 goto protocol_error;
997 if (!pskb_pull(skb, sizeof(response))) 1035 if (!pskb_pull(skb, sizeof(response)))
998 BUG(); 1036 BUG();
@@ -1000,7 +1038,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
1000 version = ntohl(response.version); 1038 version = ntohl(response.version);
1001 ticket_len = ntohl(response.ticket_len); 1039 ticket_len = ntohl(response.ticket_len);
1002 kvno = ntohl(response.kvno); 1040 kvno = ntohl(response.kvno);
1003 sp = rxrpc_skb(skb);
1004 _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", 1041 _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }",
1005 sp->hdr.serial, version, kvno, ticket_len); 1042 sp->hdr.serial, version, kvno, ticket_len);
1006 1043
@@ -1022,7 +1059,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
1022 return -ENOMEM; 1059 return -ENOMEM;
1023 1060
1024 abort_code = RXKADPACKETSHORT; 1061 abort_code = RXKADPACKETSHORT;
1025 if (skb_copy_bits(skb, 0, ticket, ticket_len) < 0) 1062 if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
1063 ticket, ticket_len) < 0)
1026 goto protocol_error_free; 1064 goto protocol_error_free;
1027 1065
1028 ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key, 1066 ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key,
@@ -1147,6 +1185,7 @@ const struct rxrpc_security rxkad = {
1147 .prime_packet_security = rxkad_prime_packet_security, 1185 .prime_packet_security = rxkad_prime_packet_security,
1148 .secure_packet = rxkad_secure_packet, 1186 .secure_packet = rxkad_secure_packet,
1149 .verify_packet = rxkad_verify_packet, 1187 .verify_packet = rxkad_verify_packet,
1188 .locate_data = rxkad_locate_data,
1150 .issue_challenge = rxkad_issue_challenge, 1189 .issue_challenge = rxkad_issue_challenge,
1151 .respond_to_challenge = rxkad_respond_to_challenge, 1190 .respond_to_challenge = rxkad_respond_to_challenge,
1152 .verify_response = rxkad_verify_response, 1191 .verify_response = rxkad_verify_response,
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index 814d285ff802..7d921e56e715 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -130,20 +130,20 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
130 } 130 }
131 131
132 /* find the service */ 132 /* find the service */
133 read_lock_bh(&local->services_lock); 133 read_lock(&local->services_lock);
134 list_for_each_entry(rx, &local->services, listen_link) { 134 rx = rcu_dereference_protected(local->service,
135 if (rx->srx.srx_service == conn->params.service_id) 135 lockdep_is_held(&local->services_lock));
136 goto found_service; 136 if (rx && rx->srx.srx_service == conn->params.service_id)
137 } 137 goto found_service;
138 138
139 /* the service appears to have died */ 139 /* the service appears to have died */
140 read_unlock_bh(&local->services_lock); 140 read_unlock(&local->services_lock);
141 _leave(" = -ENOENT"); 141 _leave(" = -ENOENT");
142 return -ENOENT; 142 return -ENOENT;
143 143
144found_service: 144found_service:
145 if (!rx->securities) { 145 if (!rx->securities) {
146 read_unlock_bh(&local->services_lock); 146 read_unlock(&local->services_lock);
147 _leave(" = -ENOKEY"); 147 _leave(" = -ENOKEY");
148 return -ENOKEY; 148 return -ENOKEY;
149 } 149 }
@@ -152,13 +152,13 @@ found_service:
152 kref = keyring_search(make_key_ref(rx->securities, 1UL), 152 kref = keyring_search(make_key_ref(rx->securities, 1UL),
153 &key_type_rxrpc_s, kdesc); 153 &key_type_rxrpc_s, kdesc);
154 if (IS_ERR(kref)) { 154 if (IS_ERR(kref)) {
155 read_unlock_bh(&local->services_lock); 155 read_unlock(&local->services_lock);
156 _leave(" = %ld [search]", PTR_ERR(kref)); 156 _leave(" = %ld [search]", PTR_ERR(kref));
157 return PTR_ERR(kref); 157 return PTR_ERR(kref);
158 } 158 }
159 159
160 key = key_ref_to_ptr(kref); 160 key = key_ref_to_ptr(kref);
161 read_unlock_bh(&local->services_lock); 161 read_unlock(&local->services_lock);
162 162
163 conn->server_key = key; 163 conn->server_key = key;
164 conn->security = sec; 164 conn->security = sec;
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
new file mode 100644
index 000000000000..b214a4d4a641
--- /dev/null
+++ b/net/rxrpc/sendmsg.c
@@ -0,0 +1,610 @@
1/* AF_RXRPC sendmsg() implementation.
2 *
3 * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
14#include <linux/net.h>
15#include <linux/gfp.h>
16#include <linux/skbuff.h>
17#include <linux/export.h>
18#include <net/sock.h>
19#include <net/af_rxrpc.h>
20#include "ar-internal.h"
21
22enum rxrpc_command {
23 RXRPC_CMD_SEND_DATA, /* send data message */
24 RXRPC_CMD_SEND_ABORT, /* request abort generation */
25 RXRPC_CMD_ACCEPT, /* [server] accept incoming call */
26 RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */
27};
28
29/*
30 * wait for space to appear in the transmit/ACK window
31 * - caller holds the socket locked
32 */
33static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
34 struct rxrpc_call *call,
35 long *timeo)
36{
37 DECLARE_WAITQUEUE(myself, current);
38 int ret;
39
40 _enter(",{%u,%u,%u}",
41 call->tx_hard_ack, call->tx_top, call->tx_winsize);
42
43 add_wait_queue(&call->waitq, &myself);
44
45 for (;;) {
46 set_current_state(TASK_INTERRUPTIBLE);
47 ret = 0;
48 if (call->tx_top - call->tx_hard_ack <
49 min_t(unsigned int, call->tx_winsize,
50 call->cong_cwnd + call->cong_extra))
51 break;
52 if (call->state >= RXRPC_CALL_COMPLETE) {
53 ret = -call->error;
54 break;
55 }
56 if (signal_pending(current)) {
57 ret = sock_intr_errno(*timeo);
58 break;
59 }
60
61 trace_rxrpc_transmit(call, rxrpc_transmit_wait);
62 release_sock(&rx->sk);
63 *timeo = schedule_timeout(*timeo);
64 lock_sock(&rx->sk);
65 }
66
67 remove_wait_queue(&call->waitq, &myself);
68 set_current_state(TASK_RUNNING);
69 _leave(" = %d", ret);
70 return ret;
71}
72
73/*
74 * Schedule an instant Tx resend.
75 */
76static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix)
77{
78 spin_lock_bh(&call->lock);
79
80 if (call->state < RXRPC_CALL_COMPLETE) {
81 call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS;
82 if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
83 rxrpc_queue_call(call);
84 }
85
86 spin_unlock_bh(&call->lock);
87}
88
89/*
90 * Queue a DATA packet for transmission, set the resend timeout and send the
91 * packet immediately
92 */
93static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
94 bool last)
95{
96 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
97 rxrpc_seq_t seq = sp->hdr.seq;
98 int ret, ix;
99 u8 annotation = RXRPC_TX_ANNO_UNACK;
100
101 _net("queue skb %p [%d]", skb, seq);
102
103 ASSERTCMP(seq, ==, call->tx_top + 1);
104
105 if (last)
106 annotation |= RXRPC_TX_ANNO_LAST;
107
108 /* We have to set the timestamp before queueing as the retransmit
109 * algorithm can see the packet as soon as we queue it.
110 */
111 skb->tstamp = ktime_get_real();
112
113 ix = seq & RXRPC_RXTX_BUFF_MASK;
114 rxrpc_get_skb(skb, rxrpc_skb_tx_got);
115 call->rxtx_annotations[ix] = annotation;
116 smp_wmb();
117 call->rxtx_buffer[ix] = skb;
118 call->tx_top = seq;
119 if (last)
120 trace_rxrpc_transmit(call, rxrpc_transmit_queue_last);
121 else
122 trace_rxrpc_transmit(call, rxrpc_transmit_queue);
123
124 if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
125 _debug("________awaiting reply/ACK__________");
126 write_lock_bh(&call->state_lock);
127 switch (call->state) {
128 case RXRPC_CALL_CLIENT_SEND_REQUEST:
129 call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
130 break;
131 case RXRPC_CALL_SERVER_ACK_REQUEST:
132 call->state = RXRPC_CALL_SERVER_SEND_REPLY;
133 call->ack_at = call->expire_at;
134 if (call->ackr_reason == RXRPC_ACK_DELAY)
135 call->ackr_reason = 0;
136 __rxrpc_set_timer(call, rxrpc_timer_init_for_send_reply,
137 ktime_get_real());
138 if (!last)
139 break;
140 case RXRPC_CALL_SERVER_SEND_REPLY:
141 call->state = RXRPC_CALL_SERVER_AWAIT_ACK;
142 break;
143 default:
144 break;
145 }
146 write_unlock_bh(&call->state_lock);
147 }
148
149 if (seq == 1 && rxrpc_is_client_call(call))
150 rxrpc_expose_client_call(call);
151
152 ret = rxrpc_send_data_packet(call, skb, false);
153 if (ret < 0) {
154 _debug("need instant resend %d", ret);
155 rxrpc_instant_resend(call, ix);
156 } else {
157 ktime_t now = ktime_get_real(), resend_at;
158
159 resend_at = ktime_add_ms(now, rxrpc_resend_timeout);
160
161 if (ktime_before(resend_at, call->resend_at)) {
162 call->resend_at = resend_at;
163 rxrpc_set_timer(call, rxrpc_timer_set_for_send, now);
164 }
165 }
166
167 rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
168 _leave("");
169}
170
171/*
172 * send data through a socket
173 * - must be called in process context
174 * - caller holds the socket locked
175 */
176static int rxrpc_send_data(struct rxrpc_sock *rx,
177 struct rxrpc_call *call,
178 struct msghdr *msg, size_t len)
179{
180 struct rxrpc_skb_priv *sp;
181 struct sk_buff *skb;
182 struct sock *sk = &rx->sk;
183 long timeo;
184 bool more;
185 int ret, copied;
186
187 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
188
189 /* this should be in poll */
190 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
191
192 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
193 return -EPIPE;
194
195 more = msg->msg_flags & MSG_MORE;
196
197 skb = call->tx_pending;
198 call->tx_pending = NULL;
199 rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
200
201 copied = 0;
202 do {
203 /* Check to see if there's a ping ACK to reply to. */
204 if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE)
205 rxrpc_send_ack_packet(call, false);
206
207 if (!skb) {
208 size_t size, chunk, max, space;
209
210 _debug("alloc");
211
212 if (call->tx_top - call->tx_hard_ack >=
213 min_t(unsigned int, call->tx_winsize,
214 call->cong_cwnd + call->cong_extra)) {
215 ret = -EAGAIN;
216 if (msg->msg_flags & MSG_DONTWAIT)
217 goto maybe_error;
218 ret = rxrpc_wait_for_tx_window(rx, call,
219 &timeo);
220 if (ret < 0)
221 goto maybe_error;
222 }
223
224 max = RXRPC_JUMBO_DATALEN;
225 max -= call->conn->security_size;
226 max &= ~(call->conn->size_align - 1UL);
227
228 chunk = max;
229 if (chunk > msg_data_left(msg) && !more)
230 chunk = msg_data_left(msg);
231
232 space = chunk + call->conn->size_align;
233 space &= ~(call->conn->size_align - 1UL);
234
235 size = space + call->conn->security_size;
236
237 _debug("SIZE: %zu/%zu/%zu", chunk, space, size);
238
239 /* create a buffer that we can retain until it's ACK'd */
240 skb = sock_alloc_send_skb(
241 sk, size, msg->msg_flags & MSG_DONTWAIT, &ret);
242 if (!skb)
243 goto maybe_error;
244
245 rxrpc_new_skb(skb, rxrpc_skb_tx_new);
246
247 _debug("ALLOC SEND %p", skb);
248
249 ASSERTCMP(skb->mark, ==, 0);
250
251 _debug("HS: %u", call->conn->security_size);
252 skb_reserve(skb, call->conn->security_size);
253 skb->len += call->conn->security_size;
254
255 sp = rxrpc_skb(skb);
256 sp->remain = chunk;
257 if (sp->remain > skb_tailroom(skb))
258 sp->remain = skb_tailroom(skb);
259
260 _net("skb: hr %d, tr %d, hl %d, rm %d",
261 skb_headroom(skb),
262 skb_tailroom(skb),
263 skb_headlen(skb),
264 sp->remain);
265
266 skb->ip_summed = CHECKSUM_UNNECESSARY;
267 }
268
269 _debug("append");
270 sp = rxrpc_skb(skb);
271
272 /* append next segment of data to the current buffer */
273 if (msg_data_left(msg) > 0) {
274 int copy = skb_tailroom(skb);
275 ASSERTCMP(copy, >, 0);
276 if (copy > msg_data_left(msg))
277 copy = msg_data_left(msg);
278 if (copy > sp->remain)
279 copy = sp->remain;
280
281 _debug("add");
282 ret = skb_add_data(skb, &msg->msg_iter, copy);
283 _debug("added");
284 if (ret < 0)
285 goto efault;
286 sp->remain -= copy;
287 skb->mark += copy;
288 copied += copy;
289 }
290
291 /* check for the far side aborting the call or a network error
292 * occurring */
293 if (call->state == RXRPC_CALL_COMPLETE)
294 goto call_terminated;
295
296 /* add the packet to the send queue if it's now full */
297 if (sp->remain <= 0 ||
298 (msg_data_left(msg) == 0 && !more)) {
299 struct rxrpc_connection *conn = call->conn;
300 uint32_t seq;
301 size_t pad;
302
303 /* pad out if we're using security */
304 if (conn->security_ix) {
305 pad = conn->security_size + skb->mark;
306 pad = conn->size_align - pad;
307 pad &= conn->size_align - 1;
308 _debug("pad %zu", pad);
309 if (pad)
310 memset(skb_put(skb, pad), 0, pad);
311 }
312
313 seq = call->tx_top + 1;
314
315 sp->hdr.seq = seq;
316 sp->hdr._rsvd = 0;
317 sp->hdr.flags = conn->out_clientflag;
318
319 if (msg_data_left(msg) == 0 && !more)
320 sp->hdr.flags |= RXRPC_LAST_PACKET;
321 else if (call->tx_top - call->tx_hard_ack <
322 call->tx_winsize)
323 sp->hdr.flags |= RXRPC_MORE_PACKETS;
324
325 ret = conn->security->secure_packet(
326 call, skb, skb->mark, skb->head);
327 if (ret < 0)
328 goto out;
329
330 rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
331 skb = NULL;
332 }
333 } while (msg_data_left(msg) > 0);
334
335success:
336 ret = copied;
337out:
338 call->tx_pending = skb;
339 _leave(" = %d", ret);
340 return ret;
341
342call_terminated:
343 rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
344 _leave(" = %d", -call->error);
345 return -call->error;
346
347maybe_error:
348 if (copied)
349 goto success;
350 goto out;
351
352efault:
353 ret = -EFAULT;
354 goto out;
355}
356
357/*
358 * extract control messages from the sendmsg() control buffer
359 */
360static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
361 unsigned long *user_call_ID,
362 enum rxrpc_command *command,
363 u32 *abort_code,
364 bool *_exclusive)
365{
366 struct cmsghdr *cmsg;
367 bool got_user_ID = false;
368 int len;
369
370 *command = RXRPC_CMD_SEND_DATA;
371
372 if (msg->msg_controllen == 0)
373 return -EINVAL;
374
375 for_each_cmsghdr(cmsg, msg) {
376 if (!CMSG_OK(msg, cmsg))
377 return -EINVAL;
378
379 len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
380 _debug("CMSG %d, %d, %d",
381 cmsg->cmsg_level, cmsg->cmsg_type, len);
382
383 if (cmsg->cmsg_level != SOL_RXRPC)
384 continue;
385
386 switch (cmsg->cmsg_type) {
387 case RXRPC_USER_CALL_ID:
388 if (msg->msg_flags & MSG_CMSG_COMPAT) {
389 if (len != sizeof(u32))
390 return -EINVAL;
391 *user_call_ID = *(u32 *) CMSG_DATA(cmsg);
392 } else {
393 if (len != sizeof(unsigned long))
394 return -EINVAL;
395 *user_call_ID = *(unsigned long *)
396 CMSG_DATA(cmsg);
397 }
398 _debug("User Call ID %lx", *user_call_ID);
399 got_user_ID = true;
400 break;
401
402 case RXRPC_ABORT:
403 if (*command != RXRPC_CMD_SEND_DATA)
404 return -EINVAL;
405 *command = RXRPC_CMD_SEND_ABORT;
406 if (len != sizeof(*abort_code))
407 return -EINVAL;
408 *abort_code = *(unsigned int *) CMSG_DATA(cmsg);
409 _debug("Abort %x", *abort_code);
410 if (*abort_code == 0)
411 return -EINVAL;
412 break;
413
414 case RXRPC_ACCEPT:
415 if (*command != RXRPC_CMD_SEND_DATA)
416 return -EINVAL;
417 *command = RXRPC_CMD_ACCEPT;
418 if (len != 0)
419 return -EINVAL;
420 break;
421
422 case RXRPC_EXCLUSIVE_CALL:
423 *_exclusive = true;
424 if (len != 0)
425 return -EINVAL;
426 break;
427 default:
428 return -EINVAL;
429 }
430 }
431
432 if (!got_user_ID)
433 return -EINVAL;
434 _leave(" = 0");
435 return 0;
436}
437
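To show the control-message layout that rxrpc_sendmsg_cmsg() above expects, here is a rough user-space sketch of issuing a request; the socket setup, the header paths and the SOL_RXRPC fallback value are assumptions rather than anything added by this patch:

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/rxrpc.h>

#ifndef SOL_RXRPC
#define SOL_RXRPC 272   /* assumed: value from the kernel's linux/socket.h */
#endif

/* Hypothetical example: send one request on the call identified by call_id,
 * letting the kernel create the client call if none exists with that ID.
 */
static ssize_t example_send_request(int fd, struct sockaddr_rxrpc *srx,
                                    unsigned long call_id,
                                    const void *req, size_t req_len)
{
        unsigned char control[CMSG_SPACE(sizeof(call_id))];
        struct iovec iov = { .iov_base = (void *)req, .iov_len = req_len };
        struct msghdr msg;
        struct cmsghdr *cmsg;

        memset(&msg, 0, sizeof(msg));
        memset(control, 0, sizeof(control));
        msg.msg_name = srx;             /* needed if a new call must be created */
        msg.msg_namelen = sizeof(*srx);
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = control;
        msg.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_level = SOL_RXRPC;
        cmsg->cmsg_type = RXRPC_USER_CALL_ID;   /* mandatory: names the call */
        cmsg->cmsg_len = CMSG_LEN(sizeof(call_id));
        memcpy(CMSG_DATA(cmsg), &call_id, sizeof(call_id));

        /* No MSG_MORE flag: this buffer ends the request phase. */
        return sendmsg(fd, &msg, 0);
}

Because msg_name is supplied and the user call ID is new, rxrpc_do_sendmsg() below creates a client call for that ID before queuing the data.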
438/*
439 * Create a new client call for sendmsg().
440 */
441static struct rxrpc_call *
442rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
443 unsigned long user_call_ID, bool exclusive)
444{
445 struct rxrpc_conn_parameters cp;
446 struct rxrpc_call *call;
447 struct key *key;
448
449 DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name);
450
451 _enter("");
452
453 if (!msg->msg_name)
454 return ERR_PTR(-EDESTADDRREQ);
455
456 key = rx->key;
457 if (key && !rx->key->payload.data[0])
458 key = NULL;
459
460 memset(&cp, 0, sizeof(cp));
461 cp.local = rx->local;
462 cp.key = rx->key;
463 cp.security_level = rx->min_sec_level;
464 cp.exclusive = rx->exclusive | exclusive;
465 cp.service_id = srx->srx_service;
466 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL);
467
468 _leave(" = %p\n", call);
469 return call;
470}
471
472/*
473 * send a message forming part of a client call through an RxRPC socket
474 * - caller holds the socket locked
475 * - the socket may be either a client socket or a server socket
476 */
477int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
478{
479 enum rxrpc_command cmd;
480 struct rxrpc_call *call;
481 unsigned long user_call_ID = 0;
482 bool exclusive = false;
483 u32 abort_code = 0;
484 int ret;
485
486 _enter("");
487
488 ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code,
489 &exclusive);
490 if (ret < 0)
491 return ret;
492
493 if (cmd == RXRPC_CMD_ACCEPT) {
494 if (rx->sk.sk_state != RXRPC_SERVER_LISTENING)
495 return -EINVAL;
496 call = rxrpc_accept_call(rx, user_call_ID, NULL);
497 if (IS_ERR(call))
498 return PTR_ERR(call);
499 rxrpc_put_call(call, rxrpc_call_put);
500 return 0;
501 }
502
503 call = rxrpc_find_call_by_user_ID(rx, user_call_ID);
504 if (!call) {
505 if (cmd != RXRPC_CMD_SEND_DATA)
506 return -EBADSLT;
507 call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID,
508 exclusive);
509 if (IS_ERR(call))
510 return PTR_ERR(call);
511 }
512
513 _debug("CALL %d USR %lx ST %d on CONN %p",
514 call->debug_id, call->user_call_ID, call->state, call->conn);
515
516 if (call->state >= RXRPC_CALL_COMPLETE) {
517 /* it's too late for this call */
518 ret = -ESHUTDOWN;
519 } else if (cmd == RXRPC_CMD_SEND_ABORT) {
520 ret = 0;
521 if (rxrpc_abort_call("CMD", call, 0, abort_code, ECONNABORTED))
522 ret = rxrpc_send_abort_packet(call);
523 } else if (cmd != RXRPC_CMD_SEND_DATA) {
524 ret = -EINVAL;
525 } else if (rxrpc_is_client_call(call) &&
526 call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) {
527 /* request phase complete for this client call */
528 ret = -EPROTO;
529 } else if (rxrpc_is_service_call(call) &&
530 call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
531 call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
532 /* Reply phase not begun or not complete for service call. */
533 ret = -EPROTO;
534 } else {
535 ret = rxrpc_send_data(rx, call, msg, len);
536 }
537
538 rxrpc_put_call(call, rxrpc_call_put);
539 _leave(" = %d", ret);
540 return ret;
541}
542
543/**
544 * rxrpc_kernel_send_data - Allow a kernel service to send data on a call
545 * @sock: The socket the call is on
546 * @call: The call to send data through
547 * @msg: The data to send
548 * @len: The amount of data to send
549 *
550 * Allow a kernel service to send data on a call. The call must be in a state
551 * appropriate to sending data. No control data should be supplied in @msg,
552 * nor should an address be supplied. MSG_MORE should be flagged if there's
553 * more data to come, otherwise this data will end the transmission phase.
554 */
555int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call,
556 struct msghdr *msg, size_t len)
557{
558 int ret;
559
560 _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]);
561
562 ASSERTCMP(msg->msg_name, ==, NULL);
563 ASSERTCMP(msg->msg_control, ==, NULL);
564
565 lock_sock(sock->sk);
566
567 _debug("CALL %d USR %lx ST %d on CONN %p",
568 call->debug_id, call->user_call_ID, call->state, call->conn);
569
570 if (call->state >= RXRPC_CALL_COMPLETE) {
571 ret = -ESHUTDOWN; /* it's too late for this call */
572 } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
573 call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
574 call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
575 ret = -EPROTO; /* request phase complete for this client call */
576 } else {
577 ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len);
578 }
579
580 release_sock(sock->sk);
581 _leave(" = %d", ret);
582 return ret;
583}
584EXPORT_SYMBOL(rxrpc_kernel_send_data);
585
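A minimal kernel-side sketch of driving rxrpc_kernel_send_data(), purely illustrative; the helper name and the single-kvec layout are assumptions:

#include <linux/string.h>
#include <linux/uio.h>
#include <linux/socket.h>
#include <net/af_rxrpc.h>

/* Hypothetical helper: transmit one buffer as the entire request phase of
 * a call (no MSG_MORE, so this becomes the last DATA packet).
 */
static int example_kernel_send_request(struct socket *sock,
                                       struct rxrpc_call *call,
                                       void *request, size_t request_len)
{
        struct msghdr msg;
        struct kvec iov = { .iov_base = request, .iov_len = request_len };

        memset(&msg, 0, sizeof(msg));   /* no address, no control data */
        iov_iter_kvec(&msg.msg_iter, ITER_KVEC | WRITE, &iov, 1, request_len);

        return rxrpc_kernel_send_data(sock, call, &msg, request_len);
}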
586/**
587 * rxrpc_kernel_abort_call - Allow a kernel service to abort a call
588 * @sock: The socket the call is on
589 * @call: The call to be aborted
590 * @abort_code: The abort code to stick into the ABORT packet
591 * @error: Local error value
592 * @why: 3-char string indicating why.
593 *
594 * Allow a kernel service to abort a call, if it's still in an abortable state.
595 */
596void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call,
597 u32 abort_code, int error, const char *why)
598{
599 _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why);
600
601 lock_sock(sock->sk);
602
603 if (rxrpc_abort_call(why, call, 0, abort_code, error))
604 rxrpc_send_abort_packet(call);
605
606 release_sock(sock->sk);
607 _leave("");
608}
609
610EXPORT_SYMBOL(rxrpc_kernel_abort_call);
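And an equally small, illustrative use of the abort interface; the abort code and the three-character tag are arbitrary example values:

#include <net/af_rxrpc.h>

/* Hypothetical helper: abort a call that can no longer be serviced.  A real
 * service would pass a protocol-defined abort code rather than 1.
 */
static void example_kernel_abort_call(struct socket *sock,
                                      struct rxrpc_call *call, int error)
{
        rxrpc_kernel_abort_call(sock, call, 1, error, "KEX");
}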
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index 06c51d4b622d..67b02c45271b 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -18,148 +18,82 @@
18#include <net/af_rxrpc.h> 18#include <net/af_rxrpc.h>
19#include "ar-internal.h" 19#include "ar-internal.h"
20 20
21#define select_skb_count(op) (op >= rxrpc_skb_tx_cleaned ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs)
22
21/* 23/*
22 * set up for the ACK at the end of the receive phase when we discard the final 24 * Note the allocation or reception of a socket buffer.
23 * receive phase data packet
24 * - called with softirqs disabled
25 */ 25 */
26static void rxrpc_request_final_ACK(struct rxrpc_call *call) 26void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
27{ 27{
28 /* the call may be aborted before we have a chance to ACK it */ 28 const void *here = __builtin_return_address(0);
29 write_lock(&call->state_lock); 29 int n = atomic_inc_return(select_skb_count(op));
30 30 trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
31 switch (call->state) {
32 case RXRPC_CALL_CLIENT_RECV_REPLY:
33 call->state = RXRPC_CALL_CLIENT_FINAL_ACK;
34 _debug("request final ACK");
35
36 /* get an extra ref on the call for the final-ACK generator to
37 * release */
38 rxrpc_get_call(call);
39 set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
40 if (try_to_del_timer_sync(&call->ack_timer) >= 0)
41 rxrpc_queue_call(call);
42 break;
43
44 case RXRPC_CALL_SERVER_RECV_REQUEST:
45 call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
46 default:
47 break;
48 }
49
50 write_unlock(&call->state_lock);
51} 31}
52 32
53/* 33/*
54 * drop the bottom ACK off of the call ACK window and advance the window 34 * Note the re-emergence of a socket buffer from a queue or buffer.
55 */ 35 */
56static void rxrpc_hard_ACK_data(struct rxrpc_call *call, 36void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
57 struct rxrpc_skb_priv *sp)
58{ 37{
59 int loop; 38 const void *here = __builtin_return_address(0);
60 u32 seq; 39 if (skb) {
61 40 int n = atomic_read(select_skb_count(op));
62 spin_lock_bh(&call->lock); 41 trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
63
64 _debug("hard ACK #%u", sp->hdr.seq);
65
66 for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
67 call->ackr_window[loop] >>= 1;
68 call->ackr_window[loop] |=
69 call->ackr_window[loop + 1] << (BITS_PER_LONG - 1);
70 }
71
72 seq = sp->hdr.seq;
73 ASSERTCMP(seq, ==, call->rx_data_eaten + 1);
74 call->rx_data_eaten = seq;
75
76 if (call->ackr_win_top < UINT_MAX)
77 call->ackr_win_top++;
78
79 ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
80 call->rx_data_post, >=, call->rx_data_recv);
81 ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
82 call->rx_data_recv, >=, call->rx_data_eaten);
83
84 if (sp->hdr.flags & RXRPC_LAST_PACKET) {
85 rxrpc_request_final_ACK(call);
86 } else if (atomic_dec_and_test(&call->ackr_not_idle) &&
87 test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) {
88 /* We previously soft-ACK'd some received packets that have now
89 * been consumed, so send a hard-ACK if no more packets are
90 * immediately forthcoming to allow the transmitter to free up
91 * its Tx bufferage.
92 */
93 _debug("send Rx idle ACK");
94 __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial,
95 false);
96 } 42 }
97
98 spin_unlock_bh(&call->lock);
99} 43}
100 44
101/** 45/*
102 * rxrpc_kernel_data_consumed - Record consumption of data message 46 * Note the addition of a ref on a socket buffer.
103 * @call: The call to which the message pertains.
104 * @skb: Message holding data
105 *
106 * Record the consumption of a data message and generate an ACK if appropriate.
107 * The call state is shifted if this was the final packet. The caller must be
108 * in process context with no spinlocks held.
109 *
110 * TODO: Actually generate the ACK here rather than punting this to the
111 * workqueue.
112 */ 47 */
113void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb) 48void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
114{ 49{
115 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 50 const void *here = __builtin_return_address(0);
116 51 int n = atomic_inc_return(select_skb_count(op));
117 _enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq); 52 trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
118 53 skb_get(skb);
119 ASSERTCMP(sp->call, ==, call);
120 ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA);
121
122 /* TODO: Fix the sequence number tracking */
123 ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
124 ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
125 ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
126
127 call->rx_data_recv = sp->hdr.seq;
128 rxrpc_hard_ACK_data(call, sp);
129} 54}
130EXPORT_SYMBOL(rxrpc_kernel_data_consumed);
131 55
132/* 56/*
133 * Destroy a packet that has an RxRPC control buffer 57 * Note the destruction of a socket buffer.
134 */ 58 */
135void rxrpc_packet_destructor(struct sk_buff *skb) 59void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
136{ 60{
137 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 61 const void *here = __builtin_return_address(0);
138 struct rxrpc_call *call = sp->call; 62 if (skb) {
139 63 int n;
140 _enter("%p{%p}", skb, call); 64 CHECK_SLAB_OKAY(&skb->users);
141 65 n = atomic_dec_return(select_skb_count(op));
142 if (call) { 66 trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
143 if (atomic_dec_return(&call->skb_count) < 0) 67 kfree_skb(skb);
144 BUG();
145 rxrpc_put_call(call);
146 sp->call = NULL;
147 } 68 }
69}
148 70
149 if (skb->sk) 71/*
150 sock_rfree(skb); 72 * Note the injected loss of a socket buffer.
151 _leave(""); 73 */
74void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
75{
76 const void *here = __builtin_return_address(0);
77 if (skb) {
78 int n;
79 CHECK_SLAB_OKAY(&skb->users);
80 n = atomic_dec_return(select_skb_count(op));
81 trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
82 kfree_skb(skb);
83 }
152} 84}
153 85
154/** 86/*
155 * rxrpc_kernel_free_skb - Free an RxRPC socket buffer 87 * Clear a queue of socket buffers.
156 * @skb: The socket buffer to be freed
157 *
158 * Let RxRPC free its own socket buffer, permitting it to maintain debug
159 * accounting.
160 */ 88 */
161void rxrpc_kernel_free_skb(struct sk_buff *skb) 89void rxrpc_purge_queue(struct sk_buff_head *list)
162{ 90{
163 rxrpc_free_skb(skb); 91 const void *here = __builtin_return_address(0);
92 struct sk_buff *skb;
93 while ((skb = skb_dequeue((list))) != NULL) {
94 int n = atomic_dec_return(select_skb_count(rxrpc_skb_rx_purged));
95 trace_rxrpc_skb(skb, rxrpc_skb_rx_purged,
96 atomic_read(&skb->users), n, here);
97 kfree_skb(skb);
98 }
164} 99}
165EXPORT_SYMBOL(rxrpc_kernel_free_skb);
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 03ad08774d4e..34c706d2f79c 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -20,7 +20,7 @@ static const unsigned int one = 1;
20static const unsigned int four = 4; 20static const unsigned int four = 4;
21static const unsigned int thirtytwo = 32; 21static const unsigned int thirtytwo = 32;
22static const unsigned int n_65535 = 65535; 22static const unsigned int n_65535 = 65535;
23static const unsigned int n_max_acks = RXRPC_MAXACKS; 23static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1;
24 24
25/* 25/*
26 * RxRPC operating parameters. 26 * RxRPC operating parameters.
@@ -35,7 +35,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
35 .data = &rxrpc_requested_ack_delay, 35 .data = &rxrpc_requested_ack_delay,
36 .maxlen = sizeof(unsigned int), 36 .maxlen = sizeof(unsigned int),
37 .mode = 0644, 37 .mode = 0644,
38 .proc_handler = proc_dointvec_ms_jiffies, 38 .proc_handler = proc_dointvec,
39 .extra1 = (void *)&zero, 39 .extra1 = (void *)&zero,
40 }, 40 },
41 { 41 {
@@ -43,7 +43,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
43 .data = &rxrpc_soft_ack_delay, 43 .data = &rxrpc_soft_ack_delay,
44 .maxlen = sizeof(unsigned int), 44 .maxlen = sizeof(unsigned int),
45 .mode = 0644, 45 .mode = 0644,
46 .proc_handler = proc_dointvec_ms_jiffies, 46 .proc_handler = proc_dointvec,
47 .extra1 = (void *)&one, 47 .extra1 = (void *)&one,
48 }, 48 },
49 { 49 {
@@ -51,7 +51,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
51 .data = &rxrpc_idle_ack_delay, 51 .data = &rxrpc_idle_ack_delay,
52 .maxlen = sizeof(unsigned int), 52 .maxlen = sizeof(unsigned int),
53 .mode = 0644, 53 .mode = 0644,
54 .proc_handler = proc_dointvec_ms_jiffies, 54 .proc_handler = proc_dointvec,
55 .extra1 = (void *)&one, 55 .extra1 = (void *)&one,
56 }, 56 },
57 { 57 {
@@ -59,6 +59,22 @@ static struct ctl_table rxrpc_sysctl_table[] = {
59 .data = &rxrpc_resend_timeout, 59 .data = &rxrpc_resend_timeout,
60 .maxlen = sizeof(unsigned int), 60 .maxlen = sizeof(unsigned int),
61 .mode = 0644, 61 .mode = 0644,
62 .proc_handler = proc_dointvec,
63 .extra1 = (void *)&one,
64 },
65 {
66 .procname = "idle_conn_expiry",
67 .data = &rxrpc_conn_idle_client_expiry,
68 .maxlen = sizeof(unsigned int),
69 .mode = 0644,
70 .proc_handler = proc_dointvec_ms_jiffies,
71 .extra1 = (void *)&one,
72 },
73 {
74 .procname = "idle_conn_fast_expiry",
75 .data = &rxrpc_conn_idle_client_fast_expiry,
76 .maxlen = sizeof(unsigned int),
77 .mode = 0644,
62 .proc_handler = proc_dointvec_ms_jiffies, 78 .proc_handler = proc_dointvec_ms_jiffies,
63 .extra1 = (void *)&one, 79 .extra1 = (void *)&one,
64 }, 80 },
@@ -69,29 +85,28 @@ static struct ctl_table rxrpc_sysctl_table[] = {
69 .data = &rxrpc_max_call_lifetime, 85 .data = &rxrpc_max_call_lifetime,
70 .maxlen = sizeof(unsigned int), 86 .maxlen = sizeof(unsigned int),
71 .mode = 0644, 87 .mode = 0644,
72 .proc_handler = proc_dointvec_jiffies, 88 .proc_handler = proc_dointvec,
73 .extra1 = (void *)&one, 89 .extra1 = (void *)&one,
74 }, 90 },
91
92 /* Non-time values */
75 { 93 {
76 .procname = "dead_call_expiry", 94 .procname = "max_client_conns",
77 .data = &rxrpc_dead_call_expiry, 95 .data = &rxrpc_max_client_connections,
78 .maxlen = sizeof(unsigned int), 96 .maxlen = sizeof(unsigned int),
79 .mode = 0644, 97 .mode = 0644,
80 .proc_handler = proc_dointvec_jiffies, 98 .proc_handler = proc_dointvec_minmax,
81 .extra1 = (void *)&one, 99 .extra1 = (void *)&rxrpc_reap_client_connections,
82 }, 100 },
83
84 /* Values measured in seconds */
85 { 101 {
86 .procname = "connection_expiry", 102 .procname = "reap_client_conns",
87 .data = &rxrpc_connection_expiry, 103 .data = &rxrpc_reap_client_connections,
88 .maxlen = sizeof(unsigned int), 104 .maxlen = sizeof(unsigned int),
89 .mode = 0644, 105 .mode = 0644,
90 .proc_handler = proc_dointvec_minmax, 106 .proc_handler = proc_dointvec_minmax,
91 .extra1 = (void *)&one, 107 .extra1 = (void *)&one,
108 .extra2 = (void *)&rxrpc_max_client_connections,
92 }, 109 },
93
94 /* Non-time values */
95 { 110 {
96 .procname = "max_backlog", 111 .procname = "max_backlog",
97 .data = &rxrpc_max_backlog, 112 .data = &rxrpc_max_backlog,
diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c
index b88914d53ca5..ff7af71c4b49 100644
--- a/net/rxrpc/utils.c
+++ b/net/rxrpc/utils.c
@@ -30,6 +30,7 @@ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb)
30 srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; 30 srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
31 return 0; 31 return 0;
32 32
33#ifdef CONFIG_AF_RXRPC_IPV6
33 case ETH_P_IPV6: 34 case ETH_P_IPV6:
34 srx->transport_type = SOCK_DGRAM; 35 srx->transport_type = SOCK_DGRAM;
35 srx->transport_len = sizeof(srx->transport.sin6); 36 srx->transport_len = sizeof(srx->transport.sin6);
@@ -37,6 +38,7 @@ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb)
37 srx->transport.sin6.sin6_port = udp_hdr(skb)->source; 38 srx->transport.sin6.sin6_port = udp_hdr(skb)->source;
38 srx->transport.sin6.sin6_addr = ipv6_hdr(skb)->saddr; 39 srx->transport.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
39 return 0; 40 return 0;
41#endif
40 42
41 default: 43 default:
42 pr_warn_ratelimited("AF_RXRPC: Unknown eth protocol %u\n", 44 pr_warn_ratelimited("AF_RXRPC: Unknown eth protocol %u\n",
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ccf931b3b94c..87956a768d1b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -749,6 +749,17 @@ config NET_ACT_CONNMARK
749 To compile this code as a module, choose M here: the 749 To compile this code as a module, choose M here: the
750 module will be called act_connmark. 750 module will be called act_connmark.
751 751
752config NET_ACT_SKBMOD
753 tristate "skb data modification action"
754 depends on NET_CLS_ACT
755 ---help---
756 Say Y here to allow modification of skb data
757
758 If unsure, say N.
759
760 To compile this code as a module, choose M here: the
761 module will be called act_skbmod.
762
752config NET_ACT_IFE 763config NET_ACT_IFE
753 tristate "Inter-FE action based on IETF ForCES InterFE LFB" 764 tristate "Inter-FE action based on IETF ForCES InterFE LFB"
754 depends on NET_CLS_ACT 765 depends on NET_CLS_ACT
@@ -761,6 +772,17 @@ config NET_ACT_IFE
761 To compile this code as a module, choose M here: the 772 To compile this code as a module, choose M here: the
762 module will be called act_ife. 773 module will be called act_ife.
763 774
775config NET_ACT_TUNNEL_KEY
776 tristate "IP tunnel metadata manipulation"
777 depends on NET_CLS_ACT
778 ---help---
779 Say Y here to set/release ip tunnel metadata.
780
781 If unsure, say N.
782
783 To compile this code as a module, choose M here: the
784 module will be called act_tunnel_key.
785
764config NET_IFE_SKBMARK 786config NET_IFE_SKBMARK
765 tristate "Support to encoding decoding skb mark on IFE action" 787 tristate "Support to encoding decoding skb mark on IFE action"
766 depends on NET_ACT_IFE 788 depends on NET_ACT_IFE
@@ -771,6 +793,11 @@ config NET_IFE_SKBPRIO
771 depends on NET_ACT_IFE 793 depends on NET_ACT_IFE
772 ---help--- 794 ---help---
773 795
796config NET_IFE_SKBTCINDEX
797 tristate "Support to encoding decoding skb tcindex on IFE action"
798 depends on NET_ACT_IFE
799 ---help---
800
774config NET_CLS_IND 801config NET_CLS_IND
775 bool "Incoming device classification" 802 bool "Incoming device classification"
776 depends on NET_CLS_U32 || NET_CLS_FW 803 depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index ae088a5a9d95..4bdda3634e0b 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -19,9 +19,12 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
19obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o 19obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
20obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o 20obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
21obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o 21obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
22obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
22obj-$(CONFIG_NET_ACT_IFE) += act_ife.o 23obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
23obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o 24obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
24obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o 25obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o
26obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o
27obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
25obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o 28obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
26obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o 29obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
27obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o 30obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d09d0687594b..a512b18c0088 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -341,22 +341,25 @@ int tcf_register_action(struct tc_action_ops *act,
341 if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup) 341 if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
342 return -EINVAL; 342 return -EINVAL;
343 343
344 /* We have to register pernet ops before making the action ops visible,
345 * otherwise tcf_action_init_1() could get a partially initialized
346 * netns.
347 */
348 ret = register_pernet_subsys(ops);
349 if (ret)
350 return ret;
351
344 write_lock(&act_mod_lock); 352 write_lock(&act_mod_lock);
345 list_for_each_entry(a, &act_base, head) { 353 list_for_each_entry(a, &act_base, head) {
346 if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { 354 if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
347 write_unlock(&act_mod_lock); 355 write_unlock(&act_mod_lock);
356 unregister_pernet_subsys(ops);
348 return -EEXIST; 357 return -EEXIST;
349 } 358 }
350 } 359 }
351 list_add_tail(&act->head, &act_base); 360 list_add_tail(&act->head, &act_base);
352 write_unlock(&act_mod_lock); 361 write_unlock(&act_mod_lock);
353 362
354 ret = register_pernet_subsys(ops);
355 if (ret) {
356 tcf_unregister_action(act, ops);
357 return ret;
358 }
359
360 return 0; 363 return 0;
361} 364}
362EXPORT_SYMBOL(tcf_register_action); 365EXPORT_SYMBOL(tcf_register_action);
@@ -367,8 +370,6 @@ int tcf_unregister_action(struct tc_action_ops *act,
367 struct tc_action_ops *a; 370 struct tc_action_ops *a;
368 int err = -ENOENT; 371 int err = -ENOENT;
369 372
370 unregister_pernet_subsys(ops);
371
372 write_lock(&act_mod_lock); 373 write_lock(&act_mod_lock);
373 list_for_each_entry(a, &act_base, head) { 374 list_for_each_entry(a, &act_base, head) {
374 if (a == act) { 375 if (a == act) {
@@ -378,6 +379,8 @@ int tcf_unregister_action(struct tc_action_ops *act,
378 } 379 }
379 } 380 }
380 write_unlock(&act_mod_lock); 381 write_unlock(&act_mod_lock);
382 if (!err)
383 unregister_pernet_subsys(ops);
381 return err; 384 return err;
382} 385}
383EXPORT_SYMBOL(tcf_unregister_action); 386EXPORT_SYMBOL(tcf_unregister_action);
@@ -592,9 +595,19 @@ err_out:
592 return ERR_PTR(err); 595 return ERR_PTR(err);
593} 596}
594 597
595int tcf_action_init(struct net *net, struct nlattr *nla, 598static void cleanup_a(struct list_head *actions, int ovr)
596 struct nlattr *est, char *name, int ovr, 599{
597 int bind, struct list_head *actions) 600 struct tc_action *a;
601
602 if (!ovr)
603 return;
604
605 list_for_each_entry(a, actions, list)
606 a->tcfa_refcnt--;
607}
608
609int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est,
610 char *name, int ovr, int bind, struct list_head *actions)
598{ 611{
599 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; 612 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
600 struct tc_action *act; 613 struct tc_action *act;
@@ -612,8 +625,15 @@ int tcf_action_init(struct net *net, struct nlattr *nla,
612 goto err; 625 goto err;
613 } 626 }
614 act->order = i; 627 act->order = i;
628 if (ovr)
629 act->tcfa_refcnt++;
615 list_add_tail(&act->list, actions); 630 list_add_tail(&act->list, actions);
616 } 631 }
632
633 /* Remove the temp refcnt which was necessary to protect against
634 * destroying an existing action which was being replaced
635 */
636 cleanup_a(actions, ovr);
617 return 0; 637 return 0;
618 638
619err: 639err:
@@ -883,6 +903,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
883 goto err; 903 goto err;
884 } 904 }
885 act->order = i; 905 act->order = i;
906 if (event == RTM_GETACTION)
907 act->tcfa_refcnt++;
886 list_add_tail(&act->list, &actions); 908 list_add_tail(&act->list, &actions);
887 } 909 }
888 910
@@ -923,9 +945,8 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
923 return err; 945 return err;
924} 946}
925 947
926static int 948static int tcf_action_add(struct net *net, struct nlattr *nla,
927tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, 949 struct nlmsghdr *n, u32 portid, int ovr)
928 u32 portid, int ovr)
929{ 950{
930 int ret = 0; 951 int ret = 0;
931 LIST_HEAD(actions); 952 LIST_HEAD(actions);
@@ -988,8 +1009,7 @@ replay:
988 return ret; 1009 return ret;
989} 1010}
990 1011
991static struct nlattr * 1012static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
992find_dump_kind(const struct nlmsghdr *n)
993{ 1013{
994 struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; 1014 struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
995 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; 1015 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
@@ -1016,8 +1036,7 @@ find_dump_kind(const struct nlmsghdr *n)
1016 return kind; 1036 return kind;
1017} 1037}
1018 1038
1019static int 1039static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
1020tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
1021{ 1040{
1022 struct net *net = sock_net(skb->sk); 1041 struct net *net = sock_net(skb->sk);
1023 struct nlmsghdr *nlh; 1042 struct nlmsghdr *nlh;
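
The first act_api.c hunk reorders tcf_register_action() so the per-netns state exists before the ops become visible on act_base, and tcf_unregister_action() now tears the pernet state down only after the ops have been delisted. A condensed, illustrative version of the fixed registration ordering (error paths trimmed, names taken from the patch) looks like this:

/* Hedged sketch, not the exact upstream function: initialize the pernet
 * state first, publish the ops last, and undo the pernet registration if
 * publishing fails.
 */
static int register_action_sketch(struct tc_action_ops *act,
				  struct pernet_operations *ops)
{
	int ret = register_pernet_subsys(ops);	/* per-netns state first */

	if (ret)
		return ret;

	write_lock(&act_mod_lock);
	/* ... duplicate check: unregister_pernet_subsys(ops) and return
	 * -EEXIST if this kind is already registered ...
	 */
	list_add_tail(&act->head, &act_base);	/* only now is it visible */
	write_unlock(&act_mod_lock);
	return 0;
}
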
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index bfa870731e74..1d3960033f61 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -39,13 +39,10 @@ static struct tc_action_ops act_bpf_ops;
39static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, 39static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
40 struct tcf_result *res) 40 struct tcf_result *res)
41{ 41{
42 bool at_ingress = skb_at_tc_ingress(skb);
42 struct tcf_bpf *prog = to_bpf(act); 43 struct tcf_bpf *prog = to_bpf(act);
43 struct bpf_prog *filter; 44 struct bpf_prog *filter;
44 int action, filter_res; 45 int action, filter_res;
45 bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
46
47 if (unlikely(!skb_mac_header_was_set(skb)))
48 return TC_ACT_UNSPEC;
49 46
50 tcf_lastuse_update(&prog->tcf_tm); 47 tcf_lastuse_update(&prog->tcf_tm);
51 bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); 48 bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index b5dbf633a863..e0defcef376d 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -116,8 +116,8 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
116 return (void *)(skb_network_header(skb) + ihl); 116 return (void *)(skb_network_header(skb) + ihl);
117} 117}
118 118
119static int tcf_csum_ipv4_icmp(struct sk_buff *skb, 119static int tcf_csum_ipv4_icmp(struct sk_buff *skb, unsigned int ihl,
120 unsigned int ihl, unsigned int ipl) 120 unsigned int ipl)
121{ 121{
122 struct icmphdr *icmph; 122 struct icmphdr *icmph;
123 123
@@ -152,8 +152,8 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
152 return 1; 152 return 1;
153} 153}
154 154
155static int tcf_csum_ipv6_icmp(struct sk_buff *skb, 155static int tcf_csum_ipv6_icmp(struct sk_buff *skb, unsigned int ihl,
156 unsigned int ihl, unsigned int ipl) 156 unsigned int ipl)
157{ 157{
158 struct icmp6hdr *icmp6h; 158 struct icmp6hdr *icmp6h;
159 const struct ipv6hdr *ip6h; 159 const struct ipv6hdr *ip6h;
@@ -174,8 +174,8 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
174 return 1; 174 return 1;
175} 175}
176 176
177static int tcf_csum_ipv4_tcp(struct sk_buff *skb, 177static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl,
178 unsigned int ihl, unsigned int ipl) 178 unsigned int ipl)
179{ 179{
180 struct tcphdr *tcph; 180 struct tcphdr *tcph;
181 const struct iphdr *iph; 181 const struct iphdr *iph;
@@ -195,8 +195,8 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
195 return 1; 195 return 1;
196} 196}
197 197
198static int tcf_csum_ipv6_tcp(struct sk_buff *skb, 198static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl,
199 unsigned int ihl, unsigned int ipl) 199 unsigned int ipl)
200{ 200{
201 struct tcphdr *tcph; 201 struct tcphdr *tcph;
202 const struct ipv6hdr *ip6h; 202 const struct ipv6hdr *ip6h;
@@ -217,8 +217,8 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
217 return 1; 217 return 1;
218} 218}
219 219
220static int tcf_csum_ipv4_udp(struct sk_buff *skb, 220static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl,
221 unsigned int ihl, unsigned int ipl, int udplite) 221 unsigned int ipl, int udplite)
222{ 222{
223 struct udphdr *udph; 223 struct udphdr *udph;
224 const struct iphdr *iph; 224 const struct iphdr *iph;
@@ -270,8 +270,8 @@ ignore_obscure_skb:
270 return 1; 270 return 1;
271} 271}
272 272
273static int tcf_csum_ipv6_udp(struct sk_buff *skb, 273static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl,
274 unsigned int ihl, unsigned int ipl, int udplite) 274 unsigned int ipl, int udplite)
275{ 275{
276 struct udphdr *udph; 276 struct udphdr *udph;
277 const struct ipv6hdr *ip6h; 277 const struct ipv6hdr *ip6h;
@@ -380,8 +380,8 @@ fail:
380 return 0; 380 return 0;
381} 381}
382 382
383static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, 383static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, unsigned int ixhl,
384 unsigned int ixhl, unsigned int *pl) 384 unsigned int *pl)
385{ 385{
386 int off, len, optlen; 386 int off, len, optlen;
387 unsigned char *xh = (void *)ip6xh; 387 unsigned char *xh = (void *)ip6xh;
@@ -494,8 +494,8 @@ fail:
494 return 0; 494 return 0;
495} 495}
496 496
497static int tcf_csum(struct sk_buff *skb, 497static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
498 const struct tc_action *a, struct tcf_result *res) 498 struct tcf_result *res)
499{ 499{
500 struct tcf_csum *p = to_tcf_csum(a); 500 struct tcf_csum *p = to_tcf_csum(a);
501 int action; 501 int action;
@@ -531,8 +531,8 @@ drop:
531 return TC_ACT_SHOT; 531 return TC_ACT_SHOT;
532} 532}
533 533
534static int tcf_csum_dump(struct sk_buff *skb, 534static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
535 struct tc_action *a, int bind, int ref) 535 int ref)
536{ 536{
537 unsigned char *b = skb_tail_pointer(skb); 537 unsigned char *b = skb_tail_pointer(skb);
538 struct tcf_csum *p = to_tcf_csum(a); 538 struct tcf_csum *p = to_tcf_csum(a);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e24a4093d6f6..e0aa30f83c6c 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -156,7 +156,8 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
156 int action = READ_ONCE(gact->tcf_action); 156 int action = READ_ONCE(gact->tcf_action);
157 struct tcf_t *tm = &gact->tcf_tm; 157 struct tcf_t *tm = &gact->tcf_tm;
158 158
159 _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, packets); 159 _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes,
160 packets);
160 if (action == TC_ACT_SHOT) 161 if (action == TC_ACT_SHOT)
161 this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; 162 this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
162 163
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 4a60cd5e1875..95c463cbb9a6 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -63,6 +63,23 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
63} 63}
64EXPORT_SYMBOL_GPL(ife_tlv_meta_encode); 64EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
65 65
66int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
67{
68 u16 edata = 0;
69
70 if (mi->metaval)
71 edata = *(u16 *)mi->metaval;
72 else if (metaval)
73 edata = metaval;
74
75 if (!edata) /* will not encode */
76 return 0;
77
78 edata = htons(edata);
79 return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata);
80}
81EXPORT_SYMBOL_GPL(ife_encode_meta_u16);
82
66int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi) 83int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
67{ 84{
68 if (mi->metaval) 85 if (mi->metaval)
@@ -81,6 +98,15 @@ int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi)
81} 98}
82EXPORT_SYMBOL_GPL(ife_check_meta_u32); 99EXPORT_SYMBOL_GPL(ife_check_meta_u32);
83 100
101int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi)
102{
103 if (metaval || mi->metaval)
104 return 8; /* T+L+(V) == 2+2+(2+2bytepad) */
105
106 return 0;
107}
108EXPORT_SYMBOL_GPL(ife_check_meta_u16);
109
84int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi) 110int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
85{ 111{
86 u32 edata = metaval; 112 u32 edata = metaval;
diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c
new file mode 100644
index 000000000000..3b35774ce890
--- /dev/null
+++ b/net/sched/act_meta_skbtcindex.c
@@ -0,0 +1,79 @@
1/*
2 * net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * copyright Jamal Hadi Salim (2016)
10 *
11*/
12
13#include <linux/types.h>
14#include <linux/kernel.h>
15#include <linux/string.h>
16#include <linux/errno.h>
17#include <linux/skbuff.h>
18#include <linux/rtnetlink.h>
19#include <linux/module.h>
20#include <linux/init.h>
21#include <net/netlink.h>
22#include <net/pkt_sched.h>
23#include <uapi/linux/tc_act/tc_ife.h>
24#include <net/tc_act/tc_ife.h>
25#include <linux/rtnetlink.h>
26
27static int skbtcindex_encode(struct sk_buff *skb, void *skbdata,
28 struct tcf_meta_info *e)
29{
30 u32 ifetc_index = skb->tc_index;
31
32 return ife_encode_meta_u16(ifetc_index, skbdata, e);
33}
34
35static int skbtcindex_decode(struct sk_buff *skb, void *data, u16 len)
36{
37 u16 ifetc_index = *(u16 *)data;
38
39 skb->tc_index = ntohs(ifetc_index);
40 return 0;
41}
42
43static int skbtcindex_check(struct sk_buff *skb, struct tcf_meta_info *e)
44{
45 return ife_check_meta_u16(skb->tc_index, e);
46}
47
48static struct tcf_meta_ops ife_skbtcindex_ops = {
49 .metaid = IFE_META_TCINDEX,
50 .metatype = NLA_U16,
51 .name = "tc_index",
52 .synopsis = "skb tc_index 16 bit metadata",
53 .check_presence = skbtcindex_check,
54 .encode = skbtcindex_encode,
55 .decode = skbtcindex_decode,
56 .get = ife_get_meta_u16,
57 .alloc = ife_alloc_meta_u16,
58 .release = ife_release_meta_gen,
59 .validate = ife_validate_meta_u16,
60 .owner = THIS_MODULE,
61};
62
63static int __init ifetc_index_init_module(void)
64{
65 return register_ife_op(&ife_skbtcindex_ops);
66}
67
68static void __exit ifetc_index_cleanup_module(void)
69{
70 unregister_ife_op(&ife_skbtcindex_ops);
71}
72
73module_init(ifetc_index_init_module);
74module_exit(ifetc_index_cleanup_module);
75
76MODULE_AUTHOR("Jamal Hadi Salim(2016)");
77MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module");
78MODULE_LICENSE("GPL");
79MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX);
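
The check_presence hook above reports the TLV space a u16 metadatum will occupy via ife_check_meta_u16(), whose constant 8 breaks down as type, length, value and alignment padding. Spelled out as an illustrative helper (the function name is made up):

/* Hedged sketch of the 2+2+(2+2) layout noted in ife_check_meta_u16():
 * 2-byte type, 2-byte length, 2-byte value, 2 bytes of padding so the
 * next TLV starts on a 4-byte boundary.
 */
static int ife_u16_tlv_space(void)
{
	return 2 /* type */ + 2 /* length */ + 2 /* u16 value */ + 2 /* pad */;
}
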
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 6038c85d92f5..667dc382df82 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -204,7 +204,15 @@ out:
204 return retval; 204 return retval;
205} 205}
206 206
207static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) 207static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
208 u64 lastuse)
209{
210 tcf_lastuse_update(&a->tcfa_tm);
211 _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
212}
213
214static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
215 int ref)
208{ 216{
209 unsigned char *b = skb_tail_pointer(skb); 217 unsigned char *b = skb_tail_pointer(skb);
210 struct tcf_mirred *m = to_mirred(a); 218 struct tcf_mirred *m = to_mirred(a);
@@ -280,6 +288,7 @@ static struct tc_action_ops act_mirred_ops = {
280 .type = TCA_ACT_MIRRED, 288 .type = TCA_ACT_MIRRED,
281 .owner = THIS_MODULE, 289 .owner = THIS_MODULE,
282 .act = tcf_mirred, 290 .act = tcf_mirred,
291 .stats_update = tcf_stats_update,
283 .dump = tcf_mirred_dump, 292 .dump = tcf_mirred_dump,
284 .cleanup = tcf_mirred_release, 293 .cleanup = tcf_mirred_release,
285 .init = tcf_mirred_init, 294 .init = tcf_mirred_init,
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 8a3be1d99775..d1bd248fe146 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -249,6 +249,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
249 police->tcfp_t_c = now; 249 police->tcfp_t_c = now;
250 police->tcfp_toks = toks; 250 police->tcfp_toks = toks;
251 police->tcfp_ptoks = ptoks; 251 police->tcfp_ptoks = ptoks;
252 if (police->tcfp_result == TC_ACT_SHOT)
253 police->tcf_qstats.drops++;
252 spin_unlock(&police->tcf_lock); 254 spin_unlock(&police->tcf_lock);
253 return police->tcfp_result; 255 return police->tcfp_result;
254 } 256 }
@@ -261,8 +263,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
261 return police->tcf_action; 263 return police->tcf_action;
262} 264}
263 265
264static int 266static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a,
265tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) 267 int bind, int ref)
266{ 268{
267 unsigned char *b = skb_tail_pointer(skb); 269 unsigned char *b = skb_tail_pointer(skb);
268 struct tcf_police *police = to_police(a); 270 struct tcf_police *police = to_police(a);
@@ -347,14 +349,12 @@ static struct pernet_operations police_net_ops = {
347 .size = sizeof(struct tc_action_net), 349 .size = sizeof(struct tc_action_net),
348}; 350};
349 351
350static int __init 352static int __init police_init_module(void)
351police_init_module(void)
352{ 353{
353 return tcf_register_action(&act_police_ops, &police_net_ops); 354 return tcf_register_action(&act_police_ops, &police_net_ops);
354} 355}
355 356
356static void __exit 357static void __exit police_cleanup_module(void)
357police_cleanup_module(void)
358{ 358{
359 tcf_unregister_action(&act_police_ops, &police_net_ops); 359 tcf_unregister_action(&act_police_ops, &police_net_ops);
360} 360}
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
new file mode 100644
index 000000000000..e7d96381c908
--- /dev/null
+++ b/net/sched/act_skbmod.c
@@ -0,0 +1,301 @@
1/*
2 * net/sched/act_skbmod.c skb data modifier
3 *
4 * Copyright (c) 2016 Jamal Hadi Salim <jhs@mojatatu.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10*/
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/kernel.h>
15#include <linux/skbuff.h>
16#include <linux/rtnetlink.h>
17#include <net/netlink.h>
18#include <net/pkt_sched.h>
19
20#include <linux/tc_act/tc_skbmod.h>
21#include <net/tc_act/tc_skbmod.h>
22
23#define SKBMOD_TAB_MASK 15
24
25static int skbmod_net_id;
26static struct tc_action_ops act_skbmod_ops;
27
28#define MAX_EDIT_LEN ETH_HLEN
29static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
30 struct tcf_result *res)
31{
32 struct tcf_skbmod *d = to_skbmod(a);
33 int action;
34 struct tcf_skbmod_params *p;
35 u64 flags;
36 int err;
37
38 tcf_lastuse_update(&d->tcf_tm);
39 bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
40
41 /* XXX: if you are going to edit more fields beyond ethernet header
42 * (example when you add IP header replacement or vlan swap)
43 * then MAX_EDIT_LEN needs to change appropriately
44 */
45 err = skb_ensure_writable(skb, MAX_EDIT_LEN);
46 if (unlikely(err)) { /* best policy is to drop on the floor */
47 qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
48 return TC_ACT_SHOT;
49 }
50
51 rcu_read_lock();
52 action = READ_ONCE(d->tcf_action);
53 if (unlikely(action == TC_ACT_SHOT)) {
54 qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
55 rcu_read_unlock();
56 return action;
57 }
58
59 p = rcu_dereference(d->skbmod_p);
60 flags = p->flags;
61 if (flags & SKBMOD_F_DMAC)
62 ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst);
63 if (flags & SKBMOD_F_SMAC)
64 ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src);
65 if (flags & SKBMOD_F_ETYPE)
66 eth_hdr(skb)->h_proto = p->eth_type;
67 rcu_read_unlock();
68
69 if (flags & SKBMOD_F_SWAPMAC) {
70 u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */
71 /*XXX: I am sure we can come up with more efficient swapping*/
72 ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest);
73 ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
74 ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr);
75 }
76
77 return action;
78}
79
80static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
81 [TCA_SKBMOD_PARMS] = { .len = sizeof(struct tc_skbmod) },
82 [TCA_SKBMOD_DMAC] = { .len = ETH_ALEN },
83 [TCA_SKBMOD_SMAC] = { .len = ETH_ALEN },
84 [TCA_SKBMOD_ETYPE] = { .type = NLA_U16 },
85};
86
87static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
88 struct nlattr *est, struct tc_action **a,
89 int ovr, int bind)
90{
91 struct tc_action_net *tn = net_generic(net, skbmod_net_id);
92 struct nlattr *tb[TCA_SKBMOD_MAX + 1];
93 struct tcf_skbmod_params *p, *p_old;
94 struct tc_skbmod *parm;
95 struct tcf_skbmod *d;
96 bool exists = false;
97 u8 *daddr = NULL;
98 u8 *saddr = NULL;
99 u16 eth_type = 0;
100 u32 lflags = 0;
101 int ret = 0, err;
102
103 if (!nla)
104 return -EINVAL;
105
106 err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy);
107 if (err < 0)
108 return err;
109
110 if (!tb[TCA_SKBMOD_PARMS])
111 return -EINVAL;
112
113 if (tb[TCA_SKBMOD_DMAC]) {
114 daddr = nla_data(tb[TCA_SKBMOD_DMAC]);
115 lflags |= SKBMOD_F_DMAC;
116 }
117
118 if (tb[TCA_SKBMOD_SMAC]) {
119 saddr = nla_data(tb[TCA_SKBMOD_SMAC]);
120 lflags |= SKBMOD_F_SMAC;
121 }
122
123 if (tb[TCA_SKBMOD_ETYPE]) {
124 eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]);
125 lflags |= SKBMOD_F_ETYPE;
126 }
127
128 parm = nla_data(tb[TCA_SKBMOD_PARMS]);
129 if (parm->flags & SKBMOD_F_SWAPMAC)
130 lflags = SKBMOD_F_SWAPMAC;
131
132 exists = tcf_hash_check(tn, parm->index, a, bind);
133 if (exists && bind)
134 return 0;
135
136 if (!lflags)
137 return -EINVAL;
138
139 if (!exists) {
140 ret = tcf_hash_create(tn, parm->index, est, a,
141 &act_skbmod_ops, bind, true);
142 if (ret)
143 return ret;
144
145 ret = ACT_P_CREATED;
146 } else {
147 tcf_hash_release(*a, bind);
148 if (!ovr)
149 return -EEXIST;
150 }
151
152 d = to_skbmod(*a);
153
154 ASSERT_RTNL();
155 p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
156 if (unlikely(!p)) {
157 if (ovr)
158 tcf_hash_release(*a, bind);
159 return -ENOMEM;
160 }
161
162 p->flags = lflags;
163 d->tcf_action = parm->action;
164
165 p_old = rtnl_dereference(d->skbmod_p);
166
167 if (ovr)
168 spin_lock_bh(&d->tcf_lock);
169
170 if (lflags & SKBMOD_F_DMAC)
171 ether_addr_copy(p->eth_dst, daddr);
172 if (lflags & SKBMOD_F_SMAC)
173 ether_addr_copy(p->eth_src, saddr);
174 if (lflags & SKBMOD_F_ETYPE)
175 p->eth_type = htons(eth_type);
176
177 rcu_assign_pointer(d->skbmod_p, p);
178 if (ovr)
179 spin_unlock_bh(&d->tcf_lock);
180
181 if (p_old)
182 kfree_rcu(p_old, rcu);
183
184 if (ret == ACT_P_CREATED)
185 tcf_hash_insert(tn, *a);
186 return ret;
187}
188
189static void tcf_skbmod_cleanup(struct tc_action *a, int bind)
190{
191 struct tcf_skbmod *d = to_skbmod(a);
192 struct tcf_skbmod_params *p;
193
194 p = rcu_dereference_protected(d->skbmod_p, 1);
195 kfree_rcu(p, rcu);
196}
197
198static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
199 int bind, int ref)
200{
201 struct tcf_skbmod *d = to_skbmod(a);
202 unsigned char *b = skb_tail_pointer(skb);
203 struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p);
204 struct tc_skbmod opt = {
205 .index = d->tcf_index,
206 .refcnt = d->tcf_refcnt - ref,
207 .bindcnt = d->tcf_bindcnt - bind,
208 .action = d->tcf_action,
209 };
210 struct tcf_t t;
211
212 opt.flags = p->flags;
213 if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
214 goto nla_put_failure;
215 if ((p->flags & SKBMOD_F_DMAC) &&
216 nla_put(skb, TCA_SKBMOD_DMAC, ETH_ALEN, p->eth_dst))
217 goto nla_put_failure;
218 if ((p->flags & SKBMOD_F_SMAC) &&
219 nla_put(skb, TCA_SKBMOD_SMAC, ETH_ALEN, p->eth_src))
220 goto nla_put_failure;
221 if ((p->flags & SKBMOD_F_ETYPE) &&
222 nla_put_u16(skb, TCA_SKBMOD_ETYPE, ntohs(p->eth_type)))
223 goto nla_put_failure;
224
225 tcf_tm_dump(&t, &d->tcf_tm);
226 if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
227 goto nla_put_failure;
228
229 return skb->len;
230nla_put_failure:
231 rcu_read_unlock();
232 nlmsg_trim(skb, b);
233 return -1;
234}
235
236static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
237 struct netlink_callback *cb, int type,
238 const struct tc_action_ops *ops)
239{
240 struct tc_action_net *tn = net_generic(net, skbmod_net_id);
241
242 return tcf_generic_walker(tn, skb, cb, type, ops);
243}
244
245static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
246{
247 struct tc_action_net *tn = net_generic(net, skbmod_net_id);
248
249 return tcf_hash_search(tn, a, index);
250}
251
252static struct tc_action_ops act_skbmod_ops = {
253 .kind = "skbmod",
254 .type = TCA_ACT_SKBMOD,
255 .owner = THIS_MODULE,
256 .act = tcf_skbmod_run,
257 .dump = tcf_skbmod_dump,
258 .init = tcf_skbmod_init,
259 .cleanup = tcf_skbmod_cleanup,
260 .walk = tcf_skbmod_walker,
261 .lookup = tcf_skbmod_search,
262 .size = sizeof(struct tcf_skbmod),
263};
264
265static __net_init int skbmod_init_net(struct net *net)
266{
267 struct tc_action_net *tn = net_generic(net, skbmod_net_id);
268
269 return tc_action_net_init(tn, &act_skbmod_ops, SKBMOD_TAB_MASK);
270}
271
272static void __net_exit skbmod_exit_net(struct net *net)
273{
274 struct tc_action_net *tn = net_generic(net, skbmod_net_id);
275
276 tc_action_net_exit(tn);
277}
278
279static struct pernet_operations skbmod_net_ops = {
280 .init = skbmod_init_net,
281 .exit = skbmod_exit_net,
282 .id = &skbmod_net_id,
283 .size = sizeof(struct tc_action_net),
284};
285
286MODULE_AUTHOR("Jamal Hadi Salim, <jhs@mojatatu.com>");
287MODULE_DESCRIPTION("SKB data mod-ing");
288MODULE_LICENSE("GPL");
289
290static int __init skbmod_init_module(void)
291{
292 return tcf_register_action(&act_skbmod_ops, &skbmod_net_ops);
293}
294
295static void __exit skbmod_cleanup_module(void)
296{
297 tcf_unregister_action(&act_skbmod_ops, &skbmod_net_ops);
298}
299
300module_init(skbmod_init_module);
301module_exit(skbmod_cleanup_module);
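
act_skbmod.c keeps its per-action configuration behind an RCU pointer: the datapath reads it under rcu_read_lock(), while tcf_skbmod_init() builds a fresh tcf_skbmod_params, publishes it with rcu_assign_pointer() and releases the old copy with kfree_rcu(). The same pattern in isolation, with placeholder struct and function names, looks roughly like this:

/* Hedged sketch of the RCU parameter-swap pattern; "my_params",
 * "my_action" and "my_action_update" are illustrative names, not part of
 * the patch.
 */
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/types.h>

struct my_params {
	u32		flags;
	struct rcu_head	rcu;
};

struct my_action {
	struct my_params __rcu *params;
};

static int my_action_update(struct my_action *act, u32 new_flags)
{
	struct my_params *newp, *oldp;

	newp = kzalloc(sizeof(*newp), GFP_KERNEL);
	if (!newp)
		return -ENOMEM;
	newp->flags = new_flags;

	oldp = rtnl_dereference(act->params);	/* updates run under RTNL */
	rcu_assign_pointer(act->params, newp);	/* readers now see newp */
	if (oldp)
		kfree_rcu(oldp, rcu);		/* freed after a grace period */
	return 0;
}
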
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
new file mode 100644
index 000000000000..af47bdf2f483
--- /dev/null
+++ b/net/sched/act_tunnel_key.c
@@ -0,0 +1,342 @@
1/*
2 * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
3 * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 */
10
11#include <linux/module.h>
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/skbuff.h>
15#include <linux/rtnetlink.h>
16#include <net/netlink.h>
17#include <net/pkt_sched.h>
18#include <net/dst.h>
19#include <net/dst_metadata.h>
20
21#include <linux/tc_act/tc_tunnel_key.h>
22#include <net/tc_act/tc_tunnel_key.h>
23
24#define TUNNEL_KEY_TAB_MASK 15
25
26static int tunnel_key_net_id;
27static struct tc_action_ops act_tunnel_key_ops;
28
29static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
30 struct tcf_result *res)
31{
32 struct tcf_tunnel_key *t = to_tunnel_key(a);
33 struct tcf_tunnel_key_params *params;
34 int action;
35
36 rcu_read_lock();
37
38 params = rcu_dereference(t->params);
39
40 tcf_lastuse_update(&t->tcf_tm);
41 bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
42 action = params->action;
43
44 switch (params->tcft_action) {
45 case TCA_TUNNEL_KEY_ACT_RELEASE:
46 skb_dst_drop(skb);
47 break;
48 case TCA_TUNNEL_KEY_ACT_SET:
49 skb_dst_drop(skb);
50 skb_dst_set(skb, dst_clone(&params->tcft_enc_metadata->dst));
51 break;
52 default:
53 WARN_ONCE(1, "Bad tunnel_key action %d.\n",
54 params->tcft_action);
55 break;
56 }
57
58 rcu_read_unlock();
59
60 return action;
61}
62
63static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
64 [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) },
65 [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
66 [TCA_TUNNEL_KEY_ENC_IPV4_DST] = { .type = NLA_U32 },
67 [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
68 [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) },
69 [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
70};
71
72static int tunnel_key_init(struct net *net, struct nlattr *nla,
73 struct nlattr *est, struct tc_action **a,
74 int ovr, int bind)
75{
76 struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
77 struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
78 struct tcf_tunnel_key_params *params_old;
79 struct tcf_tunnel_key_params *params_new;
80 struct metadata_dst *metadata = NULL;
81 struct tc_tunnel_key *parm;
82 struct tcf_tunnel_key *t;
83 bool exists = false;
84 __be64 key_id;
85 int ret = 0;
86 int err;
87
88 if (!nla)
89 return -EINVAL;
90
91 err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy);
92 if (err < 0)
93 return err;
94
95 if (!tb[TCA_TUNNEL_KEY_PARMS])
96 return -EINVAL;
97
98 parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
99 exists = tcf_hash_check(tn, parm->index, a, bind);
100 if (exists && bind)
101 return 0;
102
103 switch (parm->t_action) {
104 case TCA_TUNNEL_KEY_ACT_RELEASE:
105 break;
106 case TCA_TUNNEL_KEY_ACT_SET:
107 if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
108 ret = -EINVAL;
109 goto err_out;
110 }
111
112 key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
113
114 if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
115 tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
116 __be32 saddr;
117 __be32 daddr;
118
119 saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
120 daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
121
122 metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
123 TUNNEL_KEY, key_id, 0);
124 } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
125 tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
126 struct in6_addr saddr;
127 struct in6_addr daddr;
128
129 saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
130 daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
131
132 metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0,
133 TUNNEL_KEY, key_id, 0);
134 }
135
136 if (!metadata) {
137 ret = -EINVAL;
138 goto err_out;
139 }
140
141 metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
142 break;
143 default:
144 goto err_out;
145 }
146
147 if (!exists) {
148 ret = tcf_hash_create(tn, parm->index, est, a,
149 &act_tunnel_key_ops, bind, true);
150 if (ret)
151 return ret;
152
153 ret = ACT_P_CREATED;
154 } else {
155 tcf_hash_release(*a, bind);
156 if (!ovr)
157 return -EEXIST;
158 }
159
160 t = to_tunnel_key(*a);
161
162 ASSERT_RTNL();
163 params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
164 if (unlikely(!params_new)) {
165 if (ret == ACT_P_CREATED)
166 tcf_hash_release(*a, bind);
167 return -ENOMEM;
168 }
169
170 params_old = rtnl_dereference(t->params);
171
172 params_new->action = parm->action;
173 params_new->tcft_action = parm->t_action;
174 params_new->tcft_enc_metadata = metadata;
175
176 rcu_assign_pointer(t->params, params_new);
177
178 if (params_old)
179 kfree_rcu(params_old, rcu);
180
181 if (ret == ACT_P_CREATED)
182 tcf_hash_insert(tn, *a);
183
184 return ret;
185
186err_out:
187 if (exists)
188 tcf_hash_release(*a, bind);
189 return ret;
190}
191
192static void tunnel_key_release(struct tc_action *a, int bind)
193{
194 struct tcf_tunnel_key *t = to_tunnel_key(a);
195 struct tcf_tunnel_key_params *params;
196
197 params = rcu_dereference_protected(t->params, 1);
198
199 if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
200 dst_release(&params->tcft_enc_metadata->dst);
201
202 kfree_rcu(params, rcu);
203}
204
205static int tunnel_key_dump_addresses(struct sk_buff *skb,
206 const struct ip_tunnel_info *info)
207{
208 unsigned short family = ip_tunnel_info_af(info);
209
210 if (family == AF_INET) {
211 __be32 saddr = info->key.u.ipv4.src;
212 __be32 daddr = info->key.u.ipv4.dst;
213
214 if (!nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC, saddr) &&
215 !nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST, daddr))
216 return 0;
217 }
218
219 if (family == AF_INET6) {
220 const struct in6_addr *saddr6 = &info->key.u.ipv6.src;
221 const struct in6_addr *daddr6 = &info->key.u.ipv6.dst;
222
223 if (!nla_put_in6_addr(skb,
224 TCA_TUNNEL_KEY_ENC_IPV6_SRC, saddr6) &&
225 !nla_put_in6_addr(skb,
226 TCA_TUNNEL_KEY_ENC_IPV6_DST, daddr6))
227 return 0;
228 }
229
230 return -EINVAL;
231}
232
233static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
234 int bind, int ref)
235{
236 unsigned char *b = skb_tail_pointer(skb);
237 struct tcf_tunnel_key *t = to_tunnel_key(a);
238 struct tcf_tunnel_key_params *params;
239 struct tc_tunnel_key opt = {
240 .index = t->tcf_index,
241 .refcnt = t->tcf_refcnt - ref,
242 .bindcnt = t->tcf_bindcnt - bind,
243 };
244 struct tcf_t tm;
245
246 params = rtnl_dereference(t->params);
247
248 opt.t_action = params->tcft_action;
249 opt.action = params->action;
250
251 if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
252 goto nla_put_failure;
253
254 if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) {
255 struct ip_tunnel_key *key =
256 &params->tcft_enc_metadata->u.tun_info.key;
257 __be32 key_id = tunnel_id_to_key32(key->tun_id);
258
259 if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
260 tunnel_key_dump_addresses(skb,
261 &params->tcft_enc_metadata->u.tun_info))
262 goto nla_put_failure;
263 }
264
265 tcf_tm_dump(&tm, &t->tcf_tm);
266 if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
267 &tm, TCA_TUNNEL_KEY_PAD))
268 goto nla_put_failure;
269
270 return skb->len;
271
272nla_put_failure:
273 nlmsg_trim(skb, b);
274 return -1;
275}
276
277static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
278 struct netlink_callback *cb, int type,
279 const struct tc_action_ops *ops)
280{
281 struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
282
283 return tcf_generic_walker(tn, skb, cb, type, ops);
284}
285
286static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
287{
288 struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
289
290 return tcf_hash_search(tn, a, index);
291}
292
293static struct tc_action_ops act_tunnel_key_ops = {
294 .kind = "tunnel_key",
295 .type = TCA_ACT_TUNNEL_KEY,
296 .owner = THIS_MODULE,
297 .act = tunnel_key_act,
298 .dump = tunnel_key_dump,
299 .init = tunnel_key_init,
300 .cleanup = tunnel_key_release,
301 .walk = tunnel_key_walker,
302 .lookup = tunnel_key_search,
303 .size = sizeof(struct tcf_tunnel_key),
304};
305
306static __net_init int tunnel_key_init_net(struct net *net)
307{
308 struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
309
310 return tc_action_net_init(tn, &act_tunnel_key_ops, TUNNEL_KEY_TAB_MASK);
311}
312
313static void __net_exit tunnel_key_exit_net(struct net *net)
314{
315 struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
316
317 tc_action_net_exit(tn);
318}
319
320static struct pernet_operations tunnel_key_net_ops = {
321 .init = tunnel_key_init_net,
322 .exit = tunnel_key_exit_net,
323 .id = &tunnel_key_net_id,
324 .size = sizeof(struct tc_action_net),
325};
326
327static int __init tunnel_key_init_module(void)
328{
329 return tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
330}
331
332static void __exit tunnel_key_cleanup_module(void)
333{
334 tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
335}
336
337module_init(tunnel_key_init_module);
338module_exit(tunnel_key_cleanup_module);
339
340MODULE_AUTHOR("Amir Vadai <amir@vadai.me>");
341MODULE_DESCRIPTION("ip tunnel manipulation actions");
342MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 691409de3e1a..b57fcbcefea1 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -30,12 +30,19 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
30 struct tcf_vlan *v = to_vlan(a); 30 struct tcf_vlan *v = to_vlan(a);
31 int action; 31 int action;
32 int err; 32 int err;
33 u16 tci;
33 34
34 spin_lock(&v->tcf_lock); 35 spin_lock(&v->tcf_lock);
35 tcf_lastuse_update(&v->tcf_tm); 36 tcf_lastuse_update(&v->tcf_tm);
36 bstats_update(&v->tcf_bstats, skb); 37 bstats_update(&v->tcf_bstats, skb);
37 action = v->tcf_action; 38 action = v->tcf_action;
38 39
40 /* Ensure 'data' points at mac_header prior calling vlan manipulating
41 * functions.
42 */
43 if (skb_at_tc_ingress(skb))
44 skb_push_rcsum(skb, skb->mac_len);
45
39 switch (v->tcfv_action) { 46 switch (v->tcfv_action) {
40 case TCA_VLAN_ACT_POP: 47 case TCA_VLAN_ACT_POP:
41 err = skb_vlan_pop(skb); 48 err = skb_vlan_pop(skb);
@@ -43,10 +50,35 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
43 goto drop; 50 goto drop;
44 break; 51 break;
45 case TCA_VLAN_ACT_PUSH: 52 case TCA_VLAN_ACT_PUSH:
46 err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid); 53 err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid |
54 (v->tcfv_push_prio << VLAN_PRIO_SHIFT));
47 if (err) 55 if (err)
48 goto drop; 56 goto drop;
49 break; 57 break;
58 case TCA_VLAN_ACT_MODIFY:
59 /* No-op if no vlan tag (either hw-accel or in-payload) */
60 if (!skb_vlan_tagged(skb))
61 goto unlock;
62 /* extract existing tag (and guarantee no hw-accel tag) */
63 if (skb_vlan_tag_present(skb)) {
64 tci = skb_vlan_tag_get(skb);
65 skb->vlan_tci = 0;
66 } else {
67 /* in-payload vlan tag, pop it */
68 err = __skb_vlan_pop(skb, &tci);
69 if (err)
70 goto drop;
71 }
72 /* replace the vid */
73 tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid;
74 /* replace prio bits, if tcfv_push_prio specified */
75 if (v->tcfv_push_prio) {
76 tci &= ~VLAN_PRIO_MASK;
77 tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT;
78 }
79 /* put updated tci as hwaccel tag */
80 __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci);
81 break;
50 default: 82 default:
51 BUG(); 83 BUG();
52 } 84 }
@@ -57,6 +89,9 @@ drop:
57 action = TC_ACT_SHOT; 89 action = TC_ACT_SHOT;
58 v->tcf_qstats.drops++; 90 v->tcf_qstats.drops++;
59unlock: 91unlock:
92 if (skb_at_tc_ingress(skb))
93 skb_pull_rcsum(skb, skb->mac_len);
94
60 spin_unlock(&v->tcf_lock); 95 spin_unlock(&v->tcf_lock);
61 return action; 96 return action;
62} 97}
@@ -65,6 +100,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
65 [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) }, 100 [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) },
66 [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 }, 101 [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 },
67 [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 }, 102 [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 },
103 [TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 },
68}; 104};
69 105
70static int tcf_vlan_init(struct net *net, struct nlattr *nla, 106static int tcf_vlan_init(struct net *net, struct nlattr *nla,
@@ -78,6 +114,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
78 int action; 114 int action;
79 __be16 push_vid = 0; 115 __be16 push_vid = 0;
80 __be16 push_proto = 0; 116 __be16 push_proto = 0;
117 u8 push_prio = 0;
81 bool exists = false; 118 bool exists = false;
82 int ret = 0, err; 119 int ret = 0, err;
83 120
@@ -99,6 +136,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
99 case TCA_VLAN_ACT_POP: 136 case TCA_VLAN_ACT_POP:
100 break; 137 break;
101 case TCA_VLAN_ACT_PUSH: 138 case TCA_VLAN_ACT_PUSH:
139 case TCA_VLAN_ACT_MODIFY:
102 if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { 140 if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
103 if (exists) 141 if (exists)
104 tcf_hash_release(*a, bind); 142 tcf_hash_release(*a, bind);
@@ -123,6 +161,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
123 } else { 161 } else {
124 push_proto = htons(ETH_P_8021Q); 162 push_proto = htons(ETH_P_8021Q);
125 } 163 }
164
165 if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY])
166 push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]);
126 break; 167 break;
127 default: 168 default:
128 if (exists) 169 if (exists)
@@ -150,6 +191,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
150 191
151 v->tcfv_action = action; 192 v->tcfv_action = action;
152 v->tcfv_push_vid = push_vid; 193 v->tcfv_push_vid = push_vid;
194 v->tcfv_push_prio = push_prio;
153 v->tcfv_push_proto = push_proto; 195 v->tcfv_push_proto = push_proto;
154 196
155 v->tcf_action = parm->action; 197 v->tcf_action = parm->action;
@@ -178,10 +220,13 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
178 if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) 220 if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
179 goto nla_put_failure; 221 goto nla_put_failure;
180 222
181 if (v->tcfv_action == TCA_VLAN_ACT_PUSH && 223 if ((v->tcfv_action == TCA_VLAN_ACT_PUSH ||
224 v->tcfv_action == TCA_VLAN_ACT_MODIFY) &&
182 (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || 225 (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) ||
183 nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, 226 nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL,
184 v->tcfv_push_proto))) 227 v->tcfv_push_proto) ||
228 (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY,
229 v->tcfv_push_prio))))
185 goto nla_put_failure; 230 goto nla_put_failure;
186 231
187 tcf_tm_dump(&t, &v->tcf_tm); 232 tcf_tm_dump(&t, &v->tcf_tm);
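
The new TCA_VLAN_ACT_MODIFY branch edits the 16-bit TCI in place: the low 12 bits (VLAN ID) are always replaced, the top 3 priority bits only when a non-zero priority was configured. The bit manipulation, pulled out into an illustrative helper that is not part of the patch, is:

/* Hedged sketch of the TCI rewrite done by TCA_VLAN_ACT_MODIFY above;
 * VLAN_VID_MASK, VLAN_PRIO_MASK and VLAN_PRIO_SHIFT come from
 * <linux/if_vlan.h>, the helper itself is made up.
 */
#include <linux/if_vlan.h>

static u16 rebuild_tci(u16 old_tci, u16 new_vid, u8 new_prio)
{
	u16 tci = (old_tci & ~VLAN_VID_MASK) | (new_vid & VLAN_VID_MASK);

	if (new_prio) {			/* leave PCP alone unless configured */
		tci &= ~VLAN_PRIO_MASK;
		tci |= new_prio << VLAN_PRIO_SHIFT;
	}
	return tci;
}
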
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a7c5645373af..2ee29a3375f6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -101,7 +101,7 @@ EXPORT_SYMBOL(unregister_tcf_proto_ops);
101 101
102static int tfilter_notify(struct net *net, struct sk_buff *oskb, 102static int tfilter_notify(struct net *net, struct sk_buff *oskb,
103 struct nlmsghdr *n, struct tcf_proto *tp, 103 struct nlmsghdr *n, struct tcf_proto *tp,
104 unsigned long fh, int event); 104 unsigned long fh, int event, bool unicast);
105 105
106static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, 106static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
107 struct nlmsghdr *n, 107 struct nlmsghdr *n,
@@ -112,7 +112,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
112 112
113 for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL; 113 for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL;
114 it_chain = &tp->next) 114 it_chain = &tp->next)
115 tfilter_notify(net, oskb, n, tp, 0, event); 115 tfilter_notify(net, oskb, n, tp, 0, event, false);
116} 116}
117 117
118/* Select new prio value from the range, managed by kernel. */ 118/* Select new prio value from the range, managed by kernel. */
@@ -319,7 +319,8 @@ replay:
319 319
320 RCU_INIT_POINTER(*back, next); 320 RCU_INIT_POINTER(*back, next);
321 321
322 tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); 322 tfilter_notify(net, skb, n, tp, fh,
323 RTM_DELTFILTER, false);
323 tcf_destroy(tp, true); 324 tcf_destroy(tp, true);
324 err = 0; 325 err = 0;
325 goto errout; 326 goto errout;
@@ -344,13 +345,15 @@ replay:
344 if (err == 0) { 345 if (err == 0) {
345 struct tcf_proto *next = rtnl_dereference(tp->next); 346 struct tcf_proto *next = rtnl_dereference(tp->next);
346 347
347 tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); 348 tfilter_notify(net, skb, n, tp, fh,
349 RTM_DELTFILTER, false);
348 if (tcf_destroy(tp, false)) 350 if (tcf_destroy(tp, false))
349 RCU_INIT_POINTER(*back, next); 351 RCU_INIT_POINTER(*back, next);
350 } 352 }
351 goto errout; 353 goto errout;
352 case RTM_GETTFILTER: 354 case RTM_GETTFILTER:
353 err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); 355 err = tfilter_notify(net, skb, n, tp, fh,
356 RTM_NEWTFILTER, true);
354 goto errout; 357 goto errout;
355 default: 358 default:
356 err = -EINVAL; 359 err = -EINVAL;
@@ -365,7 +368,7 @@ replay:
365 RCU_INIT_POINTER(tp->next, rtnl_dereference(*back)); 368 RCU_INIT_POINTER(tp->next, rtnl_dereference(*back));
366 rcu_assign_pointer(*back, tp); 369 rcu_assign_pointer(*back, tp);
367 } 370 }
368 tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); 371 tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
369 } else { 372 } else {
370 if (tp_created) 373 if (tp_created)
371 tcf_destroy(tp, true); 374 tcf_destroy(tp, true);
@@ -417,7 +420,7 @@ nla_put_failure:
417 420
418static int tfilter_notify(struct net *net, struct sk_buff *oskb, 421static int tfilter_notify(struct net *net, struct sk_buff *oskb,
419 struct nlmsghdr *n, struct tcf_proto *tp, 422 struct nlmsghdr *n, struct tcf_proto *tp,
420 unsigned long fh, int event) 423 unsigned long fh, int event, bool unicast)
421{ 424{
422 struct sk_buff *skb; 425 struct sk_buff *skb;
423 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 426 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -431,6 +434,9 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
431 return -EINVAL; 434 return -EINVAL;
432 } 435 }
433 436
437 if (unicast)
438 return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
439
434 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 440 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
435 n->nlmsg_flags & NLM_F_ECHO); 441 n->nlmsg_flags & NLM_F_ECHO);
436} 442}
@@ -448,7 +454,8 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
448 struct net *net = sock_net(a->skb->sk); 454 struct net *net = sock_net(a->skb->sk);
449 455
450 return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, 456 return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,
451 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); 457 a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
458 RTM_NEWTFILTER);
452} 459}
453 460
454/* called with RTNL */ 461/* called with RTNL */
@@ -552,7 +559,7 @@ void tcf_exts_destroy(struct tcf_exts *exts)
552EXPORT_SYMBOL(tcf_exts_destroy); 559EXPORT_SYMBOL(tcf_exts_destroy);
553 560
554int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, 561int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
555 struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) 562 struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
556{ 563{
557#ifdef CONFIG_NET_CLS_ACT 564#ifdef CONFIG_NET_CLS_ACT
558 { 565 {
@@ -560,8 +567,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
560 567
561 if (exts->police && tb[exts->police]) { 568 if (exts->police && tb[exts->police]) {
562 act = tcf_action_init_1(net, tb[exts->police], rate_tlv, 569 act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
563 "police", ovr, 570 "police", ovr, TCA_ACT_BIND);
564 TCA_ACT_BIND);
565 if (IS_ERR(act)) 571 if (IS_ERR(act))
566 return PTR_ERR(act); 572 return PTR_ERR(act);
567 573
@@ -573,8 +579,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
573 int err, i = 0; 579 int err, i = 0;
574 580
575 err = tcf_action_init(net, tb[exts->action], rate_tlv, 581 err = tcf_action_init(net, tb[exts->action], rate_tlv,
576 NULL, ovr, 582 NULL, ovr, TCA_ACT_BIND,
577 TCA_ACT_BIND, &actions); 583 &actions);
578 if (err) 584 if (err)
579 return err; 585 return err;
580 list_for_each_entry(act, &actions, list) 586 list_for_each_entry(act, &actions, list)
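
The cls_api.c change threads a unicast flag through tfilter_notify() so RTM_GETTFILTER answers go straight back to the requesting socket instead of being broadcast on RTNLGRP_TC, which add and delete notifications still use. Reduced to an illustrative wrapper (the wrapper itself is not in the patch), the split is:

/* Hedged sketch of the unicast/broadcast split in tfilter_notify(). */
static int notify_sketch(struct net *net, struct sk_buff *skb, u32 portid,
			 struct nlmsghdr *n, bool unicast)
{
	if (unicast)	/* RTM_GETTFILTER: reply to the requester only */
		return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);

	/* RTM_NEWTFILTER/RTM_DELTFILTER: notify the RTNLGRP_TC group */
	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}
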
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 0b8c3ace671f..eb219b78cd49 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -138,10 +138,12 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp,
138 struct tcf_exts e; 138 struct tcf_exts e;
139 struct tcf_ematch_tree t; 139 struct tcf_ematch_tree t;
140 140
141 tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); 141 err = tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE);
142 err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
143 if (err < 0) 142 if (err < 0)
144 return err; 143 return err;
144 err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
145 if (err < 0)
146 goto errout;
145 147
146 err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t); 148 err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t);
147 if (err < 0) 149 if (err < 0)
@@ -189,7 +191,10 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
189 if (!fnew) 191 if (!fnew)
190 return -ENOBUFS; 192 return -ENOBUFS;
191 193
192 tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); 194 err = tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
195 if (err < 0)
196 goto errout;
197
193 err = -EINVAL; 198 err = -EINVAL;
194 if (handle) { 199 if (handle) {
195 fnew->handle = handle; 200 fnew->handle = handle;
@@ -226,6 +231,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
226 231
227 return 0; 232 return 0;
228errout: 233errout:
234 tcf_exts_destroy(&fnew->exts);
229 kfree(fnew); 235 kfree(fnew);
230 return err; 236 return err;
231} 237}
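
cls_basic.c (and cls_bpf.c below) now treat tcf_exts_init() as fallible and make every later failure path release the half-built extensions. The resulting init/validate/destroy shape, shown as an illustrative function rather than the classifier's real code, is:

/* Hedged sketch of the error-handling pattern adopted above; the function
 * name is made up, the tcf_exts_* calls are the patch's own.
 */
static int set_parms_sketch(struct net *net, struct tcf_proto *tp,
			    struct nlattr **tb, struct nlattr *est, bool ovr)
{
	struct tcf_exts e;
	int err;

	err = tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE);
	if (err < 0)
		return err;		/* nothing allocated yet */

	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
	if (err < 0)
		goto errout;

	/* ... apply the validated extensions ... */
	return 0;

errout:
	tcf_exts_destroy(&e);		/* undo tcf_exts_init() */
	return err;
}
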
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index c3002c2c68bb..bb1d5a487081 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -27,6 +27,8 @@ MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
27MODULE_DESCRIPTION("TC BPF based classifier"); 27MODULE_DESCRIPTION("TC BPF based classifier");
28 28
29#define CLS_BPF_NAME_LEN 256 29#define CLS_BPF_NAME_LEN 256
30#define CLS_BPF_SUPPORTED_GEN_FLAGS \
31 (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)
30 32
31struct cls_bpf_head { 33struct cls_bpf_head {
32 struct list_head plist; 34 struct list_head plist;
@@ -39,6 +41,8 @@ struct cls_bpf_prog {
39 struct list_head link; 41 struct list_head link;
40 struct tcf_result res; 42 struct tcf_result res;
41 bool exts_integrated; 43 bool exts_integrated;
44 bool offloaded;
45 u32 gen_flags;
42 struct tcf_exts exts; 46 struct tcf_exts exts;
43 u32 handle; 47 u32 handle;
44 union { 48 union {
@@ -54,8 +58,10 @@ struct cls_bpf_prog {
54static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { 58static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
55 [TCA_BPF_CLASSID] = { .type = NLA_U32 }, 59 [TCA_BPF_CLASSID] = { .type = NLA_U32 },
56 [TCA_BPF_FLAGS] = { .type = NLA_U32 }, 60 [TCA_BPF_FLAGS] = { .type = NLA_U32 },
61 [TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 },
57 [TCA_BPF_FD] = { .type = NLA_U32 }, 62 [TCA_BPF_FD] = { .type = NLA_U32 },
58 [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, 63 [TCA_BPF_NAME] = { .type = NLA_NUL_STRING,
64 .len = CLS_BPF_NAME_LEN },
59 [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, 65 [TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
60 [TCA_BPF_OPS] = { .type = NLA_BINARY, 66 [TCA_BPF_OPS] = { .type = NLA_BINARY,
61 .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, 67 .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
@@ -83,9 +89,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
83 struct cls_bpf_prog *prog; 89 struct cls_bpf_prog *prog;
84 int ret = -1; 90 int ret = -1;
85 91
86 if (unlikely(!skb_mac_header_was_set(skb)))
87 return -1;
88
89 /* Needed here for accessing maps. */ 92 /* Needed here for accessing maps. */
90 rcu_read_lock(); 93 rcu_read_lock();
91 list_for_each_entry_rcu(prog, &head->plist, link) { 94 list_for_each_entry_rcu(prog, &head->plist, link) {
@@ -93,7 +96,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
93 96
94 qdisc_skb_cb(skb)->tc_classid = prog->res.classid; 97 qdisc_skb_cb(skb)->tc_classid = prog->res.classid;
95 98
96 if (at_ingress) { 99 if (tc_skip_sw(prog->gen_flags)) {
100 filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0;
101 } else if (at_ingress) {
97 /* It is safe to push/pull even if skb_shared() */ 102 /* It is safe to push/pull even if skb_shared() */
98 __skb_push(skb, skb->mac_len); 103 __skb_push(skb, skb->mac_len);
99 bpf_compute_data_end(skb); 104 bpf_compute_data_end(skb);
@@ -140,6 +145,91 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
140 return !prog->bpf_ops; 145 return !prog->bpf_ops;
141} 146}
142 147
148static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
149 enum tc_clsbpf_command cmd)
150{
151 struct net_device *dev = tp->q->dev_queue->dev;
152 struct tc_cls_bpf_offload bpf_offload = {};
153 struct tc_to_netdev offload;
154
155 offload.type = TC_SETUP_CLSBPF;
156 offload.cls_bpf = &bpf_offload;
157
158 bpf_offload.command = cmd;
159 bpf_offload.exts = &prog->exts;
160 bpf_offload.prog = prog->filter;
161 bpf_offload.name = prog->bpf_name;
162 bpf_offload.exts_integrated = prog->exts_integrated;
163 bpf_offload.gen_flags = prog->gen_flags;
164
165 return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
166 tp->protocol, &offload);
167}
168
169static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
170 struct cls_bpf_prog *oldprog)
171{
172 struct net_device *dev = tp->q->dev_queue->dev;
173 struct cls_bpf_prog *obj = prog;
174 enum tc_clsbpf_command cmd;
175 bool skip_sw;
176 int ret;
177
178 skip_sw = tc_skip_sw(prog->gen_flags) ||
179 (oldprog && tc_skip_sw(oldprog->gen_flags));
180
181 if (oldprog && oldprog->offloaded) {
182 if (tc_should_offload(dev, tp, prog->gen_flags)) {
183 cmd = TC_CLSBPF_REPLACE;
184 } else if (!tc_skip_sw(prog->gen_flags)) {
185 obj = oldprog;
186 cmd = TC_CLSBPF_DESTROY;
187 } else {
188 return -EINVAL;
189 }
190 } else {
191 if (!tc_should_offload(dev, tp, prog->gen_flags))
192 return skip_sw ? -EINVAL : 0;
193 cmd = TC_CLSBPF_ADD;
194 }
195
196 ret = cls_bpf_offload_cmd(tp, obj, cmd);
197 if (ret)
198 return skip_sw ? ret : 0;
199
200 obj->offloaded = true;
201 if (oldprog)
202 oldprog->offloaded = false;
203
204 return 0;
205}
206
207static void cls_bpf_stop_offload(struct tcf_proto *tp,
208 struct cls_bpf_prog *prog)
209{
210 int err;
211
212 if (!prog->offloaded)
213 return;
214
215 err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
216 if (err) {
217 pr_err("Stopping hardware offload failed: %d\n", err);
218 return;
219 }
220
221 prog->offloaded = false;
222}
223
224static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
225 struct cls_bpf_prog *prog)
226{
227 if (!prog->offloaded)
228 return;
229
230 cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS);
231}
232
143static int cls_bpf_init(struct tcf_proto *tp) 233static int cls_bpf_init(struct tcf_proto *tp)
144{ 234{
145 struct cls_bpf_head *head; 235 struct cls_bpf_head *head;
@@ -179,6 +269,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
179{ 269{
180 struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg; 270 struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg;
181 271
272 cls_bpf_stop_offload(tp, prog);
182 list_del_rcu(&prog->link); 273 list_del_rcu(&prog->link);
183 tcf_unbind_filter(tp, &prog->res); 274 tcf_unbind_filter(tp, &prog->res);
184 call_rcu(&prog->rcu, __cls_bpf_delete_prog); 275 call_rcu(&prog->rcu, __cls_bpf_delete_prog);
@@ -195,6 +286,7 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
195 return false; 286 return false;
196 287
197 list_for_each_entry_safe(prog, tmp, &head->plist, link) { 288 list_for_each_entry_safe(prog, tmp, &head->plist, link) {
289 cls_bpf_stop_offload(tp, prog);
198 list_del_rcu(&prog->link); 290 list_del_rcu(&prog->link);
199 tcf_unbind_filter(tp, &prog->res); 291 tcf_unbind_filter(tp, &prog->res);
200 call_rcu(&prog->rcu, __cls_bpf_delete_prog); 292 call_rcu(&prog->rcu, __cls_bpf_delete_prog);
@@ -304,6 +396,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
304{ 396{
305 bool is_bpf, is_ebpf, have_exts = false; 397 bool is_bpf, is_ebpf, have_exts = false;
306 struct tcf_exts exts; 398 struct tcf_exts exts;
399 u32 gen_flags = 0;
307 int ret; 400 int ret;
308 401
309 is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; 402 is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
@@ -311,30 +404,39 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
311 if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) 404 if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf))
312 return -EINVAL; 405 return -EINVAL;
313 406
314 tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); 407 ret = tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
315 ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
316 if (ret < 0) 408 if (ret < 0)
317 return ret; 409 return ret;
410 ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
411 if (ret < 0)
412 goto errout;
318 413
319 if (tb[TCA_BPF_FLAGS]) { 414 if (tb[TCA_BPF_FLAGS]) {
320 u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]); 415 u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);
321 416
322 if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) { 417 if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) {
323 tcf_exts_destroy(&exts); 418 ret = -EINVAL;
324 return -EINVAL; 419 goto errout;
325 } 420 }
326 421
327 have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; 422 have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
328 } 423 }
424 if (tb[TCA_BPF_FLAGS_GEN]) {
425 gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
426 if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS ||
427 !tc_flags_valid(gen_flags)) {
428 ret = -EINVAL;
429 goto errout;
430 }
431 }
329 432
330 prog->exts_integrated = have_exts; 433 prog->exts_integrated = have_exts;
434 prog->gen_flags = gen_flags;
331 435
332 ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : 436 ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
333 cls_bpf_prog_from_efd(tb, prog, tp); 437 cls_bpf_prog_from_efd(tb, prog, tp);
334 if (ret < 0) { 438 if (ret < 0)
335 tcf_exts_destroy(&exts); 439 goto errout;
336 return ret;
337 }
338 440
339 if (tb[TCA_BPF_CLASSID]) { 441 if (tb[TCA_BPF_CLASSID]) {
340 prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]); 442 prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
@@ -343,6 +445,10 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
343 445
344 tcf_exts_change(tp, &prog->exts, &exts); 446 tcf_exts_change(tp, &prog->exts, &exts);
345 return 0; 447 return 0;
448
449errout:
450 tcf_exts_destroy(&exts);
451 return ret;
346} 452}
347 453
348static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, 454static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
@@ -388,7 +494,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
388 if (!prog) 494 if (!prog)
389 return -ENOBUFS; 495 return -ENOBUFS;
390 496
391 tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); 497 ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
498 if (ret < 0)
499 goto errout;
392 500
393 if (oldprog) { 501 if (oldprog) {
394 if (handle && oldprog->handle != handle) { 502 if (handle && oldprog->handle != handle) {
@@ -406,10 +514,17 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
406 goto errout; 514 goto errout;
407 } 515 }
408 516
409 ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr); 517 ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE],
518 ovr);
410 if (ret < 0) 519 if (ret < 0)
411 goto errout; 520 goto errout;
412 521
522 ret = cls_bpf_offload(tp, prog, oldprog);
523 if (ret) {
524 cls_bpf_delete_prog(tp, prog);
525 return ret;
526 }
527
413 if (oldprog) { 528 if (oldprog) {
414 list_replace_rcu(&oldprog->link, &prog->link); 529 list_replace_rcu(&oldprog->link, &prog->link);
415 tcf_unbind_filter(tp, &oldprog->res); 530 tcf_unbind_filter(tp, &oldprog->res);
@@ -420,9 +535,10 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
420 535
421 *arg = (unsigned long) prog; 536 *arg = (unsigned long) prog;
422 return 0; 537 return 0;
538
423errout: 539errout:
540 tcf_exts_destroy(&prog->exts);
424 kfree(prog); 541 kfree(prog);
425
426 return ret; 542 return ret;
427} 543}
428 544
@@ -470,6 +586,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
470 586
471 tm->tcm_handle = prog->handle; 587 tm->tcm_handle = prog->handle;
472 588
589 cls_bpf_offload_update_stats(tp, prog);
590
473 nest = nla_nest_start(skb, TCA_OPTIONS); 591 nest = nla_nest_start(skb, TCA_OPTIONS);
474 if (nest == NULL) 592 if (nest == NULL)
475 goto nla_put_failure; 593 goto nla_put_failure;
@@ -492,6 +610,9 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
492 bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; 610 bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
493 if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags)) 611 if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags))
494 goto nla_put_failure; 612 goto nla_put_failure;
613 if (prog->gen_flags &&
614 nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags))
615 goto nla_put_failure;
495 616
496 nla_nest_end(skb, nest); 617 nla_nest_end(skb, nest);
497 618
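
For context on the cls_bpf hunks above: filters are now offered to hardware through ndo_setup_tc() with the new TC_SETUP_CLSBPF type and a struct tc_cls_bpf_offload carrying the command, the program and its gen_flags. A purely illustrative driver-side dispatcher is sketched below; the foo_* helpers are hypothetical, only the TC_CLSBPF_* commands and the tc_cls_bpf_offload fields come from the patch itself.

static int foo_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
			struct tc_to_netdev *tc)
{
	struct tc_cls_bpf_offload *cls_bpf = tc->cls_bpf;

	if (tc->type != TC_SETUP_CLSBPF)
		return -EOPNOTSUPP;

	switch (cls_bpf->command) {
	case TC_CLSBPF_ADD:		/* first offload of this filter */
		return foo_bpf_add(dev, cls_bpf->prog, cls_bpf->gen_flags);
	case TC_CLSBPF_REPLACE:		/* swap in the new program */
		return foo_bpf_replace(dev, cls_bpf->prog);
	case TC_CLSBPF_DESTROY:		/* filter deleted or no longer offloadable */
		return foo_bpf_destroy(dev);
	case TC_CLSBPF_STATS:		/* refresh counters in cls_bpf->exts for dump */
		return foo_bpf_stats(dev, cls_bpf->exts);
	default:
		return -EOPNOTSUPP;
	}
}
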
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 4c85bd3a750c..85233c470035 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -93,7 +93,9 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 	if (!new)
 		return -ENOBUFS;
 
-	tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+	err = tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+	if (err < 0)
+		goto errout;
 	new->handle = handle;
 	new->tp = tp;
 	err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
@@ -101,10 +103,14 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		goto errout;
 
-	tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+	err = tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
 	if (err < 0)
 		goto errout;
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+	if (err < 0) {
+		tcf_exts_destroy(&e);
+		goto errout;
+	}
 
 	err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t);
 	if (err < 0) {
@@ -120,6 +126,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 	call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
 	return 0;
 errout:
+	tcf_exts_destroy(&new->exts);
 	kfree(new);
 	return err;
 }
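
The cls_cgroup hunk follows the conversion applied across all the classifiers in this merge: tcf_exts_init() can now fail and returns a negative errno, so callers check it and release the extensions with tcf_exts_destroy() on any later error. In outline (TCA_FOO_* is a placeholder attribute set, not a real one):

	err = tcf_exts_init(&e, TCA_FOO_ACT, TCA_FOO_POLICE);
	if (err < 0)
		return err;
	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
	if (err < 0)
		goto errout;
	/* ... use the validated extensions ... */
errout:
	tcf_exts_destroy(&e);
	return err;
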
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index fbfec6a18839..e39672394c7b 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -29,7 +29,7 @@
 #include <net/route.h>
 #include <net/flow_dissector.h>
 
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <net/netfilter/nf_conntrack.h>
 #endif
 
@@ -87,12 +87,14 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
 	return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
 }
 
-static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto(const struct sk_buff *skb,
+			  const struct flow_keys *flow)
 {
 	return flow->basic.ip_proto;
 }
 
-static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto_src(const struct sk_buff *skb,
+			      const struct flow_keys *flow)
 {
 	if (flow->ports.ports)
 		return ntohs(flow->ports.src);
@@ -100,7 +102,8 @@ static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys
 	return addr_fold(skb->sk);
 }
 
-static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto_dst(const struct sk_buff *skb,
+			      const struct flow_keys *flow)
 {
 	if (flow->ports.ports)
 		return ntohs(flow->ports.dst);
@@ -125,14 +128,14 @@ static u32 flow_get_mark(const struct sk_buff *skb)
 
 static u32 flow_get_nfct(const struct sk_buff *skb)
 {
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	return addr_fold(skb->nfct);
 #else
 	return 0;
 #endif
 }
 
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #define CTTUPLE(skb, member) \
 ({ \
 	enum ip_conntrack_info ctinfo; \
@@ -149,7 +152,8 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
 })
 #endif
 
-static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_src(const struct sk_buff *skb,
+			     const struct flow_keys *flow)
 {
 	switch (tc_skb_protocol(skb)) {
 	case htons(ETH_P_IP):
@@ -161,7 +165,8 @@ fallback:
 	return flow_get_src(skb, flow);
 }
 
-static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_dst(const struct sk_buff *skb,
+			     const struct flow_keys *flow)
 {
 	switch (tc_skb_protocol(skb)) {
 	case htons(ETH_P_IP):
@@ -173,14 +178,16 @@ fallback:
 	return flow_get_dst(skb, flow);
 }
 
-static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_proto_src(const struct sk_buff *skb,
+				   const struct flow_keys *flow)
 {
 	return ntohs(CTTUPLE(skb, src.u.all));
 fallback:
 	return flow_get_proto_src(skb, flow);
 }
 
-static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb,
+				   const struct flow_keys *flow)
 {
 	return ntohs(CTTUPLE(skb, dst.u.all));
 fallback:
@@ -418,10 +425,12 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 			return -EOPNOTSUPP;
 	}
 
-	tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+	err = tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+	if (err < 0)
+		goto err1;
 	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
 	if (err < 0)
-		return err;
+		goto err1;
 
 	err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t);
 	if (err < 0)
@@ -432,13 +441,15 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 	if (!fnew)
 		goto err2;
 
-	tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+	err = tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+	if (err < 0)
+		goto err3;
 
 	fold = (struct flow_filter *)*arg;
 	if (fold) {
 		err = -EINVAL;
 		if (fold->handle != handle && handle)
-			goto err2;
+			goto err3;
 
 		/* Copy fold into fnew */
 		fnew->tp = fold->tp;
@@ -458,31 +469,31 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 		if (tb[TCA_FLOW_MODE])
 			mode = nla_get_u32(tb[TCA_FLOW_MODE]);
 		if (mode != FLOW_MODE_HASH && nkeys > 1)
-			goto err2;
+			goto err3;
 
 		if (mode == FLOW_MODE_HASH)
 			perturb_period = fold->perturb_period;
 		if (tb[TCA_FLOW_PERTURB]) {
 			if (mode != FLOW_MODE_HASH)
-				goto err2;
+				goto err3;
 			perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
 		}
 	} else {
 		err = -EINVAL;
 		if (!handle)
-			goto err2;
+			goto err3;
 		if (!tb[TCA_FLOW_KEYS])
-			goto err2;
+			goto err3;
 
 		mode = FLOW_MODE_MAP;
 		if (tb[TCA_FLOW_MODE])
 			mode = nla_get_u32(tb[TCA_FLOW_MODE]);
 		if (mode != FLOW_MODE_HASH && nkeys > 1)
-			goto err2;
+			goto err3;
 
 		if (tb[TCA_FLOW_PERTURB]) {
 			if (mode != FLOW_MODE_HASH)
-				goto err2;
+				goto err3;
 			perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
 		}
 
@@ -542,6 +553,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 		call_rcu(&fold->rcu, flow_destroy_filter);
 	return 0;
 
+err3:
+	tcf_exts_destroy(&fnew->exts);
 err2:
 	tcf_em_tree_destroy(&t);
 	kfree(fnew);
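
For reference, IS_ENABLED(CONFIG_NF_CONNTRACK), used in the cls_flow hunks above, evaluates to 1 when the option is either built in (=y) or built as a module (=m), so it is an exact, shorter replacement for the two-clause test it supersedes:

	#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)	/* old form */
	#if IS_ENABLED(CONFIG_NF_CONNTRACK)						/* equivalent new form */
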
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 5060801a2f6d..f6f40fba599b 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -23,17 +23,26 @@
23#include <net/ip.h> 23#include <net/ip.h>
24#include <net/flow_dissector.h> 24#include <net/flow_dissector.h>
25 25
26#include <net/dst.h>
27#include <net/dst_metadata.h>
28
26struct fl_flow_key { 29struct fl_flow_key {
27 int indev_ifindex; 30 int indev_ifindex;
28 struct flow_dissector_key_control control; 31 struct flow_dissector_key_control control;
32 struct flow_dissector_key_control enc_control;
29 struct flow_dissector_key_basic basic; 33 struct flow_dissector_key_basic basic;
30 struct flow_dissector_key_eth_addrs eth; 34 struct flow_dissector_key_eth_addrs eth;
31 struct flow_dissector_key_addrs ipaddrs; 35 struct flow_dissector_key_vlan vlan;
32 union { 36 union {
33 struct flow_dissector_key_ipv4_addrs ipv4; 37 struct flow_dissector_key_ipv4_addrs ipv4;
34 struct flow_dissector_key_ipv6_addrs ipv6; 38 struct flow_dissector_key_ipv6_addrs ipv6;
35 }; 39 };
36 struct flow_dissector_key_ports tp; 40 struct flow_dissector_key_ports tp;
41 struct flow_dissector_key_keyid enc_key_id;
42 union {
43 struct flow_dissector_key_ipv4_addrs enc_ipv4;
44 struct flow_dissector_key_ipv6_addrs enc_ipv6;
45 };
37} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ 46} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
38 47
39struct fl_flow_mask_range { 48struct fl_flow_mask_range {
@@ -123,11 +132,31 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
123 struct cls_fl_filter *f; 132 struct cls_fl_filter *f;
124 struct fl_flow_key skb_key; 133 struct fl_flow_key skb_key;
125 struct fl_flow_key skb_mkey; 134 struct fl_flow_key skb_mkey;
135 struct ip_tunnel_info *info;
126 136
127 if (!atomic_read(&head->ht.nelems)) 137 if (!atomic_read(&head->ht.nelems))
128 return -1; 138 return -1;
129 139
130 fl_clear_masked_range(&skb_key, &head->mask); 140 fl_clear_masked_range(&skb_key, &head->mask);
141
142 info = skb_tunnel_info(skb);
143 if (info) {
144 struct ip_tunnel_key *key = &info->key;
145
146 switch (ip_tunnel_info_af(info)) {
147 case AF_INET:
148 skb_key.enc_ipv4.src = key->u.ipv4.src;
149 skb_key.enc_ipv4.dst = key->u.ipv4.dst;
150 break;
151 case AF_INET6:
152 skb_key.enc_ipv6.src = key->u.ipv6.src;
153 skb_key.enc_ipv6.dst = key->u.ipv6.dst;
154 break;
155 }
156
157 skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id);
158 }
159
131 skb_key.indev_ifindex = skb->skb_iif; 160 skb_key.indev_ifindex = skb->skb_iif;
132 /* skb_flow_dissect() does not set n_proto in case an unknown protocol, 161 /* skb_flow_dissect() does not set n_proto in case an unknown protocol,
133 * so do it rather here. 162 * so do it rather here.
@@ -212,7 +241,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
212 tc.type = TC_SETUP_CLSFLOWER; 241 tc.type = TC_SETUP_CLSFLOWER;
213 tc.cls_flower = &offload; 242 tc.cls_flower = &offload;
214 243
215 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); 244 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
245 &tc);
216 246
217 if (tc_skip_sw(flags)) 247 if (tc_skip_sw(flags))
218 return err; 248 return err;
@@ -293,6 +323,22 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
293 [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 }, 323 [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 },
294 [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 }, 324 [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 },
295 [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 }, 325 [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 },
326 [TCA_FLOWER_KEY_VLAN_ID] = { .type = NLA_U16 },
327 [TCA_FLOWER_KEY_VLAN_PRIO] = { .type = NLA_U8 },
328 [TCA_FLOWER_KEY_VLAN_ETH_TYPE] = { .type = NLA_U16 },
329 [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
330 [TCA_FLOWER_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
331 [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 },
332 [TCA_FLOWER_KEY_ENC_IPV4_DST] = { .type = NLA_U32 },
333 [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 },
334 [TCA_FLOWER_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
335 [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) },
336 [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) },
337 [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) },
338 [TCA_FLOWER_KEY_TCP_SRC_MASK] = { .type = NLA_U16 },
339 [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 },
340 [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 },
341 [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 },
296}; 342};
297 343
298static void fl_set_key_val(struct nlattr **tb, 344static void fl_set_key_val(struct nlattr **tb,
@@ -308,9 +354,29 @@ static void fl_set_key_val(struct nlattr **tb,
308 memcpy(mask, nla_data(tb[mask_type]), len); 354 memcpy(mask, nla_data(tb[mask_type]), len);
309} 355}
310 356
357static void fl_set_key_vlan(struct nlattr **tb,
358 struct flow_dissector_key_vlan *key_val,
359 struct flow_dissector_key_vlan *key_mask)
360{
361#define VLAN_PRIORITY_MASK 0x7
362
363 if (tb[TCA_FLOWER_KEY_VLAN_ID]) {
364 key_val->vlan_id =
365 nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK;
366 key_mask->vlan_id = VLAN_VID_MASK;
367 }
368 if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) {
369 key_val->vlan_priority =
370 nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) &
371 VLAN_PRIORITY_MASK;
372 key_mask->vlan_priority = VLAN_PRIORITY_MASK;
373 }
374}
375
311static int fl_set_key(struct net *net, struct nlattr **tb, 376static int fl_set_key(struct net *net, struct nlattr **tb,
312 struct fl_flow_key *key, struct fl_flow_key *mask) 377 struct fl_flow_key *key, struct fl_flow_key *mask)
313{ 378{
379 __be16 ethertype;
314#ifdef CONFIG_NET_CLS_IND 380#ifdef CONFIG_NET_CLS_IND
315 if (tb[TCA_FLOWER_INDEV]) { 381 if (tb[TCA_FLOWER_INDEV]) {
316 int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); 382 int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]);
@@ -328,9 +394,20 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
328 mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, 394 mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK,
329 sizeof(key->eth.src)); 395 sizeof(key->eth.src));
330 396
331 fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, 397 if (tb[TCA_FLOWER_KEY_ETH_TYPE]) {
332 &mask->basic.n_proto, TCA_FLOWER_UNSPEC, 398 ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
333 sizeof(key->basic.n_proto)); 399
400 if (ethertype == htons(ETH_P_8021Q)) {
401 fl_set_key_vlan(tb, &key->vlan, &mask->vlan);
402 fl_set_key_val(tb, &key->basic.n_proto,
403 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
404 &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
405 sizeof(key->basic.n_proto));
406 } else {
407 key->basic.n_proto = ethertype;
408 mask->basic.n_proto = cpu_to_be16(~0);
409 }
410 }
334 411
335 if (key->basic.n_proto == htons(ETH_P_IP) || 412 if (key->basic.n_proto == htons(ETH_P_IP) ||
336 key->basic.n_proto == htons(ETH_P_IPV6)) { 413 key->basic.n_proto == htons(ETH_P_IPV6)) {
@@ -359,20 +436,54 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
359 436
360 if (key->basic.ip_proto == IPPROTO_TCP) { 437 if (key->basic.ip_proto == IPPROTO_TCP) {
361 fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, 438 fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
362 &mask->tp.src, TCA_FLOWER_UNSPEC, 439 &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
363 sizeof(key->tp.src)); 440 sizeof(key->tp.src));
364 fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, 441 fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
365 &mask->tp.dst, TCA_FLOWER_UNSPEC, 442 &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
366 sizeof(key->tp.dst)); 443 sizeof(key->tp.dst));
367 } else if (key->basic.ip_proto == IPPROTO_UDP) { 444 } else if (key->basic.ip_proto == IPPROTO_UDP) {
368 fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, 445 fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
369 &mask->tp.src, TCA_FLOWER_UNSPEC, 446 &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
370 sizeof(key->tp.src)); 447 sizeof(key->tp.src));
371 fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, 448 fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
372 &mask->tp.dst, TCA_FLOWER_UNSPEC, 449 &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
373 sizeof(key->tp.dst)); 450 sizeof(key->tp.dst));
374 } 451 }
375 452
453 if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
454 tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
455 key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
456 fl_set_key_val(tb, &key->enc_ipv4.src,
457 TCA_FLOWER_KEY_ENC_IPV4_SRC,
458 &mask->enc_ipv4.src,
459 TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
460 sizeof(key->enc_ipv4.src));
461 fl_set_key_val(tb, &key->enc_ipv4.dst,
462 TCA_FLOWER_KEY_ENC_IPV4_DST,
463 &mask->enc_ipv4.dst,
464 TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
465 sizeof(key->enc_ipv4.dst));
466 }
467
468 if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] ||
469 tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) {
470 key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
471 fl_set_key_val(tb, &key->enc_ipv6.src,
472 TCA_FLOWER_KEY_ENC_IPV6_SRC,
473 &mask->enc_ipv6.src,
474 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
475 sizeof(key->enc_ipv6.src));
476 fl_set_key_val(tb, &key->enc_ipv6.dst,
477 TCA_FLOWER_KEY_ENC_IPV6_DST,
478 &mask->enc_ipv6.dst,
479 TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
480 sizeof(key->enc_ipv6.dst));
481 }
482
483 fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID,
484 &mask->enc_key_id.keyid, TCA_FLOWER_UNSPEC,
485 sizeof(key->enc_key_id.keyid));
486
376 return 0; 487 return 0;
377} 488}
378 489
@@ -404,12 +515,10 @@ static int fl_init_hashtable(struct cls_fl_head *head,
404 515
405#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) 516#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
406#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) 517#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member))
407#define FL_KEY_MEMBER_END_OFFSET(member) \
408 (FL_KEY_MEMBER_OFFSET(member) + FL_KEY_MEMBER_SIZE(member))
409 518
410#define FL_KEY_IN_RANGE(mask, member) \ 519#define FL_KEY_IS_MASKED(mask, member) \
411 (FL_KEY_MEMBER_OFFSET(member) <= (mask)->range.end && \ 520 memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \
412 FL_KEY_MEMBER_END_OFFSET(member) >= (mask)->range.start) 521 0, FL_KEY_MEMBER_SIZE(member)) \
413 522
414#define FL_KEY_SET(keys, cnt, id, member) \ 523#define FL_KEY_SET(keys, cnt, id, member) \
415 do { \ 524 do { \
@@ -418,9 +527,9 @@ static int fl_init_hashtable(struct cls_fl_head *head,
418 cnt++; \ 527 cnt++; \
419 } while(0); 528 } while(0);
420 529
421#define FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, id, member) \ 530#define FL_KEY_SET_IF_MASKED(mask, keys, cnt, id, member) \
422 do { \ 531 do { \
423 if (FL_KEY_IN_RANGE(mask, member)) \ 532 if (FL_KEY_IS_MASKED(mask, member)) \
424 FL_KEY_SET(keys, cnt, id, member); \ 533 FL_KEY_SET(keys, cnt, id, member); \
425 } while(0); 534 } while(0);
426 535
@@ -432,14 +541,16 @@ static void fl_init_dissector(struct cls_fl_head *head,
432 541
433 FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); 542 FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
434 FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); 543 FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
435 FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, 544 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
436 FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); 545 FLOW_DISSECTOR_KEY_ETH_ADDRS, eth);
437 FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, 546 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
438 FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); 547 FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
439 FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, 548 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
440 FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); 549 FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
441 FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, 550 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
442 FLOW_DISSECTOR_KEY_PORTS, tp); 551 FLOW_DISSECTOR_KEY_PORTS, tp);
552 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
553 FLOW_DISSECTOR_KEY_VLAN, vlan);
443 554
444 skb_flow_dissector_init(&head->dissector, keys, cnt); 555 skb_flow_dissector_init(&head->dissector, keys, cnt);
445} 556}
@@ -478,10 +589,12 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
478 struct tcf_exts e; 589 struct tcf_exts e;
479 int err; 590 int err;
480 591
481 tcf_exts_init(&e, TCA_FLOWER_ACT, 0); 592 err = tcf_exts_init(&e, TCA_FLOWER_ACT, 0);
482 err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
483 if (err < 0) 593 if (err < 0)
484 return err; 594 return err;
595 err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
596 if (err < 0)
597 goto errout;
485 598
486 if (tb[TCA_FLOWER_CLASSID]) { 599 if (tb[TCA_FLOWER_CLASSID]) {
487 f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); 600 f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]);
@@ -550,7 +663,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
550 if (!fnew) 663 if (!fnew)
551 return -ENOBUFS; 664 return -ENOBUFS;
552 665
553 tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); 666 err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
667 if (err < 0)
668 goto errout;
554 669
555 if (!handle) { 670 if (!handle) {
556 handle = fl_grab_new_handle(tp, head); 671 handle = fl_grab_new_handle(tp, head);
@@ -614,6 +729,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
614 return 0; 729 return 0;
615 730
616errout: 731errout:
732 tcf_exts_destroy(&fnew->exts);
617 kfree(fnew); 733 kfree(fnew);
618 return err; 734 return err;
619} 735}
@@ -668,6 +784,29 @@ static int fl_dump_key_val(struct sk_buff *skb,
668 return 0; 784 return 0;
669} 785}
670 786
787static int fl_dump_key_vlan(struct sk_buff *skb,
788 struct flow_dissector_key_vlan *vlan_key,
789 struct flow_dissector_key_vlan *vlan_mask)
790{
791 int err;
792
793 if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask)))
794 return 0;
795 if (vlan_mask->vlan_id) {
796 err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID,
797 vlan_key->vlan_id);
798 if (err)
799 return err;
800 }
801 if (vlan_mask->vlan_priority) {
802 err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO,
803 vlan_key->vlan_priority);
804 if (err)
805 return err;
806 }
807 return 0;
808}
809
671static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, 810static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
672 struct sk_buff *skb, struct tcmsg *t) 811 struct sk_buff *skb, struct tcmsg *t)
673{ 812{
@@ -712,6 +851,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
712 &mask->basic.n_proto, TCA_FLOWER_UNSPEC, 851 &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
713 sizeof(key->basic.n_proto))) 852 sizeof(key->basic.n_proto)))
714 goto nla_put_failure; 853 goto nla_put_failure;
854
855 if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan))
856 goto nla_put_failure;
857
715 if ((key->basic.n_proto == htons(ETH_P_IP) || 858 if ((key->basic.n_proto == htons(ETH_P_IP) ||
716 key->basic.n_proto == htons(ETH_P_IPV6)) && 859 key->basic.n_proto == htons(ETH_P_IPV6)) &&
717 fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, 860 fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
@@ -738,21 +881,48 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
738 881
739 if (key->basic.ip_proto == IPPROTO_TCP && 882 if (key->basic.ip_proto == IPPROTO_TCP &&
740 (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, 883 (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
741 &mask->tp.src, TCA_FLOWER_UNSPEC, 884 &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
742 sizeof(key->tp.src)) || 885 sizeof(key->tp.src)) ||
743 fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, 886 fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
744 &mask->tp.dst, TCA_FLOWER_UNSPEC, 887 &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
745 sizeof(key->tp.dst)))) 888 sizeof(key->tp.dst))))
746 goto nla_put_failure; 889 goto nla_put_failure;
747 else if (key->basic.ip_proto == IPPROTO_UDP && 890 else if (key->basic.ip_proto == IPPROTO_UDP &&
748 (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, 891 (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
749 &mask->tp.src, TCA_FLOWER_UNSPEC, 892 &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
750 sizeof(key->tp.src)) || 893 sizeof(key->tp.src)) ||
751 fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, 894 fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
752 &mask->tp.dst, TCA_FLOWER_UNSPEC, 895 &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
753 sizeof(key->tp.dst)))) 896 sizeof(key->tp.dst))))
754 goto nla_put_failure; 897 goto nla_put_failure;
755 898
899 if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
900 (fl_dump_key_val(skb, &key->enc_ipv4.src,
901 TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src,
902 TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
903 sizeof(key->enc_ipv4.src)) ||
904 fl_dump_key_val(skb, &key->enc_ipv4.dst,
905 TCA_FLOWER_KEY_ENC_IPV4_DST, &mask->enc_ipv4.dst,
906 TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
907 sizeof(key->enc_ipv4.dst))))
908 goto nla_put_failure;
909 else if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS &&
910 (fl_dump_key_val(skb, &key->enc_ipv6.src,
911 TCA_FLOWER_KEY_ENC_IPV6_SRC, &mask->enc_ipv6.src,
912 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
913 sizeof(key->enc_ipv6.src)) ||
914 fl_dump_key_val(skb, &key->enc_ipv6.dst,
915 TCA_FLOWER_KEY_ENC_IPV6_DST,
916 &mask->enc_ipv6.dst,
917 TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
918 sizeof(key->enc_ipv6.dst))))
919 goto nla_put_failure;
920
921 if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
922 &mask->enc_key_id, TCA_FLOWER_UNSPEC,
923 sizeof(key->enc_key_id)))
924 goto nla_put_failure;
925
756 nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); 926 nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags);
757 927
758 if (tcf_exts_dump(skb, &f->exts)) 928 if (tcf_exts_dump(skb, &f->exts))
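
One detail worth spelling out from the cls_flower hunks above: fl_init_dissector() now decides whether to register a dissector key by checking, via memchr_inv(), whether any bit of the corresponding member in the mask is set, instead of the old offset-range test. An illustrative expansion of FL_KEY_IS_MASKED() for the new vlan member:

	/* memchr_inv() returns non-NULL iff some byte of mask->key.vlan is
	 * non-zero, i.e. the filter really matches on VLAN id or priority. */
	if (memchr_inv((char *)&mask->key + offsetof(struct fl_flow_key, vlan),
		       0, sizeof(((struct fl_flow_key *)0)->vlan)))
		FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan);
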
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index f23a3b68bba6..9dc63d54e167 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -57,7 +57,7 @@ static u32 fw_hash(u32 handle)
 }
 
 static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-			  struct tcf_result *res)
+		       struct tcf_result *res)
 {
 	struct fw_head *head = rcu_dereference_bh(tp->root);
 	struct fw_filter *f;
@@ -188,17 +188,20 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
 
 static int
 fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
-		struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr)
+		struct nlattr **tb, struct nlattr **tca, unsigned long base,
+		bool ovr)
 {
 	struct fw_head *head = rtnl_dereference(tp->root);
 	struct tcf_exts e;
 	u32 mask;
 	int err;
 
-	tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE);
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+	err = tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE);
 	if (err < 0)
 		return err;
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+	if (err < 0)
+		goto errout;
 
 	if (tb[TCA_FW_CLASSID]) {
 		f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
@@ -235,9 +238,8 @@ errout:
 
 static int fw_change(struct net *net, struct sk_buff *in_skb,
 		     struct tcf_proto *tp, unsigned long base,
-		     u32 handle,
-		     struct nlattr **tca,
-		     unsigned long *arg, bool ovr)
+		     u32 handle, struct nlattr **tca, unsigned long *arg,
+		     bool ovr)
 {
 	struct fw_head *head = rtnl_dereference(tp->root);
 	struct fw_filter *f = (struct fw_filter *) *arg;
@@ -270,10 +272,15 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 #endif /* CONFIG_NET_CLS_IND */
 	fnew->tp = f->tp;
 
-	tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE);
+	err = tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE);
+	if (err < 0) {
+		kfree(fnew);
+		return err;
+	}
 
 	err = fw_change_attrs(net, tp, fnew, tb, tca, base, ovr);
 	if (err < 0) {
+		tcf_exts_destroy(&fnew->exts);
 		kfree(fnew);
 		return err;
 	}
@@ -313,7 +320,9 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 	if (f == NULL)
 		return -ENOBUFS;
 
-	tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
+	err = tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
+	if (err < 0)
+		goto errout;
 	f->id = handle;
 	f->tp = tp;
 
@@ -328,6 +337,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 	return 0;
 
 errout:
+	tcf_exts_destroy(&f->exts);
 	kfree(f);
 	return err;
 }
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 08a3b0a6f5ab..455fc8f83d0a 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -268,8 +268,7 @@ static int route4_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static void
-route4_delete_filter(struct rcu_head *head)
+static void route4_delete_filter(struct rcu_head *head)
 {
 	struct route4_filter *f = container_of(head, struct route4_filter, rcu);
 
@@ -383,17 +382,19 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
 			    struct nlattr **tb, struct nlattr *est, int new,
 			    bool ovr)
 {
-	int err;
 	u32 id = 0, to = 0, nhandle = 0x8000;
 	struct route4_filter *fp;
 	unsigned int h1;
 	struct route4_bucket *b;
 	struct tcf_exts e;
+	int err;
 
-	tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
-	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+	err = tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
 	if (err < 0)
 		return err;
+	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+	if (err < 0)
+		goto errout;
 
 	err = -EINVAL;
 	if (tb[TCA_ROUTE4_TO]) {
@@ -472,10 +473,8 @@ errout:
 }
 
 static int route4_change(struct net *net, struct sk_buff *in_skb,
-			 struct tcf_proto *tp, unsigned long base,
-			 u32 handle,
-			 struct nlattr **tca,
-			 unsigned long *arg, bool ovr)
+			 struct tcf_proto *tp, unsigned long base, u32 handle,
+			 struct nlattr **tca, unsigned long *arg, bool ovr)
 {
 	struct route4_head *head = rtnl_dereference(tp->root);
 	struct route4_filter __rcu **fp;
@@ -503,7 +502,10 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
 	if (!f)
 		goto errout;
 
-	tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+	err = tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+	if (err < 0)
+		goto errout;
+
 	if (fold) {
 		f->id = fold->id;
 		f->iif = fold->iif;
@@ -557,6 +559,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
 	return 0;
 
 errout:
+	if (f)
+		tcf_exts_destroy(&f->exts);
 	kfree(f);
 	return err;
 }
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index f9c9fc075fe6..4f05a19fb073 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -487,10 +487,12 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
-	tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+	err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
 	if (err < 0)
 		return err;
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+	if (err < 0)
+		goto errout2;
 
 	f = (struct rsvp_filter *)*arg;
 	if (f) {
@@ -506,7 +508,11 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
 			goto errout2;
 		}
 
-		tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+		err = tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+		if (err < 0) {
+			kfree(n);
+			goto errout2;
+		}
 
 		if (tb[TCA_RSVP_CLASSID]) {
 			n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
@@ -530,7 +536,9 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
 	if (f == NULL)
 		goto errout2;
 
-	tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+	err = tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+	if (err < 0)
+		goto errout;
 	h2 = 16;
 	if (tb[TCA_RSVP_SRC]) {
 		memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
@@ -627,6 +635,7 @@ insert:
 	goto insert;
 
 errout:
+	tcf_exts_destroy(&f->exts);
 	kfree(f);
 errout2:
 	tcf_exts_destroy(&e);
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 944c8ff45055..96144bdf30db 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -50,14 +50,13 @@ struct tcindex_data {
50 struct rcu_head rcu; 50 struct rcu_head rcu;
51}; 51};
52 52
53static inline int 53static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
54tcindex_filter_is_set(struct tcindex_filter_result *r)
55{ 54{
56 return tcf_exts_is_predicative(&r->exts) || r->res.classid; 55 return tcf_exts_is_predicative(&r->exts) || r->res.classid;
57} 56}
58 57
59static struct tcindex_filter_result * 58static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
60tcindex_lookup(struct tcindex_data *p, u16 key) 59 u16 key)
61{ 60{
62 if (p->perfect) { 61 if (p->perfect) {
63 struct tcindex_filter_result *f = p->perfect + key; 62 struct tcindex_filter_result *f = p->perfect + key;
@@ -144,7 +143,8 @@ static void tcindex_destroy_rexts(struct rcu_head *head)
144 143
145static void tcindex_destroy_fexts(struct rcu_head *head) 144static void tcindex_destroy_fexts(struct rcu_head *head)
146{ 145{
147 struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu); 146 struct tcindex_filter *f = container_of(head, struct tcindex_filter,
147 rcu);
148 148
149 tcf_exts_destroy(&f->result.exts); 149 tcf_exts_destroy(&f->result.exts);
150 kfree(f); 150 kfree(f);
@@ -219,10 +219,10 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
219 [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, 219 [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
220}; 220};
221 221
222static void tcindex_filter_result_init(struct tcindex_filter_result *r) 222static int tcindex_filter_result_init(struct tcindex_filter_result *r)
223{ 223{
224 memset(r, 0, sizeof(*r)); 224 memset(r, 0, sizeof(*r));
225 tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); 225 return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
226} 226}
227 227
228static void __tcindex_partial_destroy(struct rcu_head *head) 228static void __tcindex_partial_destroy(struct rcu_head *head)
@@ -233,23 +233,57 @@ static void __tcindex_partial_destroy(struct rcu_head *head)
233 kfree(p); 233 kfree(p);
234} 234}
235 235
236static void tcindex_free_perfect_hash(struct tcindex_data *cp)
237{
238 int i;
239
240 for (i = 0; i < cp->hash; i++)
241 tcf_exts_destroy(&cp->perfect[i].exts);
242 kfree(cp->perfect);
243}
244
245static int tcindex_alloc_perfect_hash(struct tcindex_data *cp)
246{
247 int i, err = 0;
248
249 cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result),
250 GFP_KERNEL);
251 if (!cp->perfect)
252 return -ENOMEM;
253
254 for (i = 0; i < cp->hash; i++) {
255 err = tcf_exts_init(&cp->perfect[i].exts,
256 TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
257 if (err < 0)
258 goto errout;
259 }
260
261 return 0;
262
263errout:
264 tcindex_free_perfect_hash(cp);
265 return err;
266}
267
236static int 268static int
237tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, 269tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
238 u32 handle, struct tcindex_data *p, 270 u32 handle, struct tcindex_data *p,
239 struct tcindex_filter_result *r, struct nlattr **tb, 271 struct tcindex_filter_result *r, struct nlattr **tb,
240 struct nlattr *est, bool ovr) 272 struct nlattr *est, bool ovr)
241{ 273{
242 int err, balloc = 0;
243 struct tcindex_filter_result new_filter_result, *old_r = r; 274 struct tcindex_filter_result new_filter_result, *old_r = r;
244 struct tcindex_filter_result cr; 275 struct tcindex_filter_result cr;
245 struct tcindex_data *cp, *oldp; 276 struct tcindex_data *cp = NULL, *oldp;
246 struct tcindex_filter *f = NULL; /* make gcc behave */ 277 struct tcindex_filter *f = NULL; /* make gcc behave */
278 int err, balloc = 0;
247 struct tcf_exts e; 279 struct tcf_exts e;
248 280
249 tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); 281 err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
250 err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
251 if (err < 0) 282 if (err < 0)
252 return err; 283 return err;
284 err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
285 if (err < 0)
286 goto errout;
253 287
254 err = -ENOMEM; 288 err = -ENOMEM;
255 /* tcindex_data attributes must look atomic to classifier/lookup so 289 /* tcindex_data attributes must look atomic to classifier/lookup so
@@ -270,19 +304,20 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
270 if (p->perfect) { 304 if (p->perfect) {
271 int i; 305 int i;
272 306
273 cp->perfect = kmemdup(p->perfect, 307 if (tcindex_alloc_perfect_hash(cp) < 0)
274 sizeof(*r) * cp->hash, GFP_KERNEL);
275 if (!cp->perfect)
276 goto errout; 308 goto errout;
277 for (i = 0; i < cp->hash; i++) 309 for (i = 0; i < cp->hash; i++)
278 tcf_exts_init(&cp->perfect[i].exts, 310 cp->perfect[i].res = p->perfect[i].res;
279 TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
280 balloc = 1; 311 balloc = 1;
281 } 312 }
282 cp->h = p->h; 313 cp->h = p->h;
283 314
284 tcindex_filter_result_init(&new_filter_result); 315 err = tcindex_filter_result_init(&new_filter_result);
285 tcindex_filter_result_init(&cr); 316 if (err < 0)
317 goto errout1;
318 err = tcindex_filter_result_init(&cr);
319 if (err < 0)
320 goto errout1;
286 if (old_r) 321 if (old_r)
287 cr.res = r->res; 322 cr.res = r->res;
288 323
@@ -338,15 +373,8 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
338 err = -ENOMEM; 373 err = -ENOMEM;
339 if (!cp->perfect && !cp->h) { 374 if (!cp->perfect && !cp->h) {
340 if (valid_perfect_hash(cp)) { 375 if (valid_perfect_hash(cp)) {
341 int i; 376 if (tcindex_alloc_perfect_hash(cp) < 0)
342
343 cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL);
344 if (!cp->perfect)
345 goto errout_alloc; 377 goto errout_alloc;
346 for (i = 0; i < cp->hash; i++)
347 tcf_exts_init(&cp->perfect[i].exts,
348 TCA_TCINDEX_ACT,
349 TCA_TCINDEX_POLICE);
350 balloc = 1; 378 balloc = 1;
351 } else { 379 } else {
352 struct tcindex_filter __rcu **hash; 380 struct tcindex_filter __rcu **hash;
@@ -373,8 +401,12 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
373 if (!f) 401 if (!f)
374 goto errout_alloc; 402 goto errout_alloc;
375 f->key = handle; 403 f->key = handle;
376 tcindex_filter_result_init(&f->result);
377 f->next = NULL; 404 f->next = NULL;
405 err = tcindex_filter_result_init(&f->result);
406 if (err < 0) {
407 kfree(f);
408 goto errout_alloc;
409 }
378 } 410 }
379 411
380 if (tb[TCA_TCINDEX_CLASSID]) { 412 if (tb[TCA_TCINDEX_CLASSID]) {
@@ -387,8 +419,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
387 else 419 else
388 tcf_exts_change(tp, &cr.exts, &e); 420 tcf_exts_change(tp, &cr.exts, &e);
389 421
390 if (old_r && old_r != r) 422 if (old_r && old_r != r) {
391 tcindex_filter_result_init(old_r); 423 err = tcindex_filter_result_init(old_r);
424 if (err < 0) {
425 kfree(f);
426 goto errout_alloc;
427 }
428 }
392 429
393 oldp = p; 430 oldp = p;
394 r->res = cr.res; 431 r->res = cr.res;
@@ -415,9 +452,12 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
415 452
416errout_alloc: 453errout_alloc:
417 if (balloc == 1) 454 if (balloc == 1)
418 kfree(cp->perfect); 455 tcindex_free_perfect_hash(cp);
419 else if (balloc == 2) 456 else if (balloc == 2)
420 kfree(cp->h); 457 kfree(cp->h);
458errout1:
459 tcf_exts_destroy(&cr.exts);
460 tcf_exts_destroy(&new_filter_result.exts);
421errout: 461errout:
422 kfree(cp); 462 kfree(cp);
423 tcf_exts_destroy(&e); 463 tcf_exts_destroy(&e);
@@ -510,7 +550,7 @@ static bool tcindex_destroy(struct tcf_proto *tp, bool force)
510 550
511 551
512static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, 552static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
513 struct sk_buff *skb, struct tcmsg *t) 553 struct sk_buff *skb, struct tcmsg *t)
514{ 554{
515 struct tcindex_data *p = rtnl_dereference(tp->root); 555 struct tcindex_data *p = rtnl_dereference(tp->root);
516 struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; 556 struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index ffe593efe930..ae83c3aec308 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -104,7 +104,8 @@ static inline unsigned int u32_hash_fold(__be32 key,
104 return h; 104 return h;
105} 105}
106 106
107static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) 107static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp,
108 struct tcf_result *res)
108{ 109{
109 struct { 110 struct {
110 struct tc_u_knode *knode; 111 struct tc_u_knode *knode;
@@ -256,8 +257,7 @@ deadloop:
256 return -1; 257 return -1;
257} 258}
258 259
259static struct tc_u_hnode * 260static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
260u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
261{ 261{
262 struct tc_u_hnode *ht; 262 struct tc_u_hnode *ht;
263 263
@@ -270,8 +270,7 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
270 return ht; 270 return ht;
271} 271}
272 272
273static struct tc_u_knode * 273static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
274u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
275{ 274{
276 unsigned int sel; 275 unsigned int sel;
277 struct tc_u_knode *n = NULL; 276 struct tc_u_knode *n = NULL;
@@ -360,8 +359,7 @@ static int u32_init(struct tcf_proto *tp)
360 return 0; 359 return 0;
361} 360}
362 361
363static int u32_destroy_key(struct tcf_proto *tp, 362static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
364 struct tc_u_knode *n,
365 bool free_pf) 363 bool free_pf)
366{ 364{
367 tcf_exts_destroy(&n->exts); 365 tcf_exts_destroy(&n->exts);
@@ -448,9 +446,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
448 } 446 }
449} 447}
450 448
451static int u32_replace_hw_hnode(struct tcf_proto *tp, 449static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
452 struct tc_u_hnode *h, 450 u32 flags)
453 u32 flags)
454{ 451{
455 struct net_device *dev = tp->q->dev_queue->dev; 452 struct net_device *dev = tp->q->dev_queue->dev;
456 struct tc_cls_u32_offload u32_offload = {0}; 453 struct tc_cls_u32_offload u32_offload = {0};
@@ -496,9 +493,8 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
496 } 493 }
497} 494}
498 495
499static int u32_replace_hw_knode(struct tcf_proto *tp, 496static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
500 struct tc_u_knode *n, 497 u32 flags)
501 u32 flags)
502{ 498{
503 struct net_device *dev = tp->q->dev_queue->dev; 499 struct net_device *dev = tp->q->dev_queue->dev;
504 struct tc_cls_u32_offload u32_offload = {0}; 500 struct tc_cls_u32_offload u32_offload = {0};
@@ -709,13 +705,15 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
709 struct tc_u_knode *n, struct nlattr **tb, 705 struct tc_u_knode *n, struct nlattr **tb,
710 struct nlattr *est, bool ovr) 706 struct nlattr *est, bool ovr)
711{ 707{
712 int err;
713 struct tcf_exts e; 708 struct tcf_exts e;
709 int err;
714 710
715 tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); 711 err = tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE);
716 err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
717 if (err < 0) 712 if (err < 0)
718 return err; 713 return err;
714 err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
715 if (err < 0)
716 goto errout;
719 717
720 err = -EINVAL; 718 err = -EINVAL;
721 if (tb[TCA_U32_LINK]) { 719 if (tb[TCA_U32_LINK]) {
@@ -761,8 +759,7 @@ errout:
761 return err; 759 return err;
762} 760}
763 761
764static void u32_replace_knode(struct tcf_proto *tp, 762static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
765 struct tc_u_common *tp_c,
766 struct tc_u_knode *n) 763 struct tc_u_knode *n)
767{ 764{
768 struct tc_u_knode __rcu **ins; 765 struct tc_u_knode __rcu **ins;
@@ -833,15 +830,17 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
833 new->tp = tp; 830 new->tp = tp;
834 memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); 831 memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
835 832
836 tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE); 833 if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
834 kfree(new);
835 return NULL;
836 }
837 837
838 return new; 838 return new;
839} 839}
840 840
841static int u32_change(struct net *net, struct sk_buff *in_skb, 841static int u32_change(struct net *net, struct sk_buff *in_skb,
842 struct tcf_proto *tp, unsigned long base, u32 handle, 842 struct tcf_proto *tp, unsigned long base, u32 handle,
843 struct nlattr **tca, 843 struct nlattr **tca, unsigned long *arg, bool ovr)
844 unsigned long *arg, bool ovr)
845{ 844{
846 struct tc_u_common *tp_c = tp->data; 845 struct tc_u_common *tp_c = tp->data;
847 struct tc_u_hnode *ht; 846 struct tc_u_hnode *ht;
@@ -985,9 +984,12 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
985 n->handle = handle; 984 n->handle = handle;
986 n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; 985 n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
987 n->flags = flags; 986 n->flags = flags;
988 tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
989 n->tp = tp; 987 n->tp = tp;
990 988
989 err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
990 if (err < 0)
991 goto errout;
992
991#ifdef CONFIG_CLS_U32_MARK 993#ifdef CONFIG_CLS_U32_MARK
992 n->pcpu_success = alloc_percpu(u32); 994 n->pcpu_success = alloc_percpu(u32);
993 if (!n->pcpu_success) { 995 if (!n->pcpu_success) {
@@ -1028,9 +1030,10 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
1028errhw: 1030errhw:
1029#ifdef CONFIG_CLS_U32_MARK 1031#ifdef CONFIG_CLS_U32_MARK
1030 free_percpu(n->pcpu_success); 1032 free_percpu(n->pcpu_success);
1031errout:
1032#endif 1033#endif
1033 1034
1035errout:
1036 tcf_exts_destroy(&n->exts);
1034#ifdef CONFIG_CLS_U32_PERF 1037#ifdef CONFIG_CLS_U32_PERF
1035 free_percpu(n->pf); 1038 free_percpu(n->pf);
1036#endif 1039#endif
@@ -1079,7 +1082,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
1079} 1082}
1080 1083
1081static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, 1084static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
1082 struct sk_buff *skb, struct tcmsg *t) 1085 struct sk_buff *skb, struct tcmsg *t)
1083{ 1086{
1084 struct tc_u_knode *n = (struct tc_u_knode *)fh; 1087 struct tc_u_knode *n = (struct tc_u_knode *)fh;
1085 struct tc_u_hnode *ht_up, *ht_down; 1088 struct tc_u_hnode *ht_up, *ht_down;
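
The cls_u32 hunks above all follow one pattern: tcf_exts_init() can now fail, so every caller checks its return value and releases whatever it had already set up. Below is a small userspace sketch of that init-then-validate-then-unwind shape; the struct and helper names (fake_exts, fake_exts_init, fake_exts_destroy) are simplified stand-ins, not the kernel API.

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct fake_exts {
	void **actions;		/* allocated in init, hence the error path */
	int action;
	int police;
};

static int fake_exts_init(struct fake_exts *e, int action, int police)
{
	memset(e, 0, sizeof(*e));
	e->actions = calloc(32, sizeof(*e->actions));
	if (!e->actions)
		return -ENOMEM;
	e->action = action;
	e->police = police;
	return 0;
}

static void fake_exts_destroy(struct fake_exts *e)
{
	free(e->actions);
	e->actions = NULL;
}

/* Mirrors the shape of u32_set_parms(): init first (nothing to unwind if it
 * fails), then validate, and release the init'ed state on any later error. */
static int fake_set_parms(int validate_result)
{
	struct fake_exts e;
	int err;

	err = fake_exts_init(&e, 1, 2);
	if (err < 0)
		return err;
	err = validate_result;		/* stands in for tcf_exts_validate() */
	if (err < 0)
		goto errout;
	return 0;
errout:
	fake_exts_destroy(&e);
	return err;
}
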
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 12ebde845523..206dc24add3a 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -29,6 +29,7 @@
29#include <linux/hrtimer.h> 29#include <linux/hrtimer.h>
30#include <linux/lockdep.h> 30#include <linux/lockdep.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/hashtable.h>
32 33
33#include <net/net_namespace.h> 34#include <net/net_namespace.h>
34#include <net/sock.h> 35#include <net/sock.h>
@@ -259,37 +260,40 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
259{ 260{
260 struct Qdisc *q; 261 struct Qdisc *q;
261 262
263 if (!qdisc_dev(root))
264 return (root->handle == handle ? root : NULL);
265
262 if (!(root->flags & TCQ_F_BUILTIN) && 266 if (!(root->flags & TCQ_F_BUILTIN) &&
263 root->handle == handle) 267 root->handle == handle)
264 return root; 268 return root;
265 269
266 list_for_each_entry_rcu(q, &root->list, list) { 270 hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
267 if (q->handle == handle) 271 if (q->handle == handle)
268 return q; 272 return q;
269 } 273 }
270 return NULL; 274 return NULL;
271} 275}
272 276
273void qdisc_list_add(struct Qdisc *q) 277void qdisc_hash_add(struct Qdisc *q)
274{ 278{
275 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { 279 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
276 struct Qdisc *root = qdisc_dev(q)->qdisc; 280 struct Qdisc *root = qdisc_dev(q)->qdisc;
277 281
278 WARN_ON_ONCE(root == &noop_qdisc); 282 WARN_ON_ONCE(root == &noop_qdisc);
279 ASSERT_RTNL(); 283 ASSERT_RTNL();
280 list_add_tail_rcu(&q->list, &root->list); 284 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
281 } 285 }
282} 286}
283EXPORT_SYMBOL(qdisc_list_add); 287EXPORT_SYMBOL(qdisc_hash_add);
284 288
285void qdisc_list_del(struct Qdisc *q) 289void qdisc_hash_del(struct Qdisc *q)
286{ 290{
287 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { 291 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
288 ASSERT_RTNL(); 292 ASSERT_RTNL();
289 list_del_rcu(&q->list); 293 hash_del_rcu(&q->hash);
290 } 294 }
291} 295}
292EXPORT_SYMBOL(qdisc_list_del); 296EXPORT_SYMBOL(qdisc_hash_del);
293 297
294struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) 298struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
295{ 299{
@@ -385,7 +389,8 @@ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
385 389
386static struct qdisc_rate_table *qdisc_rtab_list; 390static struct qdisc_rate_table *qdisc_rtab_list;
387 391
388struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) 392struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
393 struct nlattr *tab)
389{ 394{
390 struct qdisc_rate_table *rtab; 395 struct qdisc_rate_table *rtab;
391 396
@@ -537,7 +542,8 @@ nla_put_failure:
537 return -1; 542 return -1;
538} 543}
539 544
540void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab) 545void __qdisc_calculate_pkt_len(struct sk_buff *skb,
546 const struct qdisc_size_table *stab)
541{ 547{
542 int pkt_len, slot; 548 int pkt_len, slot;
543 549
@@ -884,10 +890,10 @@ static struct lock_class_key qdisc_rx_lock;
884 Parameters are passed via opt. 890 Parameters are passed via opt.
885 */ 891 */
886 892
887static struct Qdisc * 893static struct Qdisc *qdisc_create(struct net_device *dev,
888qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, 894 struct netdev_queue *dev_queue,
889 struct Qdisc *p, u32 parent, u32 handle, 895 struct Qdisc *p, u32 parent, u32 handle,
890 struct nlattr **tca, int *errp) 896 struct nlattr **tca, int *errp)
891{ 897{
892 int err; 898 int err;
893 struct nlattr *kind = tca[TCA_KIND]; 899 struct nlattr *kind = tca[TCA_KIND];
@@ -998,7 +1004,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
998 goto err_out4; 1004 goto err_out4;
999 } 1005 }
1000 1006
1001 qdisc_list_add(sch); 1007 qdisc_hash_add(sch);
1002 1008
1003 return sch; 1009 return sch;
1004 } 1010 }
@@ -1069,7 +1075,8 @@ struct check_loop_arg {
1069 int depth; 1075 int depth;
1070}; 1076};
1071 1077
1072static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); 1078static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1079 struct qdisc_walker *w);
1073 1080
1074static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) 1081static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1075{ 1082{
@@ -1431,10 +1438,11 @@ err_out:
1431 1438
1432static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, 1439static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1433 struct netlink_callback *cb, 1440 struct netlink_callback *cb,
1434 int *q_idx_p, int s_q_idx) 1441 int *q_idx_p, int s_q_idx, bool recur)
1435{ 1442{
1436 int ret = 0, q_idx = *q_idx_p; 1443 int ret = 0, q_idx = *q_idx_p;
1437 struct Qdisc *q; 1444 struct Qdisc *q;
1445 int b;
1438 1446
1439 if (!root) 1447 if (!root)
1440 return 0; 1448 return 0;
@@ -1445,18 +1453,30 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1445 } else { 1453 } else {
1446 if (!tc_qdisc_dump_ignore(q) && 1454 if (!tc_qdisc_dump_ignore(q) &&
1447 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, 1455 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1448 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) 1456 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1457 RTM_NEWQDISC) <= 0)
1449 goto done; 1458 goto done;
1450 q_idx++; 1459 q_idx++;
1451 } 1460 }
1452 list_for_each_entry(q, &root->list, list) { 1461
1462 /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1463 * itself has already been dumped.
1464 *
1465 * If we've already dumped the top-level (ingress) qdisc above and the global
1466 * qdisc hashtable, we don't want to hit it again
1467 */
1468 if (!qdisc_dev(root) || !recur)
1469 goto out;
1470
1471 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1453 if (q_idx < s_q_idx) { 1472 if (q_idx < s_q_idx) {
1454 q_idx++; 1473 q_idx++;
1455 continue; 1474 continue;
1456 } 1475 }
1457 if (!tc_qdisc_dump_ignore(q) && 1476 if (!tc_qdisc_dump_ignore(q) &&
1458 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, 1477 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1459 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) 1478 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1479 RTM_NEWQDISC) <= 0)
1460 goto done; 1480 goto done;
1461 q_idx++; 1481 q_idx++;
1462 } 1482 }
@@ -1490,13 +1510,14 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1490 s_q_idx = 0; 1510 s_q_idx = 0;
1491 q_idx = 0; 1511 q_idx = 0;
1492 1512
1493 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0) 1513 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1514 true) < 0)
1494 goto done; 1515 goto done;
1495 1516
1496 dev_queue = dev_ingress_queue(dev); 1517 dev_queue = dev_ingress_queue(dev);
1497 if (dev_queue && 1518 if (dev_queue &&
1498 tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, 1519 tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1499 &q_idx, s_q_idx) < 0) 1520 &q_idx, s_q_idx, false) < 0)
1500 goto done; 1521 goto done;
1501 1522
1502cont: 1523cont:
@@ -1625,7 +1646,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1625 if (cops->delete) 1646 if (cops->delete)
1626 err = cops->delete(q, cl); 1647 err = cops->delete(q, cl);
1627 if (err == 0) 1648 if (err == 0)
1628 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS); 1649 tclass_notify(net, skb, n, q, cl,
1650 RTM_DELTCLASS);
1629 goto out; 1651 goto out;
1630 case RTM_GETTCLASS: 1652 case RTM_GETTCLASS:
1631 err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); 1653 err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
@@ -1723,12 +1745,14 @@ struct qdisc_dump_args {
1723 struct netlink_callback *cb; 1745 struct netlink_callback *cb;
1724}; 1746};
1725 1747
1726static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) 1748static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
1749 struct qdisc_walker *arg)
1727{ 1750{
1728 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; 1751 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1729 1752
1730 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, 1753 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1731 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); 1754 a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
1755 RTM_NEWTCLASS);
1732} 1756}
1733 1757
1734static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, 1758static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
@@ -1765,6 +1789,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1765 int *t_p, int s_t) 1789 int *t_p, int s_t)
1766{ 1790{
1767 struct Qdisc *q; 1791 struct Qdisc *q;
1792 int b;
1768 1793
1769 if (!root) 1794 if (!root)
1770 return 0; 1795 return 0;
@@ -1772,7 +1797,10 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1772 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) 1797 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1773 return -1; 1798 return -1;
1774 1799
1775 list_for_each_entry(q, &root->list, list) { 1800 if (!qdisc_dev(root))
1801 return 0;
1802
1803 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1776 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) 1804 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1777 return -1; 1805 return -1;
1778 } 1806 }
@@ -1957,10 +1985,12 @@ static int __init pktsched_init(void)
1957 1985
1958 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); 1986 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1959 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); 1987 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1960 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL); 1988 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
1989 NULL);
1961 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL); 1990 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1962 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL); 1991 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1963 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL); 1992 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
1993 NULL);
1964 1994
1965 return 0; 1995 return 0;
1966} 1996}
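
The sch_api.c changes replace the per-root qdisc list with a per-device hashtable keyed by the qdisc handle, so a lookup only scans the bucket that can hold that handle. A rough userspace analogue of qdisc_hash_add() and the new lookup is sketched below; the bucket count, hash function and fake_qdisc type are illustrative, and the RCU aspects of the real hash_for_each_possible_rcu() walk are left out.

#include <stdint.h>
#include <stddef.h>

#define QDISC_HASH_BITS 4
#define QDISC_HASH_SIZE (1u << QDISC_HASH_BITS)

struct fake_qdisc {
	uint32_t handle;
	struct fake_qdisc *next;	/* bucket chain, stands in for the hlist */
};

static struct fake_qdisc *qdisc_hash[QDISC_HASH_SIZE];

static unsigned int hash_handle(uint32_t handle)
{
	return (handle * 2654435761u) >> (32 - QDISC_HASH_BITS);
}

static void fake_qdisc_hash_add(struct fake_qdisc *q)
{
	unsigned int b = hash_handle(q->handle);

	q->next = qdisc_hash[b];
	qdisc_hash[b] = q;
}

/* Only the bucket that can contain @handle is scanned, which is the point of
 * replacing the list_for_each_entry_rcu() walk over every qdisc. */
static struct fake_qdisc *fake_qdisc_lookup(uint32_t handle)
{
	struct fake_qdisc *q;

	for (q = qdisc_hash[hash_handle(handle)]; q; q = q->next)
		if (q->handle == handle)
			return q;
	return NULL;
}
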
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 4002df3c7d9f..5bfa79ee657c 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -69,7 +69,7 @@ struct codel_sched_data {
69static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) 69static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
70{ 70{
71 struct Qdisc *sch = ctx; 71 struct Qdisc *sch = ctx;
72 struct sk_buff *skb = __skb_dequeue(&sch->q); 72 struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
73 73
74 if (skb) 74 if (skb)
75 sch->qstats.backlog -= qdisc_pkt_len(skb); 75 sch->qstats.backlog -= qdisc_pkt_len(skb);
@@ -172,7 +172,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
172 172
173 qlen = sch->q.qlen; 173 qlen = sch->q.qlen;
174 while (sch->q.qlen > sch->limit) { 174 while (sch->q.qlen > sch->limit) {
175 struct sk_buff *skb = __skb_dequeue(&sch->q); 175 struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
176 176
177 dropped += qdisc_pkt_len(skb); 177 dropped += qdisc_pkt_len(skb);
178 qdisc_qstats_backlog_dec(sch, skb); 178 qdisc_qstats_backlog_dec(sch, skb);
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index baeed6a78d28..1e37247656f8 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -31,7 +31,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
31static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, 31static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
32 struct sk_buff **to_free) 32 struct sk_buff **to_free)
33{ 33{
34 if (likely(skb_queue_len(&sch->q) < sch->limit)) 34 if (likely(sch->q.qlen < sch->limit))
35 return qdisc_enqueue_tail(skb, sch); 35 return qdisc_enqueue_tail(skb, sch);
36 36
37 return qdisc_drop(skb, sch, to_free); 37 return qdisc_drop(skb, sch, to_free);
@@ -42,7 +42,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
42{ 42{
43 unsigned int prev_backlog; 43 unsigned int prev_backlog;
44 44
45 if (likely(skb_queue_len(&sch->q) < sch->limit)) 45 if (likely(sch->q.qlen < sch->limit))
46 return qdisc_enqueue_tail(skb, sch); 46 return qdisc_enqueue_tail(skb, sch);
47 47
48 prev_backlog = sch->qstats.backlog; 48 prev_backlog = sch->qstats.backlog;
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index e5458b99e09c..18e752439f6f 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -86,6 +86,7 @@ struct fq_sched_data {
86 86
87 struct rb_root delayed; /* for rate limited flows */ 87 struct rb_root delayed; /* for rate limited flows */
88 u64 time_next_delayed_flow; 88 u64 time_next_delayed_flow;
89 unsigned long unthrottle_latency_ns;
89 90
90 struct fq_flow internal; /* for non classified or high prio packets */ 91 struct fq_flow internal; /* for non classified or high prio packets */
91 u32 quantum; 92 u32 quantum;
@@ -94,6 +95,7 @@ struct fq_sched_data {
94 u32 flow_max_rate; /* optional max rate per flow */ 95 u32 flow_max_rate; /* optional max rate per flow */
95 u32 flow_plimit; /* max packets per flow */ 96 u32 flow_plimit; /* max packets per flow */
96 u32 orphan_mask; /* mask for orphaned skb */ 97 u32 orphan_mask; /* mask for orphaned skb */
98 u32 low_rate_threshold;
97 struct rb_root *fq_root; 99 struct rb_root *fq_root;
98 u8 rate_enable; 100 u8 rate_enable;
99 u8 fq_trees_log; 101 u8 fq_trees_log;
@@ -407,11 +409,19 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
407 409
408static void fq_check_throttled(struct fq_sched_data *q, u64 now) 410static void fq_check_throttled(struct fq_sched_data *q, u64 now)
409{ 411{
412 unsigned long sample;
410 struct rb_node *p; 413 struct rb_node *p;
411 414
412 if (q->time_next_delayed_flow > now) 415 if (q->time_next_delayed_flow > now)
413 return; 416 return;
414 417
418 /* Update unthrottle latency EWMA.
419 * This is cheap and can help diagnosing timer/latency problems.
420 */
421 sample = (unsigned long)(now - q->time_next_delayed_flow);
422 q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
423 q->unthrottle_latency_ns += sample >> 3;
424
415 q->time_next_delayed_flow = ~0ULL; 425 q->time_next_delayed_flow = ~0ULL;
416 while ((p = rb_first(&q->delayed)) != NULL) { 426 while ((p = rb_first(&q->delayed)) != NULL) {
417 struct fq_flow *f = container_of(p, struct fq_flow, rate_node); 427 struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
@@ -433,7 +443,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
433 struct fq_flow_head *head; 443 struct fq_flow_head *head;
434 struct sk_buff *skb; 444 struct sk_buff *skb;
435 struct fq_flow *f; 445 struct fq_flow *f;
436 u32 rate; 446 u32 rate, plen;
437 447
438 skb = fq_dequeue_head(sch, &q->internal); 448 skb = fq_dequeue_head(sch, &q->internal);
439 if (skb) 449 if (skb)
@@ -482,7 +492,7 @@ begin:
482 prefetch(&skb->end); 492 prefetch(&skb->end);
483 f->credit -= qdisc_pkt_len(skb); 493 f->credit -= qdisc_pkt_len(skb);
484 494
485 if (f->credit > 0 || !q->rate_enable) 495 if (!q->rate_enable)
486 goto out; 496 goto out;
487 497
488 /* Do not pace locally generated ack packets */ 498 /* Do not pace locally generated ack packets */
@@ -493,8 +503,15 @@ begin:
493 if (skb->sk) 503 if (skb->sk)
494 rate = min(skb->sk->sk_pacing_rate, rate); 504 rate = min(skb->sk->sk_pacing_rate, rate);
495 505
506 if (rate <= q->low_rate_threshold) {
507 f->credit = 0;
508 plen = qdisc_pkt_len(skb);
509 } else {
510 plen = max(qdisc_pkt_len(skb), q->quantum);
511 if (f->credit > 0)
512 goto out;
513 }
496 if (rate != ~0U) { 514 if (rate != ~0U) {
497 u32 plen = max(qdisc_pkt_len(skb), q->quantum);
498 u64 len = (u64)plen * NSEC_PER_SEC; 515 u64 len = (u64)plen * NSEC_PER_SEC;
499 516
500 if (likely(rate)) 517 if (likely(rate))
@@ -507,7 +524,12 @@ begin:
507 len = NSEC_PER_SEC; 524 len = NSEC_PER_SEC;
508 q->stat_pkts_too_long++; 525 q->stat_pkts_too_long++;
509 } 526 }
510 527 /* Account for schedule/timers drifts.
528 * f->time_next_packet was set when prior packet was sent,
529 * and current time (@now) can be too late by tens of us.
530 */
531 if (f->time_next_packet)
532 len -= min(len/2, now - f->time_next_packet);
511 f->time_next_packet = now + len; 533 f->time_next_packet = now + len;
512 } 534 }
513out: 535out:
@@ -662,6 +684,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
662 [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, 684 [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
663 [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, 685 [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
664 [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, 686 [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
687 [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
665}; 688};
666 689
667static int fq_change(struct Qdisc *sch, struct nlattr *opt) 690static int fq_change(struct Qdisc *sch, struct nlattr *opt)
@@ -716,6 +739,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
716 if (tb[TCA_FQ_FLOW_MAX_RATE]) 739 if (tb[TCA_FQ_FLOW_MAX_RATE])
717 q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); 740 q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
718 741
742 if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
743 q->low_rate_threshold =
744 nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
745
719 if (tb[TCA_FQ_RATE_ENABLE]) { 746 if (tb[TCA_FQ_RATE_ENABLE]) {
720 u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); 747 u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
721 748
@@ -774,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
774 q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); 801 q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
775 q->flow_refill_delay = msecs_to_jiffies(40); 802 q->flow_refill_delay = msecs_to_jiffies(40);
776 q->flow_max_rate = ~0U; 803 q->flow_max_rate = ~0U;
804 q->time_next_delayed_flow = ~0ULL;
777 q->rate_enable = 1; 805 q->rate_enable = 1;
778 q->new_flows.first = NULL; 806 q->new_flows.first = NULL;
779 q->old_flows.first = NULL; 807 q->old_flows.first = NULL;
@@ -781,6 +809,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
781 q->fq_root = NULL; 809 q->fq_root = NULL;
782 q->fq_trees_log = ilog2(1024); 810 q->fq_trees_log = ilog2(1024);
783 q->orphan_mask = 1024 - 1; 811 q->orphan_mask = 1024 - 1;
812 q->low_rate_threshold = 550000 / 8;
784 qdisc_watchdog_init(&q->watchdog, sch); 813 qdisc_watchdog_init(&q->watchdog, sch);
785 814
786 if (opt) 815 if (opt)
@@ -811,6 +840,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
811 nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, 840 nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
812 jiffies_to_usecs(q->flow_refill_delay)) || 841 jiffies_to_usecs(q->flow_refill_delay)) ||
813 nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || 842 nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
843 nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
844 q->low_rate_threshold) ||
814 nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) 845 nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
815 goto nla_put_failure; 846 goto nla_put_failure;
816 847
@@ -823,20 +854,24 @@ nla_put_failure:
823static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 854static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
824{ 855{
825 struct fq_sched_data *q = qdisc_priv(sch); 856 struct fq_sched_data *q = qdisc_priv(sch);
826 u64 now = ktime_get_ns(); 857 struct tc_fq_qd_stats st;
827 struct tc_fq_qd_stats st = { 858
828 .gc_flows = q->stat_gc_flows, 859 sch_tree_lock(sch);
829 .highprio_packets = q->stat_internal_packets, 860
830 .tcp_retrans = q->stat_tcp_retrans, 861 st.gc_flows = q->stat_gc_flows;
831 .throttled = q->stat_throttled, 862 st.highprio_packets = q->stat_internal_packets;
832 .flows_plimit = q->stat_flows_plimit, 863 st.tcp_retrans = q->stat_tcp_retrans;
833 .pkts_too_long = q->stat_pkts_too_long, 864 st.throttled = q->stat_throttled;
834 .allocation_errors = q->stat_allocation_errors, 865 st.flows_plimit = q->stat_flows_plimit;
835 .flows = q->flows, 866 st.pkts_too_long = q->stat_pkts_too_long;
836 .inactive_flows = q->inactive_flows, 867 st.allocation_errors = q->stat_allocation_errors;
837 .throttled_flows = q->throttled_flows, 868 st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
838 .time_next_delayed_flow = q->time_next_delayed_flow - now, 869 st.flows = q->flows;
839 }; 870 st.inactive_flows = q->inactive_flows;
871 st.throttled_flows = q->throttled_flows;
872 st.unthrottle_latency_ns = min_t(unsigned long,
873 q->unthrottle_latency_ns, ~0U);
874 sch_tree_unlock(sch);
840 875
841 return gnet_stats_copy_app(d, &st, sizeof(st)); 876 return gnet_stats_copy_app(d, &st, sizeof(st));
842} 877}
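
Two of the fq additions above lend themselves to a small sketch: the unthrottle-latency estimate is an EWMA with weight 1/8 (the two shift-by-3 lines), and flows at or below low_rate_threshold are paced packet by packet with their credit zeroed instead of being rounded up to a quantum. The helpers below mirror that arithmetic only; the function names and standalone form are illustrative, and the 550000/8 default simply echoes the init value in the hunk.

#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

static unsigned long unthrottle_latency_ewma(unsigned long ewma_ns,
					     unsigned long sample_ns)
{
	/* new = old - old/8 + sample/8, i.e. an EWMA with alpha = 1/8 */
	ewma_ns -= ewma_ns >> 3;
	ewma_ns += sample_ns >> 3;
	return ewma_ns;
}

static uint64_t pacing_delay_ns(uint32_t pkt_len, uint32_t quantum,
				uint32_t rate_Bps, uint32_t low_rate_threshold)
{
	uint32_t plen;

	if (rate_Bps <= low_rate_threshold)
		plen = pkt_len;			/* pace every packet */
	else
		plen = pkt_len > quantum ? pkt_len : quantum;

	return (uint64_t)plen * NSEC_PER_SEC / rate_Bps;
}
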
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 657c13362b19..6cfb6e9038c2 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -423,7 +423,6 @@ struct Qdisc noop_qdisc = {
423 .dequeue = noop_dequeue, 423 .dequeue = noop_dequeue,
424 .flags = TCQ_F_BUILTIN, 424 .flags = TCQ_F_BUILTIN,
425 .ops = &noop_qdisc_ops, 425 .ops = &noop_qdisc_ops,
426 .list = LIST_HEAD_INIT(noop_qdisc.list),
427 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), 426 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
428 .dev_queue = &noop_netdev_queue, 427 .dev_queue = &noop_netdev_queue,
429 .running = SEQCNT_ZERO(noop_qdisc.running), 428 .running = SEQCNT_ZERO(noop_qdisc.running),
@@ -467,7 +466,7 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
467 */ 466 */
468struct pfifo_fast_priv { 467struct pfifo_fast_priv {
469 u32 bitmap; 468 u32 bitmap;
470 struct sk_buff_head q[PFIFO_FAST_BANDS]; 469 struct qdisc_skb_head q[PFIFO_FAST_BANDS];
471}; 470};
472 471
473/* 472/*
@@ -478,7 +477,7 @@ struct pfifo_fast_priv {
478 */ 477 */
479static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; 478static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
480 479
481static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, 480static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
482 int band) 481 int band)
483{ 482{
484 return priv->q + band; 483 return priv->q + band;
@@ -487,10 +486,10 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
487static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, 486static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
488 struct sk_buff **to_free) 487 struct sk_buff **to_free)
489{ 488{
490 if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { 489 if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
491 int band = prio2band[skb->priority & TC_PRIO_MAX]; 490 int band = prio2band[skb->priority & TC_PRIO_MAX];
492 struct pfifo_fast_priv *priv = qdisc_priv(qdisc); 491 struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
493 struct sk_buff_head *list = band2list(priv, band); 492 struct qdisc_skb_head *list = band2list(priv, band);
494 493
495 priv->bitmap |= (1 << band); 494 priv->bitmap |= (1 << band);
496 qdisc->q.qlen++; 495 qdisc->q.qlen++;
@@ -506,11 +505,16 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
506 int band = bitmap2band[priv->bitmap]; 505 int band = bitmap2band[priv->bitmap];
507 506
508 if (likely(band >= 0)) { 507 if (likely(band >= 0)) {
509 struct sk_buff_head *list = band2list(priv, band); 508 struct qdisc_skb_head *qh = band2list(priv, band);
510 struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); 509 struct sk_buff *skb = __qdisc_dequeue_head(qh);
510
511 if (likely(skb != NULL)) {
512 qdisc_qstats_backlog_dec(qdisc, skb);
513 qdisc_bstats_update(qdisc, skb);
514 }
511 515
512 qdisc->q.qlen--; 516 qdisc->q.qlen--;
513 if (skb_queue_empty(list)) 517 if (qh->qlen == 0)
514 priv->bitmap &= ~(1 << band); 518 priv->bitmap &= ~(1 << band);
515 519
516 return skb; 520 return skb;
@@ -525,9 +529,9 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
525 int band = bitmap2band[priv->bitmap]; 529 int band = bitmap2band[priv->bitmap];
526 530
527 if (band >= 0) { 531 if (band >= 0) {
528 struct sk_buff_head *list = band2list(priv, band); 532 struct qdisc_skb_head *qh = band2list(priv, band);
529 533
530 return skb_peek(list); 534 return qh->head;
531 } 535 }
532 536
533 return NULL; 537 return NULL;
@@ -565,7 +569,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
565 struct pfifo_fast_priv *priv = qdisc_priv(qdisc); 569 struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
566 570
567 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) 571 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
568 __skb_queue_head_init(band2list(priv, prio)); 572 qdisc_skb_head_init(band2list(priv, prio));
569 573
570 /* Can by-pass the queue discipline */ 574 /* Can by-pass the queue discipline */
571 qdisc->flags |= TCQ_F_CAN_BYPASS; 575 qdisc->flags |= TCQ_F_CAN_BYPASS;
@@ -613,8 +617,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
613 sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); 617 sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
614 sch->padded = (char *) sch - (char *) p; 618 sch->padded = (char *) sch - (char *) p;
615 } 619 }
616 INIT_LIST_HEAD(&sch->list); 620 qdisc_skb_head_init(&sch->q);
617 skb_queue_head_init(&sch->q); 621 spin_lock_init(&sch->q.lock);
618 622
619 spin_lock_init(&sch->busylock); 623 spin_lock_init(&sch->busylock);
620 lockdep_set_class(&sch->busylock, 624 lockdep_set_class(&sch->busylock,
@@ -701,7 +705,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
701 return; 705 return;
702 706
703#ifdef CONFIG_NET_SCHED 707#ifdef CONFIG_NET_SCHED
704 qdisc_list_del(qdisc); 708 qdisc_hash_del(qdisc);
705 709
706 qdisc_put_stab(rtnl_dereference(qdisc->stab)); 710 qdisc_put_stab(rtnl_dereference(qdisc->stab));
707#endif 711#endif
@@ -789,6 +793,10 @@ static void attach_default_qdiscs(struct net_device *dev)
789 qdisc->ops->attach(qdisc); 793 qdisc->ops->attach(qdisc);
790 } 794 }
791 } 795 }
796#ifdef CONFIG_NET_SCHED
797 if (dev->qdisc)
798 qdisc_hash_add(dev->qdisc);
799#endif
792} 800}
793 801
794static void transition_one_qdisc(struct net_device *dev, 802static void transition_one_qdisc(struct net_device *dev,
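
Most of the sch_generic.c churn comes from replacing sk_buff_head (a doubly linked list carrying its own lock) with the leaner qdisc_skb_head that only tracks head, tail and qlen, which is why pfifo_fast now tests qh->qlen and qh->head directly and does its own backlog/bstats accounting on dequeue. Below is a self-contained stand-in for that structure and its two basic operations; the types and function names are illustrative, not the kernel helpers.

#include <stddef.h>

struct fake_skb {
	struct fake_skb *next;
};

struct fake_qdisc_skb_head {
	struct fake_skb *head;
	struct fake_skb *tail;
	unsigned int qlen;
};

static void fifo_enqueue_tail(struct fake_qdisc_skb_head *qh,
			      struct fake_skb *skb)
{
	skb->next = NULL;
	if (qh->tail)
		qh->tail->next = skb;
	else
		qh->head = skb;		/* queue was empty */
	qh->tail = skb;
	qh->qlen++;
}

static struct fake_skb *fifo_dequeue_head(struct fake_qdisc_skb_head *qh)
{
	struct fake_skb *skb = qh->head;

	if (!skb)
		return NULL;
	qh->head = skb->next;
	if (!qh->head)
		qh->tail = NULL;	/* queue is now empty */
	skb->next = NULL;
	qh->qlen--;
	return skb;
}
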
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3ddc7bd74ecb..000f1d36128e 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -142,8 +142,6 @@ struct hfsc_class {
142 link-sharing, max(myf, cfmin) */ 142 link-sharing, max(myf, cfmin) */
143 u64 cl_myf; /* my fit-time (calculated from this 143 u64 cl_myf; /* my fit-time (calculated from this
144 class's own upperlimit curve) */ 144 class's own upperlimit curve) */
145 u64 cl_myfadj; /* my fit-time adjustment (to cancel
146 history dependence) */
147 u64 cl_cfmin; /* earliest children's fit-time (used 145 u64 cl_cfmin; /* earliest children's fit-time (used
148 with cl_myf to obtain cl_f) */ 146 with cl_myf to obtain cl_f) */
149 u64 cl_cvtmin; /* minimal virtual time among the 147 u64 cl_cvtmin; /* minimal virtual time among the
@@ -151,11 +149,8 @@ struct hfsc_class {
151 (monotonic within a period) */ 149 (monotonic within a period) */
152 u64 cl_vtadj; /* intra-period cumulative vt 150 u64 cl_vtadj; /* intra-period cumulative vt
153 adjustment */ 151 adjustment */
154 u64 cl_vtoff; /* inter-period cumulative vt offset */ 152 u64 cl_cvtoff; /* largest virtual time seen among
155 u64 cl_cvtmax; /* max child's vt in the last period */ 153 the children */
156 u64 cl_cvtoff; /* cumulative cvtmax of all periods */
157 u64 cl_pcvtoff; /* parent's cvtoff at initialization
158 time */
159 154
160 struct internal_sc cl_rsc; /* internal real-time service curve */ 155 struct internal_sc cl_rsc; /* internal real-time service curve */
161 struct internal_sc cl_fsc; /* internal fair service curve */ 156 struct internal_sc cl_fsc; /* internal fair service curve */
@@ -701,28 +696,16 @@ init_vf(struct hfsc_class *cl, unsigned int len)
701 } else { 696 } else {
702 /* 697 /*
703 * first child for a new parent backlog period. 698 * first child for a new parent backlog period.
704 * add parent's cvtmax to cvtoff to make a new 699 * initialize cl_vt to the highest value seen
705 * vt (vtoff + vt) larger than the vt in the 700 * among the siblings. this is analogous to
706 * last period for all children. 701 * what cur_time would provide in realtime case.
707 */ 702 */
708 vt = cl->cl_parent->cl_cvtmax; 703 cl->cl_vt = cl->cl_parent->cl_cvtoff;
709 cl->cl_parent->cl_cvtoff += vt;
710 cl->cl_parent->cl_cvtmax = 0;
711 cl->cl_parent->cl_cvtmin = 0; 704 cl->cl_parent->cl_cvtmin = 0;
712 cl->cl_vt = 0;
713 } 705 }
714 706
715 cl->cl_vtoff = cl->cl_parent->cl_cvtoff -
716 cl->cl_pcvtoff;
717
718 /* update the virtual curve */ 707 /* update the virtual curve */
719 vt = cl->cl_vt + cl->cl_vtoff; 708 rtsc_min(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
720 rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt,
721 cl->cl_total);
722 if (cl->cl_virtual.x == vt) {
723 cl->cl_virtual.x -= cl->cl_vtoff;
724 cl->cl_vtoff = 0;
725 }
726 cl->cl_vtadj = 0; 709 cl->cl_vtadj = 0;
727 710
728 cl->cl_vtperiod++; /* increment vt period */ 711 cl->cl_vtperiod++; /* increment vt period */
@@ -745,7 +728,6 @@ init_vf(struct hfsc_class *cl, unsigned int len)
745 /* compute myf */ 728 /* compute myf */
746 cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, 729 cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
747 cl->cl_total); 730 cl->cl_total);
748 cl->cl_myfadj = 0;
749 } 731 }
750 } 732 }
751 733
@@ -779,8 +761,7 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
779 go_passive = 0; 761 go_passive = 0;
780 762
781 /* update vt */ 763 /* update vt */
782 cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) 764 cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + cl->cl_vtadj;
783 - cl->cl_vtoff + cl->cl_vtadj;
784 765
785 /* 766 /*
786 * if vt of the class is smaller than cvtmin, 767 * if vt of the class is smaller than cvtmin,
@@ -795,9 +776,9 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
795 if (go_passive) { 776 if (go_passive) {
796 /* no more active child, going passive */ 777 /* no more active child, going passive */
797 778
798 /* update cvtmax of the parent class */ 779 /* update cvtoff of the parent class */
799 if (cl->cl_vt > cl->cl_parent->cl_cvtmax) 780 if (cl->cl_vt > cl->cl_parent->cl_cvtoff)
800 cl->cl_parent->cl_cvtmax = cl->cl_vt; 781 cl->cl_parent->cl_cvtoff = cl->cl_vt;
801 782
802 /* remove this class from the vt tree */ 783 /* remove this class from the vt tree */
803 vttree_remove(cl); 784 vttree_remove(cl);
@@ -813,9 +794,10 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
813 794
814 /* update f */ 795 /* update f */
815 if (cl->cl_flags & HFSC_USC) { 796 if (cl->cl_flags & HFSC_USC) {
797 cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, cl->cl_total);
798#if 0
816 cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, 799 cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
817 cl->cl_total); 800 cl->cl_total);
818#if 0
819 /* 801 /*
820 * This code causes classes to stay way under their 802 * This code causes classes to stay way under their
821 * limit when multiple classes are used at gigabit 803 * limit when multiple classes are used at gigabit
@@ -940,7 +922,7 @@ static void
940hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) 922hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc)
941{ 923{
942 sc2isc(fsc, &cl->cl_fsc); 924 sc2isc(fsc, &cl->cl_fsc);
943 rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vtoff + cl->cl_vt, cl->cl_total); 925 rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
944 cl->cl_flags |= HFSC_FSC; 926 cl->cl_flags |= HFSC_FSC;
945} 927}
946 928
@@ -1094,7 +1076,6 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
1094 if (parent->level == 0) 1076 if (parent->level == 0)
1095 hfsc_purge_queue(sch, parent); 1077 hfsc_purge_queue(sch, parent);
1096 hfsc_adjust_levels(parent); 1078 hfsc_adjust_levels(parent);
1097 cl->cl_pcvtoff = parent->cl_cvtoff;
1098 sch_tree_unlock(sch); 1079 sch_tree_unlock(sch);
1099 1080
1100 qdisc_class_hash_grow(sch, &q->clhash); 1081 qdisc_class_hash_grow(sch, &q->clhash);
@@ -1482,16 +1463,12 @@ hfsc_reset_class(struct hfsc_class *cl)
1482 cl->cl_e = 0; 1463 cl->cl_e = 0;
1483 cl->cl_vt = 0; 1464 cl->cl_vt = 0;
1484 cl->cl_vtadj = 0; 1465 cl->cl_vtadj = 0;
1485 cl->cl_vtoff = 0;
1486 cl->cl_cvtmin = 0; 1466 cl->cl_cvtmin = 0;
1487 cl->cl_cvtmax = 0;
1488 cl->cl_cvtoff = 0; 1467 cl->cl_cvtoff = 0;
1489 cl->cl_pcvtoff = 0;
1490 cl->cl_vtperiod = 0; 1468 cl->cl_vtperiod = 0;
1491 cl->cl_parentperiod = 0; 1469 cl->cl_parentperiod = 0;
1492 cl->cl_f = 0; 1470 cl->cl_f = 0;
1493 cl->cl_myf = 0; 1471 cl->cl_myf = 0;
1494 cl->cl_myfadj = 0;
1495 cl->cl_cfmin = 0; 1472 cl->cl_cfmin = 0;
1496 cl->cl_nactive = 0; 1473 cl->cl_nactive = 0;
1497 1474
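
The hfsc change drops the per-class vtoff/cvtmax/pcvtoff bookkeeping: the parent now just remembers the largest virtual time any child reached (cl_cvtoff), and a child opening a new backlog period starts from that value. A toy sketch of the two sides of that bookkeeping follows; the types are illustrative and none of the service-curve math is included.

#include <stdint.h>

struct fake_class {
	uint64_t vt;		/* this class's virtual time */
	uint64_t cvtoff;	/* largest vt seen among the children */
	uint64_t cvtmin;	/* minimal vt among active children */
};

static void child_goes_active(struct fake_class *child,
			      struct fake_class *parent,
			      int first_child_of_new_period)
{
	if (first_child_of_new_period) {
		/* seed from the highest vt any sibling reached */
		child->vt = parent->cvtoff;
		parent->cvtmin = 0;
	}
}

static void child_goes_passive(struct fake_class *child,
			       struct fake_class *parent)
{
	if (child->vt > parent->cvtoff)
		parent->cvtoff = child->vt;
}
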
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 53dbfa187870..c798d0de8a9d 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -162,7 +162,7 @@ struct htb_sched {
162 struct work_struct work; 162 struct work_struct work;
163 163
164 /* non shaped skbs; let them go directly thru */ 164 /* non shaped skbs; let them go directly thru */
165 struct sk_buff_head direct_queue; 165 struct qdisc_skb_head direct_queue;
166 long direct_pkts; 166 long direct_pkts;
167 167
168 struct qdisc_watchdog watchdog; 168 struct qdisc_watchdog watchdog;
@@ -570,6 +570,22 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
570 list_del_init(&cl->un.leaf.drop_list); 570 list_del_init(&cl->un.leaf.drop_list);
571} 571}
572 572
573static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
574 struct qdisc_skb_head *qh)
575{
576 struct sk_buff *last = qh->tail;
577
578 if (last) {
579 skb->next = NULL;
580 last->next = skb;
581 qh->tail = skb;
582 } else {
583 qh->tail = skb;
584 qh->head = skb;
585 }
586 qh->qlen++;
587}
588
573static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, 589static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
574 struct sk_buff **to_free) 590 struct sk_buff **to_free)
575{ 591{
@@ -580,7 +596,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
580 if (cl == HTB_DIRECT) { 596 if (cl == HTB_DIRECT) {
581 /* enqueue to helper queue */ 597 /* enqueue to helper queue */
582 if (q->direct_queue.qlen < q->direct_qlen) { 598 if (q->direct_queue.qlen < q->direct_qlen) {
583 __skb_queue_tail(&q->direct_queue, skb); 599 htb_enqueue_tail(skb, sch, &q->direct_queue);
584 q->direct_pkts++; 600 q->direct_pkts++;
585 } else { 601 } else {
586 return qdisc_drop(skb, sch, to_free); 602 return qdisc_drop(skb, sch, to_free);
@@ -888,7 +904,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
888 unsigned long start_at; 904 unsigned long start_at;
889 905
890 /* try to dequeue direct packets as high prio (!) to minimize cpu work */ 906 /* try to dequeue direct packets as high prio (!) to minimize cpu work */
891 skb = __skb_dequeue(&q->direct_queue); 907 skb = __qdisc_dequeue_head(&q->direct_queue);
892 if (skb != NULL) { 908 if (skb != NULL) {
893ok: 909ok:
894 qdisc_bstats_update(sch, skb); 910 qdisc_bstats_update(sch, skb);
@@ -1019,7 +1035,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1019 1035
1020 qdisc_watchdog_init(&q->watchdog, sch); 1036 qdisc_watchdog_init(&q->watchdog, sch);
1021 INIT_WORK(&q->work, htb_work_func); 1037 INIT_WORK(&q->work, htb_work_func);
1022 __skb_queue_head_init(&q->direct_queue); 1038 qdisc_skb_head_init(&q->direct_queue);
1023 1039
1024 if (tb[TCA_HTB_DIRECT_QLEN]) 1040 if (tb[TCA_HTB_DIRECT_QLEN])
1025 q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); 1041 q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index b9439827c172..2bc8d7f8df16 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -88,7 +88,7 @@ static void mq_attach(struct Qdisc *sch)
88 qdisc_destroy(old); 88 qdisc_destroy(old);
89#ifdef CONFIG_NET_SCHED 89#ifdef CONFIG_NET_SCHED
90 if (ntx < dev->real_num_tx_queues) 90 if (ntx < dev->real_num_tx_queues)
91 qdisc_list_add(qdisc); 91 qdisc_hash_add(qdisc);
92#endif 92#endif
93 93
94 } 94 }
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 549c66359924..b5c502c78143 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -182,7 +182,7 @@ static void mqprio_attach(struct Qdisc *sch)
182 if (old) 182 if (old)
183 qdisc_destroy(old); 183 qdisc_destroy(old);
184 if (ntx < dev->real_num_tx_queues) 184 if (ntx < dev->real_num_tx_queues)
185 qdisc_list_add(qdisc); 185 qdisc_hash_add(qdisc);
186 } 186 }
187 kfree(priv->qdiscs); 187 kfree(priv->qdiscs);
188 priv->qdiscs = NULL; 188 priv->qdiscs = NULL;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index aaaf02175338..9f7b380cf0a3 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -413,6 +413,16 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
413 return segs; 413 return segs;
414} 414}
415 415
416static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
417{
418 skb->next = qh->head;
419
420 if (!qh->head)
421 qh->tail = skb;
422 qh->head = skb;
423 qh->qlen++;
424}
425
416/* 426/*
417 * Insert one skb into qdisc. 427 * Insert one skb into qdisc.
418 * Note: parent depends on return value to account for queue length. 428 * Note: parent depends on return value to account for queue length.
@@ -502,7 +512,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
502 1<<(prandom_u32() % 8); 512 1<<(prandom_u32() % 8);
503 } 513 }
504 514
505 if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) 515 if (unlikely(sch->q.qlen >= sch->limit))
506 return qdisc_drop(skb, sch, to_free); 516 return qdisc_drop(skb, sch, to_free);
507 517
508 qdisc_qstats_backlog_inc(sch, skb); 518 qdisc_qstats_backlog_inc(sch, skb);
@@ -522,8 +532,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
522 if (q->rate) { 532 if (q->rate) {
523 struct sk_buff *last; 533 struct sk_buff *last;
524 534
525 if (!skb_queue_empty(&sch->q)) 535 if (sch->q.qlen)
526 last = skb_peek_tail(&sch->q); 536 last = sch->q.tail;
527 else 537 else
528 last = netem_rb_to_skb(rb_last(&q->t_root)); 538 last = netem_rb_to_skb(rb_last(&q->t_root));
529 if (last) { 539 if (last) {
@@ -552,7 +562,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
552 cb->time_to_send = psched_get_time(); 562 cb->time_to_send = psched_get_time();
553 q->counter = 0; 563 q->counter = 0;
554 564
555 __skb_queue_head(&sch->q, skb); 565 netem_enqueue_skb_head(&sch->q, skb);
556 sch->qstats.requeues++; 566 sch->qstats.requeues++;
557 } 567 }
558 568
@@ -587,7 +597,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
587 struct rb_node *p; 597 struct rb_node *p;
588 598
589tfifo_dequeue: 599tfifo_dequeue:
590 skb = __skb_dequeue(&sch->q); 600 skb = __qdisc_dequeue_head(&sch->q);
591 if (skb) { 601 if (skb) {
592 qdisc_qstats_backlog_dec(sch, skb); 602 qdisc_qstats_backlog_dec(sch, skb);
593deliver: 603deliver:
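
netem keeps its tfifo in the same qdisc_skb_head, so the requeue path above needs the explicit head-insert helper netem_enqueue_skb_head(); it is the mirror image of the tail enqueue sketched after the sch_generic.c section. A stand-alone version of just that prepend, with illustrative types:

struct fake_skb2 {
	struct fake_skb2 *next;
};

struct fake_qh {
	struct fake_skb2 *head, *tail;
	unsigned int qlen;
};

static void enqueue_skb_head(struct fake_qh *qh, struct fake_skb2 *skb)
{
	skb->next = qh->head;
	if (!qh->head)
		qh->tail = skb;	/* queue was empty: skb is also the tail */
	qh->head = skb;
	qh->qlen++;
}
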
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index a570b0bb254c..5c3a99d6aa82 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -231,7 +231,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
231 /* Drop excess packets if new limit is lower */ 231 /* Drop excess packets if new limit is lower */
232 qlen = sch->q.qlen; 232 qlen = sch->q.qlen;
233 while (sch->q.qlen > sch->limit) { 233 while (sch->q.qlen > sch->limit) {
234 struct sk_buff *skb = __skb_dequeue(&sch->q); 234 struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
235 235
236 dropped += qdisc_pkt_len(skb); 236 dropped += qdisc_pkt_len(skb);
237 qdisc_qstats_backlog_dec(sch, skb); 237 qdisc_qstats_backlog_dec(sch, skb);
@@ -511,7 +511,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
511static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) 511static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
512{ 512{
513 struct sk_buff *skb; 513 struct sk_buff *skb;
514 skb = __qdisc_dequeue_head(sch, &sch->q); 514 skb = qdisc_dequeue_head(sch);
515 515
516 if (!skb) 516 if (!skb)
517 return NULL; 517 return NULL;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 1c23060c41a6..f10d3397f917 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1408,7 +1408,7 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
1408 transports) { 1408 transports) {
1409 if (t->pmtu_pending && t->dst) { 1409 if (t->pmtu_pending && t->dst) {
1410 sctp_transport_update_pmtu(sk, t, 1410 sctp_transport_update_pmtu(sk, t,
1411 WORD_TRUNC(dst_mtu(t->dst))); 1411 SCTP_TRUNC4(dst_mtu(t->dst)));
1412 t->pmtu_pending = 0; 1412 t->pmtu_pending = 0;
1413 } 1413 }
1414 if (!pmtu || (t->pathmtu < pmtu)) 1414 if (!pmtu || (t->pathmtu < pmtu))
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 912eb1685a5d..f99d4855d3de 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -48,7 +48,7 @@ static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = {
48 /* id 2 is reserved as well */ 48 /* id 2 is reserved as well */
49 .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, 49 .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2,
50 }, 50 },
51#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE) 51#if IS_ENABLED(CONFIG_CRYPTO_SHA256)
52 { 52 {
53 .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, 53 .hmac_id = SCTP_AUTH_HMAC_ID_SHA256,
54 .hmac_name = "hmac(sha256)", 54 .hmac_name = "hmac(sha256)",
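
The auth.c hunk is a pure cleanup: IS_ENABLED(CONFIG_CRYPTO_SHA256) is true whether the option is built in (=y) or a module (=m), so the open-coded defined()/defined(..._MODULE) pair goes away. The snippet below is a simplified, self-contained version of how such a macro can be built; the real one lives in <linux/kconfig.h>, CONFIG_DEMO_FEATURE is a made-up option used only for the demonstration, and (as in the kernel header) it relies on the compiler accepting an empty variadic argument.

#include <stdio.h>

#define CONFIG_DEMO_FEATURE_MODULE 1	/* pretend the option was set to =m */

#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(ignored, val, ...) val
#define __is_defined(x)		___is_defined(x)
#define ___is_defined(val)	____is_defined(__ARG_PLACEHOLDER_##val)
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)

/* true if the option is built in (=y) or built as a module (=m) */
#define IS_ENABLED(option) (__is_defined(option) || __is_defined(option##_MODULE))

int main(void)
{
	printf("CONFIG_DEMO_FEATURE enabled: %d\n",
	       IS_ENABLED(CONFIG_DEMO_FEATURE));
	return 0;
}
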
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 0a3dbec0a8fb..7a1cdf43e49d 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -70,6 +70,19 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp)
70 return msg; 70 return msg;
71} 71}
72 72
73void sctp_datamsg_free(struct sctp_datamsg *msg)
74{
75 struct sctp_chunk *chunk;
76
 77 /* This doesn't have to be a _safe variant because
78 * sctp_chunk_free() only drops the refs.
79 */
80 list_for_each_entry(chunk, &msg->chunks, frag_list)
81 sctp_chunk_free(chunk);
82
83 sctp_datamsg_put(msg);
84}
85
 73/* Final destruction of datamsg memory. */ 86/* Final destruction of datamsg memory. */
74static void sctp_datamsg_destroy(struct sctp_datamsg *msg) 87static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
75{ 88{
@@ -187,9 +200,10 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
187 /* This is the biggest possible DATA chunk that can fit into 200 /* This is the biggest possible DATA chunk that can fit into
188 * the packet 201 * the packet
189 */ 202 */
190 max_data = (asoc->pathmtu - 203 max_data = asoc->pathmtu -
191 sctp_sk(asoc->base.sk)->pf->af->net_header_len - 204 sctp_sk(asoc->base.sk)->pf->af->net_header_len -
192 sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk)) & ~3; 205 sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
206 max_data = SCTP_TRUNC4(max_data);
193 207
194 max = asoc->frag_point; 208 max = asoc->frag_point;
 195 /* If the peer requested that we authenticate DATA chunks 209 /* If the peer requested that we authenticate DATA chunks
@@ -200,8 +214,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
200 struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); 214 struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc);
201 215
202 if (hmac_desc) 216 if (hmac_desc)
203 max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) + 217 max_data -= SCTP_PAD4(sizeof(sctp_auth_chunk_t) +
204 hmac_desc->hmac_len); 218 hmac_desc->hmac_len);
205 } 219 }
206 220
207 /* Now, check if we need to reduce our max */ 221 /* Now, check if we need to reduce our max */
@@ -221,7 +235,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
221 asoc->outqueue.out_qlen == 0 && 235 asoc->outqueue.out_qlen == 0 &&
222 list_empty(&asoc->outqueue.retransmit) && 236 list_empty(&asoc->outqueue.retransmit) &&
223 msg_len > max) 237 msg_len > max)
224 max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t)); 238 max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
225 239
226 /* Encourage Cookie-ECHO bundling. */ 240 /* Encourage Cookie-ECHO bundling. */
227 if (asoc->state < SCTP_STATE_COOKIE_ECHOED) 241 if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
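
Throughout the sctp changes, WORD_ROUND()/WORD_TRUNC() become SCTP_PAD4()/SCTP_TRUNC4(). Assuming the usual ((s) + 3) & ~3 and (s) & ~3 definitions, the small program below shows both applied the way the max_data computation above uses them; the header sizes are representative numbers, not values taken from the kernel headers.

#include <stdio.h>

#define SCTP_PAD4(s)   (((s) + 3) & ~3)	/* round up to a 4-byte boundary */
#define SCTP_TRUNC4(s) ((s) & ~3)	/* round down to a 4-byte boundary */

int main(void)
{
	unsigned int pathmtu = 1500, ip_hdr = 20, sctp_hdr = 12, data_hdr = 16;
	unsigned int max_data = pathmtu - ip_hdr - sctp_hdr - data_hdr;

	/* largest payload that still keeps the DATA chunk 4-byte aligned */
	max_data = SCTP_TRUNC4(max_data);

	printf("max_data=%u, padded 17-byte chunk=%u\n",
	       max_data, SCTP_PAD4(17));
	return 0;
}
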
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 1555fb8c68e0..a2ea1d1cc06a 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -605,7 +605,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
605 /* PMTU discovery (RFC1191) */ 605 /* PMTU discovery (RFC1191) */
606 if (ICMP_FRAG_NEEDED == code) { 606 if (ICMP_FRAG_NEEDED == code) {
607 sctp_icmp_frag_needed(sk, asoc, transport, 607 sctp_icmp_frag_needed(sk, asoc, transport,
608 WORD_TRUNC(info)); 608 SCTP_TRUNC4(info));
609 goto out_unlock; 609 goto out_unlock;
610 } else { 610 } else {
611 if (ICMP_PROT_UNREACH == code) { 611 if (ICMP_PROT_UNREACH == code) {
@@ -673,7 +673,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb)
673 if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) 673 if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
674 break; 674 break;
675 675
676 ch_end = offset + WORD_ROUND(ntohs(ch->length)); 676 ch_end = offset + SCTP_PAD4(ntohs(ch->length));
677 if (ch_end > skb->len) 677 if (ch_end > skb->len)
678 break; 678 break;
679 679
@@ -1128,7 +1128,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net,
1128 if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) 1128 if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
1129 break; 1129 break;
1130 1130
1131 ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); 1131 ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
1132 if (ch_end > skb_tail_pointer(skb)) 1132 if (ch_end > skb_tail_pointer(skb))
1133 break; 1133 break;
1134 1134
@@ -1197,7 +1197,7 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
1197 * that the chunk length doesn't cause overflow. Otherwise, we'll 1197 * that the chunk length doesn't cause overflow. Otherwise, we'll
1198 * walk off the end. 1198 * walk off the end.
1199 */ 1199 */
1200 if (WORD_ROUND(ntohs(ch->length)) > skb->len) 1200 if (SCTP_PAD4(ntohs(ch->length)) > skb->len)
1201 return NULL; 1201 return NULL;
1202 1202
1203 /* If this is INIT/INIT-ACK look inside the chunk too. */ 1203 /* If this is INIT/INIT-ACK look inside the chunk too. */
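
The input.c hunks all touch the same defensive pattern: walk the chunks of a packet by their 4-byte padded length and bail out before reading past the end of the buffer. A self-contained sketch of that walk follows; struct chunkhdr and walk_chunks() are illustrative, and only the padding and bounds logic mirrors the kernel code.

#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>

#define SCTP_PAD4(s) (((s) + 3) & ~3)

struct chunkhdr {
	uint8_t  type;
	uint8_t  flags;
	uint16_t length;	/* network byte order, unpadded */
} __attribute__((packed));

static int walk_chunks(const uint8_t *buf, size_t len)
{
	size_t offset = 0;

	while (offset + sizeof(struct chunkhdr) <= len) {
		const struct chunkhdr *ch = (const void *)(buf + offset);
		size_t clen = ntohs(ch->length);
		size_t ch_end;

		if (clen < sizeof(struct chunkhdr))
			return -1;		/* malformed: too short */
		ch_end = offset + SCTP_PAD4(clen);
		if (ch_end > len)
			return -1;		/* would walk off the end */
		/* ... process chunk ch here ... */
		offset = ch_end;
	}
	return 0;
}
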
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 6437aa97cfd7..f731de3e8428 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -213,7 +213,7 @@ new_skb:
213 } 213 }
214 214
215 chunk->chunk_hdr = ch; 215 chunk->chunk_hdr = ch;
216 chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); 216 chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
217 skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); 217 skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t));
218 chunk->subh.v = NULL; /* Subheader is no longer valid. */ 218 chunk->subh.v = NULL; /* Subheader is no longer valid. */
219 219
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 31b7bc35895d..2a5c1896d18f 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -180,7 +180,6 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
180 int one_packet, gfp_t gfp) 180 int one_packet, gfp_t gfp)
181{ 181{
182 sctp_xmit_t retval; 182 sctp_xmit_t retval;
183 int error = 0;
184 183
185 pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__, 184 pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__,
186 packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); 185 packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1);
@@ -188,6 +187,8 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
188 switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { 187 switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
189 case SCTP_XMIT_PMTU_FULL: 188 case SCTP_XMIT_PMTU_FULL:
190 if (!packet->has_cookie_echo) { 189 if (!packet->has_cookie_echo) {
190 int error = 0;
191
191 error = sctp_packet_transmit(packet, gfp); 192 error = sctp_packet_transmit(packet, gfp);
192 if (error < 0) 193 if (error < 0)
193 chunk->skb->sk->sk_err = -error; 194 chunk->skb->sk->sk_err = -error;
@@ -296,7 +297,7 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet,
296 struct sctp_chunk *chunk) 297 struct sctp_chunk *chunk)
297{ 298{
298 sctp_xmit_t retval = SCTP_XMIT_OK; 299 sctp_xmit_t retval = SCTP_XMIT_OK;
299 __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length)); 300 __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length));
300 301
301 /* Check to see if this chunk will fit into the packet */ 302 /* Check to see if this chunk will fit into the packet */
302 retval = sctp_packet_will_fit(packet, chunk, chunk_len); 303 retval = sctp_packet_will_fit(packet, chunk, chunk_len);
@@ -441,14 +442,14 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
441 * time. Application may notice this error. 442 * time. Application may notice this error.
442 */ 443 */
443 pr_err_once("Trying to GSO but underlying device doesn't support it."); 444 pr_err_once("Trying to GSO but underlying device doesn't support it.");
444 goto nomem; 445 goto err;
445 } 446 }
446 } else { 447 } else {
447 pkt_size = packet->size; 448 pkt_size = packet->size;
448 } 449 }
449 head = alloc_skb(pkt_size + MAX_HEADER, gfp); 450 head = alloc_skb(pkt_size + MAX_HEADER, gfp);
450 if (!head) 451 if (!head)
451 goto nomem; 452 goto err;
452 if (gso) { 453 if (gso) {
453 NAPI_GRO_CB(head)->last = head; 454 NAPI_GRO_CB(head)->last = head;
454 skb_shinfo(head)->gso_type = sk->sk_gso_type; 455 skb_shinfo(head)->gso_type = sk->sk_gso_type;
@@ -469,8 +470,12 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
469 } 470 }
470 } 471 }
471 dst = dst_clone(tp->dst); 472 dst = dst_clone(tp->dst);
472 if (!dst) 473 if (!dst) {
473 goto no_route; 474 if (asoc)
475 IP_INC_STATS(sock_net(asoc->base.sk),
476 IPSTATS_MIB_OUTNOROUTES);
477 goto nodst;
478 }
474 skb_dst_set(head, dst); 479 skb_dst_set(head, dst);
475 480
476 /* Build the SCTP header. */ 481 /* Build the SCTP header. */
@@ -503,7 +508,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
503 if (gso) { 508 if (gso) {
504 pkt_size = packet->overhead; 509 pkt_size = packet->overhead;
505 list_for_each_entry(chunk, &packet->chunk_list, list) { 510 list_for_each_entry(chunk, &packet->chunk_list, list) {
506 int padded = WORD_ROUND(chunk->skb->len); 511 int padded = SCTP_PAD4(chunk->skb->len);
507 512
508 if (pkt_size + padded > tp->pathmtu) 513 if (pkt_size + padded > tp->pathmtu)
509 break; 514 break;
@@ -533,7 +538,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
533 * included in the chunk length field. The sender should 538 * included in the chunk length field. The sender should
534 * never pad with more than 3 bytes. 539 * never pad with more than 3 bytes.
535 * 540 *
536 * [This whole comment explains WORD_ROUND() below.] 541 * [This whole comment explains SCTP_PAD4() below.]
537 */ 542 */
538 543
539 pkt_size -= packet->overhead; 544 pkt_size -= packet->overhead;
@@ -555,7 +560,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
555 has_data = 1; 560 has_data = 1;
556 } 561 }
557 562
558 padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; 563 padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len;
559 if (padding) 564 if (padding)
560 memset(skb_put(chunk->skb, padding), 0, padding); 565 memset(skb_put(chunk->skb, padding), 0, padding);
561 566
@@ -582,7 +587,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
582 * acknowledged or have failed. 587 * acknowledged or have failed.
583 * Re-queue auth chunks if needed. 588 * Re-queue auth chunks if needed.
584 */ 589 */
585 pkt_size -= WORD_ROUND(chunk->skb->len); 590 pkt_size -= SCTP_PAD4(chunk->skb->len);
586 591
587 if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) 592 if (!sctp_chunk_is_data(chunk) && chunk != packet->auth)
588 sctp_chunk_free(chunk); 593 sctp_chunk_free(chunk);
@@ -621,8 +626,10 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
621 if (!gso) 626 if (!gso)
622 break; 627 break;
623 628
624 if (skb_gro_receive(&head, nskb)) 629 if (skb_gro_receive(&head, nskb)) {
630 kfree_skb(nskb);
625 goto nomem; 631 goto nomem;
632 }
626 nskb = NULL; 633 nskb = NULL;
627 if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >= 634 if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
628 sk->sk_gso_max_segs)) 635 sk->sk_gso_max_segs))
@@ -716,18 +723,13 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
716 } 723 }
717 head->ignore_df = packet->ipfragok; 724 head->ignore_df = packet->ipfragok;
718 tp->af_specific->sctp_xmit(head, tp); 725 tp->af_specific->sctp_xmit(head, tp);
726 goto out;
719 727
720out: 728nomem:
721 sctp_packet_reset(packet); 729 if (packet->auth && list_empty(&packet->auth->list))
722 return err; 730 sctp_chunk_free(packet->auth);
723no_route:
724 kfree_skb(head);
725 if (nskb != head)
726 kfree_skb(nskb);
727
728 if (asoc)
729 IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES);
730 731
732nodst:
 731 /* FIXME: Returning the 'err' will affect all the associations 733 /* FIXME: Returning the 'err' will affect all the associations
732 * associated with a socket, although only one of the paths of the 734 * associated with a socket, although only one of the paths of the
733 * association is unreachable. 735 * association is unreachable.
@@ -736,22 +738,18 @@ no_route:
736 * required. 738 * required.
737 */ 739 */
738 /* err = -EHOSTUNREACH; */ 740 /* err = -EHOSTUNREACH; */
739err: 741 kfree_skb(head);
740 /* Control chunks are unreliable so just drop them. DATA chunks
741 * will get resent or dropped later.
742 */
743 742
743err:
744 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { 744 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
745 list_del_init(&chunk->list); 745 list_del_init(&chunk->list);
746 if (!sctp_chunk_is_data(chunk)) 746 if (!sctp_chunk_is_data(chunk))
747 sctp_chunk_free(chunk); 747 sctp_chunk_free(chunk);
748 } 748 }
749 goto out; 749
750nomem: 750out:
751 if (packet->auth && list_empty(&packet->auth->list)) 751 sctp_packet_reset(packet);
752 sctp_chunk_free(packet->auth); 752 return err;
753 err = -ENOMEM;
754 goto err;
755} 753}
756 754
757/******************************************************************** 755/********************************************************************
@@ -913,7 +911,7 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
913 */ 911 */
914 maxsize = pmtu - packet->overhead; 912 maxsize = pmtu - packet->overhead;
915 if (packet->auth) 913 if (packet->auth)
916 maxsize -= WORD_ROUND(packet->auth->skb->len); 914 maxsize -= SCTP_PAD4(packet->auth->skb->len);
917 if (chunk_len > maxsize) 915 if (chunk_len > maxsize)
918 retval = SCTP_XMIT_PMTU_FULL; 916 retval = SCTP_XMIT_PMTU_FULL;
919 917
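
The larger output.c rework above reorders the unwind so that the success path jumps over the error labels and each failure site enters the ladder at the point matching what it has already allocated: nomem falls through nodst to err, and everything converges on out, which resets the packet and returns. Below is a generic stand-alone sketch of that goto ladder; the names and the malloc stand-in are illustrative, not the sctp code.

#include <errno.h>
#include <stdlib.h>

/* want_route/want_mem just force the two failure branches for illustration */
static int fake_transmit(int want_route, int want_mem)
{
	void *head;
	int err = 0;

	head = malloc(64);		/* stands in for alloc_skb() */
	if (!head) {
		err = -ENOMEM;
		goto err;		/* nothing else allocated yet */
	}

	if (!want_route)
		goto nodst;		/* head exists, but there is no route */

	if (!want_mem) {
		err = -ENOMEM;
		goto nomem;		/* extra cleanup, then the shared path */
	}

	free(head);			/* stands in for handing head to the lower layer */
	goto out;

nomem:
	/* release resources only the no-memory path owns (packet->auth above) */
nodst:
	free(head);
err:
	/* drop whatever is still queued (the chunk_list loop above) */
out:
	/* final reset (sctp_packet_reset() above) and the single return point */
	return err;
}
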
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 107233da5cc9..582585393d35 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q,
68 68
69static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); 69static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
70 70
71static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); 71static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
72 72
73/* Add data to the front of the queue. */ 73/* Add data to the front of the queue. */
74static inline void sctp_outq_head_data(struct sctp_outq *q, 74static inline void sctp_outq_head_data(struct sctp_outq *q,
@@ -285,10 +285,9 @@ void sctp_outq_free(struct sctp_outq *q)
285} 285}
286 286
287/* Put a new chunk in an sctp_outq. */ 287/* Put a new chunk in an sctp_outq. */
288int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) 288void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
289{ 289{
290 struct net *net = sock_net(q->asoc->base.sk); 290 struct net *net = sock_net(q->asoc->base.sk);
291 int error = 0;
292 291
293 pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, 292 pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk,
294 chunk && chunk->chunk_hdr ? 293 chunk && chunk->chunk_hdr ?
@@ -299,54 +298,26 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
299 * immediately. 298 * immediately.
300 */ 299 */
301 if (sctp_chunk_is_data(chunk)) { 300 if (sctp_chunk_is_data(chunk)) {
302 /* Is it OK to queue data chunks? */ 301 pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n",
303 /* From 9. Termination of Association 302 __func__, q, chunk, chunk && chunk->chunk_hdr ?
304 * 303 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
305 * When either endpoint performs a shutdown, the 304 "illegal chunk");
306 * association on each peer will stop accepting new 305
307 * data from its user and only deliver data in queue 306 sctp_outq_tail_data(q, chunk);
308 * at the time of sending or receiving the SHUTDOWN 307 if (chunk->asoc->peer.prsctp_capable &&
309 * chunk. 308 SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
310 */ 309 chunk->asoc->sent_cnt_removable++;
311 switch (q->asoc->state) { 310 if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
312 case SCTP_STATE_CLOSED: 311 SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
313 case SCTP_STATE_SHUTDOWN_PENDING: 312 else
314 case SCTP_STATE_SHUTDOWN_SENT: 313 SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS);
315 case SCTP_STATE_SHUTDOWN_RECEIVED:
316 case SCTP_STATE_SHUTDOWN_ACK_SENT:
317 /* Cannot send after transport endpoint shutdown */
318 error = -ESHUTDOWN;
319 break;
320
321 default:
322 pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n",
323 __func__, q, chunk, chunk && chunk->chunk_hdr ?
324 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
325 "illegal chunk");
326
327 sctp_chunk_hold(chunk);
328 sctp_outq_tail_data(q, chunk);
329 if (chunk->asoc->peer.prsctp_capable &&
330 SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
331 chunk->asoc->sent_cnt_removable++;
332 if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
333 SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
334 else
335 SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS);
336 break;
337 }
338 } else { 314 } else {
339 list_add_tail(&chunk->list, &q->control_chunk_list); 315 list_add_tail(&chunk->list, &q->control_chunk_list);
340 SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); 316 SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
341 } 317 }
342 318
343 if (error < 0)
344 return error;
345
346 if (!q->cork) 319 if (!q->cork)
347 error = sctp_outq_flush(q, 0, gfp); 320 sctp_outq_flush(q, 0, gfp);
348
349 return error;
350} 321}
351 322
352/* Insert a chunk into the sorted list based on the TSNs. The retransmit list 323/* Insert a chunk into the sorted list based on the TSNs. The retransmit list
@@ -559,7 +530,6 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
559 sctp_retransmit_reason_t reason) 530 sctp_retransmit_reason_t reason)
560{ 531{
561 struct net *net = sock_net(q->asoc->base.sk); 532 struct net *net = sock_net(q->asoc->base.sk);
562 int error = 0;
563 533
564 switch (reason) { 534 switch (reason) {
565 case SCTP_RTXR_T3_RTX: 535 case SCTP_RTXR_T3_RTX:
@@ -603,10 +573,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
603 * will be flushed at the end. 573 * will be flushed at the end.
604 */ 574 */
605 if (reason != SCTP_RTXR_FAST_RTX) 575 if (reason != SCTP_RTXR_FAST_RTX)
606 error = sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC); 576 sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC);
607
608 if (error)
609 q->asoc->base.sk->sk_err = -error;
610} 577}
611 578
612/* 579/*
@@ -778,12 +745,12 @@ redo:
778} 745}
779 746
780/* Uncork the outqueue so queued chunks are really sent. */	747/* Uncork the outqueue so queued chunks are really sent. */
781int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) 748void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
782{ 749{
783 if (q->cork) 750 if (q->cork)
784 q->cork = 0; 751 q->cork = 0;
785 752
786 return sctp_outq_flush(q, 0, gfp); 753 sctp_outq_flush(q, 0, gfp);
787} 754}
788 755
789 756
@@ -796,7 +763,7 @@ int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
796 * locking concerns must be made. Today we use the sock lock to protect 763 * locking concerns must be made. Today we use the sock lock to protect
797 * this function. 764 * this function.
798 */ 765 */
799static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) 766static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
800{ 767{
801 struct sctp_packet *packet; 768 struct sctp_packet *packet;
802 struct sctp_packet singleton; 769 struct sctp_packet singleton;
@@ -919,8 +886,10 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
919 sctp_packet_config(&singleton, vtag, 0); 886 sctp_packet_config(&singleton, vtag, 0);
920 sctp_packet_append_chunk(&singleton, chunk); 887 sctp_packet_append_chunk(&singleton, chunk);
921 error = sctp_packet_transmit(&singleton, gfp); 888 error = sctp_packet_transmit(&singleton, gfp);
922 if (error < 0) 889 if (error < 0) {
923 return error; 890 asoc->base.sk->sk_err = -error;
891 return;
892 }
924 break; 893 break;
925 894
926 case SCTP_CID_ABORT: 895 case SCTP_CID_ABORT:
@@ -1018,6 +987,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
1018 retran: 987 retran:
1019 error = sctp_outq_flush_rtx(q, packet, 988 error = sctp_outq_flush_rtx(q, packet,
1020 rtx_timeout, &start_timer); 989 rtx_timeout, &start_timer);
990 if (error < 0)
991 asoc->base.sk->sk_err = -error;
1021 992
1022 if (start_timer) { 993 if (start_timer) {
1023 sctp_transport_reset_t3_rtx(transport); 994 sctp_transport_reset_t3_rtx(transport);
@@ -1192,14 +1163,15 @@ sctp_flush_out:
1192 struct sctp_transport, 1163 struct sctp_transport,
1193 send_ready); 1164 send_ready);
1194 packet = &t->packet; 1165 packet = &t->packet;
1195 if (!sctp_packet_empty(packet)) 1166 if (!sctp_packet_empty(packet)) {
1196 error = sctp_packet_transmit(packet, gfp); 1167 error = sctp_packet_transmit(packet, gfp);
1168 if (error < 0)
1169 asoc->base.sk->sk_err = -error;
1170 }
1197 1171
1198 /* Clear the burst limited state, if any */ 1172 /* Clear the burst limited state, if any */
1199 sctp_transport_burst_reset(t); 1173 sctp_transport_burst_reset(t);
1200 } 1174 }
1201
1202 return error;
1203} 1175}
1204 1176
1205/* Update unack_data based on the incoming SACK chunk */ 1177/* Update unack_data based on the incoming SACK chunk */
@@ -1747,7 +1719,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
1747{ 1719{
1748 int i; 1720 int i;
1749 sctp_sack_variable_t *frags; 1721 sctp_sack_variable_t *frags;
1750 __u16 gap; 1722 __u16 tsn_offset, blocks;
1751 __u32 ctsn = ntohl(sack->cum_tsn_ack); 1723 __u32 ctsn = ntohl(sack->cum_tsn_ack);
1752 1724
1753 if (TSN_lte(tsn, ctsn)) 1725 if (TSN_lte(tsn, ctsn))
@@ -1766,10 +1738,11 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
1766 */ 1738 */
1767 1739
1768 frags = sack->variable; 1740 frags = sack->variable;
1769 gap = tsn - ctsn; 1741 blocks = ntohs(sack->num_gap_ack_blocks);
1770 for (i = 0; i < ntohs(sack->num_gap_ack_blocks); ++i) { 1742 tsn_offset = tsn - ctsn;
1771 if (TSN_lte(ntohs(frags[i].gab.start), gap) && 1743 for (i = 0; i < blocks; ++i) {
1772 TSN_lte(gap, ntohs(frags[i].gab.end))) 1744 if (tsn_offset >= ntohs(frags[i].gab.start) &&
1745 tsn_offset <= ntohs(frags[i].gab.end))
1773 goto pass; 1746 goto pass;
1774 } 1747 }
1775 1748
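The rewritten loop above decides whether a TSN beyond the cumulative ack point is covered by one of the SACK's gap ack blocks; block boundaries are 16-bit offsets relative to ctsn, so a plain unsigned comparison on the offset is sufficient. A standalone sketch of that check, with illustrative names that are not the kernel's:

/* Sketch: does the TSN fall inside any reported gap ack block?
 * starts[]/ends[] hold the 16-bit offsets from the cumulative TSN ack
 * point, as carried in the SACK chunk (already converted to host order).
 */
static int tsn_in_gap_blocks(__u32 tsn, __u32 ctsn,
			     const __u16 *starts, const __u16 *ends,
			     __u16 blocks)
{
	__u16 tsn_offset = tsn - ctsn;
	__u16 i;

	for (i = 0; i < blocks; i++)
		if (tsn_offset >= starts[i] && tsn_offset <= ends[i])
			return 1;	/* acked via a gap ack block */
	return 0;
}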
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index ef8ba77a5bea..206377fe91ec 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -73,13 +73,17 @@ static const struct snmp_mib sctp_snmp_list[] = {
73/* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */ 73/* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */
74static int sctp_snmp_seq_show(struct seq_file *seq, void *v) 74static int sctp_snmp_seq_show(struct seq_file *seq, void *v)
75{ 75{
76 unsigned long buff[SCTP_MIB_MAX];
76 struct net *net = seq->private; 77 struct net *net = seq->private;
77 int i; 78 int i;
78 79
79 for (i = 0; sctp_snmp_list[i].name != NULL; i++) 80 memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX);
81
82 snmp_get_cpu_field_batch(buff, sctp_snmp_list,
83 net->sctp.sctp_statistics);
84 for (i = 0; sctp_snmp_list[i].name; i++)
80 seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, 85 seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name,
81 snmp_fold_field(net->sctp.sctp_statistics, 86 buff[i]);
82 sctp_snmp_list[i].entry));
83 87
84 return 0; 88 return 0;
85} 89}
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index cef0cee182d4..048954eee984 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -106,7 +106,8 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
106 const struct inet_diag_req_v2 *req, 106 const struct inet_diag_req_v2 *req,
107 struct user_namespace *user_ns, 107 struct user_namespace *user_ns,
108 int portid, u32 seq, u16 nlmsg_flags, 108 int portid, u32 seq, u16 nlmsg_flags,
109 const struct nlmsghdr *unlh) 109 const struct nlmsghdr *unlh,
110 bool net_admin)
110{ 111{
111 struct sctp_endpoint *ep = sctp_sk(sk)->ep; 112 struct sctp_endpoint *ep = sctp_sk(sk)->ep;
112 struct list_head *addr_list; 113 struct list_head *addr_list;
@@ -133,7 +134,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
133 r->idiag_retrans = 0; 134 r->idiag_retrans = 0;
134 } 135 }
135 136
136 if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) 137 if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
137 goto errout; 138 goto errout;
138 139
139 if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) { 140 if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) {
@@ -203,6 +204,7 @@ struct sctp_comm_param {
203 struct netlink_callback *cb; 204 struct netlink_callback *cb;
204 const struct inet_diag_req_v2 *r; 205 const struct inet_diag_req_v2 *r;
205 const struct nlmsghdr *nlh; 206 const struct nlmsghdr *nlh;
207 bool net_admin;
206}; 208};
207 209
208static size_t inet_assoc_attr_size(struct sctp_association *asoc) 210static size_t inet_assoc_attr_size(struct sctp_association *asoc)
@@ -219,6 +221,7 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc)
219 + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ 221 + nla_total_size(1) /* INET_DIAG_SHUTDOWN */
220 + nla_total_size(1) /* INET_DIAG_TOS */ 222 + nla_total_size(1) /* INET_DIAG_TOS */
221 + nla_total_size(1) /* INET_DIAG_TCLASS */ 223 + nla_total_size(1) /* INET_DIAG_TCLASS */
224 + nla_total_size(4) /* INET_DIAG_MARK */
222 + nla_total_size(addrlen * asoc->peer.transport_count) 225 + nla_total_size(addrlen * asoc->peer.transport_count)
223 + nla_total_size(addrlen * addrcnt) 226 + nla_total_size(addrlen * addrcnt)
224 + nla_total_size(sizeof(struct inet_diag_meminfo)) 227 + nla_total_size(sizeof(struct inet_diag_meminfo))
@@ -256,7 +259,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p)
256 err = inet_sctp_diag_fill(sk, assoc, rep, req, 259 err = inet_sctp_diag_fill(sk, assoc, rep, req,
257 sk_user_ns(NETLINK_CB(in_skb).sk), 260 sk_user_ns(NETLINK_CB(in_skb).sk),
258 NETLINK_CB(in_skb).portid, 261 NETLINK_CB(in_skb).portid,
259 nlh->nlmsg_seq, 0, nlh); 262 nlh->nlmsg_seq, 0, nlh,
263 commp->net_admin);
260 release_sock(sk); 264 release_sock(sk);
261 if (err < 0) { 265 if (err < 0) {
262 WARN_ON(err == -EMSGSIZE); 266 WARN_ON(err == -EMSGSIZE);
@@ -299,7 +303,8 @@ static int sctp_sock_dump(struct sock *sk, void *p)
299 sk_user_ns(NETLINK_CB(cb->skb).sk), 303 sk_user_ns(NETLINK_CB(cb->skb).sk),
300 NETLINK_CB(cb->skb).portid, 304 NETLINK_CB(cb->skb).portid,
301 cb->nlh->nlmsg_seq, 305 cb->nlh->nlmsg_seq,
302 NLM_F_MULTI, cb->nlh) < 0) { 306 NLM_F_MULTI, cb->nlh,
307 commp->net_admin) < 0) {
303 cb->args[3] = 1; 308 cb->args[3] = 1;
304 err = 1; 309 err = 1;
305 goto release; 310 goto release;
@@ -309,7 +314,8 @@ static int sctp_sock_dump(struct sock *sk, void *p)
309 if (inet_sctp_diag_fill(sk, assoc, skb, r, 314 if (inet_sctp_diag_fill(sk, assoc, skb, r,
310 sk_user_ns(NETLINK_CB(cb->skb).sk), 315 sk_user_ns(NETLINK_CB(cb->skb).sk),
311 NETLINK_CB(cb->skb).portid, 316 NETLINK_CB(cb->skb).portid,
312 cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) { 317 cb->nlh->nlmsg_seq, 0, cb->nlh,
318 commp->net_admin) < 0) {
313 err = 1; 319 err = 1;
314 goto release; 320 goto release;
315 } 321 }
@@ -389,7 +395,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
389 sk_user_ns(NETLINK_CB(cb->skb).sk), 395 sk_user_ns(NETLINK_CB(cb->skb).sk),
390 NETLINK_CB(cb->skb).portid, 396 NETLINK_CB(cb->skb).portid,
391 cb->nlh->nlmsg_seq, NLM_F_MULTI, 397 cb->nlh->nlmsg_seq, NLM_F_MULTI,
392 cb->nlh) < 0) { 398 cb->nlh, commp->net_admin) < 0) {
393 err = 2; 399 err = 2;
394 goto out; 400 goto out;
395 } 401 }
@@ -426,6 +432,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb,
426 .skb = in_skb, 432 .skb = in_skb,
427 .r = req, 433 .r = req,
428 .nlh = nlh, 434 .nlh = nlh,
435 .net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN),
429 }; 436 };
430 437
431 if (req->sdiag_family == AF_INET) { 438 if (req->sdiag_family == AF_INET) {
@@ -461,6 +468,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
461 .skb = skb, 468 .skb = skb,
462 .cb = cb, 469 .cb = cb,
463 .r = r, 470 .r = r,
471 .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN),
464 }; 472 };
465 473
466 /* eps hashtable dumps 474 /* eps hashtable dumps
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 46ffecc57214..9e9690b7afe1 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -253,7 +253,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
253 num_types = sp->pf->supported_addrs(sp, types); 253 num_types = sp->pf->supported_addrs(sp, types);
254 254
255 chunksize = sizeof(init) + addrs_len; 255 chunksize = sizeof(init) + addrs_len;
256 chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types)); 256 chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types));
257 chunksize += sizeof(ecap_param); 257 chunksize += sizeof(ecap_param);
258 258
259 if (asoc->prsctp_enable) 259 if (asoc->prsctp_enable)
@@ -283,14 +283,14 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
283 /* Add HMACS parameter length if any were defined */ 283 /* Add HMACS parameter length if any were defined */
284 auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; 284 auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
285 if (auth_hmacs->length) 285 if (auth_hmacs->length)
286 chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); 286 chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
287 else 287 else
288 auth_hmacs = NULL; 288 auth_hmacs = NULL;
289 289
290 /* Add CHUNKS parameter length */ 290 /* Add CHUNKS parameter length */
291 auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; 291 auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
292 if (auth_chunks->length) 292 if (auth_chunks->length)
293 chunksize += WORD_ROUND(ntohs(auth_chunks->length)); 293 chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
294 else 294 else
295 auth_chunks = NULL; 295 auth_chunks = NULL;
296 296
@@ -300,8 +300,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
300 300
301 /* If we have any extensions to report, account for that */ 301 /* If we have any extensions to report, account for that */
302 if (num_ext) 302 if (num_ext)
303 chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + 303 chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) +
304 num_ext); 304 num_ext);
305 305
306 /* RFC 2960 3.3.2 Initiation (INIT) (1) 306 /* RFC 2960 3.3.2 Initiation (INIT) (1)
307 * 307 *
@@ -443,13 +443,13 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
443 443
444 auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; 444 auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
445 if (auth_hmacs->length) 445 if (auth_hmacs->length)
446 chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); 446 chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
447 else 447 else
448 auth_hmacs = NULL; 448 auth_hmacs = NULL;
449 449
450 auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; 450 auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
451 if (auth_chunks->length) 451 if (auth_chunks->length)
452 chunksize += WORD_ROUND(ntohs(auth_chunks->length)); 452 chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
453 else 453 else
454 auth_chunks = NULL; 454 auth_chunks = NULL;
455 455
@@ -458,8 +458,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
458 } 458 }
459 459
460 if (num_ext) 460 if (num_ext)
461 chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + 461 chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) +
462 num_ext); 462 num_ext);
463 463
464 /* Now allocate and fill out the chunk. */ 464 /* Now allocate and fill out the chunk. */
465 retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); 465 retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp);
@@ -1375,7 +1375,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
1375 struct sock *sk; 1375 struct sock *sk;
1376 1376
1377 /* No need to allocate LL here, as this is only a chunk. */ 1377 /* No need to allocate LL here, as this is only a chunk. */
1378 skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp); 1378 skb = alloc_skb(SCTP_PAD4(sizeof(sctp_chunkhdr_t) + paylen), gfp);
1379 if (!skb) 1379 if (!skb)
1380 goto nodata; 1380 goto nodata;
1381 1381
@@ -1467,7 +1467,7 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
1467 void *target; 1467 void *target;
1468 void *padding; 1468 void *padding;
1469 int chunklen = ntohs(chunk->chunk_hdr->length); 1469 int chunklen = ntohs(chunk->chunk_hdr->length);
1470 int padlen = WORD_ROUND(chunklen) - chunklen; 1470 int padlen = SCTP_PAD4(chunklen) - chunklen;
1471 1471
1472 padding = skb_put(chunk->skb, padlen); 1472 padding = skb_put(chunk->skb, padlen);
1473 target = skb_put(chunk->skb, len); 1473 target = skb_put(chunk->skb, len);
@@ -1885,7 +1885,7 @@ static int sctp_process_missing_param(const struct sctp_association *asoc,
1885 struct __sctp_missing report; 1885 struct __sctp_missing report;
1886 __u16 len; 1886 __u16 len;
1887 1887
1888 len = WORD_ROUND(sizeof(report)); 1888 len = SCTP_PAD4(sizeof(report));
1889 1889
1890 /* Make an ERROR chunk, preparing enough room for 1890 /* Make an ERROR chunk, preparing enough room for
1891 * returning multiple unknown parameters. 1891 * returning multiple unknown parameters.
@@ -2083,9 +2083,9 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc,
2083 2083
2084 if (*errp) { 2084 if (*errp) {
2085 if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, 2085 if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
2086 WORD_ROUND(ntohs(param.p->length)))) 2086 SCTP_PAD4(ntohs(param.p->length))))
2087 sctp_addto_chunk_fixed(*errp, 2087 sctp_addto_chunk_fixed(*errp,
2088 WORD_ROUND(ntohs(param.p->length)), 2088 SCTP_PAD4(ntohs(param.p->length)),
2089 param.v); 2089 param.v);
2090 } else { 2090 } else {
2091 /* If there is no memory for generating the ERROR 2091 /* If there is no memory for generating the ERROR
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 12d45193357c..c345bf153bed 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1020,19 +1020,13 @@ static void sctp_cmd_t1_timer_update(struct sctp_association *asoc,
1020 * This way the whole message is queued up and bundling is	1020 * This way the whole message is queued up and bundling is
1021 * encouraged for small fragments. 1021 * encouraged for small fragments.
1022 */ 1022 */
1023static int sctp_cmd_send_msg(struct sctp_association *asoc, 1023static void sctp_cmd_send_msg(struct sctp_association *asoc,
1024 struct sctp_datamsg *msg, gfp_t gfp) 1024 struct sctp_datamsg *msg, gfp_t gfp)
1025{ 1025{
1026 struct sctp_chunk *chunk; 1026 struct sctp_chunk *chunk;
1027 int error = 0;
1028
1029 list_for_each_entry(chunk, &msg->chunks, frag_list) {
1030 error = sctp_outq_tail(&asoc->outqueue, chunk, gfp);
1031 if (error)
1032 break;
1033 }
1034 1027
1035 return error; 1028 list_for_each_entry(chunk, &msg->chunks, frag_list)
1029 sctp_outq_tail(&asoc->outqueue, chunk, gfp);
1036} 1030}
1037 1031
1038 1032
@@ -1427,8 +1421,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1427 local_cork = 1; 1421 local_cork = 1;
1428 } 1422 }
1429 /* Send a chunk to our peer. */ 1423 /* Send a chunk to our peer. */
1430 error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, 1424 sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, gfp);
1431 gfp);
1432 break; 1425 break;
1433 1426
1434 case SCTP_CMD_SEND_PKT: 1427 case SCTP_CMD_SEND_PKT:
@@ -1682,7 +1675,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1682 case SCTP_CMD_FORCE_PRIM_RETRAN: 1675 case SCTP_CMD_FORCE_PRIM_RETRAN:
1683 t = asoc->peer.retran_path; 1676 t = asoc->peer.retran_path;
1684 asoc->peer.retran_path = asoc->peer.primary_path; 1677 asoc->peer.retran_path = asoc->peer.primary_path;
1685 error = sctp_outq_uncork(&asoc->outqueue, gfp); 1678 sctp_outq_uncork(&asoc->outqueue, gfp);
1686 local_cork = 0; 1679 local_cork = 0;
1687 asoc->peer.retran_path = t; 1680 asoc->peer.retran_path = t;
1688 break; 1681 break;
@@ -1709,7 +1702,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1709 sctp_outq_cork(&asoc->outqueue); 1702 sctp_outq_cork(&asoc->outqueue);
1710 local_cork = 1; 1703 local_cork = 1;
1711 } 1704 }
1712 error = sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); 1705 sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp);
1713 break; 1706 break;
1714 case SCTP_CMD_SEND_NEXT_ASCONF: 1707 case SCTP_CMD_SEND_NEXT_ASCONF:
1715 sctp_cmd_send_asconf(asoc); 1708 sctp_cmd_send_asconf(asoc);
@@ -1739,9 +1732,9 @@ out:
1739 */ 1732 */
1740 if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) { 1733 if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) {
1741 if (chunk->end_of_packet || chunk->singleton) 1734 if (chunk->end_of_packet || chunk->singleton)
1742 error = sctp_outq_uncork(&asoc->outqueue, gfp); 1735 sctp_outq_uncork(&asoc->outqueue, gfp);
1743 } else if (local_cork) 1736 } else if (local_cork)
1744 error = sctp_outq_uncork(&asoc->outqueue, gfp); 1737 sctp_outq_uncork(&asoc->outqueue, gfp);
1745 1738
1746 if (sp->data_ready_signalled) 1739 if (sp->data_ready_signalled)
1747 sp->data_ready_signalled = 0; 1740 sp->data_ready_signalled = 0;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index d88bb2b0b699..026e3bca4a94 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3454,7 +3454,7 @@ sctp_disposition_t sctp_sf_ootb(struct net *net,
3454 } 3454 }
3455 3455
3456 /* Report violation if chunk len overflows */ 3456 /* Report violation if chunk len overflows */
3457 ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); 3457 ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
3458 if (ch_end > skb_tail_pointer(skb)) 3458 if (ch_end > skb_tail_pointer(skb))
3459 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, 3459 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
3460 commands); 3460 commands);
@@ -4185,7 +4185,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net,
4185 hdr = unk_chunk->chunk_hdr; 4185 hdr = unk_chunk->chunk_hdr;
4186 err_chunk = sctp_make_op_error(asoc, unk_chunk, 4186 err_chunk = sctp_make_op_error(asoc, unk_chunk,
4187 SCTP_ERROR_UNKNOWN_CHUNK, hdr, 4187 SCTP_ERROR_UNKNOWN_CHUNK, hdr,
4188 WORD_ROUND(ntohs(hdr->length)), 4188 SCTP_PAD4(ntohs(hdr->length)),
4189 0); 4189 0);
4190 if (err_chunk) { 4190 if (err_chunk) {
4191 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, 4191 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
@@ -4203,7 +4203,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net,
4203 hdr = unk_chunk->chunk_hdr; 4203 hdr = unk_chunk->chunk_hdr;
4204 err_chunk = sctp_make_op_error(asoc, unk_chunk, 4204 err_chunk = sctp_make_op_error(asoc, unk_chunk,
4205 SCTP_ERROR_UNKNOWN_CHUNK, hdr, 4205 SCTP_ERROR_UNKNOWN_CHUNK, hdr,
4206 WORD_ROUND(ntohs(hdr->length)), 4206 SCTP_PAD4(ntohs(hdr->length)),
4207 0); 4207 0);
4208 if (err_chunk) { 4208 if (err_chunk) {
4209 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, 4209 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 8ed2d99bde6d..fb02c7033307 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1958,6 +1958,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
1958 1958
1959 /* Now send the (possibly) fragmented message. */ 1959 /* Now send the (possibly) fragmented message. */
1960 list_for_each_entry(chunk, &datamsg->chunks, frag_list) { 1960 list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
1961 sctp_chunk_hold(chunk);
1962
1961 /* Do accounting for the write space. */ 1963 /* Do accounting for the write space. */
1962 sctp_set_owner_w(chunk); 1964 sctp_set_owner_w(chunk);
1963 1965
@@ -1970,13 +1972,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
1970 * breaks. 1972 * breaks.
1971 */ 1973 */
1972 err = sctp_primitive_SEND(net, asoc, datamsg); 1974 err = sctp_primitive_SEND(net, asoc, datamsg);
1973 sctp_datamsg_put(datamsg);
1974 /* Did the lower layer accept the chunk? */ 1975 /* Did the lower layer accept the chunk? */
1975 if (err) 1976 if (err) {
1977 sctp_datamsg_free(datamsg);
1976 goto out_free; 1978 goto out_free;
1979 }
1977 1980
1978 pr_debug("%s: we sent primitively\n", __func__); 1981 pr_debug("%s: we sent primitively\n", __func__);
1979 1982
1983 sctp_datamsg_put(datamsg);
1980 err = msg_len; 1984 err = msg_len;
1981 1985
1982 if (unlikely(wait_connect)) { 1986 if (unlikely(wait_connect)) {
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 81b86678be4d..ce54dce13ddb 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -233,7 +233,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
233 } 233 }
234 234
235 if (transport->dst) { 235 if (transport->dst) {
236 transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); 236 transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
237 } else 237 } else
238 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; 238 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
239} 239}
@@ -287,7 +287,7 @@ void sctp_transport_route(struct sctp_transport *transport,
287 return; 287 return;
288 } 288 }
289 if (transport->dst) { 289 if (transport->dst) {
290 transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); 290 transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
291 291
292 /* Initialize sk->sk_rcv_saddr, if the transport is the 292 /* Initialize sk->sk_rcv_saddr, if the transport is the
293 * association's active path for getsockname(). 293 * association's active path for getsockname().
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index d85b803da11d..bea00058ce35 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -383,7 +383,7 @@ sctp_ulpevent_make_remote_error(const struct sctp_association *asoc,
383 383
384 ch = (sctp_errhdr_t *)(chunk->skb->data); 384 ch = (sctp_errhdr_t *)(chunk->skb->data);
385 cause = ch->cause; 385 cause = ch->cause;
386 elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t); 386 elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(sctp_errhdr_t);
387 387
388 /* Pull off the ERROR header. */ 388 /* Pull off the ERROR header. */
389 skb_pull(chunk->skb, sizeof(sctp_errhdr_t)); 389 skb_pull(chunk->skb, sizeof(sctp_errhdr_t));
@@ -688,7 +688,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
688 * MUST ignore the padding bytes. 688 * MUST ignore the padding bytes.
689 */ 689 */
690 len = ntohs(chunk->chunk_hdr->length); 690 len = ntohs(chunk->chunk_hdr->length);
691 padding = WORD_ROUND(len) - len; 691 padding = SCTP_PAD4(len) - len;
692 692
693 /* Fixup cloned skb with just this chunks data. */ 693 /* Fixup cloned skb with just this chunks data. */
694 skb_trim(skb, chunk->chunk_end - padding - skb->data); 694 skb_trim(skb, chunk->chunk_end - padding - skb->data);
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 877e55066f89..84d0fdaf7de9 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -140,11 +140,8 @@ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc)
140 * we can go ahead and clear out the lobby in one shot 140 * we can go ahead and clear out the lobby in one shot
141 */ 141 */
142 if (!skb_queue_empty(&sp->pd_lobby)) { 142 if (!skb_queue_empty(&sp->pd_lobby)) {
143 struct list_head *list;
144 skb_queue_splice_tail_init(&sp->pd_lobby, 143 skb_queue_splice_tail_init(&sp->pd_lobby,
145 &sk->sk_receive_queue); 144 &sk->sk_receive_queue);
146 list = (struct list_head *)&sctp_sk(sk)->pd_lobby;
147 INIT_LIST_HEAD(list);
148 return 1; 145 return 1;
149 } 146 }
150 } else { 147 } else {
diff --git a/net/socket.c b/net/socket.c
index a1bd16106625..5a9bf5ee2464 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -320,11 +320,38 @@ static const struct dentry_operations sockfs_dentry_operations = {
320 .d_dname = sockfs_dname, 320 .d_dname = sockfs_dname,
321}; 321};
322 322
323static int sockfs_xattr_get(const struct xattr_handler *handler,
324 struct dentry *dentry, struct inode *inode,
325 const char *suffix, void *value, size_t size)
326{
327 if (value) {
328 if (dentry->d_name.len + 1 > size)
329 return -ERANGE;
330 memcpy(value, dentry->d_name.name, dentry->d_name.len + 1);
331 }
332 return dentry->d_name.len + 1;
333}
334
335#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
336#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
337#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)
338
339static const struct xattr_handler sockfs_xattr_handler = {
340 .name = XATTR_NAME_SOCKPROTONAME,
341 .get = sockfs_xattr_get,
342};
343
344static const struct xattr_handler *sockfs_xattr_handlers[] = {
345 &sockfs_xattr_handler,
346 NULL
347};
348
323static struct dentry *sockfs_mount(struct file_system_type *fs_type, 349static struct dentry *sockfs_mount(struct file_system_type *fs_type,
324 int flags, const char *dev_name, void *data) 350 int flags, const char *dev_name, void *data)
325{ 351{
326 return mount_pseudo(fs_type, "socket:", &sockfs_ops, 352 return mount_pseudo_xattr(fs_type, "socket:", &sockfs_ops,
327 &sockfs_dentry_operations, SOCKFS_MAGIC); 353 sockfs_xattr_handlers,
354 &sockfs_dentry_operations, SOCKFS_MAGIC);
328} 355}
329 356
330static struct vfsmount *sock_mnt __read_mostly; 357static struct vfsmount *sock_mnt __read_mostly;
@@ -463,35 +490,6 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
463 return NULL; 490 return NULL;
464} 491}
465 492
466#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
467#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
468#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)
469static ssize_t sockfs_getxattr(struct dentry *dentry, struct inode *inode,
470 const char *name, void *value, size_t size)
471{
472 const char *proto_name;
473 size_t proto_size;
474 int error;
475
476 error = -ENODATA;
477 if (!strncmp(name, XATTR_NAME_SOCKPROTONAME, XATTR_NAME_SOCKPROTONAME_LEN)) {
478 proto_name = dentry->d_name.name;
479 proto_size = strlen(proto_name);
480
481 if (value) {
482 error = -ERANGE;
483 if (proto_size + 1 > size)
484 goto out;
485
486 strncpy(value, proto_name, proto_size + 1);
487 }
488 error = proto_size + 1;
489 }
490
491out:
492 return error;
493}
494
495static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, 493static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
496 size_t size) 494 size_t size)
497{ 495{
@@ -521,7 +519,6 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
521} 519}
522 520
523static const struct inode_operations sockfs_inode_ops = { 521static const struct inode_operations sockfs_inode_ops = {
524 .getxattr = sockfs_getxattr,
525 .listxattr = sockfs_listxattr, 522 .listxattr = sockfs_listxattr,
526}; 523};
527 524
diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig
new file mode 100644
index 000000000000..6cff3f6d0c3a
--- /dev/null
+++ b/net/strparser/Kconfig
@@ -0,0 +1,4 @@
1
2config STREAM_PARSER
3 tristate
4 default n
diff --git a/net/strparser/Makefile b/net/strparser/Makefile
new file mode 100644
index 000000000000..858a126ebaa0
--- /dev/null
+++ b/net/strparser/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_STREAM_PARSER) += strparser.o
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
new file mode 100644
index 000000000000..41adf362936d
--- /dev/null
+++ b/net/strparser/strparser.c
@@ -0,0 +1,510 @@
1/*
2 * Stream Parser
3 *
4 * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2
8 * as published by the Free Software Foundation.
9 */
10
11#include <linux/bpf.h>
12#include <linux/errno.h>
13#include <linux/errqueue.h>
14#include <linux/file.h>
15#include <linux/in.h>
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/net.h>
19#include <linux/netdevice.h>
20#include <linux/poll.h>
21#include <linux/rculist.h>
22#include <linux/skbuff.h>
23#include <linux/socket.h>
24#include <linux/uaccess.h>
25#include <linux/workqueue.h>
26#include <net/strparser.h>
27#include <net/netns/generic.h>
28#include <net/sock.h>
29
30static struct workqueue_struct *strp_wq;
31
32struct _strp_rx_msg {
33 /* Internal cb structure. struct strp_rx_msg must be first for passing
34 * to upper layer.
35 */
36 struct strp_rx_msg strp;
37 int accum_len;
38 int early_eaten;
39};
40
41static inline struct _strp_rx_msg *_strp_rx_msg(struct sk_buff *skb)
42{
43 return (struct _strp_rx_msg *)((void *)skb->cb +
44 offsetof(struct qdisc_skb_cb, data));
45}
46
47/* Lower lock held */
48static void strp_abort_rx_strp(struct strparser *strp, int err)
49{
50 struct sock *csk = strp->sk;
51
52 /* Unrecoverable error in receive */
53
54 del_timer(&strp->rx_msg_timer);
55
56 if (strp->rx_stopped)
57 return;
58
59 strp->rx_stopped = 1;
60
61 /* Report an error on the lower socket */
62 csk->sk_err = err;
63 csk->sk_error_report(csk);
64}
65
66static void strp_start_rx_timer(struct strparser *strp)
67{
68 if (strp->sk->sk_rcvtimeo)
69 mod_timer(&strp->rx_msg_timer, strp->sk->sk_rcvtimeo);
70}
71
72/* Lower lock held */
73static void strp_parser_err(struct strparser *strp, int err,
74 read_descriptor_t *desc)
75{
76 desc->error = err;
77 kfree_skb(strp->rx_skb_head);
78 strp->rx_skb_head = NULL;
79 strp->cb.abort_parser(strp, err);
80}
81
82static inline int strp_peek_len(struct strparser *strp)
83{
84 struct socket *sock = strp->sk->sk_socket;
85
86 return sock->ops->peek_len(sock);
87}
88
89/* Lower socket lock held */
90static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
91 unsigned int orig_offset, size_t orig_len)
92{
93 struct strparser *strp = (struct strparser *)desc->arg.data;
94 struct _strp_rx_msg *rxm;
95 struct sk_buff *head, *skb;
96 size_t eaten = 0, cand_len;
97 ssize_t extra;
98 int err;
99 bool cloned_orig = false;
100
101 if (strp->rx_paused)
102 return 0;
103
104 head = strp->rx_skb_head;
105 if (head) {
106 /* Message already in progress */
107
108 rxm = _strp_rx_msg(head);
109 if (unlikely(rxm->early_eaten)) {
110 /* Already some number of bytes on the receive sock
111 * data saved in rx_skb_head, just indicate they
112 * are consumed.
113 */
114 eaten = orig_len <= rxm->early_eaten ?
115 orig_len : rxm->early_eaten;
116 rxm->early_eaten -= eaten;
117
118 return eaten;
119 }
120
121 if (unlikely(orig_offset)) {
122 /* Getting data with a non-zero offset when a message is
123 * in progress is not expected. If it does happen, we
124 * need to clone and pull since we can't deal with
125	 * offsets in the skbs for a message except in the head.
126 */
127 orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
128 if (!orig_skb) {
129 STRP_STATS_INCR(strp->stats.rx_mem_fail);
130 desc->error = -ENOMEM;
131 return 0;
132 }
133 if (!pskb_pull(orig_skb, orig_offset)) {
134 STRP_STATS_INCR(strp->stats.rx_mem_fail);
135 kfree_skb(orig_skb);
136 desc->error = -ENOMEM;
137 return 0;
138 }
139 cloned_orig = true;
140 orig_offset = 0;
141 }
142
143 if (!strp->rx_skb_nextp) {
144 /* We are going to append to the frags_list of head.
145 * Need to unshare the frag_list.
146 */
147 err = skb_unclone(head, GFP_ATOMIC);
148 if (err) {
149 STRP_STATS_INCR(strp->stats.rx_mem_fail);
150 desc->error = err;
151 return 0;
152 }
153
154 if (unlikely(skb_shinfo(head)->frag_list)) {
155 /* We can't append to an sk_buff that already
156 * has a frag_list. We create a new head, point
157 * the frag_list of that to the old head, and
158 * then are able to use the old head->next for
159 * appending to the message.
160 */
161 if (WARN_ON(head->next)) {
162 desc->error = -EINVAL;
163 return 0;
164 }
165
166 skb = alloc_skb(0, GFP_ATOMIC);
167 if (!skb) {
168 STRP_STATS_INCR(strp->stats.rx_mem_fail);
169 desc->error = -ENOMEM;
170 return 0;
171 }
172 skb->len = head->len;
173 skb->data_len = head->len;
174 skb->truesize = head->truesize;
175 *_strp_rx_msg(skb) = *_strp_rx_msg(head);
176 strp->rx_skb_nextp = &head->next;
177 skb_shinfo(skb)->frag_list = head;
178 strp->rx_skb_head = skb;
179 head = skb;
180 } else {
181 strp->rx_skb_nextp =
182 &skb_shinfo(head)->frag_list;
183 }
184 }
185 }
186
187 while (eaten < orig_len) {
188 /* Always clone since we will consume something */
189 skb = skb_clone(orig_skb, GFP_ATOMIC);
190 if (!skb) {
191 STRP_STATS_INCR(strp->stats.rx_mem_fail);
192 desc->error = -ENOMEM;
193 break;
194 }
195
196 cand_len = orig_len - eaten;
197
198 head = strp->rx_skb_head;
199 if (!head) {
200 head = skb;
201 strp->rx_skb_head = head;
202 /* Will set rx_skb_nextp on next packet if needed */
203 strp->rx_skb_nextp = NULL;
204 rxm = _strp_rx_msg(head);
205 memset(rxm, 0, sizeof(*rxm));
206 rxm->strp.offset = orig_offset + eaten;
207 } else {
208 /* Unclone since we may be appending to an skb that we
209 * already share a frag_list with.
210 */
211 err = skb_unclone(skb, GFP_ATOMIC);
212 if (err) {
213 STRP_STATS_INCR(strp->stats.rx_mem_fail);
214 desc->error = err;
215 break;
216 }
217
218 rxm = _strp_rx_msg(head);
219 *strp->rx_skb_nextp = skb;
220 strp->rx_skb_nextp = &skb->next;
221 head->data_len += skb->len;
222 head->len += skb->len;
223 head->truesize += skb->truesize;
224 }
225
226 if (!rxm->strp.full_len) {
227 ssize_t len;
228
229 len = (*strp->cb.parse_msg)(strp, head);
230
231 if (!len) {
232 /* Need more header to determine length */
233 if (!rxm->accum_len) {
234 /* Start RX timer for new message */
235 strp_start_rx_timer(strp);
236 }
237 rxm->accum_len += cand_len;
238 eaten += cand_len;
239 STRP_STATS_INCR(strp->stats.rx_need_more_hdr);
240 WARN_ON(eaten != orig_len);
241 break;
242 } else if (len < 0) {
243 if (len == -ESTRPIPE && rxm->accum_len) {
244 len = -ENODATA;
245 strp->rx_unrecov_intr = 1;
246 } else {
247 strp->rx_interrupted = 1;
248 }
249 strp_parser_err(strp, len, desc);
250 break;
251 } else if (len > strp->sk->sk_rcvbuf) {
252 /* Message length exceeds maximum allowed */
253 STRP_STATS_INCR(strp->stats.rx_msg_too_big);
254 strp_parser_err(strp, -EMSGSIZE, desc);
255 break;
256 } else if (len <= (ssize_t)head->len -
257 skb->len - rxm->strp.offset) {
258 /* Length must be into new skb (and also
259 * greater than zero)
260 */
261 STRP_STATS_INCR(strp->stats.rx_bad_hdr_len);
262 strp_parser_err(strp, -EPROTO, desc);
263 break;
264 }
265
266 rxm->strp.full_len = len;
267 }
268
269 extra = (ssize_t)(rxm->accum_len + cand_len) -
270 rxm->strp.full_len;
271
272 if (extra < 0) {
273 /* Message not complete yet. */
274 if (rxm->strp.full_len - rxm->accum_len >
275 strp_peek_len(strp)) {
276	 * Don't have the whole message in the socket
277 * buffer. Set strp->rx_need_bytes to wait for
278 * the rest of the message. Also, set "early
279 * eaten" since we've already buffered the skb
280 * but don't consume yet per strp_read_sock.
281 */
282
283 if (!rxm->accum_len) {
284 /* Start RX timer for new message */
285 strp_start_rx_timer(strp);
286 }
287
288 strp->rx_need_bytes = rxm->strp.full_len -
289 rxm->accum_len;
290 rxm->accum_len += cand_len;
291 rxm->early_eaten = cand_len;
292 STRP_STATS_ADD(strp->stats.rx_bytes, cand_len);
293 desc->count = 0; /* Stop reading socket */
294 break;
295 }
296 rxm->accum_len += cand_len;
297 eaten += cand_len;
298 WARN_ON(eaten != orig_len);
299 break;
300 }
301
302	 * Positive extra indicates more bytes than needed for the
303 * message
304 */
305
306 WARN_ON(extra > cand_len);
307
308 eaten += (cand_len - extra);
309
310 /* Hurray, we have a new message! */
311 del_timer(&strp->rx_msg_timer);
312 strp->rx_skb_head = NULL;
313 STRP_STATS_INCR(strp->stats.rx_msgs);
314
315 /* Give skb to upper layer */
316 strp->cb.rcv_msg(strp, head);
317
318 if (unlikely(strp->rx_paused)) {
319 /* Upper layer paused strp */
320 break;
321 }
322 }
323
324 if (cloned_orig)
325 kfree_skb(orig_skb);
326
327 STRP_STATS_ADD(strp->stats.rx_bytes, eaten);
328
329 return eaten;
330}
331
332static int default_read_sock_done(struct strparser *strp, int err)
333{
334 return err;
335}
336
337/* Called with lock held on lower socket */
338static int strp_read_sock(struct strparser *strp)
339{
340 struct socket *sock = strp->sk->sk_socket;
341 read_descriptor_t desc;
342
343 desc.arg.data = strp;
344 desc.error = 0;
345 desc.count = 1; /* give more than one skb per call */
346
347 /* sk should be locked here, so okay to do read_sock */
348 sock->ops->read_sock(strp->sk, &desc, strp_recv);
349
350 desc.error = strp->cb.read_sock_done(strp, desc.error);
351
352 return desc.error;
353}
354
355/* Lower sock lock held */
356void strp_data_ready(struct strparser *strp)
357{
358 if (unlikely(strp->rx_stopped))
359 return;
360
361 /* This check is needed to synchronize with do_strp_rx_work.
362 * do_strp_rx_work acquires a process lock (lock_sock) whereas
363 * the lock held here is bh_lock_sock. The two locks can be
364 * held by different threads at the same time, but bh_lock_sock
365 * allows a thread in BH context to safely check if the process
366 * lock is held. In this case, if the lock is held, queue work.
367 */
368 if (sock_owned_by_user(strp->sk)) {
369 queue_work(strp_wq, &strp->rx_work);
370 return;
371 }
372
373 if (strp->rx_paused)
374 return;
375
376 if (strp->rx_need_bytes) {
377 if (strp_peek_len(strp) >= strp->rx_need_bytes)
378 strp->rx_need_bytes = 0;
379 else
380 return;
381 }
382
383 if (strp_read_sock(strp) == -ENOMEM)
384 queue_work(strp_wq, &strp->rx_work);
385}
386EXPORT_SYMBOL_GPL(strp_data_ready);
387
388static void do_strp_rx_work(struct strparser *strp)
389{
390 read_descriptor_t rd_desc;
391 struct sock *csk = strp->sk;
392
393 /* We need the read lock to synchronize with strp_data_ready. We
394 * need the socket lock for calling strp_read_sock.
395 */
396 lock_sock(csk);
397
398 if (unlikely(strp->rx_stopped))
399 goto out;
400
401 if (strp->rx_paused)
402 goto out;
403
404 rd_desc.arg.data = strp;
405
406 if (strp_read_sock(strp) == -ENOMEM)
407 queue_work(strp_wq, &strp->rx_work);
408
409out:
410 release_sock(csk);
411}
412
413static void strp_rx_work(struct work_struct *w)
414{
415 do_strp_rx_work(container_of(w, struct strparser, rx_work));
416}
417
418static void strp_rx_msg_timeout(unsigned long arg)
419{
420 struct strparser *strp = (struct strparser *)arg;
421
422 /* Message assembly timed out */
423 STRP_STATS_INCR(strp->stats.rx_msg_timeouts);
424 lock_sock(strp->sk);
425 strp->cb.abort_parser(strp, ETIMEDOUT);
426 release_sock(strp->sk);
427}
428
429int strp_init(struct strparser *strp, struct sock *csk,
430 struct strp_callbacks *cb)
431{
432 struct socket *sock = csk->sk_socket;
433
434 if (!cb || !cb->rcv_msg || !cb->parse_msg)
435 return -EINVAL;
436
437 if (!sock->ops->read_sock || !sock->ops->peek_len)
438 return -EAFNOSUPPORT;
439
440 memset(strp, 0, sizeof(*strp));
441
442 strp->sk = csk;
443
444 setup_timer(&strp->rx_msg_timer, strp_rx_msg_timeout,
445 (unsigned long)strp);
446
447 INIT_WORK(&strp->rx_work, strp_rx_work);
448
449 strp->cb.rcv_msg = cb->rcv_msg;
450 strp->cb.parse_msg = cb->parse_msg;
451 strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done;
452 strp->cb.abort_parser = cb->abort_parser ? : strp_abort_rx_strp;
453
454 return 0;
455}
456EXPORT_SYMBOL_GPL(strp_init);
457
458void strp_unpause(struct strparser *strp)
459{
460 strp->rx_paused = 0;
461
462 /* Sync setting rx_paused with RX work */
463 smp_mb();
464
465 queue_work(strp_wq, &strp->rx_work);
466}
467EXPORT_SYMBOL_GPL(strp_unpause);
468
469/* strp must already be stopped so that strp_recv will no longer be called.
470 * Note that strp_done is not called with the lower socket held.
471 */
472void strp_done(struct strparser *strp)
473{
474 WARN_ON(!strp->rx_stopped);
475
476 del_timer_sync(&strp->rx_msg_timer);
477 cancel_work_sync(&strp->rx_work);
478
479 if (strp->rx_skb_head) {
480 kfree_skb(strp->rx_skb_head);
481 strp->rx_skb_head = NULL;
482 }
483}
484EXPORT_SYMBOL_GPL(strp_done);
485
486void strp_stop(struct strparser *strp)
487{
488 strp->rx_stopped = 1;
489}
490EXPORT_SYMBOL_GPL(strp_stop);
491
492void strp_check_rcv(struct strparser *strp)
493{
494 queue_work(strp_wq, &strp->rx_work);
495}
496EXPORT_SYMBOL_GPL(strp_check_rcv);
497
498static int __init strp_mod_init(void)
499{
500 strp_wq = create_singlethread_workqueue("kstrp");
501
502 return 0;
503}
504
505static void __exit strp_mod_exit(void)
506{
507}
508module_init(strp_mod_init);
509module_exit(strp_mod_exit);
510MODULE_LICENSE("GPL");
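The new strparser module above is consumed through strp_init(), strp_data_ready() and strp_done() plus the strp_callbacks table. A rough consumer sketch, assuming a 4-byte big-endian length prefix as the framing and the strp_rx_msg() accessor from net/strparser.h; the callback names and the framing are illustrative, not something this patch defines:

#include <linux/skbuff.h>
#include <net/strparser.h>

/* parse_msg: return 0 to wait for more header bytes, <0 on error,
 * or the total length of the framed message once it is known.
 */
static int my_parse_msg(struct strparser *strp, struct sk_buff *skb)
{
	__be32 hdr;

	if (skb_copy_bits(skb, strp_rx_msg(skb)->offset, &hdr, sizeof(hdr)))
		return 0;		/* header not complete yet */
	return ntohl(hdr);		/* assumed: prefix holds the frame length */
}

/* rcv_msg: called with one complete message assembled in skb. */
static void my_rcv_msg(struct strparser *strp, struct sk_buff *skb)
{
	/* consume the message here */
	kfree_skb(skb);
}

static int attach_parser(struct strparser *strp, struct sock *csk)
{
	struct strp_callbacks cb = {
		.rcv_msg   = my_rcv_msg,
		.parse_msg = my_parse_msg,
		/* read_sock_done/abort_parser fall back to the defaults in strp_init() above */
	};

	return strp_init(strp, csk, &cb);
}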
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index a7e42f9a405c..2bff63a73cf8 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -551,7 +551,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
551 *entry, *new; 551 *entry, *new;
552 unsigned int nr; 552 unsigned int nr;
553 553
554 nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits); 554 nr = auth->au_ops->hash_cred(acred, cache->hashbits);
555 555
556 rcu_read_lock(); 556 rcu_read_lock();
557 hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { 557 hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 168219535a34..f1df9837f1ac 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -78,6 +78,14 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
78 return auth->au_ops->lookup_cred(auth, acred, lookupflags); 78 return auth->au_ops->lookup_cred(auth, acred, lookupflags);
79} 79}
80 80
81static int
82generic_hash_cred(struct auth_cred *acred, unsigned int hashbits)
83{
84 return hash_64(from_kgid(&init_user_ns, acred->gid) |
85 ((u64)from_kuid(&init_user_ns, acred->uid) <<
86 (sizeof(gid_t) * 8)), hashbits);
87}
88
81/* 89/*
82 * Lookup generic creds for current process 90 * Lookup generic creds for current process
83 */ 91 */
@@ -176,8 +184,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
176 if (gcred->acred.group_info->ngroups != acred->group_info->ngroups) 184 if (gcred->acred.group_info->ngroups != acred->group_info->ngroups)
177 goto out_nomatch; 185 goto out_nomatch;
178 for (i = 0; i < gcred->acred.group_info->ngroups; i++) { 186 for (i = 0; i < gcred->acred.group_info->ngroups; i++) {
179 if (!gid_eq(GROUP_AT(gcred->acred.group_info, i), 187 if (!gid_eq(gcred->acred.group_info->gid[i],
180 GROUP_AT(acred->group_info, i))) 188 acred->group_info->gid[i]))
181 goto out_nomatch; 189 goto out_nomatch;
182 } 190 }
183out_match: 191out_match:
@@ -258,6 +266,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
258static const struct rpc_authops generic_auth_ops = { 266static const struct rpc_authops generic_auth_ops = {
259 .owner = THIS_MODULE, 267 .owner = THIS_MODULE,
260 .au_name = "Generic", 268 .au_name = "Generic",
269 .hash_cred = generic_hash_cred,
261 .lookup_cred = generic_lookup_cred, 270 .lookup_cred = generic_lookup_cred,
262 .crcreate = generic_create_cred, 271 .crcreate = generic_create_cred,
263 .key_timeout = generic_key_timeout, 272 .key_timeout = generic_key_timeout,
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 976c7812bbd5..d8bd97a5a7c9 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1298,6 +1298,12 @@ gss_destroy_cred(struct rpc_cred *cred)
1298 gss_destroy_nullcred(cred); 1298 gss_destroy_nullcred(cred);
1299} 1299}
1300 1300
1301static int
1302gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
1303{
1304 return hash_64(from_kuid(&init_user_ns, acred->uid), hashbits);
1305}
1306
1301/* 1307/*
1302 * Lookup RPCSEC_GSS cred for the current process 1308 * Lookup RPCSEC_GSS cred for the current process
1303 */ 1309 */
@@ -1982,6 +1988,7 @@ static const struct rpc_authops authgss_ops = {
1982 .au_name = "RPCSEC_GSS", 1988 .au_name = "RPCSEC_GSS",
1983 .create = gss_create, 1989 .create = gss_create,
1984 .destroy = gss_destroy, 1990 .destroy = gss_destroy,
1991 .hash_cred = gss_hash_cred,
1985 .lookup_cred = gss_lookup_cred, 1992 .lookup_cred = gss_lookup_cred,
1986 .crcreate = gss_create_cred, 1993 .crcreate = gss_create_cred,
1987 .list_pseudoflavors = gss_mech_list_pseudoflavors, 1994 .list_pseudoflavors = gss_mech_list_pseudoflavors,
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index eeeba5adee6d..dc6fb79a361f 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -229,7 +229,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr,
229 kgid = make_kgid(&init_user_ns, tmp); 229 kgid = make_kgid(&init_user_ns, tmp);
230 if (!gid_valid(kgid)) 230 if (!gid_valid(kgid))
231 goto out_free_groups; 231 goto out_free_groups;
232 GROUP_AT(creds->cr_group_info, i) = kgid; 232 creds->cr_group_info->gid[i] = kgid;
233 } 233 }
234 234
235 return 0; 235 return 0;
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index d8582028b346..d67f7e1bc82d 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -479,7 +479,7 @@ static int rsc_parse(struct cache_detail *cd,
479 kgid = make_kgid(&init_user_ns, id); 479 kgid = make_kgid(&init_user_ns, id);
480 if (!gid_valid(kgid)) 480 if (!gid_valid(kgid))
481 goto out; 481 goto out;
482 GROUP_AT(rsci.cred.cr_group_info, i) = kgid; 482 rsci.cred.cr_group_info->gid[i] = kgid;
483 } 483 }
484 484
485 /* mech name */ 485 /* mech name */
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index a99278c984e8..306fc0f54596 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -46,6 +46,14 @@ unx_destroy(struct rpc_auth *auth)
46 rpcauth_clear_credcache(auth->au_credcache); 46 rpcauth_clear_credcache(auth->au_credcache);
47} 47}
48 48
49static int
50unx_hash_cred(struct auth_cred *acred, unsigned int hashbits)
51{
52 return hash_64(from_kgid(&init_user_ns, acred->gid) |
53 ((u64)from_kuid(&init_user_ns, acred->uid) <<
54 (sizeof(gid_t) * 8)), hashbits);
55}
56
49/* 57/*
50 * Lookup AUTH_UNIX creds for current process 58 * Lookup AUTH_UNIX creds for current process
51 */ 59 */
@@ -79,7 +87,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
79 87
80 cred->uc_gid = acred->gid; 88 cred->uc_gid = acred->gid;
81 for (i = 0; i < groups; i++) 89 for (i = 0; i < groups; i++)
82 cred->uc_gids[i] = GROUP_AT(acred->group_info, i); 90 cred->uc_gids[i] = acred->group_info->gid[i];
83 if (i < NFS_NGROUPS) 91 if (i < NFS_NGROUPS)
84 cred->uc_gids[i] = INVALID_GID; 92 cred->uc_gids[i] = INVALID_GID;
85 93
@@ -127,7 +135,7 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
127 if (groups > NFS_NGROUPS) 135 if (groups > NFS_NGROUPS)
128 groups = NFS_NGROUPS; 136 groups = NFS_NGROUPS;
129 for (i = 0; i < groups ; i++) 137 for (i = 0; i < groups ; i++)
130 if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i))) 138 if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
131 return 0; 139 return 0;
132 if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups])) 140 if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
133 return 0; 141 return 0;
@@ -220,6 +228,7 @@ const struct rpc_authops authunix_ops = {
220 .au_name = "UNIX", 228 .au_name = "UNIX",
221 .create = unx_create, 229 .create = unx_create,
222 .destroy = unx_destroy, 230 .destroy = unx_destroy,
231 .hash_cred = unx_hash_cred,
223 .lookup_cred = unx_lookup_cred, 232 .lookup_cred = unx_lookup_cred,
224 .crcreate = unx_create_cred, 233 .crcreate = unx_create_cred,
225}; 234};
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 229956bf8457..ac701c28f44f 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -76,13 +76,7 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
76 page = alloc_page(gfp_flags); 76 page = alloc_page(gfp_flags);
77 if (page == NULL) 77 if (page == NULL)
78 return -ENOMEM; 78 return -ENOMEM;
79 buf->head[0].iov_base = page_address(page); 79 xdr_buf_init(buf, page_address(page), PAGE_SIZE);
80 buf->head[0].iov_len = PAGE_SIZE;
81 buf->tail[0].iov_base = NULL;
82 buf->tail[0].iov_len = 0;
83 buf->page_len = 0;
84 buf->len = 0;
85 buf->buflen = PAGE_SIZE;
86 return 0; 80 return 0;
87} 81}
88 82
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 4d8e11f94a35..8aabe12201f8 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -353,7 +353,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd)
353 spin_unlock(&cache_list_lock); 353 spin_unlock(&cache_list_lock);
354 354
355 /* start the cleaning process */ 355 /* start the cleaning process */
356 schedule_delayed_work(&cache_cleaner, 0); 356 queue_delayed_work(system_power_efficient_wq, &cache_cleaner, 0);
357} 357}
358EXPORT_SYMBOL_GPL(sunrpc_init_cache_detail); 358EXPORT_SYMBOL_GPL(sunrpc_init_cache_detail);
359 359
@@ -476,7 +476,8 @@ static void do_cache_clean(struct work_struct *work)
476 delay = 0; 476 delay = 0;
477 477
478 if (delay) 478 if (delay)
479 schedule_delayed_work(&cache_cleaner, delay); 479 queue_delayed_work(system_power_efficient_wq,
480 &cache_cleaner, delay);
480} 481}
481 482
482 483
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 66f23b376fa0..34dd7b26ee5f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -184,7 +184,6 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
184 struct super_block *sb) 184 struct super_block *sb)
185{ 185{
186 struct dentry *dentry; 186 struct dentry *dentry;
187 int err = 0;
188 187
189 switch (event) { 188 switch (event) {
190 case RPC_PIPEFS_MOUNT: 189 case RPC_PIPEFS_MOUNT:
@@ -201,7 +200,7 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
201 printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event); 200 printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event);
202 return -ENOTSUPP; 201 return -ENOTSUPP;
203 } 202 }
204 return err; 203 return 0;
205} 204}
206 205
207static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event, 206static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event,
@@ -988,7 +987,6 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
988{ 987{
989 988
990 if (clnt != NULL) { 989 if (clnt != NULL) {
991 rpc_task_release_client(task);
992 if (task->tk_xprt == NULL) 990 if (task->tk_xprt == NULL)
993 task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi); 991 task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
994 task->tk_client = clnt; 992 task->tk_client = clnt;
@@ -1693,6 +1691,7 @@ call_allocate(struct rpc_task *task)
1693 struct rpc_rqst *req = task->tk_rqstp; 1691 struct rpc_rqst *req = task->tk_rqstp;
1694 struct rpc_xprt *xprt = req->rq_xprt; 1692 struct rpc_xprt *xprt = req->rq_xprt;
1695 struct rpc_procinfo *proc = task->tk_msg.rpc_proc; 1693 struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
1694 int status;
1696 1695
1697 dprint_status(task); 1696 dprint_status(task);
1698 1697
@@ -1718,11 +1717,14 @@ call_allocate(struct rpc_task *task)
1718 req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen; 1717 req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen;
1719 req->rq_rcvsize <<= 2; 1718 req->rq_rcvsize <<= 2;
1720 1719
1721 req->rq_buffer = xprt->ops->buf_alloc(task, 1720 status = xprt->ops->buf_alloc(task);
1722 req->rq_callsize + req->rq_rcvsize);
1723 if (req->rq_buffer != NULL)
1724 return;
1725 xprt_inject_disconnect(xprt); 1721 xprt_inject_disconnect(xprt);
1722 if (status == 0)
1723 return;
1724 if (status != -ENOMEM) {
1725 rpc_exit(task, status);
1726 return;
1727 }
1726 1728
1727 dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); 1729 dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
1728 1730
@@ -1748,18 +1750,6 @@ rpc_task_force_reencode(struct rpc_task *task)
1748 task->tk_rqstp->rq_bytes_sent = 0; 1750 task->tk_rqstp->rq_bytes_sent = 0;
1749} 1751}
1750 1752
1751static inline void
1752rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
1753{
1754 buf->head[0].iov_base = start;
1755 buf->head[0].iov_len = len;
1756 buf->tail[0].iov_len = 0;
1757 buf->page_len = 0;
1758 buf->flags = 0;
1759 buf->len = 0;
1760 buf->buflen = len;
1761}
1762
1763/* 1753/*
1764 * 3. Encode arguments of an RPC call 1754 * 3. Encode arguments of an RPC call
1765 */ 1755 */
@@ -1772,12 +1762,12 @@ rpc_xdr_encode(struct rpc_task *task)
1772 1762
1773 dprint_status(task); 1763 dprint_status(task);
1774 1764
1775 rpc_xdr_buf_init(&req->rq_snd_buf, 1765 xdr_buf_init(&req->rq_snd_buf,
1776 req->rq_buffer, 1766 req->rq_buffer,
1777 req->rq_callsize); 1767 req->rq_callsize);
1778 rpc_xdr_buf_init(&req->rq_rcv_buf, 1768 xdr_buf_init(&req->rq_rcv_buf,
1779 (char *)req->rq_buffer + req->rq_callsize, 1769 req->rq_rbuffer,
1780 req->rq_rcvsize); 1770 req->rq_rcvsize);
1781 1771
1782 p = rpc_encode_header(task); 1772 p = rpc_encode_header(task);
1783 if (p == NULL) { 1773 if (p == NULL) {
@@ -2616,6 +2606,70 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
2616EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt); 2606EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
2617 2607
2618/** 2608/**
2609 * rpc_clnt_setup_test_and_add_xprt()
2610 *
2611 * This is an rpc_clnt_add_xprt setup() function which returns 1 so:
2612 * 1) caller of the test function must dereference the rpc_xprt_switch
2613 * and the rpc_xprt.
2614 * 2) test function must call rpc_xprt_switch_add_xprt, usually in
2615 * the rpc_call_done routine.
2616 *
2617 * Upon success (return of 1), the test function adds the new
2618 * transport to the rpc_clnt xprt switch
2619 *
2620 * @clnt: struct rpc_clnt to get the new transport
2621 * @xps: the rpc_xprt_switch to hold the new transport
2622 * @xprt: the rpc_xprt to test
2623 * @data: a struct rpc_add_xprt_test pointer that holds the test function
2624 * and test function call data
2625 */
2626int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt,
2627 struct rpc_xprt_switch *xps,
2628 struct rpc_xprt *xprt,
2629 void *data)
2630{
2631 struct rpc_cred *cred;
2632 struct rpc_task *task;
2633 struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data;
2634 int status = -EADDRINUSE;
2635
2636 xprt = xprt_get(xprt);
2637 xprt_switch_get(xps);
2638
2639 if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
2640 goto out_err;
2641
2642 /* Test the connection */
2643 cred = authnull_ops.lookup_cred(NULL, NULL, 0);
2644 task = rpc_call_null_helper(clnt, xprt, cred,
2645 RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
2646 NULL, NULL);
2647 put_rpccred(cred);
2648 if (IS_ERR(task)) {
2649 status = PTR_ERR(task);
2650 goto out_err;
2651 }
2652 status = task->tk_status;
2653 rpc_put_task(task);
2654
2655 if (status < 0)
2656 goto out_err;
2657
 2658 /* rpc_xprt_switch and rpc_xprt are dereferenced by add_xprt_test() */
2659 xtest->add_xprt_test(clnt, xprt, xtest->data);
2660
2661 /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */
2662 return 1;
2663out_err:
2664 xprt_put(xprt);
2665 xprt_switch_put(xps);
2666 pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not added\n",
2667 status, xprt->address_strings[RPC_DISPLAY_ADDR]);
2668 return status;
2669}
2670EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt);
2671
2672/**
2619 * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt 2673 * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt
2620 * @clnt: pointer to struct rpc_clnt 2674 * @clnt: pointer to struct rpc_clnt
2621 * @xprtargs: pointer to struct xprt_create 2675 * @xprtargs: pointer to struct xprt_create
@@ -2697,6 +2751,34 @@ rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
2697} 2751}
2698EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); 2752EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
2699 2753
2754void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
2755{
2756 xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
2757}
2758EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put);
2759
2760void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
2761{
2762 rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
2763 xprt);
2764}
2765EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt);
2766
2767bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
2768 const struct sockaddr *sap)
2769{
2770 struct rpc_xprt_switch *xps;
2771 bool ret;
2772
2773 xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
2774
2775 rcu_read_lock();
2776 ret = rpc_xprt_switch_has_addr(xps, sap);
2777 rcu_read_unlock();
2778 return ret;
2779}
2780EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr);
2781
2700#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 2782#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
2701static void rpc_show_header(void) 2783static void rpc_show_header(void)
2702{ 2784{
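The kerneldoc for rpc_clnt_setup_test_and_add_xprt() above describes a contract: on success the setup function keeps the references it took on the rpc_xprt_switch and the rpc_xprt, and the caller-supplied test function is expected to add the verified transport via rpc_xprt_switch_add_xprt() and drop both references. A minimal sketch of such a test callback; the demo_* names and the shape of the private data are illustrative only, not part of this patch:

#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/xprtmultipath.h>

/* Assumed private data handed in via rpc_add_xprt_test.data. */
struct demo_add_xprt_data {
	struct rpc_xprt_switch *xps;
};

/* Invoked by rpc_clnt_setup_test_and_add_xprt() once the NULL ping on
 * the candidate transport has succeeded. */
static void demo_add_xprt_test(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
			       void *data)
{
	struct demo_add_xprt_data *d = data;

	/* Add the now-verified transport to the switch... */
	rpc_xprt_switch_add_xprt(d->xps, xprt);

	/* ...and drop the references the setup function left for us. */
	xprt_switch_put(d->xps);
	xprt_put(xprt);
}

In practice the add and the puts may happen in an asynchronous rpc_call_done routine, as the comment above suggests; the synchronous form is shown only to keep the example short.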
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 84f98cbe31c3..61a504fb1ae2 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -477,7 +477,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode)
477 return NULL; 477 return NULL;
478 inode->i_ino = get_next_ino(); 478 inode->i_ino = get_next_ino();
479 inode->i_mode = mode; 479 inode->i_mode = mode;
480 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 480 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
481 switch (mode & S_IFMT) { 481 switch (mode & S_IFMT) {
482 case S_IFDIR: 482 case S_IFDIR:
483 inode->i_fop = &simple_dir_operations; 483 inode->i_fop = &simple_dir_operations;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9ae588511aaf..5db68b371db2 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -849,14 +849,17 @@ static void rpc_async_schedule(struct work_struct *work)
849} 849}
850 850
851/** 851/**
852 * rpc_malloc - allocate an RPC buffer 852 * rpc_malloc - allocate RPC buffer resources
853 * @task: RPC task that will use this buffer 853 * @task: RPC task
854 * @size: requested byte size 854 *
855 * A single memory region is allocated, which is split between the
856 * RPC call and RPC reply that this task is being used for. When
857 * this RPC is retired, the memory is released by calling rpc_free.
855 * 858 *
856 * To prevent rpciod from hanging, this allocator never sleeps, 859 * To prevent rpciod from hanging, this allocator never sleeps,
857 * returning NULL and suppressing warning if the request cannot be serviced 860 * returning -ENOMEM and suppressing warning if the request cannot
858 * immediately. 861 * be serviced immediately. The caller can arrange to sleep in a
859 * The caller can arrange to sleep in a way that is safe for rpciod. 862 * way that is safe for rpciod.
860 * 863 *
861 * Most requests are 'small' (under 2KiB) and can be serviced from a 864 * Most requests are 'small' (under 2KiB) and can be serviced from a
862 * mempool, ensuring that NFS reads and writes can always proceed, 865 * mempool, ensuring that NFS reads and writes can always proceed,
@@ -865,8 +868,10 @@ static void rpc_async_schedule(struct work_struct *work)
865 * In order to avoid memory starvation triggering more writebacks of 868 * In order to avoid memory starvation triggering more writebacks of
866 * NFS requests, we avoid using GFP_KERNEL. 869 * NFS requests, we avoid using GFP_KERNEL.
867 */ 870 */
868void *rpc_malloc(struct rpc_task *task, size_t size) 871int rpc_malloc(struct rpc_task *task)
869{ 872{
873 struct rpc_rqst *rqst = task->tk_rqstp;
874 size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
870 struct rpc_buffer *buf; 875 struct rpc_buffer *buf;
871 gfp_t gfp = GFP_NOIO | __GFP_NOWARN; 876 gfp_t gfp = GFP_NOIO | __GFP_NOWARN;
872 877
@@ -880,28 +885,28 @@ void *rpc_malloc(struct rpc_task *task, size_t size)
880 buf = kmalloc(size, gfp); 885 buf = kmalloc(size, gfp);
881 886
882 if (!buf) 887 if (!buf)
883 return NULL; 888 return -ENOMEM;
884 889
885 buf->len = size; 890 buf->len = size;
886 dprintk("RPC: %5u allocated buffer of size %zu at %p\n", 891 dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
887 task->tk_pid, size, buf); 892 task->tk_pid, size, buf);
888 return &buf->data; 893 rqst->rq_buffer = buf->data;
894 rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
895 return 0;
889} 896}
890EXPORT_SYMBOL_GPL(rpc_malloc); 897EXPORT_SYMBOL_GPL(rpc_malloc);
891 898
892/** 899/**
893 * rpc_free - free buffer allocated via rpc_malloc 900 * rpc_free - free RPC buffer resources allocated via rpc_malloc
894 * @buffer: buffer to free 901 * @task: RPC task
895 * 902 *
896 */ 903 */
897void rpc_free(void *buffer) 904void rpc_free(struct rpc_task *task)
898{ 905{
906 void *buffer = task->tk_rqstp->rq_buffer;
899 size_t size; 907 size_t size;
900 struct rpc_buffer *buf; 908 struct rpc_buffer *buf;
901 909
902 if (!buffer)
903 return;
904
905 buf = container_of(buffer, struct rpc_buffer, data); 910 buf = container_of(buffer, struct rpc_buffer, data);
906 size = buf->len; 911 size = buf->len;
907 912
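rpc_malloc() and rpc_free() now take the rpc_task rather than a raw buffer, and rpc_malloc() reports failure as -ENOMEM instead of returning NULL; call_allocate() in the clnt.c hunk earlier is the consumer of that new contract. As a rough illustration, and assuming the buf_alloc/buf_free members seen elsewhere in this patch, a transport would wire the callbacks like this (other members omitted, so the fragment is not a complete ops table):

static const struct rpc_xprt_ops demo_buf_ops_fragment = {
	.buf_alloc	= rpc_malloc,	/* int (*)(struct rpc_task *): 0 or -ENOMEM */
	.buf_free	= rpc_free,	/* void (*)(struct rpc_task *)              */
};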
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c5b0cb4f4056..7c8070ec93c8 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -401,6 +401,21 @@ int svc_bind(struct svc_serv *serv, struct net *net)
401} 401}
402EXPORT_SYMBOL_GPL(svc_bind); 402EXPORT_SYMBOL_GPL(svc_bind);
403 403
404#if defined(CONFIG_SUNRPC_BACKCHANNEL)
405static void
406__svc_init_bc(struct svc_serv *serv)
407{
408 INIT_LIST_HEAD(&serv->sv_cb_list);
409 spin_lock_init(&serv->sv_cb_lock);
410 init_waitqueue_head(&serv->sv_cb_waitq);
411}
412#else
413static void
414__svc_init_bc(struct svc_serv *serv)
415{
416}
417#endif
418
404/* 419/*
405 * Create an RPC service 420 * Create an RPC service
406 */ 421 */
@@ -443,6 +458,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
443 init_timer(&serv->sv_temptimer); 458 init_timer(&serv->sv_temptimer);
444 spin_lock_init(&serv->sv_lock); 459 spin_lock_init(&serv->sv_lock);
445 460
461 __svc_init_bc(serv);
462
446 serv->sv_nrpools = npools; 463 serv->sv_nrpools = npools;
447 serv->sv_pools = 464 serv->sv_pools =
448 kcalloc(serv->sv_nrpools, sizeof(struct svc_pool), 465 kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index dfacdc95b3f5..64af4f034de6 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -517,7 +517,7 @@ static int unix_gid_parse(struct cache_detail *cd,
517 kgid = make_kgid(&init_user_ns, gid); 517 kgid = make_kgid(&init_user_ns, gid);
518 if (!gid_valid(kgid)) 518 if (!gid_valid(kgid))
519 goto out; 519 goto out;
520 GROUP_AT(ug.gi, i) = kgid; 520 ug.gi->gid[i] = kgid;
521 } 521 }
522 522
523 ugp = unix_gid_lookup(cd, uid); 523 ugp = unix_gid_lookup(cd, uid);
@@ -564,7 +564,7 @@ static int unix_gid_show(struct seq_file *m,
564 564
565 seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen); 565 seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen);
566 for (i = 0; i < glen; i++) 566 for (i = 0; i < glen; i++)
567 seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i))); 567 seq_printf(m, " %d", from_kgid_munged(user_ns, ug->gi->gid[i]));
568 seq_printf(m, "\n"); 568 seq_printf(m, "\n");
569 return 0; 569 return 0;
570} 570}
@@ -817,7 +817,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
817 return SVC_CLOSE; 817 return SVC_CLOSE;
818 for (i = 0; i < slen; i++) { 818 for (i = 0; i < slen; i++) {
819 kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); 819 kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv));
820 GROUP_AT(cred->cr_group_info, i) = kgid; 820 cred->cr_group_info->gid[i] = kgid;
821 } 821 }
822 if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { 822 if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
823 *authp = rpc_autherr_badverf; 823 *authp = rpc_autherr_badverf;
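Each of the hunks above replaces the old GROUP_AT() accessor with direct indexing of the flat gid array now embedded in struct group_info. A small illustration of the new access pattern (the ngroups member and the demo_* name are assumptions for the example, not shown in this patch):

#include <linux/cred.h>
#include <linux/printk.h>
#include <linux/uidgid.h>

static void demo_print_groups(const struct group_info *gi)
{
	int i;

	/* gids now live in a plain array; no GROUP_AT() indirection. */
	for (i = 0; i < gi->ngroups; i++)
		pr_info("gid[%d] = %u\n", i,
			from_kgid_munged(&init_user_ns, gi->gid[i]));
}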
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index c4f3cc0c0775..7f1071e103ca 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -767,7 +767,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
767 newbase -= xdr->buf->page_base; 767 newbase -= xdr->buf->page_base;
768 768
769 if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0) 769 if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
770 xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len); 770 xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
771} 771}
772 772
773static bool xdr_set_next_buffer(struct xdr_stream *xdr) 773static bool xdr_set_next_buffer(struct xdr_stream *xdr)
@@ -776,7 +776,7 @@ static bool xdr_set_next_buffer(struct xdr_stream *xdr)
776 xdr_set_next_page(xdr); 776 xdr_set_next_page(xdr);
777 else if (xdr->iov == xdr->buf->head) { 777 else if (xdr->iov == xdr->buf->head) {
778 if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0) 778 if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
779 xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len); 779 xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
780 } 780 }
781 return xdr->p != xdr->end; 781 return xdr->p != xdr->end;
782} 782}
@@ -859,12 +859,15 @@ EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
859static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes) 859static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
860{ 860{
861 __be32 *p; 861 __be32 *p;
862 void *cpdest = xdr->scratch.iov_base; 862 char *cpdest = xdr->scratch.iov_base;
863 size_t cplen = (char *)xdr->end - (char *)xdr->p; 863 size_t cplen = (char *)xdr->end - (char *)xdr->p;
864 864
865 if (nbytes > xdr->scratch.iov_len) 865 if (nbytes > xdr->scratch.iov_len)
866 return NULL; 866 return NULL;
867 memcpy(cpdest, xdr->p, cplen); 867 p = __xdr_inline_decode(xdr, cplen);
868 if (p == NULL)
869 return NULL;
870 memcpy(cpdest, p, cplen);
868 cpdest += cplen; 871 cpdest += cplen;
869 nbytes -= cplen; 872 nbytes -= cplen;
870 if (!xdr_set_next_buffer(xdr)) 873 if (!xdr_set_next_buffer(xdr))
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index ea244b29138b..685e6d225414 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1295,7 +1295,7 @@ void xprt_release(struct rpc_task *task)
1295 xprt_schedule_autodisconnect(xprt); 1295 xprt_schedule_autodisconnect(xprt);
1296 spin_unlock_bh(&xprt->transport_lock); 1296 spin_unlock_bh(&xprt->transport_lock);
1297 if (req->rq_buffer) 1297 if (req->rq_buffer)
1298 xprt->ops->buf_free(req->rq_buffer); 1298 xprt->ops->buf_free(task);
1299 xprt_inject_disconnect(xprt); 1299 xprt_inject_disconnect(xprt);
1300 if (req->rq_cred != NULL) 1300 if (req->rq_cred != NULL)
1301 put_rpccred(req->rq_cred); 1301 put_rpccred(req->rq_cred);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 66c9d63f4797..ae92a9e9ba52 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -15,6 +15,7 @@
15#include <asm/cmpxchg.h> 15#include <asm/cmpxchg.h>
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/sunrpc/xprt.h> 17#include <linux/sunrpc/xprt.h>
18#include <linux/sunrpc/addr.h>
18#include <linux/sunrpc/xprtmultipath.h> 19#include <linux/sunrpc/xprtmultipath.h>
19 20
20typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head, 21typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head,
@@ -49,7 +50,8 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
49 if (xprt == NULL) 50 if (xprt == NULL)
50 return; 51 return;
51 spin_lock(&xps->xps_lock); 52 spin_lock(&xps->xps_lock);
52 if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) 53 if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
54 !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
53 xprt_switch_add_xprt_locked(xps, xprt); 55 xprt_switch_add_xprt_locked(xps, xprt);
54 spin_unlock(&xps->xps_lock); 56 spin_unlock(&xps->xps_lock);
55} 57}
@@ -232,6 +234,26 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
232 return xprt_switch_find_current_entry(head, xpi->xpi_cursor); 234 return xprt_switch_find_current_entry(head, xpi->xpi_cursor);
233} 235}
234 236
237bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
238 const struct sockaddr *sap)
239{
240 struct list_head *head;
241 struct rpc_xprt *pos;
242
243 if (xps == NULL || sap == NULL)
244 return false;
245
246 head = &xps->xps_xprt_list;
247 list_for_each_entry_rcu(pos, head, xprt_switch) {
248 if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) {
249 pr_info("RPC: addr %s already in xprt switch\n",
250 pos->address_strings[RPC_DISPLAY_ADDR]);
251 return true;
252 }
253 }
254 return false;
255}
256
235static 257static
236struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, 258struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
237 const struct rpc_xprt *cur) 259 const struct rpc_xprt *cur)
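rpc_xprt_switch_add_xprt() now rejects a transport whose address is already present in the switch, using the new rpc_xprt_switch_has_addr() helper; the clnt.c hunk earlier exposes the same test at the client level as rpc_clnt_xprt_switch_has_addr(). A short, hedged sketch of how a caller might combine the new client-level helpers (the demo_* wrapper is illustrative, and the duplicate check shown here is also performed internally by the add path):

#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xprt.h>

static void demo_maybe_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
{
	/* Skip transports whose address the switch already carries. */
	if (rpc_clnt_xprt_switch_has_addr(clnt,
					  (const struct sockaddr *)&xprt->addr))
		return;

	rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
}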
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 87762d976b63..2c472e1b4827 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -27,7 +27,7 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
27 list_del(&req->rl_all); 27 list_del(&req->rl_all);
28 spin_unlock(&buf->rb_reqslock); 28 spin_unlock(&buf->rb_reqslock);
29 29
30 rpcrdma_destroy_req(&r_xprt->rx_ia, req); 30 rpcrdma_destroy_req(req);
31 31
32 kfree(rqst); 32 kfree(rqst);
33} 33}
@@ -35,10 +35,8 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
35static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, 35static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
36 struct rpc_rqst *rqst) 36 struct rpc_rqst *rqst)
37{ 37{
38 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
39 struct rpcrdma_regbuf *rb; 38 struct rpcrdma_regbuf *rb;
40 struct rpcrdma_req *req; 39 struct rpcrdma_req *req;
41 struct xdr_buf *buf;
42 size_t size; 40 size_t size;
43 41
44 req = rpcrdma_create_req(r_xprt); 42 req = rpcrdma_create_req(r_xprt);
@@ -46,30 +44,19 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
46 return PTR_ERR(req); 44 return PTR_ERR(req);
47 req->rl_backchannel = true; 45 req->rl_backchannel = true;
48 46
49 size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); 47 rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
50 rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); 48 DMA_TO_DEVICE, GFP_KERNEL);
51 if (IS_ERR(rb)) 49 if (IS_ERR(rb))
52 goto out_fail; 50 goto out_fail;
53 req->rl_rdmabuf = rb; 51 req->rl_rdmabuf = rb;
54 52
55 size += RPCRDMA_INLINE_READ_THRESHOLD(rqst); 53 size = r_xprt->rx_data.inline_rsize;
56 rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); 54 rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
57 if (IS_ERR(rb)) 55 if (IS_ERR(rb))
58 goto out_fail; 56 goto out_fail;
59 rb->rg_owner = req;
60 req->rl_sendbuf = rb; 57 req->rl_sendbuf = rb;
61 /* so that rpcr_to_rdmar works when receiving a request */ 58 xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, size);
62 rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base; 59 rpcrdma_set_xprtdata(rqst, req);
63
64 buf = &rqst->rq_snd_buf;
65 buf->head[0].iov_base = rqst->rq_buffer;
66 buf->head[0].iov_len = 0;
67 buf->tail[0].iov_base = NULL;
68 buf->tail[0].iov_len = 0;
69 buf->page_len = 0;
70 buf->len = 0;
71 buf->buflen = size;
72
73 return 0; 60 return 0;
74 61
75out_fail: 62out_fail:
@@ -219,7 +206,6 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
219 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 206 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
220 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 207 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
221 struct rpcrdma_msg *headerp; 208 struct rpcrdma_msg *headerp;
222 size_t rpclen;
223 209
224 headerp = rdmab_to_msg(req->rl_rdmabuf); 210 headerp = rdmab_to_msg(req->rl_rdmabuf);
225 headerp->rm_xid = rqst->rq_xid; 211 headerp->rm_xid = rqst->rq_xid;
@@ -231,26 +217,9 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
231 headerp->rm_body.rm_chunks[1] = xdr_zero; 217 headerp->rm_body.rm_chunks[1] = xdr_zero;
232 headerp->rm_body.rm_chunks[2] = xdr_zero; 218 headerp->rm_body.rm_chunks[2] = xdr_zero;
233 219
234 rpclen = rqst->rq_svec[0].iov_len; 220 if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN,
235 221 &rqst->rq_snd_buf, rpcrdma_noch))
236#ifdef RPCRDMA_BACKCHANNEL_DEBUG 222 return -EIO;
237 pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n",
238 __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
239 pr_info("RPC: %s: RPC/RDMA: %*ph\n",
240 __func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
241 pr_info("RPC: %s: RPC: %*ph\n",
242 __func__, (int)rpclen, rqst->rq_svec[0].iov_base);
243#endif
244
245 req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
246 req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
247 req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
248
249 req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
250 req->rl_send_iov[1].length = rpclen;
251 req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
252
253 req->rl_niovs = 2;
254 return 0; 223 return 0;
255} 224}
256 225
@@ -402,7 +371,7 @@ out_overflow:
402out_short: 371out_short:
403 pr_warn("RPC/RDMA short backward direction call\n"); 372 pr_warn("RPC/RDMA short backward direction call\n");
404 373
405 if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) 374 if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
406 xprt_disconnect_done(xprt); 375 xprt_disconnect_done(xprt);
407 else 376 else
408 pr_warn("RPC: %s: reposting rep %p\n", 377 pr_warn("RPC: %s: reposting rep %p\n",
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 21cb3b150b37..1ebb09e1ac4f 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -160,9 +160,8 @@ static int
160fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, 160fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
161 struct rpcrdma_create_data_internal *cdata) 161 struct rpcrdma_create_data_internal *cdata)
162{ 162{
163 rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, 163 ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
164 RPCRDMA_MAX_DATA_SEGS / 164 RPCRDMA_MAX_FMR_SGES);
165 RPCRDMA_MAX_FMR_SGES));
166 return 0; 165 return 0;
167} 166}
168 167
@@ -274,6 +273,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
274 */ 273 */
275 list_for_each_entry(mw, &req->rl_registered, mw_list) 274 list_for_each_entry(mw, &req->rl_registered, mw_list)
276 list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); 275 list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
276 r_xprt->rx_stats.local_inv_needed++;
277 rc = ib_unmap_fmr(&unmap_list); 277 rc = ib_unmap_fmr(&unmap_list);
278 if (rc) 278 if (rc)
279 goto out_reset; 279 goto out_reset;
@@ -331,4 +331,5 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
331 .ro_init_mr = fmr_op_init_mr, 331 .ro_init_mr = fmr_op_init_mr,
332 .ro_release_mr = fmr_op_release_mr, 332 .ro_release_mr = fmr_op_release_mr,
333 .ro_displayname = "fmr", 333 .ro_displayname = "fmr",
334 .ro_send_w_inv_ok = 0,
334}; 335};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 892b5e1d9b09..210949562786 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -67,6 +67,8 @@
67 * pending send queue WRs before the transport is reconnected. 67 * pending send queue WRs before the transport is reconnected.
68 */ 68 */
69 69
70#include <linux/sunrpc/rpc_rdma.h>
71
70#include "xprt_rdma.h" 72#include "xprt_rdma.h"
71 73
72#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 74#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -161,7 +163,7 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
161 return PTR_ERR(f->fr_mr); 163 return PTR_ERR(f->fr_mr);
162 } 164 }
163 165
164 dprintk("RPC: %s: recovered FRMR %p\n", __func__, r); 166 dprintk("RPC: %s: recovered FRMR %p\n", __func__, f);
165 f->fr_state = FRMR_IS_INVALID; 167 f->fr_state = FRMR_IS_INVALID;
166 return 0; 168 return 0;
167} 169}
@@ -242,9 +244,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
242 depth; 244 depth;
243 } 245 }
244 246
245 rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, 247 ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
246 RPCRDMA_MAX_DATA_SEGS / 248 ia->ri_max_frmr_depth);
247 ia->ri_max_frmr_depth));
248 return 0; 249 return 0;
249} 250}
250 251
@@ -329,7 +330,7 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
329 frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); 330 frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
330 if (wc->status != IB_WC_SUCCESS) 331 if (wc->status != IB_WC_SUCCESS)
331 __frwr_sendcompletion_flush(wc, frmr, "localinv"); 332 __frwr_sendcompletion_flush(wc, frmr, "localinv");
332 complete_all(&frmr->fr_linv_done); 333 complete(&frmr->fr_linv_done);
333} 334}
334 335
335/* Post a REG_MR Work Request to register a memory region 336/* Post a REG_MR Work Request to register a memory region
@@ -396,7 +397,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
396 goto out_mapmr_err; 397 goto out_mapmr_err;
397 398
398 dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", 399 dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
399 __func__, mw, mw->mw_nents, mr->length); 400 __func__, frmr, mw->mw_nents, mr->length);
400 401
401 key = (u8)(mr->rkey & 0x000000FF); 402 key = (u8)(mr->rkey & 0x000000FF);
402 ib_update_fast_reg_key(mr, ++key); 403 ib_update_fast_reg_key(mr, ++key);
@@ -449,6 +450,8 @@ __frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
449 struct rpcrdma_frmr *f = &mw->frmr; 450 struct rpcrdma_frmr *f = &mw->frmr;
450 struct ib_send_wr *invalidate_wr; 451 struct ib_send_wr *invalidate_wr;
451 452
453 dprintk("RPC: %s: invalidating frmr %p\n", __func__, f);
454
452 f->fr_state = FRMR_IS_INVALID; 455 f->fr_state = FRMR_IS_INVALID;
453 invalidate_wr = &f->fr_invwr; 456 invalidate_wr = &f->fr_invwr;
454 457
@@ -472,6 +475,7 @@ static void
472frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) 475frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
473{ 476{
474 struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; 477 struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
478 struct rpcrdma_rep *rep = req->rl_reply;
475 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 479 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
476 struct rpcrdma_mw *mw, *tmp; 480 struct rpcrdma_mw *mw, *tmp;
477 struct rpcrdma_frmr *f; 481 struct rpcrdma_frmr *f;
@@ -487,6 +491,12 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
487 f = NULL; 491 f = NULL;
488 invalidate_wrs = pos = prev = NULL; 492 invalidate_wrs = pos = prev = NULL;
489 list_for_each_entry(mw, &req->rl_registered, mw_list) { 493 list_for_each_entry(mw, &req->rl_registered, mw_list) {
494 if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) &&
495 (mw->mw_handle == rep->rr_inv_rkey)) {
496 mw->frmr.fr_state = FRMR_IS_INVALID;
497 continue;
498 }
499
490 pos = __frwr_prepare_linv_wr(mw); 500 pos = __frwr_prepare_linv_wr(mw);
491 501
492 if (!invalidate_wrs) 502 if (!invalidate_wrs)
@@ -496,6 +506,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
496 prev = pos; 506 prev = pos;
497 f = &mw->frmr; 507 f = &mw->frmr;
498 } 508 }
509 if (!f)
510 goto unmap;
499 511
500 /* Strong send queue ordering guarantees that when the 512 /* Strong send queue ordering guarantees that when the
501 * last WR in the chain completes, all WRs in the chain 513 * last WR in the chain completes, all WRs in the chain
@@ -510,6 +522,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
510 * replaces the QP. The RPC reply handler won't call us 522 * replaces the QP. The RPC reply handler won't call us
511 * unless ri_id->qp is a valid pointer. 523 * unless ri_id->qp is a valid pointer.
512 */ 524 */
525 r_xprt->rx_stats.local_inv_needed++;
513 rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); 526 rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
514 if (rc) 527 if (rc)
515 goto reset_mrs; 528 goto reset_mrs;
@@ -521,6 +534,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
521 */ 534 */
522unmap: 535unmap:
523 list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { 536 list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
537 dprintk("RPC: %s: unmapping frmr %p\n",
538 __func__, &mw->frmr);
524 list_del_init(&mw->mw_list); 539 list_del_init(&mw->mw_list);
525 ib_dma_unmap_sg(ia->ri_device, 540 ib_dma_unmap_sg(ia->ri_device,
526 mw->mw_sg, mw->mw_nents, mw->mw_dir); 541 mw->mw_sg, mw->mw_nents, mw->mw_dir);
@@ -576,4 +591,5 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
576 .ro_init_mr = frwr_op_init_mr, 591 .ro_init_mr = frwr_op_init_mr,
577 .ro_release_mr = frwr_op_release_mr, 592 .ro_release_mr = frwr_op_release_mr,
578 .ro_displayname = "frwr", 593 .ro_displayname = "frwr",
594 .ro_send_w_inv_ok = RPCRDMA_CMP_F_SND_W_INV_OK,
579}; 595};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index a47f170b20ef..d987c2d3dd6e 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,14 +53,6 @@
53# define RPCDBG_FACILITY RPCDBG_TRANS 53# define RPCDBG_FACILITY RPCDBG_TRANS
54#endif 54#endif
55 55
56enum rpcrdma_chunktype {
57 rpcrdma_noch = 0,
58 rpcrdma_readch,
59 rpcrdma_areadch,
60 rpcrdma_writech,
61 rpcrdma_replych
62};
63
64static const char transfertypes[][12] = { 56static const char transfertypes[][12] = {
65 "inline", /* no chunks */ 57 "inline", /* no chunks */
66 "read list", /* some argument via rdma read */ 58 "read list", /* some argument via rdma read */
@@ -118,10 +110,12 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
118 return size; 110 return size;
119} 111}
120 112
121void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia, 113void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
122 struct rpcrdma_create_data_internal *cdata,
123 unsigned int maxsegs)
124{ 114{
115 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
116 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
117 unsigned int maxsegs = ia->ri_max_segs;
118
125 ia->ri_max_inline_write = cdata->inline_wsize - 119 ia->ri_max_inline_write = cdata->inline_wsize -
126 rpcrdma_max_call_header_size(maxsegs); 120 rpcrdma_max_call_header_size(maxsegs);
127 ia->ri_max_inline_read = cdata->inline_rsize - 121 ia->ri_max_inline_read = cdata->inline_rsize -
@@ -155,42 +149,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
155 return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; 149 return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
156} 150}
157 151
158static int
159rpcrdma_tail_pullup(struct xdr_buf *buf)
160{
161 size_t tlen = buf->tail[0].iov_len;
162 size_t skip = tlen & 3;
163
164 /* Do not include the tail if it is only an XDR pad */
165 if (tlen < 4)
166 return 0;
167
168 /* xdr_write_pages() adds a pad at the beginning of the tail
169 * if the content in "buf->pages" is unaligned. Force the
170 * tail's actual content to land at the next XDR position
171 * after the head instead.
172 */
173 if (skip) {
174 unsigned char *src, *dst;
175 unsigned int count;
176
177 src = buf->tail[0].iov_base;
178 dst = buf->head[0].iov_base;
179 dst += buf->head[0].iov_len;
180
181 src += skip;
182 tlen -= skip;
183
184 dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n",
185 __func__, skip, dst, src, tlen);
186
187 for (count = tlen; count; count--)
188 *dst++ = *src++;
189 }
190
191 return tlen;
192}
193
194/* Split "vec" on page boundaries into segments. FMR registers pages, 152/* Split "vec" on page boundaries into segments. FMR registers pages,
195 * not a byte range. Other modes coalesce these segments into a single 153 * not a byte range. Other modes coalesce these segments into a single
196 * MR when they can. 154 * MR when they can.
@@ -229,7 +187,8 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
229 187
230static int 188static int
231rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, 189rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
232 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) 190 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg,
191 bool reminv_expected)
233{ 192{
234 int len, n, p, page_base; 193 int len, n, p, page_base;
235 struct page **ppages; 194 struct page **ppages;
@@ -271,6 +230,13 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
271 if (type == rpcrdma_readch) 230 if (type == rpcrdma_readch)
272 return n; 231 return n;
273 232
233 /* When encoding the Write list, some servers need to see an extra
234 * segment for odd-length Write chunks. The upper layer provides
235 * space in the tail iovec for this purpose.
236 */
237 if (type == rpcrdma_writech && reminv_expected)
238 return n;
239
274 if (xdrbuf->tail[0].iov_len) { 240 if (xdrbuf->tail[0].iov_len) {
275 /* the rpcrdma protocol allows us to omit any trailing 241 /* the rpcrdma protocol allows us to omit any trailing
276 * xdr pad bytes, saving the server an RDMA operation. */ 242 * xdr pad bytes, saving the server an RDMA operation. */
@@ -327,7 +293,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
327 if (rtype == rpcrdma_areadch) 293 if (rtype == rpcrdma_areadch)
328 pos = 0; 294 pos = 0;
329 seg = req->rl_segments; 295 seg = req->rl_segments;
330 nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg); 296 nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false);
331 if (nsegs < 0) 297 if (nsegs < 0)
332 return ERR_PTR(nsegs); 298 return ERR_PTR(nsegs);
333 299
@@ -391,7 +357,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
391 seg = req->rl_segments; 357 seg = req->rl_segments;
392 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 358 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
393 rqst->rq_rcv_buf.head[0].iov_len, 359 rqst->rq_rcv_buf.head[0].iov_len,
394 wtype, seg); 360 wtype, seg,
361 r_xprt->rx_ia.ri_reminv_expected);
395 if (nsegs < 0) 362 if (nsegs < 0)
396 return ERR_PTR(nsegs); 363 return ERR_PTR(nsegs);
397 364
@@ -456,7 +423,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
456 } 423 }
457 424
458 seg = req->rl_segments; 425 seg = req->rl_segments;
459 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg); 426 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
427 r_xprt->rx_ia.ri_reminv_expected);
460 if (nsegs < 0) 428 if (nsegs < 0)
461 return ERR_PTR(nsegs); 429 return ERR_PTR(nsegs);
462 430
@@ -491,74 +459,184 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
491 return iptr; 459 return iptr;
492} 460}
493 461
494/* 462/* Prepare the RPC-over-RDMA header SGE.
495 * Copy write data inline.
496 * This function is used for "small" requests. Data which is passed
497 * to RPC via iovecs (or page list) is copied directly into the
498 * pre-registered memory buffer for this request. For small amounts
499 * of data, this is efficient. The cutoff value is tunable.
500 */ 463 */
501static void rpcrdma_inline_pullup(struct rpc_rqst *rqst) 464static bool
465rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
466 u32 len)
502{ 467{
503 int i, npages, curlen; 468 struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
504 int copy_len; 469 struct ib_sge *sge = &req->rl_send_sge[0];
505 unsigned char *srcp, *destp; 470
506 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); 471 if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) {
507 int page_base; 472 if (!__rpcrdma_dma_map_regbuf(ia, rb))
508 struct page **ppages; 473 return false;
474 sge->addr = rdmab_addr(rb);
475 sge->lkey = rdmab_lkey(rb);
476 }
477 sge->length = len;
478
479 ib_dma_sync_single_for_device(ia->ri_device, sge->addr,
480 sge->length, DMA_TO_DEVICE);
481 req->rl_send_wr.num_sge++;
482 return true;
483}
509 484
510 destp = rqst->rq_svec[0].iov_base; 485/* Prepare the Send SGEs. The head and tail iovec, and each entry
511 curlen = rqst->rq_svec[0].iov_len; 486 * in the page list, gets its own SGE.
512 destp += curlen; 487 */
488static bool
489rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
490 struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
491{
492 unsigned int sge_no, page_base, len, remaining;
493 struct rpcrdma_regbuf *rb = req->rl_sendbuf;
494 struct ib_device *device = ia->ri_device;
495 struct ib_sge *sge = req->rl_send_sge;
496 u32 lkey = ia->ri_pd->local_dma_lkey;
497 struct page *page, **ppages;
498
499 /* The head iovec is straightforward, as it is already
500 * DMA-mapped. Sync the content that has changed.
501 */
502 if (!rpcrdma_dma_map_regbuf(ia, rb))
503 return false;
504 sge_no = 1;
505 sge[sge_no].addr = rdmab_addr(rb);
506 sge[sge_no].length = xdr->head[0].iov_len;
507 sge[sge_no].lkey = rdmab_lkey(rb);
508 ib_dma_sync_single_for_device(device, sge[sge_no].addr,
509 sge[sge_no].length, DMA_TO_DEVICE);
510
511 /* If there is a Read chunk, the page list is being handled
512 * via explicit RDMA, and thus is skipped here. However, the
513 * tail iovec may include an XDR pad for the page list, as
514 * well as additional content, and may not reside in the
515 * same page as the head iovec.
516 */
517 if (rtype == rpcrdma_readch) {
518 len = xdr->tail[0].iov_len;
513 519
514 dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n", 520 /* Do not include the tail if it is only an XDR pad */
515 __func__, destp, rqst->rq_slen, curlen); 521 if (len < 4)
522 goto out;
516 523
517 copy_len = rqst->rq_snd_buf.page_len; 524 page = virt_to_page(xdr->tail[0].iov_base);
525 page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
518 526
519 if (rqst->rq_snd_buf.tail[0].iov_len) { 527 /* If the content in the page list is an odd length,
520 curlen = rqst->rq_snd_buf.tail[0].iov_len; 528 * xdr_write_pages() has added a pad at the beginning
521 if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) { 529 * of the tail iovec. Force the tail's non-pad content
522 memmove(destp + copy_len, 530 * to land at the next XDR position in the Send message.
523 rqst->rq_snd_buf.tail[0].iov_base, curlen); 531 */
524 r_xprt->rx_stats.pullup_copy_count += curlen; 532 page_base += len & 3;
533 len -= len & 3;
534 goto map_tail;
535 }
536
537 /* If there is a page list present, temporarily DMA map
538 * and prepare an SGE for each page to be sent.
539 */
540 if (xdr->page_len) {
541 ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
542 page_base = xdr->page_base & ~PAGE_MASK;
543 remaining = xdr->page_len;
544 while (remaining) {
545 sge_no++;
546 if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
547 goto out_mapping_overflow;
548
549 len = min_t(u32, PAGE_SIZE - page_base, remaining);
550 sge[sge_no].addr = ib_dma_map_page(device, *ppages,
551 page_base, len,
552 DMA_TO_DEVICE);
553 if (ib_dma_mapping_error(device, sge[sge_no].addr))
554 goto out_mapping_err;
555 sge[sge_no].length = len;
556 sge[sge_no].lkey = lkey;
557
558 req->rl_mapped_sges++;
559 ppages++;
560 remaining -= len;
561 page_base = 0;
525 } 562 }
526 dprintk("RPC: %s: tail destp 0x%p len %d\n",
527 __func__, destp + copy_len, curlen);
528 rqst->rq_svec[0].iov_len += curlen;
529 } 563 }
530 r_xprt->rx_stats.pullup_copy_count += copy_len;
531 564
532 page_base = rqst->rq_snd_buf.page_base; 565 /* The tail iovec is not always constructed in the same
533 ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT); 566 * page where the head iovec resides (see, for example,
534 page_base &= ~PAGE_MASK; 567 * gss_wrap_req_priv). To neatly accommodate that case,
535 npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT; 568 * DMA map it separately.
536 for (i = 0; copy_len && i < npages; i++) { 569 */
537 curlen = PAGE_SIZE - page_base; 570 if (xdr->tail[0].iov_len) {
538 if (curlen > copy_len) 571 page = virt_to_page(xdr->tail[0].iov_base);
539 curlen = copy_len; 572 page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
540 dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n", 573 len = xdr->tail[0].iov_len;
541 __func__, i, destp, copy_len, curlen); 574
542 srcp = kmap_atomic(ppages[i]); 575map_tail:
543 memcpy(destp, srcp+page_base, curlen); 576 sge_no++;
544 kunmap_atomic(srcp); 577 sge[sge_no].addr = ib_dma_map_page(device, page,
545 rqst->rq_svec[0].iov_len += curlen; 578 page_base, len,
546 destp += curlen; 579 DMA_TO_DEVICE);
547 copy_len -= curlen; 580 if (ib_dma_mapping_error(device, sge[sge_no].addr))
548 page_base = 0; 581 goto out_mapping_err;
582 sge[sge_no].length = len;
583 sge[sge_no].lkey = lkey;
584 req->rl_mapped_sges++;
549 } 585 }
550 /* header now contains entire send message */ 586
587out:
588 req->rl_send_wr.num_sge = sge_no + 1;
589 return true;
590
591out_mapping_overflow:
592 pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
593 return false;
594
595out_mapping_err:
596 pr_err("rpcrdma: Send mapping error\n");
597 return false;
598}
599
600bool
601rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
602 u32 hdrlen, struct xdr_buf *xdr,
603 enum rpcrdma_chunktype rtype)
604{
605 req->rl_send_wr.num_sge = 0;
606 req->rl_mapped_sges = 0;
607
608 if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen))
609 goto out_map;
610
611 if (rtype != rpcrdma_areadch)
612 if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype))
613 goto out_map;
614
615 return true;
616
617out_map:
618 pr_err("rpcrdma: failed to DMA map a Send buffer\n");
619 return false;
620}
621
622void
623rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
624{
625 struct ib_device *device = ia->ri_device;
626 struct ib_sge *sge;
627 int count;
628
629 sge = &req->rl_send_sge[2];
630 for (count = req->rl_mapped_sges; count--; sge++)
631 ib_dma_unmap_page(device, sge->addr, sge->length,
632 DMA_TO_DEVICE);
633 req->rl_mapped_sges = 0;
551} 634}
552 635
553/* 636/*
554 * Marshal a request: the primary job of this routine is to choose 637 * Marshal a request: the primary job of this routine is to choose
555 * the transfer modes. See comments below. 638 * the transfer modes. See comments below.
556 * 639 *
557 * Prepares up to two IOVs per Call message:
558 *
559 * [0] -- RPC RDMA header
560 * [1] -- the RPC header/data
561 *
562 * Returns zero on success, otherwise a negative errno. 640 * Returns zero on success, otherwise a negative errno.
563 */ 641 */
564 642
@@ -626,12 +704,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
626 */ 704 */
627 if (rpcrdma_args_inline(r_xprt, rqst)) { 705 if (rpcrdma_args_inline(r_xprt, rqst)) {
628 rtype = rpcrdma_noch; 706 rtype = rpcrdma_noch;
629 rpcrdma_inline_pullup(rqst); 707 rpclen = rqst->rq_snd_buf.len;
630 rpclen = rqst->rq_svec[0].iov_len;
631 } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { 708 } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
632 rtype = rpcrdma_readch; 709 rtype = rpcrdma_readch;
633 rpclen = rqst->rq_svec[0].iov_len; 710 rpclen = rqst->rq_snd_buf.head[0].iov_len +
634 rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); 711 rqst->rq_snd_buf.tail[0].iov_len;
635 } else { 712 } else {
636 r_xprt->rx_stats.nomsg_call_count++; 713 r_xprt->rx_stats.nomsg_call_count++;
637 headerp->rm_type = htonl(RDMA_NOMSG); 714 headerp->rm_type = htonl(RDMA_NOMSG);
@@ -673,34 +750,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
673 goto out_unmap; 750 goto out_unmap;
674 hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; 751 hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
675 752
676 if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
677 goto out_overflow;
678
679 dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", 753 dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
680 rqst->rq_task->tk_pid, __func__, 754 rqst->rq_task->tk_pid, __func__,
681 transfertypes[rtype], transfertypes[wtype], 755 transfertypes[rtype], transfertypes[wtype],
682 hdrlen, rpclen); 756 hdrlen, rpclen);
683 757
684 req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); 758 if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
685 req->rl_send_iov[0].length = hdrlen; 759 &rqst->rq_snd_buf, rtype)) {
686 req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); 760 iptr = ERR_PTR(-EIO);
687 761 goto out_unmap;
688 req->rl_niovs = 1; 762 }
689 if (rtype == rpcrdma_areadch)
690 return 0;
691
692 req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
693 req->rl_send_iov[1].length = rpclen;
694 req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
695
696 req->rl_niovs = 2;
697 return 0; 763 return 0;
698 764
699out_overflow:
700 pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
701 hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
702 iptr = ERR_PTR(-EIO);
703
704out_unmap: 765out_unmap:
705 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); 766 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
706 return PTR_ERR(iptr); 767 return PTR_ERR(iptr);
@@ -916,8 +977,10 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
916 * allowed to timeout, to discover the errors at that time. 977 * allowed to timeout, to discover the errors at that time.
917 */ 978 */
918void 979void
919rpcrdma_reply_handler(struct rpcrdma_rep *rep) 980rpcrdma_reply_handler(struct work_struct *work)
920{ 981{
982 struct rpcrdma_rep *rep =
983 container_of(work, struct rpcrdma_rep, rr_work);
921 struct rpcrdma_msg *headerp; 984 struct rpcrdma_msg *headerp;
922 struct rpcrdma_req *req; 985 struct rpcrdma_req *req;
923 struct rpc_rqst *rqst; 986 struct rpc_rqst *rqst;
@@ -1132,6 +1195,6 @@ out_duplicate:
1132 1195
1133repost: 1196repost:
1134 r_xprt->rx_stats.bad_reply_count++; 1197 r_xprt->rx_stats.bad_reply_count++;
1135 if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) 1198 if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
1136 rpcrdma_recv_buffer_put(rep); 1199 rpcrdma_recv_buffer_put(rep);
1137} 1200}
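One subtle step in rpcrdma_prepare_msg_sges() above is the Read-chunk case: the tail iovec may begin with an XDR pad inserted by xdr_write_pages(), and that pad must not be transmitted. Because the genuine tail content is itself four-byte aligned, the pad length is simply the tail length modulo four, which is what the statements page_base += len & 3 and len -= len & 3 compute. A small worked example, with numbers invented for illustration:

static void demo_tail_pad(void)
{
	/* Say the page list carried 1021 bytes of data, so xdr_write_pages()
	 * placed a 3-byte pad at the start of the tail, followed by 20 bytes
	 * of real, XDR-aligned tail content. */
	unsigned int tail_len = 3 + 20;		/* xdr->tail[0].iov_len == 23 */
	unsigned int pad      = tail_len & 3;	/* 23 & 3 == 3: the pad size  */
	unsigned int mapped   = tail_len - pad;	/* 20 bytes receive an SGE    */

	/* rpcrdma_prepare_msg_sges() advances page_base by 'pad' and
	 * DMA-maps only 'mapped' bytes, so the pad never goes on the wire. */
	(void)mapped;
}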
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index a2a7519b0f23..2d8545c34095 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -129,7 +129,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
129 ret = -EIO; 129 ret = -EIO;
130 goto out_unmap; 130 goto out_unmap;
131 } 131 }
132 atomic_inc(&rdma->sc_dma_used); 132 svc_rdma_count_mappings(rdma, ctxt);
133 133
134 memset(&send_wr, 0, sizeof(send_wr)); 134 memset(&send_wr, 0, sizeof(send_wr));
135 ctxt->cqe.done = svc_rdma_wc_send; 135 ctxt->cqe.done = svc_rdma_wc_send;
@@ -159,33 +159,34 @@ out_unmap:
159/* Server-side transport endpoint wants a whole page for its send 159/* Server-side transport endpoint wants a whole page for its send
160 * buffer. The client RPC code constructs the RPC header in this 160 * buffer. The client RPC code constructs the RPC header in this
161 * buffer before it invokes ->send_request. 161 * buffer before it invokes ->send_request.
162 *
163 * Returns NULL if there was a temporary allocation failure.
164 */ 162 */
165static void * 163static int
166xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) 164xprt_rdma_bc_allocate(struct rpc_task *task)
167{ 165{
168 struct rpc_rqst *rqst = task->tk_rqstp; 166 struct rpc_rqst *rqst = task->tk_rqstp;
169 struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; 167 struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
168 size_t size = rqst->rq_callsize;
170 struct svcxprt_rdma *rdma; 169 struct svcxprt_rdma *rdma;
171 struct page *page; 170 struct page *page;
172 171
173 rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); 172 rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
174 173
175 /* Prevent an infinite loop: try to make this case work */ 174 if (size > PAGE_SIZE) {
176 if (size > PAGE_SIZE)
177 WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", 175 WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
178 size); 176 size);
177 return -EINVAL;
178 }
179 179
180 page = alloc_page(RPCRDMA_DEF_GFP); 180 page = alloc_page(RPCRDMA_DEF_GFP);
181 if (!page) 181 if (!page)
182 return NULL; 182 return -ENOMEM;
183 183
184 return page_address(page); 184 rqst->rq_buffer = page_address(page);
185 return 0;
185} 186}
186 187
187static void 188static void
188xprt_rdma_bc_free(void *buffer) 189xprt_rdma_bc_free(struct rpc_task *task)
189{ 190{
190 /* No-op: ctxt and page have already been freed. */ 191 /* No-op: ctxt and page have already been freed. */
191} 192}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 2c25606f2561..ad1df979b3f0 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -159,7 +159,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
159 ctxt->sge[pno].addr); 159 ctxt->sge[pno].addr);
160 if (ret) 160 if (ret)
161 goto err; 161 goto err;
162 atomic_inc(&xprt->sc_dma_used); 162 svc_rdma_count_mappings(xprt, ctxt);
163 163
164 ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; 164 ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey;
165 ctxt->sge[pno].length = len; 165 ctxt->sge[pno].length = len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 54d533300620..f5a91edcd233 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -225,6 +225,48 @@ svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp,
225 return rp_ary; 225 return rp_ary;
226} 226}
227 227
228/* RPC-over-RDMA Version One private extension: Remote Invalidation.
229 * Responder's choice: requester signals it can handle Send With
230 * Invalidate, and responder chooses one rkey to invalidate.
231 *
232 * Find a candidate rkey to invalidate when sending a reply. Picks the
233 * first rkey it finds in the chunks lists.
234 *
235 * Returns zero if RPC's chunk lists are empty.
236 */
237static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
238 struct rpcrdma_write_array *wr_ary,
239 struct rpcrdma_write_array *rp_ary)
240{
241 struct rpcrdma_read_chunk *rd_ary;
242 struct rpcrdma_segment *arg_ch;
243 u32 inv_rkey;
244
245 inv_rkey = 0;
246
247 rd_ary = svc_rdma_get_read_chunk(rdma_argp);
248 if (rd_ary) {
249 inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle);
250 goto out;
251 }
252
253 if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
254 arg_ch = &wr_ary->wc_array[0].wc_target;
255 inv_rkey = be32_to_cpu(arg_ch->rs_handle);
256 goto out;
257 }
258
259 if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
260 arg_ch = &rp_ary->wc_array[0].wc_target;
261 inv_rkey = be32_to_cpu(arg_ch->rs_handle);
262 goto out;
263 }
264
265out:
266 dprintk("svcrdma: Send With Invalidate rkey=%08x\n", inv_rkey);
267 return inv_rkey;
268}
269
228/* Assumptions: 270/* Assumptions:
229 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE 271 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
230 */ 272 */
@@ -280,7 +322,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
280 if (ib_dma_mapping_error(xprt->sc_cm_id->device, 322 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
281 sge[sge_no].addr)) 323 sge[sge_no].addr))
282 goto err; 324 goto err;
283 atomic_inc(&xprt->sc_dma_used); 325 svc_rdma_count_mappings(xprt, ctxt);
284 sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; 326 sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
285 ctxt->count++; 327 ctxt->count++;
286 sge_off = 0; 328 sge_off = 0;
@@ -464,7 +506,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
464 struct page *page, 506 struct page *page,
465 struct rpcrdma_msg *rdma_resp, 507 struct rpcrdma_msg *rdma_resp,
466 struct svc_rdma_req_map *vec, 508 struct svc_rdma_req_map *vec,
467 int byte_count) 509 int byte_count,
510 u32 inv_rkey)
468{ 511{
469 struct svc_rdma_op_ctxt *ctxt; 512 struct svc_rdma_op_ctxt *ctxt;
470 struct ib_send_wr send_wr; 513 struct ib_send_wr send_wr;
@@ -489,7 +532,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
489 ctxt->sge[0].length, DMA_TO_DEVICE); 532 ctxt->sge[0].length, DMA_TO_DEVICE);
490 if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) 533 if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
491 goto err; 534 goto err;
492 atomic_inc(&rdma->sc_dma_used); 535 svc_rdma_count_mappings(rdma, ctxt);
493 536
494 ctxt->direction = DMA_TO_DEVICE; 537 ctxt->direction = DMA_TO_DEVICE;
495 538
@@ -505,7 +548,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
505 if (ib_dma_mapping_error(rdma->sc_cm_id->device, 548 if (ib_dma_mapping_error(rdma->sc_cm_id->device,
506 ctxt->sge[sge_no].addr)) 549 ctxt->sge[sge_no].addr))
507 goto err; 550 goto err;
508 atomic_inc(&rdma->sc_dma_used); 551 svc_rdma_count_mappings(rdma, ctxt);
509 ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; 552 ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
510 ctxt->sge[sge_no].length = sge_bytes; 553 ctxt->sge[sge_no].length = sge_bytes;
511 } 554 }
@@ -523,23 +566,9 @@ static int send_reply(struct svcxprt_rdma *rdma,
523 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; 566 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
524 ctxt->count++; 567 ctxt->count++;
525 rqstp->rq_respages[page_no] = NULL; 568 rqstp->rq_respages[page_no] = NULL;
526 /*
527 * If there are more pages than SGE, terminate SGE
528 * list so that svc_rdma_unmap_dma doesn't attempt to
529 * unmap garbage.
530 */
531 if (page_no+1 >= sge_no)
532 ctxt->sge[page_no+1].length = 0;
533 } 569 }
534 rqstp->rq_next_page = rqstp->rq_respages + 1; 570 rqstp->rq_next_page = rqstp->rq_respages + 1;
535 571
536 /* The loop above bumps sc_dma_used for each sge. The
537 * xdr_buf.tail gets a separate sge, but resides in the
538 * same page as xdr_buf.head. Don't count it twice.
539 */
540 if (sge_no > ctxt->count)
541 atomic_dec(&rdma->sc_dma_used);
542
543 if (sge_no > rdma->sc_max_sge) { 572 if (sge_no > rdma->sc_max_sge) {
544 pr_err("svcrdma: Too many sges (%d)\n", sge_no); 573 pr_err("svcrdma: Too many sges (%d)\n", sge_no);
545 goto err; 574 goto err;
@@ -549,7 +578,11 @@ static int send_reply(struct svcxprt_rdma *rdma,
549 send_wr.wr_cqe = &ctxt->cqe; 578 send_wr.wr_cqe = &ctxt->cqe;
550 send_wr.sg_list = ctxt->sge; 579 send_wr.sg_list = ctxt->sge;
551 send_wr.num_sge = sge_no; 580 send_wr.num_sge = sge_no;
552 send_wr.opcode = IB_WR_SEND; 581 if (inv_rkey) {
582 send_wr.opcode = IB_WR_SEND_WITH_INV;
583 send_wr.ex.invalidate_rkey = inv_rkey;
584 } else
585 send_wr.opcode = IB_WR_SEND;
553 send_wr.send_flags = IB_SEND_SIGNALED; 586 send_wr.send_flags = IB_SEND_SIGNALED;
554 587
555 ret = svc_rdma_send(rdma, &send_wr); 588 ret = svc_rdma_send(rdma, &send_wr);
@@ -581,6 +614,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
581 int inline_bytes; 614 int inline_bytes;
582 struct page *res_page; 615 struct page *res_page;
583 struct svc_rdma_req_map *vec; 616 struct svc_rdma_req_map *vec;
617 u32 inv_rkey;
584 618
585 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); 619 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
586 620
@@ -591,6 +625,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
591 wr_ary = svc_rdma_get_write_array(rdma_argp); 625 wr_ary = svc_rdma_get_write_array(rdma_argp);
592 rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary); 626 rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
593 627
628 inv_rkey = 0;
629 if (rdma->sc_snd_w_inv)
630 inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary);
631
594 /* Build an req vec for the XDR */ 632 /* Build an req vec for the XDR */
595 vec = svc_rdma_get_req_map(rdma); 633 vec = svc_rdma_get_req_map(rdma);
596 ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL); 634 ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
@@ -633,9 +671,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
633 goto err1; 671 goto err1;
634 672
635 ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, 673 ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
636 inline_bytes); 674 inline_bytes, inv_rkey);
637 if (ret < 0) 675 if (ret < 0)
638 goto err1; 676 goto err0;
639 677
640 svc_rdma_put_req_map(rdma, vec); 678 svc_rdma_put_req_map(rdma, vec);
641 dprintk("svcrdma: send_reply returns %d\n", ret); 679 dprintk("svcrdma: send_reply returns %d\n", ret);
@@ -692,7 +730,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
692 svc_rdma_put_context(ctxt, 1); 730 svc_rdma_put_context(ctxt, 1);
693 return; 731 return;
694 } 732 }
695 atomic_inc(&xprt->sc_dma_used); 733 svc_rdma_count_mappings(xprt, ctxt);
696 734
697 /* Prepare SEND WR */ 735 /* Prepare SEND WR */
698 memset(&err_wr, 0, sizeof(err_wr)); 736 memset(&err_wr, 0, sizeof(err_wr));
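The svc_rdma_sendto.c hunks above replace each direct atomic_inc(&rdma->sc_dma_used) with a call to svc_rdma_count_mappings(), and thread a new inv_rkey argument into send_reply() so the reply can be posted as IB_WR_SEND_WITH_INV when the peer permits remote invalidation. The helper itself lives in a header outside this diff; a minimal sketch, assuming it only needs to bump the per-context SGE count alongside the existing transport-wide counter, might look like:

    /* Sketch only: record one more DMA-mapped SGE on this context.
     * Relies on ctxt->mapped_sges and xprt->sc_dma_used, both of which
     * appear in the svc_rdma_transport.c hunks below.
     */
    static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma,
                                               struct svc_rdma_op_ctxt *ctxt)
    {
            ctxt->mapped_sges++;
            atomic_inc(&rdma->sc_dma_used);
    }

With the mapping count carried on the context, svc_rdma_unmap_dma() no longer needs the old "terminate the SGE list" and "don't count the tail twice" workarounds, which is why those comment blocks are deleted above.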
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index dd9440137834..6864fb967038 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -198,6 +198,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
198 198
199out: 199out:
200 ctxt->count = 0; 200 ctxt->count = 0;
201 ctxt->mapped_sges = 0;
201 ctxt->frmr = NULL; 202 ctxt->frmr = NULL;
202 return ctxt; 203 return ctxt;
203 204
@@ -221,22 +222,27 @@ out_empty:
221void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) 222void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
222{ 223{
223 struct svcxprt_rdma *xprt = ctxt->xprt; 224 struct svcxprt_rdma *xprt = ctxt->xprt;
224 int i; 225 struct ib_device *device = xprt->sc_cm_id->device;
225 for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { 226 u32 lkey = xprt->sc_pd->local_dma_lkey;
227 unsigned int i, count;
228
229 for (count = 0, i = 0; i < ctxt->mapped_sges; i++) {
226 /* 230 /*
227 * Unmap the DMA addr in the SGE if the lkey matches 231 * Unmap the DMA addr in the SGE if the lkey matches
228 * the local_dma_lkey, otherwise, ignore it since it is 232 * the local_dma_lkey, otherwise, ignore it since it is
229 * an FRMR lkey and will be unmapped later when the 233 * an FRMR lkey and will be unmapped later when the
230 * last WR that uses it completes. 234 * last WR that uses it completes.
231 */ 235 */
232 if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) { 236 if (ctxt->sge[i].lkey == lkey) {
233 atomic_dec(&xprt->sc_dma_used); 237 count++;
234 ib_dma_unmap_page(xprt->sc_cm_id->device, 238 ib_dma_unmap_page(device,
235 ctxt->sge[i].addr, 239 ctxt->sge[i].addr,
236 ctxt->sge[i].length, 240 ctxt->sge[i].length,
237 ctxt->direction); 241 ctxt->direction);
238 } 242 }
239 } 243 }
244 ctxt->mapped_sges = 0;
245 atomic_sub(count, &xprt->sc_dma_used);
240} 246}
241 247
242void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) 248void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -600,7 +606,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
600 DMA_FROM_DEVICE); 606 DMA_FROM_DEVICE);
601 if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) 607 if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
602 goto err_put_ctxt; 608 goto err_put_ctxt;
603 atomic_inc(&xprt->sc_dma_used); 609 svc_rdma_count_mappings(xprt, ctxt);
604 ctxt->sge[sge_no].addr = pa; 610 ctxt->sge[sge_no].addr = pa;
605 ctxt->sge[sge_no].length = PAGE_SIZE; 611 ctxt->sge[sge_no].length = PAGE_SIZE;
606 ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; 612 ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
@@ -642,6 +648,26 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
642 return ret; 648 return ret;
643} 649}
644 650
651static void
652svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt,
653 struct rdma_conn_param *param)
654{
655 const struct rpcrdma_connect_private *pmsg = param->private_data;
656
657 if (pmsg &&
658 pmsg->cp_magic == rpcrdma_cmp_magic &&
659 pmsg->cp_version == RPCRDMA_CMP_VERSION) {
660 newxprt->sc_snd_w_inv = pmsg->cp_flags &
661 RPCRDMA_CMP_F_SND_W_INV_OK;
662
663 dprintk("svcrdma: client send_size %u, recv_size %u "
664 "remote inv %ssupported\n",
665 rpcrdma_decode_buffer_size(pmsg->cp_send_size),
666 rpcrdma_decode_buffer_size(pmsg->cp_recv_size),
667 newxprt->sc_snd_w_inv ? "" : "un");
668 }
669}
670
645/* 671/*
646 * This function handles the CONNECT_REQUEST event on a listening 672 * This function handles the CONNECT_REQUEST event on a listening
647 * endpoint. It is passed the cma_id for the _new_ connection. The context in 673 * endpoint. It is passed the cma_id for the _new_ connection. The context in
@@ -653,7 +679,8 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
653 * will call the recvfrom method on the listen xprt which will accept the new 679 * will call the recvfrom method on the listen xprt which will accept the new
654 * connection. 680 * connection.
655 */ 681 */
656static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) 682static void handle_connect_req(struct rdma_cm_id *new_cma_id,
683 struct rdma_conn_param *param)
657{ 684{
658 struct svcxprt_rdma *listen_xprt = new_cma_id->context; 685 struct svcxprt_rdma *listen_xprt = new_cma_id->context;
659 struct svcxprt_rdma *newxprt; 686 struct svcxprt_rdma *newxprt;
@@ -669,9 +696,10 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
669 new_cma_id->context = newxprt; 696 new_cma_id->context = newxprt;
670 dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", 697 dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
671 newxprt, newxprt->sc_cm_id, listen_xprt); 698 newxprt, newxprt->sc_cm_id, listen_xprt);
699 svc_rdma_parse_connect_private(newxprt, param);
672 700
673 /* Save client advertised inbound read limit for use later in accept. */ 701 /* Save client advertised inbound read limit for use later in accept. */
674 newxprt->sc_ord = client_ird; 702 newxprt->sc_ord = param->initiator_depth;
675 703
676 /* Set the local and remote addresses in the transport */ 704 /* Set the local and remote addresses in the transport */
677 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; 705 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
@@ -706,8 +734,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
706 dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " 734 dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
707 "event = %s (%d)\n", cma_id, cma_id->context, 735 "event = %s (%d)\n", cma_id, cma_id->context,
708 rdma_event_msg(event->event), event->event); 736 rdma_event_msg(event->event), event->event);
709 handle_connect_req(cma_id, 737 handle_connect_req(cma_id, &event->param.conn);
710 event->param.conn.initiator_depth);
711 break; 738 break;
712 739
713 case RDMA_CM_EVENT_ESTABLISHED: 740 case RDMA_CM_EVENT_ESTABLISHED:
@@ -941,6 +968,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
941 struct svcxprt_rdma *listen_rdma; 968 struct svcxprt_rdma *listen_rdma;
942 struct svcxprt_rdma *newxprt = NULL; 969 struct svcxprt_rdma *newxprt = NULL;
943 struct rdma_conn_param conn_param; 970 struct rdma_conn_param conn_param;
971 struct rpcrdma_connect_private pmsg;
944 struct ib_qp_init_attr qp_attr; 972 struct ib_qp_init_attr qp_attr;
945 struct ib_device *dev; 973 struct ib_device *dev;
946 unsigned int i; 974 unsigned int i;
@@ -993,7 +1021,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
993 newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord); 1021 newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
994 newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); 1022 newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
995 1023
996 newxprt->sc_pd = ib_alloc_pd(dev); 1024 newxprt->sc_pd = ib_alloc_pd(dev, 0);
997 if (IS_ERR(newxprt->sc_pd)) { 1025 if (IS_ERR(newxprt->sc_pd)) {
998 dprintk("svcrdma: error creating PD for connect request\n"); 1026 dprintk("svcrdma: error creating PD for connect request\n");
999 goto errout; 1027 goto errout;
@@ -1070,7 +1098,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1070 dev->attrs.max_fast_reg_page_list_len; 1098 dev->attrs.max_fast_reg_page_list_len;
1071 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; 1099 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
1072 newxprt->sc_reader = rdma_read_chunk_frmr; 1100 newxprt->sc_reader = rdma_read_chunk_frmr;
1073 } 1101 } else
1102 newxprt->sc_snd_w_inv = false;
1074 1103
1075 /* 1104 /*
1076 * Determine if a DMA MR is required and if so, what privs are required 1105 * Determine if a DMA MR is required and if so, what privs are required
@@ -1094,11 +1123,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1094 /* Swap out the handler */ 1123 /* Swap out the handler */
1095 newxprt->sc_cm_id->event_handler = rdma_cma_handler; 1124 newxprt->sc_cm_id->event_handler = rdma_cma_handler;
1096 1125
1126 /* Construct RDMA-CM private message */
1127 pmsg.cp_magic = rpcrdma_cmp_magic;
1128 pmsg.cp_version = RPCRDMA_CMP_VERSION;
1129 pmsg.cp_flags = 0;
1130 pmsg.cp_send_size = pmsg.cp_recv_size =
1131 rpcrdma_encode_buffer_size(newxprt->sc_max_req_size);
1132
1097 /* Accept Connection */ 1133 /* Accept Connection */
1098 set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); 1134 set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
1099 memset(&conn_param, 0, sizeof conn_param); 1135 memset(&conn_param, 0, sizeof conn_param);
1100 conn_param.responder_resources = 0; 1136 conn_param.responder_resources = 0;
1101 conn_param.initiator_depth = newxprt->sc_ord; 1137 conn_param.initiator_depth = newxprt->sc_ord;
1138 conn_param.private_data = &pmsg;
1139 conn_param.private_data_len = sizeof(pmsg);
1102 ret = rdma_accept(newxprt->sc_cm_id, &conn_param); 1140 ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
1103 if (ret) { 1141 if (ret) {
1104 dprintk("svcrdma: failed to accept new connection, ret=%d\n", 1142 dprintk("svcrdma: failed to accept new connection, ret=%d\n",
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 81f0e879f019..ed5e285fd2ea 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -97,7 +97,7 @@ static struct ctl_table xr_tunables_table[] = {
97 .data = &xprt_rdma_max_inline_read, 97 .data = &xprt_rdma_max_inline_read,
98 .maxlen = sizeof(unsigned int), 98 .maxlen = sizeof(unsigned int),
99 .mode = 0644, 99 .mode = 0644,
100 .proc_handler = proc_dointvec, 100 .proc_handler = proc_dointvec_minmax,
101 .extra1 = &min_inline_size, 101 .extra1 = &min_inline_size,
102 .extra2 = &max_inline_size, 102 .extra2 = &max_inline_size,
103 }, 103 },
@@ -106,7 +106,7 @@ static struct ctl_table xr_tunables_table[] = {
106 .data = &xprt_rdma_max_inline_write, 106 .data = &xprt_rdma_max_inline_write,
107 .maxlen = sizeof(unsigned int), 107 .maxlen = sizeof(unsigned int),
108 .mode = 0644, 108 .mode = 0644,
109 .proc_handler = proc_dointvec, 109 .proc_handler = proc_dointvec_minmax,
110 .extra1 = &min_inline_size, 110 .extra1 = &min_inline_size,
111 .extra2 = &max_inline_size, 111 .extra2 = &max_inline_size,
112 }, 112 },
@@ -477,115 +477,152 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
477 } 477 }
478} 478}
479 479
480/* 480/* Allocate a fixed-size buffer in which to construct and send the
481 * The RDMA allocate/free functions need the task structure as a place 481 * RPC-over-RDMA header for this request.
482 * to hide the struct rpcrdma_req, which is necessary for the actual send/recv 482 */
483 * sequence. 483static bool
484rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
485 gfp_t flags)
486{
487 size_t size = RPCRDMA_HDRBUF_SIZE;
488 struct rpcrdma_regbuf *rb;
489
490 if (req->rl_rdmabuf)
491 return true;
492
493 rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
494 if (IS_ERR(rb))
495 return false;
496
497 r_xprt->rx_stats.hardway_register_count += size;
498 req->rl_rdmabuf = rb;
499 return true;
500}
501
502static bool
503rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
504 size_t size, gfp_t flags)
505{
506 struct rpcrdma_regbuf *rb;
507
508 if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size)
509 return true;
510
511 rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
512 if (IS_ERR(rb))
513 return false;
514
515 rpcrdma_free_regbuf(req->rl_sendbuf);
516 r_xprt->rx_stats.hardway_register_count += size;
517 req->rl_sendbuf = rb;
518 return true;
519}
520
521/* The rq_rcv_buf is used only if a Reply chunk is necessary.
522 * The decision to use a Reply chunk is made later in
523 * rpcrdma_marshal_req. This buffer is registered at that time.
484 * 524 *
485 * The RPC layer allocates both send and receive buffers in the same call 525 * Otherwise, the associated RPC Reply arrives in a separate
486 * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer). 526 * Receive buffer, arbitrarily chosen by the HCA. The buffer
487 * We may register rq_rcv_buf when using reply chunks. 527 * allocated here for the RPC Reply is not utilized in that
528 * case. See rpcrdma_inline_fixup.
529 *
530 * A regbuf is used here to remember the buffer size.
488 */ 531 */
489static void * 532static bool
490xprt_rdma_allocate(struct rpc_task *task, size_t size) 533rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
534 size_t size, gfp_t flags)
491{ 535{
492 struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
493 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
494 struct rpcrdma_regbuf *rb; 536 struct rpcrdma_regbuf *rb;
537
538 if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size)
539 return true;
540
541 rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags);
542 if (IS_ERR(rb))
543 return false;
544
545 rpcrdma_free_regbuf(req->rl_recvbuf);
546 r_xprt->rx_stats.hardway_register_count += size;
547 req->rl_recvbuf = rb;
548 return true;
549}
550
551/**
552 * xprt_rdma_allocate - allocate transport resources for an RPC
553 * @task: RPC task
554 *
555 * Return values:
556 * 0: Success; rq_buffer points to RPC buffer to use
557 * ENOMEM: Out of memory, call again later
558 * EIO: A permanent error occurred, do not retry
559 *
560 * The RDMA allocate/free functions need the task structure as a place
561 * to hide the struct rpcrdma_req, which is necessary for the actual
562 * send/recv sequence.
563 *
564 * xprt_rdma_allocate provides buffers that are already mapped for
565 * DMA, and a local DMA lkey is provided for each.
566 */
567static int
568xprt_rdma_allocate(struct rpc_task *task)
569{
570 struct rpc_rqst *rqst = task->tk_rqstp;
571 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
495 struct rpcrdma_req *req; 572 struct rpcrdma_req *req;
496 size_t min_size;
497 gfp_t flags; 573 gfp_t flags;
498 574
499 req = rpcrdma_buffer_get(&r_xprt->rx_buf); 575 req = rpcrdma_buffer_get(&r_xprt->rx_buf);
500 if (req == NULL) 576 if (req == NULL)
501 return NULL; 577 return -ENOMEM;
502 578
503 flags = RPCRDMA_DEF_GFP; 579 flags = RPCRDMA_DEF_GFP;
504 if (RPC_IS_SWAPPER(task)) 580 if (RPC_IS_SWAPPER(task))
505 flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; 581 flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
506 582
507 if (req->rl_rdmabuf == NULL) 583 if (!rpcrdma_get_rdmabuf(r_xprt, req, flags))
508 goto out_rdmabuf;
509 if (req->rl_sendbuf == NULL)
510 goto out_sendbuf;
511 if (size > req->rl_sendbuf->rg_size)
512 goto out_sendbuf;
513
514out:
515 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
516 req->rl_connect_cookie = 0; /* our reserved value */
517 req->rl_task = task;
518 return req->rl_sendbuf->rg_base;
519
520out_rdmabuf:
521 min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
522 rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
523 if (IS_ERR(rb))
524 goto out_fail; 584 goto out_fail;
525 req->rl_rdmabuf = rb; 585 if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
526 586 goto out_fail;
527out_sendbuf: 587 if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
528 /* XDR encoding and RPC/RDMA marshaling of this request has not
529 * yet occurred. Thus a lower bound is needed to prevent buffer
530 * overrun during marshaling.
531 *
532 * RPC/RDMA marshaling may choose to send payload bearing ops
533 * inline, if the result is smaller than the inline threshold.
534 * The value of the "size" argument accounts for header
535 * requirements but not for the payload in these cases.
536 *
537 * Likewise, allocate enough space to receive a reply up to the
538 * size of the inline threshold.
539 *
540 * It's unlikely that both the send header and the received
541 * reply will be large, but slush is provided here to allow
542 * flexibility when marshaling.
543 */
544 min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp);
545 min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
546 if (size < min_size)
547 size = min_size;
548
549 rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
550 if (IS_ERR(rb))
551 goto out_fail; 588 goto out_fail;
552 rb->rg_owner = req;
553 589
554 r_xprt->rx_stats.hardway_register_count += size; 590 dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n",
555 rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf); 591 task->tk_pid, __func__, rqst->rq_callsize,
556 req->rl_sendbuf = rb; 592 rqst->rq_rcvsize, req);
557 goto out; 593
594 req->rl_connect_cookie = 0; /* our reserved value */
595 rpcrdma_set_xprtdata(rqst, req);
596 rqst->rq_buffer = req->rl_sendbuf->rg_base;
597 rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
598 return 0;
558 599
559out_fail: 600out_fail:
560 rpcrdma_buffer_put(req); 601 rpcrdma_buffer_put(req);
561 return NULL; 602 return -ENOMEM;
562} 603}
563 604
564/* 605/**
565 * This function returns all RDMA resources to the pool. 606 * xprt_rdma_free - release resources allocated by xprt_rdma_allocate
607 * @task: RPC task
608 *
609 * Caller guarantees rqst->rq_buffer is non-NULL.
566 */ 610 */
567static void 611static void
568xprt_rdma_free(void *buffer) 612xprt_rdma_free(struct rpc_task *task)
569{ 613{
570 struct rpcrdma_req *req; 614 struct rpc_rqst *rqst = task->tk_rqstp;
571 struct rpcrdma_xprt *r_xprt; 615 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
572 struct rpcrdma_regbuf *rb; 616 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
573 617 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
574 if (buffer == NULL)
575 return;
576 618
577 rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
578 req = rb->rg_owner;
579 if (req->rl_backchannel) 619 if (req->rl_backchannel)
580 return; 620 return;
581 621
582 r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
583
584 dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); 622 dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
585 623
586 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, 624 ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
587 !RPC_IS_ASYNC(req->rl_task)); 625 rpcrdma_unmap_sges(ia, req);
588
589 rpcrdma_buffer_put(req); 626 rpcrdma_buffer_put(req);
590} 627}
591 628
@@ -685,10 +722,11 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
685 r_xprt->rx_stats.failed_marshal_count, 722 r_xprt->rx_stats.failed_marshal_count,
686 r_xprt->rx_stats.bad_reply_count, 723 r_xprt->rx_stats.bad_reply_count,
687 r_xprt->rx_stats.nomsg_call_count); 724 r_xprt->rx_stats.nomsg_call_count);
688 seq_printf(seq, "%lu %lu %lu\n", 725 seq_printf(seq, "%lu %lu %lu %lu\n",
689 r_xprt->rx_stats.mrs_recovered, 726 r_xprt->rx_stats.mrs_recovered,
690 r_xprt->rx_stats.mrs_orphaned, 727 r_xprt->rx_stats.mrs_orphaned,
691 r_xprt->rx_stats.mrs_allocated); 728 r_xprt->rx_stats.mrs_allocated,
729 r_xprt->rx_stats.local_inv_needed);
692} 730}
693 731
694static int 732static int
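The rewritten xprt_rdma_allocate() above decides whether to reuse or replace each regbuf by checking rdmab_length(), one of a few small accessors over the ib_sge embedded in the regbuf. They are declared outside this diff; a sketch of the ones the new code relies on, assuming the rg_iov layout shown in the xprt_rdma.h hunks below:

    /* Sketch: accessors over rpcrdma_regbuf::rg_iov (a struct ib_sge). */
    static inline u64 rdmab_addr(struct rpcrdma_regbuf *rb)
    {
            return rb->rg_iov.addr;
    }

    static inline u32 rdmab_length(struct rpcrdma_regbuf *rb)
    {
            return rb->rg_iov.length;
    }

    static inline u32 rdmab_lkey(struct rpcrdma_regbuf *rb)
    {
            return rb->rg_iov.lkey;
    }

Note that the receive buffer is allocated with DMA_NONE: it is never mapped through the regbuf path, and is registered separately only if rpcrdma_marshal_req() later decides a Reply chunk is required.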
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 799cce6cbe45..ec74289af7ec 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -129,15 +129,6 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
129 wc->status, wc->vendor_err); 129 wc->status, wc->vendor_err);
130} 130}
131 131
132static void
133rpcrdma_receive_worker(struct work_struct *work)
134{
135 struct rpcrdma_rep *rep =
136 container_of(work, struct rpcrdma_rep, rr_work);
137
138 rpcrdma_reply_handler(rep);
139}
140
141/* Perform basic sanity checking to avoid using garbage 132/* Perform basic sanity checking to avoid using garbage
142 * to update the credit grant value. 133 * to update the credit grant value.
143 */ 134 */
@@ -161,13 +152,13 @@ rpcrdma_update_granted_credits(struct rpcrdma_rep *rep)
161} 152}
162 153
163/** 154/**
164 * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC 155 * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
165 * @cq: completion queue (ignored) 156 * @cq: completion queue (ignored)
166 * @wc: completed WR 157 * @wc: completed WR
167 * 158 *
168 */ 159 */
169static void 160static void
170rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc) 161rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
171{ 162{
172 struct ib_cqe *cqe = wc->wr_cqe; 163 struct ib_cqe *cqe = wc->wr_cqe;
173 struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, 164 struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
@@ -185,6 +176,9 @@ rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
185 __func__, rep, wc->byte_len); 176 __func__, rep, wc->byte_len);
186 177
187 rep->rr_len = wc->byte_len; 178 rep->rr_len = wc->byte_len;
179 rep->rr_wc_flags = wc->wc_flags;
180 rep->rr_inv_rkey = wc->ex.invalidate_rkey;
181
188 ib_dma_sync_single_for_cpu(rep->rr_device, 182 ib_dma_sync_single_for_cpu(rep->rr_device,
189 rdmab_addr(rep->rr_rdmabuf), 183 rdmab_addr(rep->rr_rdmabuf),
190 rep->rr_len, DMA_FROM_DEVICE); 184 rep->rr_len, DMA_FROM_DEVICE);
@@ -204,6 +198,36 @@ out_fail:
204 goto out_schedule; 198 goto out_schedule;
205} 199}
206 200
201static void
202rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
203 struct rdma_conn_param *param)
204{
205 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
206 const struct rpcrdma_connect_private *pmsg = param->private_data;
207 unsigned int rsize, wsize;
208
209 /* Default settings for RPC-over-RDMA Version One */
210 r_xprt->rx_ia.ri_reminv_expected = false;
211 rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
212 wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
213
214 if (pmsg &&
215 pmsg->cp_magic == rpcrdma_cmp_magic &&
216 pmsg->cp_version == RPCRDMA_CMP_VERSION) {
217 r_xprt->rx_ia.ri_reminv_expected = true;
218 rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
219 wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
220 }
221
222 if (rsize < cdata->inline_rsize)
223 cdata->inline_rsize = rsize;
224 if (wsize < cdata->inline_wsize)
225 cdata->inline_wsize = wsize;
226 pr_info("rpcrdma: max send %u, max recv %u\n",
227 cdata->inline_wsize, cdata->inline_rsize);
228 rpcrdma_set_max_header_sizes(r_xprt);
229}
230
207static int 231static int
208rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) 232rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
209{ 233{
@@ -244,6 +268,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
244 " (%d initiator)\n", 268 " (%d initiator)\n",
245 __func__, attr->max_dest_rd_atomic, 269 __func__, attr->max_dest_rd_atomic,
246 attr->max_rd_atomic); 270 attr->max_rd_atomic);
271 rpcrdma_update_connect_private(xprt, &event->param.conn);
247 goto connected; 272 goto connected;
248 case RDMA_CM_EVENT_CONNECT_ERROR: 273 case RDMA_CM_EVENT_CONNECT_ERROR:
249 connstate = -ENOTCONN; 274 connstate = -ENOTCONN;
@@ -387,7 +412,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
387 } 412 }
388 ia->ri_device = ia->ri_id->device; 413 ia->ri_device = ia->ri_id->device;
389 414
390 ia->ri_pd = ib_alloc_pd(ia->ri_device); 415 ia->ri_pd = ib_alloc_pd(ia->ri_device, 0);
391 if (IS_ERR(ia->ri_pd)) { 416 if (IS_ERR(ia->ri_pd)) {
392 rc = PTR_ERR(ia->ri_pd); 417 rc = PTR_ERR(ia->ri_pd);
393 pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); 418 pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
@@ -454,11 +479,12 @@ int
454rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, 479rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
455 struct rpcrdma_create_data_internal *cdata) 480 struct rpcrdma_create_data_internal *cdata)
456{ 481{
482 struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
457 struct ib_cq *sendcq, *recvcq; 483 struct ib_cq *sendcq, *recvcq;
458 unsigned int max_qp_wr; 484 unsigned int max_qp_wr;
459 int rc; 485 int rc;
460 486
461 if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) { 487 if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
462 dprintk("RPC: %s: insufficient sge's available\n", 488 dprintk("RPC: %s: insufficient sge's available\n",
463 __func__); 489 __func__);
464 return -ENOMEM; 490 return -ENOMEM;
@@ -487,7 +513,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
487 ep->rep_attr.cap.max_recv_wr = cdata->max_requests; 513 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
488 ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; 514 ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
489 ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */ 515 ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
490 ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; 516 ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
491 ep->rep_attr.cap.max_recv_sge = 1; 517 ep->rep_attr.cap.max_recv_sge = 1;
492 ep->rep_attr.cap.max_inline_data = 0; 518 ep->rep_attr.cap.max_inline_data = 0;
493 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 519 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -536,9 +562,14 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
536 /* Initialize cma parameters */ 562 /* Initialize cma parameters */
537 memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); 563 memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
538 564
539 /* RPC/RDMA does not use private data */ 565 /* Prepare RDMA-CM private message */
540 ep->rep_remote_cma.private_data = NULL; 566 pmsg->cp_magic = rpcrdma_cmp_magic;
541 ep->rep_remote_cma.private_data_len = 0; 567 pmsg->cp_version = RPCRDMA_CMP_VERSION;
568 pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok;
569 pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
570 pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
571 ep->rep_remote_cma.private_data = pmsg;
572 ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
542 573
543 /* Client offers RDMA Read but does not initiate */ 574 /* Client offers RDMA Read but does not initiate */
544 ep->rep_remote_cma.initiator_depth = 0; 575 ep->rep_remote_cma.initiator_depth = 0;
@@ -849,6 +880,10 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
849 req->rl_cqe.done = rpcrdma_wc_send; 880 req->rl_cqe.done = rpcrdma_wc_send;
850 req->rl_buffer = &r_xprt->rx_buf; 881 req->rl_buffer = &r_xprt->rx_buf;
851 INIT_LIST_HEAD(&req->rl_registered); 882 INIT_LIST_HEAD(&req->rl_registered);
883 req->rl_send_wr.next = NULL;
884 req->rl_send_wr.wr_cqe = &req->rl_cqe;
885 req->rl_send_wr.sg_list = req->rl_send_sge;
886 req->rl_send_wr.opcode = IB_WR_SEND;
852 return req; 887 return req;
853} 888}
854 889
@@ -865,17 +900,21 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
865 if (rep == NULL) 900 if (rep == NULL)
866 goto out; 901 goto out;
867 902
868 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize, 903 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
869 GFP_KERNEL); 904 DMA_FROM_DEVICE, GFP_KERNEL);
870 if (IS_ERR(rep->rr_rdmabuf)) { 905 if (IS_ERR(rep->rr_rdmabuf)) {
871 rc = PTR_ERR(rep->rr_rdmabuf); 906 rc = PTR_ERR(rep->rr_rdmabuf);
872 goto out_free; 907 goto out_free;
873 } 908 }
874 909
875 rep->rr_device = ia->ri_device; 910 rep->rr_device = ia->ri_device;
876 rep->rr_cqe.done = rpcrdma_receive_wc; 911 rep->rr_cqe.done = rpcrdma_wc_receive;
877 rep->rr_rxprt = r_xprt; 912 rep->rr_rxprt = r_xprt;
878 INIT_WORK(&rep->rr_work, rpcrdma_receive_worker); 913 INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
914 rep->rr_recv_wr.next = NULL;
915 rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
916 rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
917 rep->rr_recv_wr.num_sge = 1;
879 return rep; 918 return rep;
880 919
881out_free: 920out_free:
@@ -966,17 +1005,18 @@ rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
966} 1005}
967 1006
968static void 1007static void
969rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep) 1008rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
970{ 1009{
971 rpcrdma_free_regbuf(ia, rep->rr_rdmabuf); 1010 rpcrdma_free_regbuf(rep->rr_rdmabuf);
972 kfree(rep); 1011 kfree(rep);
973} 1012}
974 1013
975void 1014void
976rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) 1015rpcrdma_destroy_req(struct rpcrdma_req *req)
977{ 1016{
978 rpcrdma_free_regbuf(ia, req->rl_sendbuf); 1017 rpcrdma_free_regbuf(req->rl_recvbuf);
979 rpcrdma_free_regbuf(ia, req->rl_rdmabuf); 1018 rpcrdma_free_regbuf(req->rl_sendbuf);
1019 rpcrdma_free_regbuf(req->rl_rdmabuf);
980 kfree(req); 1020 kfree(req);
981} 1021}
982 1022
@@ -1009,15 +1049,13 @@ rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
1009void 1049void
1010rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1050rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1011{ 1051{
1012 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1013
1014 cancel_delayed_work_sync(&buf->rb_recovery_worker); 1052 cancel_delayed_work_sync(&buf->rb_recovery_worker);
1015 1053
1016 while (!list_empty(&buf->rb_recv_bufs)) { 1054 while (!list_empty(&buf->rb_recv_bufs)) {
1017 struct rpcrdma_rep *rep; 1055 struct rpcrdma_rep *rep;
1018 1056
1019 rep = rpcrdma_buffer_get_rep_locked(buf); 1057 rep = rpcrdma_buffer_get_rep_locked(buf);
1020 rpcrdma_destroy_rep(ia, rep); 1058 rpcrdma_destroy_rep(rep);
1021 } 1059 }
1022 buf->rb_send_count = 0; 1060 buf->rb_send_count = 0;
1023 1061
@@ -1030,7 +1068,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1030 list_del(&req->rl_all); 1068 list_del(&req->rl_all);
1031 1069
1032 spin_unlock(&buf->rb_reqslock); 1070 spin_unlock(&buf->rb_reqslock);
1033 rpcrdma_destroy_req(ia, req); 1071 rpcrdma_destroy_req(req);
1034 spin_lock(&buf->rb_reqslock); 1072 spin_lock(&buf->rb_reqslock);
1035 } 1073 }
1036 spin_unlock(&buf->rb_reqslock); 1074 spin_unlock(&buf->rb_reqslock);
@@ -1129,7 +1167,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
1129 struct rpcrdma_buffer *buffers = req->rl_buffer; 1167 struct rpcrdma_buffer *buffers = req->rl_buffer;
1130 struct rpcrdma_rep *rep = req->rl_reply; 1168 struct rpcrdma_rep *rep = req->rl_reply;
1131 1169
1132 req->rl_niovs = 0; 1170 req->rl_send_wr.num_sge = 0;
1133 req->rl_reply = NULL; 1171 req->rl_reply = NULL;
1134 1172
1135 spin_lock(&buffers->rb_lock); 1173 spin_lock(&buffers->rb_lock);
@@ -1171,70 +1209,81 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1171 spin_unlock(&buffers->rb_lock); 1209 spin_unlock(&buffers->rb_lock);
1172} 1210}
1173 1211
1174/*
1175 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1176 */
1177
1178/** 1212/**
1179 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers 1213 * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers
1180 * @ia: controlling rpcrdma_ia
1181 * @size: size of buffer to be allocated, in bytes 1214 * @size: size of buffer to be allocated, in bytes
1215 * @direction: direction of data movement
1182 * @flags: GFP flags 1216 * @flags: GFP flags
1183 * 1217 *
1184 * Returns pointer to private header of an area of internally 1218 * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
1185 * registered memory, or an ERR_PTR. The registered buffer follows 1219 * can be persistently DMA-mapped for I/O.
1186 * the end of the private header.
1187 * 1220 *
1188 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for 1221 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
1189 * receiving the payload of RDMA RECV operations. regbufs are not 1222 * receiving the payload of RDMA RECV operations. During Long Calls
1190 * used for RDMA READ/WRITE operations, thus are registered only for 1223 * or Replies they may be registered externally via ro_map.
1191 * LOCAL access.
1192 */ 1224 */
1193struct rpcrdma_regbuf * 1225struct rpcrdma_regbuf *
1194rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) 1226rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
1227 gfp_t flags)
1195{ 1228{
1196 struct rpcrdma_regbuf *rb; 1229 struct rpcrdma_regbuf *rb;
1197 struct ib_sge *iov;
1198 1230
1199 rb = kmalloc(sizeof(*rb) + size, flags); 1231 rb = kmalloc(sizeof(*rb) + size, flags);
1200 if (rb == NULL) 1232 if (rb == NULL)
1201 goto out; 1233 return ERR_PTR(-ENOMEM);
1202 1234
1203 iov = &rb->rg_iov; 1235 rb->rg_device = NULL;
1204 iov->addr = ib_dma_map_single(ia->ri_device, 1236 rb->rg_direction = direction;
1205 (void *)rb->rg_base, size, 1237 rb->rg_iov.length = size;
1206 DMA_BIDIRECTIONAL);
1207 if (ib_dma_mapping_error(ia->ri_device, iov->addr))
1208 goto out_free;
1209 1238
1210 iov->length = size;
1211 iov->lkey = ia->ri_pd->local_dma_lkey;
1212 rb->rg_size = size;
1213 rb->rg_owner = NULL;
1214 return rb; 1239 return rb;
1240}
1215 1241
1216out_free: 1242/**
1217 kfree(rb); 1243 * __rpcrdma_map_regbuf - DMA-map a regbuf
1218out: 1244 * @ia: controlling rpcrdma_ia
1219 return ERR_PTR(-ENOMEM); 1245 * @rb: regbuf to be mapped
1246 */
1247bool
1248__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1249{
1250 if (rb->rg_direction == DMA_NONE)
1251 return false;
1252
1253 rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
1254 (void *)rb->rg_base,
1255 rdmab_length(rb),
1256 rb->rg_direction);
1257 if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb)))
1258 return false;
1259
1260 rb->rg_device = ia->ri_device;
1261 rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
1262 return true;
1263}
1264
1265static void
1266rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
1267{
1268 if (!rpcrdma_regbuf_is_mapped(rb))
1269 return;
1270
1271 ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
1272 rdmab_length(rb), rb->rg_direction);
1273 rb->rg_device = NULL;
1220} 1274}
1221 1275
1222/** 1276/**
1223 * rpcrdma_free_regbuf - deregister and free registered buffer 1277 * rpcrdma_free_regbuf - deregister and free registered buffer
1224 * @ia: controlling rpcrdma_ia
1225 * @rb: regbuf to be deregistered and freed 1278 * @rb: regbuf to be deregistered and freed
1226 */ 1279 */
1227void 1280void
1228rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) 1281rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
1229{ 1282{
1230 struct ib_sge *iov;
1231
1232 if (!rb) 1283 if (!rb)
1233 return; 1284 return;
1234 1285
1235 iov = &rb->rg_iov; 1286 rpcrdma_dma_unmap_regbuf(rb);
1236 ib_dma_unmap_single(ia->ri_device,
1237 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1238 kfree(rb); 1287 kfree(rb);
1239} 1288}
1240 1289
@@ -1248,39 +1297,28 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
1248 struct rpcrdma_ep *ep, 1297 struct rpcrdma_ep *ep,
1249 struct rpcrdma_req *req) 1298 struct rpcrdma_req *req)
1250{ 1299{
1251 struct ib_device *device = ia->ri_device; 1300 struct ib_send_wr *send_wr = &req->rl_send_wr;
1252 struct ib_send_wr send_wr, *send_wr_fail; 1301 struct ib_send_wr *send_wr_fail;
1253 struct rpcrdma_rep *rep = req->rl_reply; 1302 int rc;
1254 struct ib_sge *iov = req->rl_send_iov;
1255 int i, rc;
1256 1303
1257 if (rep) { 1304 if (req->rl_reply) {
1258 rc = rpcrdma_ep_post_recv(ia, ep, rep); 1305 rc = rpcrdma_ep_post_recv(ia, req->rl_reply);
1259 if (rc) 1306 if (rc)
1260 return rc; 1307 return rc;
1261 req->rl_reply = NULL; 1308 req->rl_reply = NULL;
1262 } 1309 }
1263 1310
1264 send_wr.next = NULL;
1265 send_wr.wr_cqe = &req->rl_cqe;
1266 send_wr.sg_list = iov;
1267 send_wr.num_sge = req->rl_niovs;
1268 send_wr.opcode = IB_WR_SEND;
1269
1270 for (i = 0; i < send_wr.num_sge; i++)
1271 ib_dma_sync_single_for_device(device, iov[i].addr,
1272 iov[i].length, DMA_TO_DEVICE);
1273 dprintk("RPC: %s: posting %d s/g entries\n", 1311 dprintk("RPC: %s: posting %d s/g entries\n",
1274 __func__, send_wr.num_sge); 1312 __func__, send_wr->num_sge);
1275 1313
1276 if (DECR_CQCOUNT(ep) > 0) 1314 if (DECR_CQCOUNT(ep) > 0)
1277 send_wr.send_flags = 0; 1315 send_wr->send_flags = 0;
1278 else { /* Provider must take a send completion every now and then */ 1316 else { /* Provider must take a send completion every now and then */
1279 INIT_CQCOUNT(ep); 1317 INIT_CQCOUNT(ep);
1280 send_wr.send_flags = IB_SEND_SIGNALED; 1318 send_wr->send_flags = IB_SEND_SIGNALED;
1281 } 1319 }
1282 1320
1283 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); 1321 rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
1284 if (rc) 1322 if (rc)
1285 goto out_postsend_err; 1323 goto out_postsend_err;
1286 return 0; 1324 return 0;
@@ -1290,32 +1328,24 @@ out_postsend_err:
1290 return -ENOTCONN; 1328 return -ENOTCONN;
1291} 1329}
1292 1330
1293/*
1294 * (Re)post a receive buffer.
1295 */
1296int 1331int
1297rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, 1332rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1298 struct rpcrdma_ep *ep,
1299 struct rpcrdma_rep *rep) 1333 struct rpcrdma_rep *rep)
1300{ 1334{
1301 struct ib_recv_wr recv_wr, *recv_wr_fail; 1335 struct ib_recv_wr *recv_wr_fail;
1302 int rc; 1336 int rc;
1303 1337
1304 recv_wr.next = NULL; 1338 if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf))
1305 recv_wr.wr_cqe = &rep->rr_cqe; 1339 goto out_map;
1306 recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; 1340 rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail);
1307 recv_wr.num_sge = 1;
1308
1309 ib_dma_sync_single_for_cpu(ia->ri_device,
1310 rdmab_addr(rep->rr_rdmabuf),
1311 rdmab_length(rep->rr_rdmabuf),
1312 DMA_BIDIRECTIONAL);
1313
1314 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1315 if (rc) 1341 if (rc)
1316 goto out_postrecv; 1342 goto out_postrecv;
1317 return 0; 1343 return 0;
1318 1344
1345out_map:
1346 pr_err("rpcrdma: failed to DMA map the Receive buffer\n");
1347 return -EIO;
1348
1319out_postrecv: 1349out_postrecv:
1320 pr_err("rpcrdma: ib_post_recv returned %i\n", rc); 1350 pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
1321 return -ENOTCONN; 1351 return -ENOTCONN;
@@ -1333,7 +1363,6 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
1333{ 1363{
1334 struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; 1364 struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
1335 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 1365 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1336 struct rpcrdma_ep *ep = &r_xprt->rx_ep;
1337 struct rpcrdma_rep *rep; 1366 struct rpcrdma_rep *rep;
1338 int rc; 1367 int rc;
1339 1368
@@ -1344,7 +1373,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
1344 rep = rpcrdma_buffer_get_rep_locked(buffers); 1373 rep = rpcrdma_buffer_get_rep_locked(buffers);
1345 spin_unlock(&buffers->rb_lock); 1374 spin_unlock(&buffers->rb_lock);
1346 1375
1347 rc = rpcrdma_ep_post_recv(ia, ep, rep); 1376 rc = rpcrdma_ep_post_recv(ia, rep);
1348 if (rc) 1377 if (rc)
1349 goto out_rc; 1378 goto out_rc;
1350 } 1379 }
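Taken together, the verbs.c hunks split regbuf setup into a plain allocation plus a lazy DMA mapping that happens the first time the buffer is actually posted. A hedged usage sketch of the new flow, using only the functions introduced by this patch ('ia' is the connection's rpcrdma_ia; error handling is abbreviated):

    /* Sketch: allocate a 1 KiB receive buffer, map it lazily, free it. */
    static int example_regbuf_cycle(struct rpcrdma_ia *ia)
    {
            struct rpcrdma_regbuf *rb;

            rb = rpcrdma_alloc_regbuf(1024, DMA_FROM_DEVICE, GFP_KERNEL);
            if (IS_ERR(rb))
                    return PTR_ERR(rb);

            /* First call maps the buffer; later calls are no-ops. */
            if (!rpcrdma_dma_map_regbuf(ia, rb)) {
                    rpcrdma_free_regbuf(rb);
                    return -EIO;
            }

            /* ... post rb->rg_iov as the sg_list of a Receive WR ... */

            rpcrdma_free_regbuf(rb);        /* unmaps (if mapped) and frees */
            return 0;
    }

The DMA_NONE direction acts as a guard: a buffer that must never be mapped through this path (the new rl_recvbuf, for instance) simply fails rpcrdma_dma_map_regbuf() rather than being mapped bidirectionally as the old code did.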
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index a71b0f5897d8..0d35b761c883 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -70,9 +70,11 @@ struct rpcrdma_ia {
70 struct ib_pd *ri_pd; 70 struct ib_pd *ri_pd;
71 struct completion ri_done; 71 struct completion ri_done;
72 int ri_async_rc; 72 int ri_async_rc;
73 unsigned int ri_max_segs;
73 unsigned int ri_max_frmr_depth; 74 unsigned int ri_max_frmr_depth;
74 unsigned int ri_max_inline_write; 75 unsigned int ri_max_inline_write;
75 unsigned int ri_max_inline_read; 76 unsigned int ri_max_inline_read;
77 bool ri_reminv_expected;
76 struct ib_qp_attr ri_qp_attr; 78 struct ib_qp_attr ri_qp_attr;
77 struct ib_qp_init_attr ri_qp_init_attr; 79 struct ib_qp_init_attr ri_qp_init_attr;
78}; 80};
@@ -87,6 +89,7 @@ struct rpcrdma_ep {
87 int rep_connected; 89 int rep_connected;
88 struct ib_qp_init_attr rep_attr; 90 struct ib_qp_init_attr rep_attr;
89 wait_queue_head_t rep_connect_wait; 91 wait_queue_head_t rep_connect_wait;
92 struct rpcrdma_connect_private rep_cm_private;
90 struct rdma_conn_param rep_remote_cma; 93 struct rdma_conn_param rep_remote_cma;
91 struct sockaddr_storage rep_remote_addr; 94 struct sockaddr_storage rep_remote_addr;
92 struct delayed_work rep_connect_worker; 95 struct delayed_work rep_connect_worker;
@@ -112,9 +115,9 @@ struct rpcrdma_ep {
112 */ 115 */
113 116
114struct rpcrdma_regbuf { 117struct rpcrdma_regbuf {
115 size_t rg_size;
116 struct rpcrdma_req *rg_owner;
117 struct ib_sge rg_iov; 118 struct ib_sge rg_iov;
119 struct ib_device *rg_device;
120 enum dma_data_direction rg_direction;
118 __be32 rg_base[0] __attribute__ ((aligned(256))); 121 __be32 rg_base[0] __attribute__ ((aligned(256)));
119}; 122};
120 123
@@ -162,7 +165,10 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
162 * The smallest inline threshold is 1024 bytes, ensuring that 165 * The smallest inline threshold is 1024 bytes, ensuring that
163 * at least 750 bytes are available for RPC messages. 166 * at least 750 bytes are available for RPC messages.
164 */ 167 */
165#define RPCRDMA_MAX_HDR_SEGS (8) 168enum {
169 RPCRDMA_MAX_HDR_SEGS = 8,
170 RPCRDMA_HDRBUF_SIZE = 256,
171};
166 172
167/* 173/*
168 * struct rpcrdma_rep -- this structure encapsulates state required to recv 174 * struct rpcrdma_rep -- this structure encapsulates state required to recv
@@ -182,10 +188,13 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
182struct rpcrdma_rep { 188struct rpcrdma_rep {
183 struct ib_cqe rr_cqe; 189 struct ib_cqe rr_cqe;
184 unsigned int rr_len; 190 unsigned int rr_len;
191 int rr_wc_flags;
192 u32 rr_inv_rkey;
185 struct ib_device *rr_device; 193 struct ib_device *rr_device;
186 struct rpcrdma_xprt *rr_rxprt; 194 struct rpcrdma_xprt *rr_rxprt;
187 struct work_struct rr_work; 195 struct work_struct rr_work;
188 struct list_head rr_list; 196 struct list_head rr_list;
197 struct ib_recv_wr rr_recv_wr;
189 struct rpcrdma_regbuf *rr_rdmabuf; 198 struct rpcrdma_regbuf *rr_rdmabuf;
190}; 199};
191 200
@@ -276,19 +285,30 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
276 char *mr_offset; /* kva if no page, else offset */ 285 char *mr_offset; /* kva if no page, else offset */
277}; 286};
278 287
279#define RPCRDMA_MAX_IOVS (2) 288/* Reserve enough Send SGEs to send a maximum size inline request:
289 * - RPC-over-RDMA header
290 * - xdr_buf head iovec
291 * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages
292 * - xdr_buf tail iovec
293 */
294enum {
295 RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1,
296 RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1,
297 RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
298};
280 299
281struct rpcrdma_buffer; 300struct rpcrdma_buffer;
282struct rpcrdma_req { 301struct rpcrdma_req {
283 struct list_head rl_free; 302 struct list_head rl_free;
284 unsigned int rl_niovs; 303 unsigned int rl_mapped_sges;
285 unsigned int rl_connect_cookie; 304 unsigned int rl_connect_cookie;
286 struct rpc_task *rl_task;
287 struct rpcrdma_buffer *rl_buffer; 305 struct rpcrdma_buffer *rl_buffer;
288 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ 306 struct rpcrdma_rep *rl_reply;
289 struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; 307 struct ib_send_wr rl_send_wr;
290 struct rpcrdma_regbuf *rl_rdmabuf; 308 struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES];
291 struct rpcrdma_regbuf *rl_sendbuf; 309 struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */
310 struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */
311 struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */
292 312
293 struct ib_cqe rl_cqe; 313 struct ib_cqe rl_cqe;
294 struct list_head rl_all; 314 struct list_head rl_all;
@@ -298,14 +318,16 @@ struct rpcrdma_req {
298 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; 318 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
299}; 319};
300 320
321static inline void
322rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req)
323{
324 rqst->rq_xprtdata = req;
325}
326
301static inline struct rpcrdma_req * 327static inline struct rpcrdma_req *
302rpcr_to_rdmar(struct rpc_rqst *rqst) 328rpcr_to_rdmar(struct rpc_rqst *rqst)
303{ 329{
304 void *buffer = rqst->rq_buffer; 330 return rqst->rq_xprtdata;
305 struct rpcrdma_regbuf *rb;
306
307 rb = container_of(buffer, struct rpcrdma_regbuf, rg_base);
308 return rb->rg_owner;
309} 331}
310 332
311/* 333/*
@@ -356,15 +378,6 @@ struct rpcrdma_create_data_internal {
356 unsigned int padding; /* non-rdma write header padding */ 378 unsigned int padding; /* non-rdma write header padding */
357}; 379};
358 380
359#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
360 (rpcx_to_rdmad(rq->rq_xprt).inline_rsize)
361
362#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
363 (rpcx_to_rdmad(rq->rq_xprt).inline_wsize)
364
365#define RPCRDMA_INLINE_PAD_VALUE(rq)\
366 rpcx_to_rdmad(rq->rq_xprt).padding
367
368/* 381/*
369 * Statistics for RPCRDMA 382 * Statistics for RPCRDMA
370 */ 383 */
@@ -386,6 +399,7 @@ struct rpcrdma_stats {
386 unsigned long mrs_recovered; 399 unsigned long mrs_recovered;
387 unsigned long mrs_orphaned; 400 unsigned long mrs_orphaned;
388 unsigned long mrs_allocated; 401 unsigned long mrs_allocated;
402 unsigned long local_inv_needed;
389}; 403};
390 404
391/* 405/*
@@ -409,6 +423,7 @@ struct rpcrdma_memreg_ops {
409 struct rpcrdma_mw *); 423 struct rpcrdma_mw *);
410 void (*ro_release_mr)(struct rpcrdma_mw *); 424 void (*ro_release_mr)(struct rpcrdma_mw *);
411 const char *ro_displayname; 425 const char *ro_displayname;
426 const int ro_send_w_inv_ok;
412}; 427};
413 428
414extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; 429extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
@@ -461,15 +476,14 @@ void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
461 476
462int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, 477int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
463 struct rpcrdma_req *); 478 struct rpcrdma_req *);
464int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *, 479int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *);
465 struct rpcrdma_rep *);
466 480
467/* 481/*
468 * Buffer calls - xprtrdma/verbs.c 482 * Buffer calls - xprtrdma/verbs.c
469 */ 483 */
470struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); 484struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
471struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); 485struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
472void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *); 486void rpcrdma_destroy_req(struct rpcrdma_req *);
473int rpcrdma_buffer_create(struct rpcrdma_xprt *); 487int rpcrdma_buffer_create(struct rpcrdma_xprt *);
474void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); 488void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
475 489
@@ -482,10 +496,24 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
482 496
483void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); 497void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
484 498
485struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, 499struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction,
486 size_t, gfp_t); 500 gfp_t);
487void rpcrdma_free_regbuf(struct rpcrdma_ia *, 501bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *);
488 struct rpcrdma_regbuf *); 502void rpcrdma_free_regbuf(struct rpcrdma_regbuf *);
503
504static inline bool
505rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb)
506{
507 return rb->rg_device != NULL;
508}
509
510static inline bool
511rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
512{
513 if (likely(rpcrdma_regbuf_is_mapped(rb)))
514 return true;
515 return __rpcrdma_dma_map_regbuf(ia, rb);
516}
489 517
490int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); 518int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
491 519
@@ -507,15 +535,25 @@ rpcrdma_data_dir(bool writing)
507 */ 535 */
508void rpcrdma_connect_worker(struct work_struct *); 536void rpcrdma_connect_worker(struct work_struct *);
509void rpcrdma_conn_func(struct rpcrdma_ep *); 537void rpcrdma_conn_func(struct rpcrdma_ep *);
510void rpcrdma_reply_handler(struct rpcrdma_rep *); 538void rpcrdma_reply_handler(struct work_struct *);
511 539
512/* 540/*
513 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c 541 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
514 */ 542 */
543
544enum rpcrdma_chunktype {
545 rpcrdma_noch = 0,
546 rpcrdma_readch,
547 rpcrdma_areadch,
548 rpcrdma_writech,
549 rpcrdma_replych
550};
551
552bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
553 u32, struct xdr_buf *, enum rpcrdma_chunktype);
554void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
515int rpcrdma_marshal_req(struct rpc_rqst *); 555int rpcrdma_marshal_req(struct rpc_rqst *);
516void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *, 556void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
517 struct rpcrdma_create_data_internal *,
518 unsigned int);
519 557
520/* RPC/RDMA module init - xprtrdma/transport.c 558/* RPC/RDMA module init - xprtrdma/transport.c
521 */ 559 */
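The xprt_rdma.h hunk above replaces the fixed two-element rl_send_iov with an SGE array sized by RPCRDMA_MAX_SEND_SGES, computed from the page and inline limits. A worked example of that arithmetic, assuming 4 KiB pages (PAGE_SHIFT == 12) and a hypothetical RPCRDMA_MAX_INLINE of 4096 bytes:

    /*
     *   RPCRDMA_MAX_SEND_PAGES = 4096 + 4096 - 1   = 8191
     *   RPCRDMA_MAX_PAGE_SGES  = (8191 >> 12) + 1  = 2
     *   RPCRDMA_MAX_SEND_SGES  = 1 + 1 + 2 + 1     = 5
     *
     * That is: one SGE for the RPC-over-RDMA header, one for the
     * xdr_buf head, up to two for unaligned inline page data, and
     * one for the xdr_buf tail.
     */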
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bf168838a029..0137af1c0916 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -473,7 +473,16 @@ static int xs_nospace(struct rpc_task *task)
473 spin_unlock_bh(&xprt->transport_lock); 473 spin_unlock_bh(&xprt->transport_lock);
474 474
475 /* Race breaker in case memory is freed before above code is called */ 475 /* Race breaker in case memory is freed before above code is called */
476 sk->sk_write_space(sk); 476 if (ret == -EAGAIN) {
477 struct socket_wq *wq;
478
479 rcu_read_lock();
480 wq = rcu_dereference(sk->sk_wq);
481 set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
482 rcu_read_unlock();
483
484 sk->sk_write_space(sk);
485 }
477 return ret; 486 return ret;
478} 487}
479 488
@@ -2533,35 +2542,38 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
2533 * we allocate pages instead doing a kmalloc like rpc_malloc is because we want 2542 * we allocate pages instead doing a kmalloc like rpc_malloc is because we want
2534 * to use the server side send routines. 2543 * to use the server side send routines.
2535 */ 2544 */
2536static void *bc_malloc(struct rpc_task *task, size_t size) 2545static int bc_malloc(struct rpc_task *task)
2537{ 2546{
2547 struct rpc_rqst *rqst = task->tk_rqstp;
2548 size_t size = rqst->rq_callsize;
2538 struct page *page; 2549 struct page *page;
2539 struct rpc_buffer *buf; 2550 struct rpc_buffer *buf;
2540 2551
2541 WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer)); 2552 if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) {
2542 if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) 2553 WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n",
2543 return NULL; 2554 size);
2555 return -EINVAL;
2556 }
2544 2557
2545 page = alloc_page(GFP_KERNEL); 2558 page = alloc_page(GFP_KERNEL);
2546 if (!page) 2559 if (!page)
2547 return NULL; 2560 return -ENOMEM;
2548 2561
2549 buf = page_address(page); 2562 buf = page_address(page);
2550 buf->len = PAGE_SIZE; 2563 buf->len = PAGE_SIZE;
2551 2564
2552 return buf->data; 2565 rqst->rq_buffer = buf->data;
2566 return 0;
2553} 2567}
2554 2568
2555/* 2569/*
2556 * Free the space allocated in the bc_alloc routine 2570 * Free the space allocated in the bc_alloc routine
2557 */ 2571 */
2558static void bc_free(void *buffer) 2572static void bc_free(struct rpc_task *task)
2559{ 2573{
2574 void *buffer = task->tk_rqstp->rq_buffer;
2560 struct rpc_buffer *buf; 2575 struct rpc_buffer *buf;
2561 2576
2562 if (!buffer)
2563 return;
2564
2565 buf = container_of(buffer, struct rpc_buffer, data); 2577 buf = container_of(buffer, struct rpc_buffer, data);
2566 free_page((unsigned long)buf); 2578 free_page((unsigned long)buf);
2567} 2579}
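On the xprtsock.c side, bc_malloc() and bc_free() now take the rpc_task and report failure through a return code: an oversized backchannel request gets -EINVAL (with a one-time warning) instead of a bare NULL, and allocation failure maps to -ENOMEM. The rpc_buffer header they build on is defined elsewhere in the sunrpc code; a sketch consistent with the container_of() in bc_free(), with the exact field types being an assumption:

    /* Sketch: header bc_malloc() places at the start of the page. */
    struct rpc_buffer {
            size_t  len;        /* set to PAGE_SIZE by bc_malloc() */
            char    data[];     /* what rqst->rq_buffer points at */
    };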
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index a5fc9dd24aa9..02beb35f577f 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -21,7 +21,6 @@
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22#include <linux/if_vlan.h> 22#include <linux/if_vlan.h>
23#include <linux/rtnetlink.h> 23#include <linux/rtnetlink.h>
24#include <net/ip_fib.h>
25#include <net/switchdev.h> 24#include <net/switchdev.h>
26 25
27/** 26/**
@@ -344,8 +343,6 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj)
344 switch (obj->id) { 343 switch (obj->id) {
345 case SWITCHDEV_OBJ_ID_PORT_VLAN: 344 case SWITCHDEV_OBJ_ID_PORT_VLAN:
346 return sizeof(struct switchdev_obj_port_vlan); 345 return sizeof(struct switchdev_obj_port_vlan);
347 case SWITCHDEV_OBJ_ID_IPV4_FIB:
348 return sizeof(struct switchdev_obj_ipv4_fib);
349 case SWITCHDEV_OBJ_ID_PORT_FDB: 346 case SWITCHDEV_OBJ_ID_PORT_FDB:
350 return sizeof(struct switchdev_obj_port_fdb); 347 return sizeof(struct switchdev_obj_port_fdb);
351 case SWITCHDEV_OBJ_ID_PORT_MDB: 348 case SWITCHDEV_OBJ_ID_PORT_MDB:
@@ -1042,7 +1039,7 @@ static int switchdev_port_fdb_dump_cb(struct switchdev_obj *obj)
1042 struct nlmsghdr *nlh; 1039 struct nlmsghdr *nlh;
1043 struct ndmsg *ndm; 1040 struct ndmsg *ndm;
1044 1041
1045 if (dump->idx < dump->cb->args[0]) 1042 if (dump->idx < dump->cb->args[2])
1046 goto skip; 1043 goto skip;
1047 1044
1048 nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, 1045 nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
@@ -1089,7 +1086,7 @@ nla_put_failure:
1089 */ 1086 */
1090int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, 1087int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
1091 struct net_device *dev, 1088 struct net_device *dev,
1092 struct net_device *filter_dev, int idx) 1089 struct net_device *filter_dev, int *idx)
1093{ 1090{
1094 struct switchdev_fdb_dump dump = { 1091 struct switchdev_fdb_dump dump = {
1095 .fdb.obj.orig_dev = dev, 1092 .fdb.obj.orig_dev = dev,
@@ -1097,207 +1094,27 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
1097 .dev = dev, 1094 .dev = dev,
1098 .skb = skb, 1095 .skb = skb,
1099 .cb = cb, 1096 .cb = cb,
1100 .idx = idx, 1097 .idx = *idx,
1101 }; 1098 };
1102 int err; 1099 int err;
1103 1100
1104 err = switchdev_port_obj_dump(dev, &dump.fdb.obj, 1101 err = switchdev_port_obj_dump(dev, &dump.fdb.obj,
1105 switchdev_port_fdb_dump_cb); 1102 switchdev_port_fdb_dump_cb);
1106 cb->args[1] = err; 1103 *idx = dump.idx;
1107 return dump.idx; 1104 return err;
1108} 1105}
1109EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); 1106EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
1110 1107
1111static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
1112{
1113 const struct switchdev_ops *ops = dev->switchdev_ops;
1114 struct net_device *lower_dev;
1115 struct net_device *port_dev;
1116 struct list_head *iter;
1117
1118 /* Recusively search down until we find a sw port dev.
1119 * (A sw port dev supports switchdev_port_attr_get).
1120 */
1121
1122 if (ops && ops->switchdev_port_attr_get)
1123 return dev;
1124
1125 netdev_for_each_lower_dev(dev, lower_dev, iter) {
1126 port_dev = switchdev_get_lowest_dev(lower_dev);
1127 if (port_dev)
1128 return port_dev;
1129 }
1130
1131 return NULL;
1132}
1133
1134static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
1135{
1136 struct switchdev_attr attr = {
1137 .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
1138 };
1139 struct switchdev_attr prev_attr;
1140 struct net_device *dev = NULL;
1141 int nhsel;
1142
1143 ASSERT_RTNL();
1144
1145 /* For this route, all nexthop devs must be on the same switch. */
1146
1147 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1148 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1149
1150 if (!nh->nh_dev)
1151 return NULL;
1152
1153 dev = switchdev_get_lowest_dev(nh->nh_dev);
1154 if (!dev)
1155 return NULL;
1156
1157 attr.orig_dev = dev;
1158 if (switchdev_port_attr_get(dev, &attr))
1159 return NULL;
1160
1161 if (nhsel > 0 &&
1162 !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid))
1163 return NULL;
1164
1165 prev_attr = attr;
1166 }
1167
1168 return dev;
1169}
1170
1171/**
1172 * switchdev_fib_ipv4_add - Add/modify switch IPv4 route entry
1173 *
1174 * @dst: route's IPv4 destination address
1175 * @dst_len: destination address length (prefix length)
1176 * @fi: route FIB info structure
1177 * @tos: route TOS
1178 * @type: route type
1179 * @nlflags: netlink flags passed in (NLM_F_*)
1180 * @tb_id: route table ID
1181 *
1182 * Add/modify switch IPv4 route entry.
1183 */
1184int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
1185 u8 tos, u8 type, u32 nlflags, u32 tb_id)
1186{
1187 struct switchdev_obj_ipv4_fib ipv4_fib = {
1188 .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB,
1189 .dst = dst,
1190 .dst_len = dst_len,
1191 .fi = fi,
1192 .tos = tos,
1193 .type = type,
1194 .nlflags = nlflags,
1195 .tb_id = tb_id,
1196 };
1197 struct net_device *dev;
1198 int err = 0;
1199
1200 /* Don't offload route if using custom ip rules or if
1201 * IPv4 FIB offloading has been disabled completely.
1202 */
1203
1204#ifdef CONFIG_IP_MULTIPLE_TABLES
1205 if (fi->fib_net->ipv4.fib_has_custom_rules)
1206 return 0;
1207#endif
1208
1209 if (fi->fib_net->ipv4.fib_offload_disabled)
1210 return 0;
1211
1212 dev = switchdev_get_dev_by_nhs(fi);
1213 if (!dev)
1214 return 0;
1215
1216 ipv4_fib.obj.orig_dev = dev;
1217 err = switchdev_port_obj_add(dev, &ipv4_fib.obj);
1218 if (!err)
1219 fi->fib_flags |= RTNH_F_OFFLOAD;
1220
1221 return err == -EOPNOTSUPP ? 0 : err;
1222}
1223EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add);
1224
1225/**
1226 * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch
1227 *
1228 * @dst: route's IPv4 destination address
1229 * @dst_len: destination address length (prefix length)
1230 * @fi: route FIB info structure
1231 * @tos: route TOS
1232 * @type: route type
1233 * @tb_id: route table ID
1234 *
1235 * Delete IPv4 route entry from switch device.
1236 */
1237int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
1238 u8 tos, u8 type, u32 tb_id)
1239{
1240 struct switchdev_obj_ipv4_fib ipv4_fib = {
1241 .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB,
1242 .dst = dst,
1243 .dst_len = dst_len,
1244 .fi = fi,
1245 .tos = tos,
1246 .type = type,
1247 .nlflags = 0,
1248 .tb_id = tb_id,
1249 };
1250 struct net_device *dev;
1251 int err = 0;
1252
1253 if (!(fi->fib_flags & RTNH_F_OFFLOAD))
1254 return 0;
1255
1256 dev = switchdev_get_dev_by_nhs(fi);
1257 if (!dev)
1258 return 0;
1259
1260 ipv4_fib.obj.orig_dev = dev;
1261 err = switchdev_port_obj_del(dev, &ipv4_fib.obj);
1262 if (!err)
1263 fi->fib_flags &= ~RTNH_F_OFFLOAD;
1264
1265 return err == -EOPNOTSUPP ? 0 : err;
1266}
1267EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del);
1268
1269/**
1270 * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation
1271 *
1272 * @fi: route FIB info structure
1273 */
1274void switchdev_fib_ipv4_abort(struct fib_info *fi)
1275{
1276 /* There was a problem installing this route to the offload
1277 * device. For now, until we come up with more refined
1278 * policy handling, abruptly end IPv4 fib offloading for
 1279 * the entire net by flushing offload device(s) of all
1280 * IPv4 routes, and mark IPv4 fib offloading broken from
1281 * this point forward.
1282 */
1283
1284 fib_flush_external(fi->fib_net);
1285 fi->fib_net->ipv4.fib_offload_disabled = true;
1286}
1287EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
1288
1289bool switchdev_port_same_parent_id(struct net_device *a, 1108bool switchdev_port_same_parent_id(struct net_device *a,
1290 struct net_device *b) 1109 struct net_device *b)
1291{ 1110{
1292 struct switchdev_attr a_attr = { 1111 struct switchdev_attr a_attr = {
1293 .orig_dev = a, 1112 .orig_dev = a,
1294 .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, 1113 .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
1295 .flags = SWITCHDEV_F_NO_RECURSE,
1296 }; 1114 };
1297 struct switchdev_attr b_attr = { 1115 struct switchdev_attr b_attr = {
1298 .orig_dev = b, 1116 .orig_dev = b,
1299 .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, 1117 .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
1300 .flags = SWITCHDEV_F_NO_RECURSE,
1301 }; 1118 };
1302 1119
1303 if (switchdev_port_attr_get(a, &a_attr) || 1120 if (switchdev_port_attr_get(a, &a_attr) ||
@@ -1306,89 +1123,4 @@ bool switchdev_port_same_parent_id(struct net_device *a,
1306 1123
1307 return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); 1124 return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid);
1308} 1125}
1309
1310static u32 switchdev_port_fwd_mark_get(struct net_device *dev,
1311 struct net_device *group_dev)
1312{
1313 struct net_device *lower_dev;
1314 struct list_head *iter;
1315
1316 netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
1317 if (lower_dev == dev)
1318 continue;
1319 if (switchdev_port_same_parent_id(dev, lower_dev))
1320 return lower_dev->offload_fwd_mark;
1321 return switchdev_port_fwd_mark_get(dev, lower_dev);
1322 }
1323
1324 return dev->ifindex;
1325}
1326EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id); 1126EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id);
1327
1328static void switchdev_port_fwd_mark_reset(struct net_device *group_dev,
1329 u32 old_mark, u32 *reset_mark)
1330{
1331 struct net_device *lower_dev;
1332 struct list_head *iter;
1333
1334 netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
1335 if (lower_dev->offload_fwd_mark == old_mark) {
1336 if (!*reset_mark)
1337 *reset_mark = lower_dev->ifindex;
1338 lower_dev->offload_fwd_mark = *reset_mark;
1339 }
1340 switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark);
1341 }
1342}
1343
1344/**
1345 * switchdev_port_fwd_mark_set - Set port offload forwarding mark
1346 *
1347 * @dev: port device
1348 * @group_dev: containing device
1349 * @joining: true if dev is joining group; false if leaving group
1350 *
1351 * An ungrouped port's offload mark is just its ifindex. A grouped
1352 * port's (member of a bridge, for example) offload mark is the ifindex
1353 * of one of the ports in the group with the same parent (switch) ID.
1354 * Ports on the same device in the same group will have the same mark.
1355 *
1356 * Example:
1357 *
1358 * br0 ifindex=9
1359 * sw1p1 ifindex=2 mark=2
1360 * sw1p2 ifindex=3 mark=2
1361 * sw2p1 ifindex=4 mark=5
1362 * sw2p2 ifindex=5 mark=5
1363 *
1364 * If sw2p2 leaves the bridge, we'll have:
1365 *
1366 * br0 ifindex=9
1367 * sw1p1 ifindex=2 mark=2
1368 * sw1p2 ifindex=3 mark=2
1369 * sw2p1 ifindex=4 mark=4
1370 * sw2p2 ifindex=5 mark=5
1371 */
1372void switchdev_port_fwd_mark_set(struct net_device *dev,
1373 struct net_device *group_dev,
1374 bool joining)
1375{
1376 u32 mark = dev->ifindex;
1377 u32 reset_mark = 0;
1378
1379 if (group_dev) {
1380 ASSERT_RTNL();
1381 if (joining)
1382 mark = switchdev_port_fwd_mark_get(dev, group_dev);
1383 else if (dev->offload_fwd_mark == mark)
 1384 /* Uh oh, this port was the mark reference port,
1385 * but it's leaving the group, so reset the
1386 * mark for the remaining ports in the group.
1387 */
1388 switchdev_port_fwd_mark_reset(group_dev, mark,
1389 &reset_mark);
1390 }
1391
1392 dev->offload_fwd_mark = mark;
1393}
1394EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set);
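With this change switchdev_port_fdb_dump() reports the running dump index through *idx and returns an error code instead of the index itself. A minimal caller sketch, assuming a hypothetical example_fdb_dump() wrapper that is not part of this diff:

/* Hypothetical wrapper: the cumulative index lives in *idx and the
 * return value carries the error, so callers no longer need to stash
 * intermediate state in cb->args[] themselves.
 */
static int example_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
			    struct net_device *dev,
			    struct net_device *filter_dev, int *idx)
{
	int err;

	err = switchdev_port_fdb_dump(skb, cb, dev, filter_dev, idx);
	if (err == -EOPNOTSUPP)
		err = 0;	/* no switchdev FDB to report is not an error */
	return err;
}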
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index 46a71c701e7c..919981324171 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -27,9 +27,9 @@
27#endif 27#endif
28 28
29static struct ctl_table_set * 29static struct ctl_table_set *
30net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces) 30net_ctl_header_lookup(struct ctl_table_root *root)
31{ 31{
32 return &namespaces->net_ns->sysctls; 32 return &current->nsproxy->net_ns->sysctls;
33} 33}
34 34
35static int is_seen(struct ctl_table_set *set) 35static int is_seen(struct ctl_table_set *set)
@@ -42,26 +42,37 @@ static int net_ctl_permissions(struct ctl_table_header *head,
42 struct ctl_table *table) 42 struct ctl_table *table)
43{ 43{
44 struct net *net = container_of(head->set, struct net, sysctls); 44 struct net *net = container_of(head->set, struct net, sysctls);
45 kuid_t root_uid = make_kuid(net->user_ns, 0);
46 kgid_t root_gid = make_kgid(net->user_ns, 0);
47 45
48 /* Allow network administrator to have same access as root. */ 46 /* Allow network administrator to have same access as root. */
49 if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN) || 47 if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN)) {
50 uid_eq(root_uid, current_euid())) {
51 int mode = (table->mode >> 6) & 7; 48 int mode = (table->mode >> 6) & 7;
52 return (mode << 6) | (mode << 3) | mode; 49 return (mode << 6) | (mode << 3) | mode;
53 } 50 }
54 /* Allow netns root group to have the same access as the root group */ 51
55 if (in_egroup_p(root_gid)) {
56 int mode = (table->mode >> 3) & 7;
57 return (mode << 3) | mode;
58 }
59 return table->mode; 52 return table->mode;
60} 53}
61 54
55static void net_ctl_set_ownership(struct ctl_table_header *head,
56 struct ctl_table *table,
57 kuid_t *uid, kgid_t *gid)
58{
59 struct net *net = container_of(head->set, struct net, sysctls);
60 kuid_t ns_root_uid;
61 kgid_t ns_root_gid;
62
63 ns_root_uid = make_kuid(net->user_ns, 0);
64 if (uid_valid(ns_root_uid))
65 *uid = ns_root_uid;
66
67 ns_root_gid = make_kgid(net->user_ns, 0);
68 if (gid_valid(ns_root_gid))
69 *gid = ns_root_gid;
70}
71
62static struct ctl_table_root net_sysctl_root = { 72static struct ctl_table_root net_sysctl_root = {
63 .lookup = net_ctl_header_lookup, 73 .lookup = net_ctl_header_lookup,
64 .permissions = net_ctl_permissions, 74 .permissions = net_ctl_permissions,
75 .set_ownership = net_ctl_set_ownership,
65}; 76};
66 77
67static int __net_init sysctl_net_init(struct net *net) 78static int __net_init sysctl_net_init(struct net *net)
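The new .set_ownership callback makes sysctl files registered under net_sysctl_root owned by the uid/gid that map to root inside the owning network namespace, when such mappings exist. A rough sketch; the "net/example" path and example_value knob are illustrative assumptions only:

static int example_value;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,	/* owner-writable; owner becomes netns root */
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static int __net_init example_net_init(struct net *net)
{
	/* Registered via net_sysctl_root, so net_ctl_set_ownership() maps
	 * the file owner to uid/gid 0 of net->user_ns if those map.
	 */
	if (!register_net_sysctl(net, "net/example", example_table))
		return -ENOMEM;
	return 0;
}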
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index ae469b37d852..753f774cb46f 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -269,18 +269,19 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked)
269 * 269 *
270 * RCU is locked, no other locks set 270 * RCU is locked, no other locks set
271 */ 271 */
272void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, 272int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
273 struct tipc_msg *hdr) 273 struct tipc_msg *hdr)
274{ 274{
275 struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; 275 struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq;
276 struct sk_buff_head xmitq; 276 struct sk_buff_head xmitq;
277 int rc = 0;
277 278
278 __skb_queue_head_init(&xmitq); 279 __skb_queue_head_init(&xmitq);
279 280
280 tipc_bcast_lock(net); 281 tipc_bcast_lock(net);
281 if (msg_type(hdr) == STATE_MSG) { 282 if (msg_type(hdr) == STATE_MSG) {
282 tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq); 283 tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq);
283 tipc_link_bc_sync_rcv(l, hdr, &xmitq); 284 rc = tipc_link_bc_sync_rcv(l, hdr, &xmitq);
284 } else { 285 } else {
285 tipc_link_bc_init_rcv(l, hdr); 286 tipc_link_bc_init_rcv(l, hdr);
286 } 287 }
@@ -291,6 +292,7 @@ void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
291 /* Any socket wakeup messages ? */ 292 /* Any socket wakeup messages ? */
292 if (!skb_queue_empty(inputq)) 293 if (!skb_queue_empty(inputq))
293 tipc_sk_rcv(net, inputq); 294 tipc_sk_rcv(net, inputq);
295 return rc;
294} 296}
295 297
296/* tipc_bcast_add_peer - add a peer node to broadcast link and bearer 298/* tipc_bcast_add_peer - add a peer node to broadcast link and bearer
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index d5e79b3767fd..5ffe34472ccd 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -56,8 +56,8 @@ int tipc_bcast_get_mtu(struct net *net);
56int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list); 56int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list);
57int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); 57int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb);
58void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked); 58void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked);
59void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, 59int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
60 struct tipc_msg *hdr); 60 struct tipc_msg *hdr);
61int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); 61int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg);
62int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); 62int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]);
63int tipc_bclink_reset_stats(struct net *net); 63int tipc_bclink_reset_stats(struct net *net);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 65b1bbf133bd..975dbeb60ab0 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -42,6 +42,7 @@
42#include "monitor.h" 42#include "monitor.h"
43#include "bcast.h" 43#include "bcast.h"
44#include "netlink.h" 44#include "netlink.h"
45#include "udp_media.h"
45 46
46#define MAX_ADDR_STR 60 47#define MAX_ADDR_STR 60
47 48
@@ -56,6 +57,13 @@ static struct tipc_media * const media_info_array[] = {
56 NULL 57 NULL
57}; 58};
58 59
60static struct tipc_bearer *bearer_get(struct net *net, int bearer_id)
61{
62 struct tipc_net *tn = tipc_net(net);
63
64 return rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
65}
66
59static void bearer_disable(struct net *net, struct tipc_bearer *b); 67static void bearer_disable(struct net *net, struct tipc_bearer *b);
60 68
61/** 69/**
@@ -323,6 +331,7 @@ restart:
323 b->domain = disc_domain; 331 b->domain = disc_domain;
324 b->net_plane = bearer_id + 'A'; 332 b->net_plane = bearer_id + 'A';
325 b->priority = priority; 333 b->priority = priority;
334 test_and_set_bit_lock(0, &b->up);
326 335
327 res = tipc_disc_create(net, b, &b->bcast_addr, &skb); 336 res = tipc_disc_create(net, b, &b->bcast_addr, &skb);
328 if (res) { 337 if (res) {
@@ -360,15 +369,24 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b)
360 */ 369 */
361void tipc_bearer_reset_all(struct net *net) 370void tipc_bearer_reset_all(struct net *net)
362{ 371{
363 struct tipc_net *tn = tipc_net(net);
364 struct tipc_bearer *b; 372 struct tipc_bearer *b;
365 int i; 373 int i;
366 374
367 for (i = 0; i < MAX_BEARERS; i++) { 375 for (i = 0; i < MAX_BEARERS; i++) {
368 b = rcu_dereference_rtnl(tn->bearer_list[i]); 376 b = bearer_get(net, i);
377 if (b)
378 clear_bit_unlock(0, &b->up);
379 }
380 for (i = 0; i < MAX_BEARERS; i++) {
381 b = bearer_get(net, i);
369 if (b) 382 if (b)
370 tipc_reset_bearer(net, b); 383 tipc_reset_bearer(net, b);
371 } 384 }
385 for (i = 0; i < MAX_BEARERS; i++) {
386 b = bearer_get(net, i);
387 if (b)
388 test_and_set_bit_lock(0, &b->up);
389 }
372} 390}
373 391
374/** 392/**
@@ -382,8 +400,9 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b)
382 int bearer_id = b->identity; 400 int bearer_id = b->identity;
383 401
384 pr_info("Disabling bearer <%s>\n", b->name); 402 pr_info("Disabling bearer <%s>\n", b->name);
385 b->media->disable_media(b); 403 clear_bit_unlock(0, &b->up);
386 tipc_node_delete_links(net, bearer_id); 404 tipc_node_delete_links(net, bearer_id);
405 b->media->disable_media(b);
387 RCU_INIT_POINTER(b->media_ptr, NULL); 406 RCU_INIT_POINTER(b->media_ptr, NULL);
388 if (b->link_req) 407 if (b->link_req)
389 tipc_disc_delete(b->link_req); 408 tipc_disc_delete(b->link_req);
@@ -440,22 +459,16 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
440{ 459{
441 struct net_device *dev; 460 struct net_device *dev;
442 int delta; 461 int delta;
443 void *tipc_ptr;
444 462
445 dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr); 463 dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr);
446 if (!dev) 464 if (!dev)
447 return 0; 465 return 0;
448 466
449 /* Send RESET message even if bearer is detached from device */ 467 delta = SKB_DATA_ALIGN(dev->hard_header_len - skb_headroom(skb));
450 tipc_ptr = rcu_dereference_rtnl(dev->tipc_ptr); 468 if ((delta > 0) && pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {
451 if (unlikely(!tipc_ptr && !msg_is_reset(buf_msg(skb)))) 469 kfree_skb(skb);
452 goto drop; 470 return 0;
453 471 }
454 delta = dev->hard_header_len - skb_headroom(skb);
455 if ((delta > 0) &&
456 pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
457 goto drop;
458
459 skb_reset_network_header(skb); 472 skb_reset_network_header(skb);
460 skb->dev = dev; 473 skb->dev = dev;
461 skb->protocol = htons(ETH_P_TIPC); 474 skb->protocol = htons(ETH_P_TIPC);
@@ -463,9 +476,6 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
463 dev->dev_addr, skb->len); 476 dev->dev_addr, skb->len);
464 dev_queue_xmit(skb); 477 dev_queue_xmit(skb);
465 return 0; 478 return 0;
466drop:
467 kfree_skb(skb);
468 return 0;
469} 479}
470 480
471int tipc_bearer_mtu(struct net *net, u32 bearer_id) 481int tipc_bearer_mtu(struct net *net, u32 bearer_id)
@@ -487,12 +497,12 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
487 struct sk_buff *skb, 497 struct sk_buff *skb,
488 struct tipc_media_addr *dest) 498 struct tipc_media_addr *dest)
489{ 499{
490 struct tipc_net *tn = tipc_net(net); 500 struct tipc_msg *hdr = buf_msg(skb);
491 struct tipc_bearer *b; 501 struct tipc_bearer *b;
492 502
493 rcu_read_lock(); 503 rcu_read_lock();
494 b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); 504 b = bearer_get(net, bearer_id);
495 if (likely(b)) 505 if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr))))
496 b->media->send_msg(net, skb, b, dest); 506 b->media->send_msg(net, skb, b, dest);
497 else 507 else
498 kfree_skb(skb); 508 kfree_skb(skb);
@@ -505,7 +515,6 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id,
505 struct sk_buff_head *xmitq, 515 struct sk_buff_head *xmitq,
506 struct tipc_media_addr *dst) 516 struct tipc_media_addr *dst)
507{ 517{
508 struct tipc_net *tn = net_generic(net, tipc_net_id);
509 struct tipc_bearer *b; 518 struct tipc_bearer *b;
510 struct sk_buff *skb, *tmp; 519 struct sk_buff *skb, *tmp;
511 520
@@ -513,12 +522,15 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id,
513 return; 522 return;
514 523
515 rcu_read_lock(); 524 rcu_read_lock();
516 b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); 525 b = bearer_get(net, bearer_id);
517 if (unlikely(!b)) 526 if (unlikely(!b))
518 __skb_queue_purge(xmitq); 527 __skb_queue_purge(xmitq);
519 skb_queue_walk_safe(xmitq, skb, tmp) { 528 skb_queue_walk_safe(xmitq, skb, tmp) {
520 __skb_dequeue(xmitq); 529 __skb_dequeue(xmitq);
521 b->media->send_msg(net, skb, b, dst); 530 if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb))))
531 b->media->send_msg(net, skb, b, dst);
532 else
533 kfree_skb(skb);
522 } 534 }
523 rcu_read_unlock(); 535 rcu_read_unlock();
524} 536}
@@ -535,8 +547,8 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
535 struct tipc_msg *hdr; 547 struct tipc_msg *hdr;
536 548
537 rcu_read_lock(); 549 rcu_read_lock();
538 b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); 550 b = bearer_get(net, bearer_id);
539 if (unlikely(!b)) 551 if (unlikely(!b || !test_bit(0, &b->up)))
540 __skb_queue_purge(xmitq); 552 __skb_queue_purge(xmitq);
541 skb_queue_walk_safe(xmitq, skb, tmp) { 553 skb_queue_walk_safe(xmitq, skb, tmp) {
542 hdr = buf_msg(skb); 554 hdr = buf_msg(skb);
@@ -566,7 +578,8 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
566 578
567 rcu_read_lock(); 579 rcu_read_lock();
568 b = rcu_dereference_rtnl(dev->tipc_ptr); 580 b = rcu_dereference_rtnl(dev->tipc_ptr);
569 if (likely(b && (skb->pkt_type <= PACKET_BROADCAST))) { 581 if (likely(b && test_bit(0, &b->up) &&
582 (skb->pkt_type <= PACKET_BROADCAST))) {
570 skb->next = NULL; 583 skb->next = NULL;
571 tipc_rcv(dev_net(dev), skb, b); 584 tipc_rcv(dev_net(dev), skb, b);
572 rcu_read_unlock(); 585 rcu_read_unlock();
@@ -591,18 +604,9 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
591{ 604{
592 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 605 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
593 struct net *net = dev_net(dev); 606 struct net *net = dev_net(dev);
594 struct tipc_net *tn = tipc_net(net);
595 struct tipc_bearer *b; 607 struct tipc_bearer *b;
596 int i;
597 608
598 b = rtnl_dereference(dev->tipc_ptr); 609 b = rtnl_dereference(dev->tipc_ptr);
599 if (!b) {
600 for (i = 0; i < MAX_BEARERS; b = NULL, i++) {
601 b = rtnl_dereference(tn->bearer_list[i]);
602 if (b && (b->media_ptr == dev))
603 break;
604 }
605 }
606 if (!b) 610 if (!b)
607 return NOTIFY_DONE; 611 return NOTIFY_DONE;
608 612
@@ -613,11 +617,10 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
613 if (netif_carrier_ok(dev)) 617 if (netif_carrier_ok(dev))
614 break; 618 break;
615 case NETDEV_UP: 619 case NETDEV_UP:
616 rcu_assign_pointer(dev->tipc_ptr, b); 620 test_and_set_bit_lock(0, &b->up);
617 break; 621 break;
618 case NETDEV_GOING_DOWN: 622 case NETDEV_GOING_DOWN:
619 RCU_INIT_POINTER(dev->tipc_ptr, NULL); 623 clear_bit_unlock(0, &b->up);
620 synchronize_net();
621 tipc_reset_bearer(net, b); 624 tipc_reset_bearer(net, b);
622 break; 625 break;
623 case NETDEV_CHANGEMTU: 626 case NETDEV_CHANGEMTU:
@@ -709,6 +712,14 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
709 goto prop_msg_full; 712 goto prop_msg_full;
710 713
711 nla_nest_end(msg->skb, prop); 714 nla_nest_end(msg->skb, prop);
715
716#ifdef CONFIG_TIPC_MEDIA_UDP
717 if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) {
718 if (tipc_udp_nl_add_bearer_data(msg, bearer))
719 goto attr_msg_full;
720 }
721#endif
722
712 nla_nest_end(msg->skb, attrs); 723 nla_nest_end(msg->skb, attrs);
713 genlmsg_end(msg->skb, hdr); 724 genlmsg_end(msg->skb, hdr);
714 725
@@ -895,6 +906,49 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info)
895 return 0; 906 return 0;
896} 907}
897 908
909int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info)
910{
911 int err;
912 char *name;
913 struct tipc_bearer *b;
914 struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
915 struct net *net = sock_net(skb->sk);
916
917 if (!info->attrs[TIPC_NLA_BEARER])
918 return -EINVAL;
919
920 err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX,
921 info->attrs[TIPC_NLA_BEARER],
922 tipc_nl_bearer_policy);
923 if (err)
924 return err;
925
926 if (!attrs[TIPC_NLA_BEARER_NAME])
927 return -EINVAL;
928 name = nla_data(attrs[TIPC_NLA_BEARER_NAME]);
929
930 rtnl_lock();
931 b = tipc_bearer_find(net, name);
932 if (!b) {
933 rtnl_unlock();
934 return -EINVAL;
935 }
936
937#ifdef CONFIG_TIPC_MEDIA_UDP
938 if (attrs[TIPC_NLA_BEARER_UDP_OPTS]) {
939 err = tipc_udp_nl_bearer_add(b,
940 attrs[TIPC_NLA_BEARER_UDP_OPTS]);
941 if (err) {
942 rtnl_unlock();
943 return err;
944 }
945 }
946#endif
947 rtnl_unlock();
948
949 return 0;
950}
951
898int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) 952int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
899{ 953{
900 int err; 954 int err;
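Bit 0 of the new tipc_bearer::up field gates both transmit and receive independently of the RCU-protected bearer pointer: it is set with test_and_set_bit_lock() when the bearer is enabled or NETDEV_UP is seen, and cleared with clear_bit_unlock() on disable, reset or NETDEV_GOING_DOWN. A minimal sketch of the send-side rule, with bearer_may_send() as a hypothetical helper name:

static bool bearer_may_send(struct tipc_bearer *b, struct sk_buff *skb)
{
	/* RESET messages are still let through while the bearer is gated
	 * off, so the peer learns about the reset.
	 */
	return test_bit(0, &b->up) || msg_is_reset(buf_msg(skb));
}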
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 43757f1f9cb3..78892e2f53e3 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -150,6 +150,7 @@ struct tipc_bearer {
150 u32 identity; 150 u32 identity;
151 struct tipc_link_req *link_req; 151 struct tipc_link_req *link_req;
152 char net_plane; 152 char net_plane;
153 unsigned long up;
153}; 154};
154 155
155struct tipc_bearer_names { 156struct tipc_bearer_names {
@@ -180,6 +181,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info);
180int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); 181int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb);
181int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); 182int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info);
182int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); 183int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info);
184int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info);
183 185
184int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb); 186int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb);
185int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); 187int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 877d94f34814..b36e16cdc945 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -181,7 +181,10 @@ struct tipc_link {
181 u16 acked; 181 u16 acked;
182 struct tipc_link *bc_rcvlink; 182 struct tipc_link *bc_rcvlink;
183 struct tipc_link *bc_sndlink; 183 struct tipc_link *bc_sndlink;
184 int nack_state; 184 unsigned long prev_retr;
185 u16 prev_from;
186 u16 prev_to;
187 u8 nack_state;
185 bool bc_peer_is_up; 188 bool bc_peer_is_up;
186 189
187 /* Statistics */ 190 /* Statistics */
@@ -202,6 +205,8 @@ enum {
202 BC_NACK_SND_SUPPRESS, 205 BC_NACK_SND_SUPPRESS,
203}; 206};
204 207
208#define TIPC_BC_RETR_LIMIT 10 /* [ms] */
209
205/* 210/*
206 * Interval between NACKs when packets arrive out of order 211 * Interval between NACKs when packets arrive out of order
207 */ 212 */
@@ -237,8 +242,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
237 u16 rcvgap, int tolerance, int priority, 242 u16 rcvgap, int tolerance, int priority,
238 struct sk_buff_head *xmitq); 243 struct sk_buff_head *xmitq);
239static void link_print(struct tipc_link *l, const char *str); 244static void link_print(struct tipc_link *l, const char *str);
240static void tipc_link_build_nack_msg(struct tipc_link *l, 245static int tipc_link_build_nack_msg(struct tipc_link *l,
241 struct sk_buff_head *xmitq); 246 struct sk_buff_head *xmitq);
242static void tipc_link_build_bc_init_msg(struct tipc_link *l, 247static void tipc_link_build_bc_init_msg(struct tipc_link *l,
243 struct sk_buff_head *xmitq); 248 struct sk_buff_head *xmitq);
244static bool tipc_link_release_pkts(struct tipc_link *l, u16 to); 249static bool tipc_link_release_pkts(struct tipc_link *l, u16 to);
@@ -367,6 +372,18 @@ int tipc_link_bc_peers(struct tipc_link *l)
367 return l->ackers; 372 return l->ackers;
368} 373}
369 374
375u16 link_bc_rcv_gap(struct tipc_link *l)
376{
377 struct sk_buff *skb = skb_peek(&l->deferdq);
378 u16 gap = 0;
379
380 if (more(l->snd_nxt, l->rcv_nxt))
381 gap = l->snd_nxt - l->rcv_nxt;
382 if (skb)
383 gap = buf_seqno(skb) - l->rcv_nxt;
384 return gap;
385}
386
370void tipc_link_set_mtu(struct tipc_link *l, int mtu) 387void tipc_link_set_mtu(struct tipc_link *l, int mtu)
371{ 388{
372 l->mtu = mtu; 389 l->mtu = mtu;
@@ -807,7 +824,7 @@ void link_prepare_wakeup(struct tipc_link *l)
807 824
808 skb_queue_walk_safe(&l->wakeupq, skb, tmp) { 825 skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
809 imp = TIPC_SKB_CB(skb)->chain_imp; 826 imp = TIPC_SKB_CB(skb)->chain_imp;
810 lim = l->window + l->backlog[imp].limit; 827 lim = l->backlog[imp].limit;
811 pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; 828 pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
812 if ((pnd[imp] + l->backlog[imp].len) >= lim) 829 if ((pnd[imp] + l->backlog[imp].len) >= lim)
813 break; 830 break;
@@ -873,9 +890,11 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
873 struct sk_buff *skb, *_skb, *bskb; 890 struct sk_buff *skb, *_skb, *bskb;
874 891
875 /* Match msg importance against this and all higher backlog limits: */ 892 /* Match msg importance against this and all higher backlog limits: */
876 for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { 893 if (!skb_queue_empty(backlogq)) {
877 if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) 894 for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
878 return link_schedule_user(l, list); 895 if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
896 return link_schedule_user(l, list);
897 }
879 } 898 }
880 if (unlikely(msg_size(hdr) > mtu)) { 899 if (unlikely(msg_size(hdr) > mtu)) {
881 skb_queue_purge(list); 900 skb_queue_purge(list);
@@ -1133,7 +1152,10 @@ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
1133 if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf) 1152 if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf)
1134 return 0; 1153 return 0;
1135 l->rcv_unacked = 0; 1154 l->rcv_unacked = 0;
1136 return TIPC_LINK_SND_BC_ACK; 1155
1156 /* Use snd_nxt to store peer's snd_nxt in broadcast rcv link */
1157 l->snd_nxt = l->rcv_nxt;
1158 return TIPC_LINK_SND_STATE;
1137 } 1159 }
1138 1160
1139 /* Unicast ACK */ 1161 /* Unicast ACK */
@@ -1162,17 +1184,26 @@ void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
1162} 1184}
1163 1185
1164/* tipc_link_build_nack_msg: prepare link nack message for transmission 1186/* tipc_link_build_nack_msg: prepare link nack message for transmission
1187 * Note that sending of broadcast NACK is coordinated among nodes, to
1188 * reduce the risk of NACK storms towards the sender
1165 */ 1189 */
1166static void tipc_link_build_nack_msg(struct tipc_link *l, 1190static int tipc_link_build_nack_msg(struct tipc_link *l,
1167 struct sk_buff_head *xmitq) 1191 struct sk_buff_head *xmitq)
1168{ 1192{
1169 u32 def_cnt = ++l->stats.deferred_recv; 1193 u32 def_cnt = ++l->stats.deferred_recv;
1194 int match1, match2;
1170 1195
1171 if (link_is_bc_rcvlink(l)) 1196 if (link_is_bc_rcvlink(l)) {
1172 return; 1197 match1 = def_cnt & 0xf;
1198 match2 = tipc_own_addr(l->net) & 0xf;
1199 if (match1 == match2)
1200 return TIPC_LINK_SND_STATE;
1201 return 0;
1202 }
1173 1203
1174 if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV)) 1204 if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV))
1175 tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); 1205 tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq);
1206 return 0;
1176} 1207}
1177 1208
1178/* tipc_link_rcv - process TIPC packets/messages arriving from off-node 1209/* tipc_link_rcv - process TIPC packets/messages arriving from off-node
@@ -1223,7 +1254,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
1223 /* Defer delivery if sequence gap */ 1254 /* Defer delivery if sequence gap */
1224 if (unlikely(seqno != rcv_nxt)) { 1255 if (unlikely(seqno != rcv_nxt)) {
1225 __tipc_skb_queue_sorted(defq, seqno, skb); 1256 __tipc_skb_queue_sorted(defq, seqno, skb);
1226 tipc_link_build_nack_msg(l, xmitq); 1257 rc |= tipc_link_build_nack_msg(l, xmitq);
1227 break; 1258 break;
1228 } 1259 }
1229 1260
@@ -1234,7 +1265,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
1234 rc |= tipc_link_input(l, skb, l->inputq); 1265 rc |= tipc_link_input(l, skb, l->inputq);
1235 if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) 1266 if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN))
1236 rc |= tipc_link_build_state_msg(l, xmitq); 1267 rc |= tipc_link_build_state_msg(l, xmitq);
1237 if (unlikely(rc & ~TIPC_LINK_SND_BC_ACK)) 1268 if (unlikely(rc & ~TIPC_LINK_SND_STATE))
1238 break; 1269 break;
1239 } while ((skb = __skb_dequeue(defq))); 1270 } while ((skb = __skb_dequeue(defq)));
1240 1271
@@ -1248,10 +1279,11 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
1248 u16 rcvgap, int tolerance, int priority, 1279 u16 rcvgap, int tolerance, int priority,
1249 struct sk_buff_head *xmitq) 1280 struct sk_buff_head *xmitq)
1250{ 1281{
1282 struct tipc_link *bcl = l->bc_rcvlink;
1251 struct sk_buff *skb; 1283 struct sk_buff *skb;
1252 struct tipc_msg *hdr; 1284 struct tipc_msg *hdr;
1253 struct sk_buff_head *dfq = &l->deferdq; 1285 struct sk_buff_head *dfq = &l->deferdq;
1254 bool node_up = link_is_up(l->bc_rcvlink); 1286 bool node_up = link_is_up(bcl);
1255 struct tipc_mon_state *mstate = &l->mon_state; 1287 struct tipc_mon_state *mstate = &l->mon_state;
1256 int dlen = 0; 1288 int dlen = 0;
1257 void *data; 1289 void *data;
@@ -1279,7 +1311,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
1279 msg_set_net_plane(hdr, l->net_plane); 1311 msg_set_net_plane(hdr, l->net_plane);
1280 msg_set_next_sent(hdr, l->snd_nxt); 1312 msg_set_next_sent(hdr, l->snd_nxt);
1281 msg_set_ack(hdr, l->rcv_nxt - 1); 1313 msg_set_ack(hdr, l->rcv_nxt - 1);
1282 msg_set_bcast_ack(hdr, l->bc_rcvlink->rcv_nxt - 1); 1314 msg_set_bcast_ack(hdr, bcl->rcv_nxt - 1);
1283 msg_set_last_bcast(hdr, l->bc_sndlink->snd_nxt - 1); 1315 msg_set_last_bcast(hdr, l->bc_sndlink->snd_nxt - 1);
1284 msg_set_link_tolerance(hdr, tolerance); 1316 msg_set_link_tolerance(hdr, tolerance);
1285 msg_set_linkprio(hdr, priority); 1317 msg_set_linkprio(hdr, priority);
@@ -1289,6 +1321,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
1289 1321
1290 if (mtyp == STATE_MSG) { 1322 if (mtyp == STATE_MSG) {
1291 msg_set_seq_gap(hdr, rcvgap); 1323 msg_set_seq_gap(hdr, rcvgap);
1324 msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl));
1292 msg_set_probe(hdr, probe); 1325 msg_set_probe(hdr, probe);
1293 tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id); 1326 tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id);
1294 msg_set_size(hdr, INT_H_SIZE + dlen); 1327 msg_set_size(hdr, INT_H_SIZE + dlen);
@@ -1571,51 +1604,107 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr)
1571 l->rcv_nxt = peers_snd_nxt; 1604 l->rcv_nxt = peers_snd_nxt;
1572} 1605}
1573 1606
 1607/* link_bc_retr_eval() - check if the indicated range can be retransmitted now
1608 * - Adjust permitted range if there is overlap with previous retransmission
1609 */
1610static bool link_bc_retr_eval(struct tipc_link *l, u16 *from, u16 *to)
1611{
1612 unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr);
1613
1614 if (less(*to, *from))
1615 return false;
1616
1617 /* New retransmission request */
1618 if ((elapsed > TIPC_BC_RETR_LIMIT) ||
1619 less(*to, l->prev_from) || more(*from, l->prev_to)) {
1620 l->prev_from = *from;
1621 l->prev_to = *to;
1622 l->prev_retr = jiffies;
1623 return true;
1624 }
1625
1626 /* Inside range of previous retransmit */
1627 if (!less(*from, l->prev_from) && !more(*to, l->prev_to))
1628 return false;
1629
1630 /* Fully or partially outside previous range => exclude overlap */
1631 if (less(*from, l->prev_from)) {
1632 *to = l->prev_from - 1;
1633 l->prev_from = *from;
1634 }
1635 if (more(*to, l->prev_to)) {
1636 *from = l->prev_to + 1;
1637 l->prev_to = *to;
1638 }
1639 l->prev_retr = jiffies;
1640 return true;
1641}
1642
1574/* tipc_link_bc_sync_rcv - update rcv link according to peer's send state 1643/* tipc_link_bc_sync_rcv - update rcv link according to peer's send state
1575 */ 1644 */
1576void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, 1645int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
1577 struct sk_buff_head *xmitq) 1646 struct sk_buff_head *xmitq)
1578{ 1647{
1648 struct tipc_link *snd_l = l->bc_sndlink;
1579 u16 peers_snd_nxt = msg_bc_snd_nxt(hdr); 1649 u16 peers_snd_nxt = msg_bc_snd_nxt(hdr);
1650 u16 from = msg_bcast_ack(hdr) + 1;
1651 u16 to = from + msg_bc_gap(hdr) - 1;
1652 int rc = 0;
1580 1653
1581 if (!link_is_up(l)) 1654 if (!link_is_up(l))
1582 return; 1655 return rc;
1583 1656
1584 if (!msg_peer_node_is_up(hdr)) 1657 if (!msg_peer_node_is_up(hdr))
1585 return; 1658 return rc;
1586 1659
 1587 /* Open when peer acknowledges our bcast init msg (pkt #1) */ 1660 /* Open when peer acknowledges our bcast init msg (pkt #1) */
1588 if (msg_ack(hdr)) 1661 if (msg_ack(hdr))
1589 l->bc_peer_is_up = true; 1662 l->bc_peer_is_up = true;
1590 1663
1591 if (!l->bc_peer_is_up) 1664 if (!l->bc_peer_is_up)
1592 return; 1665 return rc;
1666
1667 l->stats.recv_nacks++;
1593 1668
1594 /* Ignore if peers_snd_nxt goes beyond receive window */ 1669 /* Ignore if peers_snd_nxt goes beyond receive window */
1595 if (more(peers_snd_nxt, l->rcv_nxt + l->window)) 1670 if (more(peers_snd_nxt, l->rcv_nxt + l->window))
1596 return; 1671 return rc;
1672
1673 if (link_bc_retr_eval(snd_l, &from, &to))
1674 rc = tipc_link_retrans(snd_l, from, to, xmitq);
1675
1676 l->snd_nxt = peers_snd_nxt;
1677 if (link_bc_rcv_gap(l))
1678 rc |= TIPC_LINK_SND_STATE;
1679
1680 /* Return now if sender supports nack via STATE messages */
1681 if (l->peer_caps & TIPC_BCAST_STATE_NACK)
1682 return rc;
1683
1684 /* Otherwise, be backwards compatible */
1597 1685
1598 if (!more(peers_snd_nxt, l->rcv_nxt)) { 1686 if (!more(peers_snd_nxt, l->rcv_nxt)) {
1599 l->nack_state = BC_NACK_SND_CONDITIONAL; 1687 l->nack_state = BC_NACK_SND_CONDITIONAL;
1600 return; 1688 return 0;
1601 } 1689 }
1602 1690
1603 /* Don't NACK if one was recently sent or peeked */ 1691 /* Don't NACK if one was recently sent or peeked */
1604 if (l->nack_state == BC_NACK_SND_SUPPRESS) { 1692 if (l->nack_state == BC_NACK_SND_SUPPRESS) {
1605 l->nack_state = BC_NACK_SND_UNCONDITIONAL; 1693 l->nack_state = BC_NACK_SND_UNCONDITIONAL;
1606 return; 1694 return 0;
1607 } 1695 }
1608 1696
1609 /* Conditionally delay NACK sending until next synch rcv */ 1697 /* Conditionally delay NACK sending until next synch rcv */
1610 if (l->nack_state == BC_NACK_SND_CONDITIONAL) { 1698 if (l->nack_state == BC_NACK_SND_CONDITIONAL) {
1611 l->nack_state = BC_NACK_SND_UNCONDITIONAL; 1699 l->nack_state = BC_NACK_SND_UNCONDITIONAL;
1612 if ((peers_snd_nxt - l->rcv_nxt) < TIPC_MIN_LINK_WIN) 1700 if ((peers_snd_nxt - l->rcv_nxt) < TIPC_MIN_LINK_WIN)
1613 return; 1701 return 0;
1614 } 1702 }
1615 1703
1616 /* Send NACK now but suppress next one */ 1704 /* Send NACK now but suppress next one */
1617 tipc_link_build_bc_proto_msg(l, true, peers_snd_nxt, xmitq); 1705 tipc_link_build_bc_proto_msg(l, true, peers_snd_nxt, xmitq);
1618 l->nack_state = BC_NACK_SND_SUPPRESS; 1706 l->nack_state = BC_NACK_SND_SUPPRESS;
1707 return 0;
1619} 1708}
1620 1709
1621void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, 1710void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked,
@@ -1652,6 +1741,8 @@ void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked,
1652} 1741}
1653 1742
1654/* tipc_link_bc_nack_rcv(): receive broadcast nack message 1743/* tipc_link_bc_nack_rcv(): receive broadcast nack message
1744 * This function is here for backwards compatibility, since
1745 * no BCAST_PROTOCOL/STATE messages occur from TIPC v2.5.
1655 */ 1746 */
1656int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, 1747int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
1657 struct sk_buff_head *xmitq) 1748 struct sk_buff_head *xmitq)
@@ -1692,10 +1783,10 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
1692 int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE); 1783 int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE);
1693 1784
1694 l->window = win; 1785 l->window = win;
1695 l->backlog[TIPC_LOW_IMPORTANCE].limit = win / 2; 1786 l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win);
1696 l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = win; 1787 l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2);
1697 l->backlog[TIPC_HIGH_IMPORTANCE].limit = win / 2 * 3; 1788 l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3);
1698 l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = win * 2; 1789 l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4);
1699 l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; 1790 l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk;
1700} 1791}
1701 1792
diff --git a/net/tipc/link.h b/net/tipc/link.h
index d7e9d42fcb2d..d1bd1787a768 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -63,7 +63,7 @@ enum {
63enum { 63enum {
64 TIPC_LINK_UP_EVT = 1, 64 TIPC_LINK_UP_EVT = 1,
65 TIPC_LINK_DOWN_EVT = (1 << 1), 65 TIPC_LINK_DOWN_EVT = (1 << 1),
66 TIPC_LINK_SND_BC_ACK = (1 << 2) 66 TIPC_LINK_SND_STATE = (1 << 2)
67}; 67};
68 68
69/* Starting value for maximum packet size negotiation on unicast links 69/* Starting value for maximum packet size negotiation on unicast links
@@ -138,8 +138,8 @@ void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked,
138void tipc_link_build_bc_sync_msg(struct tipc_link *l, 138void tipc_link_build_bc_sync_msg(struct tipc_link *l,
139 struct sk_buff_head *xmitq); 139 struct sk_buff_head *xmitq);
140void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr); 140void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr);
141void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, 141int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
142 struct sk_buff_head *xmitq); 142 struct sk_buff_head *xmitq);
143int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, 143int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
144 struct sk_buff_head *xmitq); 144 struct sk_buff_head *xmitq);
145#endif 145#endif
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 7cf52fb39bee..c3832cdf2278 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -719,6 +719,16 @@ static inline char *msg_media_addr(struct tipc_msg *m)
719 return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET]; 719 return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET];
720} 720}
721 721
722static inline u32 msg_bc_gap(struct tipc_msg *m)
723{
724 return msg_bits(m, 8, 0, 0x3ff);
725}
726
727static inline void msg_set_bc_gap(struct tipc_msg *m, u32 n)
728{
729 msg_set_bits(m, 8, 0, 0x3ff, n);
730}
731
722/* 732/*
723 * Word 9 733 * Word 9
724 */ 734 */
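msg_bc_gap()/msg_set_bc_gap() carry the broadcast receive gap in word 8, bits 0-9, so a single STATE message can report a gap of at most 0x3ff (1023) packets; msg_set_bits() masks larger values to the low 10 bits. A tiny illustrative check (assumption, not from the patch):

static bool example_bc_gap_fits(u32 gap)
{
	return gap <= 0x3ff;	/* fits the 10-bit field in word 8 */
}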
diff --git a/net/tipc/net.h b/net/tipc/net.h
index 77a7a118911d..c7c254902873 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -39,6 +39,8 @@
39 39
40#include <net/genetlink.h> 40#include <net/genetlink.h>
41 41
42extern const struct nla_policy tipc_nl_net_policy[];
43
42int tipc_net_start(struct net *net, u32 addr); 44int tipc_net_start(struct net *net, u32 addr);
43 45
44void tipc_net_stop(struct net *net); 46void tipc_net_stop(struct net *net);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index a84daec0afe9..3200059d14b2 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -41,6 +41,7 @@
41#include "link.h" 41#include "link.h"
42#include "node.h" 42#include "node.h"
43#include "net.h" 43#include "net.h"
44#include "udp_media.h"
44#include <net/genetlink.h> 45#include <net/genetlink.h>
45 46
46static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { 47static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = {
@@ -161,6 +162,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
161 .policy = tipc_nl_policy, 162 .policy = tipc_nl_policy,
162 }, 163 },
163 { 164 {
165 .cmd = TIPC_NL_BEARER_ADD,
166 .doit = tipc_nl_bearer_add,
167 .policy = tipc_nl_policy,
168 },
169 {
164 .cmd = TIPC_NL_BEARER_SET, 170 .cmd = TIPC_NL_BEARER_SET,
165 .doit = tipc_nl_bearer_set, 171 .doit = tipc_nl_bearer_set,
166 .policy = tipc_nl_policy, 172 .policy = tipc_nl_policy,
@@ -238,6 +244,18 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
238 .dumpit = tipc_nl_node_dump_monitor_peer, 244 .dumpit = tipc_nl_node_dump_monitor_peer,
239 .policy = tipc_nl_policy, 245 .policy = tipc_nl_policy,
240 }, 246 },
247 {
248 .cmd = TIPC_NL_PEER_REMOVE,
249 .doit = tipc_nl_peer_rm,
250 .policy = tipc_nl_policy,
251 },
252#ifdef CONFIG_TIPC_MEDIA_UDP
253 {
254 .cmd = TIPC_NL_UDP_GET_REMOTEIP,
255 .dumpit = tipc_udp_nl_dump_remoteip,
256 .policy = tipc_nl_policy,
257 },
258#endif
241}; 259};
242 260
243int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) 261int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr)
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 21974191e425..7ef14e2d2356 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1262,6 +1262,34 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb)
1262 kfree_skb(skb); 1262 kfree_skb(skb);
1263} 1263}
1264 1264
1265static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr,
1266 int bearer_id, struct sk_buff_head *xmitq)
1267{
1268 struct tipc_link *ucl;
1269 int rc;
1270
1271 rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr);
1272
1273 if (rc & TIPC_LINK_DOWN_EVT) {
1274 tipc_bearer_reset_all(n->net);
1275 return;
1276 }
1277
1278 if (!(rc & TIPC_LINK_SND_STATE))
1279 return;
1280
1281 /* If probe message, a STATE response will be sent anyway */
1282 if (msg_probe(hdr))
1283 return;
1284
1285 /* Produce a STATE message carrying broadcast NACK */
1286 tipc_node_read_lock(n);
1287 ucl = n->links[bearer_id].link;
1288 if (ucl)
1289 tipc_link_build_state_msg(ucl, xmitq);
1290 tipc_node_read_unlock(n);
1291}
1292
1265/** 1293/**
1266 * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node 1294 * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node
1267 * @net: the applicable net namespace 1295 * @net: the applicable net namespace
@@ -1298,7 +1326,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
1298 rc = tipc_bcast_rcv(net, be->link, skb); 1326 rc = tipc_bcast_rcv(net, be->link, skb);
1299 1327
1300 /* Broadcast ACKs are sent on a unicast link */ 1328 /* Broadcast ACKs are sent on a unicast link */
1301 if (rc & TIPC_LINK_SND_BC_ACK) { 1329 if (rc & TIPC_LINK_SND_STATE) {
1302 tipc_node_read_lock(n); 1330 tipc_node_read_lock(n);
1303 tipc_link_build_state_msg(le->link, &xmitq); 1331 tipc_link_build_state_msg(le->link, &xmitq);
1304 tipc_node_read_unlock(n); 1332 tipc_node_read_unlock(n);
@@ -1505,7 +1533,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
1505 1533
1506 /* Ensure broadcast reception is in synch with peer's send state */ 1534 /* Ensure broadcast reception is in synch with peer's send state */
1507 if (unlikely(usr == LINK_PROTOCOL)) 1535 if (unlikely(usr == LINK_PROTOCOL))
1508 tipc_bcast_sync_rcv(net, n->bc_entry.link, hdr); 1536 tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq);
1509 else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) 1537 else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack))
1510 tipc_bcast_ack_rcv(net, n->bc_entry.link, bc_ack); 1538 tipc_bcast_ack_rcv(net, n->bc_entry.link, bc_ack);
1511 1539
@@ -1553,6 +1581,69 @@ discard:
1553 kfree_skb(skb); 1581 kfree_skb(skb);
1554} 1582}
1555 1583
1584int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
1585{
1586 struct net *net = sock_net(skb->sk);
1587 struct tipc_net *tn = net_generic(net, tipc_net_id);
1588 struct nlattr *attrs[TIPC_NLA_NET_MAX + 1];
1589 struct tipc_node *peer;
1590 u32 addr;
1591 int err;
1592 int i;
1593
1594 /* We identify the peer by its net */
1595 if (!info->attrs[TIPC_NLA_NET])
1596 return -EINVAL;
1597
1598 err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX,
1599 info->attrs[TIPC_NLA_NET],
1600 tipc_nl_net_policy);
1601 if (err)
1602 return err;
1603
1604 if (!attrs[TIPC_NLA_NET_ADDR])
1605 return -EINVAL;
1606
1607 addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]);
1608
1609 if (in_own_node(net, addr))
1610 return -ENOTSUPP;
1611
1612 spin_lock_bh(&tn->node_list_lock);
1613 peer = tipc_node_find(net, addr);
1614 if (!peer) {
1615 spin_unlock_bh(&tn->node_list_lock);
1616 return -ENXIO;
1617 }
1618
1619 tipc_node_write_lock(peer);
1620 if (peer->state != SELF_DOWN_PEER_DOWN &&
1621 peer->state != SELF_DOWN_PEER_LEAVING) {
1622 tipc_node_write_unlock(peer);
1623 err = -EBUSY;
1624 goto err_out;
1625 }
1626
1627 for (i = 0; i < MAX_BEARERS; i++) {
1628 struct tipc_link_entry *le = &peer->links[i];
1629
1630 if (le->link) {
1631 kfree(le->link);
1632 le->link = NULL;
1633 peer->link_cnt--;
1634 }
1635 }
1636 tipc_node_write_unlock(peer);
1637 tipc_node_delete(peer);
1638
1639 err = 0;
1640err_out:
1641 tipc_node_put(peer);
1642 spin_unlock_bh(&tn->node_list_lock);
1643
1644 return err;
1645}
1646
1556int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) 1647int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb)
1557{ 1648{
1558 int err; 1649 int err;
diff --git a/net/tipc/node.h b/net/tipc/node.h
index d69fdfcc0ec9..39ef54c1f2ad 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * net/tipc/node.h: Include file for TIPC node management routines 2 * net/tipc/node.h: Include file for TIPC node management routines
3 * 3 *
4 * Copyright (c) 2000-2006, 2014-2015, Ericsson AB 4 * Copyright (c) 2000-2006, 2014-2016, Ericsson AB
5 * Copyright (c) 2005, 2010-2014, Wind River Systems 5 * Copyright (c) 2005, 2010-2014, Wind River Systems
6 * All rights reserved. 6 * All rights reserved.
7 * 7 *
@@ -45,11 +45,14 @@
45/* Optional capabilities supported by this code version 45/* Optional capabilities supported by this code version
46 */ 46 */
47enum { 47enum {
48 TIPC_BCAST_SYNCH = (1 << 1), 48 TIPC_BCAST_SYNCH = (1 << 1),
49 TIPC_BLOCK_FLOWCTL = (2 << 1) 49 TIPC_BCAST_STATE_NACK = (1 << 2),
50 TIPC_BLOCK_FLOWCTL = (1 << 3)
50}; 51};
51 52
52#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | TIPC_BLOCK_FLOWCTL) 53#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
54 TIPC_BCAST_STATE_NACK | \
55 TIPC_BLOCK_FLOWCTL)
53#define INVALID_BEARER_ID -1 56#define INVALID_BEARER_ID -1
54 57
55void tipc_node_stop(struct net *net); 58void tipc_node_stop(struct net *net);
@@ -77,6 +80,7 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb);
77int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info); 80int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info);
78int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info); 81int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info);
79int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info); 82int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info);
83int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info);
80 84
81int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info); 85int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info);
82int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); 86int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info);
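TIPC_BCAST_STATE_NACK is advertised as a node capability, so STATE-message based broadcast NACKs are only used towards peers that negotiated it; older peers keep receiving legacy BCAST_PROTOCOL NACKs. A one-line sketch of the check, mirroring the l->peer_caps test in link.c (hypothetical helper name):

static bool example_peer_supports_state_nack(u16 peer_caps)
{
	return (peer_caps & TIPC_BCAST_STATE_NACK) != 0;
}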
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index ae7e14cae085..78cab9c5a445 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -49,6 +49,7 @@
49#include "core.h" 49#include "core.h"
50#include "bearer.h" 50#include "bearer.h"
51#include "netlink.h" 51#include "netlink.h"
52#include "msg.h"
52 53
53/* IANA assigned UDP port */ 54/* IANA assigned UDP port */
54#define UDP_PORT_DEFAULT 6118 55#define UDP_PORT_DEFAULT 6118
@@ -70,6 +71,13 @@ struct udp_media_addr {
70 }; 71 };
71}; 72};
72 73
74/* struct udp_replicast - container for UDP remote addresses */
75struct udp_replicast {
76 struct udp_media_addr addr;
77 struct rcu_head rcu;
78 struct list_head list;
79};
80
73/** 81/**
74 * struct udp_bearer - ip/udp bearer data structure 82 * struct udp_bearer - ip/udp bearer data structure
75 * @bearer: associated generic tipc bearer 83 * @bearer: associated generic tipc bearer
@@ -82,8 +90,20 @@ struct udp_bearer {
82 struct socket *ubsock; 90 struct socket *ubsock;
83 u32 ifindex; 91 u32 ifindex;
84 struct work_struct work; 92 struct work_struct work;
93 struct udp_replicast rcast;
85}; 94};
86 95
96static int tipc_udp_is_mcast_addr(struct udp_media_addr *addr)
97{
98 if (ntohs(addr->proto) == ETH_P_IP)
99 return ipv4_is_multicast(addr->ipv4.s_addr);
100#if IS_ENABLED(CONFIG_IPV6)
101 else
102 return ipv6_addr_is_multicast(&addr->ipv6);
103#endif
104 return 0;
105}
106
87/* udp_media_addr_set - convert a ip/udp address to a TIPC media address */ 107/* udp_media_addr_set - convert a ip/udp address to a TIPC media address */
88static void tipc_udp_media_addr_set(struct tipc_media_addr *addr, 108static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
89 struct udp_media_addr *ua) 109 struct udp_media_addr *ua)
@@ -91,15 +111,9 @@ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
91 memset(addr, 0, sizeof(struct tipc_media_addr)); 111 memset(addr, 0, sizeof(struct tipc_media_addr));
92 addr->media_id = TIPC_MEDIA_TYPE_UDP; 112 addr->media_id = TIPC_MEDIA_TYPE_UDP;
93 memcpy(addr->value, ua, sizeof(struct udp_media_addr)); 113 memcpy(addr->value, ua, sizeof(struct udp_media_addr));
94 if (ntohs(ua->proto) == ETH_P_IP) { 114
95 if (ipv4_is_multicast(ua->ipv4.s_addr)) 115 if (tipc_udp_is_mcast_addr(ua))
96 addr->broadcast = 1; 116 addr->broadcast = 1;
97 } else if (ntohs(ua->proto) == ETH_P_IPV6) {
98 if (ipv6_addr_type(&ua->ipv6) & IPV6_ADDR_MULTICAST)
99 addr->broadcast = 1;
100 } else {
101 pr_err("Invalid UDP media address\n");
102 }
103} 117}
104 118
105/* tipc_udp_addr2str - convert ip/udp address to string */ 119/* tipc_udp_addr2str - convert ip/udp address to string */
@@ -140,28 +154,13 @@ static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a)
140} 154}
141 155
142/* tipc_send_msg - enqueue a send request */ 156/* tipc_send_msg - enqueue a send request */
143static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, 157static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
144 struct tipc_bearer *b, 158 struct udp_bearer *ub, struct udp_media_addr *src,
145 struct tipc_media_addr *dest) 159 struct udp_media_addr *dst)
146{ 160{
147 int ttl, err = 0; 161 int ttl, err = 0;
148 struct udp_bearer *ub;
149 struct udp_media_addr *dst = (struct udp_media_addr *)&dest->value;
150 struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
151 struct rtable *rt; 162 struct rtable *rt;
152 163
153 if (skb_headroom(skb) < UDP_MIN_HEADROOM) {
154 err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC);
155 if (err)
156 goto tx_error;
157 }
158
159 skb_set_inner_protocol(skb, htons(ETH_P_TIPC));
160 ub = rcu_dereference_rtnl(b->media_ptr);
161 if (!ub) {
162 err = -ENODEV;
163 goto tx_error;
164 }
165 if (dst->proto == htons(ETH_P_IP)) { 164 if (dst->proto == htons(ETH_P_IP)) {
166 struct flowi4 fl = { 165 struct flowi4 fl = {
167 .daddr = dst->ipv4.s_addr, 166 .daddr = dst->ipv4.s_addr,
@@ -207,29 +206,178 @@ tx_error:
207 return err; 206 return err;
208} 207}
209 208
209static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
210 struct tipc_bearer *b,
211 struct tipc_media_addr *addr)
212{
213 struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
214 struct udp_media_addr *dst = (struct udp_media_addr *)&addr->value;
215 struct udp_replicast *rcast;
216 struct udp_bearer *ub;
217 int err = 0;
218
219 if (skb_headroom(skb) < UDP_MIN_HEADROOM) {
220 err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC);
221 if (err)
222 goto out;
223 }
224
225 skb_set_inner_protocol(skb, htons(ETH_P_TIPC));
226 ub = rcu_dereference_rtnl(b->media_ptr);
227 if (!ub) {
228 err = -ENODEV;
229 goto out;
230 }
231
232 if (!addr->broadcast || list_empty(&ub->rcast.list))
233 return tipc_udp_xmit(net, skb, ub, src, dst);
234
235 /* Replicast, send an skb to each configured IP address */
236 list_for_each_entry_rcu(rcast, &ub->rcast.list, list) {
237 struct sk_buff *_skb;
238
239 _skb = pskb_copy(skb, GFP_ATOMIC);
240 if (!_skb) {
241 err = -ENOMEM;
242 goto out;
243 }
244
245 err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr);
246 if (err) {
247 kfree_skb(_skb);
248 goto out;
249 }
250 }
251 err = 0;
252out:
253 kfree_skb(skb);
254 return err;
255}
256
257static bool tipc_udp_is_known_peer(struct tipc_bearer *b,
258 struct udp_media_addr *addr)
259{
260 struct udp_replicast *rcast, *tmp;
261 struct udp_bearer *ub;
262
263 ub = rcu_dereference_rtnl(b->media_ptr);
264 if (!ub) {
265 pr_err_ratelimited("UDP bearer instance not found\n");
266 return false;
267 }
268
269 list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
270 if (!memcmp(&rcast->addr, addr, sizeof(struct udp_media_addr)))
271 return true;
272 }
273
274 return false;
275}
276
277static int tipc_udp_rcast_add(struct tipc_bearer *b,
278 struct udp_media_addr *addr)
279{
280 struct udp_replicast *rcast;
281 struct udp_bearer *ub;
282
283 ub = rcu_dereference_rtnl(b->media_ptr);
284 if (!ub)
285 return -ENODEV;
286
287 rcast = kmalloc(sizeof(*rcast), GFP_ATOMIC);
288 if (!rcast)
289 return -ENOMEM;
290
291 memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr));
292
293 if (ntohs(addr->proto) == ETH_P_IP)
294 pr_info("New replicast peer: %pI4\n", &rcast->addr.ipv4);
295#if IS_ENABLED(CONFIG_IPV6)
296 else if (ntohs(addr->proto) == ETH_P_IPV6)
297 pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6);
298#endif
299
300 list_add_rcu(&rcast->list, &ub->rcast.list);
301 return 0;
302}
303
304static int tipc_udp_rcast_disc(struct tipc_bearer *b, struct sk_buff *skb)
305{
306 struct udp_media_addr src = {0};
307 struct udp_media_addr *dst;
308
309 dst = (struct udp_media_addr *)&b->bcast_addr.value;
310 if (tipc_udp_is_mcast_addr(dst))
311 return 0;
312
313 src.port = udp_hdr(skb)->source;
314
315 if (ip_hdr(skb)->version == 4) {
316 struct iphdr *iphdr = ip_hdr(skb);
317
318 src.proto = htons(ETH_P_IP);
319 src.ipv4.s_addr = iphdr->saddr;
320 if (ipv4_is_multicast(iphdr->daddr))
321 return 0;
322#if IS_ENABLED(CONFIG_IPV6)
323 } else if (ip_hdr(skb)->version == 6) {
324 struct ipv6hdr *iphdr = ipv6_hdr(skb);
325
326 src.proto = htons(ETH_P_IPV6);
327 src.ipv6 = iphdr->saddr;
328 if (ipv6_addr_is_multicast(&iphdr->daddr))
329 return 0;
330#endif
331 } else {
332 return 0;
333 }
334
335 if (likely(tipc_udp_is_known_peer(b, &src)))
336 return 0;
337
338 return tipc_udp_rcast_add(b, &src);
339}
340
210 341 /* tipc_udp_recv - read data from bearer socket */
211 342 static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb)
212 343 {
213 344 struct udp_bearer *ub;
214 345 struct tipc_bearer *b;
346 struct tipc_msg *hdr;
347 int err;
215 348
216 349 ub = rcu_dereference_sk_user_data(sk);
217 350 if (!ub) {
218 351 pr_err_ratelimited("Failed to get UDP bearer reference");
219 kfree_skb(skb);
352 goto out;
220 return 0;
221 353 }
222
223 354 skb_pull(skb, sizeof(struct udphdr));
355 hdr = buf_msg(skb);
356
224 357 rcu_read_lock();
225 358 b = rcu_dereference_rtnl(ub->bearer);
359 if (!b)
360 goto rcu_out;
226 361
227 if (b) {
362 if (b && test_bit(0, &b->up)) {
228 363 tipc_rcv(sock_net(sk), skb, b);
229 364 rcu_read_unlock();
230 365 return 0;
231 366 }
367
368 if (unlikely(msg_user(hdr) == LINK_CONFIG)) {
369 err = tipc_udp_rcast_disc(b, skb);
370 if (err)
371 goto rcu_out;
372 }
373
374 tipc_rcv(sock_net(sk), skb, b);
232 375 rcu_read_unlock();
376 return 0;
377
378rcu_out:
379 rcu_read_unlock();
380out:
233 381 kfree_skb(skb);
234 382 return 0;
235 383 }
@@ -241,15 +389,11 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote)
241 389 struct sock *sk = ub->ubsock->sk;
242 390
243 391 if (ntohs(remote->proto) == ETH_P_IP) {
244 if (!ipv4_is_multicast(remote->ipv4.s_addr))
245 return 0;
246 392 mreqn.imr_multiaddr = remote->ipv4;
247 393 mreqn.imr_ifindex = ub->ifindex;
248 394 err = ip_mc_join_group(sk, &mreqn);
249 395 #if IS_ENABLED(CONFIG_IPV6)
250 396 } else {
251 if (!ipv6_addr_is_multicast(&remote->ipv6))
252 return 0;
253 397 err = ipv6_stub->ipv6_sock_mc_join(sk, ub->ifindex,
254 398 &remote->ipv6);
255 399 #endif
@@ -257,75 +401,236 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote)
257 401 return err;
258 402 }
259 403
260 /**
261 * parse_options - build local/remote addresses from configuration
404 static int __tipc_nl_add_udp_addr(struct sk_buff *skb,
405 struct udp_media_addr *addr, int nla_t)
262 * @attrs: netlink config data
263 * @ub: UDP bearer instance
264 * @local: local bearer IP address/port
265 * @remote: peer or multicast IP/port
266 */
267static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub,
268 struct udp_media_addr *local,
269 struct udp_media_addr *remote)
270 406 {
271 struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
272 struct sockaddr_storage sa_local, sa_remote;
273
274 if (!attrs[TIPC_NLA_BEARER_UDP_OPTS])
275 goto err;
276 if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX,
277 attrs[TIPC_NLA_BEARER_UDP_OPTS],
278 tipc_nl_udp_policy))
279 goto err;
280 if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) {
281 nla_memcpy(&sa_local, opts[TIPC_NLA_UDP_LOCAL],
282 sizeof(sa_local));
283 nla_memcpy(&sa_remote, opts[TIPC_NLA_UDP_REMOTE],
284 sizeof(sa_remote));
407 if (ntohs(addr->proto) == ETH_P_IP) {
408 struct sockaddr_in ip4;
409
410 memset(&ip4, 0, sizeof(ip4));
411 ip4.sin_family = AF_INET;
412 ip4.sin_port = addr->port;
413 ip4.sin_addr.s_addr = addr->ipv4.s_addr;
414 if (nla_put(skb, nla_t, sizeof(ip4), &ip4))
415 return -EMSGSIZE;
416
417 #if IS_ENABLED(CONFIG_IPV6)
418 } else if (ntohs(addr->proto) == ETH_P_IPV6) {
419 struct sockaddr_in6 ip6;
420
421 memset(&ip6, 0, sizeof(ip6));
422 ip6.sin6_family = AF_INET6;
423 ip6.sin6_port = addr->port;
424 memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr));
425 if (nla_put(skb, nla_t, sizeof(ip6), &ip6))
426 return -EMSGSIZE;
427#endif
428 }
429
430 return 0;
431}
432
433int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb)
434{
435 u32 bid = cb->args[0];
436 u32 skip_cnt = cb->args[1];
437 u32 portid = NETLINK_CB(cb->skb).portid;
438 struct udp_replicast *rcast, *tmp;
439 struct tipc_bearer *b;
440 struct udp_bearer *ub;
441 void *hdr;
442 int err;
443 int i;
444
445 if (!bid && !skip_cnt) {
446 struct net *net = sock_net(skb->sk);
447 struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1];
448 struct nlattr **attrs;
449 char *bname;
450
451 err = tipc_nlmsg_parse(cb->nlh, &attrs);
452 if (err)
453 return err;
454
455 if (!attrs[TIPC_NLA_BEARER])
456 return -EINVAL;
457
458 err = nla_parse_nested(battrs, TIPC_NLA_BEARER_MAX,
459 attrs[TIPC_NLA_BEARER],
460 tipc_nl_bearer_policy);
461 if (err)
462 return err;
463
464 if (!battrs[TIPC_NLA_BEARER_NAME])
465 return -EINVAL;
466
467 bname = nla_data(battrs[TIPC_NLA_BEARER_NAME]);
468
469 rtnl_lock();
470 b = tipc_bearer_find(net, bname);
471 if (!b) {
472 rtnl_unlock();
473 return -EINVAL;
474 }
475 bid = b->identity;
285 476 } else {
286 err:
287 pr_err("Invalid UDP bearer configuration");
477 struct net *net = sock_net(skb->sk);
478 struct tipc_net *tn = net_generic(net, tipc_net_id);
479
480 rtnl_lock();
481 b = rtnl_dereference(tn->bearer_list[bid]);
482 if (!b) {
483 rtnl_unlock();
484 return -EINVAL;
485 }
486 }
487
488 ub = rcu_dereference_rtnl(b->media_ptr);
489 if (!ub) {
490 rtnl_unlock();
288 491 return -EINVAL;
289 492 }
290 if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET) { 493
291 struct sockaddr_in *ip4; 494 i = 0;
292 495 list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
293 ip4 = (struct sockaddr_in *)&sa_local; 496 if (i < skip_cnt)
294 local->proto = htons(ETH_P_IP); 497 goto count;
295 local->port = ip4->sin_port; 498
296 local->ipv4.s_addr = ip4->sin_addr.s_addr; 499 hdr = genlmsg_put(skb, portid, cb->nlh->nlmsg_seq,
297 500 &tipc_genl_family, NLM_F_MULTI,
298 ip4 = (struct sockaddr_in *)&sa_remote; 501 TIPC_NL_BEARER_GET);
299 remote->proto = htons(ETH_P_IP); 502 if (!hdr)
300 remote->port = ip4->sin_port; 503 goto done;
301 remote->ipv4.s_addr = ip4->sin_addr.s_addr; 504
505 err = __tipc_nl_add_udp_addr(skb, &rcast->addr,
506 TIPC_NLA_UDP_REMOTE);
507 if (err) {
508 genlmsg_cancel(skb, hdr);
509 goto done;
510 }
511 genlmsg_end(skb, hdr);
512count:
513 i++;
514 }
515done:
516 rtnl_unlock();
517 cb->args[0] = bid;
518 cb->args[1] = i;
519
520 return skb->len;
521}
522
523int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b)
524{
525 struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
526 struct udp_media_addr *dst;
527 struct udp_bearer *ub;
528 struct nlattr *nest;
529
530 ub = rcu_dereference_rtnl(b->media_ptr);
531 if (!ub)
532 return -ENODEV;
533
534 nest = nla_nest_start(msg->skb, TIPC_NLA_BEARER_UDP_OPTS);
535 if (!nest)
536 goto msg_full;
537
538 if (__tipc_nl_add_udp_addr(msg->skb, src, TIPC_NLA_UDP_LOCAL))
539 goto msg_full;
540
541 dst = (struct udp_media_addr *)&b->bcast_addr.value;
542 if (__tipc_nl_add_udp_addr(msg->skb, dst, TIPC_NLA_UDP_REMOTE))
543 goto msg_full;
544
545 if (!list_empty(&ub->rcast.list)) {
546 if (nla_put_flag(msg->skb, TIPC_NLA_UDP_MULTI_REMOTEIP))
547 goto msg_full;
548 }
549
550 nla_nest_end(msg->skb, nest);
551 return 0;
552msg_full:
553 nla_nest_cancel(msg->skb, nest);
554 return -EMSGSIZE;
555}
556
557/**
558 * tipc_parse_udp_addr - build udp media address from netlink data
559 * @nlattr: netlink attribute containing sockaddr storage aligned address
560 * @addr: tipc media address to fill with address, port and protocol type
561 * @scope_id: IPv6 scope id pointer, not NULL indicates it's required
562 */
563
564static int tipc_parse_udp_addr(struct nlattr *nla, struct udp_media_addr *addr,
565 u32 *scope_id)
566{
567 struct sockaddr_storage sa;
568
569 nla_memcpy(&sa, nla, sizeof(sa));
570 if (sa.ss_family == AF_INET) {
571 struct sockaddr_in *ip4 = (struct sockaddr_in *)&sa;
572
573 addr->proto = htons(ETH_P_IP);
574 addr->port = ip4->sin_port;
575 addr->ipv4.s_addr = ip4->sin_addr.s_addr;
302 576 return 0;
303 577
304 578 #if IS_ENABLED(CONFIG_IPV6)
305 } else if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET6) { 579 } else if (sa.ss_family == AF_INET6) {
306 int atype; 580 struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)&sa;
307 struct sockaddr_in6 *ip6;
308 581
309 ip6 = (struct sockaddr_in6 *)&sa_local; 582 addr->proto = htons(ETH_P_IPV6);
310 atype = ipv6_addr_type(&ip6->sin6_addr); 583 addr->port = ip6->sin6_port;
311 if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id) 584 memcpy(&addr->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
312 return -EINVAL; 585
586 /* Scope ID is only interesting for local addresses */
587 if (scope_id) {
588 int atype;
313 589
314 local->proto = htons(ETH_P_IPV6); 590 atype = ipv6_addr_type(&ip6->sin6_addr);
315 local->port = ip6->sin6_port; 591 if (__ipv6_addr_needs_scope_id(atype) &&
316 memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); 592 !ip6->sin6_scope_id) {
317 ub->ifindex = ip6->sin6_scope_id; 593 return -EINVAL;
594 }
595
596 *scope_id = ip6->sin6_scope_id ? : 0;
597 }
318 598
319 ip6 = (struct sockaddr_in6 *)&sa_remote;
320 remote->proto = htons(ETH_P_IPV6);
321 remote->port = ip6->sin6_port;
322 memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
323 599 return 0;
324 600 #endif
325 601 }
326 602 return -EADDRNOTAVAIL;
327 603 }
328 604
605int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr)
606{
607 int err;
608 struct udp_media_addr addr = {0};
609 struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
610 struct udp_media_addr *dst;
611
612 if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy))
613 return -EINVAL;
614
615 if (!opts[TIPC_NLA_UDP_REMOTE])
616 return -EINVAL;
617
618 err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &addr, NULL);
619 if (err)
620 return err;
621
622 dst = (struct udp_media_addr *)&b->bcast_addr.value;
623 if (tipc_udp_is_mcast_addr(dst)) {
624 pr_err("Can't add remote ip to TIPC UDP multicast bearer\n");
625 return -EINVAL;
626 }
627
628 if (tipc_udp_is_known_peer(b, &addr))
629 return 0;
630
631 return tipc_udp_rcast_add(b, &addr);
632}
633
329/** 634/**
330 * tipc_udp_enable - callback to create a new udp bearer instance 635 * tipc_udp_enable - callback to create a new udp bearer instance
331 * @net: network namespace 636 * @net: network namespace
@@ -340,18 +645,38 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
340 645 {
341 646 int err = -EINVAL;
342 647 struct udp_bearer *ub;
343 struct udp_media_addr *remote;
648 struct udp_media_addr remote = {0};
344 649 struct udp_media_addr local = {0};
345 650 struct udp_port_cfg udp_conf = {0};
346 651 struct udp_tunnel_sock_cfg tuncfg = {NULL};
652 struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
347 653
348 654 ub = kzalloc(sizeof(*ub), GFP_ATOMIC);
349 655 if (!ub)
350 656 return -ENOMEM;
351 657
352 remote = (struct udp_media_addr *)&b->bcast_addr.value;
353 memset(remote, 0, sizeof(struct udp_media_addr));
354 err = parse_options(attrs, ub, &local, remote);
658 INIT_LIST_HEAD(&ub->rcast.list);
659
660 if (!attrs[TIPC_NLA_BEARER_UDP_OPTS])
661 goto err;
662
663 if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX,
664 attrs[TIPC_NLA_BEARER_UDP_OPTS],
665 tipc_nl_udp_policy))
666 goto err;
667
668 if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) {
669 pr_err("Invalid UDP bearer configuration");
670 err = -EINVAL;
671 goto err;
672 }
673
674 err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_LOCAL], &local,
675 &ub->ifindex);
676 if (err)
677 goto err;
678
679 err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &remote, NULL);
355 680 if (err)
356 681 goto err;
357 682
@@ -396,9 +721,18 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
396 721 tuncfg.encap_destroy = NULL;
397 722 setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg);
398 723
399 err = enable_mcast(ub, remote);
724 /**
725 * The bcast media address port is used for all peers and the ip
726 * is used if it's a multicast address.
727 */
728 memcpy(&b->bcast_addr.value, &remote, sizeof(remote));
729 if (tipc_udp_is_mcast_addr(&remote))
730 err = enable_mcast(ub, &remote);
731 else
732 err = tipc_udp_rcast_add(b, &remote);
400 733 if (err)
401 734 goto err;
735
402 736 return 0;
403 737 err:
404 738 if (ub->ubsock)
@@ -411,6 +745,12 @@ err:
411 745 static void cleanup_bearer(struct work_struct *work)
412 746 {
413 747 struct udp_bearer *ub = container_of(work, struct udp_bearer, work);
748 struct udp_replicast *rcast, *tmp;
749
750 list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
751 list_del_rcu(&rcast->list);
752 kfree_rcu(rcast, rcu);
753 }
414 754
415 755 if (ub->ubsock)
416 756 udp_tunnel_sock_release(ub->ubsock);
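The new tipc_udp_send_msg() above falls back to "replicast" when true multicast is unavailable: it copies the skb once per address on ub->rcast.list and unicasts each copy, freeing the original at the out: label. A minimal user-space sketch of the same idea using plain POSIX UDP sockets; the peer addresses and the port below are placeholders, not values taken from the kernel code.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	/* placeholder replicast table; in TIPC these come from netlink config */
	const char *peers[] = { "192.0.2.10", "192.0.2.11" };
	const char payload[] = "tipc-discovery";
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	for (size_t i = 0; i < sizeof(peers) / sizeof(peers[0]); i++) {
		struct sockaddr_in dst = { .sin_family = AF_INET,
					   .sin_port = htons(6118) }; /* example port */

		if (inet_pton(AF_INET, peers[i], &dst.sin_addr) != 1)
			continue;
		/* one unicast copy of the same datagram per configured peer */
		if (sendto(fd, payload, sizeof(payload), 0,
			   (struct sockaddr *)&dst, sizeof(dst)) < 0)
			perror("sendto");
	}
	close(fd);
	return 0;
}

The kernel side differs mainly in that it clones the skb with pskb_copy() for each peer rather than reusing one buffer, and bails out to the common error path if any copy fails.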
diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h
new file mode 100644
index 000000000000..281bbae87726
--- /dev/null
+++ b/net/tipc/udp_media.h
@@ -0,0 +1,46 @@
1/*
2 * net/tipc/udp_media.h: Include file for UDP bearer media
3 *
4 * Copyright (c) 1996-2006, 2013-2016, Ericsson AB
5 * Copyright (c) 2005, 2010-2011, Wind River Systems
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the names of the copyright holders nor the names of its
17 * contributors may be used to endorse or promote products derived from
18 * this software without specific prior written permission.
19 *
20 * Alternatively, this software may be distributed under the terms of the
21 * GNU General Public License ("GPL") version 2 as published by the Free
22 * Software Foundation.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
28 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
35 */
36
37#ifdef CONFIG_TIPC_MEDIA_UDP
38#ifndef _TIPC_UDP_MEDIA_H
39#define _TIPC_UDP_MEDIA_H
40
41int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr);
42int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b);
43int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb);
44
45#endif
46#endif
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 8309687a56b0..145082e2ba36 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2475,28 +2475,13 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2475 2475 return unix_stream_read_generic(&state);
2476 2476 }
2477 2477
2478static ssize_t skb_unix_socket_splice(struct sock *sk,
2479 struct pipe_inode_info *pipe,
2480 struct splice_pipe_desc *spd)
2481{
2482 int ret;
2483 struct unix_sock *u = unix_sk(sk);
2484
2485 mutex_unlock(&u->iolock);
2486 ret = splice_to_pipe(pipe, spd);
2487 mutex_lock(&u->iolock);
2488
2489 return ret;
2490}
2491
2492 2478 static int unix_stream_splice_actor(struct sk_buff *skb,
2493 2479 int skip, int chunk,
2494 2480 struct unix_stream_read_state *state)
2495 2481 {
2496 2482 return skb_splice_bits(skb, state->socket->sk,
2497 2483 UNIXCB(skb).consumed + skip,
2498 state->pipe, chunk, state->splice_flags,
2499 skb_unix_socket_splice);
2484 state->pipe, chunk, state->splice_flags);
2500 2485 }
2501 2486
2502 2487 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 0f506220a3bd..5497d022fada 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -372,6 +372,7 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
372 372 case NL80211_IFTYPE_AP_VLAN:
373 373 case NL80211_IFTYPE_WDS:
374 374 case NL80211_IFTYPE_P2P_DEVICE:
375 case NL80211_IFTYPE_NAN:
375 376 break;
376 377 case NL80211_IFTYPE_UNSPECIFIED:
377 378 case NUM_NL80211_IFTYPES:
@@ -946,6 +947,7 @@ cfg80211_get_chan_state(struct wireless_dev *wdev,
946 947 case NL80211_IFTYPE_AP_VLAN:
947 948 case NL80211_IFTYPE_WDS:
948 949 case NL80211_IFTYPE_P2P_DEVICE:
950 case NL80211_IFTYPE_NAN:
949 951 /* these interface types don't really have a channel */
950 952 return;
951 953 case NL80211_IFTYPE_UNSPECIFIED:
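The two hunks above add NL80211_IFTYPE_NAN to switches that appear to enumerate every value of enum nl80211_iftype explicitly rather than hiding behind a default label. A toy illustration of why that style helps (not kernel code; all names are made up): with no default case, the compiler's -Wswitch warning, enabled by -Wall, points at every switch a newly added enumerator leaves unhandled.

#include <stdio.h>

enum iftype { IFTYPE_STATION, IFTYPE_AP, IFTYPE_NAN };

static const char *iftype_name(enum iftype t)
{
	switch (t) {            /* no default: -Wswitch flags missing cases */
	case IFTYPE_STATION:
		return "station";
	case IFTYPE_AP:
		return "ap";
	case IFTYPE_NAN:        /* delete this case and gcc -Wall warns here */
		return "nan";
	}
	return "unknown";
}

int main(void)
{
	printf("%s\n", iftype_name(IFTYPE_NAN));
	return 0;
}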
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 7645e97362c0..8201e6d7449e 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -225,6 +225,23 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
225 225 }
226 226 }
227 227
228void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
229 struct wireless_dev *wdev)
230{
231 ASSERT_RTNL();
232
233 if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN))
234 return;
235
236 if (!wdev->nan_started)
237 return;
238
239 rdev_stop_nan(rdev, wdev);
240 wdev->nan_started = false;
241
242 rdev->opencount--;
243}
244
228 245 void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
229 246 {
230 247 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
@@ -242,6 +259,9 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
242 259 case NL80211_IFTYPE_P2P_DEVICE:
243 260 cfg80211_stop_p2p_device(rdev, wdev);
244 261 break;
262 case NL80211_IFTYPE_NAN:
263 cfg80211_stop_nan(rdev, wdev);
264 break;
245 265 default:
246 266 break;
247 267 }
@@ -537,6 +557,11 @@ static int wiphy_verify_combinations(struct wiphy *wiphy)
537 557 c->limits[j].max > 1))
538 558 return -EINVAL;
539 559
560 /* Only a single NAN can be allowed */
561 if (WARN_ON(types & BIT(NL80211_IFTYPE_NAN) &&
562 c->limits[j].max > 1))
563 return -EINVAL;
564
540 565 cnt += c->limits[j].max;
541 566 /*
542 567 * Don't advertise an unsupported type
@@ -579,6 +604,11 @@ int wiphy_register(struct wiphy *wiphy)
579 604 !rdev->ops->tdls_cancel_channel_switch)))
580 605 return -EINVAL;
581 606
607 if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) &&
608 (!rdev->ops->start_nan || !rdev->ops->stop_nan ||
609 !rdev->ops->add_nan_func || !rdev->ops->del_nan_func)))
610 return -EINVAL;
611
582 612 /*
583 613 * if a wiphy has unsupported modes for regulatory channel enforcement,
584 614 * opt-out of enforcement checking
@@ -589,6 +619,7 @@ int wiphy_register(struct wiphy *wiphy)
589 619 BIT(NL80211_IFTYPE_P2P_GO) |
590 620 BIT(NL80211_IFTYPE_ADHOC) |
591 621 BIT(NL80211_IFTYPE_P2P_DEVICE) |
622 BIT(NL80211_IFTYPE_NAN) |
592 623 BIT(NL80211_IFTYPE_AP_VLAN) |
593 624 BIT(NL80211_IFTYPE_MONITOR)))
594 625 wiphy->regulatory_flags |= REGULATORY_IGNORE_STALE_KICKOFF;
@@ -906,6 +937,8 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev)
906 if (WARN_ON(wdev->netdev)) 937 if (WARN_ON(wdev->netdev))
907 return; 938 return;
908 939
940 nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE);
941
909 list_del_rcu(&wdev->list); 942 list_del_rcu(&wdev->list);
910 rdev->devlist_generation++; 943 rdev->devlist_generation++;
911 944
@@ -914,6 +947,9 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev)
914 cfg80211_mlme_purge_registrations(wdev); 947 cfg80211_mlme_purge_registrations(wdev);
915 cfg80211_stop_p2p_device(rdev, wdev); 948 cfg80211_stop_p2p_device(rdev, wdev);
916 break; 949 break;
950 case NL80211_IFTYPE_NAN:
951 cfg80211_stop_nan(rdev, wdev);
952 break;
917 default: 953 default:
918 WARN_ON_ONCE(1); 954 WARN_ON_ONCE(1);
919 break; 955 break;
@@ -977,6 +1013,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev,
977 /* must be handled by mac80211/driver, has no APIs */ 1013 /* must be handled by mac80211/driver, has no APIs */
978 break; 1014 break;
979 case NL80211_IFTYPE_P2P_DEVICE: 1015 case NL80211_IFTYPE_P2P_DEVICE:
1016 case NL80211_IFTYPE_NAN:
980 /* cannot happen, has no netdev */ 1017 /* cannot happen, has no netdev */
981 break; 1018 break;
982 case NL80211_IFTYPE_AP_VLAN: 1019 case NL80211_IFTYPE_AP_VLAN:
@@ -1079,6 +1116,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
1079 wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || 1116 wdev->iftype == NL80211_IFTYPE_P2P_CLIENT ||
1080 wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) 1117 wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
1081 dev->priv_flags |= IFF_DONT_BRIDGE; 1118 dev->priv_flags |= IFF_DONT_BRIDGE;
1119
1120 nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
1082 break; 1121 break;
1083 case NETDEV_GOING_DOWN: 1122 case NETDEV_GOING_DOWN:
1084 cfg80211_leave(rdev, wdev); 1123 cfg80211_leave(rdev, wdev);
@@ -1157,6 +1196,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
1157 * remove and clean it up. 1196 * remove and clean it up.
1158 */ 1197 */
1159 if (!list_empty(&wdev->list)) { 1198 if (!list_empty(&wdev->list)) {
1199 nl80211_notify_iface(rdev, wdev,
1200 NL80211_CMD_DEL_INTERFACE);
1160 sysfs_remove_link(&dev->dev.kobj, "phy80211"); 1201 sysfs_remove_link(&dev->dev.kobj, "phy80211");
1161 list_del_rcu(&wdev->list); 1202 list_del_rcu(&wdev->list);
1162 rdev->devlist_generation++; 1203 rdev->devlist_generation++;
@@ -1246,7 +1287,7 @@ static int __init cfg80211_init(void)
1246 if (err) 1287 if (err)
1247 goto out_fail_reg; 1288 goto out_fail_reg;
1248 1289
1249 cfg80211_wq = create_singlethread_workqueue("cfg80211"); 1290 cfg80211_wq = alloc_ordered_workqueue("cfg80211", WQ_MEM_RECLAIM);
1250 if (!cfg80211_wq) { 1291 if (!cfg80211_wq) {
1251 err = -ENOMEM; 1292 err = -ENOMEM;
1252 goto out_fail_wq; 1293 goto out_fail_wq;
diff --git a/net/wireless/core.h b/net/wireless/core.h
index eee91443924d..08d2e948c9ad 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -249,9 +249,9 @@ struct cfg80211_event {
249 249 };
250 250
251 251 struct cfg80211_cached_keys {
252 struct key_params params[6];
253 u8 data[6][WLAN_MAX_KEY_LEN];
254 int def, defmgmt;
252 struct key_params params[CFG80211_MAX_WEP_KEYS];
253 u8 data[CFG80211_MAX_WEP_KEYS][WLAN_KEY_LEN_WEP104];
254 int def;
255 255 };
256 256
257 257 enum cfg80211_chan_mode {
@@ -488,6 +488,9 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev,
488 488 void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
489 489 struct wireless_dev *wdev);
490 490
491void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
492 struct wireless_dev *wdev);
493
491 494 #define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10
492 495
493 496 #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index 4a4dda53bdf1..364f900a3dc4 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -43,7 +43,8 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
43 43 cfg80211_hold_bss(bss_from_pub(bss));
44 44 wdev->current_bss = bss_from_pub(bss);
45 45
46 cfg80211_upload_connect_keys(wdev);
46 if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP))
47 cfg80211_upload_connect_keys(wdev);
47 48
48 49 nl80211_send_ibss_bssid(wiphy_to_rdev(wdev->wiphy), dev, bssid,
49 50 GFP_KERNEL);
@@ -114,6 +115,9 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
114 } 115 }
115 } 116 }
116 117
118 if (WARN_ON(connkeys && connkeys->def < 0))
119 return -EINVAL;
120
117 if (WARN_ON(wdev->connect_keys)) 121 if (WARN_ON(wdev->connect_keys))
118 kzfree(wdev->connect_keys); 122 kzfree(wdev->connect_keys);
119 wdev->connect_keys = connkeys; 123 wdev->connect_keys = connkeys;
@@ -284,18 +288,16 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
284 if (!netif_running(wdev->netdev)) 288 if (!netif_running(wdev->netdev))
285 return 0; 289 return 0;
286 290
287 if (wdev->wext.keys) { 291 if (wdev->wext.keys)
288 wdev->wext.keys->def = wdev->wext.default_key; 292 wdev->wext.keys->def = wdev->wext.default_key;
289 wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
290 }
291 293
292 wdev->wext.ibss.privacy = wdev->wext.default_key != -1; 294 wdev->wext.ibss.privacy = wdev->wext.default_key != -1;
293 295
294 if (wdev->wext.keys) { 296 if (wdev->wext.keys && wdev->wext.keys->def != -1) {
295 ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); 297 ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL);
296 if (!ck) 298 if (!ck)
297 return -ENOMEM; 299 return -ENOMEM;
298 for (i = 0; i < 6; i++) 300 for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++)
299 ck->params[i].key = ck->data[i]; 301 ck->params[i].key = ck->data[i];
300 } 302 }
301 err = __cfg80211_join_ibss(rdev, wdev->netdev, 303 err = __cfg80211_join_ibss(rdev, wdev->netdev,
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index c284d883c349..cbb48e26a871 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -222,7 +222,7 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
222 222 ASSERT_WDEV_LOCK(wdev);
223 223
224 224 if (auth_type == NL80211_AUTHTYPE_SHARED_KEY)
225 if (!key || !key_len || key_idx < 0 || key_idx > 4)
225 if (!key || !key_len || key_idx < 0 || key_idx > 3)
226 226 return -EINVAL;
227 227
228 228 if (wdev->current_bss &&
@@ -634,6 +634,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
634 634 * fall through, P2P device only supports
635 635 * public action frames
636 636 */
637 case NL80211_IFTYPE_NAN:
637 638 default:
638 639 err = -EOPNOTSUPP;
639 640 break;
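The first mlme.c hunk tightens the shared-key index check from key_idx > 4 to key_idx > 3. WEP carries the key id in a two-bit field, so only four default keys exist, indexed 0 through 3; the old bound let an out-of-range index of 4 slip through. A stand-alone illustration of the corrected check (hypothetical helper, not the kernel function):

#include <stdbool.h>
#include <stdio.h>

/* WEP default keys are indexed 0..3 (two-bit key id field in the frame) */
static bool wep_key_idx_valid(int key_idx)
{
	return key_idx >= 0 && key_idx <= 3;
}

int main(void)
{
	for (int idx = -1; idx <= 4; idx++)
		printf("key_idx %d: %s\n", idx,
		       wep_key_idx_valid(idx) ? "ok" : "rejected");
	return 0;
}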
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 4809f4d2cdcc..c510810f0b7c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -56,6 +56,7 @@ enum nl80211_multicast_groups {
56 56 NL80211_MCGRP_REGULATORY,
57 57 NL80211_MCGRP_MLME,
58 58 NL80211_MCGRP_VENDOR,
59 NL80211_MCGRP_NAN,
59 60 NL80211_MCGRP_TESTMODE /* keep last - ifdef! */
60 61 };
61 62
@@ -65,6 +66,7 @@ static const struct genl_multicast_group nl80211_mcgrps[] = {
65 66 [NL80211_MCGRP_REGULATORY] = { .name = NL80211_MULTICAST_GROUP_REG },
66 67 [NL80211_MCGRP_MLME] = { .name = NL80211_MULTICAST_GROUP_MLME },
67 68 [NL80211_MCGRP_VENDOR] = { .name = NL80211_MULTICAST_GROUP_VENDOR },
69 [NL80211_MCGRP_NAN] = { .name = NL80211_MULTICAST_GROUP_NAN },
68 70 #ifdef CONFIG_NL80211_TESTMODE
69 71 [NL80211_MCGRP_TESTMODE] = { .name = NL80211_MULTICAST_GROUP_TESTMODE }
70 72 #endif
@@ -409,6 +411,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
409 .len = VHT_MUMIMO_GROUPS_DATA_LEN 411 .len = VHT_MUMIMO_GROUPS_DATA_LEN
410 }, 412 },
411 [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, 413 [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
414 [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 },
415 [NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 },
416 [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
412}; 417};
413 418
414/* policy for the key attributes */ 419/* policy for the key attributes */
@@ -502,6 +507,39 @@ nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = {
502 }, 507 },
503}; 508};
504 509
510/* policy for NAN function attributes */
511static const struct nla_policy
512nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = {
513 [NL80211_NAN_FUNC_TYPE] = { .type = NLA_U8 },
514 [NL80211_NAN_FUNC_SERVICE_ID] = { .type = NLA_BINARY,
515 .len = NL80211_NAN_FUNC_SERVICE_ID_LEN },
516 [NL80211_NAN_FUNC_PUBLISH_TYPE] = { .type = NLA_U8 },
517 [NL80211_NAN_FUNC_PUBLISH_BCAST] = { .type = NLA_FLAG },
518 [NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE] = { .type = NLA_FLAG },
519 [NL80211_NAN_FUNC_FOLLOW_UP_ID] = { .type = NLA_U8 },
520 [NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] = { .type = NLA_U8 },
521 [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = { .len = ETH_ALEN },
522 [NL80211_NAN_FUNC_CLOSE_RANGE] = { .type = NLA_FLAG },
523 [NL80211_NAN_FUNC_TTL] = { .type = NLA_U32 },
524 [NL80211_NAN_FUNC_SERVICE_INFO] = { .type = NLA_BINARY,
525 .len = NL80211_NAN_FUNC_SERVICE_SPEC_INFO_MAX_LEN },
526 [NL80211_NAN_FUNC_SRF] = { .type = NLA_NESTED },
527 [NL80211_NAN_FUNC_RX_MATCH_FILTER] = { .type = NLA_NESTED },
528 [NL80211_NAN_FUNC_TX_MATCH_FILTER] = { .type = NLA_NESTED },
529 [NL80211_NAN_FUNC_INSTANCE_ID] = { .type = NLA_U8 },
530 [NL80211_NAN_FUNC_TERM_REASON] = { .type = NLA_U8 },
531};
532
533/* policy for Service Response Filter attributes */
534static const struct nla_policy
535nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = {
536 [NL80211_NAN_SRF_INCLUDE] = { .type = NLA_FLAG },
537 [NL80211_NAN_SRF_BF] = { .type = NLA_BINARY,
538 .len = NL80211_NAN_FUNC_SRF_MAX_LEN },
539 [NL80211_NAN_SRF_BF_IDX] = { .type = NLA_U8 },
540 [NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED },
541};
542
505static int nl80211_prepare_wdev_dump(struct sk_buff *skb, 543static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
506 struct netlink_callback *cb, 544 struct netlink_callback *cb,
507 struct cfg80211_registered_device **rdev, 545 struct cfg80211_registered_device **rdev,
@@ -848,13 +886,21 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
848 struct nlattr *key; 886 struct nlattr *key;
849 struct cfg80211_cached_keys *result; 887 struct cfg80211_cached_keys *result;
850 int rem, err, def = 0; 888 int rem, err, def = 0;
889 bool have_key = false;
890
891 nla_for_each_nested(key, keys, rem) {
892 have_key = true;
893 break;
894 }
895
896 if (!have_key)
897 return NULL;
851 898
852 result = kzalloc(sizeof(*result), GFP_KERNEL); 899 result = kzalloc(sizeof(*result), GFP_KERNEL);
853 if (!result) 900 if (!result)
854 return ERR_PTR(-ENOMEM); 901 return ERR_PTR(-ENOMEM);
855 902
856 result->def = -1; 903 result->def = -1;
857 result->defmgmt = -1;
858 904
859 nla_for_each_nested(key, keys, rem) { 905 nla_for_each_nested(key, keys, rem) {
860 memset(&parse, 0, sizeof(parse)); 906 memset(&parse, 0, sizeof(parse));
@@ -866,7 +912,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
866 err = -EINVAL; 912 err = -EINVAL;
867 if (!parse.p.key) 913 if (!parse.p.key)
868 goto error; 914 goto error;
869 if (parse.idx < 0 || parse.idx > 4) 915 if (parse.idx < 0 || parse.idx > 3)
870 goto error; 916 goto error;
871 if (parse.def) { 917 if (parse.def) {
872 if (def) 918 if (def)
@@ -881,16 +927,24 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
881 parse.idx, false, NULL); 927 parse.idx, false, NULL);
882 if (err) 928 if (err)
883 goto error; 929 goto error;
930 if (parse.p.cipher != WLAN_CIPHER_SUITE_WEP40 &&
931 parse.p.cipher != WLAN_CIPHER_SUITE_WEP104) {
932 err = -EINVAL;
933 goto error;
934 }
884 result->params[parse.idx].cipher = parse.p.cipher; 935 result->params[parse.idx].cipher = parse.p.cipher;
885 result->params[parse.idx].key_len = parse.p.key_len; 936 result->params[parse.idx].key_len = parse.p.key_len;
886 result->params[parse.idx].key = result->data[parse.idx]; 937 result->params[parse.idx].key = result->data[parse.idx];
887 memcpy(result->data[parse.idx], parse.p.key, parse.p.key_len); 938 memcpy(result->data[parse.idx], parse.p.key, parse.p.key_len);
888 939
889 if (parse.p.cipher == WLAN_CIPHER_SUITE_WEP40 || 940 /* must be WEP key if we got here */
890 parse.p.cipher == WLAN_CIPHER_SUITE_WEP104) { 941 if (no_ht)
891 if (no_ht) 942 *no_ht = true;
892 *no_ht = true; 943 }
893 } 944
945 if (result->def < 0) {
946 err = -EINVAL;
947 goto error;
894 } 948 }
895 949
896 return result; 950 return result;
@@ -918,6 +972,7 @@ static int nl80211_key_allowed(struct wireless_dev *wdev)
918 case NL80211_IFTYPE_UNSPECIFIED: 972 case NL80211_IFTYPE_UNSPECIFIED:
919 case NL80211_IFTYPE_OCB: 973 case NL80211_IFTYPE_OCB:
920 case NL80211_IFTYPE_MONITOR: 974 case NL80211_IFTYPE_MONITOR:
975 case NL80211_IFTYPE_NAN:
921 case NL80211_IFTYPE_P2P_DEVICE: 976 case NL80211_IFTYPE_P2P_DEVICE:
922 case NL80211_IFTYPE_WDS: 977 case NL80211_IFTYPE_WDS:
923 case NUM_NL80211_IFTYPES: 978 case NUM_NL80211_IFTYPES:
@@ -2525,10 +2580,35 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
2525 int if_idx = 0; 2580 int if_idx = 0;
2526 int wp_start = cb->args[0]; 2581 int wp_start = cb->args[0];
2527 int if_start = cb->args[1]; 2582 int if_start = cb->args[1];
2583 int filter_wiphy = -1;
2528 struct cfg80211_registered_device *rdev; 2584 struct cfg80211_registered_device *rdev;
2529 struct wireless_dev *wdev; 2585 struct wireless_dev *wdev;
2530 2586
2531 rtnl_lock(); 2587 rtnl_lock();
2588 if (!cb->args[2]) {
2589 struct nl80211_dump_wiphy_state state = {
2590 .filter_wiphy = -1,
2591 };
2592 int ret;
2593
2594 ret = nl80211_dump_wiphy_parse(skb, cb, &state);
2595 if (ret)
2596 return ret;
2597
2598 filter_wiphy = state.filter_wiphy;
2599
2600 /*
2601 * if filtering, set cb->args[2] to +1 since 0 is the default
2602 * value needed to determine that parsing is necessary.
2603 */
2604 if (filter_wiphy >= 0)
2605 cb->args[2] = filter_wiphy + 1;
2606 else
2607 cb->args[2] = -1;
2608 } else if (cb->args[2] > 0) {
2609 filter_wiphy = cb->args[2] - 1;
2610 }
2611
2532 list_for_each_entry(rdev, &cfg80211_rdev_list, list) { 2612 list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
2533 if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk))) 2613 if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk)))
2534 continue; 2614 continue;
@@ -2536,6 +2616,10 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
2536 wp_idx++; 2616 wp_idx++;
2537 continue; 2617 continue;
2538 } 2618 }
2619
2620 if (filter_wiphy >= 0 && filter_wiphy != rdev->wiphy_idx)
2621 continue;
2622
2539 if_idx = 0; 2623 if_idx = 0;
2540 2624
2541 list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { 2625 list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
@@ -2751,7 +2835,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
2751 struct cfg80211_registered_device *rdev = info->user_ptr[0]; 2835 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2752 struct vif_params params; 2836 struct vif_params params;
2753 struct wireless_dev *wdev; 2837 struct wireless_dev *wdev;
2754 struct sk_buff *msg, *event; 2838 struct sk_buff *msg;
2755 int err; 2839 int err;
2756 enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED; 2840 enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED;
2757 u32 flags; 2841 u32 flags;
@@ -2774,7 +2858,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
2774 !(rdev->wiphy.interface_modes & (1 << type))) 2858 !(rdev->wiphy.interface_modes & (1 << type)))
2775 return -EOPNOTSUPP; 2859 return -EOPNOTSUPP;
2776 2860
2777 if ((type == NL80211_IFTYPE_P2P_DEVICE || 2861 if ((type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN ||
2778 rdev->wiphy.features & NL80211_FEATURE_MAC_ON_CREATE) && 2862 rdev->wiphy.features & NL80211_FEATURE_MAC_ON_CREATE) &&
2779 info->attrs[NL80211_ATTR_MAC]) { 2863 info->attrs[NL80211_ATTR_MAC]) {
2780 nla_memcpy(params.macaddr, info->attrs[NL80211_ATTR_MAC], 2864 nla_memcpy(params.macaddr, info->attrs[NL80211_ATTR_MAC],
@@ -2830,9 +2914,10 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
2830 wdev->mesh_id_up_len); 2914 wdev->mesh_id_up_len);
2831 wdev_unlock(wdev); 2915 wdev_unlock(wdev);
2832 break; 2916 break;
2917 case NL80211_IFTYPE_NAN:
2833 case NL80211_IFTYPE_P2P_DEVICE: 2918 case NL80211_IFTYPE_P2P_DEVICE:
2834 /* 2919 /*
2835 * P2P Device doesn't have a netdev, so doesn't go 2920 * P2P Device and NAN do not have a netdev, so don't go
2836 * through the netdev notifier and must be added here 2921 * through the netdev notifier and must be added here
2837 */ 2922 */
2838 mutex_init(&wdev->mtx); 2923 mutex_init(&wdev->mtx);
@@ -2855,20 +2940,15 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
2855 return -ENOBUFS; 2940 return -ENOBUFS;
2856 } 2941 }
2857 2942
2858 event = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 2943 /*
2859 if (event) { 2944 * For wdevs which have no associated netdev object (e.g. of type
2860 if (nl80211_send_iface(event, 0, 0, 0, 2945 * NL80211_IFTYPE_P2P_DEVICE), emit the NEW_INTERFACE event here.
2861 rdev, wdev, false) < 0) { 2946 * For all other types, the event will be generated from the
2862 nlmsg_free(event); 2947 * netdev notifier
2863 goto out; 2948 */
2864 } 2949 if (!wdev->netdev)
2865 2950 nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
2866 genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
2867 event, 0, NL80211_MCGRP_CONFIG,
2868 GFP_KERNEL);
2869 }
2870 2951
2871out:
2872 return genlmsg_reply(msg, info); 2952 return genlmsg_reply(msg, info);
2873} 2953}
2874 2954
@@ -2876,18 +2956,10 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
2876{ 2956{
2877 struct cfg80211_registered_device *rdev = info->user_ptr[0]; 2957 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2878 struct wireless_dev *wdev = info->user_ptr[1]; 2958 struct wireless_dev *wdev = info->user_ptr[1];
2879 struct sk_buff *msg;
2880 int status;
2881 2959
2882 if (!rdev->ops->del_virtual_intf) 2960 if (!rdev->ops->del_virtual_intf)
2883 return -EOPNOTSUPP; 2961 return -EOPNOTSUPP;
2884 2962
2885 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
2886 if (msg && nl80211_send_iface(msg, 0, 0, 0, rdev, wdev, true) < 0) {
2887 nlmsg_free(msg);
2888 msg = NULL;
2889 }
2890
2891 /* 2963 /*
2892 * If we remove a wireless device without a netdev then clear 2964 * If we remove a wireless device without a netdev then clear
2893 * user_ptr[1] so that nl80211_post_doit won't dereference it 2965 * user_ptr[1] so that nl80211_post_doit won't dereference it
@@ -2898,15 +2970,7 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
2898 if (!wdev->netdev) 2970 if (!wdev->netdev)
2899 info->user_ptr[1] = NULL; 2971 info->user_ptr[1] = NULL;
2900 2972
2901 status = rdev_del_virtual_intf(rdev, wdev); 2973 return rdev_del_virtual_intf(rdev, wdev);
2902 if (status >= 0 && msg)
2903 genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
2904 msg, 0, NL80211_MCGRP_CONFIG,
2905 GFP_KERNEL);
2906 else
2907 nlmsg_free(msg);
2908
2909 return status;
2910} 2974}
2911 2975
2912static int nl80211_set_noack_map(struct sk_buff *skb, struct genl_info *info) 2976static int nl80211_set_noack_map(struct sk_buff *skb, struct genl_info *info)
@@ -3316,6 +3380,291 @@ static int nl80211_set_mac_acl(struct sk_buff *skb, struct genl_info *info)
3316 return err; 3380 return err;
3317} 3381}
3318 3382
3383static u32 rateset_to_mask(struct ieee80211_supported_band *sband,
3384 u8 *rates, u8 rates_len)
3385{
3386 u8 i;
3387 u32 mask = 0;
3388
3389 for (i = 0; i < rates_len; i++) {
3390 int rate = (rates[i] & 0x7f) * 5;
3391 int ridx;
3392
3393 for (ridx = 0; ridx < sband->n_bitrates; ridx++) {
3394 struct ieee80211_rate *srate =
3395 &sband->bitrates[ridx];
3396 if (rate == srate->bitrate) {
3397 mask |= 1 << ridx;
3398 break;
3399 }
3400 }
3401 if (ridx == sband->n_bitrates)
3402 return 0; /* rate not found */
3403 }
3404
3405 return mask;
3406}
3407
3408static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband,
3409 u8 *rates, u8 rates_len,
3410 u8 mcs[IEEE80211_HT_MCS_MASK_LEN])
3411{
3412 u8 i;
3413
3414 memset(mcs, 0, IEEE80211_HT_MCS_MASK_LEN);
3415
3416 for (i = 0; i < rates_len; i++) {
3417 int ridx, rbit;
3418
3419 ridx = rates[i] / 8;
3420 rbit = BIT(rates[i] % 8);
3421
3422 /* check validity */
3423 if ((ridx < 0) || (ridx >= IEEE80211_HT_MCS_MASK_LEN))
3424 return false;
3425
3426 /* check availability */
3427 if (sband->ht_cap.mcs.rx_mask[ridx] & rbit)
3428 mcs[ridx] |= rbit;
3429 else
3430 return false;
3431 }
3432
3433 return true;
3434}
3435
3436static u16 vht_mcs_map_to_mcs_mask(u8 vht_mcs_map)
3437{
3438 u16 mcs_mask = 0;
3439
3440 switch (vht_mcs_map) {
3441 case IEEE80211_VHT_MCS_NOT_SUPPORTED:
3442 break;
3443 case IEEE80211_VHT_MCS_SUPPORT_0_7:
3444 mcs_mask = 0x00FF;
3445 break;
3446 case IEEE80211_VHT_MCS_SUPPORT_0_8:
3447 mcs_mask = 0x01FF;
3448 break;
3449 case IEEE80211_VHT_MCS_SUPPORT_0_9:
3450 mcs_mask = 0x03FF;
3451 break;
3452 default:
3453 break;
3454 }
3455
3456 return mcs_mask;
3457}
3458
3459static void vht_build_mcs_mask(u16 vht_mcs_map,
3460 u16 vht_mcs_mask[NL80211_VHT_NSS_MAX])
3461{
3462 u8 nss;
3463
3464 for (nss = 0; nss < NL80211_VHT_NSS_MAX; nss++) {
3465 vht_mcs_mask[nss] = vht_mcs_map_to_mcs_mask(vht_mcs_map & 0x03);
3466 vht_mcs_map >>= 2;
3467 }
3468}
3469
3470static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband,
3471 struct nl80211_txrate_vht *txrate,
3472 u16 mcs[NL80211_VHT_NSS_MAX])
3473{
3474 u16 tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
3475 u16 tx_mcs_mask[NL80211_VHT_NSS_MAX] = {};
3476 u8 i;
3477
3478 if (!sband->vht_cap.vht_supported)
3479 return false;
3480
3481 memset(mcs, 0, sizeof(u16) * NL80211_VHT_NSS_MAX);
3482
3483 /* Build vht_mcs_mask from VHT capabilities */
3484 vht_build_mcs_mask(tx_mcs_map, tx_mcs_mask);
3485
3486 for (i = 0; i < NL80211_VHT_NSS_MAX; i++) {
3487 if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i])
3488 mcs[i] = txrate->mcs[i];
3489 else
3490 return false;
3491 }
3492
3493 return true;
3494}
3495
3496static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = {
3497 [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY,
3498 .len = NL80211_MAX_SUPP_RATES },
3499 [NL80211_TXRATE_HT] = { .type = NLA_BINARY,
3500 .len = NL80211_MAX_SUPP_HT_RATES },
3501 [NL80211_TXRATE_VHT] = { .len = sizeof(struct nl80211_txrate_vht)},
3502 [NL80211_TXRATE_GI] = { .type = NLA_U8 },
3503};
3504
3505static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
3506 struct cfg80211_bitrate_mask *mask)
3507{
3508 struct nlattr *tb[NL80211_TXRATE_MAX + 1];
3509 struct cfg80211_registered_device *rdev = info->user_ptr[0];
3510 int rem, i;
3511 struct nlattr *tx_rates;
3512 struct ieee80211_supported_band *sband;
3513 u16 vht_tx_mcs_map;
3514
3515 memset(mask, 0, sizeof(*mask));
3516 /* Default to all rates enabled */
3517 for (i = 0; i < NUM_NL80211_BANDS; i++) {
3518 sband = rdev->wiphy.bands[i];
3519
3520 if (!sband)
3521 continue;
3522
3523 mask->control[i].legacy = (1 << sband->n_bitrates) - 1;
3524 memcpy(mask->control[i].ht_mcs,
3525 sband->ht_cap.mcs.rx_mask,
3526 sizeof(mask->control[i].ht_mcs));
3527
3528 if (!sband->vht_cap.vht_supported)
3529 continue;
3530
3531 vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
3532 vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs);
3533 }
3534
3535 /* if no rates are given set it back to the defaults */
3536 if (!info->attrs[NL80211_ATTR_TX_RATES])
3537 goto out;
3538
3539 /* The nested attribute uses enum nl80211_band as the index. This maps
3540 * directly to the enum nl80211_band values used in cfg80211.
3541 */
3542 BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8);
3543 nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) {
3544 enum nl80211_band band = nla_type(tx_rates);
3545 int err;
3546
3547 if (band < 0 || band >= NUM_NL80211_BANDS)
3548 return -EINVAL;
3549 sband = rdev->wiphy.bands[band];
3550 if (sband == NULL)
3551 return -EINVAL;
3552 err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates),
3553 nla_len(tx_rates), nl80211_txattr_policy);
3554 if (err)
3555 return err;
3556 if (tb[NL80211_TXRATE_LEGACY]) {
3557 mask->control[band].legacy = rateset_to_mask(
3558 sband,
3559 nla_data(tb[NL80211_TXRATE_LEGACY]),
3560 nla_len(tb[NL80211_TXRATE_LEGACY]));
3561 if ((mask->control[band].legacy == 0) &&
3562 nla_len(tb[NL80211_TXRATE_LEGACY]))
3563 return -EINVAL;
3564 }
3565 if (tb[NL80211_TXRATE_HT]) {
3566 if (!ht_rateset_to_mask(
3567 sband,
3568 nla_data(tb[NL80211_TXRATE_HT]),
3569 nla_len(tb[NL80211_TXRATE_HT]),
3570 mask->control[band].ht_mcs))
3571 return -EINVAL;
3572 }
3573 if (tb[NL80211_TXRATE_VHT]) {
3574 if (!vht_set_mcs_mask(
3575 sband,
3576 nla_data(tb[NL80211_TXRATE_VHT]),
3577 mask->control[band].vht_mcs))
3578 return -EINVAL;
3579 }
3580 if (tb[NL80211_TXRATE_GI]) {
3581 mask->control[band].gi =
3582 nla_get_u8(tb[NL80211_TXRATE_GI]);
3583 if (mask->control[band].gi > NL80211_TXRATE_FORCE_LGI)
3584 return -EINVAL;
3585 }
3586
3587 if (mask->control[band].legacy == 0) {
3588 /* don't allow empty legacy rates if HT or VHT
3589 * are not even supported.
3590 */
3591 if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported ||
3592 rdev->wiphy.bands[band]->vht_cap.vht_supported))
3593 return -EINVAL;
3594
3595 for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
3596 if (mask->control[band].ht_mcs[i])
3597 goto out;
3598
3599 for (i = 0; i < NL80211_VHT_NSS_MAX; i++)
3600 if (mask->control[band].vht_mcs[i])
3601 goto out;
3602
3603 /* legacy and mcs rates may not be both empty */
3604 return -EINVAL;
3605 }
3606 }
3607
3608out:
3609 return 0;
3610}
3611
3612static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
3613 enum nl80211_band band,
3614 struct cfg80211_bitrate_mask *beacon_rate)
3615{
3616 u32 count_ht, count_vht, i;
3617 u32 rate = beacon_rate->control[band].legacy;
3618
3619 /* Allow only one rate */
3620 if (hweight32(rate) > 1)
3621 return -EINVAL;
3622
3623 count_ht = 0;
3624 for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) {
3625 if (hweight8(beacon_rate->control[band].ht_mcs[i]) > 1) {
3626 return -EINVAL;
3627 } else if (beacon_rate->control[band].ht_mcs[i]) {
3628 count_ht++;
3629 if (count_ht > 1)
3630 return -EINVAL;
3631 }
3632 if (count_ht && rate)
3633 return -EINVAL;
3634 }
3635
3636 count_vht = 0;
3637 for (i = 0; i < NL80211_VHT_NSS_MAX; i++) {
3638 if (hweight16(beacon_rate->control[band].vht_mcs[i]) > 1) {
3639 return -EINVAL;
3640 } else if (beacon_rate->control[band].vht_mcs[i]) {
3641 count_vht++;
3642 if (count_vht > 1)
3643 return -EINVAL;
3644 }
3645 if (count_vht && rate)
3646 return -EINVAL;
3647 }
3648
3649 if ((count_ht && count_vht) || (!rate && !count_ht && !count_vht))
3650 return -EINVAL;
3651
3652 if (rate &&
3653 !wiphy_ext_feature_isset(&rdev->wiphy,
3654 NL80211_EXT_FEATURE_BEACON_RATE_LEGACY))
3655 return -EINVAL;
3656 if (count_ht &&
3657 !wiphy_ext_feature_isset(&rdev->wiphy,
3658 NL80211_EXT_FEATURE_BEACON_RATE_HT))
3659 return -EINVAL;
3660 if (count_vht &&
3661 !wiphy_ext_feature_isset(&rdev->wiphy,
3662 NL80211_EXT_FEATURE_BEACON_RATE_VHT))
3663 return -EINVAL;
3664
3665 return 0;
3666}
3667
3319static int nl80211_parse_beacon(struct nlattr *attrs[], 3668static int nl80211_parse_beacon(struct nlattr *attrs[],
3320 struct cfg80211_beacon_data *bcn) 3669 struct cfg80211_beacon_data *bcn)
3321{ 3670{
@@ -3545,6 +3894,17 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
3545 wdev->iftype)) 3894 wdev->iftype))
3546 return -EINVAL; 3895 return -EINVAL;
3547 3896
3897 if (info->attrs[NL80211_ATTR_TX_RATES]) {
3898 err = nl80211_parse_tx_bitrate_mask(info, &params.beacon_rate);
3899 if (err)
3900 return err;
3901
3902 err = validate_beacon_tx_rate(rdev, params.chandef.chan->band,
3903 &params.beacon_rate);
3904 if (err)
3905 return err;
3906 }
3907
3548 if (info->attrs[NL80211_ATTR_SMPS_MODE]) { 3908 if (info->attrs[NL80211_ATTR_SMPS_MODE]) {
3549 params.smps_mode = 3909 params.smps_mode =
3550 nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); 3910 nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]);
@@ -5374,6 +5734,18 @@ static int nl80211_check_s32(const struct nlattr *nla, s32 min, s32 max, s32 *ou
5374 return 0; 5734 return 0;
5375} 5735}
5376 5736
5737static int nl80211_check_power_mode(const struct nlattr *nla,
5738 enum nl80211_mesh_power_mode min,
5739 enum nl80211_mesh_power_mode max,
5740 enum nl80211_mesh_power_mode *out)
5741{
5742 u32 val = nla_get_u32(nla);
5743 if (val < min || val > max)
5744 return -EINVAL;
5745 *out = val;
5746 return 0;
5747}
5748
5377static int nl80211_parse_mesh_config(struct genl_info *info, 5749static int nl80211_parse_mesh_config(struct genl_info *info,
5378 struct mesh_config *cfg, 5750 struct mesh_config *cfg,
5379 u32 *mask_out) 5751 u32 *mask_out)
@@ -5518,7 +5890,7 @@ do { \
5518 NL80211_MESH_POWER_ACTIVE, 5890 NL80211_MESH_POWER_ACTIVE,
5519 NL80211_MESH_POWER_MAX, 5891 NL80211_MESH_POWER_MAX,
5520 mask, NL80211_MESHCONF_POWER_MODE, 5892 mask, NL80211_MESHCONF_POWER_MODE,
5521 nl80211_check_u32); 5893 nl80211_check_power_mode);
5522 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, 5894 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration,
5523 0, 65535, mask, 5895 0, 65535, mask,
5524 NL80211_MESHCONF_AWAKE_WINDOW, nl80211_check_u16); 5896 NL80211_MESHCONF_AWAKE_WINDOW, nl80211_check_u16);
@@ -6102,6 +6474,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
6102 6474
6103 wiphy = &rdev->wiphy; 6475 wiphy = &rdev->wiphy;
6104 6476
6477 if (wdev->iftype == NL80211_IFTYPE_NAN)
6478 return -EOPNOTSUPP;
6479
6105 if (!rdev->ops->scan) 6480 if (!rdev->ops->scan)
6106 return -EOPNOTSUPP; 6481 return -EOPNOTSUPP;
6107 6482
@@ -7368,7 +7743,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
7368 (key.p.cipher != WLAN_CIPHER_SUITE_WEP104 || 7743 (key.p.cipher != WLAN_CIPHER_SUITE_WEP104 ||
7369 key.p.key_len != WLAN_KEY_LEN_WEP104)) 7744 key.p.key_len != WLAN_KEY_LEN_WEP104))
7370 return -EINVAL; 7745 return -EINVAL;
7371 if (key.idx > 4) 7746 if (key.idx > 3)
7372 return -EINVAL; 7747 return -EINVAL;
7373 } else { 7748 } else {
7374 key.p.key_len = 0; 7749 key.p.key_len = 0;
@@ -7773,12 +8148,13 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
7773 8148
7774 ibss.beacon_interval = 100; 8149 ibss.beacon_interval = 100;
7775 8150
7776 if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) { 8151 if (info->attrs[NL80211_ATTR_BEACON_INTERVAL])
7777 ibss.beacon_interval = 8152 ibss.beacon_interval =
7778 nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); 8153 nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
7779 if (ibss.beacon_interval < 1 || ibss.beacon_interval > 10000) 8154
7780 return -EINVAL; 8155 err = cfg80211_validate_beacon_int(rdev, ibss.beacon_interval);
7781 } 8156 if (err)
8157 return err;
7782 8158
7783 if (!rdev->ops->join_ibss) 8159 if (!rdev->ops->join_ibss)
7784 return -EOPNOTSUPP; 8160 return -EOPNOTSUPP;
@@ -7985,6 +8361,8 @@ __cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev,
7985 } 8361 }
7986 8362
7987 data = nla_nest_start(skb, attr); 8363 data = nla_nest_start(skb, attr);
8364 if (!data)
8365 goto nla_put_failure;
7988 8366
7989 ((void **)skb->cb)[0] = rdev; 8367 ((void **)skb->cb)[0] = rdev;
7990 ((void **)skb->cb)[1] = hdr; 8368 ((void **)skb->cb)[1] = hdr;
@@ -8602,238 +8980,21 @@ static int nl80211_cancel_remain_on_channel(struct sk_buff *skb,
8602 return rdev_cancel_remain_on_channel(rdev, wdev, cookie); 8980 return rdev_cancel_remain_on_channel(rdev, wdev, cookie);
8603} 8981}
8604 8982
8605static u32 rateset_to_mask(struct ieee80211_supported_band *sband,
8606 u8 *rates, u8 rates_len)
8607{
8608 u8 i;
8609 u32 mask = 0;
8610
8611 for (i = 0; i < rates_len; i++) {
8612 int rate = (rates[i] & 0x7f) * 5;
8613 int ridx;
8614
8615 for (ridx = 0; ridx < sband->n_bitrates; ridx++) {
8616 struct ieee80211_rate *srate =
8617 &sband->bitrates[ridx];
8618 if (rate == srate->bitrate) {
8619 mask |= 1 << ridx;
8620 break;
8621 }
8622 }
8623 if (ridx == sband->n_bitrates)
8624 return 0; /* rate not found */
8625 }
8626
8627 return mask;
8628}
8629
8630static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband,
8631 u8 *rates, u8 rates_len,
8632 u8 mcs[IEEE80211_HT_MCS_MASK_LEN])
8633{
8634 u8 i;
8635
8636 memset(mcs, 0, IEEE80211_HT_MCS_MASK_LEN);
8637
8638 for (i = 0; i < rates_len; i++) {
8639 int ridx, rbit;
8640
8641 ridx = rates[i] / 8;
8642 rbit = BIT(rates[i] % 8);
8643
8644 /* check validity */
8645 if ((ridx < 0) || (ridx >= IEEE80211_HT_MCS_MASK_LEN))
8646 return false;
8647
8648 /* check availability */
8649 if (sband->ht_cap.mcs.rx_mask[ridx] & rbit)
8650 mcs[ridx] |= rbit;
8651 else
8652 return false;
8653 }
8654
8655 return true;
8656}
8657
8658static u16 vht_mcs_map_to_mcs_mask(u8 vht_mcs_map)
8659{
8660 u16 mcs_mask = 0;
8661
8662 switch (vht_mcs_map) {
8663 case IEEE80211_VHT_MCS_NOT_SUPPORTED:
8664 break;
8665 case IEEE80211_VHT_MCS_SUPPORT_0_7:
8666 mcs_mask = 0x00FF;
8667 break;
8668 case IEEE80211_VHT_MCS_SUPPORT_0_8:
8669 mcs_mask = 0x01FF;
8670 break;
8671 case IEEE80211_VHT_MCS_SUPPORT_0_9:
8672 mcs_mask = 0x03FF;
8673 break;
8674 default:
8675 break;
8676 }
8677
8678 return mcs_mask;
8679}
8680
8681static void vht_build_mcs_mask(u16 vht_mcs_map,
8682 u16 vht_mcs_mask[NL80211_VHT_NSS_MAX])
8683{
8684 u8 nss;
8685
8686 for (nss = 0; nss < NL80211_VHT_NSS_MAX; nss++) {
8687 vht_mcs_mask[nss] = vht_mcs_map_to_mcs_mask(vht_mcs_map & 0x03);
8688 vht_mcs_map >>= 2;
8689 }
8690}
8691
8692static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband,
8693 struct nl80211_txrate_vht *txrate,
8694 u16 mcs[NL80211_VHT_NSS_MAX])
8695{
8696 u16 tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
8697 u16 tx_mcs_mask[NL80211_VHT_NSS_MAX] = {};
8698 u8 i;
8699
8700 if (!sband->vht_cap.vht_supported)
8701 return false;
8702
8703 memset(mcs, 0, sizeof(u16) * NL80211_VHT_NSS_MAX);
8704
8705 /* Build vht_mcs_mask from VHT capabilities */
8706 vht_build_mcs_mask(tx_mcs_map, tx_mcs_mask);
8707
8708 for (i = 0; i < NL80211_VHT_NSS_MAX; i++) {
8709 if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i])
8710 mcs[i] = txrate->mcs[i];
8711 else
8712 return false;
8713 }
8714
8715 return true;
8716}
8717
8718static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = {
8719 [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY,
8720 .len = NL80211_MAX_SUPP_RATES },
8721 [NL80211_TXRATE_HT] = { .type = NLA_BINARY,
8722 .len = NL80211_MAX_SUPP_HT_RATES },
8723 [NL80211_TXRATE_VHT] = { .len = sizeof(struct nl80211_txrate_vht)},
8724 [NL80211_TXRATE_GI] = { .type = NLA_U8 },
8725};
8726
8727static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, 8983static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
8728 struct genl_info *info) 8984 struct genl_info *info)
8729{ 8985{
8730 struct nlattr *tb[NL80211_TXRATE_MAX + 1];
8731 struct cfg80211_registered_device *rdev = info->user_ptr[0];
8732 struct cfg80211_bitrate_mask mask; 8986 struct cfg80211_bitrate_mask mask;
8733 int rem, i; 8987 struct cfg80211_registered_device *rdev = info->user_ptr[0];
8734 struct net_device *dev = info->user_ptr[1]; 8988 struct net_device *dev = info->user_ptr[1];
8735 struct nlattr *tx_rates; 8989 int err;
8736 struct ieee80211_supported_band *sband;
8737 u16 vht_tx_mcs_map;
8738 8990
8739 if (!rdev->ops->set_bitrate_mask) 8991 if (!rdev->ops->set_bitrate_mask)
8740 return -EOPNOTSUPP; 8992 return -EOPNOTSUPP;
8741 8993
8742 memset(&mask, 0, sizeof(mask)); 8994 err = nl80211_parse_tx_bitrate_mask(info, &mask);
8743 /* Default to all rates enabled */ 8995 if (err)
8744 for (i = 0; i < NUM_NL80211_BANDS; i++) { 8996 return err;
8745 sband = rdev->wiphy.bands[i];
8746
8747 if (!sband)
8748 continue;
8749
8750 mask.control[i].legacy = (1 << sband->n_bitrates) - 1;
8751 memcpy(mask.control[i].ht_mcs,
8752 sband->ht_cap.mcs.rx_mask,
8753 sizeof(mask.control[i].ht_mcs));
8754
8755 if (!sband->vht_cap.vht_supported)
8756 continue;
8757
8758 vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
8759 vht_build_mcs_mask(vht_tx_mcs_map, mask.control[i].vht_mcs);
8760 }
8761
8762 /* if no rates are given set it back to the defaults */
8763 if (!info->attrs[NL80211_ATTR_TX_RATES])
8764 goto out;
8765
8766 /*
8767 * The nested attribute uses enum nl80211_band as the index. This maps
8768 * directly to the enum nl80211_band values used in cfg80211.
8769 */
8770 BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8);
8771 nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) {
8772 enum nl80211_band band = nla_type(tx_rates);
8773 int err;
8774
8775 if (band < 0 || band >= NUM_NL80211_BANDS)
8776 return -EINVAL;
8777 sband = rdev->wiphy.bands[band];
8778 if (sband == NULL)
8779 return -EINVAL;
8780 err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates),
8781 nla_len(tx_rates), nl80211_txattr_policy);
8782 if (err)
8783 return err;
8784 if (tb[NL80211_TXRATE_LEGACY]) {
8785 mask.control[band].legacy = rateset_to_mask(
8786 sband,
8787 nla_data(tb[NL80211_TXRATE_LEGACY]),
8788 nla_len(tb[NL80211_TXRATE_LEGACY]));
8789 if ((mask.control[band].legacy == 0) &&
8790 nla_len(tb[NL80211_TXRATE_LEGACY]))
8791 return -EINVAL;
8792 }
8793 if (tb[NL80211_TXRATE_HT]) {
8794 if (!ht_rateset_to_mask(
8795 sband,
8796 nla_data(tb[NL80211_TXRATE_HT]),
8797 nla_len(tb[NL80211_TXRATE_HT]),
8798 mask.control[band].ht_mcs))
8799 return -EINVAL;
8800 }
8801 if (tb[NL80211_TXRATE_VHT]) {
8802 if (!vht_set_mcs_mask(
8803 sband,
8804 nla_data(tb[NL80211_TXRATE_VHT]),
8805 mask.control[band].vht_mcs))
8806 return -EINVAL;
8807 }
8808 if (tb[NL80211_TXRATE_GI]) {
8809 mask.control[band].gi =
8810 nla_get_u8(tb[NL80211_TXRATE_GI]);
8811 if (mask.control[band].gi > NL80211_TXRATE_FORCE_LGI)
8812 return -EINVAL;
8813 }
8814
8815 if (mask.control[band].legacy == 0) {
8816 /* don't allow empty legacy rates if HT or VHT
8817 * are not even supported.
8818 */
8819 if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported ||
8820 rdev->wiphy.bands[band]->vht_cap.vht_supported))
8821 return -EINVAL;
8822
8823 for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
8824 if (mask.control[band].ht_mcs[i])
8825 goto out;
8826
8827 for (i = 0; i < NL80211_VHT_NSS_MAX; i++)
8828 if (mask.control[band].vht_mcs[i])
8829 goto out;
8830
8831 /* legacy and mcs rates may not be both empty */
8832 return -EINVAL;
8833 }
8834 }
8835 8997
8836out:
8837 return rdev_set_bitrate_mask(rdev, dev, NULL, &mask); 8998 return rdev_set_bitrate_mask(rdev, dev, NULL, &mask);
8838} 8999}
8839 9000
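
For readers tracing the helpers removed above: the byte array carried in NL80211_TXRATE_LEGACY uses the 802.11 supported-rates encoding, units of 500 kbit/s with the top bit doubling as the basic-rate flag, while ieee80211_rate.bitrate is stored in units of 100 kbit/s — which is all the (rates[i] & 0x7f) * 5 expression converted. The parsing itself now goes through the shared nl80211_parse_tx_bitrate_mask() helper called in the new version. A worked example of the unit conversion (the octet value is illustrative):

	u8 rate_octet = 0x96;			/* basic-rate flag set, value 22 */
	int bitrate = (rate_octet & 0x7f) * 5;	/* 22 * 500 kbit/s -> 110, i.e. 11.0 Mbit/s */
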
@@ -8859,6 +9020,7 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
8859 case NL80211_IFTYPE_P2P_GO: 9020 case NL80211_IFTYPE_P2P_GO:
8860 case NL80211_IFTYPE_P2P_DEVICE: 9021 case NL80211_IFTYPE_P2P_DEVICE:
8861 break; 9022 break;
9023 case NL80211_IFTYPE_NAN:
8862 default: 9024 default:
8863 return -EOPNOTSUPP; 9025 return -EOPNOTSUPP;
8864 } 9026 }
@@ -8904,6 +9066,7 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
8904 case NL80211_IFTYPE_MESH_POINT: 9066 case NL80211_IFTYPE_MESH_POINT:
8905 case NL80211_IFTYPE_P2P_GO: 9067 case NL80211_IFTYPE_P2P_GO:
8906 break; 9068 break;
9069 case NL80211_IFTYPE_NAN:
8907 default: 9070 default:
8908 return -EOPNOTSUPP; 9071 return -EOPNOTSUPP;
8909 } 9072 }
@@ -9020,6 +9183,7 @@ static int nl80211_tx_mgmt_cancel_wait(struct sk_buff *skb, struct genl_info *in
9020 case NL80211_IFTYPE_P2P_GO: 9183 case NL80211_IFTYPE_P2P_GO:
9021 case NL80211_IFTYPE_P2P_DEVICE: 9184 case NL80211_IFTYPE_P2P_DEVICE:
9022 break; 9185 break;
9186 case NL80211_IFTYPE_NAN:
9023 default: 9187 default:
9024 return -EOPNOTSUPP; 9188 return -EOPNOTSUPP;
9025 } 9189 }
@@ -9252,9 +9416,10 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
9252 if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) { 9416 if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) {
9253 setup.beacon_interval = 9417 setup.beacon_interval =
9254 nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); 9418 nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
9255 if (setup.beacon_interval < 10 || 9419
9256 setup.beacon_interval > 10000) 9420 err = cfg80211_validate_beacon_int(rdev, setup.beacon_interval);
9257 return -EINVAL; 9421 if (err)
9422 return err;
9258 } 9423 }
9259 9424
9260 if (info->attrs[NL80211_ATTR_DTIM_PERIOD]) { 9425 if (info->attrs[NL80211_ATTR_DTIM_PERIOD]) {
@@ -9300,6 +9465,17 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
9300 return err; 9465 return err;
9301 } 9466 }
9302 9467
9468 if (info->attrs[NL80211_ATTR_TX_RATES]) {
9469 err = nl80211_parse_tx_bitrate_mask(info, &setup.beacon_rate);
9470 if (err)
9471 return err;
9472
9473 err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band,
9474 &setup.beacon_rate);
9475 if (err)
9476 return err;
9477 }
9478
9303 return cfg80211_join_mesh(rdev, dev, &setup, &cfg); 9479 return cfg80211_join_mesh(rdev, dev, &setup, &cfg);
9304} 9480}
9305 9481
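
As in the IBSS hunk earlier, the mesh join path now delegates its beacon-interval sanity check to cfg80211_validate_beacon_int(); the util.c change further down widens that helper from a bare non-zero test to an explicit 10–10000 TU window, and it still cross-checks the other wdevs on the same wiphy. Callers are reduced to the same three lines (beacon_interval standing in for ibss.beacon_interval / setup.beacon_interval):

	err = cfg80211_validate_beacon_int(rdev, beacon_interval);
	if (err)
		return err;
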
@@ -9413,18 +9589,27 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
9413 if (!freqs) 9589 if (!freqs)
9414 return -ENOBUFS; 9590 return -ENOBUFS;
9415 9591
9416 for (i = 0; i < req->n_channels; i++) 9592 for (i = 0; i < req->n_channels; i++) {
9417 nla_put_u32(msg, i, req->channels[i]->center_freq); 9593 if (nla_put_u32(msg, i, req->channels[i]->center_freq))
9594 return -ENOBUFS;
9595 }
9418 9596
9419 nla_nest_end(msg, freqs); 9597 nla_nest_end(msg, freqs);
9420 9598
9421 if (req->n_match_sets) { 9599 if (req->n_match_sets) {
9422 matches = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_MATCH); 9600 matches = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_MATCH);
9601 if (!matches)
9602 return -ENOBUFS;
9603
9423 for (i = 0; i < req->n_match_sets; i++) { 9604 for (i = 0; i < req->n_match_sets; i++) {
9424 match = nla_nest_start(msg, i); 9605 match = nla_nest_start(msg, i);
9425 nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID, 9606 if (!match)
9426 req->match_sets[i].ssid.ssid_len, 9607 return -ENOBUFS;
9427 req->match_sets[i].ssid.ssid); 9608
9609 if (nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID,
9610 req->match_sets[i].ssid.ssid_len,
9611 req->match_sets[i].ssid.ssid))
9612 return -ENOBUFS;
9428 nla_nest_end(msg, match); 9613 nla_nest_end(msg, match);
9429 } 9614 }
9430 nla_nest_end(msg, matches); 9615 nla_nest_end(msg, matches);
@@ -9436,6 +9621,9 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
9436 9621
9437 for (i = 0; i < req->n_scan_plans; i++) { 9622 for (i = 0; i < req->n_scan_plans; i++) {
9438 scan_plan = nla_nest_start(msg, i + 1); 9623 scan_plan = nla_nest_start(msg, i + 1);
9624 if (!scan_plan)
9625 return -ENOBUFS;
9626
9439 if (!scan_plan || 9627 if (!scan_plan ||
9440 nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL, 9628 nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL,
9441 req->scan_plans[i].interval) || 9629 req->scan_plans[i].interval) ||
@@ -10362,6 +10550,549 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info)
10362 return 0; 10550 return 0;
10363} 10551}
10364 10552
10553static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
10554{
10555 struct cfg80211_registered_device *rdev = info->user_ptr[0];
10556 struct wireless_dev *wdev = info->user_ptr[1];
10557 struct cfg80211_nan_conf conf = {};
10558 int err;
10559
10560 if (wdev->iftype != NL80211_IFTYPE_NAN)
10561 return -EOPNOTSUPP;
10562
10563 if (wdev->nan_started)
10564 return -EEXIST;
10565
10566 if (rfkill_blocked(rdev->rfkill))
10567 return -ERFKILL;
10568
10569 if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
10570 return -EINVAL;
10571
10572 if (!info->attrs[NL80211_ATTR_NAN_DUAL])
10573 return -EINVAL;
10574
10575 conf.master_pref =
10576 nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
10577 if (!conf.master_pref)
10578 return -EINVAL;
10579
10580 conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
10581
10582 err = rdev_start_nan(rdev, wdev, &conf);
10583 if (err)
10584 return err;
10585
10586 wdev->nan_started = true;
10587 rdev->opencount++;
10588
10589 return 0;
10590}
10591
10592static int nl80211_stop_nan(struct sk_buff *skb, struct genl_info *info)
10593{
10594 struct cfg80211_registered_device *rdev = info->user_ptr[0];
10595 struct wireless_dev *wdev = info->user_ptr[1];
10596
10597 if (wdev->iftype != NL80211_IFTYPE_NAN)
10598 return -EOPNOTSUPP;
10599
10600 cfg80211_stop_nan(rdev, wdev);
10601
10602 return 0;
10603}
10604
10605static int validate_nan_filter(struct nlattr *filter_attr)
10606{
10607 struct nlattr *attr;
10608 int len = 0, n_entries = 0, rem;
10609
10610 nla_for_each_nested(attr, filter_attr, rem) {
10611 len += nla_len(attr);
10612 n_entries++;
10613 }
10614
10615 if (len >= U8_MAX)
10616 return -EINVAL;
10617
10618 return n_entries;
10619}
10620
10621static int handle_nan_filter(struct nlattr *attr_filter,
10622 struct cfg80211_nan_func *func,
10623 bool tx)
10624{
10625 struct nlattr *attr;
10626 int n_entries, rem, i;
10627 struct cfg80211_nan_func_filter *filter;
10628
10629 n_entries = validate_nan_filter(attr_filter);
10630 if (n_entries < 0)
10631 return n_entries;
10632
10633 BUILD_BUG_ON(sizeof(*func->rx_filters) != sizeof(*func->tx_filters));
10634
10635 filter = kcalloc(n_entries, sizeof(*func->rx_filters), GFP_KERNEL);
10636 if (!filter)
10637 return -ENOMEM;
10638
10639 i = 0;
10640 nla_for_each_nested(attr, attr_filter, rem) {
10641 filter[i].filter = kmemdup(nla_data(attr), nla_len(attr),
10642 GFP_KERNEL);
10643 filter[i].len = nla_len(attr);
10644 i++;
10645 }
10646 if (tx) {
10647 func->num_tx_filters = n_entries;
10648 func->tx_filters = filter;
10649 } else {
10650 func->num_rx_filters = n_entries;
10651 func->rx_filters = filter;
10652 }
10653
10654 return 0;
10655}
10656
10657static int nl80211_nan_add_func(struct sk_buff *skb,
10658 struct genl_info *info)
10659{
10660 struct cfg80211_registered_device *rdev = info->user_ptr[0];
10661 struct wireless_dev *wdev = info->user_ptr[1];
10662 struct nlattr *tb[NUM_NL80211_NAN_FUNC_ATTR], *func_attr;
10663 struct cfg80211_nan_func *func;
10664 struct sk_buff *msg = NULL;
10665 void *hdr = NULL;
10666 int err = 0;
10667
10668 if (wdev->iftype != NL80211_IFTYPE_NAN)
10669 return -EOPNOTSUPP;
10670
10671 if (!wdev->nan_started)
10672 return -ENOTCONN;
10673
10674 if (!info->attrs[NL80211_ATTR_NAN_FUNC])
10675 return -EINVAL;
10676
10677 if (wdev->owner_nlportid &&
10678 wdev->owner_nlportid != info->snd_portid)
10679 return -ENOTCONN;
10680
10681 err = nla_parse(tb, NL80211_NAN_FUNC_ATTR_MAX,
10682 nla_data(info->attrs[NL80211_ATTR_NAN_FUNC]),
10683 nla_len(info->attrs[NL80211_ATTR_NAN_FUNC]),
10684 nl80211_nan_func_policy);
10685 if (err)
10686 return err;
10687
10688 func = kzalloc(sizeof(*func), GFP_KERNEL);
10689 if (!func)
10690 return -ENOMEM;
10691
10692 func->cookie = wdev->wiphy->cookie_counter++;
10693
10694 if (!tb[NL80211_NAN_FUNC_TYPE] ||
10695 nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) {
10696 err = -EINVAL;
10697 goto out;
10698 }
10699
10700
10701 func->type = nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]);
10702
10703 if (!tb[NL80211_NAN_FUNC_SERVICE_ID]) {
10704 err = -EINVAL;
10705 goto out;
10706 }
10707
10708 memcpy(func->service_id, nla_data(tb[NL80211_NAN_FUNC_SERVICE_ID]),
10709 sizeof(func->service_id));
10710
10711 func->close_range =
10712 nla_get_flag(tb[NL80211_NAN_FUNC_CLOSE_RANGE]);
10713
10714 if (tb[NL80211_NAN_FUNC_SERVICE_INFO]) {
10715 func->serv_spec_info_len =
10716 nla_len(tb[NL80211_NAN_FUNC_SERVICE_INFO]);
10717 func->serv_spec_info =
10718 kmemdup(nla_data(tb[NL80211_NAN_FUNC_SERVICE_INFO]),
10719 func->serv_spec_info_len,
10720 GFP_KERNEL);
10721 if (!func->serv_spec_info) {
10722 err = -ENOMEM;
10723 goto out;
10724 }
10725 }
10726
10727 if (tb[NL80211_NAN_FUNC_TTL])
10728 func->ttl = nla_get_u32(tb[NL80211_NAN_FUNC_TTL]);
10729
10730 switch (func->type) {
10731 case NL80211_NAN_FUNC_PUBLISH:
10732 if (!tb[NL80211_NAN_FUNC_PUBLISH_TYPE]) {
10733 err = -EINVAL;
10734 goto out;
10735 }
10736
10737 func->publish_type =
10738 nla_get_u8(tb[NL80211_NAN_FUNC_PUBLISH_TYPE]);
10739 func->publish_bcast =
10740 nla_get_flag(tb[NL80211_NAN_FUNC_PUBLISH_BCAST]);
10741
10742 if ((!(func->publish_type & NL80211_NAN_SOLICITED_PUBLISH)) &&
10743 func->publish_bcast) {
10744 err = -EINVAL;
10745 goto out;
10746 }
10747 break;
10748 case NL80211_NAN_FUNC_SUBSCRIBE:
10749 func->subscribe_active =
10750 nla_get_flag(tb[NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE]);
10751 break;
10752 case NL80211_NAN_FUNC_FOLLOW_UP:
10753 if (!tb[NL80211_NAN_FUNC_FOLLOW_UP_ID] ||
10754 !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]) {
10755 err = -EINVAL;
10756 goto out;
10757 }
10758
10759 func->followup_id =
10760 nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_ID]);
10761 func->followup_reqid =
10762 nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]);
10763 memcpy(func->followup_dest.addr,
10764 nla_data(tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]),
10765 sizeof(func->followup_dest.addr));
10766 if (func->ttl) {
10767 err = -EINVAL;
10768 goto out;
10769 }
10770 break;
10771 default:
10772 err = -EINVAL;
10773 goto out;
10774 }
10775
10776 if (tb[NL80211_NAN_FUNC_SRF]) {
10777 struct nlattr *srf_tb[NUM_NL80211_NAN_SRF_ATTR];
10778
10779 err = nla_parse(srf_tb, NL80211_NAN_SRF_ATTR_MAX,
10780 nla_data(tb[NL80211_NAN_FUNC_SRF]),
10781 nla_len(tb[NL80211_NAN_FUNC_SRF]), NULL);
10782 if (err)
10783 goto out;
10784
10785 func->srf_include =
10786 nla_get_flag(srf_tb[NL80211_NAN_SRF_INCLUDE]);
10787
10788 if (srf_tb[NL80211_NAN_SRF_BF]) {
10789 if (srf_tb[NL80211_NAN_SRF_MAC_ADDRS] ||
10790 !srf_tb[NL80211_NAN_SRF_BF_IDX]) {
10791 err = -EINVAL;
10792 goto out;
10793 }
10794
10795 func->srf_bf_len =
10796 nla_len(srf_tb[NL80211_NAN_SRF_BF]);
10797 func->srf_bf =
10798 kmemdup(nla_data(srf_tb[NL80211_NAN_SRF_BF]),
10799 func->srf_bf_len, GFP_KERNEL);
10800 if (!func->srf_bf) {
10801 err = -ENOMEM;
10802 goto out;
10803 }
10804
10805 func->srf_bf_idx =
10806 nla_get_u8(srf_tb[NL80211_NAN_SRF_BF_IDX]);
10807 } else {
10808 struct nlattr *attr, *mac_attr =
10809 srf_tb[NL80211_NAN_SRF_MAC_ADDRS];
10810 int n_entries, rem, i = 0;
10811
10812 if (!mac_attr) {
10813 err = -EINVAL;
10814 goto out;
10815 }
10816
10817 n_entries = validate_acl_mac_addrs(mac_attr);
10818 if (n_entries <= 0) {
10819 err = -EINVAL;
10820 goto out;
10821 }
10822
10823 func->srf_num_macs = n_entries;
10824 func->srf_macs =
10825 kzalloc(sizeof(*func->srf_macs) * n_entries,
10826 GFP_KERNEL);
10827 if (!func->srf_macs) {
10828 err = -ENOMEM;
10829 goto out;
10830 }
10831
10832 nla_for_each_nested(attr, mac_attr, rem)
10833 memcpy(func->srf_macs[i++].addr, nla_data(attr),
10834 sizeof(*func->srf_macs));
10835 }
10836 }
10837
10838 if (tb[NL80211_NAN_FUNC_TX_MATCH_FILTER]) {
10839 err = handle_nan_filter(tb[NL80211_NAN_FUNC_TX_MATCH_FILTER],
10840 func, true);
10841 if (err)
10842 goto out;
10843 }
10844
10845 if (tb[NL80211_NAN_FUNC_RX_MATCH_FILTER]) {
10846 err = handle_nan_filter(tb[NL80211_NAN_FUNC_RX_MATCH_FILTER],
10847 func, false);
10848 if (err)
10849 goto out;
10850 }
10851
10852 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
10853 if (!msg) {
10854 err = -ENOMEM;
10855 goto out;
10856 }
10857
10858 hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
10859 NL80211_CMD_ADD_NAN_FUNCTION);
10860 /* This can't really happen - we just allocated 4KB */
10861 if (WARN_ON(!hdr)) {
10862 err = -ENOMEM;
10863 goto out;
10864 }
10865
10866 err = rdev_add_nan_func(rdev, wdev, func);
10867out:
10868 if (err < 0) {
10869 cfg80211_free_nan_func(func);
10870 nlmsg_free(msg);
10871 return err;
10872 }
10873
10874 /* propagate the instance id and cookie to userspace */
10875 if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, func->cookie,
10876 NL80211_ATTR_PAD))
10877 goto nla_put_failure;
10878
10879 func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC);
10880 if (!func_attr)
10881 goto nla_put_failure;
10882
10883 if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID,
10884 func->instance_id))
10885 goto nla_put_failure;
10886
10887 nla_nest_end(msg, func_attr);
10888
10889 genlmsg_end(msg, hdr);
10890 return genlmsg_reply(msg, info);
10891
10892nla_put_failure:
10893 nlmsg_free(msg);
10894 return -ENOBUFS;
10895}
10896
10897static int nl80211_nan_del_func(struct sk_buff *skb,
10898 struct genl_info *info)
10899{
10900 struct cfg80211_registered_device *rdev = info->user_ptr[0];
10901 struct wireless_dev *wdev = info->user_ptr[1];
10902 u64 cookie;
10903
10904 if (wdev->iftype != NL80211_IFTYPE_NAN)
10905 return -EOPNOTSUPP;
10906
10907 if (!wdev->nan_started)
10908 return -ENOTCONN;
10909
10910 if (!info->attrs[NL80211_ATTR_COOKIE])
10911 return -EINVAL;
10912
10913 if (wdev->owner_nlportid &&
10914 wdev->owner_nlportid != info->snd_portid)
10915 return -ENOTCONN;
10916
10917 cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]);
10918
10919 rdev_del_nan_func(rdev, wdev, cookie);
10920
10921 return 0;
10922}
10923
10924static int nl80211_nan_change_config(struct sk_buff *skb,
10925 struct genl_info *info)
10926{
10927 struct cfg80211_registered_device *rdev = info->user_ptr[0];
10928 struct wireless_dev *wdev = info->user_ptr[1];
10929 struct cfg80211_nan_conf conf = {};
10930 u32 changed = 0;
10931
10932 if (wdev->iftype != NL80211_IFTYPE_NAN)
10933 return -EOPNOTSUPP;
10934
10935 if (!wdev->nan_started)
10936 return -ENOTCONN;
10937
10938 if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) {
10939 conf.master_pref =
10940 nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
10941 if (conf.master_pref <= 1 || conf.master_pref == 255)
10942 return -EINVAL;
10943
10944 changed |= CFG80211_NAN_CONF_CHANGED_PREF;
10945 }
10946
10947 if (info->attrs[NL80211_ATTR_NAN_DUAL]) {
10948 conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
10949 changed |= CFG80211_NAN_CONF_CHANGED_DUAL;
10950 }
10951
10952 if (!changed)
10953 return -EINVAL;
10954
10955 return rdev_nan_change_conf(rdev, wdev, &conf, changed);
10956}
10957
10958void cfg80211_nan_match(struct wireless_dev *wdev,
10959 struct cfg80211_nan_match_params *match, gfp_t gfp)
10960{
10961 struct wiphy *wiphy = wdev->wiphy;
10962 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
10963 struct nlattr *match_attr, *local_func_attr, *peer_func_attr;
10964 struct sk_buff *msg;
10965 void *hdr;
10966
10967 if (WARN_ON(!match->inst_id || !match->peer_inst_id || !match->addr))
10968 return;
10969
10970 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
10971 if (!msg)
10972 return;
10973
10974 hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_MATCH);
10975 if (!hdr) {
10976 nlmsg_free(msg);
10977 return;
10978 }
10979
10980 if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
10981 (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
10982 wdev->netdev->ifindex)) ||
10983 nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
10984 NL80211_ATTR_PAD))
10985 goto nla_put_failure;
10986
10987 if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, match->cookie,
10988 NL80211_ATTR_PAD) ||
10989 nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, match->addr))
10990 goto nla_put_failure;
10991
10992 match_attr = nla_nest_start(msg, NL80211_ATTR_NAN_MATCH);
10993 if (!match_attr)
10994 goto nla_put_failure;
10995
10996 local_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_LOCAL);
10997 if (!local_func_attr)
10998 goto nla_put_failure;
10999
11000 if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->inst_id))
11001 goto nla_put_failure;
11002
11003 nla_nest_end(msg, local_func_attr);
11004
11005 peer_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_PEER);
11006 if (!peer_func_attr)
11007 goto nla_put_failure;
11008
11009 if (nla_put_u8(msg, NL80211_NAN_FUNC_TYPE, match->type) ||
11010 nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->peer_inst_id))
11011 goto nla_put_failure;
11012
11013 if (match->info && match->info_len &&
11014 nla_put(msg, NL80211_NAN_FUNC_SERVICE_INFO, match->info_len,
11015 match->info))
11016 goto nla_put_failure;
11017
11018 nla_nest_end(msg, peer_func_attr);
11019 nla_nest_end(msg, match_attr);
11020 genlmsg_end(msg, hdr);
11021
11022 if (!wdev->owner_nlportid)
11023 genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
11024 msg, 0, NL80211_MCGRP_NAN, gfp);
11025 else
11026 genlmsg_unicast(wiphy_net(&rdev->wiphy), msg,
11027 wdev->owner_nlportid);
11028
11029 return;
11030
11031nla_put_failure:
11032 nlmsg_free(msg);
11033}
11034EXPORT_SYMBOL(cfg80211_nan_match);
11035
11036void cfg80211_nan_func_terminated(struct wireless_dev *wdev,
11037 u8 inst_id,
11038 enum nl80211_nan_func_term_reason reason,
11039 u64 cookie, gfp_t gfp)
11040{
11041 struct wiphy *wiphy = wdev->wiphy;
11042 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
11043 struct sk_buff *msg;
11044 struct nlattr *func_attr;
11045 void *hdr;
11046
11047 if (WARN_ON(!inst_id))
11048 return;
11049
11050 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
11051 if (!msg)
11052 return;
11053
11054 hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_DEL_NAN_FUNCTION);
11055 if (!hdr) {
11056 nlmsg_free(msg);
11057 return;
11058 }
11059
11060 if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
11061 (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
11062 wdev->netdev->ifindex)) ||
11063 nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
11064 NL80211_ATTR_PAD))
11065 goto nla_put_failure;
11066
11067 if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
11068 NL80211_ATTR_PAD))
11069 goto nla_put_failure;
11070
11071 func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC);
11072 if (!func_attr)
11073 goto nla_put_failure;
11074
11075 if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, inst_id) ||
11076 nla_put_u8(msg, NL80211_NAN_FUNC_TERM_REASON, reason))
11077 goto nla_put_failure;
11078
11079 nla_nest_end(msg, func_attr);
11080 genlmsg_end(msg, hdr);
11081
11082 if (!wdev->owner_nlportid)
11083 genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
11084 msg, 0, NL80211_MCGRP_NAN, gfp);
11085 else
11086 genlmsg_unicast(wiphy_net(&rdev->wiphy), msg,
11087 wdev->owner_nlportid);
11088
11089 return;
11090
11091nla_put_failure:
11092 nlmsg_free(msg);
11093}
11094EXPORT_SYMBOL(cfg80211_nan_func_terminated);
11095
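
Both exported event helpers above follow the same delivery policy: if the NAN interface is owned by a specific netlink socket (wdev->owner_nlportid set), the event is unicast to that owner only, otherwise it is multicast to the new NL80211_MCGRP_NAN group. On the driver side, reporting a discovery match means filling cfg80211_nan_match_params and calling the helper — a rough sketch, with local_id, peer_id, peer_addr and cookie standing in for driver state:

	struct cfg80211_nan_match_params params = {
		.type = NL80211_NAN_FUNC_PUBLISH,	/* type of the peer's matched function */
		.inst_id = local_id,			/* our local instance id */
		.peer_inst_id = peer_id,		/* peer's instance id */
		.addr = peer_addr,			/* peer NAN interface address */
		.cookie = cookie,			/* cookie handed out by add_nan_func */
	};

	cfg80211_nan_match(wdev, &params, GFP_KERNEL);
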
10365static int nl80211_get_protocol_features(struct sk_buff *skb, 11096static int nl80211_get_protocol_features(struct sk_buff *skb,
10366 struct genl_info *info) 11097 struct genl_info *info)
10367{ 11098{
@@ -11063,7 +11794,14 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
11063 11794
11064 dev_hold(dev); 11795 dev_hold(dev);
11065 } else if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP) { 11796 } else if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP) {
11066 if (!wdev->p2p_started) { 11797 if (wdev->iftype == NL80211_IFTYPE_P2P_DEVICE &&
11798 !wdev->p2p_started) {
11799 if (rtnl)
11800 rtnl_unlock();
11801 return -ENETDOWN;
11802 }
11803 if (wdev->iftype == NL80211_IFTYPE_NAN &&
11804 !wdev->nan_started) {
11067 if (rtnl) 11805 if (rtnl)
11068 rtnl_unlock(); 11806 rtnl_unlock();
11069 return -ENETDOWN; 11807 return -ENETDOWN;
@@ -11697,6 +12435,46 @@ static const struct genl_ops nl80211_ops[] = {
11697 NL80211_FLAG_NEED_RTNL, 12435 NL80211_FLAG_NEED_RTNL,
11698 }, 12436 },
11699 { 12437 {
12438 .cmd = NL80211_CMD_START_NAN,
12439 .doit = nl80211_start_nan,
12440 .policy = nl80211_policy,
12441 .flags = GENL_ADMIN_PERM,
12442 .internal_flags = NL80211_FLAG_NEED_WDEV |
12443 NL80211_FLAG_NEED_RTNL,
12444 },
12445 {
12446 .cmd = NL80211_CMD_STOP_NAN,
12447 .doit = nl80211_stop_nan,
12448 .policy = nl80211_policy,
12449 .flags = GENL_ADMIN_PERM,
12450 .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
12451 NL80211_FLAG_NEED_RTNL,
12452 },
12453 {
12454 .cmd = NL80211_CMD_ADD_NAN_FUNCTION,
12455 .doit = nl80211_nan_add_func,
12456 .policy = nl80211_policy,
12457 .flags = GENL_ADMIN_PERM,
12458 .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
12459 NL80211_FLAG_NEED_RTNL,
12460 },
12461 {
12462 .cmd = NL80211_CMD_DEL_NAN_FUNCTION,
12463 .doit = nl80211_nan_del_func,
12464 .policy = nl80211_policy,
12465 .flags = GENL_ADMIN_PERM,
12466 .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
12467 NL80211_FLAG_NEED_RTNL,
12468 },
12469 {
12470 .cmd = NL80211_CMD_CHANGE_NAN_CONFIG,
12471 .doit = nl80211_nan_change_config,
12472 .policy = nl80211_policy,
12473 .flags = GENL_ADMIN_PERM,
12474 .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
12475 NL80211_FLAG_NEED_RTNL,
12476 },
12477 {
11700 .cmd = NL80211_CMD_SET_MCAST_RATE, 12478 .cmd = NL80211_CMD_SET_MCAST_RATE,
11701 .doit = nl80211_set_mcast_rate, 12479 .doit = nl80211_set_mcast_rate,
11702 .policy = nl80211_policy, 12480 .policy = nl80211_policy,
@@ -11847,6 +12625,29 @@ void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev,
11847 NL80211_MCGRP_CONFIG, GFP_KERNEL); 12625 NL80211_MCGRP_CONFIG, GFP_KERNEL);
11848} 12626}
11849 12627
12628void nl80211_notify_iface(struct cfg80211_registered_device *rdev,
12629 struct wireless_dev *wdev,
12630 enum nl80211_commands cmd)
12631{
12632 struct sk_buff *msg;
12633
12634 WARN_ON(cmd != NL80211_CMD_NEW_INTERFACE &&
12635 cmd != NL80211_CMD_DEL_INTERFACE);
12636
12637 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
12638 if (!msg)
12639 return;
12640
12641 if (nl80211_send_iface(msg, 0, 0, 0, rdev, wdev,
12642 cmd == NL80211_CMD_DEL_INTERFACE) < 0) {
12643 nlmsg_free(msg);
12644 return;
12645 }
12646
12647 genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
12648 NL80211_MCGRP_CONFIG, GFP_KERNEL);
12649}
12650
11850static int nl80211_add_scan_req(struct sk_buff *msg, 12651static int nl80211_add_scan_req(struct sk_buff *msg,
11851 struct cfg80211_registered_device *rdev) 12652 struct cfg80211_registered_device *rdev)
11852{ 12653{
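
Taken together, the new genl_ops entries give userspace a five-command NAN API (start, stop, add/del function, change config), all gated by GENL_ADMIN_PERM, i.e. CAP_NET_ADMIN. As a rough illustration of driving NL80211_CMD_START_NAN over generic netlink — a hedged sketch using libnl, not code from this patch; wdev_id is assumed to come from the NL80211_CMD_NEW_INTERFACE reply, and all error handling is elided:

	#include <stdint.h>
	#include <netlink/genl/genl.h>
	#include <netlink/genl/ctrl.h>
	#include <linux/nl80211.h>

	static int start_nan_sketch(uint64_t wdev_id)
	{
		struct nl_sock *sk = nl_socket_alloc();
		struct nl_msg *msg = nlmsg_alloc();
		int family;

		genl_connect(sk);
		family = genl_ctrl_resolve(sk, "nl80211");

		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
			    NL80211_CMD_START_NAN, 0);
		nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id);
		nla_put_u8(msg, NL80211_ATTR_NAN_MASTER_PREF, 128); /* must be non-zero */
		nla_put_u8(msg, NL80211_ATTR_NAN_DUAL, 0);          /* dual-band setting, see nl80211.h */

		return nl_send_auto(sk, msg);
	}
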
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index a63f402b10b7..7e3821d7fcc5 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -7,6 +7,9 @@ int nl80211_init(void);
7void nl80211_exit(void); 7void nl80211_exit(void);
8void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev, 8void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev,
9 enum nl80211_commands cmd); 9 enum nl80211_commands cmd);
10void nl80211_notify_iface(struct cfg80211_registered_device *rdev,
11 struct wireless_dev *wdev,
12 enum nl80211_commands cmd);
10void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, 13void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
11 struct wireless_dev *wdev); 14 struct wireless_dev *wdev);
12struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, 15struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 85ff30bee2b9..11cf83c8ad4f 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -887,6 +887,64 @@ static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev,
887 trace_rdev_return_void(&rdev->wiphy); 887 trace_rdev_return_void(&rdev->wiphy);
888} 888}
889 889
890static inline int rdev_start_nan(struct cfg80211_registered_device *rdev,
891 struct wireless_dev *wdev,
892 struct cfg80211_nan_conf *conf)
893{
894 int ret;
895
896 trace_rdev_start_nan(&rdev->wiphy, wdev, conf);
897 ret = rdev->ops->start_nan(&rdev->wiphy, wdev, conf);
898 trace_rdev_return_int(&rdev->wiphy, ret);
899 return ret;
900}
901
902static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev,
903 struct wireless_dev *wdev)
904{
905 trace_rdev_stop_nan(&rdev->wiphy, wdev);
906 rdev->ops->stop_nan(&rdev->wiphy, wdev);
907 trace_rdev_return_void(&rdev->wiphy);
908}
909
910static inline int
911rdev_add_nan_func(struct cfg80211_registered_device *rdev,
912 struct wireless_dev *wdev,
913 struct cfg80211_nan_func *nan_func)
914{
915 int ret;
916
917 trace_rdev_add_nan_func(&rdev->wiphy, wdev, nan_func);
918 ret = rdev->ops->add_nan_func(&rdev->wiphy, wdev, nan_func);
919 trace_rdev_return_int(&rdev->wiphy, ret);
920 return ret;
921}
922
923static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev,
924 struct wireless_dev *wdev, u64 cookie)
925{
926 trace_rdev_del_nan_func(&rdev->wiphy, wdev, cookie);
927 rdev->ops->del_nan_func(&rdev->wiphy, wdev, cookie);
928 trace_rdev_return_void(&rdev->wiphy);
929}
930
931static inline int
932rdev_nan_change_conf(struct cfg80211_registered_device *rdev,
933 struct wireless_dev *wdev,
934 struct cfg80211_nan_conf *conf, u32 changes)
935{
936 int ret;
937
938 trace_rdev_nan_change_conf(&rdev->wiphy, wdev, conf, changes);
939 if (rdev->ops->nan_change_conf)
940 ret = rdev->ops->nan_change_conf(&rdev->wiphy, wdev, conf,
941 changes);
942 else
943 ret = -ENOTSUPP;
944 trace_rdev_return_int(&rdev->wiphy, ret);
945 return ret;
946}
947
890static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev, 948static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev,
891 struct net_device *dev, 949 struct net_device *dev,
892 struct cfg80211_acl_data *params) 950 struct cfg80211_acl_data *params)
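
The rdev_*() wrappers above define the driver-facing contract: a NAN-capable driver fills in five new cfg80211_ops hooks with exactly the signatures the wrappers use (nan_change_conf may be left NULL, in which case rdev_nan_change_conf() returns -ENOTSUPP). A skeletal, purely illustrative stub — the mydrv_* names are placeholders, and a real driver would program its firmware in each hook and report results back through cfg80211_nan_match() / cfg80211_nan_func_terminated():

	static int mydrv_start_nan(struct wiphy *wiphy, struct wireless_dev *wdev,
				   struct cfg80211_nan_conf *conf)
	{
		/* push conf->master_pref / conf->dual to the device */
		return 0;
	}

	static void mydrv_stop_nan(struct wiphy *wiphy, struct wireless_dev *wdev)
	{
	}

	static int mydrv_add_nan_func(struct wiphy *wiphy, struct wireless_dev *wdev,
				      struct cfg80211_nan_func *func)
	{
		/* presumably assigns func->instance_id and starts the function */
		return 0;
	}

	static void mydrv_del_nan_func(struct wiphy *wiphy, struct wireless_dev *wdev,
				       u64 cookie)
	{
	}

	static int mydrv_nan_change_conf(struct wiphy *wiphy,
					 struct wireless_dev *wdev,
					 struct cfg80211_nan_conf *conf, u32 changes)
	{
		return 0;
	}

	static const struct cfg80211_ops mydrv_cfg_ops = {
		/* ... the driver's existing hooks ... */
		.start_nan	 = mydrv_start_nan,
		.stop_nan	 = mydrv_stop_nan,
		.add_nan_func	 = mydrv_add_nan_func,
		.del_nan_func	 = mydrv_del_nan_func,
		.nan_change_conf = mydrv_nan_change_conf,
	};
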
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 0358e12be54b..b5bd58d0f731 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -352,52 +352,48 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev)
352 __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE); 352 __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE);
353} 353}
354 354
355const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) 355const u8 *cfg80211_find_ie_match(u8 eid, const u8 *ies, int len,
356 const u8 *match, int match_len,
357 int match_offset)
356{ 358{
357 while (len > 2 && ies[0] != eid) { 359 /* match_offset can't be smaller than 2, unless match_len is
360 * zero, in which case match_offset must be zero as well.
361 */
362 if (WARN_ON((match_len && match_offset < 2) ||
363 (!match_len && match_offset)))
364 return NULL;
365
366 while (len >= 2 && len >= ies[1] + 2) {
367 if ((ies[0] == eid) &&
368 (ies[1] + 2 >= match_offset + match_len) &&
369 !memcmp(ies + match_offset, match, match_len))
370 return ies;
371
358 len -= ies[1] + 2; 372 len -= ies[1] + 2;
359 ies += ies[1] + 2; 373 ies += ies[1] + 2;
360 } 374 }
361 if (len < 2) 375
362 return NULL; 376 return NULL;
363 if (len < 2 + ies[1])
364 return NULL;
365 return ies;
366} 377}
367EXPORT_SYMBOL(cfg80211_find_ie); 378EXPORT_SYMBOL(cfg80211_find_ie_match);
368 379
369const u8 *cfg80211_find_vendor_ie(unsigned int oui, int oui_type, 380const u8 *cfg80211_find_vendor_ie(unsigned int oui, int oui_type,
370 const u8 *ies, int len) 381 const u8 *ies, int len)
371{ 382{
372 struct ieee80211_vendor_ie *ie; 383 const u8 *ie;
373 const u8 *pos = ies, *end = ies + len; 384 u8 match[] = { oui >> 16, oui >> 8, oui, oui_type };
374 int ie_oui; 385 int match_len = (oui_type < 0) ? 3 : sizeof(match);
375 386
376 if (WARN_ON(oui_type > 0xff)) 387 if (WARN_ON(oui_type > 0xff))
377 return NULL; 388 return NULL;
378 389
379 while (pos < end) { 390 ie = cfg80211_find_ie_match(WLAN_EID_VENDOR_SPECIFIC, ies, len,
380 pos = cfg80211_find_ie(WLAN_EID_VENDOR_SPECIFIC, pos, 391 match, match_len, 2);
381 end - pos);
382 if (!pos)
383 return NULL;
384
385 ie = (struct ieee80211_vendor_ie *)pos;
386
387 /* make sure we can access ie->len */
388 BUILD_BUG_ON(offsetof(struct ieee80211_vendor_ie, len) != 1);
389 392
390 if (ie->len < sizeof(*ie)) 393 if (ie && (ie[1] < 4))
391 goto cont; 394 return NULL;
392 395
393 ie_oui = ie->oui[0] << 16 | ie->oui[1] << 8 | ie->oui[2]; 396 return ie;
394 if (ie_oui == oui &&
395 (oui_type < 0 || ie->oui_type == oui_type))
396 return pos;
397cont:
398 pos += 2 + ie->len;
399 }
400 return NULL;
401} 397}
402EXPORT_SYMBOL(cfg80211_find_vendor_ie); 398EXPORT_SYMBOL(cfg80211_find_vendor_ie);
403 399
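
The rewritten vendor-IE lookup above is itself the best example of the new matcher: an element id plus an optional byte pattern at a fixed offset into the element. Another purely illustrative call, matching the Wi-Fi Alliance P2P vendor element (OUI 50:6f:9a, type 0x09); the OUI+type bytes start right after the two-byte element header, hence match_offset == 2, and ies/ies_len are assumed to be the caller's element buffer:

	static const u8 wfa_p2p_oui_type[] = { 0x50, 0x6f, 0x9a, 0x09 };
	const u8 *p2p_ie;

	p2p_ie = cfg80211_find_ie_match(WLAN_EID_VENDOR_SPECIFIC, ies, ies_len,
					wfa_p2p_oui_type,
					sizeof(wfa_p2p_oui_type), 2);
	if (p2p_ie)
		pr_debug("P2P IE found, length %u\n", p2p_ie[1]);
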
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index add6824c44fd..a77db333927e 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -726,7 +726,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
726 726
727 wdev->current_bss = bss_from_pub(bss); 727 wdev->current_bss = bss_from_pub(bss);
728 728
729 cfg80211_upload_connect_keys(wdev); 729 if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP))
730 cfg80211_upload_connect_keys(wdev);
730 731
731 rcu_read_lock(); 732 rcu_read_lock();
732 country_ie = ieee80211_bss_get_ie(bss, WLAN_EID_COUNTRY); 733 country_ie = ieee80211_bss_get_ie(bss, WLAN_EID_COUNTRY);
@@ -1043,6 +1044,12 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
1043 connect->crypto.ciphers_pairwise[0] = cipher; 1044 connect->crypto.ciphers_pairwise[0] = cipher;
1044 } 1045 }
1045 } 1046 }
1047
1048 connect->crypto.wep_keys = connkeys->params;
1049 connect->crypto.wep_tx_key = connkeys->def;
1050 } else {
1051 if (WARN_ON(connkeys))
1052 return -EINVAL;
1046 } 1053 }
1047 1054
1048 wdev->connect_keys = connkeys; 1055 wdev->connect_keys = connkeys;
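
This pairs with the WIPHY_FLAG_HAS_STATIC_WEP hunk at the top of this file's diff: cached WEP keys are now also handed over at connect time in connect->crypto.wep_keys / wep_tx_key, and drivers advertising the flag no longer get the post-connect cfg80211_upload_connect_keys() round of add_key/set_default_key calls. A hedged sketch of what a flag-setting driver's connect handler might do with them (sme is the conventional cfg80211_connect_params parameter name; nothing here is from this patch):

	if (sme->crypto.wep_keys) {
		struct key_params *wep =
			&sme->crypto.wep_keys[sme->crypto.wep_tx_key];

		/* program wep->key / wep->key_len as the default TX WEP key */
	}
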
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index e46469bc130f..0082f4b01795 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -57,7 +57,7 @@ static ssize_t addresses_show(struct device *dev,
57 return sprintf(buf, "%pM\n", wiphy->perm_addr); 57 return sprintf(buf, "%pM\n", wiphy->perm_addr);
58 58
59 for (i = 0; i < wiphy->n_addresses; i++) 59 for (i = 0; i < wiphy->n_addresses; i++)
60 buf += sprintf(buf, "%pM\n", &wiphy->addresses[i].addr); 60 buf += sprintf(buf, "%pM\n", wiphy->addresses[i].addr);
61 61
62 return buf - start; 62 return buf - start;
63} 63}
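
A purely cosmetic fix: addr is an embedded u8 array inside struct mac_address, so wiphy->addresses[i].addr and &wiphy->addresses[i].addr evaluate to the same address and %pM printed the right bytes either way; dropping the & simply gives the argument the plain byte-pointer type %pM expects.
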
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 72b5255cefe2..a3d0a91b1e09 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1889,6 +1889,96 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_p2p_device,
1889 TP_ARGS(wiphy, wdev) 1889 TP_ARGS(wiphy, wdev)
1890); 1890);
1891 1891
1892TRACE_EVENT(rdev_start_nan,
1893 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
1894 struct cfg80211_nan_conf *conf),
1895 TP_ARGS(wiphy, wdev, conf),
1896 TP_STRUCT__entry(
1897 WIPHY_ENTRY
1898 WDEV_ENTRY
1899 __field(u8, master_pref)
1900 __field(u8, dual);
1901 ),
1902 TP_fast_assign(
1903 WIPHY_ASSIGN;
1904 WDEV_ASSIGN;
1905 __entry->master_pref = conf->master_pref;
1906 __entry->dual = conf->dual;
1907 ),
1908 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
1909 ", master preference: %u, dual: %d",
1910 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
1911 __entry->dual)
1912);
1913
1914TRACE_EVENT(rdev_nan_change_conf,
1915 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
1916 struct cfg80211_nan_conf *conf, u32 changes),
1917 TP_ARGS(wiphy, wdev, conf, changes),
1918 TP_STRUCT__entry(
1919 WIPHY_ENTRY
1920 WDEV_ENTRY
1921 __field(u8, master_pref)
1922 __field(u8, dual);
1923 __field(u32, changes);
1924 ),
1925 TP_fast_assign(
1926 WIPHY_ASSIGN;
1927 WDEV_ASSIGN;
1928 __entry->master_pref = conf->master_pref;
1929 __entry->dual = conf->dual;
1930 __entry->changes = changes;
1931 ),
1932 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
1933 ", master preference: %u, dual: %d, changes: %x",
1934 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
1935 __entry->dual, __entry->changes)
1936);
1937
1938DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
1939 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
1940 TP_ARGS(wiphy, wdev)
1941);
1942
1943TRACE_EVENT(rdev_add_nan_func,
1944 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
1945 const struct cfg80211_nan_func *func),
1946 TP_ARGS(wiphy, wdev, func),
1947 TP_STRUCT__entry(
1948 WIPHY_ENTRY
1949 WDEV_ENTRY
1950 __field(u8, func_type)
1951 __field(u64, cookie)
1952 ),
1953 TP_fast_assign(
1954 WIPHY_ASSIGN;
1955 WDEV_ASSIGN;
1956 __entry->func_type = func->type;
1957 __entry->cookie = func->cookie
1958 ),
1959 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type=%u, cookie=%llu",
1960 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->func_type,
1961 __entry->cookie)
1962);
1963
1964TRACE_EVENT(rdev_del_nan_func,
1965 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
1966 u64 cookie),
1967 TP_ARGS(wiphy, wdev, cookie),
1968 TP_STRUCT__entry(
1969 WIPHY_ENTRY
1970 WDEV_ENTRY
1971 __field(u64, cookie)
1972 ),
1973 TP_fast_assign(
1974 WIPHY_ASSIGN;
1975 WDEV_ASSIGN;
1976 __entry->cookie = cookie;
1977 ),
1978 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie=%llu",
1979 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
1980);
1981
1892TRACE_EVENT(rdev_set_mac_acl, 1982TRACE_EVENT(rdev_set_mac_acl,
1893 TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, 1983 TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
1894 struct cfg80211_acl_data *params), 1984 struct cfg80211_acl_data *params),
diff --git a/net/wireless/util.c b/net/wireless/util.c
index b7d1592bd5b8..8edce22d1b93 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -218,7 +218,7 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
218 struct key_params *params, int key_idx, 218 struct key_params *params, int key_idx,
219 bool pairwise, const u8 *mac_addr) 219 bool pairwise, const u8 *mac_addr)
220{ 220{
221 if (key_idx > 5) 221 if (key_idx < 0 || key_idx > 5)
222 return -EINVAL; 222 return -EINVAL;
223 223
224 if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) 224 if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
@@ -249,7 +249,13 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
249 /* Disallow BIP (group-only) cipher as pairwise cipher */ 249 /* Disallow BIP (group-only) cipher as pairwise cipher */
250 if (pairwise) 250 if (pairwise)
251 return -EINVAL; 251 return -EINVAL;
252 if (key_idx < 4)
253 return -EINVAL;
252 break; 254 break;
255 case WLAN_CIPHER_SUITE_WEP40:
256 case WLAN_CIPHER_SUITE_WEP104:
257 if (key_idx > 3)
258 return -EINVAL;
253 default: 259 default:
254 break; 260 break;
255 } 261 }
@@ -906,7 +912,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
906 if (!wdev->connect_keys) 912 if (!wdev->connect_keys)
907 return; 913 return;
908 914
909 for (i = 0; i < 6; i++) { 915 for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) {
910 if (!wdev->connect_keys->params[i].cipher) 916 if (!wdev->connect_keys->params[i].cipher)
911 continue; 917 continue;
912 if (rdev_add_key(rdev, dev, i, false, NULL, 918 if (rdev_add_key(rdev, dev, i, false, NULL,
@@ -919,9 +925,6 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
919 netdev_err(dev, "failed to set defkey %d\n", i); 925 netdev_err(dev, "failed to set defkey %d\n", i);
920 continue; 926 continue;
921 } 927 }
922 if (wdev->connect_keys->defmgmt == i)
923 if (rdev_set_default_mgmt_key(rdev, dev, i))
924 netdev_err(dev, "failed to set mgtdef %d\n", i);
925 } 928 }
926 929
927 kzfree(wdev->connect_keys); 930 kzfree(wdev->connect_keys);
@@ -1005,8 +1008,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
1005 if (otype == NL80211_IFTYPE_AP_VLAN) 1008 if (otype == NL80211_IFTYPE_AP_VLAN)
1006 return -EOPNOTSUPP; 1009 return -EOPNOTSUPP;
1007 1010
1008 /* cannot change into P2P device type */ 1011 /* cannot change into P2P device or NAN */
1009 if (ntype == NL80211_IFTYPE_P2P_DEVICE) 1012 if (ntype == NL80211_IFTYPE_P2P_DEVICE ||
1013 ntype == NL80211_IFTYPE_NAN)
1010 return -EOPNOTSUPP; 1014 return -EOPNOTSUPP;
1011 1015
1012 if (!rdev->ops->change_virtual_intf || 1016 if (!rdev->ops->change_virtual_intf ||
@@ -1085,6 +1089,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
1085 /* not happening */ 1089 /* not happening */
1086 break; 1090 break;
1087 case NL80211_IFTYPE_P2P_DEVICE: 1091 case NL80211_IFTYPE_P2P_DEVICE:
1092 case NL80211_IFTYPE_NAN:
1088 WARN_ON(1); 1093 WARN_ON(1);
1089 break; 1094 break;
1090 } 1095 }
@@ -1559,7 +1564,7 @@ int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
1559 struct wireless_dev *wdev; 1564 struct wireless_dev *wdev;
1560 int res = 0; 1565 int res = 0;
1561 1566
1562 if (!beacon_int) 1567 if (beacon_int < 10 || beacon_int > 10000)
1563 return -EINVAL; 1568 return -EINVAL;
1564 1569
1565 list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { 1570 list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
@@ -1757,6 +1762,28 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
1757} 1762}
1758EXPORT_SYMBOL(cfg80211_get_station); 1763EXPORT_SYMBOL(cfg80211_get_station);
1759 1764
1765void cfg80211_free_nan_func(struct cfg80211_nan_func *f)
1766{
1767 int i;
1768
1769 if (!f)
1770 return;
1771
1772 kfree(f->serv_spec_info);
1773 kfree(f->srf_bf);
1774 kfree(f->srf_macs);
1775 for (i = 0; i < f->num_rx_filters; i++)
1776 kfree(f->rx_filters[i].filter);
1777
1778 for (i = 0; i < f->num_tx_filters; i++)
1779 kfree(f->tx_filters[i].filter);
1780
1781 kfree(f->rx_filters);
1782 kfree(f->tx_filters);
1783 kfree(f);
1784}
1785EXPORT_SYMBOL(cfg80211_free_nan_func);
1786
1760/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ 1787/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
1761/* Ethernet-II snap header (RFC1042 for most EtherTypes) */ 1788/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
1762const unsigned char rfc1042_header[] __aligned(2) = 1789const unsigned char rfc1042_header[] __aligned(2) =
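
Two points worth spelling out in the util.c changes: the key-index validation now encodes the keyid layout explicitly — negative indices are rejected, WEP keys are confined to indices 0–3 and the BIP management-group ciphers to 4–5, inside the overall 0–5 window checked at the top; and cfg80211_free_nan_func() becomes the single teardown helper for a cfg80211_nan_func, releasing the service-specific info, SRF bloom filter, SRF MAC list and both filter arrays before the function object itself, which is exactly what the error paths in nl80211_nan_add_func() above rely on.
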
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 9f27221c8913..a220156cf217 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -406,12 +406,16 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
406 if (pairwise && !addr) 406 if (pairwise && !addr)
407 return -EINVAL; 407 return -EINVAL;
408 408
409 /*
410 * In many cases we won't actually need this, but it's better
411 * to do it first in case the allocation fails. Don't use wext.
412 */
409 if (!wdev->wext.keys) { 413 if (!wdev->wext.keys) {
410 wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys), 414 wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys),
411 GFP_KERNEL); 415 GFP_KERNEL);
412 if (!wdev->wext.keys) 416 if (!wdev->wext.keys)
413 return -ENOMEM; 417 return -ENOMEM;
414 for (i = 0; i < 6; i++) 418 for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++)
415 wdev->wext.keys->params[i].key = 419 wdev->wext.keys->params[i].key =
416 wdev->wext.keys->data[i]; 420 wdev->wext.keys->data[i];
417 } 421 }
@@ -460,7 +464,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
460 if (err == -ENOENT) 464 if (err == -ENOENT)
461 err = 0; 465 err = 0;
462 if (!err) { 466 if (!err) {
463 if (!addr) { 467 if (!addr && idx < 4) {
464 memset(wdev->wext.keys->data[idx], 0, 468 memset(wdev->wext.keys->data[idx], 0,
465 sizeof(wdev->wext.keys->data[idx])); 469 sizeof(wdev->wext.keys->data[idx]));
466 wdev->wext.keys->params[idx].key_len = 0; 470 wdev->wext.keys->params[idx].key_len = 0;
@@ -487,10 +491,19 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
487 err = 0; 491 err = 0;
488 if (wdev->current_bss) 492 if (wdev->current_bss)
489 err = rdev_add_key(rdev, dev, idx, pairwise, addr, params); 493 err = rdev_add_key(rdev, dev, idx, pairwise, addr, params);
494 else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 &&
495 params->cipher != WLAN_CIPHER_SUITE_WEP104)
496 return -EINVAL;
490 if (err) 497 if (err)
491 return err; 498 return err;
492 499
493 if (!addr) { 500 /*
501 * We only need to store WEP keys, since they're the only keys that
502 * can be set before a connection is established and persist after
503 * disconnecting.
504 */
505 if (!addr && (params->cipher == WLAN_CIPHER_SUITE_WEP40 ||
506 params->cipher == WLAN_CIPHER_SUITE_WEP104)) {
494 wdev->wext.keys->params[idx] = *params; 507 wdev->wext.keys->params[idx] = *params;
495 memcpy(wdev->wext.keys->data[idx], 508 memcpy(wdev->wext.keys->data[idx],
496 params->key, params->key_len); 509 params->key, params->key_len);
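
The literal 6 that used to size these loops left room for the separately tracked management key alongside the four WEP slots; with the defmgmt bookkeeping removed elsewhere in this series, the loops now run to CFG80211_MAX_WEP_KEYS. The macro itself comes from the companion include/net/cfg80211.h change, which is outside this net/ diff — presumably the four standard WEP key indices.
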
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index a4e8af3321d2..995163830a61 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -35,7 +35,6 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
35 35
36 if (wdev->wext.keys) { 36 if (wdev->wext.keys) {
37 wdev->wext.keys->def = wdev->wext.default_key; 37 wdev->wext.keys->def = wdev->wext.default_key;
38 wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
39 if (wdev->wext.default_key != -1) 38 if (wdev->wext.default_key != -1)
40 wdev->wext.connect.privacy = true; 39 wdev->wext.connect.privacy = true;
41 } 40 }
@@ -43,11 +42,11 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
43 if (!wdev->wext.connect.ssid_len) 42 if (!wdev->wext.connect.ssid_len)
44 return 0; 43 return 0;
45 44
46 if (wdev->wext.keys) { 45 if (wdev->wext.keys && wdev->wext.keys->def != -1) {
47 ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); 46 ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL);
48 if (!ck) 47 if (!ck)
49 return -ENOMEM; 48 return -ENOMEM;
50 for (i = 0; i < 6; i++) 49 for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++)
51 ck->params[i].key = ck->data[i]; 50 ck->params[i].key = ck->data[i];
52 } 51 }
53 52
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index a750f330b8dd..f83b74d3e2ac 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1500,12 +1500,8 @@ out_fac_release:
1500 goto out_dtefac_release; 1500 goto out_dtefac_release;
1501 if (dtefacs.calling_len > X25_MAX_AE_LEN) 1501 if (dtefacs.calling_len > X25_MAX_AE_LEN)
1502 goto out_dtefac_release; 1502 goto out_dtefac_release;
1503 if (dtefacs.calling_ae == NULL)
1504 goto out_dtefac_release;
1505 if (dtefacs.called_len > X25_MAX_AE_LEN) 1503 if (dtefacs.called_len > X25_MAX_AE_LEN)
1506 goto out_dtefac_release; 1504 goto out_dtefac_release;
1507 if (dtefacs.called_ae == NULL)
1508 goto out_dtefac_release;
1509 x25->dte_facilities = dtefacs; 1505 x25->dte_facilities = dtefacs;
1510 rc = 0; 1506 rc = 0;
1511out_dtefac_release: 1507out_dtefac_release:
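
Context for the two dropped checks: calling_ae and called_ae are fixed-size arrays embedded in struct x25_dte_facilities, so comparing them against NULL could never be true — the conditions were dead code of the kind gcc flags with -Waddress.
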
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 250e567ba3d6..44ac85fe2bc9 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -17,7 +17,7 @@
17#include <linux/crypto.h> 17#include <linux/crypto.h>
18#include <linux/scatterlist.h> 18#include <linux/scatterlist.h>
19#include <net/xfrm.h> 19#include <net/xfrm.h>
20#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) 20#if IS_ENABLED(CONFIG_INET_ESP) || IS_ENABLED(CONFIG_INET6_ESP)
21#include <net/esp.h> 21#include <net/esp.h>
22#endif 22#endif
23 23
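
IS_ENABLED(CONFIG_FOO) evaluates to 1 when the option is either built in or modular, so the single test per protocol is exactly equivalent to the pair of defined(CONFIG_FOO) || defined(CONFIG_FOO_MODULE) checks it replaces, just shorter and harder to get out of sync.
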
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 45f9cf97ea25..fd6986634e6f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -49,6 +49,7 @@ static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
49 __read_mostly; 49 __read_mostly;
50 50
51static struct kmem_cache *xfrm_dst_cache __read_mostly; 51static struct kmem_cache *xfrm_dst_cache __read_mostly;
52static __read_mostly seqcount_t xfrm_policy_hash_generation;
52 53
53static void xfrm_init_pmtu(struct dst_entry *dst); 54static void xfrm_init_pmtu(struct dst_entry *dst);
54static int stale_bundle(struct dst_entry *dst); 55static int stale_bundle(struct dst_entry *dst);
@@ -59,6 +60,11 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
59static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, 60static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
60 int dir); 61 int dir);
61 62
63static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
64{
65 return atomic_inc_not_zero(&policy->refcnt);
66}
67
62static inline bool 68static inline bool
63__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) 69__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
64{ 70{
@@ -385,9 +391,11 @@ static struct hlist_head *policy_hash_bysel(struct net *net,
385 __get_hash_thresh(net, family, dir, &dbits, &sbits); 391 __get_hash_thresh(net, family, dir, &dbits, &sbits);
386 hash = __sel_hash(sel, family, hmask, dbits, sbits); 392 hash = __sel_hash(sel, family, hmask, dbits, sbits);
387 393
388 return (hash == hmask + 1 ? 394 if (hash == hmask + 1)
389 &net->xfrm.policy_inexact[dir] : 395 return &net->xfrm.policy_inexact[dir];
390 net->xfrm.policy_bydst[dir].table + hash); 396
397 return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
398 lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
391} 399}
392 400
393static struct hlist_head *policy_hash_direct(struct net *net, 401static struct hlist_head *policy_hash_direct(struct net *net,
@@ -403,7 +411,8 @@ static struct hlist_head *policy_hash_direct(struct net *net,
403 __get_hash_thresh(net, family, dir, &dbits, &sbits); 411 __get_hash_thresh(net, family, dir, &dbits, &sbits);
404 hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); 412 hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);
405 413
406 return net->xfrm.policy_bydst[dir].table + hash; 414 return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
415 lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
407} 416}
408 417
409static void xfrm_dst_hash_transfer(struct net *net, 418static void xfrm_dst_hash_transfer(struct net *net,
@@ -426,14 +435,14 @@ redo:
426 h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, 435 h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
427 pol->family, nhashmask, dbits, sbits); 436 pol->family, nhashmask, dbits, sbits);
428 if (!entry0) { 437 if (!entry0) {
429 hlist_del(&pol->bydst); 438 hlist_del_rcu(&pol->bydst);
430 hlist_add_head(&pol->bydst, ndsttable+h); 439 hlist_add_head_rcu(&pol->bydst, ndsttable + h);
431 h0 = h; 440 h0 = h;
432 } else { 441 } else {
433 if (h != h0) 442 if (h != h0)
434 continue; 443 continue;
435 hlist_del(&pol->bydst); 444 hlist_del_rcu(&pol->bydst);
436 hlist_add_behind(&pol->bydst, entry0); 445 hlist_add_behind_rcu(&pol->bydst, entry0);
437 } 446 }
438 entry0 = &pol->bydst; 447 entry0 = &pol->bydst;
439 } 448 }
@@ -468,22 +477,32 @@ static void xfrm_bydst_resize(struct net *net, int dir)
468 unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; 477 unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
469 unsigned int nhashmask = xfrm_new_hash_mask(hmask); 478 unsigned int nhashmask = xfrm_new_hash_mask(hmask);
470 unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); 479 unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
471 struct hlist_head *odst = net->xfrm.policy_bydst[dir].table;
472 struct hlist_head *ndst = xfrm_hash_alloc(nsize); 480 struct hlist_head *ndst = xfrm_hash_alloc(nsize);
481 struct hlist_head *odst;
473 int i; 482 int i;
474 483
475 if (!ndst) 484 if (!ndst)
476 return; 485 return;
477 486
478 write_lock_bh(&net->xfrm.xfrm_policy_lock); 487 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
488 write_seqcount_begin(&xfrm_policy_hash_generation);
489
490 odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
491 lockdep_is_held(&net->xfrm.xfrm_policy_lock));
492
493 odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
494 lockdep_is_held(&net->xfrm.xfrm_policy_lock));
479 495
480 for (i = hmask; i >= 0; i--) 496 for (i = hmask; i >= 0; i--)
481 xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); 497 xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);
482 498
483 net->xfrm.policy_bydst[dir].table = ndst; 499 rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
484 net->xfrm.policy_bydst[dir].hmask = nhashmask; 500 net->xfrm.policy_bydst[dir].hmask = nhashmask;
485 501
486 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 502 write_seqcount_end(&xfrm_policy_hash_generation);
503 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
504
505 synchronize_rcu();
487 506
488 xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); 507 xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
489} 508}
@@ -500,7 +519,7 @@ static void xfrm_byidx_resize(struct net *net, int total)
500 if (!nidx) 519 if (!nidx)
501 return; 520 return;
502 521
503 write_lock_bh(&net->xfrm.xfrm_policy_lock); 522 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
504 523
505 for (i = hmask; i >= 0; i--) 524 for (i = hmask; i >= 0; i--)
506 xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); 525 xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);
@@ -508,7 +527,7 @@ static void xfrm_byidx_resize(struct net *net, int total)
508 net->xfrm.policy_byidx = nidx; 527 net->xfrm.policy_byidx = nidx;
509 net->xfrm.policy_idx_hmask = nhashmask; 528 net->xfrm.policy_idx_hmask = nhashmask;
510 529
511 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 530 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
512 531
513 xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); 532 xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
514} 533}
@@ -541,7 +560,6 @@ static inline int xfrm_byidx_should_resize(struct net *net, int total)
541 560
542void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) 561void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
543{ 562{
544 read_lock_bh(&net->xfrm.xfrm_policy_lock);
545 si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN]; 563 si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
546 si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT]; 564 si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
547 si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD]; 565 si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
@@ -550,7 +568,6 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
550 si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; 568 si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
551 si->spdhcnt = net->xfrm.policy_idx_hmask; 569 si->spdhcnt = net->xfrm.policy_idx_hmask;
552 si->spdhmcnt = xfrm_policy_hashmax; 570 si->spdhmcnt = xfrm_policy_hashmax;
553 read_unlock_bh(&net->xfrm.xfrm_policy_lock);
554} 571}
555EXPORT_SYMBOL(xfrm_spd_getinfo); 572EXPORT_SYMBOL(xfrm_spd_getinfo);
556 573
@@ -600,7 +617,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
600 rbits6 = net->xfrm.policy_hthresh.rbits6; 617 rbits6 = net->xfrm.policy_hthresh.rbits6;
601 } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); 618 } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
602 619
603 write_lock_bh(&net->xfrm.xfrm_policy_lock); 620 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
604 621
605 /* reset the bydst and inexact table in all directions */ 622 /* reset the bydst and inexact table in all directions */
606 for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { 623 for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
@@ -646,7 +663,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
646 hlist_add_head(&policy->bydst, chain); 663 hlist_add_head(&policy->bydst, chain);
647 } 664 }
648 665
649 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 666 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
650 667
651 mutex_unlock(&hash_resize_mutex); 668 mutex_unlock(&hash_resize_mutex);
652} 669}
@@ -757,7 +774,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
757 struct hlist_head *chain; 774 struct hlist_head *chain;
758 struct hlist_node *newpos; 775 struct hlist_node *newpos;
759 776
760 write_lock_bh(&net->xfrm.xfrm_policy_lock); 777 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
761 chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); 778 chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
762 delpol = NULL; 779 delpol = NULL;
763 newpos = NULL; 780 newpos = NULL;
@@ -768,7 +785,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
768 xfrm_sec_ctx_match(pol->security, policy->security) && 785 xfrm_sec_ctx_match(pol->security, policy->security) &&
769 !WARN_ON(delpol)) { 786 !WARN_ON(delpol)) {
770 if (excl) { 787 if (excl) {
771 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 788 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
772 return -EEXIST; 789 return -EEXIST;
773 } 790 }
774 delpol = pol; 791 delpol = pol;
@@ -804,7 +821,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
804 policy->curlft.use_time = 0; 821 policy->curlft.use_time = 0;
805 if (!mod_timer(&policy->timer, jiffies + HZ)) 822 if (!mod_timer(&policy->timer, jiffies + HZ))
806 xfrm_pol_hold(policy); 823 xfrm_pol_hold(policy);
807 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 824 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
808 825
809 if (delpol) 826 if (delpol)
810 xfrm_policy_kill(delpol); 827 xfrm_policy_kill(delpol);
@@ -824,7 +841,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
824 struct hlist_head *chain; 841 struct hlist_head *chain;
825 842
826 *err = 0; 843 *err = 0;
827 write_lock_bh(&net->xfrm.xfrm_policy_lock); 844 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
828 chain = policy_hash_bysel(net, sel, sel->family, dir); 845 chain = policy_hash_bysel(net, sel, sel->family, dir);
829 ret = NULL; 846 ret = NULL;
830 hlist_for_each_entry(pol, chain, bydst) { 847 hlist_for_each_entry(pol, chain, bydst) {
@@ -837,7 +854,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
837 *err = security_xfrm_policy_delete( 854 *err = security_xfrm_policy_delete(
838 pol->security); 855 pol->security);
839 if (*err) { 856 if (*err) {
840 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 857 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
841 return pol; 858 return pol;
842 } 859 }
843 __xfrm_policy_unlink(pol, dir); 860 __xfrm_policy_unlink(pol, dir);
@@ -846,7 +863,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
846 break; 863 break;
847 } 864 }
848 } 865 }
849 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 866 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
850 867
851 if (ret && delete) 868 if (ret && delete)
852 xfrm_policy_kill(ret); 869 xfrm_policy_kill(ret);
@@ -865,7 +882,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
865 return NULL; 882 return NULL;
866 883
867 *err = 0; 884 *err = 0;
868 write_lock_bh(&net->xfrm.xfrm_policy_lock); 885 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
869 chain = net->xfrm.policy_byidx + idx_hash(net, id); 886 chain = net->xfrm.policy_byidx + idx_hash(net, id);
870 ret = NULL; 887 ret = NULL;
871 hlist_for_each_entry(pol, chain, byidx) { 888 hlist_for_each_entry(pol, chain, byidx) {
@@ -876,7 +893,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
876 *err = security_xfrm_policy_delete( 893 *err = security_xfrm_policy_delete(
877 pol->security); 894 pol->security);
878 if (*err) { 895 if (*err) {
879 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 896 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
880 return pol; 897 return pol;
881 } 898 }
882 __xfrm_policy_unlink(pol, dir); 899 __xfrm_policy_unlink(pol, dir);
@@ -885,7 +902,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
885 break; 902 break;
886 } 903 }
887 } 904 }
888 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 905 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
889 906
890 if (ret && delete) 907 if (ret && delete)
891 xfrm_policy_kill(ret); 908 xfrm_policy_kill(ret);
@@ -943,7 +960,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
943{ 960{
944 int dir, err = 0, cnt = 0; 961 int dir, err = 0, cnt = 0;
945 962
946 write_lock_bh(&net->xfrm.xfrm_policy_lock); 963 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
947 964
948 err = xfrm_policy_flush_secctx_check(net, type, task_valid); 965 err = xfrm_policy_flush_secctx_check(net, type, task_valid);
949 if (err) 966 if (err)
@@ -959,14 +976,14 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
959 if (pol->type != type) 976 if (pol->type != type)
960 continue; 977 continue;
961 __xfrm_policy_unlink(pol, dir); 978 __xfrm_policy_unlink(pol, dir);
962 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 979 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
963 cnt++; 980 cnt++;
964 981
965 xfrm_audit_policy_delete(pol, 1, task_valid); 982 xfrm_audit_policy_delete(pol, 1, task_valid);
966 983
967 xfrm_policy_kill(pol); 984 xfrm_policy_kill(pol);
968 985
969 write_lock_bh(&net->xfrm.xfrm_policy_lock); 986 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
970 goto again1; 987 goto again1;
971 } 988 }
972 989
@@ -978,13 +995,13 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
978 if (pol->type != type) 995 if (pol->type != type)
979 continue; 996 continue;
980 __xfrm_policy_unlink(pol, dir); 997 __xfrm_policy_unlink(pol, dir);
981 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 998 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
982 cnt++; 999 cnt++;
983 1000
984 xfrm_audit_policy_delete(pol, 1, task_valid); 1001 xfrm_audit_policy_delete(pol, 1, task_valid);
985 xfrm_policy_kill(pol); 1002 xfrm_policy_kill(pol);
986 1003
987 write_lock_bh(&net->xfrm.xfrm_policy_lock); 1004 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
988 goto again2; 1005 goto again2;
989 } 1006 }
990 } 1007 }
@@ -993,7 +1010,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
993 if (!cnt) 1010 if (!cnt)
994 err = -ESRCH; 1011 err = -ESRCH;
995out: 1012out:
996 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 1013 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
997 return err; 1014 return err;
998} 1015}
999EXPORT_SYMBOL(xfrm_policy_flush); 1016EXPORT_SYMBOL(xfrm_policy_flush);
@@ -1013,7 +1030,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
1013 if (list_empty(&walk->walk.all) && walk->seq != 0) 1030 if (list_empty(&walk->walk.all) && walk->seq != 0)
1014 return 0; 1031 return 0;
1015 1032
1016 write_lock_bh(&net->xfrm.xfrm_policy_lock); 1033 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
1017 if (list_empty(&walk->walk.all)) 1034 if (list_empty(&walk->walk.all))
1018 x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); 1035 x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
1019 else 1036 else
@@ -1041,7 +1058,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
1041 } 1058 }
1042 list_del_init(&walk->walk.all); 1059 list_del_init(&walk->walk.all);
1043out: 1060out:
1044 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 1061 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1045 return error; 1062 return error;
1046} 1063}
1047EXPORT_SYMBOL(xfrm_policy_walk); 1064EXPORT_SYMBOL(xfrm_policy_walk);
@@ -1060,9 +1077,9 @@ void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
1060 if (list_empty(&walk->walk.all)) 1077 if (list_empty(&walk->walk.all))
1061 return; 1078 return;
1062 1079
1063 write_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ 1080 spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
1064 list_del(&walk->walk.all); 1081 list_del(&walk->walk.all);
1065 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 1082 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1066} 1083}
1067EXPORT_SYMBOL(xfrm_policy_walk_done); 1084EXPORT_SYMBOL(xfrm_policy_walk_done);
1068 1085
@@ -1100,17 +1117,24 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
1100 struct xfrm_policy *pol, *ret; 1117 struct xfrm_policy *pol, *ret;
1101 const xfrm_address_t *daddr, *saddr; 1118 const xfrm_address_t *daddr, *saddr;
1102 struct hlist_head *chain; 1119 struct hlist_head *chain;
1103 u32 priority = ~0U; 1120 unsigned int sequence;
1121 u32 priority;
1104 1122
1105 daddr = xfrm_flowi_daddr(fl, family); 1123 daddr = xfrm_flowi_daddr(fl, family);
1106 saddr = xfrm_flowi_saddr(fl, family); 1124 saddr = xfrm_flowi_saddr(fl, family);
1107 if (unlikely(!daddr || !saddr)) 1125 if (unlikely(!daddr || !saddr))
1108 return NULL; 1126 return NULL;
1109 1127
1110 read_lock_bh(&net->xfrm.xfrm_policy_lock); 1128 rcu_read_lock();
1111 chain = policy_hash_direct(net, daddr, saddr, family, dir); 1129 retry:
1130 do {
1131 sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
1132 chain = policy_hash_direct(net, daddr, saddr, family, dir);
1133 } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
1134
1135 priority = ~0U;
1112 ret = NULL; 1136 ret = NULL;
1113 hlist_for_each_entry(pol, chain, bydst) { 1137 hlist_for_each_entry_rcu(pol, chain, bydst) {
1114 err = xfrm_policy_match(pol, fl, type, family, dir); 1138 err = xfrm_policy_match(pol, fl, type, family, dir);
1115 if (err) { 1139 if (err) {
1116 if (err == -ESRCH) 1140 if (err == -ESRCH)
@@ -1126,7 +1150,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
1126 } 1150 }
1127 } 1151 }
1128 chain = &net->xfrm.policy_inexact[dir]; 1152 chain = &net->xfrm.policy_inexact[dir];
1129 hlist_for_each_entry(pol, chain, bydst) { 1153 hlist_for_each_entry_rcu(pol, chain, bydst) {
1130 if ((pol->priority >= priority) && ret) 1154 if ((pol->priority >= priority) && ret)
1131 break; 1155 break;
1132 1156
@@ -1144,9 +1168,13 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
1144 } 1168 }
1145 } 1169 }
1146 1170
1147 xfrm_pol_hold(ret); 1171 if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
1172 goto retry;
1173
1174 if (ret && !xfrm_pol_hold_rcu(ret))
1175 goto retry;
1148fail: 1176fail:
1149 read_unlock_bh(&net->xfrm.xfrm_policy_lock); 1177 rcu_read_unlock();
1150 1178
1151 return ret; 1179 return ret;
1152} 1180}
@@ -1223,10 +1251,9 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
1223 const struct flowi *fl) 1251 const struct flowi *fl)
1224{ 1252{
1225 struct xfrm_policy *pol; 1253 struct xfrm_policy *pol;
1226 struct net *net = sock_net(sk);
1227 1254
1228 rcu_read_lock(); 1255 rcu_read_lock();
1229 read_lock_bh(&net->xfrm.xfrm_policy_lock); 1256 again:
1230 pol = rcu_dereference(sk->sk_policy[dir]); 1257 pol = rcu_dereference(sk->sk_policy[dir]);
1231 if (pol != NULL) { 1258 if (pol != NULL) {
1232 bool match = xfrm_selector_match(&pol->selector, fl, 1259 bool match = xfrm_selector_match(&pol->selector, fl,
@@ -1241,8 +1268,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
1241 err = security_xfrm_policy_lookup(pol->security, 1268 err = security_xfrm_policy_lookup(pol->security,
1242 fl->flowi_secid, 1269 fl->flowi_secid,
1243 policy_to_flow_dir(dir)); 1270 policy_to_flow_dir(dir));
1244 if (!err) 1271 if (!err && !xfrm_pol_hold_rcu(pol))
1245 xfrm_pol_hold(pol); 1272 goto again;
1246 else if (err == -ESRCH) 1273 else if (err == -ESRCH)
1247 pol = NULL; 1274 pol = NULL;
1248 else 1275 else
@@ -1251,7 +1278,6 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
1251 pol = NULL; 1278 pol = NULL;
1252 } 1279 }
1253out: 1280out:
1254 read_unlock_bh(&net->xfrm.xfrm_policy_lock);
1255 rcu_read_unlock(); 1281 rcu_read_unlock();
1256 return pol; 1282 return pol;
1257} 1283}
@@ -1275,7 +1301,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
1275 1301
1276 /* Socket policies are not hashed. */ 1302 /* Socket policies are not hashed. */
1277 if (!hlist_unhashed(&pol->bydst)) { 1303 if (!hlist_unhashed(&pol->bydst)) {
1278 hlist_del(&pol->bydst); 1304 hlist_del_rcu(&pol->bydst);
1279 hlist_del(&pol->byidx); 1305 hlist_del(&pol->byidx);
1280 } 1306 }
1281 1307
@@ -1299,9 +1325,9 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
1299{ 1325{
1300 struct net *net = xp_net(pol); 1326 struct net *net = xp_net(pol);
1301 1327
1302 write_lock_bh(&net->xfrm.xfrm_policy_lock); 1328 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
1303 pol = __xfrm_policy_unlink(pol, dir); 1329 pol = __xfrm_policy_unlink(pol, dir);
1304 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 1330 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1305 if (pol) { 1331 if (pol) {
1306 xfrm_policy_kill(pol); 1332 xfrm_policy_kill(pol);
1307 return 0; 1333 return 0;
@@ -1320,7 +1346,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
1320 return -EINVAL; 1346 return -EINVAL;
1321#endif 1347#endif
1322 1348
1323 write_lock_bh(&net->xfrm.xfrm_policy_lock); 1349 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
1324 old_pol = rcu_dereference_protected(sk->sk_policy[dir], 1350 old_pol = rcu_dereference_protected(sk->sk_policy[dir],
1325 lockdep_is_held(&net->xfrm.xfrm_policy_lock)); 1351 lockdep_is_held(&net->xfrm.xfrm_policy_lock));
1326 if (pol) { 1352 if (pol) {
@@ -1338,7 +1364,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
1338 */ 1364 */
1339 xfrm_sk_policy_unlink(old_pol, dir); 1365 xfrm_sk_policy_unlink(old_pol, dir);
1340 } 1366 }
1341 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 1367 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1342 1368
1343 if (old_pol) { 1369 if (old_pol) {
1344 xfrm_policy_kill(old_pol); 1370 xfrm_policy_kill(old_pol);
@@ -1368,9 +1394,9 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
1368 newp->type = old->type; 1394 newp->type = old->type;
1369 memcpy(newp->xfrm_vec, old->xfrm_vec, 1395 memcpy(newp->xfrm_vec, old->xfrm_vec,
1370 newp->xfrm_nr*sizeof(struct xfrm_tmpl)); 1396 newp->xfrm_nr*sizeof(struct xfrm_tmpl));
1371 write_lock_bh(&net->xfrm.xfrm_policy_lock); 1397 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
1372 xfrm_sk_policy_link(newp, dir); 1398 xfrm_sk_policy_link(newp, dir);
1373 write_unlock_bh(&net->xfrm.xfrm_policy_lock); 1399 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1374 xfrm_pol_put(newp); 1400 xfrm_pol_put(newp);
1375 } 1401 }
1376 return newp; 1402 return newp;
@@ -3052,7 +3078,7 @@ static int __net_init xfrm_net_init(struct net *net)
3052 3078
3053 /* Initialize the per-net locks here */ 3079 /* Initialize the per-net locks here */
3054 spin_lock_init(&net->xfrm.xfrm_state_lock); 3080 spin_lock_init(&net->xfrm.xfrm_state_lock);
3055 rwlock_init(&net->xfrm.xfrm_policy_lock); 3081 spin_lock_init(&net->xfrm.xfrm_policy_lock);
3056 mutex_init(&net->xfrm.xfrm_cfg_mutex); 3082 mutex_init(&net->xfrm.xfrm_cfg_mutex);
3057 3083
3058 return 0; 3084 return 0;
@@ -3086,6 +3112,7 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
3086void __init xfrm_init(void) 3112void __init xfrm_init(void)
3087{ 3113{
3088 register_pernet_subsys(&xfrm_net_ops); 3114 register_pernet_subsys(&xfrm_net_ops);
3115 seqcount_init(&xfrm_policy_hash_generation);
3089 xfrm_input_init(); 3116 xfrm_input_init();
3090} 3117}
3091 3118
@@ -3183,7 +3210,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
3183 struct hlist_head *chain; 3210 struct hlist_head *chain;
3184 u32 priority = ~0U; 3211 u32 priority = ~0U;
3185 3212
3186 read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/ 3213 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
3187 chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); 3214 chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
3188 hlist_for_each_entry(pol, chain, bydst) { 3215 hlist_for_each_entry(pol, chain, bydst) {
3189 if (xfrm_migrate_selector_match(sel, &pol->selector) && 3216 if (xfrm_migrate_selector_match(sel, &pol->selector) &&
@@ -3207,7 +3234,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
3207 3234
3208 xfrm_pol_hold(ret); 3235 xfrm_pol_hold(ret);
3209 3236
3210 read_unlock_bh(&net->xfrm.xfrm_policy_lock); 3237 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
3211 3238
3212 return ret; 3239 return ret;
3213} 3240}
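
The xfrm_policy.c hunks above replace the reader/writer lock with a plain spinlock for the writers and move readers onto RCU: lookups walk the hash chains with hlist_for_each_entry_rcu(), bracket the walk with the xfrm_policy_hash_generation seqcount so a concurrent hash resize forces a retry, and take their reference with an inc-not-zero helper so an entry whose refcount already hit zero is skipped rather than resurrected. Below is a minimal sketch of that reader-side pattern, simplified from the kernel code; the names lookup_obj, obj_hash_chain, buckets and tab_generation are placeholders, not kernel symbols.

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/seqlock.h>
#include <linux/atomic.h>

/* illustrative stand-ins for the policy hash table and its generation count */
static struct hlist_head __rcu *buckets;	/* replaced wholesale on resize */
static unsigned int bucket_mask;
static seqcount_t tab_generation = SEQCNT_ZERO(tab_generation);

struct obj {
	struct hlist_node node;
	atomic_t refcnt;
	u32 key;
};

static struct hlist_head *obj_hash_chain(u32 key)
{
	return rcu_dereference(buckets) + (key & bucket_mask);
}

static struct obj *lookup_obj(u32 key)
{
	struct hlist_head *chain;
	unsigned int seq;
	struct obj *o;

	rcu_read_lock();
retry:
	/* pick up a chain pointer that is consistent with the generation count */
	do {
		seq = read_seqcount_begin(&tab_generation);
		chain = obj_hash_chain(key);
	} while (read_seqcount_retry(&tab_generation, seq));

	hlist_for_each_entry_rcu(o, chain, node) {
		if (o->key != key)
			continue;
		/* a concurrent resize may have moved entries off this chain */
		if (read_seqcount_retry(&tab_generation, seq))
			goto retry;
		/* entry may already be dying; only a non-zero count can be pinned */
		if (!atomic_inc_not_zero(&o->refcnt))
			goto retry;
		rcu_read_unlock();
		return o;
	}
	rcu_read_unlock();
	return NULL;
}

With readers fully lockless, the remaining write_lock_bh()/write_unlock_bh() sites degrade naturally into spin_lock_bh()/spin_unlock_bh(), which is what the bulk of the hunks above do mechanically.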
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index 9c4fbd8935f4..ba2b539879bc 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -50,12 +50,18 @@ static const struct snmp_mib xfrm_mib_list[] = {
50 50
51static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) 51static int xfrm_statistics_seq_show(struct seq_file *seq, void *v)
52{ 52{
53 unsigned long buff[LINUX_MIB_XFRMMAX];
53 struct net *net = seq->private; 54 struct net *net = seq->private;
54 int i; 55 int i;
56
57 memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX);
58
59 snmp_get_cpu_field_batch(buff, xfrm_mib_list,
60 net->mib.xfrm_statistics);
55 for (i = 0; xfrm_mib_list[i].name; i++) 61 for (i = 0; xfrm_mib_list[i].name; i++)
56 seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name, 62 seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name,
57 snmp_fold_field(net->mib.xfrm_statistics, 63 buff[i]);
58 xfrm_mib_list[i].entry)); 64
59 return 0; 65 return 0;
60} 66}
61 67
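
The xfrm_proc.c hunk changes the statistics dump from folding each per-CPU counter separately to batching: all counters are summed once into a stack buffer via snmp_get_cpu_field_batch(), then printed from the buffer. Roughly, the batching amounts to the loop below, a simplified stand-in for the real helper; my_mib, my_mib_sum and the fixed-size counters array are placeholders for illustration only.

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/string.h>

/* hypothetical per-cpu MIB block: a flat array of unsigned long counters */
struct my_mib {
	unsigned long counters[16];
};

static void my_mib_sum(unsigned long *buff, size_t mib_cnt,
		       struct my_mib __percpu *mib)
{
	int cpu;
	size_t i;

	memset(buff, 0, mib_cnt * sizeof(*buff));
	for_each_possible_cpu(cpu) {
		const struct my_mib *p = per_cpu_ptr(mib, cpu);

		for (i = 0; i < mib_cnt; i++)
			buff[i] += p->counters[i];
	}
}

One pass over the CPUs for all fields replaces one pass per field, which matters as the number of possible CPUs and MIB entries grows.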
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 4fd725a0c500..cdc2e2e71bff 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -558,7 +558,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq)
558 x->repl->notify(x, XFRM_REPLAY_UPDATE); 558 x->repl->notify(x, XFRM_REPLAY_UPDATE);
559} 559}
560 560
561static struct xfrm_replay xfrm_replay_legacy = { 561static const struct xfrm_replay xfrm_replay_legacy = {
562 .advance = xfrm_replay_advance, 562 .advance = xfrm_replay_advance,
563 .check = xfrm_replay_check, 563 .check = xfrm_replay_check,
564 .recheck = xfrm_replay_check, 564 .recheck = xfrm_replay_check,
@@ -566,7 +566,7 @@ static struct xfrm_replay xfrm_replay_legacy = {
566 .overflow = xfrm_replay_overflow, 566 .overflow = xfrm_replay_overflow,
567}; 567};
568 568
569static struct xfrm_replay xfrm_replay_bmp = { 569static const struct xfrm_replay xfrm_replay_bmp = {
570 .advance = xfrm_replay_advance_bmp, 570 .advance = xfrm_replay_advance_bmp,
571 .check = xfrm_replay_check_bmp, 571 .check = xfrm_replay_check_bmp,
572 .recheck = xfrm_replay_check_bmp, 572 .recheck = xfrm_replay_check_bmp,
@@ -574,7 +574,7 @@ static struct xfrm_replay xfrm_replay_bmp = {
574 .overflow = xfrm_replay_overflow_bmp, 574 .overflow = xfrm_replay_overflow_bmp,
575}; 575};
576 576
577static struct xfrm_replay xfrm_replay_esn = { 577static const struct xfrm_replay xfrm_replay_esn = {
578 .advance = xfrm_replay_advance_esn, 578 .advance = xfrm_replay_advance_esn,
579 .check = xfrm_replay_check_esn, 579 .check = xfrm_replay_check_esn,
580 .recheck = xfrm_replay_recheck_esn, 580 .recheck = xfrm_replay_recheck_esn,
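
The xfrm_replay.c hunks only add a const qualifier: the three ops tables are never written after build time, so declaring them const lets them live in read-only data and documents that the callbacks cannot be patched at run time. A trimmed illustration of the idiom (struct layout reduced to two callbacks; replay_ops and the legacy_* functions are placeholders):

#include <linux/types.h>

struct replay_ops {
	void (*advance)(void *state, u32 seq);
	int  (*check)(void *state, u32 seq);
};

static void legacy_advance(void *state, u32 seq) { /* ... */ }
static int  legacy_check(void *state, u32 seq)  { return 0; }

/* const => the table lands in .rodata and cannot be modified at run time */
static const struct replay_ops replay_legacy = {
	.advance = legacy_advance,
	.check   = legacy_check,
};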
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index a30f898dc1c5..419bf5d463bd 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -28,6 +28,11 @@
28 28
29#include "xfrm_hash.h" 29#include "xfrm_hash.h"
30 30
31#define xfrm_state_deref_prot(table, net) \
32 rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock))
33
34static void xfrm_state_gc_task(struct work_struct *work);
35
31/* Each xfrm_state may be linked to two tables: 36/* Each xfrm_state may be linked to two tables:
32 37
33 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl) 38 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl)
@@ -36,6 +41,15 @@
36 */ 41 */
37 42
38static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; 43static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
44static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation);
45
46static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task);
47static HLIST_HEAD(xfrm_state_gc_list);
48
49static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x)
50{
51 return atomic_inc_not_zero(&x->refcnt);
52}
39 53
40static inline unsigned int xfrm_dst_hash(struct net *net, 54static inline unsigned int xfrm_dst_hash(struct net *net,
41 const xfrm_address_t *daddr, 55 const xfrm_address_t *daddr,
@@ -76,18 +90,18 @@ static void xfrm_hash_transfer(struct hlist_head *list,
76 h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr, 90 h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr,
77 x->props.reqid, x->props.family, 91 x->props.reqid, x->props.family,
78 nhashmask); 92 nhashmask);
79 hlist_add_head(&x->bydst, ndsttable+h); 93 hlist_add_head_rcu(&x->bydst, ndsttable + h);
80 94
81 h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, 95 h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr,
82 x->props.family, 96 x->props.family,
83 nhashmask); 97 nhashmask);
84 hlist_add_head(&x->bysrc, nsrctable+h); 98 hlist_add_head_rcu(&x->bysrc, nsrctable + h);
85 99
86 if (x->id.spi) { 100 if (x->id.spi) {
87 h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, 101 h = __xfrm_spi_hash(&x->id.daddr, x->id.spi,
88 x->id.proto, x->props.family, 102 x->id.proto, x->props.family,
89 nhashmask); 103 nhashmask);
90 hlist_add_head(&x->byspi, nspitable+h); 104 hlist_add_head_rcu(&x->byspi, nspitable + h);
91 } 105 }
92 } 106 }
93} 107}
@@ -122,25 +136,29 @@ static void xfrm_hash_resize(struct work_struct *work)
122 } 136 }
123 137
124 spin_lock_bh(&net->xfrm.xfrm_state_lock); 138 spin_lock_bh(&net->xfrm.xfrm_state_lock);
139 write_seqcount_begin(&xfrm_state_hash_generation);
125 140
126 nhashmask = (nsize / sizeof(struct hlist_head)) - 1U; 141 nhashmask = (nsize / sizeof(struct hlist_head)) - 1U;
142 odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net);
127 for (i = net->xfrm.state_hmask; i >= 0; i--) 143 for (i = net->xfrm.state_hmask; i >= 0; i--)
128 xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi, 144 xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nhashmask);
129 nhashmask);
130 145
131 odst = net->xfrm.state_bydst; 146 osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net);
132 osrc = net->xfrm.state_bysrc; 147 ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net);
133 ospi = net->xfrm.state_byspi;
134 ohashmask = net->xfrm.state_hmask; 148 ohashmask = net->xfrm.state_hmask;
135 149
136 net->xfrm.state_bydst = ndst; 150 rcu_assign_pointer(net->xfrm.state_bydst, ndst);
137 net->xfrm.state_bysrc = nsrc; 151 rcu_assign_pointer(net->xfrm.state_bysrc, nsrc);
138 net->xfrm.state_byspi = nspi; 152 rcu_assign_pointer(net->xfrm.state_byspi, nspi);
139 net->xfrm.state_hmask = nhashmask; 153 net->xfrm.state_hmask = nhashmask;
140 154
155 write_seqcount_end(&xfrm_state_hash_generation);
141 spin_unlock_bh(&net->xfrm.xfrm_state_lock); 156 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
142 157
143 osize = (ohashmask + 1) * sizeof(struct hlist_head); 158 osize = (ohashmask + 1) * sizeof(struct hlist_head);
159
160 synchronize_rcu();
161
144 xfrm_hash_free(odst, osize); 162 xfrm_hash_free(odst, osize);
145 xfrm_hash_free(osrc, osize); 163 xfrm_hash_free(osrc, osize);
146 xfrm_hash_free(ospi, osize); 164 xfrm_hash_free(ospi, osize);
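
The resize hunk above follows the standard RCU replace-then-reclaim sequence: relink entries into the new table with the _rcu list helpers, publish the new table pointer with rcu_assign_pointer() while holding the writer spinlock (bracketed by the generation seqcount so lockless lookups can detect the move), and free the old table only after synchronize_rcu() guarantees no reader can still be traversing it. A condensed sketch of that sequence follows; table, table_lock, table_gen, sa_entry and grow_table are illustrative names, and the sketch assumes the table was already allocated at init time.

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct sa_entry {
	struct hlist_node node;
	unsigned int key;
};

static struct hlist_head __rcu *table;
static unsigned int table_mask;
static DEFINE_SPINLOCK(table_lock);	/* writer-side lock */
static seqcount_t table_gen = SEQCNT_ZERO(table_gen);

static int grow_table(unsigned int new_mask)
{
	struct hlist_head *nt, *ot;
	unsigned int i;

	nt = kcalloc(new_mask + 1, sizeof(*nt), GFP_KERNEL);
	if (!nt)
		return -ENOMEM;

	spin_lock_bh(&table_lock);
	write_seqcount_begin(&table_gen);

	ot = rcu_dereference_protected(table, lockdep_is_held(&table_lock));

	/* relink every entry into the new buckets; readers may still walk 'ot' */
	for (i = 0; i <= table_mask; i++) {
		struct hlist_node *tmp;
		struct sa_entry *e;

		hlist_for_each_entry_safe(e, tmp, &ot[i], node)
			hlist_add_head_rcu(&e->node, &nt[e->key & new_mask]);
	}

	rcu_assign_pointer(table, nt);	/* publish the new table */
	table_mask = new_mask;

	write_seqcount_end(&table_gen);
	spin_unlock_bh(&table_lock);

	synchronize_rcu();		/* wait out readers still on the old buckets */
	kfree(ot);
	return 0;
}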
@@ -356,15 +374,16 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
356 374
357static void xfrm_state_gc_task(struct work_struct *work) 375static void xfrm_state_gc_task(struct work_struct *work)
358{ 376{
359 struct net *net = container_of(work, struct net, xfrm.state_gc_work);
360 struct xfrm_state *x; 377 struct xfrm_state *x;
361 struct hlist_node *tmp; 378 struct hlist_node *tmp;
362 struct hlist_head gc_list; 379 struct hlist_head gc_list;
363 380
364 spin_lock_bh(&xfrm_state_gc_lock); 381 spin_lock_bh(&xfrm_state_gc_lock);
365 hlist_move_list(&net->xfrm.state_gc_list, &gc_list); 382 hlist_move_list(&xfrm_state_gc_list, &gc_list);
366 spin_unlock_bh(&xfrm_state_gc_lock); 383 spin_unlock_bh(&xfrm_state_gc_lock);
367 384
385 synchronize_rcu();
386
368 hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) 387 hlist_for_each_entry_safe(x, tmp, &gc_list, gclist)
369 xfrm_state_gc_destroy(x); 388 xfrm_state_gc_destroy(x);
370} 389}
@@ -501,14 +520,12 @@ EXPORT_SYMBOL(xfrm_state_alloc);
501 520
502void __xfrm_state_destroy(struct xfrm_state *x) 521void __xfrm_state_destroy(struct xfrm_state *x)
503{ 522{
504 struct net *net = xs_net(x);
505
506 WARN_ON(x->km.state != XFRM_STATE_DEAD); 523 WARN_ON(x->km.state != XFRM_STATE_DEAD);
507 524
508 spin_lock_bh(&xfrm_state_gc_lock); 525 spin_lock_bh(&xfrm_state_gc_lock);
509 hlist_add_head(&x->gclist, &net->xfrm.state_gc_list); 526 hlist_add_head(&x->gclist, &xfrm_state_gc_list);
510 spin_unlock_bh(&xfrm_state_gc_lock); 527 spin_unlock_bh(&xfrm_state_gc_lock);
511 schedule_work(&net->xfrm.state_gc_work); 528 schedule_work(&xfrm_state_gc_work);
512} 529}
513EXPORT_SYMBOL(__xfrm_state_destroy); 530EXPORT_SYMBOL(__xfrm_state_destroy);
514 531
@@ -521,10 +538,10 @@ int __xfrm_state_delete(struct xfrm_state *x)
521 x->km.state = XFRM_STATE_DEAD; 538 x->km.state = XFRM_STATE_DEAD;
522 spin_lock(&net->xfrm.xfrm_state_lock); 539 spin_lock(&net->xfrm.xfrm_state_lock);
523 list_del(&x->km.all); 540 list_del(&x->km.all);
524 hlist_del(&x->bydst); 541 hlist_del_rcu(&x->bydst);
525 hlist_del(&x->bysrc); 542 hlist_del_rcu(&x->bysrc);
526 if (x->id.spi) 543 if (x->id.spi)
527 hlist_del(&x->byspi); 544 hlist_del_rcu(&x->byspi);
528 net->xfrm.state_num--; 545 net->xfrm.state_num--;
529 spin_unlock(&net->xfrm.xfrm_state_lock); 546 spin_unlock(&net->xfrm.xfrm_state_lock);
530 547
@@ -660,7 +677,7 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
660 unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); 677 unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family);
661 struct xfrm_state *x; 678 struct xfrm_state *x;
662 679
663 hlist_for_each_entry(x, net->xfrm.state_byspi+h, byspi) { 680 hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) {
664 if (x->props.family != family || 681 if (x->props.family != family ||
665 x->id.spi != spi || 682 x->id.spi != spi ||
666 x->id.proto != proto || 683 x->id.proto != proto ||
@@ -669,7 +686,8 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
669 686
670 if ((mark & x->mark.m) != x->mark.v) 687 if ((mark & x->mark.m) != x->mark.v)
671 continue; 688 continue;
672 xfrm_state_hold(x); 689 if (!xfrm_state_hold_rcu(x))
690 continue;
673 return x; 691 return x;
674 } 692 }
675 693
@@ -684,7 +702,7 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark,
684 unsigned int h = xfrm_src_hash(net, daddr, saddr, family); 702 unsigned int h = xfrm_src_hash(net, daddr, saddr, family);
685 struct xfrm_state *x; 703 struct xfrm_state *x;
686 704
687 hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) { 705 hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) {
688 if (x->props.family != family || 706 if (x->props.family != family ||
689 x->id.proto != proto || 707 x->id.proto != proto ||
690 !xfrm_addr_equal(&x->id.daddr, daddr, family) || 708 !xfrm_addr_equal(&x->id.daddr, daddr, family) ||
@@ -693,7 +711,8 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark,
693 711
694 if ((mark & x->mark.m) != x->mark.v) 712 if ((mark & x->mark.m) != x->mark.v)
695 continue; 713 continue;
696 xfrm_state_hold(x); 714 if (!xfrm_state_hold_rcu(x))
715 continue;
697 return x; 716 return x;
698 } 717 }
699 718
@@ -776,13 +795,16 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
776 struct xfrm_state *best = NULL; 795 struct xfrm_state *best = NULL;
777 u32 mark = pol->mark.v & pol->mark.m; 796 u32 mark = pol->mark.v & pol->mark.m;
778 unsigned short encap_family = tmpl->encap_family; 797 unsigned short encap_family = tmpl->encap_family;
798 unsigned int sequence;
779 struct km_event c; 799 struct km_event c;
780 800
781 to_put = NULL; 801 to_put = NULL;
782 802
783 spin_lock_bh(&net->xfrm.xfrm_state_lock); 803 sequence = read_seqcount_begin(&xfrm_state_hash_generation);
804
805 rcu_read_lock();
784 h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); 806 h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
785 hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { 807 hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) {
786 if (x->props.family == encap_family && 808 if (x->props.family == encap_family &&
787 x->props.reqid == tmpl->reqid && 809 x->props.reqid == tmpl->reqid &&
788 (mark & x->mark.m) == x->mark.v && 810 (mark & x->mark.m) == x->mark.v &&
@@ -798,7 +820,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
798 goto found; 820 goto found;
799 821
800 h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); 822 h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
801 hlist_for_each_entry(x, net->xfrm.state_bydst+h_wildcard, bydst) { 823 hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) {
802 if (x->props.family == encap_family && 824 if (x->props.family == encap_family &&
803 x->props.reqid == tmpl->reqid && 825 x->props.reqid == tmpl->reqid &&
804 (mark & x->mark.m) == x->mark.v && 826 (mark & x->mark.m) == x->mark.v &&
@@ -851,19 +873,21 @@ found:
851 } 873 }
852 874
853 if (km_query(x, tmpl, pol) == 0) { 875 if (km_query(x, tmpl, pol) == 0) {
876 spin_lock_bh(&net->xfrm.xfrm_state_lock);
854 x->km.state = XFRM_STATE_ACQ; 877 x->km.state = XFRM_STATE_ACQ;
855 list_add(&x->km.all, &net->xfrm.state_all); 878 list_add(&x->km.all, &net->xfrm.state_all);
856 hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); 879 hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
857 h = xfrm_src_hash(net, daddr, saddr, encap_family); 880 h = xfrm_src_hash(net, daddr, saddr, encap_family);
858 hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); 881 hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
859 if (x->id.spi) { 882 if (x->id.spi) {
860 h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); 883 h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
861 hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); 884 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
862 } 885 }
863 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; 886 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
864 tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); 887 tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
865 net->xfrm.state_num++; 888 net->xfrm.state_num++;
866 xfrm_hash_grow_check(net, x->bydst.next != NULL); 889 xfrm_hash_grow_check(net, x->bydst.next != NULL);
890 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
867 } else { 891 } else {
868 x->km.state = XFRM_STATE_DEAD; 892 x->km.state = XFRM_STATE_DEAD;
869 to_put = x; 893 to_put = x;
@@ -872,13 +896,26 @@ found:
872 } 896 }
873 } 897 }
874out: 898out:
875 if (x) 899 if (x) {
876 xfrm_state_hold(x); 900 if (!xfrm_state_hold_rcu(x)) {
877 else 901 *err = -EAGAIN;
902 x = NULL;
903 }
904 } else {
878 *err = acquire_in_progress ? -EAGAIN : error; 905 *err = acquire_in_progress ? -EAGAIN : error;
879 spin_unlock_bh(&net->xfrm.xfrm_state_lock); 906 }
907 rcu_read_unlock();
880 if (to_put) 908 if (to_put)
881 xfrm_state_put(to_put); 909 xfrm_state_put(to_put);
910
911 if (read_seqcount_retry(&xfrm_state_hash_generation, sequence)) {
912 *err = -EAGAIN;
913 if (x) {
914 xfrm_state_put(x);
915 x = NULL;
916 }
917 }
918
882 return x; 919 return x;
883} 920}
884 921
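
The lookup hunks above replace xfrm_state_hold() with xfrm_state_hold_rcu(), i.e. atomic_inc_not_zero(). Under rcu_read_lock() a walk can still find an entry whose last reference was just dropped: it is already unhashed and queued for freeing but remains visible until the grace period ends, so unconditionally bumping its refcount would resurrect a dying object. The inc-not-zero form only succeeds while the count is still non-zero and otherwise lets the caller skip or retry the entry. A tiny sketch of the pairing (obj, obj_hold_rcu and obj_put are placeholders):

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
	atomic_t refcnt;
	struct rcu_head rcu;
};

static bool obj_hold_rcu(struct obj *o)
{
	/* fails iff the last reference is already gone */
	return atomic_inc_not_zero(&o->refcnt);
}

static void obj_put(struct obj *o)
{
	if (atomic_dec_and_test(&o->refcnt))
		kfree_rcu(o, rcu);	/* defer the free past ongoing RCU readers */
}

xfrm_state_find() additionally rechecks xfrm_state_hash_generation after taking its reference and reports -EAGAIN if a hash resize raced with the walk, leaving the retry to the caller.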
@@ -946,16 +983,16 @@ static void __xfrm_state_insert(struct xfrm_state *x)
946 983
947 h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, 984 h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr,
948 x->props.reqid, x->props.family); 985 x->props.reqid, x->props.family);
949 hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); 986 hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
950 987
951 h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family); 988 h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family);
952 hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); 989 hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
953 990
954 if (x->id.spi) { 991 if (x->id.spi) {
955 h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, 992 h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto,
956 x->props.family); 993 x->props.family);
957 994
958 hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); 995 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
959 } 996 }
960 997
961 tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL); 998 tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
@@ -1064,9 +1101,9 @@ static struct xfrm_state *__find_acq_core(struct net *net,
1064 xfrm_state_hold(x); 1101 xfrm_state_hold(x);
1065 tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); 1102 tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
1066 list_add(&x->km.all, &net->xfrm.state_all); 1103 list_add(&x->km.all, &net->xfrm.state_all);
1067 hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); 1104 hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
1068 h = xfrm_src_hash(net, daddr, saddr, family); 1105 h = xfrm_src_hash(net, daddr, saddr, family);
1069 hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); 1106 hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
1070 1107
1071 net->xfrm.state_num++; 1108 net->xfrm.state_num++;
1072 1109
@@ -1395,9 +1432,9 @@ xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32
1395{ 1432{
1396 struct xfrm_state *x; 1433 struct xfrm_state *x;
1397 1434
1398 spin_lock_bh(&net->xfrm.xfrm_state_lock); 1435 rcu_read_lock();
1399 x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); 1436 x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family);
1400 spin_unlock_bh(&net->xfrm.xfrm_state_lock); 1437 rcu_read_unlock();
1401 return x; 1438 return x;
1402} 1439}
1403EXPORT_SYMBOL(xfrm_state_lookup); 1440EXPORT_SYMBOL(xfrm_state_lookup);
@@ -1582,7 +1619,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high)
1582 if (x->id.spi) { 1619 if (x->id.spi) {
1583 spin_lock_bh(&net->xfrm.xfrm_state_lock); 1620 spin_lock_bh(&net->xfrm.xfrm_state_lock);
1584 h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); 1621 h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family);
1585 hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); 1622 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
1586 spin_unlock_bh(&net->xfrm.xfrm_state_lock); 1623 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
1587 1624
1588 err = 0; 1625 err = 0;
@@ -2100,8 +2137,6 @@ int __net_init xfrm_state_init(struct net *net)
2100 2137
2101 net->xfrm.state_num = 0; 2138 net->xfrm.state_num = 0;
2102 INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); 2139 INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize);
2103 INIT_HLIST_HEAD(&net->xfrm.state_gc_list);
2104 INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task);
2105 spin_lock_init(&net->xfrm.xfrm_state_lock); 2140 spin_lock_init(&net->xfrm.xfrm_state_lock);
2106 return 0; 2141 return 0;
2107 2142
@@ -2119,7 +2154,7 @@ void xfrm_state_fini(struct net *net)
2119 2154
2120 flush_work(&net->xfrm.state_hash_work); 2155 flush_work(&net->xfrm.state_hash_work);
2121 xfrm_state_flush(net, IPSEC_PROTO_ANY, false); 2156 xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
2122 flush_work(&net->xfrm.state_gc_work); 2157 flush_work(&xfrm_state_gc_work);
2123 2158
2124 WARN_ON(!list_empty(&net->xfrm.state_all)); 2159 WARN_ON(!list_empty(&net->xfrm.state_all));
2125 2160
diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c
index 05a6e3d9c258..35a7e794ad04 100644
--- a/net/xfrm/xfrm_sysctl.c
+++ b/net/xfrm/xfrm_sysctl.c
@@ -17,13 +17,13 @@ static struct ctl_table xfrm_table[] = {
17 .procname = "xfrm_aevent_etime", 17 .procname = "xfrm_aevent_etime",
18 .maxlen = sizeof(u32), 18 .maxlen = sizeof(u32),
19 .mode = 0644, 19 .mode = 0644,
20 .proc_handler = proc_dointvec 20 .proc_handler = proc_douintvec
21 }, 21 },
22 { 22 {
23 .procname = "xfrm_aevent_rseqth", 23 .procname = "xfrm_aevent_rseqth",
24 .maxlen = sizeof(u32), 24 .maxlen = sizeof(u32),
25 .mode = 0644, 25 .mode = 0644,
26 .proc_handler = proc_dointvec 26 .proc_handler = proc_douintvec
27 }, 27 },
28 { 28 {
29 .procname = "xfrm_larval_drop", 29 .procname = "xfrm_larval_drop",
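
The xfrm_sysctl.c hunk swaps proc_dointvec for proc_douintvec on the two u32 aevent knobs: the signed handler parses and formats the field as a signed int, which both admits negative input and cannot express the upper half of the u32 range, whereas proc_douintvec handles the full unsigned 32-bit range the field can hold. For reference, a u32 sysctl is typically wired up as in the sketch below; my_u32_value and my_table are placeholder names, and the table would still need to be registered (e.g. via register_net_sysctl()) elsewhere.

#include <linux/sysctl.h>

static unsigned int my_u32_value = 100;

static struct ctl_table my_table[] = {
	{
		.procname	= "my_u32_value",
		.data		= &my_u32_value,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec,	/* unsigned parse/format */
	},
	{ }
};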