Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/nhc.c8
-rw-r--r--net/8021q/vlan_dev.c7
-rw-r--r--net/9p/client.c20
-rw-r--r--net/Kconfig10
-rw-r--r--net/Makefile3
-rw-r--r--net/appletalk/ddp.c4
-rw-r--r--net/atm/common.c2
-rw-r--r--net/atm/mpc.c2
-rw-r--r--net/atm/svc.c7
-rw-r--r--net/ax25/af_ax25.c5
-rw-r--r--net/batman-adv/Makefile2
-rw-r--r--net/batman-adv/bat_algo.c2
-rw-r--r--net/batman-adv/bat_algo.h2
-rw-r--r--net/batman-adv/bat_iv_ogm.c13
-rw-r--r--net/batman-adv/bat_iv_ogm.h2
-rw-r--r--net/batman-adv/bat_v.c16
-rw-r--r--net/batman-adv/bat_v.h2
-rw-r--r--net/batman-adv/bat_v_elp.c2
-rw-r--r--net/batman-adv/bat_v_elp.h2
-rw-r--r--net/batman-adv/bat_v_ogm.c2
-rw-r--r--net/batman-adv/bat_v_ogm.h2
-rw-r--r--net/batman-adv/bitarray.c2
-rw-r--r--net/batman-adv/bitarray.h2
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c3
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h20
-rw-r--r--net/batman-adv/debugfs.c4
-rw-r--r--net/batman-adv/debugfs.h2
-rw-r--r--net/batman-adv/distributed-arp-table.c3
-rw-r--r--net/batman-adv/distributed-arp-table.h2
-rw-r--r--net/batman-adv/fragmentation.c42
-rw-r--r--net/batman-adv/fragmentation.h2
-rw-r--r--net/batman-adv/gateway_client.c2
-rw-r--r--net/batman-adv/gateway_client.h2
-rw-r--r--net/batman-adv/gateway_common.c7
-rw-r--r--net/batman-adv/gateway_common.h2
-rw-r--r--net/batman-adv/hard-interface.c2
-rw-r--r--net/batman-adv/hard-interface.h2
-rw-r--r--net/batman-adv/hash.c2
-rw-r--r--net/batman-adv/hash.h2
-rw-r--r--net/batman-adv/icmp_socket.c2
-rw-r--r--net/batman-adv/icmp_socket.h2
-rw-r--r--net/batman-adv/log.c2
-rw-r--r--net/batman-adv/log.h2
-rw-r--r--net/batman-adv/main.c2
-rw-r--r--net/batman-adv/main.h4
-rw-r--r--net/batman-adv/multicast.c2
-rw-r--r--net/batman-adv/multicast.h2
-rw-r--r--net/batman-adv/netlink.c2
-rw-r--r--net/batman-adv/netlink.h2
-rw-r--r--net/batman-adv/network-coding.c2
-rw-r--r--net/batman-adv/network-coding.h2
-rw-r--r--net/batman-adv/originator.c2
-rw-r--r--net/batman-adv/originator.h2
-rw-r--r--net/batman-adv/packet.h2
-rw-r--r--net/batman-adv/routing.c11
-rw-r--r--net/batman-adv/routing.h2
-rw-r--r--net/batman-adv/send.c6
-rw-r--r--net/batman-adv/send.h2
-rw-r--r--net/batman-adv/soft-interface.c8
-rw-r--r--net/batman-adv/soft-interface.h2
-rw-r--r--net/batman-adv/sysfs.c2
-rw-r--r--net/batman-adv/sysfs.h2
-rw-r--r--net/batman-adv/tp_meter.c4
-rw-r--r--net/batman-adv/tp_meter.h2
-rw-r--r--net/batman-adv/translation-table.c4
-rw-r--r--net/batman-adv/translation-table.h2
-rw-r--r--net/batman-adv/tvlv.c2
-rw-r--r--net/batman-adv/tvlv.h2
-rw-r--r--net/batman-adv/types.h6
-rw-r--r--net/bluetooth/6lowpan.c2
-rw-r--r--net/bluetooth/a2mp.c4
-rw-r--r--net/bluetooth/af_bluetooth.c4
-rw-r--r--net/bluetooth/amp.c4
-rw-r--r--net/bluetooth/cmtp/capi.c2
-rw-r--r--net/bluetooth/hci_event.c2
-rw-r--r--net/bluetooth/hci_request.c2
-rw-r--r--net/bluetooth/hci_sock.c6
-rw-r--r--net/bluetooth/l2cap_core.c4
-rw-r--r--net/bluetooth/l2cap_sock.c3
-rw-r--r--net/bluetooth/rfcomm/sock.c4
-rw-r--r--net/bluetooth/sco.c3
-rw-r--r--net/bridge/Makefile5
-rw-r--r--net/bridge/br_device.c31
-rw-r--r--net/bridge/br_fdb.c215
-rw-r--r--net/bridge/br_forward.c44
-rw-r--r--net/bridge/br_if.c3
-rw-r--r--net/bridge/br_input.c18
-rw-r--r--net/bridge/br_ioctl.c2
-rw-r--r--net/bridge/br_mdb.c2
-rw-r--r--net/bridge/br_multicast.c154
-rw-r--r--net/bridge/br_netfilter_hooks.c33
-rw-r--r--net/bridge/br_netlink.c155
-rw-r--r--net/bridge/br_netlink_tunnel.c294
-rw-r--r--net/bridge/br_private.h82
-rw-r--r--net/bridge/br_private_tunnel.h83
-rw-r--r--net/bridge/br_stp.c2
-rw-r--r--net/bridge/br_stp_if.c4
-rw-r--r--net/bridge/br_stp_timer.c2
-rw-r--r--net/bridge/br_sysfs_br.c3
-rw-r--r--net/bridge/br_sysfs_if.c3
-rw-r--r--net/bridge/br_vlan.c24
-rw-r--r--net/bridge/br_vlan_tunnel.c205
-rw-r--r--net/bridge/netfilter/ebt_among.c2
-rw-r--r--net/bridge/netfilter/ebt_limit.c1
-rw-r--r--net/bridge/netfilter/ebt_log.c2
-rw-r--r--net/bridge/netfilter/ebtables.c78
-rw-r--r--net/caif/caif_socket.c2
-rw-r--r--net/caif/chnl_net.c1
-rw-r--r--net/can/af_can.c12
-rw-r--r--net/can/af_can.h3
-rw-r--r--net/can/bcm.c27
-rw-r--r--net/can/gw.c2
-rw-r--r--net/can/raw.c4
-rw-r--r--net/ceph/ceph_common.c15
-rw-r--r--net/ceph/cls_lock_client.c14
-rw-r--r--net/ceph/crush/crush.c5
-rw-r--r--net/ceph/crush/mapper.c227
-rw-r--r--net/ceph/crypto.c2
-rw-r--r--net/ceph/messenger.c54
-rw-r--r--net/ceph/osd_client.c176
-rw-r--r--net/ceph/osdmap.c101
-rw-r--r--net/ceph/snapshot.c2
-rw-r--r--net/compat.c34
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/datagram.c31
-rw-r--r--net/core/dev.c555
-rw-r--r--net/core/devlink.c50
-rw-r--r--net/core/dst.c1
-rw-r--r--net/core/ethtool.c51
-rw-r--r--net/core/filter.c274
-rw-r--r--net/core/flow_dissector.c57
-rw-r--r--net/core/gro_cells.c92
-rw-r--r--net/core/lwt_bpf.c4
-rw-r--r--net/core/lwtunnel.c4
-rw-r--r--net/core/neighbour.c6
-rw-r--r--net/core/net-sysfs.c7
-rw-r--r--net/core/net_namespace.c2
-rw-r--r--net/core/netclassid_cgroup.c34
-rw-r--r--net/core/netpoll.c10
-rw-r--r--net/core/netprio_cgroup.c3
-rw-r--r--net/core/pktgen.c4
-rw-r--r--net/core/request_sock.c2
-rw-r--r--net/core/rtnetlink.c78
-rw-r--r--net/core/scm.c3
-rw-r--r--net/core/secure_seq.c174
-rw-r--r--net/core/skbuff.c94
-rw-r--r--net/core/sock.c153
-rw-r--r--net/core/stream.c1
-rw-r--r--net/core/sysctl_net_core.c44
-rw-r--r--net/dccp/ccids/ccid2.c1
-rw-r--r--net/dccp/input.c13
-rw-r--r--net/dccp/ipv4.c10
-rw-r--r--net/dccp/ipv6.c16
-rw-r--r--net/dccp/minisocks.c29
-rw-r--r--net/dccp/output.c1
-rw-r--r--net/decnet/af_decnet.c7
-rw-r--r--net/dns_resolver/dns_query.c6
-rw-r--r--net/dsa/Kconfig16
-rw-r--r--net/dsa/Makefile2
-rw-r--r--net/dsa/dsa.c262
-rw-r--r--net/dsa/dsa2.c251
-rw-r--r--net/dsa/dsa_priv.h25
-rw-r--r--net/dsa/slave.c475
-rw-r--r--net/dsa/switch.c85
-rw-r--r--net/dsa/tag_brcm.c11
-rw-r--r--net/dsa/tag_dsa.c10
-rw-r--r--net/dsa/tag_edsa.c10
-rw-r--r--net/dsa/tag_qca.c4
-rw-r--r--net/dsa/tag_trailer.c6
-rw-r--r--net/ethernet/eth.c31
-rw-r--r--net/hsr/hsr_device.c2
-rw-r--r--net/hsr/hsr_slave.c3
-rw-r--r--net/ieee802154/socket.c4
-rw-r--r--net/ife/Kconfig16
-rw-r--r--net/ife/Makefile5
-rw-r--r--net/ife/ife.c142
-rw-r--r--net/ipv4/Kconfig14
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c53
-rw-r--r--net/ipv4/ah4.c3
-rw-r--r--net/ipv4/arp.c12
-rw-r--r--net/ipv4/cipso_ipv4.c4
-rw-r--r--net/ipv4/devinet.c3
-rw-r--r--net/ipv4/esp4.c332
-rw-r--r--net/ipv4/esp4_offload.c106
-rw-r--r--net/ipv4/fib_frontend.c10
-rw-r--r--net/ipv4/fib_semantics.c62
-rw-r--r--net/ipv4/fib_trie.c44
-rw-r--r--net/ipv4/icmp.c125
-rw-r--r--net/ipv4/igmp.c1
-rw-r--r--net/ipv4/inet_connection_sock.c280
-rw-r--r--net/ipv4/inet_diag.c2
-rw-r--r--net/ipv4/inet_hashtables.c19
-rw-r--r--net/ipv4/inet_timewait_sock.c3
-rw-r--r--net/ipv4/ip_fragment.c25
-rw-r--r--net/ipv4/ip_output.c13
-rw-r--r--net/ipv4/ip_sockglue.c36
-rw-r--r--net/ipv4/ip_tunnel_core.c10
-rw-r--r--net/ipv4/ipconfig.c2
-rw-r--r--net/ipv4/ipmr.c279
-rw-r--r--net/ipv4/netfilter.c7
-rw-r--r--net/ipv4/netfilter/arp_tables.c15
-rw-r--r--net/ipv4/netfilter/ip_tables.c21
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c3
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c11
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c15
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c7
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c4
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c5
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c20
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c8
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c8
-rw-r--r--net/ipv4/ping.c16
-rw-r--r--net/ipv4/proc.c5
-rw-r--r--net/ipv4/raw.c8
-rw-r--r--net/ipv4/route.c48
-rw-r--r--net/ipv4/syncookies.c21
-rw-r--r--net/ipv4/sysctl_net_ipv4.c110
-rw-r--r--net/ipv4/tcp.c110
-rw-r--r--net/ipv4/tcp_cdg.c2
-rw-r--r--net/ipv4/tcp_fastopen.c54
-rw-r--r--net/ipv4/tcp_input.c314
-rw-r--r--net/ipv4/tcp_ipv4.c64
-rw-r--r--net/ipv4/tcp_metrics.c8
-rw-r--r--net/ipv4/tcp_minisocks.c26
-rw-r--r--net/ipv4/tcp_output.c149
-rw-r--r--net/ipv4/tcp_recovery.c149
-rw-r--r--net/ipv4/tcp_timer.c11
-rw-r--r--net/ipv4/udp.c119
-rw-r--r--net/ipv4/xfrm4_input.c6
-rw-r--r--net/ipv4/xfrm4_mode_transport.c4
-rw-r--r--net/ipv4/xfrm4_policy.c9
-rw-r--r--net/ipv4/xfrm4_protocol.c3
-rw-r--r--net/ipv4/xfrm4_state.c8
-rw-r--r--net/ipv6/Kconfig14
-rw-r--r--net/ipv6/Makefile1
-rw-r--r--net/ipv6/addrconf.c152
-rw-r--r--net/ipv6/af_inet6.c13
-rw-r--r--net/ipv6/ah6.c3
-rw-r--r--net/ipv6/datagram.c24
-rw-r--r--net/ipv6/esp6.c318
-rw-r--r--net/ipv6/esp6_offload.c108
-rw-r--r--net/ipv6/exthdrs.c32
-rw-r--r--net/ipv6/icmp.c68
-rw-r--r--net/ipv6/ila/ila_lwt.c2
-rw-r--r--net/ipv6/inet6_connection_sock.c40
-rw-r--r--net/ipv6/inet6_hashtables.c46
-rw-r--r--net/ipv6/ip6_fib.c24
-rw-r--r--net/ipv6/ip6_gre.c48
-rw-r--r--net/ipv6/ip6_input.c7
-rw-r--r--net/ipv6/ip6_offload.c6
-rw-r--r--net/ipv6/ip6_output.c25
-rw-r--r--net/ipv6/ip6_tunnel.c2
-rw-r--r--net/ipv6/ip6_vti.c16
-rw-r--r--net/ipv6/ip6mr.c37
-rw-r--r--net/ipv6/ipv6_sockglue.c16
-rw-r--r--net/ipv6/mcast.c1
-rw-r--r--net/ipv6/netfilter/ip6_tables.c21
-rw-r--r--net/ipv6/netfilter/ip6t_NPT.c2
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c11
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c20
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c1
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c4
-rw-r--r--net/ipv6/netfilter/nf_dup_ipv6.c7
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c4
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c8
-rw-r--r--net/ipv6/netfilter/nft_redir_ipv6.c8
-rw-r--r--net/ipv6/ping.c7
-rw-r--r--net/ipv6/raw.c6
-rw-r--r--net/ipv6/route.c355
-rw-r--r--net/ipv6/seg6.c3
-rw-r--r--net/ipv6/seg6_hmac.c54
-rw-r--r--net/ipv6/seg6_iptunnel.c10
-rw-r--r--net/ipv6/sit.c1
-rw-r--r--net/ipv6/syncookies.c41
-rw-r--r--net/ipv6/tcp_ipv6.c82
-rw-r--r--net/ipv6/udp.c40
-rw-r--r--net/ipv6/xfrm6_input.c22
-rw-r--r--net/ipv6/xfrm6_mode_transport.c4
-rw-r--r--net/ipv6/xfrm6_policy.c9
-rw-r--r--net/ipv6/xfrm6_protocol.c3
-rw-r--r--net/irda/af_irda.c6
-rw-r--r--net/irda/ircomm/ircomm_tty.c2
-rw-r--r--net/irda/irnet/irnet_ppp.c15
-rw-r--r--net/irda/irqueue.c34
-rw-r--r--net/iucv/af_iucv.c4
-rw-r--r--net/kcm/kcmsock.c54
-rw-r--r--net/key/af_key.c93
-rw-r--r--net/l2tp/l2tp_core.c168
-rw-r--r--net/l2tp/l2tp_core.h10
-rw-r--r--net/l2tp/l2tp_debugfs.c10
-rw-r--r--net/l2tp/l2tp_eth.c16
-rw-r--r--net/l2tp/l2tp_ip.c80
-rw-r--r--net/l2tp/l2tp_ip6.c58
-rw-r--r--net/l2tp/l2tp_netlink.c52
-rw-r--r--net/l2tp/l2tp_ppp.c103
-rw-r--r--net/llc/af_llc.c6
-rw-r--r--net/llc/llc_conn.c3
-rw-r--r--net/llc/llc_sap.c3
-rw-r--r--net/mac80211/Kconfig1
-rw-r--r--net/mac80211/aes_cmac.c126
-rw-r--r--net/mac80211/aes_cmac.h15
-rw-r--r--net/mac80211/agg-rx.c3
-rw-r--r--net/mac80211/cfg.c16
-rw-r--r--net/mac80211/chan.c4
-rw-r--r--net/mac80211/debugfs.c35
-rw-r--r--net/mac80211/debugfs_netdev.c3
-rw-r--r--net/mac80211/debugfs_sta.c1
-rw-r--r--net/mac80211/fils_aead.c80
-rw-r--r--net/mac80211/ibss.c4
-rw-r--r--net/mac80211/ieee80211_i.h8
-rw-r--r--net/mac80211/iface.c7
-rw-r--r--net/mac80211/key.h2
-rw-r--r--net/mac80211/mesh.c11
-rw-r--r--net/mac80211/mesh.h2
-rw-r--r--net/mac80211/mesh_plink.c16
-rw-r--r--net/mac80211/mesh_sync.c27
-rw-r--r--net/mac80211/mlme.c13
-rw-r--r--net/mac80211/pm.c1
-rw-r--r--net/mac80211/rc80211_minstrel.c21
-rw-r--r--net/mac80211/rc80211_minstrel.h33
-rw-r--r--net/mac80211/rc80211_minstrel_debugfs.c24
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c68
-rw-r--r--net/mac80211/rc80211_minstrel_ht.h6
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c32
-rw-r--r--net/mac80211/rx.c123
-rw-r--r--net/mac80211/scan.c8
-rw-r--r--net/mac80211/sta_info.c22
-rw-r--r--net/mac80211/sta_info.h8
-rw-r--r--net/mac80211/status.c17
-rw-r--r--net/mac80211/trace.h27
-rw-r--r--net/mac80211/tx.c136
-rw-r--r--net/mac80211/vht.c4
-rw-r--r--net/mac80211/wep.c3
-rw-r--r--net/mac80211/wpa.c3
-rw-r--r--net/mac802154/llsec.c2
-rw-r--r--net/mpls/af_mpls.c406
-rw-r--r--net/mpls/internal.h58
-rw-r--r--net/mpls/mpls_iptunnel.c13
-rw-r--r--net/netfilter/Kconfig12
-rw-r--r--net/netfilter/Makefile3
-rw-r--r--net/netfilter/core.c2
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h2
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c9
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c15
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c2
-rw-r--r--net/netfilter/nf_conntrack_core.c79
-rw-r--r--net/netfilter/nf_conntrack_ecache.c2
-rw-r--r--net/netfilter/nf_conntrack_expect.c18
-rw-r--r--net/netfilter/nf_conntrack_extend.c13
-rw-r--r--net/netfilter/nf_conntrack_ftp.c2
-rw-r--r--net/netfilter/nf_conntrack_helper.c56
-rw-r--r--net/netfilter/nf_conntrack_netlink.c75
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c1
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c32
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c1
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c124
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c324
-rw-r--r--net/netfilter/nf_conntrack_sip.c14
-rw-r--r--net/netfilter/nf_conntrack_standalone.c3
-rw-r--r--net/netfilter/nf_log.c24
-rw-r--r--net/netfilter/nf_nat_core.c2
-rw-r--r--net/netfilter/nf_nat_helper.c2
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c13
-rw-r--r--net/netfilter/nf_nat_proto_udp.c78
-rw-r--r--net/netfilter/nf_nat_proto_udplite.c73
-rw-r--r--net/netfilter/nf_nat_redirect.c2
-rw-r--r--net/netfilter/nf_tables_api.c281
-rw-r--r--net/netfilter/nfnetlink.c92
-rw-r--r--net/netfilter/nfnetlink_cthelper.c289
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c2
-rw-r--r--net/netfilter/nfnetlink_queue.c9
-rw-r--r--net/netfilter/nft_ct.c234
-rw-r--r--net/netfilter/nft_exthdr.c139
-rw-r--r--net/netfilter/nft_hash.c10
-rw-r--r--net/netfilter/nft_meta.c65
-rw-r--r--net/netfilter/nft_nat.c8
-rw-r--r--net/netfilter/nft_set_bitmap.c307
-rw-r--r--net/netfilter/nft_set_hash.c16
-rw-r--r--net/netfilter/nft_set_rbtree.c25
-rw-r--r--net/netfilter/x_tables.c70
-rw-r--r--net/netfilter/xt_CT.c15
-rw-r--r--net/netfilter/xt_RATEEST.c1
-rw-r--r--net/netfilter/xt_TCPMSS.c6
-rw-r--r--net/netfilter/xt_TEE.c2
-rw-r--r--net/netfilter/xt_TPROXY.c5
-rw-r--r--net/netfilter/xt_bpf.c2
-rw-r--r--net/netfilter/xt_cgroup.c1
-rw-r--r--net/netfilter/xt_connlimit.c5
-rw-r--r--net/netfilter/xt_hashlimit.c29
-rw-r--r--net/netfilter/xt_limit.c2
-rw-r--r--net/netfilter/xt_owner.c2
-rw-r--r--net/netfilter/xt_pkttype.c3
-rw-r--r--net/netfilter/xt_quota.c1
-rw-r--r--net/netfilter/xt_rateest.c1
-rw-r--r--net/netfilter/xt_string.c1
-rw-r--r--net/netlink/af_netlink.c47
-rw-r--r--net/netlink/genetlink.c4
-rw-r--r--net/netrom/af_netrom.c5
-rw-r--r--net/nfc/llcp_sock.c3
-rw-r--r--net/openvswitch/actions.c47
-rw-r--r--net/openvswitch/conntrack.c312
-rw-r--r--net/openvswitch/conntrack.h14
-rw-r--r--net/openvswitch/flow.c44
-rw-r--r--net/openvswitch/flow.h55
-rw-r--r--net/openvswitch/flow_netlink.c96
-rw-r--r--net/openvswitch/flow_netlink.h7
-rw-r--r--net/openvswitch/vport-internal_dev.c6
-rw-r--r--net/packet/af_packet.c137
-rw-r--r--net/packet/diag.c3
-rw-r--r--net/phonet/pep.c7
-rw-r--r--net/phonet/socket.c6
-rw-r--r--net/psample/Kconfig15
-rw-r--r--net/psample/Makefile5
-rw-r--r--net/psample/psample.c301
-rw-r--r--net/qrtr/qrtr.c4
-rw-r--r--net/rds/af_rds.c31
-rw-r--r--net/rds/bind.c4
-rw-r--r--net/rds/connection.c11
-rw-r--r--net/rds/ib.c20
-rw-r--r--net/rds/ib.h30
-rw-r--r--net/rds/ib_cm.c136
-rw-r--r--net/rds/ib_frmr.c16
-rw-r--r--net/rds/ib_mr.h3
-rw-r--r--net/rds/ib_recv.c14
-rw-r--r--net/rds/ib_send.c30
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/page.c29
-rw-r--r--net/rds/rdma.c22
-rw-r--r--net/rds/rdma_transport.c11
-rw-r--r--net/rds/rds.h32
-rw-r--r--net/rds/recv.c36
-rw-r--r--net/rds/send.c50
-rw-r--r--net/rds/tcp.c45
-rw-r--r--net/rds/tcp.h2
-rw-r--r--net/rds/tcp_listen.c12
-rw-r--r--net/rds/tcp_recv.c5
-rw-r--r--net/rds/transport.c4
-rw-r--r--net/rfkill/Kconfig11
-rw-r--r--net/rfkill/Makefile1
-rw-r--r--net/rfkill/core.c100
-rw-r--r--net/rfkill/rfkill-regulator.c154
-rw-r--r--net/rose/af_rose.c5
-rw-r--r--net/rxrpc/Makefile12
-rw-r--r--net/rxrpc/af_rxrpc.c20
-rw-r--r--net/rxrpc/ar-internal.h196
-rw-r--r--net/rxrpc/call_accept.c51
-rw-r--r--net/rxrpc/call_object.c36
-rw-r--r--net/rxrpc/conn_client.c10
-rw-r--r--net/rxrpc/conn_event.c4
-rw-r--r--net/rxrpc/conn_object.c1
-rw-r--r--net/rxrpc/input.c44
-rw-r--r--net/rxrpc/key.c2
-rw-r--r--net/rxrpc/misc.c151
-rw-r--r--net/rxrpc/proc.c9
-rw-r--r--net/rxrpc/recvmsg.c53
-rw-r--r--net/rxrpc/sendmsg.c99
-rw-r--r--net/sched/Kconfig14
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_api.c70
-rw-r--r--net/sched/act_connmark.c3
-rw-r--r--net/sched/act_csum.c30
-rw-r--r--net/sched/act_ife.c117
-rw-r--r--net/sched/act_mirred.c23
-rw-r--r--net/sched/act_pedit.c220
-rw-r--r--net/sched/act_sample.c276
-rw-r--r--net/sched/act_skbmod.c1
-rw-r--r--net/sched/cls_api.c190
-rw-r--r--net/sched/cls_bpf.c13
-rw-r--r--net/sched/cls_flow.c2
-rw-r--r--net/sched/cls_flower.c106
-rw-r--r--net/sched/cls_matchall.c160
-rw-r--r--net/sched/cls_u32.c11
-rw-r--r--net/sched/em_meta.c1
-rw-r--r--net/sched/sch_api.c39
-rw-r--r--net/sched/sch_atm.c1
-rw-r--r--net/sched/sch_cbq.c1
-rw-r--r--net/sched/sch_choke.c1
-rw-r--r--net/sched/sch_dsmark.c11
-rw-r--r--net/sched/sch_fq_codel.c7
-rw-r--r--net/sched/sch_generic.c4
-rw-r--r--net/sched/sch_hhf.c8
-rw-r--r--net/sched/sch_htb.c1
-rw-r--r--net/sched/sch_ingress.c1
-rw-r--r--net/sched/sch_mq.c10
-rw-r--r--net/sched/sch_mqprio.c19
-rw-r--r--net/sched/sch_multiq.c2
-rw-r--r--net/sched/sch_netem.c2
-rw-r--r--net/sched/sch_prio.c2
-rw-r--r--net/sched/sch_sfb.c1
-rw-r--r--net/sched/sch_sfq.c4
-rw-r--r--net/sched/sch_teql.c5
-rw-r--r--net/sctp/Makefile2
-rw-r--r--net/sctp/associola.c47
-rw-r--r--net/sctp/chunk.c107
-rw-r--r--net/sctp/debug.c5
-rw-r--r--net/sctp/endpointola.c1
-rw-r--r--net/sctp/input.c37
-rw-r--r--net/sctp/ipv6.c21
-rw-r--r--net/sctp/objcnt.c2
-rw-r--r--net/sctp/output.c99
-rw-r--r--net/sctp/outqueue.c49
-rw-r--r--net/sctp/primitive.c3
-rw-r--r--net/sctp/proc.c4
-rw-r--r--net/sctp/protocol.c32
-rw-r--r--net/sctp/sm_make_chunk.c361
-rw-r--r--net/sctp/sm_sideeffect.c38
-rw-r--r--net/sctp/sm_statefuns.c232
-rw-r--r--net/sctp/sm_statetable.c70
-rw-r--r--net/sctp/socket.c220
-rw-r--r--net/sctp/ssnmap.c125
-rw-r--r--net/sctp/stream.c506
-rw-r--r--net/sctp/transport.c56
-rw-r--r--net/sctp/ulpevent.c29
-rw-r--r--net/sctp/ulpqueue.c36
-rw-r--r--net/smc/Kconfig20
-rw-r--r--net/smc/Makefile4
-rw-r--r--net/smc/af_smc.c1409
-rw-r--r--net/smc/smc.h274
-rw-r--r--net/smc/smc_cdc.c304
-rw-r--r--net/smc/smc_cdc.h218
-rw-r--r--net/smc/smc_clc.c282
-rw-r--r--net/smc/smc_clc.h116
-rw-r--r--net/smc/smc_close.c444
-rw-r--r--net/smc/smc_close.h28
-rw-r--r--net/smc/smc_core.c682
-rw-r--r--net/smc/smc_core.h181
-rw-r--r--net/smc/smc_diag.c215
-rw-r--r--net/smc/smc_ib.c466
-rw-r--r--net/smc/smc_ib.h71
-rw-r--r--net/smc/smc_llc.c158
-rw-r--r--net/smc/smc_llc.h63
-rw-r--r--net/smc/smc_pnet.c534
-rw-r--r--net/smc/smc_pnet.h23
-rw-r--r--net/smc/smc_rx.c219
-rw-r--r--net/smc/smc_rx.h23
-rw-r--r--net/smc/smc_tx.c485
-rw-r--r--net/smc/smc_tx.h35
-rw-r--r--net/smc/smc_wr.c614
-rw-r--r--net/smc/smc_wr.h106
-rw-r--r--net/socket.c30
-rw-r--r--net/strparser/strparser.c1
-rw-r--r--net/sunrpc/auth.c16
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c2
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c2
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c4
-rw-r--r--net/sunrpc/auth_null.c3
-rw-r--r--net/sunrpc/auth_unix.c18
-rw-r--r--net/sunrpc/cache.c123
-rw-r--r--net/sunrpc/clnt.c51
-rw-r--r--net/sunrpc/debugfs.c35
-rw-r--r--net/sunrpc/svc.c28
-rw-r--r--net/sunrpc/svc_xprt.c6
-rw-r--r--net/sunrpc/svcauth.c15
-rw-r--r--net/sunrpc/svcauth_unix.c4
-rw-r--r--net/sunrpc/svcsock.c6
-rw-r--r--net/sunrpc/xdr.c34
-rw-r--r--net/sunrpc/xprt.c2
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c5
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c11
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c82
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c18
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c299
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c20
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c22
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c74
-rw-r--r--net/sunrpc/xprtrdma/transport.c6
-rw-r--r--net/sunrpc/xprtrdma/verbs.c97
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h30
-rw-r--r--net/sunrpc/xprtsock.c102
-rw-r--r--net/tipc/bcast.c204
-rw-r--r--net/tipc/bcast.h33
-rw-r--r--net/tipc/bearer.c15
-rw-r--r--net/tipc/bearer.h8
-rw-r--r--net/tipc/link.c87
-rw-r--r--net/tipc/msg.c17
-rw-r--r--net/tipc/msg.h11
-rw-r--r--net/tipc/name_table.c128
-rw-r--r--net/tipc/name_table.h24
-rw-r--r--net/tipc/net.c4
-rw-r--r--net/tipc/node.c54
-rw-r--r--net/tipc/node.h4
-rw-r--r--net/tipc/socket.c535
-rw-r--r--net/tipc/subscr.c7
-rw-r--r--net/tipc/udp_media.c8
-rw-r--r--net/unix/af_unix.c48
-rw-r--r--net/unix/garbage.c17
-rw-r--r--net/vmw_vsock/af_vsock.c18
-rw-r--r--net/vmw_vsock/virtio_transport.c45
-rw-r--r--net/vmw_vsock/virtio_transport_common.c8
-rw-r--r--net/wireless/Makefile1
-rw-r--r--net/wireless/core.c6
-rw-r--r--net/wireless/core.h8
-rw-r--r--net/wireless/debugfs.c10
-rw-r--r--net/wireless/mlme.c29
-rw-r--r--net/wireless/nl80211.c379
-rw-r--r--net/wireless/nl80211.h10
-rw-r--r--net/wireless/of.c138
-rw-r--r--net/wireless/reg.c27
-rw-r--r--net/wireless/scan.c9
-rw-r--r--net/wireless/sme.c72
-rw-r--r--net/wireless/sysfs.c16
-rw-r--r--net/wireless/trace.h27
-rw-r--r--net/wireless/util.c32
-rw-r--r--net/wireless/wext-core.c67
-rw-r--r--net/wireless/wext-sme.c23
-rw-r--r--net/x25/af_x25.c5
-rw-r--r--net/xfrm/Kconfig5
-rw-r--r--net/xfrm/xfrm_input.c111
-rw-r--r--net/xfrm/xfrm_output.c8
-rw-r--r--net/xfrm/xfrm_policy.c172
-rw-r--r--net/xfrm/xfrm_state.c86
-rw-r--r--net/xfrm/xfrm_user.c9
621 files changed, 23232 insertions, 8402 deletions
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c
index 7008d53e455c..4fa2fdda174d 100644
--- a/net/6lowpan/nhc.c
+++ b/net/6lowpan/nhc.c
@@ -27,8 +27,8 @@ static int lowpan_nhc_insert(struct lowpan_nhc *nhc)
 
 	/* Figure out where to put new node */
 	while (*new) {
-		struct lowpan_nhc *this = container_of(*new, struct lowpan_nhc,
+		struct lowpan_nhc *this = rb_entry(*new, struct lowpan_nhc,
 						       node);
 		int result, len_dif, len;
 
 		len_dif = nhc->idlen - this->idlen;
@@ -69,8 +69,8 @@ static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb)
 	const u8 *nhcid_skb_ptr = skb->data;
 
 	while (node) {
-		struct lowpan_nhc *nhc = container_of(node, struct lowpan_nhc,
+		struct lowpan_nhc *nhc = rb_entry(node, struct lowpan_nhc,
 						      node);
 		u8 nhcid_skb_ptr_masked[LOWPAN_NHC_MAX_ID_LEN];
 		int result, i;
 
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 10da6c588bf8..e97ab824e368 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -671,7 +671,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev,
 	return 0;
 }
 
-static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+static void vlan_dev_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct vlan_pcpu_stats *p;
 	u32 rx_errors = 0, tx_dropped = 0;
@@ -702,8 +703,6 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st
 	}
 	stats->rx_errors = rx_errors;
 	stats->tx_dropped = tx_dropped;
-
-	return stats;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -792,8 +791,6 @@ static const struct net_device_ops vlan_netdev_ops = {
 	.ndo_netpoll_cleanup	= vlan_dev_netpoll_cleanup,
 #endif
 	.ndo_fix_features	= vlan_dev_fix_features,
-	.ndo_neigh_construct	= netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	= netdev_default_l2upper_neigh_destroy,
 	.ndo_fdb_add		= switchdev_port_fdb_add,
 	.ndo_fdb_del		= switchdev_port_fdb_del,
 	.ndo_fdb_dump		= switchdev_port_fdb_dump,
diff --git a/net/9p/client.c b/net/9p/client.c
index 3fc94a49ccd5..3ce672af1596 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -32,7 +32,7 @@
 #include <linux/idr.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <linux/uio.h>
 #include <net/9p/9p.h>
@@ -1101,7 +1101,7 @@ void p9_client_begin_disconnect(struct p9_client *clnt)
 EXPORT_SYMBOL(p9_client_begin_disconnect);
 
 struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
-				char *uname, kuid_t n_uname, char *aname)
+				const char *uname, kuid_t n_uname, const char *aname)
 {
 	int err = 0;
 	struct p9_req_t *req;
@@ -1149,7 +1149,7 @@ error:
 EXPORT_SYMBOL(p9_client_attach);
 
 struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
-			      char **wnames, int clone)
+			      const unsigned char * const *wnames, int clone)
 {
 	int err;
 	struct p9_client *clnt;
@@ -1271,7 +1271,7 @@ error:
 }
 EXPORT_SYMBOL(p9_client_open);
 
-int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode,
+int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 mode,
 		kgid_t gid, struct p9_qid *qid)
 {
 	int err = 0;
@@ -1316,7 +1316,7 @@ error:
 }
 EXPORT_SYMBOL(p9_client_create_dotl);
 
-int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode,
+int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode,
 		     char *extension)
 {
 	int err;
@@ -1361,8 +1361,8 @@ error:
 }
 EXPORT_SYMBOL(p9_client_fcreate);
 
-int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, kgid_t gid,
-		struct p9_qid *qid)
+int p9_client_symlink(struct p9_fid *dfid, const char *name,
+		const char *symtgt, kgid_t gid, struct p9_qid *qid)
 {
 	int err = 0;
 	struct p9_client *clnt;
@@ -1395,7 +1395,7 @@ error:
 }
 EXPORT_SYMBOL(p9_client_symlink);
 
-int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname)
+int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newname)
 {
 	struct p9_client *clnt;
 	struct p9_req_t *req;
@@ -2117,7 +2117,7 @@ error:
 }
 EXPORT_SYMBOL(p9_client_readdir);
 
-int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode,
+int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode,
 			dev_t rdev, kgid_t gid, struct p9_qid *qid)
 {
 	int err;
@@ -2148,7 +2148,7 @@ error:
 }
 EXPORT_SYMBOL(p9_client_mknod_dotl);
 
-int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode,
+int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
 			kgid_t gid, struct p9_qid *qid)
 {
 	int err;
diff --git a/net/Kconfig b/net/Kconfig
index a29bb4b41c50..102f781a0131 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -57,6 +57,7 @@ source "net/packet/Kconfig"
 source "net/unix/Kconfig"
 source "net/xfrm/Kconfig"
 source "net/iucv/Kconfig"
+source "net/smc/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
@@ -296,7 +297,8 @@ config BPF_JIT
 
 	  Note, admin should enable this feature changing:
 	  /proc/sys/net/core/bpf_jit_enable
 	  /proc/sys/net/core/bpf_jit_harden   (optional)
+	  /proc/sys/net/core/bpf_jit_kallsyms (optional)
 
 config NET_FLOW_LIMIT
 	bool
@@ -389,6 +391,8 @@ source "net/9p/Kconfig"
 source "net/caif/Kconfig"
 source "net/ceph/Kconfig"
 source "net/nfc/Kconfig"
+source "net/psample/Kconfig"
+source "net/ife/Kconfig"
 
 config LWTUNNEL
 	bool "Network light weight tunnels"
@@ -410,6 +414,10 @@ config DST_CACHE
 	bool
 	default n
 
+config GRO_CELLS
+	bool
+	default n
+
 config NET_DEVLINK
 	tristate "Network physical/parent device Netlink interface"
 	help
diff --git a/net/Makefile b/net/Makefile
index 4cafaa2b4667..9b681550e3a3 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_MAC80211) += mac80211/
 obj-$(CONFIG_TIPC)		+= tipc/
 obj-$(CONFIG_NETLABEL)		+= netlabel/
 obj-$(CONFIG_IUCV)		+= iucv/
+obj-$(CONFIG_SMC)		+= smc/
 obj-$(CONFIG_RFKILL)		+= rfkill/
 obj-$(CONFIG_NET_9P)		+= 9p/
 obj-$(CONFIG_CAIF)		+= caif/
@@ -69,6 +70,8 @@ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
 obj-$(CONFIG_CEPH_LIB)		+= ceph/
 obj-$(CONFIG_BATMAN_ADV)	+= batman-adv/
 obj-$(CONFIG_NFC)		+= nfc/
+obj-$(CONFIG_PSAMPLE)		+= psample/
+obj-$(CONFIG_NET_IFE)		+= ife/
 obj-$(CONFIG_OPENVSWITCH)	+= openvswitch/
 obj-$(CONFIG_VSOCKETS)		+= vmw_vsock/
 obj-$(CONFIG_MPLS)		+= mpls/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 10d2bdce686e..465cc24b41e5 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1656,7 +1656,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	ddp->deh_dport = usat->sat_port;
 	ddp->deh_sport = at->src_port;
 
-	SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len);
+	SOCK_DEBUG(sk, "SK %p: Copy user data (%zd bytes).\n", sk, len);
 
 	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 	if (err) {
@@ -1720,7 +1720,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 		 */
 		aarp_send_ddp(dev, skb, &usat->sat_addr, NULL);
 	}
-	SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len);
+	SOCK_DEBUG(sk, "SK %p: Done write (%zd).\n", sk, len);
 
 out:
 	release_sock(sk);
diff --git a/net/atm/common.c b/net/atm/common.c
index a3ca922d307b..9613381f5db0 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -13,7 +13,7 @@
 #include <linux/errno.h>	/* error codes */
 #include <linux/capability.h>
 #include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/time.h>		/* struct timeval */
 #include <linux/skbuff.h>
 #include <linux/bitops.h>
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 3b3b1a292ec8..a190800572bd 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -451,7 +451,7 @@ static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr,
 		return;
 	}
 	if (end_of_tlvs - tlvs != 0)
-		pr_info("(%s) ignoring %Zd bytes of trailing TLV garbage\n",
+		pr_info("(%s) ignoring %zd bytes of trailing TLV garbage\n",
 			dev->name, end_of_tlvs - tlvs);
 }
 
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 878563a8354d..5589de7086af 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -10,7 +10,7 @@
 #include <linux/kernel.h>	/* printk */
 #include <linux/skbuff.h>
 #include <linux/wait.h>
-#include <linux/sched.h>	/* jiffies and HZ */
+#include <linux/sched/signal.h>
 #include <linux/fcntl.h>	/* O_NONBLOCK */
 #include <linux/init.h>
 #include <linux/atm.h>		/* ATM stuff */
@@ -318,7 +318,8 @@ out:
 	return error;
 }
 
-static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
+static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
+		      bool kern)
 {
 	struct sock *sk = sock->sk;
 	struct sk_buff *skb;
@@ -329,7 +330,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	lock_sock(sk);
 
-	error = svc_create(sock_net(sk), newsock, 0, 0);
+	error = svc_create(sock_net(sk), newsock, 0, kern);
 	if (error)
 		goto out;
 
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 90fcf5fc2e0a..b7c486752b3a 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -20,7 +20,7 @@
 #include <linux/socket.h>
 #include <linux/in.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/sockios.h>
@@ -1320,7 +1320,8 @@ out_release:
 	return err;
 }
 
-static int ax25_accept(struct socket *sock, struct socket *newsock, int flags)
+static int ax25_accept(struct socket *sock, struct socket *newsock, int flags,
+		       bool kern)
 {
 	struct sk_buff *skb;
 	struct sock *newsk;
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index f724d3c98a81..915987bc6d29 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 2# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
3# 3#
4# Marek Lindner, Simon Wunderlich 4# Marek Lindner, Simon Wunderlich
5# 5#
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 623d04302aa2..44fd073b7546 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 3b5b69cdd12b..29f6312f9bf1 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Linus Lüssing 3 * Marek Lindner, Linus Lüssing
4 * 4 *
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index f00f666e2ccd..71343d0fec94 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -2477,6 +2477,16 @@ static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface)
 	batadv_iv_ogm_schedule(hard_iface);
 }
 
+/**
+ * batadv_iv_init_sel_class - initialize GW selection class
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv)
+{
+	/* set default TQ difference threshold to 20 */
+	atomic_set(&bat_priv->gw.sel_class, 20);
+}
+
 static struct batadv_gw_node *
 batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
 {
@@ -2823,6 +2833,7 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
 		.del_if = batadv_iv_ogm_orig_del_if,
 	},
 	.gw = {
+		.init_sel_class = batadv_iv_init_sel_class,
 		.get_best_gw_node = batadv_iv_gw_get_best_gw_node,
 		.is_eligible = batadv_iv_gw_is_eligible,
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index b9f3550faaf7..ae2ab526bdb1 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 2ac612d7bab4..a36c8e7291d6 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Linus Lüssing, Marek Lindner 3 * Linus Lüssing, Marek Lindner
4 * 4 *
@@ -668,6 +668,16 @@ err_ifinfo1:
 	return ret;
 }
 
+/**
+ * batadv_v_init_sel_class - initialize GW selection class
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_v_init_sel_class(struct batadv_priv *bat_priv)
+{
+	/* set default throughput difference threshold to 5Mbps */
+	atomic_set(&bat_priv->gw.sel_class, 50);
+}
+
 static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv,
 					char *buff, size_t count)
 {
@@ -1052,6 +1062,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
 		.dump = batadv_v_orig_dump,
 	},
 	.gw = {
+		.init_sel_class = batadv_v_init_sel_class,
 		.store_sel_class = batadv_v_store_sel_class,
 		.show_sel_class = batadv_v_show_sel_class,
 		.get_best_gw_node = batadv_v_gw_get_best_gw_node,
@@ -1092,9 +1103,6 @@ int batadv_v_mesh_init(struct batadv_priv *bat_priv)
 	if (ret < 0)
 		return ret;
 
-	/* set default throughput difference threshold to 5Mbps */
-	atomic_set(&bat_priv->gw.sel_class, 50);
-
 	return 0;
 }
 
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index 83b77639729e..dd7c4b647e6b 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Linus Lüssing 3 * Marek Lindner, Linus Lüssing
4 * 4 *
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index f2fb2f05b6bf..b90c9903e246 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Linus Lüssing, Marek Lindner 3 * Linus Lüssing, Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index be17c0b1369e..376ead280ab9 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Linus Lüssing, Marek Lindner 3 * Linus Lüssing, Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 38b9aab83fc0..03a35c9f456d 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Antonio Quartulli 3 * Antonio Quartulli
4 * 4 *
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index 4c4d45caa422..2068770b542d 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Antonio Quartulli 3 * Antonio Quartulli
4 * 4 *
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 032271421a20..2b070c7e31da 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich, Marek Lindner 3 * Simon Wunderlich, Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 0e6e9d09078c..cc262c9d97e0 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich, Marek Lindner 3 * Simon Wunderlich, Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index e7f690b571ea..ba8420d8a992 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich 3 * Simon Wunderlich
4 * 4 *
@@ -449,7 +449,6 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
 	batadv_inc_counter(bat_priv, BATADV_CNT_RX);
 	batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
 			   skb->len + ETH_HLEN);
-	soft_iface->last_rx = jiffies;
 
 	netif_rx(skb);
 out:
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 1ae93e46fb98..e157986bd01c 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich 3 * Simon Wunderlich
4 * 4 *
@@ -20,6 +20,8 @@
 
 #include "main.h"
 
+#include <linux/compiler.h>
+#include <linux/stddef.h>
 #include <linux/types.h>
 
 struct net_device;
@@ -27,6 +29,22 @@ struct netlink_callback;
 struct seq_file;
 struct sk_buff;
 
+/**
+ * batadv_bla_is_loopdetect_mac - check if the mac address is from a loop detect
+ *  frame sent by bridge loop avoidance
+ * @mac: mac address to check
+ *
+ * Return: true if the it looks like a loop detect frame
+ * (mac starts with BA:BE), false otherwise
+ */
+static inline bool batadv_bla_is_loopdetect_mac(const uint8_t *mac)
+{
+	if (mac[0] == 0xba && mac[1] == 0xbe)
+		return true;
+
+	return false;
+}
+
 #ifdef CONFIG_BATMAN_ADV_BLA
 bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
 		   unsigned short vid, bool is_bcast);
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 77925504379d..e32ad47c6efd 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
@@ -19,7 +19,7 @@
 #include "main.h"
 
 #include <linux/debugfs.h>
-#include <linux/device.h>
+#include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/fs.h>
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index e49121ee55f6..9c5d4a65b98c 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 49576c5a3fe3..1bfd1dbc2feb 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Antonio Quartulli 3 * Antonio Quartulli
4 * 4 *
@@ -1050,7 +1050,6 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
 						 bat_priv->soft_iface);
 	bat_priv->stats.rx_packets++;
 	bat_priv->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size;
-	bat_priv->soft_iface->last_rx = jiffies;
 
 	netif_rx(skb_new);
 	batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 813ecea96cf9..ec364a3c1c66 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Antonio Quartulli 3 * Antonio Quartulli
4 * 4 *
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 0854ebd8613e..8f964beaac28 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Martin Hundebøll <martin@hundeboll.net> 3 * Martin Hundebøll <martin@hundeboll.net>
4 * 4 *
@@ -239,8 +239,10 @@ err_unlock:
 	spin_unlock_bh(&chain->lock);
 
 err:
-	if (!ret)
+	if (!ret) {
 		kfree(frag_entry_new);
+		kfree_skb(skb);
+	}
 
 	return ret;
 }
@@ -313,7 +315,7 @@ free:
  *
  * There are three possible outcomes: 1) Packet is merged: Return true and
  * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb
- * to NULL; 3) Error: Return false and leave skb as is.
+ * to NULL; 3) Error: Return false and free skb.
  *
  * Return: true when packet is merged or buffered, false when skb is not not
  * used.
@@ -338,9 +340,9 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb,
 		goto out_err;
 
 out:
-	*skb = skb_out;
 	ret = true;
 out_err:
+	*skb = skb_out;
 	return ret;
 }
 
@@ -402,7 +404,7 @@ out:
  * batadv_frag_create - create a fragment from skb
  * @skb: skb to create fragment from
  * @frag_head: header to use in new fragment
- * @mtu: size of new fragment
+ * @fragment_size: size of new fragment
  *
  * Split the passed skb into two fragments: A new one with size matching the
  * passed mtu and the old one with the rest. The new skb contains data from the
@@ -412,11 +414,11 @@ out:
  */
 static struct sk_buff *batadv_frag_create(struct sk_buff *skb,
 					  struct batadv_frag_packet *frag_head,
-					  unsigned int mtu)
+					  unsigned int fragment_size)
 {
 	struct sk_buff *skb_fragment;
 	unsigned int header_size = sizeof(*frag_head);
-	unsigned int fragment_size = mtu - header_size;
+	unsigned int mtu = fragment_size + header_size;
 
 	skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN);
 	if (!skb_fragment)
@@ -454,7 +456,7 @@ int batadv_frag_send_packet(struct sk_buff *skb,
 	struct sk_buff *skb_fragment;
 	unsigned int mtu = neigh_node->if_incoming->net_dev->mtu;
 	unsigned int header_size = sizeof(frag_header);
-	unsigned int max_fragment_size, max_packet_size;
+	unsigned int max_fragment_size, num_fragments;
 	int ret;
 
 	/* To avoid merge and refragmentation at next-hops we never send
@@ -462,10 +464,15 @@ int batadv_frag_send_packet(struct sk_buff *skb,
 	 */
 	mtu = min_t(unsigned int, mtu, BATADV_FRAG_MAX_FRAG_SIZE);
 	max_fragment_size = mtu - header_size;
-	max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS;
+
+	if (skb->len == 0 || max_fragment_size == 0)
+		return -EINVAL;
+
+	num_fragments = (skb->len - 1) / max_fragment_size + 1;
+	max_fragment_size = (skb->len - 1) / num_fragments + 1;
 
 	/* Don't even try to fragment, if we need more than 16 fragments */
-	if (skb->len > max_packet_size) {
+	if (num_fragments > BATADV_FRAG_MAX_FRAGMENTS) {
 		ret = -EAGAIN;
 		goto free_skb;
 	}
@@ -499,7 +506,14 @@ int batadv_frag_send_packet(struct sk_buff *skb,
 
 	/* Eat and send fragments from the tail of skb */
 	while (skb->len > max_fragment_size) {
-		skb_fragment = batadv_frag_create(skb, &frag_header, mtu);
+		/* The initial check in this function should cover this case */
+		if (unlikely(frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1)) {
+			ret = -EINVAL;
+			goto put_primary_if;
+		}
+
+		skb_fragment = batadv_frag_create(skb, &frag_header,
+						  max_fragment_size);
 		if (!skb_fragment) {
 			ret = -ENOMEM;
 			goto put_primary_if;
@@ -515,12 +529,6 @@ int batadv_frag_send_packet(struct sk_buff *skb,
 		}
 
 		frag_header.no++;
-
-		/* The initial check in this function should cover this case */
-		if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) {
-			ret = -EINVAL;
-			goto put_primary_if;
-		}
 	}
 
 	/* Make room for the fragment header. */
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index b95f619606af..1a2d6c308745 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Martin Hundebøll <martin@hundeboll.net> 3 * Martin Hundebøll <martin@hundeboll.net>
4 * 4 *
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 52b8bd6ec431..de9955d5224d 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 859166d03561..3baa3d466e5e 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 21184810d89f..33940c5c74a8 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
@@ -253,6 +253,11 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
  */
 void batadv_gw_init(struct batadv_priv *bat_priv)
 {
+	if (bat_priv->algo_ops->gw.init_sel_class)
+		bat_priv->algo_ops->gw.init_sel_class(bat_priv);
+	else
+		atomic_set(&bat_priv->gw.sel_class, 1);
+
 	batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1,
 				     NULL, BATADV_TVLV_GW, 1,
 				     BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 8a5e1ddf1175..0a6a97d201f2 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 61a431a9772b..e348f76ea8c1 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index d6309a423629..9f9890ff7a22 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index a0a0fdb85805..b5f7e13918ac 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich, Marek Lindner 3 * Simon Wunderlich, Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 557a7044cfbc..0c905e91c5e2 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich, Marek Lindner 3 * Simon Wunderlich, Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index b310f381ae02..6308c9f0fd96 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index e44a7da51431..f3fec40aae86 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index c73c31769aba..4ef4bde2cc2d 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 3284a7b0325d..7a2b9f4da078 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index d46415edd3be..5000c540614d 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index a6cc8040a21d..57a8103dbce7 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -24,7 +24,7 @@
24#define BATADV_DRIVER_DEVICE "batman-adv" 24#define BATADV_DRIVER_DEVICE "batman-adv"
25 25
26#ifndef BATADV_SOURCE_VERSION 26#ifndef BATADV_SOURCE_VERSION
27#define BATADV_SOURCE_VERSION "2016.5" 27#define BATADV_SOURCE_VERSION "2017.0"
28#endif 28#endif
29 29
30/* B.A.T.M.A.N. parameters */ 30/* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 090a69fc342e..952ba81a565b 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Linus Lüssing 3 * Linus Lüssing
4 * 4 *
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 2cddaf52a21d..2a78cddab0e9 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Linus Lüssing 3 * Linus Lüssing
4 * 4 *
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 062738163bdc..ab13b4d58733 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Matthias Schiffer 3 * Matthias Schiffer
4 * 4 *
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index 52eb16281aba..f1cd8c5da966 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Matthias Schiffer 3 * Matthias Schiffer
4 * 4 *
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index ab5a3bf0765f..e1f6fc72fe3e 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Martin Hundebøll, Jeppe Ledet-Pedersen 3 * Martin Hundebøll, Jeppe Ledet-Pedersen
4 * 4 *
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index d6d7fb4ec5d5..c66efb81d2f4 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Martin Hundebøll, Jeppe Ledet-Pedersen 3 * Martin Hundebøll, Jeppe Ledet-Pedersen
4 * 4 *
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 8f3b2969cc4e..8e2a4b205257 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index ebc56183f358..d94220a6d21a 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 7a36bcfa0ba0..8e8a5db197cb 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 6713bdf414cd..7fd740b6e36d 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -719,20 +719,19 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
719 719
720 len = skb->len; 720 len = skb->len;
721 res = batadv_send_skb_to_orig(skb, orig_node, recv_if); 721 res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
722 if (res == NET_XMIT_SUCCESS)
723 ret = NET_RX_SUCCESS;
724
725 /* skb was consumed */
726 skb = NULL;
727 722
728 /* translate transmit result into receive result */ 723 /* translate transmit result into receive result */
729 if (res == NET_XMIT_SUCCESS) { 724 if (res == NET_XMIT_SUCCESS) {
725 ret = NET_RX_SUCCESS;
730 /* skb was transmitted and consumed */ 726 /* skb was transmitted and consumed */
731 batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); 727 batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
732 batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, 728 batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
733 len + ETH_HLEN); 729 len + ETH_HLEN);
734 } 730 }
735 731
732 /* skb was consumed */
733 skb = NULL;
734
736put_orig_node: 735put_orig_node:
737 batadv_orig_node_put(orig_node); 736 batadv_orig_node_put(orig_node);
738free_skb: 737free_skb:
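With this rearrangement batadv_send_skb_to_orig() is treated as consuming the skb in every case (hence the unconditional skb = NULL), and only a successful transmit bumps the forward counters and is reported upwards as a successful receive. The translation itself reduces to a single mapping (sketch, assuming the function's default return value is NET_RX_DROP):

    /* translate the transmit result into a receive-path result */
    ret = (res == NET_XMIT_SUCCESS) ? NET_RX_SUCCESS : NET_RX_DROP;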
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index 05c3ff42e181..5ede16c32f15 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 49021b7124f3..1489ec27daff 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -77,6 +77,7 @@ int batadv_send_skb_packet(struct sk_buff *skb,
77{ 77{
78 struct batadv_priv *bat_priv; 78 struct batadv_priv *bat_priv;
79 struct ethhdr *ethhdr; 79 struct ethhdr *ethhdr;
80 int ret;
80 81
81 bat_priv = netdev_priv(hard_iface->soft_iface); 82 bat_priv = netdev_priv(hard_iface->soft_iface);
82 83
@@ -115,7 +116,8 @@ int batadv_send_skb_packet(struct sk_buff *skb,
115 * congestion and traffic shaping, it drops and returns NET_XMIT_DROP 116 * congestion and traffic shaping, it drops and returns NET_XMIT_DROP
116 * (which is > 0). This will not be treated as an error. 117 * (which is > 0). This will not be treated as an error.
117 */ 118 */
118 return dev_queue_xmit(skb); 119 ret = dev_queue_xmit(skb);
120 return net_xmit_eval(ret);
119send_skb_err: 121send_skb_err:
120 kfree_skb(skb); 122 kfree_skb(skb);
121 return NET_XMIT_DROP; 123 return NET_XMIT_DROP;
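net_xmit_eval() folds the congestion-notification code into success, so batadv_send_skb_packet() now reports only NET_XMIT_SUCCESS or a genuine drop to its callers. The helper is roughly (from include/linux/netdevice.h):

    #define net_xmit_eval(e)        ((e) == NET_XMIT_CN ? 0 : (e))

A caller can therefore treat any non-zero return as a dropped frame without special-casing NET_XMIT_CN.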
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index a94e1e8639ca..f21166d10323 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 7b3494ae6ad9..d042c99af028 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -258,7 +258,8 @@ static int batadv_interface_tx(struct sk_buff *skb,
258 ethhdr = eth_hdr(skb); 258 ethhdr = eth_hdr(skb);
259 259
260 /* Register the client MAC in the transtable */ 260 /* Register the client MAC in the transtable */
261 if (!is_multicast_ether_addr(ethhdr->h_source)) { 261 if (!is_multicast_ether_addr(ethhdr->h_source) &&
262 !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) {
262 client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source, 263 client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source,
263 vid, skb->skb_iif, 264 vid, skb->skb_iif,
264 skb->mark); 265 skb->mark);
@@ -481,8 +482,6 @@ void batadv_interface_rx(struct net_device *soft_iface,
481 batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, 482 batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
482 skb->len + ETH_HLEN); 483 skb->len + ETH_HLEN);
483 484
484 soft_iface->last_rx = jiffies;
485
486 /* Let the bridge loop avoidance check the packet. If will 485 /* Let the bridge loop avoidance check the packet. If will
487 * not handle it, we can safely push it up. 486 * not handle it, we can safely push it up.
488 */ 487 */
@@ -820,7 +819,6 @@ static int batadv_softif_init_late(struct net_device *dev)
820 atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); 819 atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
821#endif 820#endif
822 atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); 821 atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF);
823 atomic_set(&bat_priv->gw.sel_class, 20);
824 atomic_set(&bat_priv->gw.bandwidth_down, 100); 822 atomic_set(&bat_priv->gw.bandwidth_down, 100);
825 atomic_set(&bat_priv->gw.bandwidth_up, 20); 823 atomic_set(&bat_priv->gw.bandwidth_up, 20);
826 atomic_set(&bat_priv->orig_interval, 1000); 824 atomic_set(&bat_priv->orig_interval, 1000);
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index ec303ddbf647..639c3abb214a 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 17c844196eb2..0ae8b30e4eaa 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index c76021b4e198..e487412e256b 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 981e8c5b07e9..c94ebdecdc3d 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Edo Monticelli, Antonio Quartulli 3 * Edo Monticelli, Antonio Quartulli
4 * 4 *
@@ -23,7 +23,7 @@
23#include <linux/byteorder/generic.h> 23#include <linux/byteorder/generic.h>
24#include <linux/cache.h> 24#include <linux/cache.h>
25#include <linux/compiler.h> 25#include <linux/compiler.h>
26#include <linux/device.h> 26#include <linux/err.h>
27#include <linux/etherdevice.h> 27#include <linux/etherdevice.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/if_ether.h> 29#include <linux/if_ether.h>
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index ba922c425e56..a8ada5c123bd 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Edo Monticelli, Antonio Quartulli 3 * Edo Monticelli, Antonio Quartulli
4 * 4 *
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 30ecbfb40adf..6077a87d46f0 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich, Antonio Quartulli 3 * Marek Lindner, Simon Wunderlich, Antonio Quartulli
4 * 4 *
@@ -3714,7 +3714,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
3714{ 3714{
3715 struct batadv_hashtable *hash = bat_priv->tt.local_hash; 3715 struct batadv_hashtable *hash = bat_priv->tt.local_hash;
3716 struct batadv_tt_common_entry *tt_common_entry; 3716 struct batadv_tt_common_entry *tt_common_entry;
3717 u16 changed_num = 0;
3718 struct hlist_head *head; 3717 struct hlist_head *head;
3719 u32 i; 3718 u32 i;
3720 3719
@@ -3736,7 +3735,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
3736 continue; 3735 continue;
3737 tt_common_entry->flags &= ~flags; 3736 tt_common_entry->flags &= ~flags;
3738 } 3737 }
3739 changed_num++;
3740 3738
3741 if (!count) 3739 if (!count)
3742 continue; 3740 continue;
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 783fdba84db2..411d586191da 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich, Antonio Quartulli 3 * Marek Lindner, Simon Wunderlich, Antonio Quartulli
4 * 4 *
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index a783420356ae..1d9e267caec9 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index e4369b547b43..4d01400ada30 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index e913aee28c98..246f21b4973b 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -402,7 +402,7 @@ struct batadv_gw_node {
402 struct rcu_head rcu; 402 struct rcu_head rcu;
403}; 403};
404 404
405DECLARE_EWMA(throughput, 1024, 8) 405DECLARE_EWMA(throughput, 10, 8)
406 406
407/** 407/**
408 * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor 408 * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor
@@ -1489,6 +1489,7 @@ struct batadv_algo_orig_ops {
1489 1489
1490/** 1490/**
1491 * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific) 1491 * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific)
1492 * @init_sel_class: initialize GW selection class (optional)
1492 * @store_sel_class: parse and stores a new GW selection class (optional) 1493 * @store_sel_class: parse and stores a new GW selection class (optional)
1493 * @show_sel_class: prints the current GW selection class (optional) 1494 * @show_sel_class: prints the current GW selection class (optional)
1494 * @get_best_gw_node: select the best GW from the list of available nodes 1495 * @get_best_gw_node: select the best GW from the list of available nodes
@@ -1499,6 +1500,7 @@ struct batadv_algo_orig_ops {
1499 * @dump: dump gateways to a netlink socket (optional) 1500 * @dump: dump gateways to a netlink socket (optional)
1500 */ 1501 */
1501struct batadv_algo_gw_ops { 1502struct batadv_algo_gw_ops {
1503 void (*init_sel_class)(struct batadv_priv *bat_priv);
1502 ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff, 1504 ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff,
1503 size_t count); 1505 size_t count);
1504 ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff); 1506 ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff);
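The DECLARE_EWMA() change tracks the reworked <linux/average.h> API, whose second argument is now the precision in bits rather than a scaling factor, so the old factor 1024 becomes 10 (2^10 == 1024) while the weight reciprocal stays 8. The generated helpers are unchanged; a usage sketch:

    struct ewma_throughput ewma;
    unsigned long sample = 1000;        /* a raw throughput sample (assumed units) */
    unsigned long avg;

    ewma_throughput_init(&ewma);
    ewma_throughput_add(&ewma, sample); /* feed one sample into the average */
    avg = ewma_throughput_read(&ewma);  /* read the smoothed value */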
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 1904a93f47d5..d491529332f4 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -920,7 +920,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
920 BT_DBG("dev %p removing %speer %p", dev, 920 BT_DBG("dev %p removing %speer %p", dev,
921 last ? "last " : "1 ", peer); 921 last ? "last " : "1 ", peer);
922 BT_DBG("chan %p orig refcnt %d", chan, 922 BT_DBG("chan %p orig refcnt %d", chan,
923 atomic_read(&chan->kref.refcount)); 923 kref_read(&chan->kref));
924 924
925 l2cap_chan_put(chan); 925 l2cap_chan_put(chan);
926 break; 926 break;
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 5f123c3320a7..f0095fd79818 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -810,7 +810,7 @@ static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked)
810/* AMP Manager functions */ 810/* AMP Manager functions */
811struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr) 811struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr)
812{ 812{
813 BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount)); 813 BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
814 814
815 kref_get(&mgr->kref); 815 kref_get(&mgr->kref);
816 816
@@ -833,7 +833,7 @@ static void amp_mgr_destroy(struct kref *kref)
833 833
834int amp_mgr_put(struct amp_mgr *mgr) 834int amp_mgr_put(struct amp_mgr *mgr)
835{ 835{
836 BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount)); 836 BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
837 837
838 return kref_put(&mgr->kref, &amp_mgr_destroy); 838 return kref_put(&mgr->kref, &amp_mgr_destroy);
839} 839}
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 1aff2da9bc74..69e1f7d362a8 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -27,6 +27,8 @@
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/debugfs.h> 28#include <linux/debugfs.h>
29#include <linux/stringify.h> 29#include <linux/stringify.h>
30#include <linux/sched/signal.h>
31
30#include <asm/ioctls.h> 32#include <asm/ioctls.h>
31 33
32#include <net/bluetooth/bluetooth.h> 34#include <net/bluetooth/bluetooth.h>
@@ -245,7 +247,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
245 if (err == 0) { 247 if (err == 0) {
246 sock_recv_ts_and_drops(msg, sk, skb); 248 sock_recv_ts_and_drops(msg, sk, skb);
247 249
248 if (bt_sk(sk)->skb_msg_name) 250 if (msg->msg_name && bt_sk(sk)->skb_msg_name)
249 bt_sk(sk)->skb_msg_name(skb, msg->msg_name, 251 bt_sk(sk)->skb_msg_name(skb, msg->msg_name,
250 &msg->msg_namelen); 252 &msg->msg_namelen);
251 } 253 }
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
index e32f34189007..02a4ccc04e1e 100644
--- a/net/bluetooth/amp.c
+++ b/net/bluetooth/amp.c
@@ -24,7 +24,7 @@
24void amp_ctrl_get(struct amp_ctrl *ctrl) 24void amp_ctrl_get(struct amp_ctrl *ctrl)
25{ 25{
26 BT_DBG("ctrl %p orig refcnt %d", ctrl, 26 BT_DBG("ctrl %p orig refcnt %d", ctrl,
27 atomic_read(&ctrl->kref.refcount)); 27 kref_read(&ctrl->kref));
28 28
29 kref_get(&ctrl->kref); 29 kref_get(&ctrl->kref);
30} 30}
@@ -42,7 +42,7 @@ static void amp_ctrl_destroy(struct kref *kref)
42int amp_ctrl_put(struct amp_ctrl *ctrl) 42int amp_ctrl_put(struct amp_ctrl *ctrl)
43{ 43{
44 BT_DBG("ctrl %p orig refcnt %d", ctrl, 44 BT_DBG("ctrl %p orig refcnt %d", ctrl,
45 atomic_read(&ctrl->kref.refcount)); 45 kref_read(&ctrl->kref));
46 46
47 return kref_put(&ctrl->kref, &amp_ctrl_destroy); 47 return kref_put(&ctrl->kref, &amp_ctrl_destroy);
48} 48}
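These Bluetooth debug prints stop peeking at kref.refcount directly and use the kref_read() accessor instead, which keeps working now that struct kref no longer wraps a bare atomic_t. The accessor is roughly:

    static inline unsigned int kref_read(const struct kref *kref)
    {
            return refcount_read(&kref->refcount);  /* current reference count */
    }

so the printed value stays a plain unsigned int regardless of how the refcount is represented internally.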
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
index 46ac686c8911..bb308224099c 100644
--- a/net/bluetooth/cmtp/capi.c
+++ b/net/bluetooth/cmtp/capi.c
@@ -26,7 +26,7 @@
26#include <linux/types.h> 26#include <linux/types.h>
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/kernel.h> 28#include <linux/kernel.h>
29#include <linux/sched.h> 29#include <linux/sched/signal.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/poll.h> 31#include <linux/poll.h>
32#include <linux/fcntl.h> 32#include <linux/fcntl.h>
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index e17aacbc5630..0b4dba08a14e 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4749,7 +4749,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
4749 case LE_ADV_SCAN_RSP: 4749 case LE_ADV_SCAN_RSP:
4750 break; 4750 break;
4751 default: 4751 default:
4752 BT_ERR_RATELIMITED("Unknown advetising packet type: 0x%02x", 4752 BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
4753 type); 4753 type);
4754 return; 4754 return;
4755 } 4755 }
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 1015d9c8d97d..b5faff458d8b 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -21,6 +21,8 @@
21 SOFTWARE IS DISCLAIMED. 21 SOFTWARE IS DISCLAIMED.
22*/ 22*/
23 23
24#include <linux/sched/signal.h>
25
24#include <net/bluetooth/bluetooth.h> 26#include <net/bluetooth/bluetooth.h>
25#include <net/bluetooth/hci_core.h> 27#include <net/bluetooth/hci_core.h>
26#include <net/bluetooth/mgmt.h> 28#include <net/bluetooth/mgmt.h>
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 48f9471e7c85..f64d6566021f 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -851,7 +851,7 @@ static int hci_sock_release(struct socket *sock)
851 851
852 if (hdev) { 852 if (hdev) {
853 if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { 853 if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
854 /* When releasing an user channel exclusive access, 854 /* When releasing a user channel exclusive access,
855 * call hci_dev_do_close directly instead of calling 855 * call hci_dev_do_close directly instead of calling
856 * hci_dev_close to ensure the exclusive access will 856 * hci_dev_close to ensure the exclusive access will
857 * be released and the controller brought back down. 857 * be released and the controller brought back down.
@@ -1172,7 +1172,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
1172 /* In case the transport is already up and 1172 /* In case the transport is already up and
1173 * running, clear the error here. 1173 * running, clear the error here.
1174 * 1174 *
1175 * This can happen when opening an user 1175 * This can happen when opening a user
1176 * channel and HCI_AUTO_OFF grace period 1176 * channel and HCI_AUTO_OFF grace period
1177 * is still active. 1177 * is still active.
1178 */ 1178 */
@@ -1190,7 +1190,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
1190 if (!hci_sock_gen_cookie(sk)) { 1190 if (!hci_sock_gen_cookie(sk)) {
1191 /* In the case when a cookie has already been assigned, 1191 /* In the case when a cookie has already been assigned,
1192 * this socket will transition from a raw socket into 1192 * this socket will transition from a raw socket into
1193 * an user channel socket. For a clean transition, send 1193 * a user channel socket. For a clean transition, send
1194 * the close notification first. 1194 * the close notification first.
1195 */ 1195 */
1196 skb = create_monitor_ctrl_close(sk); 1196 skb = create_monitor_ctrl_close(sk);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index ce0b5dd01953..fc7f321a3823 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -481,14 +481,14 @@ static void l2cap_chan_destroy(struct kref *kref)
481 481
482void l2cap_chan_hold(struct l2cap_chan *c) 482void l2cap_chan_hold(struct l2cap_chan *c)
483{ 483{
484 BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount)); 484 BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
485 485
486 kref_get(&c->kref); 486 kref_get(&c->kref);
487} 487}
488 488
489void l2cap_chan_put(struct l2cap_chan *c) 489void l2cap_chan_put(struct l2cap_chan *c)
490{ 490{
491 BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount)); 491 BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
492 492
493 kref_put(&c->kref, l2cap_chan_destroy); 493 kref_put(&c->kref, l2cap_chan_destroy);
494} 494}
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index a8ba752732c9..507b80d59dec 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -29,6 +29,7 @@
29 29
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/export.h> 31#include <linux/export.h>
32#include <linux/sched/signal.h>
32 33
33#include <net/bluetooth/bluetooth.h> 34#include <net/bluetooth/bluetooth.h>
34#include <net/bluetooth/hci_core.h> 35#include <net/bluetooth/hci_core.h>
@@ -300,7 +301,7 @@ done:
300} 301}
301 302
302static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, 303static int l2cap_sock_accept(struct socket *sock, struct socket *newsock,
303 int flags) 304 int flags, bool kern)
304{ 305{
305 DEFINE_WAIT_FUNC(wait, woken_wake_function); 306 DEFINE_WAIT_FUNC(wait, woken_wake_function);
306 struct sock *sk = sock->sk, *nsk; 307 struct sock *sk = sock->sk, *nsk;
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 7511df72347f..ac3c650cb234 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -27,6 +27,7 @@
27 27
28#include <linux/export.h> 28#include <linux/export.h>
29#include <linux/debugfs.h> 29#include <linux/debugfs.h>
30#include <linux/sched/signal.h>
30 31
31#include <net/bluetooth/bluetooth.h> 32#include <net/bluetooth/bluetooth.h>
32#include <net/bluetooth/hci_core.h> 33#include <net/bluetooth/hci_core.h>
@@ -470,7 +471,8 @@ done:
470 return err; 471 return err;
471} 472}
472 473
473static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags) 474static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags,
475 bool kern)
474{ 476{
475 DEFINE_WAIT_FUNC(wait, woken_wake_function); 477 DEFINE_WAIT_FUNC(wait, woken_wake_function);
476 struct sock *sk = sock->sk, *nsk; 478 struct sock *sk = sock->sk, *nsk;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 3125ce670c2f..728e0c8dc8e7 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -27,6 +27,7 @@
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/debugfs.h> 28#include <linux/debugfs.h>
29#include <linux/seq_file.h> 29#include <linux/seq_file.h>
30#include <linux/sched/signal.h>
30 31
31#include <net/bluetooth/bluetooth.h> 32#include <net/bluetooth/bluetooth.h>
32#include <net/bluetooth/hci_core.h> 33#include <net/bluetooth/hci_core.h>
@@ -626,7 +627,7 @@ done:
626} 627}
627 628
628static int sco_sock_accept(struct socket *sock, struct socket *newsock, 629static int sco_sock_accept(struct socket *sock, struct socket *newsock,
629 int flags) 630 int flags, bool kern)
630{ 631{
631 DEFINE_WAIT_FUNC(wait, woken_wake_function); 632 DEFINE_WAIT_FUNC(wait, woken_wake_function);
632 struct sock *sk = sock->sk, *ch; 633 struct sock *sk = sock->sk, *ch;
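The accept callbacks in L2CAP, RFCOMM and SCO gain the new bool kern parameter of the socket layer's ->accept() hook, which marks accepts performed on behalf of the kernel (for example for sockets created with sock_create_kern()) rather than a user-space system call. A protocol implementation now has this shape (sketch, hypothetical protocol):

    static int my_proto_accept(struct socket *sock, struct socket *newsock,
                               int flags, bool kern)
    {
            if (kern) {
                    /* kernel-internal accept; protocols may use this to
                     * adjust locking or accounting (assumption)
                     */
            }

            /* ... wait for and graft the new connection as before ... */
            return 0;
    }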
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index 0aefc011b668..40b1ede527ca 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_BRIDGE) += bridge.o
6 6
7bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ 7bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
8 br_ioctl.o br_stp.o br_stp_bpdu.o \ 8 br_ioctl.o br_stp.o br_stp_bpdu.o \
9 br_stp_if.o br_stp_timer.o br_netlink.o 9 br_stp_if.o br_stp_timer.o br_netlink.o \
10 br_netlink_tunnel.o
10 11
11bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o 12bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
12 13
@@ -18,7 +19,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
18 19
19bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o 20bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
20 21
21bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o 22bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o
22 23
23bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o 24bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
24 25
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index ed3b3192fb00..90f49a194249 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -79,7 +79,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
79 br_multicast_flood(mdst, skb, false, true); 79 br_multicast_flood(mdst, skb, false, true);
80 else 80 else
81 br_flood(br, skb, BR_PKT_MULTICAST, false, true); 81 br_flood(br, skb, BR_PKT_MULTICAST, false, true);
82 } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) { 82 } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
83 br_forward(dst->dst, skb, false, true); 83 br_forward(dst->dst, skb, false, true);
84 } else { 84 } else {
85 br_flood(br, skb, BR_PKT_UNICAST, false, true); 85 br_flood(br, skb, BR_PKT_UNICAST, false, true);
@@ -119,6 +119,15 @@ static int br_dev_init(struct net_device *dev)
119 return err; 119 return err;
120} 120}
121 121
122static void br_dev_uninit(struct net_device *dev)
123{
124 struct net_bridge *br = netdev_priv(dev);
125
126 br_multicast_uninit_stats(br);
127 br_vlan_flush(br);
128 free_percpu(br->stats);
129}
130
122static int br_dev_open(struct net_device *dev) 131static int br_dev_open(struct net_device *dev)
123{ 132{
124 struct net_bridge *br = netdev_priv(dev); 133 struct net_bridge *br = netdev_priv(dev);
@@ -153,8 +162,8 @@ static int br_dev_stop(struct net_device *dev)
153 return 0; 162 return 0;
154} 163}
155 164
156static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev, 165static void br_get_stats64(struct net_device *dev,
157 struct rtnl_link_stats64 *stats) 166 struct rtnl_link_stats64 *stats)
158{ 167{
159 struct net_bridge *br = netdev_priv(dev); 168 struct net_bridge *br = netdev_priv(dev);
160 struct pcpu_sw_netstats tmp, sum = { 0 }; 169 struct pcpu_sw_netstats tmp, sum = { 0 };
@@ -178,8 +187,6 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
178 stats->tx_packets = sum.tx_packets; 187 stats->tx_packets = sum.tx_packets;
179 stats->rx_bytes = sum.rx_bytes; 188 stats->rx_bytes = sum.rx_bytes;
180 stats->rx_packets = sum.rx_packets; 189 stats->rx_packets = sum.rx_packets;
181
182 return stats;
183} 190}
184 191
185static int br_change_mtu(struct net_device *dev, int new_mtu) 192static int br_change_mtu(struct net_device *dev, int new_mtu)
@@ -334,6 +341,7 @@ static const struct net_device_ops br_netdev_ops = {
334 .ndo_open = br_dev_open, 341 .ndo_open = br_dev_open,
335 .ndo_stop = br_dev_stop, 342 .ndo_stop = br_dev_stop,
336 .ndo_init = br_dev_init, 343 .ndo_init = br_dev_init,
344 .ndo_uninit = br_dev_uninit,
337 .ndo_start_xmit = br_dev_xmit, 345 .ndo_start_xmit = br_dev_xmit,
338 .ndo_get_stats64 = br_get_stats64, 346 .ndo_get_stats64 = br_get_stats64,
339 .ndo_set_mac_address = br_set_mac_address, 347 .ndo_set_mac_address = br_set_mac_address,
@@ -349,8 +357,6 @@ static const struct net_device_ops br_netdev_ops = {
349 .ndo_add_slave = br_add_slave, 357 .ndo_add_slave = br_add_slave,
350 .ndo_del_slave = br_del_slave, 358 .ndo_del_slave = br_del_slave,
351 .ndo_fix_features = br_fix_features, 359 .ndo_fix_features = br_fix_features,
352 .ndo_neigh_construct = netdev_default_l2upper_neigh_construct,
353 .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy,
354 .ndo_fdb_add = br_fdb_add, 360 .ndo_fdb_add = br_fdb_add,
355 .ndo_fdb_del = br_fdb_delete, 361 .ndo_fdb_del = br_fdb_delete,
356 .ndo_fdb_dump = br_fdb_dump, 362 .ndo_fdb_dump = br_fdb_dump,
@@ -360,14 +366,6 @@ static const struct net_device_ops br_netdev_ops = {
360 .ndo_features_check = passthru_features_check, 366 .ndo_features_check = passthru_features_check,
361}; 367};
362 368
363static void br_dev_free(struct net_device *dev)
364{
365 struct net_bridge *br = netdev_priv(dev);
366
367 free_percpu(br->stats);
368 free_netdev(dev);
369}
370
371static struct device_type br_type = { 369static struct device_type br_type = {
372 .name = "bridge", 370 .name = "bridge",
373}; 371};
@@ -380,7 +378,7 @@ void br_dev_setup(struct net_device *dev)
380 ether_setup(dev); 378 ether_setup(dev);
381 379
382 dev->netdev_ops = &br_netdev_ops; 380 dev->netdev_ops = &br_netdev_ops;
383 dev->destructor = br_dev_free; 381 dev->destructor = free_netdev;
384 dev->ethtool_ops = &br_ethtool_ops; 382 dev->ethtool_ops = &br_ethtool_ops;
385 SET_NETDEV_DEVTYPE(dev, &br_type); 383 SET_NETDEV_DEVTYPE(dev, &br_type);
386 dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE; 384 dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE;
@@ -415,4 +413,5 @@ void br_dev_setup(struct net_device *dev)
415 br_netfilter_rtable_init(br); 413 br_netfilter_rtable_init(br);
416 br_stp_timer_init(br); 414 br_stp_timer_init(br);
417 br_multicast_init(br); 415 br_multicast_init(br);
416 INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup);
418} 417}
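Two netdev API updates are folded in here: ndo_get_stats64 no longer returns the stats pointer, and the bridge moves its teardown (per-cpu stats, VLANs, multicast stats) from a custom destructor into .ndo_uninit so the destructor can be plain free_netdev. A sketch of the resulting init/uninit pairing for a hypothetical driver:

    static int my_dev_init(struct net_device *dev)
    {
            struct my_priv *priv = netdev_priv(dev);   /* hypothetical private data */

            priv->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
            return priv->stats ? 0 : -ENOMEM;
    }

    static void my_dev_uninit(struct net_device *dev)
    {
            struct my_priv *priv = netdev_priv(dev);

            free_percpu(priv->stats);   /* called from unregister_netdevice() */
    }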
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e4a4176171c9..6e08b7199dd7 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -28,9 +28,6 @@
28#include "br_private.h" 28#include "br_private.h"
29 29
30static struct kmem_cache *br_fdb_cache __read_mostly; 30static struct kmem_cache *br_fdb_cache __read_mostly;
31static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
32 const unsigned char *addr,
33 __u16 vid);
34static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, 31static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
35 const unsigned char *addr, u16 vid); 32 const unsigned char *addr, u16 vid);
36static void fdb_notify(struct net_bridge *br, 33static void fdb_notify(struct net_bridge *br,
@@ -68,7 +65,7 @@ static inline unsigned long hold_time(const struct net_bridge *br)
68static inline int has_expired(const struct net_bridge *br, 65static inline int has_expired(const struct net_bridge *br,
69 const struct net_bridge_fdb_entry *fdb) 66 const struct net_bridge_fdb_entry *fdb)
70{ 67{
71 return !fdb->is_static && 68 return !fdb->is_static && !fdb->added_by_external_learn &&
72 time_before_eq(fdb->updated + hold_time(br), jiffies); 69 time_before_eq(fdb->updated + hold_time(br), jiffies);
73} 70}
74 71
@@ -86,6 +83,47 @@ static void fdb_rcu_free(struct rcu_head *head)
86 kmem_cache_free(br_fdb_cache, ent); 83 kmem_cache_free(br_fdb_cache, ent);
87} 84}
88 85
86static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
87 const unsigned char *addr,
88 __u16 vid)
89{
90 struct net_bridge_fdb_entry *f;
91
92 WARN_ON_ONCE(!rcu_read_lock_held());
93
94 hlist_for_each_entry_rcu(f, head, hlist)
95 if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid)
96 break;
97
98 return f;
99}
100
101/* requires bridge hash_lock */
102static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
103 const unsigned char *addr,
104 __u16 vid)
105{
106 struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
107 struct net_bridge_fdb_entry *fdb;
108
109 lockdep_assert_held_once(&br->hash_lock);
110
111 rcu_read_lock();
112 fdb = fdb_find_rcu(head, addr, vid);
113 rcu_read_unlock();
114
115 return fdb;
116}
117
118struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
119 const unsigned char *addr,
120 __u16 vid)
121{
122 struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
123
124 return fdb_find_rcu(head, addr, vid);
125}
126
89/* When a static FDB entry is added, the mac address from the entry is 127/* When a static FDB entry is added, the mac address from the entry is
90 * added to the bridge private HW address list and all required ports 128 * added to the bridge private HW address list and all required ports
91 * are then updated with the new information. 129 * are then updated with the new information.
@@ -154,7 +192,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
154 if (f->added_by_external_learn) 192 if (f->added_by_external_learn)
155 fdb_del_external_learn(f); 193 fdb_del_external_learn(f);
156 194
157 hlist_del_rcu(&f->hlist); 195 hlist_del_init_rcu(&f->hlist);
158 fdb_notify(br, f, RTM_DELNEIGH); 196 fdb_notify(br, f, RTM_DELNEIGH);
159 call_rcu(&f->rcu, fdb_rcu_free); 197 call_rcu(&f->rcu, fdb_rcu_free);
160} 198}
@@ -198,11 +236,10 @@ void br_fdb_find_delete_local(struct net_bridge *br,
198 const struct net_bridge_port *p, 236 const struct net_bridge_port *p,
199 const unsigned char *addr, u16 vid) 237 const unsigned char *addr, u16 vid)
200{ 238{
201 struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
202 struct net_bridge_fdb_entry *f; 239 struct net_bridge_fdb_entry *f;
203 240
204 spin_lock_bh(&br->hash_lock); 241 spin_lock_bh(&br->hash_lock);
205 f = fdb_find(head, addr, vid); 242 f = br_fdb_find(br, addr, vid);
206 if (f && f->is_local && !f->added_by_user && f->dst == p) 243 if (f && f->is_local && !f->added_by_user && f->dst == p)
207 fdb_delete_local(br, p, f); 244 fdb_delete_local(br, p, f);
208 spin_unlock_bh(&br->hash_lock); 245 spin_unlock_bh(&br->hash_lock);
@@ -266,7 +303,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
266 spin_lock_bh(&br->hash_lock); 303 spin_lock_bh(&br->hash_lock);
267 304
268 /* If old entry was unassociated with any port, then delete it. */ 305 /* If old entry was unassociated with any port, then delete it. */
269 f = __br_fdb_get(br, br->dev->dev_addr, 0); 306 f = br_fdb_find(br, br->dev->dev_addr, 0);
270 if (f && f->is_local && !f->dst && !f->added_by_user) 307 if (f && f->is_local && !f->dst && !f->added_by_user)
271 fdb_delete_local(br, NULL, f); 308 fdb_delete_local(br, NULL, f);
272 309
@@ -281,7 +318,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
281 list_for_each_entry(v, &vg->vlan_list, vlist) { 318 list_for_each_entry(v, &vg->vlan_list, vlist) {
282 if (!br_vlan_should_use(v)) 319 if (!br_vlan_should_use(v))
283 continue; 320 continue;
284 f = __br_fdb_get(br, br->dev->dev_addr, v->vid); 321 f = br_fdb_find(br, br->dev->dev_addr, v->vid);
285 if (f && f->is_local && !f->dst && !f->added_by_user) 322 if (f && f->is_local && !f->dst && !f->added_by_user)
286 fdb_delete_local(br, NULL, f); 323 fdb_delete_local(br, NULL, f);
287 fdb_insert(br, NULL, newaddr, v->vid); 324 fdb_insert(br, NULL, newaddr, v->vid);
@@ -290,34 +327,43 @@ out:
290 spin_unlock_bh(&br->hash_lock); 327 spin_unlock_bh(&br->hash_lock);
291} 328}
292 329
293void br_fdb_cleanup(unsigned long _data) 330void br_fdb_cleanup(struct work_struct *work)
294{ 331{
295 struct net_bridge *br = (struct net_bridge *)_data; 332 struct net_bridge *br = container_of(work, struct net_bridge,
333 gc_work.work);
296 unsigned long delay = hold_time(br); 334 unsigned long delay = hold_time(br);
297 unsigned long next_timer = jiffies + br->ageing_time; 335 unsigned long work_delay = delay;
336 unsigned long now = jiffies;
298 int i; 337 int i;
299 338
300 spin_lock(&br->hash_lock);
301 for (i = 0; i < BR_HASH_SIZE; i++) { 339 for (i = 0; i < BR_HASH_SIZE; i++) {
302 struct net_bridge_fdb_entry *f; 340 struct net_bridge_fdb_entry *f;
303 struct hlist_node *n; 341 struct hlist_node *n;
304 342
343 if (!br->hash[i].first)
344 continue;
345
346 spin_lock_bh(&br->hash_lock);
305 hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) { 347 hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
306 unsigned long this_timer; 348 unsigned long this_timer;
349
307 if (f->is_static) 350 if (f->is_static)
308 continue; 351 continue;
309 if (f->added_by_external_learn) 352 if (f->added_by_external_learn)
310 continue; 353 continue;
311 this_timer = f->updated + delay; 354 this_timer = f->updated + delay;
312 if (time_before_eq(this_timer, jiffies)) 355 if (time_after(this_timer, now))
356 work_delay = min(work_delay, this_timer - now);
357 else
313 fdb_delete(br, f); 358 fdb_delete(br, f);
314 else if (time_before(this_timer, next_timer))
315 next_timer = this_timer;
316 } 359 }
360 spin_unlock_bh(&br->hash_lock);
361 cond_resched();
317 } 362 }
318 spin_unlock(&br->hash_lock);
319 363
320 mod_timer(&br->gc_timer, round_jiffies_up(next_timer)); 364 /* Cleanup minimum 10 milliseconds apart */
365 work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10));
366 mod_delayed_work(system_long_wq, &br->gc_work, work_delay);
321} 367}
322 368
323/* Completely flush all dynamic entries in forwarding database.*/ 369/* Completely flush all dynamic entries in forwarding database.*/
@@ -371,26 +417,6 @@ void br_fdb_delete_by_port(struct net_bridge *br,
371 spin_unlock_bh(&br->hash_lock); 417 spin_unlock_bh(&br->hash_lock);
372} 418}
373 419
374/* No locking or refcounting, assumes caller has rcu_read_lock */
375struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
376 const unsigned char *addr,
377 __u16 vid)
378{
379 struct net_bridge_fdb_entry *fdb;
380
381 hlist_for_each_entry_rcu(fdb,
382 &br->hash[br_mac_hash(addr, vid)], hlist) {
383 if (ether_addr_equal(fdb->addr.addr, addr) &&
384 fdb->vlan_id == vid) {
385 if (unlikely(has_expired(br, fdb)))
386 break;
387 return fdb;
388 }
389 }
390
391 return NULL;
392}
393
394#if IS_ENABLED(CONFIG_ATM_LANE) 420#if IS_ENABLED(CONFIG_ATM_LANE)
395/* Interface used by ATM LANE hook to test 421/* Interface used by ATM LANE hook to test
396 * if an addr is on some other bridge port */ 422 * if an addr is on some other bridge port */
@@ -405,7 +431,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
405 if (!port) 431 if (!port)
406 ret = 0; 432 ret = 0;
407 else { 433 else {
408 fdb = __br_fdb_get(port->br, addr, 0); 434 fdb = br_fdb_find_rcu(port->br, addr, 0);
409 ret = fdb && fdb->dst && fdb->dst->dev != dev && 435 ret = fdb && fdb->dst && fdb->dst->dev != dev &&
410 fdb->dst->state == BR_STATE_FORWARDING; 436 fdb->dst->state == BR_STATE_FORWARDING;
411 } 437 }
@@ -467,34 +493,6 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
467 return num; 493 return num;
468} 494}
469 495
470static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
471 const unsigned char *addr,
472 __u16 vid)
473{
474 struct net_bridge_fdb_entry *fdb;
475
476 hlist_for_each_entry(fdb, head, hlist) {
477 if (ether_addr_equal(fdb->addr.addr, addr) &&
478 fdb->vlan_id == vid)
479 return fdb;
480 }
481 return NULL;
482}
483
484static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
485 const unsigned char *addr,
486 __u16 vid)
487{
488 struct net_bridge_fdb_entry *fdb;
489
490 hlist_for_each_entry_rcu(fdb, head, hlist) {
491 if (ether_addr_equal(fdb->addr.addr, addr) &&
492 fdb->vlan_id == vid)
493 return fdb;
494 }
495 return NULL;
496}
497
498static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, 496static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
499 struct net_bridge_port *source, 497 struct net_bridge_port *source,
500 const unsigned char *addr, 498 const unsigned char *addr,
@@ -528,7 +526,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
528 if (!is_valid_ether_addr(addr)) 526 if (!is_valid_ether_addr(addr))
529 return -EINVAL; 527 return -EINVAL;
530 528
531 fdb = fdb_find(head, addr, vid); 529 fdb = br_fdb_find(br, addr, vid);
532 if (fdb) { 530 if (fdb) {
533 /* it is okay to have multiple ports with same 531 /* it is okay to have multiple ports with same
534 * address, just use the first one. 532 * address, just use the first one.
@@ -585,12 +583,15 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
585 br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n", 583 br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n",
586 source->dev->name, addr, vid); 584 source->dev->name, addr, vid);
587 } else { 585 } else {
586 unsigned long now = jiffies;
587
588 /* fastpath: update of existing entry */ 588 /* fastpath: update of existing entry */
589 if (unlikely(source != fdb->dst)) { 589 if (unlikely(source != fdb->dst)) {
590 fdb->dst = source; 590 fdb->dst = source;
591 fdb_modified = true; 591 fdb_modified = true;
592 } 592 }
593 fdb->updated = jiffies; 593 if (now != fdb->updated)
594 fdb->updated = now;
594 if (unlikely(added_by_user)) 595 if (unlikely(added_by_user))
595 fdb->added_by_user = 1; 596 fdb->added_by_user = 1;
596 if (unlikely(fdb_modified)) 597 if (unlikely(fdb_modified))
@@ -598,7 +599,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
598 } 599 }
599 } else { 600 } else {
600 spin_lock(&br->hash_lock); 601 spin_lock(&br->hash_lock);
601 if (likely(!fdb_find(head, addr, vid))) { 602 if (likely(!fdb_find_rcu(head, addr, vid))) {
602 fdb = fdb_create(head, source, addr, vid, 0, 0); 603 fdb = fdb_create(head, source, addr, vid, 0, 0);
603 if (fdb) { 604 if (fdb) {
604 if (unlikely(added_by_user)) 605 if (unlikely(added_by_user))
@@ -782,7 +783,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
782 return -EINVAL; 783 return -EINVAL;
783 } 784 }
784 785
785 fdb = fdb_find(head, addr, vid); 786 fdb = br_fdb_find(br, addr, vid);
786 if (fdb == NULL) { 787 if (fdb == NULL) {
787 if (!(flags & NLM_F_CREATE)) 788 if (!(flags & NLM_F_CREATE))
788 return -ENOENT; 789 return -ENOENT;
@@ -929,55 +930,30 @@ out:
929 return err; 930 return err;
930} 931}
931 932
932static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, 933static int fdb_delete_by_addr_and_port(struct net_bridge *br,
933 u16 vid) 934 const struct net_bridge_port *p,
934{
935 struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
936 struct net_bridge_fdb_entry *fdb;
937
938 fdb = fdb_find(head, addr, vid);
939 if (!fdb)
940 return -ENOENT;
941
942 fdb_delete(br, fdb);
943 return 0;
944}
945
946static int __br_fdb_delete_by_addr(struct net_bridge *br,
947 const unsigned char *addr, u16 vid)
948{
949 int err;
950
951 spin_lock_bh(&br->hash_lock);
952 err = fdb_delete_by_addr(br, addr, vid);
953 spin_unlock_bh(&br->hash_lock);
954
955 return err;
956}
957
958static int fdb_delete_by_addr_and_port(struct net_bridge_port *p,
959 const u8 *addr, u16 vlan) 935 const u8 *addr, u16 vlan)
960{ 936{
961 struct net_bridge *br = p->br;
962 struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)];
963 struct net_bridge_fdb_entry *fdb; 937 struct net_bridge_fdb_entry *fdb;
964 938
965 fdb = fdb_find(head, addr, vlan); 939 fdb = br_fdb_find(br, addr, vlan);
966 if (!fdb || fdb->dst != p) 940 if (!fdb || fdb->dst != p)
967 return -ENOENT; 941 return -ENOENT;
968 942
969 fdb_delete(br, fdb); 943 fdb_delete(br, fdb);
944
970 return 0; 945 return 0;
971} 946}
972 947
973static int __br_fdb_delete(struct net_bridge_port *p, 948static int __br_fdb_delete(struct net_bridge *br,
949 const struct net_bridge_port *p,
974 const unsigned char *addr, u16 vid) 950 const unsigned char *addr, u16 vid)
975{ 951{
976 int err; 952 int err;
977 953
978 spin_lock_bh(&p->br->hash_lock); 954 spin_lock_bh(&br->hash_lock);
979 err = fdb_delete_by_addr_and_port(p, addr, vid); 955 err = fdb_delete_by_addr_and_port(br, p, addr, vid);
980 spin_unlock_bh(&p->br->hash_lock); 956 spin_unlock_bh(&br->hash_lock);
981 957
982 return err; 958 return err;
983} 959}
@@ -990,7 +966,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
990 struct net_bridge_vlan_group *vg; 966 struct net_bridge_vlan_group *vg;
991 struct net_bridge_port *p = NULL; 967 struct net_bridge_port *p = NULL;
992 struct net_bridge_vlan *v; 968 struct net_bridge_vlan *v;
993 struct net_bridge *br = NULL; 969 struct net_bridge *br;
994 int err; 970 int err;
995 971
996 if (dev->priv_flags & IFF_EBRIDGE) { 972 if (dev->priv_flags & IFF_EBRIDGE) {
@@ -1004,6 +980,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
1004 return -EINVAL; 980 return -EINVAL;
1005 } 981 }
1006 vg = nbp_vlan_group(p); 982 vg = nbp_vlan_group(p);
983 br = p->br;
1007 } 984 }
1008 985
1009 if (vid) { 986 if (vid) {
@@ -1013,30 +990,20 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
1013 return -EINVAL; 990 return -EINVAL;
1014 } 991 }
1015 992
1016 if (dev->priv_flags & IFF_EBRIDGE) 993 err = __br_fdb_delete(br, p, addr, vid);
1017 err = __br_fdb_delete_by_addr(br, addr, vid);
1018 else
1019 err = __br_fdb_delete(p, addr, vid);
1020 } else { 994 } else {
1021 err = -ENOENT; 995 err = -ENOENT;
1022 if (dev->priv_flags & IFF_EBRIDGE) 996 err &= __br_fdb_delete(br, p, addr, 0);
1023 err = __br_fdb_delete_by_addr(br, addr, 0);
1024 else
1025 err &= __br_fdb_delete(p, addr, 0);
1026
1027 if (!vg || !vg->num_vlans) 997 if (!vg || !vg->num_vlans)
1028 goto out; 998 return err;
1029 999
1030 list_for_each_entry(v, &vg->vlan_list, vlist) { 1000 list_for_each_entry(v, &vg->vlan_list, vlist) {
1031 if (!br_vlan_should_use(v)) 1001 if (!br_vlan_should_use(v))
1032 continue; 1002 continue;
1033 if (dev->priv_flags & IFF_EBRIDGE) 1003 err &= __br_fdb_delete(br, p, addr, v->vid);
1034 err = __br_fdb_delete_by_addr(br, addr, v->vid);
1035 else
1036 err &= __br_fdb_delete(p, addr, v->vid);
1037 } 1004 }
1038 } 1005 }
1039out: 1006
1040 return err; 1007 return err;
1041} 1008}
1042 1009
@@ -1107,7 +1074,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
1107 spin_lock_bh(&br->hash_lock); 1074 spin_lock_bh(&br->hash_lock);
1108 1075
1109 head = &br->hash[br_mac_hash(addr, vid)]; 1076 head = &br->hash[br_mac_hash(addr, vid)];
1110 fdb = fdb_find(head, addr, vid); 1077 fdb = br_fdb_find(br, addr, vid);
1111 if (!fdb) { 1078 if (!fdb) {
1112 fdb = fdb_create(head, p, addr, vid, 0, 0); 1079 fdb = fdb_create(head, p, addr, vid, 0, 0);
1113 if (!fdb) { 1080 if (!fdb) {
@@ -1135,15 +1102,13 @@ err_unlock:
1135int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, 1102int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
1136 const unsigned char *addr, u16 vid) 1103 const unsigned char *addr, u16 vid)
1137{ 1104{
1138 struct hlist_head *head;
1139 struct net_bridge_fdb_entry *fdb; 1105 struct net_bridge_fdb_entry *fdb;
1140 int err = 0; 1106 int err = 0;
1141 1107
1142 ASSERT_RTNL(); 1108 ASSERT_RTNL();
1143 spin_lock_bh(&br->hash_lock); 1109 spin_lock_bh(&br->hash_lock);
1144 1110
1145 head = &br->hash[br_mac_hash(addr, vid)]; 1111 fdb = br_fdb_find(br, addr, vid);
1146 fdb = fdb_find(head, addr, vid);
1147 if (fdb && fdb->added_by_external_learn) 1112 if (fdb && fdb->added_by_external_learn)
1148 fdb_delete(br, fdb); 1113 fdb_delete(br, fdb);
1149 else 1114 else
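The FDB garbage collector moves from a kernel timer to delayed work on system_long_wq: running in process context lets it take hash_lock per bucket with _bh locking and call cond_resched() between buckets instead of scanning the whole table from softirq context, and the next run is derived from the soonest-expiring entry with a 10 ms floor. The lifecycle boils down to (sketch, calls taken from the hunks above and from br_if.c below):

    INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup);             /* br_dev_setup()  */
    mod_delayed_work(system_long_wq, &br->gc_work, work_delay);  /* re-arm each run */
    cancel_delayed_work_sync(&br->gc_work);                      /* br_dev_delete() */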
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 7cb41aee4c82..902af6ba481c 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -80,7 +80,7 @@ static void __br_forward(const struct net_bridge_port *to,
80 int br_hook; 80 int br_hook;
81 81
82 vg = nbp_vlan_group_rcu(to); 82 vg = nbp_vlan_group_rcu(to);
83 skb = br_handle_vlan(to->br, vg, skb); 83 skb = br_handle_vlan(to->br, to, vg, skb);
84 if (!skb) 84 if (!skb)
85 return; 85 return;
86 86
@@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
186 /* Do not flood unicast traffic to ports that turn it off */ 186 /* Do not flood unicast traffic to ports that turn it off */
187 if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) 187 if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
188 continue; 188 continue;
189 /* Do not flood if mc off, except for traffic we originate */
189 if (pkt_type == BR_PKT_MULTICAST && 190 if (pkt_type == BR_PKT_MULTICAST &&
190 !(p->flags & BR_MCAST_FLOOD)) 191 !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
191 continue; 192 continue;
192 193
193 /* Do not flood to ports that enable proxy ARP */ 194 /* Do not flood to ports that enable proxy ARP */
@@ -220,6 +221,31 @@ out:
220} 221}
221 222
222#ifdef CONFIG_BRIDGE_IGMP_SNOOPING 223#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
224static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
225 const unsigned char *addr, bool local_orig)
226{
227 struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
228 const unsigned char *src = eth_hdr(skb)->h_source;
229
230 if (!should_deliver(p, skb))
231 return;
232
233 /* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
234 if (skb->dev == p->dev && ether_addr_equal(src, addr))
235 return;
236
237 skb = skb_copy(skb, GFP_ATOMIC);
238 if (!skb) {
239 dev->stats.tx_dropped++;
240 return;
241 }
242
243 if (!is_broadcast_ether_addr(addr))
244 memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
245
246 __br_forward(p, skb, local_orig);
247}
248
223/* called with rcu_read_lock */ 249/* called with rcu_read_lock */
224void br_multicast_flood(struct net_bridge_mdb_entry *mdst, 250void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
225 struct sk_buff *skb, 251 struct sk_buff *skb,
@@ -241,10 +267,20 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
241 rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) : 267 rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
242 NULL; 268 NULL;
243 269
244 port = (unsigned long)lport > (unsigned long)rport ? 270 if ((unsigned long)lport > (unsigned long)rport) {
245 lport : rport; 271 port = lport;
272
273 if (port->flags & BR_MULTICAST_TO_UNICAST) {
274 maybe_deliver_addr(lport, skb, p->eth_addr,
275 local_orig);
276 goto delivered;
277 }
278 } else {
279 port = rport;
280 }
246 281
247 prev = maybe_deliver(prev, port, skb, local_orig); 282 prev = maybe_deliver(prev, port, skb, local_orig);
283delivered:
248 if (IS_ERR(prev)) 284 if (IS_ERR(prev))
249 goto out; 285 goto out;
250 if (prev == port) 286 if (prev == port)
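Note: two behavioural changes in the flooding paths above. First, the BR_MCAST_FLOOD restriction in br_flood() no longer applies to traffic the bridge originates itself (skb->dev == br->dev), per the added comment. Second, when a multicast group member port has BR_MULTICAST_TO_UNICAST set, br_multicast_flood() delivers through maybe_deliver_addr() instead of the regular per-port path: it copies the skb, rewrites the Ethernet destination to the member's recorded unicast address (p->eth_addr, learned from the IGMP/MLD report source), and skips copies that would go straight back to the reporting host, so hairpin setups do not break IPv6 DAD. Userspace toggles this per port through IFLA_BRPORT_MCAST_TO_UCAST (see the br_netlink.c hunks below); in a sufficiently recent iproute2 this should correspond to the mcast_to_unicast keyword of bridge link set.
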
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index ed0dd3340084..56a2a72e7738 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -311,9 +311,8 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
311 311
312 br_fdb_delete_by_port(br, NULL, 0, 1); 312 br_fdb_delete_by_port(br, NULL, 0, 1);
313 313
314 br_vlan_flush(br);
315 br_multicast_dev_del(br); 314 br_multicast_dev_del(br);
316 del_timer_sync(&br->gc_timer); 315 cancel_delayed_work_sync(&br->gc_work);
317 316
318 br_sysfs_delbr(br->dev); 317 br_sysfs_delbr(br->dev);
319 unregister_netdevice_queue(br->dev, head); 318 unregister_netdevice_queue(br->dev, head);
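Note: the FDB garbage collector moves from a timer (gc_timer) to a delayed work item (gc_work, see the br_private.h hunk below), so ageing runs in process context and teardown can use cancel_delayed_work_sync(). A rough sketch of the resulting pattern, assuming the work item re-arms itself; the workqueue choice and delay handling are assumptions, only the gc_work/br_fdb_cleanup names come from the hunks in this patch:

	static void br_fdb_cleanup(struct work_struct *work)
	{
		struct net_bridge *br = container_of(work, struct net_bridge,
						     gc_work.work);
		unsigned long next_timer = jiffies + br->ageing_time;

		/* ... walk br->hash[] buckets, expire aged-out entries and
		 * remember the earliest upcoming expiry in next_timer ...
		 */

		/* re-arm; a real implementation clamps the delay */
		mod_delayed_work(system_long_wq, &br->gc_work,
				 next_timer - jiffies);
	}
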
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 855b72fbe1da..013f2290bfa5 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -21,6 +21,7 @@
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/rculist.h> 22#include <linux/rculist.h>
23#include "br_private.h" 23#include "br_private.h"
24#include "br_private_tunnel.h"
24 25
25/* Hook for brouter */ 26/* Hook for brouter */
26br_should_route_hook_t __rcu *br_should_route_hook __read_mostly; 27br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
@@ -29,6 +30,7 @@ EXPORT_SYMBOL(br_should_route_hook);
29static int 30static int
30br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) 31br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
31{ 32{
33 br_drop_fake_rtable(skb);
32 return netif_receive_skb(skb); 34 return netif_receive_skb(skb);
33} 35}
34 36
@@ -57,7 +59,7 @@ static int br_pass_frame_up(struct sk_buff *skb)
57 59
58 indev = skb->dev; 60 indev = skb->dev;
59 skb->dev = brdev; 61 skb->dev = brdev;
60 skb = br_handle_vlan(br, vg, skb); 62 skb = br_handle_vlan(br, NULL, vg, skb);
61 if (!skb) 63 if (!skb)
62 return NET_RX_DROP; 64 return NET_RX_DROP;
63 /* update the multicast stats if the packet is IGMP/MLD */ 65 /* update the multicast stats if the packet is IGMP/MLD */
@@ -113,7 +115,7 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
113 return; 115 return;
114 } 116 }
115 117
116 f = __br_fdb_get(br, n->ha, vid); 118 f = br_fdb_find_rcu(br, n->ha, vid);
117 if (f && ((p->flags & BR_PROXYARP) || 119 if (f && ((p->flags & BR_PROXYARP) ||
118 (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) { 120 (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) {
119 arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip, 121 arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip,
@@ -188,16 +190,19 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
188 } 190 }
189 break; 191 break;
190 case BR_PKT_UNICAST: 192 case BR_PKT_UNICAST:
191 dst = __br_fdb_get(br, dest, vid); 193 dst = br_fdb_find_rcu(br, dest, vid);
192 default: 194 default:
193 break; 195 break;
194 } 196 }
195 197
196 if (dst) { 198 if (dst) {
199 unsigned long now = jiffies;
200
197 if (dst->is_local) 201 if (dst->is_local)
198 return br_pass_frame_up(skb); 202 return br_pass_frame_up(skb);
199 203
200 dst->used = jiffies; 204 if (now != dst->used)
205 dst->used = now;
201 br_forward(dst->dst, skb, local_rcv, false); 206 br_forward(dst->dst, skb, local_rcv, false);
202 } else { 207 } else {
203 if (!mcast_hit) 208 if (!mcast_hit)
@@ -261,6 +266,11 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
261 return RX_HANDLER_CONSUMED; 266 return RX_HANDLER_CONSUMED;
262 267
263 p = br_port_get_rcu(skb->dev); 268 p = br_port_get_rcu(skb->dev);
269 if (p->flags & BR_VLAN_TUNNEL) {
270 if (br_handle_ingress_vlan_tunnel(skb, p,
271 nbp_vlan_group_rcu(p)))
272 goto drop;
273 }
264 274
265 if (unlikely(is_link_local_ether_addr(dest))) { 275 if (unlikely(is_link_local_ether_addr(dest))) {
266 u16 fwd_mask = p->br->group_fwd_mask_required; 276 u16 fwd_mask = p->br->group_fwd_mask_required;
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index da8157c57eb1..7970f8540cbb 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -149,7 +149,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
149 b.hello_timer_value = br_timer_value(&br->hello_timer); 149 b.hello_timer_value = br_timer_value(&br->hello_timer);
150 b.tcn_timer_value = br_timer_value(&br->tcn_timer); 150 b.tcn_timer_value = br_timer_value(&br->tcn_timer);
151 b.topology_change_timer_value = br_timer_value(&br->topology_change_timer); 151 b.topology_change_timer_value = br_timer_value(&br->topology_change_timer);
152 b.gc_timer_value = br_timer_value(&br->gc_timer); 152 b.gc_timer_value = br_timer_value(&br->gc_work.timer);
153 rcu_read_unlock(); 153 rcu_read_unlock();
154 154
155 if (copy_to_user((void __user *)args[1], &b, sizeof(b))) 155 if (copy_to_user((void __user *)args[1], &b, sizeof(b)))
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 7dbc80d01eb0..056e6ac49d8f 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -531,7 +531,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
531 break; 531 break;
532 } 532 }
533 533
534 p = br_multicast_new_port_group(port, group, *pp, state); 534 p = br_multicast_new_port_group(port, group, *pp, state, NULL);
535 if (unlikely(!p)) 535 if (unlikely(!p))
536 return -ENOMEM; 536 return -ENOMEM;
537 rcu_assign_pointer(*pp, p); 537 rcu_assign_pointer(*pp, p);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index b30e77e8427c..faa7261a992f 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -27,6 +27,7 @@
27#include <linux/inetdevice.h> 27#include <linux/inetdevice.h>
28#include <linux/mroute.h> 28#include <linux/mroute.h>
29#include <net/ip.h> 29#include <net/ip.h>
30#include <net/switchdev.h>
30#if IS_ENABLED(CONFIG_IPV6) 31#if IS_ENABLED(CONFIG_IPV6)
31#include <net/ipv6.h> 32#include <net/ipv6.h>
32#include <net/mld.h> 33#include <net/mld.h>
@@ -43,12 +44,15 @@ static void br_multicast_add_router(struct net_bridge *br,
43static void br_ip4_multicast_leave_group(struct net_bridge *br, 44static void br_ip4_multicast_leave_group(struct net_bridge *br,
44 struct net_bridge_port *port, 45 struct net_bridge_port *port,
45 __be32 group, 46 __be32 group,
46 __u16 vid); 47 __u16 vid,
48 const unsigned char *src);
49
50static void __del_port_router(struct net_bridge_port *p);
47#if IS_ENABLED(CONFIG_IPV6) 51#if IS_ENABLED(CONFIG_IPV6)
48static void br_ip6_multicast_leave_group(struct net_bridge *br, 52static void br_ip6_multicast_leave_group(struct net_bridge *br,
49 struct net_bridge_port *port, 53 struct net_bridge_port *port,
50 const struct in6_addr *group, 54 const struct in6_addr *group,
51 __u16 vid); 55 __u16 vid, const unsigned char *src);
52#endif 56#endif
53unsigned int br_mdb_rehash_seq; 57unsigned int br_mdb_rehash_seq;
54 58
@@ -540,7 +544,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
540 break; 544 break;
541 case 2: 545 case 2:
542 mld2q = (struct mld2_query *)icmp6_hdr(skb); 546 mld2q = (struct mld2_query *)icmp6_hdr(skb);
543 mld2q->mld2q_mrc = ntohs((u16)jiffies_to_msecs(interval)); 547 mld2q->mld2q_mrc = htons((u16)jiffies_to_msecs(interval));
544 mld2q->mld2q_type = ICMPV6_MGM_QUERY; 548 mld2q->mld2q_type = ICMPV6_MGM_QUERY;
545 mld2q->mld2q_code = 0; 549 mld2q->mld2q_code = 0;
546 mld2q->mld2q_cksum = 0; 550 mld2q->mld2q_cksum = 0;
@@ -711,7 +715,8 @@ struct net_bridge_port_group *br_multicast_new_port_group(
711 struct net_bridge_port *port, 715 struct net_bridge_port *port,
712 struct br_ip *group, 716 struct br_ip *group,
713 struct net_bridge_port_group __rcu *next, 717 struct net_bridge_port_group __rcu *next,
714 unsigned char flags) 718 unsigned char flags,
719 const unsigned char *src)
715{ 720{
716 struct net_bridge_port_group *p; 721 struct net_bridge_port_group *p;
717 722
@@ -726,12 +731,32 @@ struct net_bridge_port_group *br_multicast_new_port_group(
726 hlist_add_head(&p->mglist, &port->mglist); 731 hlist_add_head(&p->mglist, &port->mglist);
727 setup_timer(&p->timer, br_multicast_port_group_expired, 732 setup_timer(&p->timer, br_multicast_port_group_expired,
728 (unsigned long)p); 733 (unsigned long)p);
734
735 if (src)
736 memcpy(p->eth_addr, src, ETH_ALEN);
737 else
738 memset(p->eth_addr, 0xff, ETH_ALEN);
739
729 return p; 740 return p;
730} 741}
731 742
743static bool br_port_group_equal(struct net_bridge_port_group *p,
744 struct net_bridge_port *port,
745 const unsigned char *src)
746{
747 if (p->port != port)
748 return false;
749
750 if (!(port->flags & BR_MULTICAST_TO_UNICAST))
751 return true;
752
753 return ether_addr_equal(src, p->eth_addr);
754}
755
732static int br_multicast_add_group(struct net_bridge *br, 756static int br_multicast_add_group(struct net_bridge *br,
733 struct net_bridge_port *port, 757 struct net_bridge_port *port,
734 struct br_ip *group) 758 struct br_ip *group,
759 const unsigned char *src)
735{ 760{
736 struct net_bridge_port_group __rcu **pp; 761 struct net_bridge_port_group __rcu **pp;
737 struct net_bridge_port_group *p; 762 struct net_bridge_port_group *p;
@@ -758,13 +783,13 @@ static int br_multicast_add_group(struct net_bridge *br,
758 for (pp = &mp->ports; 783 for (pp = &mp->ports;
759 (p = mlock_dereference(*pp, br)) != NULL; 784 (p = mlock_dereference(*pp, br)) != NULL;
760 pp = &p->next) { 785 pp = &p->next) {
761 if (p->port == port) 786 if (br_port_group_equal(p, port, src))
762 goto found; 787 goto found;
763 if ((unsigned long)p->port < (unsigned long)port) 788 if ((unsigned long)p->port < (unsigned long)port)
764 break; 789 break;
765 } 790 }
766 791
767 p = br_multicast_new_port_group(port, group, *pp, 0); 792 p = br_multicast_new_port_group(port, group, *pp, 0, src);
768 if (unlikely(!p)) 793 if (unlikely(!p))
769 goto err; 794 goto err;
770 rcu_assign_pointer(*pp, p); 795 rcu_assign_pointer(*pp, p);
@@ -783,7 +808,8 @@ err:
783static int br_ip4_multicast_add_group(struct net_bridge *br, 808static int br_ip4_multicast_add_group(struct net_bridge *br,
784 struct net_bridge_port *port, 809 struct net_bridge_port *port,
785 __be32 group, 810 __be32 group,
786 __u16 vid) 811 __u16 vid,
812 const unsigned char *src)
787{ 813{
788 struct br_ip br_group; 814 struct br_ip br_group;
789 815
@@ -794,14 +820,15 @@ static int br_ip4_multicast_add_group(struct net_bridge *br,
794 br_group.proto = htons(ETH_P_IP); 820 br_group.proto = htons(ETH_P_IP);
795 br_group.vid = vid; 821 br_group.vid = vid;
796 822
797 return br_multicast_add_group(br, port, &br_group); 823 return br_multicast_add_group(br, port, &br_group, src);
798} 824}
799 825
800#if IS_ENABLED(CONFIG_IPV6) 826#if IS_ENABLED(CONFIG_IPV6)
801static int br_ip6_multicast_add_group(struct net_bridge *br, 827static int br_ip6_multicast_add_group(struct net_bridge *br,
802 struct net_bridge_port *port, 828 struct net_bridge_port *port,
803 const struct in6_addr *group, 829 const struct in6_addr *group,
804 __u16 vid) 830 __u16 vid,
831 const unsigned char *src)
805{ 832{
806 struct br_ip br_group; 833 struct br_ip br_group;
807 834
@@ -812,7 +839,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
812 br_group.proto = htons(ETH_P_IPV6); 839 br_group.proto = htons(ETH_P_IPV6);
813 br_group.vid = vid; 840 br_group.vid = vid;
814 841
815 return br_multicast_add_group(br, port, &br_group); 842 return br_multicast_add_group(br, port, &br_group, src);
816} 843}
817#endif 844#endif
818 845
@@ -824,16 +851,10 @@ static void br_multicast_router_expired(unsigned long data)
824 spin_lock(&br->multicast_lock); 851 spin_lock(&br->multicast_lock);
825 if (port->multicast_router == MDB_RTR_TYPE_DISABLED || 852 if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
826 port->multicast_router == MDB_RTR_TYPE_PERM || 853 port->multicast_router == MDB_RTR_TYPE_PERM ||
827 timer_pending(&port->multicast_router_timer) || 854 timer_pending(&port->multicast_router_timer))
828 hlist_unhashed(&port->rlist))
829 goto out; 855 goto out;
830 856
831 hlist_del_init_rcu(&port->rlist); 857 __del_port_router(port);
832 br_rtr_notify(br->dev, port, RTM_DELMDB);
833 /* Don't allow timer refresh if the router expired */
834 if (port->multicast_router == MDB_RTR_TYPE_TEMP)
835 port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
836
837out: 858out:
838 spin_unlock(&br->multicast_lock); 859 spin_unlock(&br->multicast_lock);
839} 860}
@@ -982,6 +1003,18 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
982} 1003}
983#endif 1004#endif
984 1005
1006static void br_mc_disabled_update(struct net_device *dev, bool value)
1007{
1008 struct switchdev_attr attr = {
1009 .orig_dev = dev,
1010 .id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
1011 .flags = SWITCHDEV_F_DEFER,
1012 .u.mc_disabled = value,
1013 };
1014
1015 switchdev_port_attr_set(dev, &attr);
1016}
1017
985int br_multicast_add_port(struct net_bridge_port *port) 1018int br_multicast_add_port(struct net_bridge_port *port)
986{ 1019{
987 port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; 1020 port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
@@ -994,6 +1027,8 @@ int br_multicast_add_port(struct net_bridge_port *port)
994 setup_timer(&port->ip6_own_query.timer, 1027 setup_timer(&port->ip6_own_query.timer,
995 br_ip6_multicast_port_query_expired, (unsigned long)port); 1028 br_ip6_multicast_port_query_expired, (unsigned long)port);
996#endif 1029#endif
1030 br_mc_disabled_update(port->dev, port->br->multicast_disabled);
1031
997 port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); 1032 port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
998 if (!port->mcast_stats) 1033 if (!port->mcast_stats)
999 return -ENOMEM; 1034 return -ENOMEM;
@@ -1061,13 +1096,8 @@ void br_multicast_disable_port(struct net_bridge_port *port)
1061 if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) 1096 if (!(pg->flags & MDB_PG_FLAGS_PERMANENT))
1062 br_multicast_del_pg(br, pg); 1097 br_multicast_del_pg(br, pg);
1063 1098
1064 if (!hlist_unhashed(&port->rlist)) { 1099 __del_port_router(port);
1065 hlist_del_init_rcu(&port->rlist); 1100
1066 br_rtr_notify(br->dev, port, RTM_DELMDB);
1067 /* Don't allow timer refresh if disabling */
1068 if (port->multicast_router == MDB_RTR_TYPE_TEMP)
1069 port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
1070 }
1071 del_timer(&port->multicast_router_timer); 1101 del_timer(&port->multicast_router_timer);
1072 del_timer(&port->ip4_own_query.timer); 1102 del_timer(&port->ip4_own_query.timer);
1073#if IS_ENABLED(CONFIG_IPV6) 1103#if IS_ENABLED(CONFIG_IPV6)
@@ -1081,6 +1111,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
1081 struct sk_buff *skb, 1111 struct sk_buff *skb,
1082 u16 vid) 1112 u16 vid)
1083{ 1113{
1114 const unsigned char *src;
1084 struct igmpv3_report *ih; 1115 struct igmpv3_report *ih;
1085 struct igmpv3_grec *grec; 1116 struct igmpv3_grec *grec;
1086 int i; 1117 int i;
@@ -1121,12 +1152,14 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
1121 continue; 1152 continue;
1122 } 1153 }
1123 1154
1155 src = eth_hdr(skb)->h_source;
1124 if ((type == IGMPV3_CHANGE_TO_INCLUDE || 1156 if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
1125 type == IGMPV3_MODE_IS_INCLUDE) && 1157 type == IGMPV3_MODE_IS_INCLUDE) &&
1126 ntohs(grec->grec_nsrcs) == 0) { 1158 ntohs(grec->grec_nsrcs) == 0) {
1127 br_ip4_multicast_leave_group(br, port, group, vid); 1159 br_ip4_multicast_leave_group(br, port, group, vid, src);
1128 } else { 1160 } else {
1129 err = br_ip4_multicast_add_group(br, port, group, vid); 1161 err = br_ip4_multicast_add_group(br, port, group, vid,
1162 src);
1130 if (err) 1163 if (err)
1131 break; 1164 break;
1132 } 1165 }
@@ -1141,6 +1174,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
1141 struct sk_buff *skb, 1174 struct sk_buff *skb,
1142 u16 vid) 1175 u16 vid)
1143{ 1176{
1177 const unsigned char *src;
1144 struct icmp6hdr *icmp6h; 1178 struct icmp6hdr *icmp6h;
1145 struct mld2_grec *grec; 1179 struct mld2_grec *grec;
1146 int i; 1180 int i;
@@ -1188,14 +1222,16 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
1188 continue; 1222 continue;
1189 } 1223 }
1190 1224
1225 src = eth_hdr(skb)->h_source;
1191 if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || 1226 if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
1192 grec->grec_type == MLD2_MODE_IS_INCLUDE) && 1227 grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
1193 ntohs(*nsrcs) == 0) { 1228 ntohs(*nsrcs) == 0) {
1194 br_ip6_multicast_leave_group(br, port, &grec->grec_mca, 1229 br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
1195 vid); 1230 vid, src);
1196 } else { 1231 } else {
1197 err = br_ip6_multicast_add_group(br, port, 1232 err = br_ip6_multicast_add_group(br, port,
1198 &grec->grec_mca, vid); 1233 &grec->grec_mca, vid,
1234 src);
1199 if (err) 1235 if (err)
1200 break; 1236 break;
1201 } 1237 }
@@ -1281,6 +1317,19 @@ br_multicast_update_query_timer(struct net_bridge *br,
1281 mod_timer(&query->timer, jiffies + br->multicast_querier_interval); 1317 mod_timer(&query->timer, jiffies + br->multicast_querier_interval);
1282} 1318}
1283 1319
1320static void br_port_mc_router_state_change(struct net_bridge_port *p,
1321 bool is_mc_router)
1322{
1323 struct switchdev_attr attr = {
1324 .orig_dev = p->dev,
1325 .id = SWITCHDEV_ATTR_ID_PORT_MROUTER,
1326 .flags = SWITCHDEV_F_DEFER,
1327 .u.mrouter = is_mc_router,
1328 };
1329
1330 switchdev_port_attr_set(p->dev, &attr);
1331}
1332
1284/* 1333/*
1285 * Add port to router_list 1334 * Add port to router_list
1286 * list is maintained ordered by pointer value 1335 * list is maintained ordered by pointer value
@@ -1306,6 +1355,7 @@ static void br_multicast_add_router(struct net_bridge *br,
1306 else 1355 else
1307 hlist_add_head_rcu(&port->rlist, &br->router_list); 1356 hlist_add_head_rcu(&port->rlist, &br->router_list);
1308 br_rtr_notify(br->dev, port, RTM_NEWMDB); 1357 br_rtr_notify(br->dev, port, RTM_NEWMDB);
1358 br_port_mc_router_state_change(port, true);
1309} 1359}
1310 1360
1311static void br_multicast_mark_router(struct net_bridge *br, 1361static void br_multicast_mark_router(struct net_bridge *br,
@@ -1511,7 +1561,8 @@ br_multicast_leave_group(struct net_bridge *br,
1511 struct net_bridge_port *port, 1561 struct net_bridge_port *port,
1512 struct br_ip *group, 1562 struct br_ip *group,
1513 struct bridge_mcast_other_query *other_query, 1563 struct bridge_mcast_other_query *other_query,
1514 struct bridge_mcast_own_query *own_query) 1564 struct bridge_mcast_own_query *own_query,
1565 const unsigned char *src)
1515{ 1566{
1516 struct net_bridge_mdb_htable *mdb; 1567 struct net_bridge_mdb_htable *mdb;
1517 struct net_bridge_mdb_entry *mp; 1568 struct net_bridge_mdb_entry *mp;
@@ -1535,7 +1586,7 @@ br_multicast_leave_group(struct net_bridge *br,
1535 for (pp = &mp->ports; 1586 for (pp = &mp->ports;
1536 (p = mlock_dereference(*pp, br)) != NULL; 1587 (p = mlock_dereference(*pp, br)) != NULL;
1537 pp = &p->next) { 1588 pp = &p->next) {
1538 if (p->port != port) 1589 if (!br_port_group_equal(p, port, src))
1539 continue; 1590 continue;
1540 1591
1541 rcu_assign_pointer(*pp, p->next); 1592 rcu_assign_pointer(*pp, p->next);
@@ -1566,7 +1617,7 @@ br_multicast_leave_group(struct net_bridge *br,
1566 for (p = mlock_dereference(mp->ports, br); 1617 for (p = mlock_dereference(mp->ports, br);
1567 p != NULL; 1618 p != NULL;
1568 p = mlock_dereference(p->next, br)) { 1619 p = mlock_dereference(p->next, br)) {
1569 if (p->port != port) 1620 if (!br_port_group_equal(p, port, src))
1570 continue; 1621 continue;
1571 1622
1572 if (!hlist_unhashed(&p->mglist) && 1623 if (!hlist_unhashed(&p->mglist) &&
@@ -1617,7 +1668,8 @@ out:
1617static void br_ip4_multicast_leave_group(struct net_bridge *br, 1668static void br_ip4_multicast_leave_group(struct net_bridge *br,
1618 struct net_bridge_port *port, 1669 struct net_bridge_port *port,
1619 __be32 group, 1670 __be32 group,
1620 __u16 vid) 1671 __u16 vid,
1672 const unsigned char *src)
1621{ 1673{
1622 struct br_ip br_group; 1674 struct br_ip br_group;
1623 struct bridge_mcast_own_query *own_query; 1675 struct bridge_mcast_own_query *own_query;
@@ -1632,14 +1684,15 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
1632 br_group.vid = vid; 1684 br_group.vid = vid;
1633 1685
1634 br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query, 1686 br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query,
1635 own_query); 1687 own_query, src);
1636} 1688}
1637 1689
1638#if IS_ENABLED(CONFIG_IPV6) 1690#if IS_ENABLED(CONFIG_IPV6)
1639static void br_ip6_multicast_leave_group(struct net_bridge *br, 1691static void br_ip6_multicast_leave_group(struct net_bridge *br,
1640 struct net_bridge_port *port, 1692 struct net_bridge_port *port,
1641 const struct in6_addr *group, 1693 const struct in6_addr *group,
1642 __u16 vid) 1694 __u16 vid,
1695 const unsigned char *src)
1643{ 1696{
1644 struct br_ip br_group; 1697 struct br_ip br_group;
1645 struct bridge_mcast_own_query *own_query; 1698 struct bridge_mcast_own_query *own_query;
@@ -1654,7 +1707,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
1654 br_group.vid = vid; 1707 br_group.vid = vid;
1655 1708
1656 br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query, 1709 br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query,
1657 own_query); 1710 own_query, src);
1658} 1711}
1659#endif 1712#endif
1660 1713
@@ -1712,6 +1765,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
1712 u16 vid) 1765 u16 vid)
1713{ 1766{
1714 struct sk_buff *skb_trimmed = NULL; 1767 struct sk_buff *skb_trimmed = NULL;
1768 const unsigned char *src;
1715 struct igmphdr *ih; 1769 struct igmphdr *ih;
1716 int err; 1770 int err;
1717 1771
@@ -1731,13 +1785,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
1731 } 1785 }
1732 1786
1733 ih = igmp_hdr(skb); 1787 ih = igmp_hdr(skb);
1788 src = eth_hdr(skb)->h_source;
1734 BR_INPUT_SKB_CB(skb)->igmp = ih->type; 1789 BR_INPUT_SKB_CB(skb)->igmp = ih->type;
1735 1790
1736 switch (ih->type) { 1791 switch (ih->type) {
1737 case IGMP_HOST_MEMBERSHIP_REPORT: 1792 case IGMP_HOST_MEMBERSHIP_REPORT:
1738 case IGMPV2_HOST_MEMBERSHIP_REPORT: 1793 case IGMPV2_HOST_MEMBERSHIP_REPORT:
1739 BR_INPUT_SKB_CB(skb)->mrouters_only = 1; 1794 BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
1740 err = br_ip4_multicast_add_group(br, port, ih->group, vid); 1795 err = br_ip4_multicast_add_group(br, port, ih->group, vid, src);
1741 break; 1796 break;
1742 case IGMPV3_HOST_MEMBERSHIP_REPORT: 1797 case IGMPV3_HOST_MEMBERSHIP_REPORT:
1743 err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); 1798 err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
@@ -1746,7 +1801,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
1746 err = br_ip4_multicast_query(br, port, skb_trimmed, vid); 1801 err = br_ip4_multicast_query(br, port, skb_trimmed, vid);
1747 break; 1802 break;
1748 case IGMP_HOST_LEAVE_MESSAGE: 1803 case IGMP_HOST_LEAVE_MESSAGE:
1749 br_ip4_multicast_leave_group(br, port, ih->group, vid); 1804 br_ip4_multicast_leave_group(br, port, ih->group, vid, src);
1750 break; 1805 break;
1751 } 1806 }
1752 1807
@@ -1766,6 +1821,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
1766 u16 vid) 1821 u16 vid)
1767{ 1822{
1768 struct sk_buff *skb_trimmed = NULL; 1823 struct sk_buff *skb_trimmed = NULL;
1824 const unsigned char *src;
1769 struct mld_msg *mld; 1825 struct mld_msg *mld;
1770 int err; 1826 int err;
1771 1827
@@ -1785,8 +1841,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
1785 1841
1786 switch (mld->mld_type) { 1842 switch (mld->mld_type) {
1787 case ICMPV6_MGM_REPORT: 1843 case ICMPV6_MGM_REPORT:
1844 src = eth_hdr(skb)->h_source;
1788 BR_INPUT_SKB_CB(skb)->mrouters_only = 1; 1845 BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
1789 err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); 1846 err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid,
1847 src);
1790 break; 1848 break;
1791 case ICMPV6_MLD2_REPORT: 1849 case ICMPV6_MLD2_REPORT:
1792 err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); 1850 err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid);
@@ -1795,7 +1853,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
1795 err = br_ip6_multicast_query(br, port, skb_trimmed, vid); 1853 err = br_ip6_multicast_query(br, port, skb_trimmed, vid);
1796 break; 1854 break;
1797 case ICMPV6_MGM_REDUCTION: 1855 case ICMPV6_MGM_REDUCTION:
1798 br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); 1856 src = eth_hdr(skb)->h_source;
1857 br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src);
1799 break; 1858 break;
1800 } 1859 }
1801 1860
@@ -1972,8 +2031,6 @@ void br_multicast_dev_del(struct net_bridge *br)
1972 2031
1973out: 2032out:
1974 spin_unlock_bh(&br->multicast_lock); 2033 spin_unlock_bh(&br->multicast_lock);
1975
1976 free_percpu(br->mcast_stats);
1977} 2034}
1978 2035
1979int br_multicast_set_router(struct net_bridge *br, unsigned long val) 2036int br_multicast_set_router(struct net_bridge *br, unsigned long val)
@@ -2004,6 +2061,11 @@ static void __del_port_router(struct net_bridge_port *p)
2004 return; 2061 return;
2005 hlist_del_init_rcu(&p->rlist); 2062 hlist_del_init_rcu(&p->rlist);
2006 br_rtr_notify(p->br->dev, p, RTM_DELMDB); 2063 br_rtr_notify(p->br->dev, p, RTM_DELMDB);
2064 br_port_mc_router_state_change(p, false);
2065
2066 /* don't allow timer refresh */
2067 if (p->multicast_router == MDB_RTR_TYPE_TEMP)
2068 p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
2007} 2069}
2008 2070
2009int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) 2071int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
@@ -2081,6 +2143,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
2081 if (br->multicast_disabled == !val) 2143 if (br->multicast_disabled == !val)
2082 goto unlock; 2144 goto unlock;
2083 2145
2146 br_mc_disabled_update(br->dev, !val);
2084 br->multicast_disabled = !val; 2147 br->multicast_disabled = !val;
2085 if (br->multicast_disabled) 2148 if (br->multicast_disabled)
2086 goto unlock; 2149 goto unlock;
@@ -2466,6 +2529,11 @@ int br_multicast_init_stats(struct net_bridge *br)
2466 return 0; 2529 return 0;
2467} 2530}
2468 2531
2532void br_multicast_uninit_stats(struct net_bridge *br)
2533{
2534 free_percpu(br->mcast_stats);
2535}
2536
2469static void mcast_stats_add_dir(u64 *dst, u64 *src) 2537static void mcast_stats_add_dir(u64 *dst, u64 *src)
2470{ 2538{
2471 dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX]; 2539 dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
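Note: the multicast changes above also start mirroring state to switchdev drivers: br_mc_disabled_update() propagates the bridge's multicast_disabled toggle (SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED) and br_port_mc_router_state_change() propagates per-port multicast-router transitions (SWITCHDEV_ATTR_ID_PORT_MROUTER); both use SWITCHDEV_F_DEFER, i.e. are applied from process context. A hedged sketch of how a driver might consume them -- the callback shape follows the switchdev_ops attr_set convention of this kernel generation, the foo_* helpers are hypothetical, and only the attribute ids and union members come from the hunks above:

	static int foo_port_attr_set(struct net_device *dev,
				     const struct switchdev_attr *attr,
				     struct switchdev_trans *trans)
	{
		switch (attr->id) {
		case SWITCHDEV_ATTR_ID_PORT_MROUTER:
			/* enable/disable mrouter flooding towards this port */
			return foo_port_set_mrouter(dev, attr->u.mrouter);
		case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED:
			/* propagate IGMP/MLD snooping on/off to the ASIC */
			return foo_set_mc_disabled(dev, attr->u.mc_disabled);
		default:
			return -EOPNOTSUPP;
		}
	}
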
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 95087e6e8258..1f1e62095464 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -521,21 +521,6 @@ static unsigned int br_nf_pre_routing(void *priv,
521} 521}
522 522
523 523
524/* PF_BRIDGE/LOCAL_IN ************************************************/
525/* The packet is locally destined, which requires a real
526 * dst_entry, so detach the fake one. On the way up, the
527 * packet would pass through PRE_ROUTING again (which already
528 * took place when the packet entered the bridge), but we
529 * register an IPv4 PRE_ROUTING 'sabotage' hook that will
530 * prevent this from happening. */
531static unsigned int br_nf_local_in(void *priv,
532 struct sk_buff *skb,
533 const struct nf_hook_state *state)
534{
535 br_drop_fake_rtable(skb);
536 return NF_ACCEPT;
537}
538
539/* PF_BRIDGE/FORWARD *************************************************/ 524/* PF_BRIDGE/FORWARD *************************************************/
540static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 525static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
541{ 526{
@@ -721,18 +706,20 @@ static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
721 706
722static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) 707static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
723{ 708{
724 struct nf_bridge_info *nf_bridge; 709 struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
725 unsigned int mtu_reserved; 710 unsigned int mtu, mtu_reserved;
726 711
727 mtu_reserved = nf_bridge_mtu_reduction(skb); 712 mtu_reserved = nf_bridge_mtu_reduction(skb);
713 mtu = skb->dev->mtu;
728 714
729 if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { 715 if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu)
716 mtu = nf_bridge->frag_max_size;
717
718 if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) {
730 nf_bridge_info_free(skb); 719 nf_bridge_info_free(skb);
731 return br_dev_queue_push_xmit(net, sk, skb); 720 return br_dev_queue_push_xmit(net, sk, skb);
732 } 721 }
733 722
734 nf_bridge = nf_bridge_info_get(skb);
735
736 /* This is wrong! We should preserve the original fragment 723 /* This is wrong! We should preserve the original fragment
737 * boundaries by preserving frag_list rather than refragmenting. 724 * boundaries by preserving frag_list rather than refragmenting.
738 */ 725 */
@@ -908,12 +895,6 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
908 .priority = NF_BR_PRI_BRNF, 895 .priority = NF_BR_PRI_BRNF,
909 }, 896 },
910 { 897 {
911 .hook = br_nf_local_in,
912 .pf = NFPROTO_BRIDGE,
913 .hooknum = NF_BR_LOCAL_IN,
914 .priority = NF_BR_PRI_BRNF,
915 },
916 {
917 .hook = br_nf_forward_ip, 898 .hook = br_nf_forward_ip,
918 .pf = NFPROTO_BRIDGE, 899 .pf = NFPROTO_BRIDGE,
919 .hooknum = NF_BR_FORWARD, 900 .hooknum = NF_BR_FORWARD,
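Note: the dedicated NF_BR_LOCAL_IN hook (br_nf_local_in) is removed; detaching the fake rtable now happens unconditionally in br_netif_receive_skb() (see the br_input.c hunk above). br_nf_dev_queue_xmit() additionally caps the effective MTU at nf_bridge->frag_max_size when it is set, so the refragmentation decision honours the largest fragment size originally observed rather than only the device MTU.
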
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 7109b389ea58..225ef7d53701 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -20,6 +20,7 @@
20 20
21#include "br_private.h" 21#include "br_private.h"
22#include "br_private_stp.h" 22#include "br_private_stp.h"
23#include "br_private_tunnel.h"
23 24
24static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, 25static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
25 u32 filter_mask) 26 u32 filter_mask)
@@ -95,9 +96,10 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
95 u32 filter_mask) 96 u32 filter_mask)
96{ 97{
97 struct net_bridge_vlan_group *vg = NULL; 98 struct net_bridge_vlan_group *vg = NULL;
98 struct net_bridge_port *p; 99 struct net_bridge_port *p = NULL;
99 struct net_bridge *br; 100 struct net_bridge *br;
100 int num_vlan_infos; 101 int num_vlan_infos;
102 size_t vinfo_sz = 0;
101 103
102 rcu_read_lock(); 104 rcu_read_lock();
103 if (br_port_exists(dev)) { 105 if (br_port_exists(dev)) {
@@ -110,8 +112,13 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
110 num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask); 112 num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask);
111 rcu_read_unlock(); 113 rcu_read_unlock();
112 114
115 if (p && (p->flags & BR_VLAN_TUNNEL))
116 vinfo_sz += br_get_vlan_tunnel_info_size(vg);
117
113 /* Each VLAN is returned in bridge_vlan_info along with flags */ 118 /* Each VLAN is returned in bridge_vlan_info along with flags */
114 return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); 119 vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
120
121 return vinfo_sz;
115} 122}
116 123
117static inline size_t br_port_info_size(void) 124static inline size_t br_port_info_size(void)
@@ -123,10 +130,12 @@ static inline size_t br_port_info_size(void)
123 + nla_total_size(1) /* IFLA_BRPORT_GUARD */ 130 + nla_total_size(1) /* IFLA_BRPORT_GUARD */
124 + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ 131 + nla_total_size(1) /* IFLA_BRPORT_PROTECT */
125 + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ 132 + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */
133 + nla_total_size(1) /* IFLA_BRPORT_MCAST_TO_UCAST */
126 + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ 134 + nla_total_size(1) /* IFLA_BRPORT_LEARNING */
127 + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ 135 + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */
128 + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ 136 + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */
129 + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ 137 + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */
138 + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */
130 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ 139 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
131 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ 140 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
132 + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ 141 + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */
@@ -173,6 +182,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
173 !!(p->flags & BR_ROOT_BLOCK)) || 182 !!(p->flags & BR_ROOT_BLOCK)) ||
174 nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, 183 nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE,
175 !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || 184 !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
185 nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST,
186 !!(p->flags & BR_MULTICAST_TO_UNICAST)) ||
176 nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || 187 nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
177 nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, 188 nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD,
178 !!(p->flags & BR_FLOOD)) || 189 !!(p->flags & BR_FLOOD)) ||
@@ -191,7 +202,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
191 nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) || 202 nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) ||
192 nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, 203 nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK,
193 p->topology_change_ack) || 204 p->topology_change_ack) ||
194 nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending)) 205 nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
206 nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags &
207 BR_VLAN_TUNNEL)))
195 return -EMSGSIZE; 208 return -EMSGSIZE;
196 209
197 timerval = br_timer_value(&p->message_age_timer); 210 timerval = br_timer_value(&p->message_age_timer);
@@ -414,6 +427,9 @@ static int br_fill_ifinfo(struct sk_buff *skb,
414 err = br_fill_ifvlaninfo_compressed(skb, vg); 427 err = br_fill_ifvlaninfo_compressed(skb, vg);
415 else 428 else
416 err = br_fill_ifvlaninfo(skb, vg); 429 err = br_fill_ifvlaninfo(skb, vg);
430
431 if (port && (port->flags & BR_VLAN_TUNNEL))
432 err = br_fill_vlan_tunnel_info(skb, vg);
417 rcu_read_unlock(); 433 rcu_read_unlock();
418 if (err) 434 if (err)
419 goto nla_put_failure; 435 goto nla_put_failure;
@@ -514,60 +530,88 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
514 return err; 530 return err;
515} 531}
516 532
533static int br_process_vlan_info(struct net_bridge *br,
534 struct net_bridge_port *p, int cmd,
535 struct bridge_vlan_info *vinfo_curr,
536 struct bridge_vlan_info **vinfo_last)
537{
538 if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK)
539 return -EINVAL;
540
541 if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
542 /* check if we are already processing a range */
543 if (*vinfo_last)
544 return -EINVAL;
545 *vinfo_last = vinfo_curr;
546 /* don't allow range of pvids */
547 if ((*vinfo_last)->flags & BRIDGE_VLAN_INFO_PVID)
548 return -EINVAL;
549 return 0;
550 }
551
552 if (*vinfo_last) {
553 struct bridge_vlan_info tmp_vinfo;
554 int v, err;
555
556 if (!(vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END))
557 return -EINVAL;
558
559 if (vinfo_curr->vid <= (*vinfo_last)->vid)
560 return -EINVAL;
561
562 memcpy(&tmp_vinfo, *vinfo_last,
563 sizeof(struct bridge_vlan_info));
564 for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) {
565 tmp_vinfo.vid = v;
566 err = br_vlan_info(br, p, cmd, &tmp_vinfo);
567 if (err)
568 break;
569 }
570 *vinfo_last = NULL;
571
572 return 0;
573 }
574
575 return br_vlan_info(br, p, cmd, vinfo_curr);
576}
577
517static int br_afspec(struct net_bridge *br, 578static int br_afspec(struct net_bridge *br,
518 struct net_bridge_port *p, 579 struct net_bridge_port *p,
519 struct nlattr *af_spec, 580 struct nlattr *af_spec,
520 int cmd) 581 int cmd)
521{ 582{
522 struct bridge_vlan_info *vinfo_start = NULL; 583 struct bridge_vlan_info *vinfo_curr = NULL;
523 struct bridge_vlan_info *vinfo = NULL; 584 struct bridge_vlan_info *vinfo_last = NULL;
524 struct nlattr *attr; 585 struct nlattr *attr;
525 int err = 0; 586 struct vtunnel_info tinfo_last = {};
526 int rem; 587 struct vtunnel_info tinfo_curr = {};
588 int err = 0, rem;
527 589
528 nla_for_each_nested(attr, af_spec, rem) { 590 nla_for_each_nested(attr, af_spec, rem) {
529 if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO) 591 err = 0;
530 continue; 592 switch (nla_type(attr)) {
531 if (nla_len(attr) != sizeof(struct bridge_vlan_info)) 593 case IFLA_BRIDGE_VLAN_TUNNEL_INFO:
532 return -EINVAL; 594 if (!(p->flags & BR_VLAN_TUNNEL))
533 vinfo = nla_data(attr);
534 if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK)
535 return -EINVAL;
536 if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
537 if (vinfo_start)
538 return -EINVAL;
539 vinfo_start = vinfo;
540 /* don't allow range of pvids */
541 if (vinfo_start->flags & BRIDGE_VLAN_INFO_PVID)
542 return -EINVAL;
543 continue;
544 }
545
546 if (vinfo_start) {
547 struct bridge_vlan_info tmp_vinfo;
548 int v;
549
550 if (!(vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END))
551 return -EINVAL; 595 return -EINVAL;
552 596 err = br_parse_vlan_tunnel_info(attr, &tinfo_curr);
553 if (vinfo->vid <= vinfo_start->vid) 597 if (err)
598 return err;
599 err = br_process_vlan_tunnel_info(br, p, cmd,
600 &tinfo_curr,
601 &tinfo_last);
602 if (err)
603 return err;
604 break;
605 case IFLA_BRIDGE_VLAN_INFO:
606 if (nla_len(attr) != sizeof(struct bridge_vlan_info))
554 return -EINVAL; 607 return -EINVAL;
555 608 vinfo_curr = nla_data(attr);
556 memcpy(&tmp_vinfo, vinfo_start, 609 err = br_process_vlan_info(br, p, cmd, vinfo_curr,
557 sizeof(struct bridge_vlan_info)); 610 &vinfo_last);
558 611 if (err)
559 for (v = vinfo_start->vid; v <= vinfo->vid; v++) { 612 return err;
560 tmp_vinfo.vid = v;
561 err = br_vlan_info(br, p, cmd, &tmp_vinfo);
562 if (err)
563 break;
564 }
565 vinfo_start = NULL;
566 } else {
567 err = br_vlan_info(br, p, cmd, vinfo);
568 }
569 if (err)
570 break; 613 break;
614 }
571 } 615 }
572 616
573 return err; 617 return err;
@@ -586,6 +630,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
586 [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, 630 [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 },
587 [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, 631 [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 },
588 [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 }, 632 [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 },
633 [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 },
589}; 634};
590 635
591/* Change the state of the port and notify spanning tree */ 636/* Change the state of the port and notify spanning tree */
@@ -626,8 +671,9 @@ static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[],
626/* Process bridge protocol info on port */ 671/* Process bridge protocol info on port */
627static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) 672static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
628{ 673{
629 int err;
630 unsigned long old_flags = p->flags; 674 unsigned long old_flags = p->flags;
675 bool br_vlan_tunnel_old = false;
676 int err;
631 677
632 br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE); 678 br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
633 br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); 679 br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
@@ -636,9 +682,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
636 br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); 682 br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
637 br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); 683 br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
638 br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); 684 br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
685 br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST);
639 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); 686 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
640 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); 687 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
641 688
689 br_vlan_tunnel_old = (p->flags & BR_VLAN_TUNNEL) ? true : false;
690 br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL);
691 if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL))
692 nbp_vlan_tunnel_info_flush(p);
693
642 if (tb[IFLA_BRPORT_COST]) { 694 if (tb[IFLA_BRPORT_COST]) {
643 err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); 695 err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
644 if (err) 696 if (err)
@@ -1113,11 +1165,14 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev,
1113 spin_unlock_bh(&br->lock); 1165 spin_unlock_bh(&br->lock);
1114 } 1166 }
1115 1167
1116 err = br_changelink(dev, tb, data); 1168 err = register_netdevice(dev);
1117 if (err) 1169 if (err)
1118 return err; 1170 return err;
1119 1171
1120 return register_netdevice(dev); 1172 err = br_changelink(dev, tb, data);
1173 if (err)
1174 unregister_netdevice(dev);
1175 return err;
1121} 1176}
1122 1177
1123static size_t br_get_size(const struct net_device *brdev) 1178static size_t br_get_size(const struct net_device *brdev)
@@ -1195,7 +1250,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
1195 if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval, 1250 if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval,
1196 IFLA_BR_PAD)) 1251 IFLA_BR_PAD))
1197 return -EMSGSIZE; 1252 return -EMSGSIZE;
1198 clockval = br_timer_value(&br->gc_timer); 1253 clockval = br_timer_value(&br->gc_work.timer);
1199 if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) 1254 if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
1200 return -EMSGSIZE; 1255 return -EMSGSIZE;
1201 1256
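Note: br_dev_newlink() now registers the netdevice first and only then applies br_changelink(); if changelink fails the device is unregistered again, instead of changelink running against a not-yet-registered bridge.
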
diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
new file mode 100644
index 000000000000..c913491495ab
--- /dev/null
+++ b/net/bridge/br_netlink_tunnel.c
@@ -0,0 +1,294 @@
1/*
2 * Bridge per vlan tunnel port dst_metadata netlink control interface
3 *
4 * Authors:
5 * Roopa Prabhu <roopa@cumulusnetworks.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/kernel.h>
14#include <linux/slab.h>
15#include <linux/etherdevice.h>
16#include <net/rtnetlink.h>
17#include <net/net_namespace.h>
18#include <net/sock.h>
19#include <uapi/linux/if_bridge.h>
20#include <net/dst_metadata.h>
21
22#include "br_private.h"
23#include "br_private_tunnel.h"
24
25static size_t __get_vlan_tinfo_size(void)
26{
27 return nla_total_size(0) + /* nest IFLA_BRIDGE_VLAN_TUNNEL_INFO */
28 nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_VLAN_TUNNEL_ID */
29 nla_total_size(sizeof(u16)) + /* IFLA_BRIDGE_VLAN_TUNNEL_VID */
30 nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */
31}
32
33static bool vlan_tunid_inrange(struct net_bridge_vlan *v_curr,
34 struct net_bridge_vlan *v_last)
35{
36 __be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id);
37 __be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id);
38
39 return (be32_to_cpu(tunid_curr) - be32_to_cpu(tunid_last)) == 1;
40}
41
42static int __get_num_vlan_tunnel_infos(struct net_bridge_vlan_group *vg)
43{
44 struct net_bridge_vlan *v, *vtbegin = NULL, *vtend = NULL;
45 int num_tinfos = 0;
46
47 /* Count number of vlan infos */
48 list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
49 /* only a context, bridge vlan not activated */
50 if (!br_vlan_should_use(v) || !v->tinfo.tunnel_id)
51 continue;
52
53 if (!vtbegin) {
54 goto initvars;
55 } else if ((v->vid - vtend->vid) == 1 &&
56 vlan_tunid_inrange(v, vtend)) {
57 vtend = v;
58 continue;
59 } else {
60 if ((vtend->vid - vtbegin->vid) > 0)
61 num_tinfos += 2;
62 else
63 num_tinfos += 1;
64 }
65initvars:
66 vtbegin = v;
67 vtend = v;
68 }
69
70 if (vtbegin && vtend) {
71 if ((vtend->vid - vtbegin->vid) > 0)
72 num_tinfos += 2;
73 else
74 num_tinfos += 1;
75 }
76
77 return num_tinfos;
78}
79
80int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg)
81{
82 int num_tinfos;
83
84 if (!vg)
85 return 0;
86
87 rcu_read_lock();
88 num_tinfos = __get_num_vlan_tunnel_infos(vg);
89 rcu_read_unlock();
90
91 return num_tinfos * __get_vlan_tinfo_size();
92}
93
94static int br_fill_vlan_tinfo(struct sk_buff *skb, u16 vid,
95 __be64 tunnel_id, u16 flags)
96{
97 __be32 tid = tunnel_id_to_key32(tunnel_id);
98 struct nlattr *tmap;
99
100 tmap = nla_nest_start(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO);
101 if (!tmap)
102 return -EMSGSIZE;
103 if (nla_put_u32(skb, IFLA_BRIDGE_VLAN_TUNNEL_ID,
104 be32_to_cpu(tid)))
105 goto nla_put_failure;
106 if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_VID,
107 vid))
108 goto nla_put_failure;
109 if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_FLAGS,
110 flags))
111 goto nla_put_failure;
112 nla_nest_end(skb, tmap);
113
114 return 0;
115
116nla_put_failure:
117 nla_nest_cancel(skb, tmap);
118
119 return -EMSGSIZE;
120}
121
122static int br_fill_vlan_tinfo_range(struct sk_buff *skb,
123 struct net_bridge_vlan *vtbegin,
124 struct net_bridge_vlan *vtend)
125{
126 int err;
127
128 if (vtend && (vtend->vid - vtbegin->vid) > 0) {
129 /* add range to skb */
130 err = br_fill_vlan_tinfo(skb, vtbegin->vid,
131 vtbegin->tinfo.tunnel_id,
132 BRIDGE_VLAN_INFO_RANGE_BEGIN);
133 if (err)
134 return err;
135
136 err = br_fill_vlan_tinfo(skb, vtend->vid,
137 vtend->tinfo.tunnel_id,
138 BRIDGE_VLAN_INFO_RANGE_END);
139 if (err)
140 return err;
141 } else {
142 err = br_fill_vlan_tinfo(skb, vtbegin->vid,
143 vtbegin->tinfo.tunnel_id,
144 0);
145 if (err)
146 return err;
147 }
148
149 return 0;
150}
151
152int br_fill_vlan_tunnel_info(struct sk_buff *skb,
153 struct net_bridge_vlan_group *vg)
154{
155 struct net_bridge_vlan *vtbegin = NULL;
156 struct net_bridge_vlan *vtend = NULL;
157 struct net_bridge_vlan *v;
158 int err;
159
160 /* Count number of vlan infos */
161 list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
162 /* only a context, bridge vlan not activated */
163 if (!br_vlan_should_use(v))
164 continue;
165
166 if (!v->tinfo.tunnel_dst)
167 continue;
168
169 if (!vtbegin) {
170 goto initvars;
171 } else if ((v->vid - vtend->vid) == 1 &&
172 vlan_tunid_inrange(v, vtend)) {
173 vtend = v;
174 continue;
175 } else {
176 err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
177 if (err)
178 return err;
179 }
180initvars:
181 vtbegin = v;
182 vtend = v;
183 }
184
185 if (vtbegin) {
186 err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
187 if (err)
188 return err;
189 }
190
191 return 0;
192}
193
194static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = {
195 [IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 },
196 [IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 },
197 [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 },
198};
199
200static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd,
201 u16 vid, u32 tun_id)
202{
203 int err = 0;
204
205 if (!p)
206 return -EINVAL;
207
208 switch (cmd) {
209 case RTM_SETLINK:
210 err = nbp_vlan_tunnel_info_add(p, vid, tun_id);
211 break;
212 case RTM_DELLINK:
213 nbp_vlan_tunnel_info_delete(p, vid);
214 break;
215 }
216
217 return err;
218}
219
220int br_parse_vlan_tunnel_info(struct nlattr *attr,
221 struct vtunnel_info *tinfo)
222{
223 struct nlattr *tb[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1];
224 u32 tun_id;
225 u16 vid, flags = 0;
226 int err;
227
228 memset(tinfo, 0, sizeof(*tinfo));
229
230 err = nla_parse_nested(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX,
231 attr, vlan_tunnel_policy);
232 if (err < 0)
233 return err;
234
235 if (!tb[IFLA_BRIDGE_VLAN_TUNNEL_ID] ||
236 !tb[IFLA_BRIDGE_VLAN_TUNNEL_VID])
237 return -EINVAL;
238
239 tun_id = nla_get_u32(tb[IFLA_BRIDGE_VLAN_TUNNEL_ID]);
240 vid = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]);
241 if (vid >= VLAN_VID_MASK)
242 return -ERANGE;
243
244 if (tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS])
245 flags = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]);
246
247 tinfo->tunid = tun_id;
248 tinfo->vid = vid;
249 tinfo->flags = flags;
250
251 return 0;
252}
253
254int br_process_vlan_tunnel_info(struct net_bridge *br,
255 struct net_bridge_port *p, int cmd,
256 struct vtunnel_info *tinfo_curr,
257 struct vtunnel_info *tinfo_last)
258{
259 int err;
260
261 if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
262 if (tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN)
263 return -EINVAL;
264 memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info));
265 } else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) {
266 int t, v;
267
268 if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN))
269 return -EINVAL;
270 if ((tinfo_curr->vid - tinfo_last->vid) !=
271 (tinfo_curr->tunid - tinfo_last->tunid))
272 return -EINVAL;
273 t = tinfo_last->tunid;
274 for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) {
275 err = br_vlan_tunnel_info(p, cmd, v, t);
276 if (err)
277 return err;
278 t++;
279 }
280 memset(tinfo_last, 0, sizeof(struct vtunnel_info));
281 memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
282 } else {
283 if (tinfo_last->flags)
284 return -EINVAL;
285 err = br_vlan_tunnel_info(p, cmd, tinfo_curr->vid,
286 tinfo_curr->tunid);
287 if (err)
288 return err;
289 memset(tinfo_last, 0, sizeof(struct vtunnel_info));
290 memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
291 }
292
293 return 0;
294}
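Note: struct vtunnel_info is declared in br_private_tunnel.h, which is not part of this section; its layout can be inferred from the parsing and range-processing code above (a range maps consecutive VLAN ids onto consecutive tunnel ids, and the parser rejects ranges where the two deltas differ):

	/* inferred sketch -- the authoritative definition lives in
	 * br_private_tunnel.h
	 */
	struct vtunnel_info {
		u32	tunid;	/* IFLA_BRIDGE_VLAN_TUNNEL_ID */
		u16	vid;	/* IFLA_BRIDGE_VLAN_TUNNEL_VID */
		u16	flags;	/* BRIDGE_VLAN_INFO_RANGE_BEGIN/END */
	};
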
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 8ce621e8345c..0d177280aa84 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -91,6 +91,11 @@ struct br_vlan_stats {
91 struct u64_stats_sync syncp; 91 struct u64_stats_sync syncp;
92}; 92};
93 93
94struct br_tunnel_info {
95 __be64 tunnel_id;
96 struct metadata_dst *tunnel_dst;
97};
98
94/** 99/**
95 * struct net_bridge_vlan - per-vlan entry 100 * struct net_bridge_vlan - per-vlan entry
96 * 101 *
@@ -113,6 +118,7 @@ struct br_vlan_stats {
113 */ 118 */
114struct net_bridge_vlan { 119struct net_bridge_vlan {
115 struct rhash_head vnode; 120 struct rhash_head vnode;
121 struct rhash_head tnode;
116 u16 vid; 122 u16 vid;
117 u16 flags; 123 u16 flags;
118 struct br_vlan_stats __percpu *stats; 124 struct br_vlan_stats __percpu *stats;
@@ -124,6 +130,9 @@ struct net_bridge_vlan {
124 atomic_t refcnt; 130 atomic_t refcnt;
125 struct net_bridge_vlan *brvlan; 131 struct net_bridge_vlan *brvlan;
126 }; 132 };
133
134 struct br_tunnel_info tinfo;
135
127 struct list_head vlist; 136 struct list_head vlist;
128 137
129 struct rcu_head rcu; 138 struct rcu_head rcu;
@@ -145,24 +154,27 @@ struct net_bridge_vlan {
145 */ 154 */
146struct net_bridge_vlan_group { 155struct net_bridge_vlan_group {
147 struct rhashtable vlan_hash; 156 struct rhashtable vlan_hash;
157 struct rhashtable tunnel_hash;
148 struct list_head vlan_list; 158 struct list_head vlan_list;
149 u16 num_vlans; 159 u16 num_vlans;
150 u16 pvid; 160 u16 pvid;
151}; 161};
152 162
153struct net_bridge_fdb_entry 163struct net_bridge_fdb_entry {
154{
155 struct hlist_node hlist; 164 struct hlist_node hlist;
156 struct net_bridge_port *dst; 165 struct net_bridge_port *dst;
157 166
158 unsigned long updated;
159 unsigned long used;
160 mac_addr addr; 167 mac_addr addr;
161 __u16 vlan_id; 168 __u16 vlan_id;
162 unsigned char is_local:1, 169 unsigned char is_local:1,
163 is_static:1, 170 is_static:1,
164 added_by_user:1, 171 added_by_user:1,
165 added_by_external_learn:1; 172 added_by_external_learn:1;
173
174 /* write-heavy members should not affect lookups */
175 unsigned long updated ____cacheline_aligned_in_smp;
176 unsigned long used;
177
166 struct rcu_head rcu; 178 struct rcu_head rcu;
167}; 179};
168 180
@@ -177,6 +189,7 @@ struct net_bridge_port_group {
177 struct timer_list timer; 189 struct timer_list timer;
178 struct br_ip addr; 190 struct br_ip addr;
179 unsigned char flags; 191 unsigned char flags;
192 unsigned char eth_addr[ETH_ALEN];
180}; 193};
181 194
182struct net_bridge_mdb_entry 195struct net_bridge_mdb_entry
@@ -201,12 +214,16 @@ struct net_bridge_mdb_htable
201 u32 ver; 214 u32 ver;
202}; 215};
203 216
204struct net_bridge_port 217struct net_bridge_port {
205{
206 struct net_bridge *br; 218 struct net_bridge *br;
207 struct net_device *dev; 219 struct net_device *dev;
208 struct list_head list; 220 struct list_head list;
209 221
222 unsigned long flags;
223#ifdef CONFIG_BRIDGE_VLAN_FILTERING
224 struct net_bridge_vlan_group __rcu *vlgrp;
225#endif
226
210 /* STP */ 227 /* STP */
211 u8 priority; 228 u8 priority;
212 u8 state; 229 u8 state;
@@ -227,8 +244,6 @@ struct net_bridge_port
227 struct kobject kobj; 244 struct kobject kobj;
228 struct rcu_head rcu; 245 struct rcu_head rcu;
229 246
230 unsigned long flags;
231
232#ifdef CONFIG_BRIDGE_IGMP_SNOOPING 247#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
233 struct bridge_mcast_own_query ip4_own_query; 248 struct bridge_mcast_own_query ip4_own_query;
234#if IS_ENABLED(CONFIG_IPV6) 249#if IS_ENABLED(CONFIG_IPV6)
@@ -248,9 +263,6 @@ struct net_bridge_port
248#ifdef CONFIG_NET_POLL_CONTROLLER 263#ifdef CONFIG_NET_POLL_CONTROLLER
249 struct netpoll *np; 264 struct netpoll *np;
250#endif 265#endif
251#ifdef CONFIG_BRIDGE_VLAN_FILTERING
252 struct net_bridge_vlan_group __rcu *vlgrp;
253#endif
254#ifdef CONFIG_NET_SWITCHDEV 266#ifdef CONFIG_NET_SWITCHDEV
255 int offload_fwd_mark; 267 int offload_fwd_mark;
256#endif 268#endif
@@ -272,14 +284,21 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *
272 rtnl_dereference(dev->rx_handler_data) : NULL; 284 rtnl_dereference(dev->rx_handler_data) : NULL;
273} 285}
274 286
275struct net_bridge 287struct net_bridge {
276{
277 spinlock_t lock; 288 spinlock_t lock;
289 spinlock_t hash_lock;
278 struct list_head port_list; 290 struct list_head port_list;
279 struct net_device *dev; 291 struct net_device *dev;
280
281 struct pcpu_sw_netstats __percpu *stats; 292 struct pcpu_sw_netstats __percpu *stats;
282 spinlock_t hash_lock; 293 /* These fields are accessed on each packet */
294#ifdef CONFIG_BRIDGE_VLAN_FILTERING
295 u8 vlan_enabled;
296 u8 vlan_stats_enabled;
297 __be16 vlan_proto;
298 u16 default_pvid;
299 struct net_bridge_vlan_group __rcu *vlgrp;
300#endif
301
283 struct hlist_head hash[BR_HASH_SIZE]; 302 struct hlist_head hash[BR_HASH_SIZE];
284#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 303#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
285 union { 304 union {
@@ -297,6 +316,9 @@ struct net_bridge
297 bridge_id designated_root; 316 bridge_id designated_root;
298 bridge_id bridge_id; 317 bridge_id bridge_id;
299 u32 root_path_cost; 318 u32 root_path_cost;
319 unsigned char topology_change;
320 unsigned char topology_change_detected;
321 u16 root_port;
300 unsigned long max_age; 322 unsigned long max_age;
301 unsigned long hello_time; 323 unsigned long hello_time;
302 unsigned long forward_delay; 324 unsigned long forward_delay;
@@ -308,7 +330,6 @@ struct net_bridge
308 330
309 u8 group_addr[ETH_ALEN]; 331 u8 group_addr[ETH_ALEN];
310 bool group_addr_set; 332 bool group_addr_set;
311 u16 root_port;
312 333
313 enum { 334 enum {
314 BR_NO_STP, /* no spanning tree */ 335 BR_NO_STP, /* no spanning tree */
@@ -316,9 +337,6 @@ struct net_bridge
316 BR_USER_STP, /* new RSTP in userspace */ 337 BR_USER_STP, /* new RSTP in userspace */
317 } stp_enabled; 338 } stp_enabled;
318 339
319 unsigned char topology_change;
320 unsigned char topology_change_detected;
321
322#ifdef CONFIG_BRIDGE_IGMP_SNOOPING 340#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
323 unsigned char multicast_router; 341 unsigned char multicast_router;
324 342
@@ -363,21 +381,13 @@ struct net_bridge
363 struct timer_list hello_timer; 381 struct timer_list hello_timer;
364 struct timer_list tcn_timer; 382 struct timer_list tcn_timer;
365 struct timer_list topology_change_timer; 383 struct timer_list topology_change_timer;
366 struct timer_list gc_timer; 384 struct delayed_work gc_work;
367 struct kobject *ifobj; 385 struct kobject *ifobj;
368 u32 auto_cnt; 386 u32 auto_cnt;
369 387
370#ifdef CONFIG_NET_SWITCHDEV 388#ifdef CONFIG_NET_SWITCHDEV
371 int offload_fwd_mark; 389 int offload_fwd_mark;
372#endif 390#endif
373
374#ifdef CONFIG_BRIDGE_VLAN_FILTERING
375 struct net_bridge_vlan_group __rcu *vlgrp;
376 u8 vlan_enabled;
377 u8 vlan_stats_enabled;
378 __be16 vlan_proto;
379 u16 default_pvid;
380#endif
381}; 391};
382 392
383struct br_input_skb_cb { 393struct br_input_skb_cb {
@@ -494,11 +504,12 @@ void br_fdb_find_delete_local(struct net_bridge *br,
494 const unsigned char *addr, u16 vid); 504 const unsigned char *addr, u16 vid);
495void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); 505void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
496void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); 506void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
497void br_fdb_cleanup(unsigned long arg); 507void br_fdb_cleanup(struct work_struct *work);
498void br_fdb_delete_by_port(struct net_bridge *br, 508void br_fdb_delete_by_port(struct net_bridge *br,
499 const struct net_bridge_port *p, u16 vid, int do_all); 509 const struct net_bridge_port *p, u16 vid, int do_all);
500struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, 510struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
501 const unsigned char *addr, __u16 vid); 511 const unsigned char *addr,
512 __u16 vid);
502int br_fdb_test_addr(struct net_device *dev, unsigned char *addr); 513int br_fdb_test_addr(struct net_device *dev, unsigned char *addr);
503int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, 514int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count,
504 unsigned long off); 515 unsigned long off);
@@ -599,7 +610,7 @@ void br_multicast_free_pg(struct rcu_head *head);
599struct net_bridge_port_group * 610struct net_bridge_port_group *
600br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, 611br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
601 struct net_bridge_port_group __rcu *next, 612 struct net_bridge_port_group __rcu *next,
602 unsigned char flags); 613 unsigned char flags, const unsigned char *src);
603void br_mdb_init(void); 614void br_mdb_init(void);
604void br_mdb_uninit(void); 615void br_mdb_uninit(void);
605void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, 616void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
@@ -609,6 +620,7 @@ void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
609void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, 620void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
610 const struct sk_buff *skb, u8 type, u8 dir); 621 const struct sk_buff *skb, u8 type, u8 dir);
611int br_multicast_init_stats(struct net_bridge *br); 622int br_multicast_init_stats(struct net_bridge *br);
623void br_multicast_uninit_stats(struct net_bridge *br);
612void br_multicast_get_stats(const struct net_bridge *br, 624void br_multicast_get_stats(const struct net_bridge *br,
613 const struct net_bridge_port *p, 625 const struct net_bridge_port *p,
614 struct br_mcast_stats *dest); 626 struct br_mcast_stats *dest);
@@ -749,6 +761,10 @@ static inline int br_multicast_init_stats(struct net_bridge *br)
749 return 0; 761 return 0;
750} 762}
751 763
764static inline void br_multicast_uninit_stats(struct net_bridge *br)
765{
766}
767
752static inline int br_multicast_igmp_type(const struct sk_buff *skb) 768static inline int br_multicast_igmp_type(const struct sk_buff *skb)
753{ 769{
754 return 0; 770 return 0;
@@ -764,6 +780,7 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg,
764 const struct sk_buff *skb); 780 const struct sk_buff *skb);
765bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); 781bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
766struct sk_buff *br_handle_vlan(struct net_bridge *br, 782struct sk_buff *br_handle_vlan(struct net_bridge *br,
783 const struct net_bridge_port *port,
767 struct net_bridge_vlan_group *vg, 784 struct net_bridge_vlan_group *vg,
768 struct sk_buff *skb); 785 struct sk_buff *skb);
769int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags); 786int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags);
@@ -863,6 +880,7 @@ static inline bool br_should_learn(struct net_bridge_port *p,
863} 880}
864 881
865static inline struct sk_buff *br_handle_vlan(struct net_bridge *br, 882static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
883 const struct net_bridge_port *port,
866 struct net_bridge_vlan_group *vg, 884 struct net_bridge_vlan_group *vg,
867 struct sk_buff *skb) 885 struct sk_buff *skb)
868{ 886{
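
The br_private.h hunks above replace the bridge's struct timer_list gc_timer with a struct delayed_work gc_work, and br_fdb_cleanup() correspondingly takes a struct work_struct * instead of an unsigned long cookie, so FDB garbage collection runs from a workqueue (process context) rather than from a softirq timer. A minimal sketch of that delayed_work pattern; the my_bridge/my_gc_work_fn names and the 10 * HZ interval are illustrative, not taken from br_fdb.c:

#include <linux/workqueue.h>

struct my_bridge {
	struct delayed_work gc_work;
};

static void my_gc_work_fn(struct work_struct *work)
{
	/* recover the owning object from the embedded work item */
	struct my_bridge *b = container_of(work, struct my_bridge, gc_work.work);

	/* ... expire stale FDB entries here, then re-arm ... */
	queue_delayed_work(system_long_wq, &b->gc_work, 10 * HZ);
}

static void my_bridge_start(struct my_bridge *b)
{
	INIT_DELAYED_WORK(&b->gc_work, my_gc_work_fn);
	mod_delayed_work(system_long_wq, &b->gc_work, 0);	/* run as soon as possible */
}

static void my_bridge_stop(struct my_bridge *b)
{
	/* waits for a queued or already-running gc pass to finish */
	cancel_delayed_work_sync(&b->gc_work);
}

The br_stp.c, br_stp_if.c, br_stp_timer.c and br_sysfs_br.c hunks below are the mechanical fallout of the same change: mod_timer() becomes mod_delayed_work(), del_timer_sync() becomes cancel_delayed_work_sync(), the setup_timer() call disappears, and the sysfs gc_timer file reads the timer embedded in gc_work.
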
diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h
new file mode 100644
index 000000000000..4a447a378ab3
--- /dev/null
+++ b/net/bridge/br_private_tunnel.h
@@ -0,0 +1,83 @@
1/*
2 * Bridge per vlan tunnels
3 *
4 * Authors:
5 * Roopa Prabhu <roopa@cumulusnetworks.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#ifndef _BR_PRIVATE_TUNNEL_H
14#define _BR_PRIVATE_TUNNEL_H
15
16struct vtunnel_info {
17 u32 tunid;
18 u16 vid;
19 u16 flags;
20};
21
22/* br_netlink_tunnel.c */
23int br_parse_vlan_tunnel_info(struct nlattr *attr,
24 struct vtunnel_info *tinfo);
25int br_process_vlan_tunnel_info(struct net_bridge *br,
26 struct net_bridge_port *p,
27 int cmd,
28 struct vtunnel_info *tinfo_curr,
29 struct vtunnel_info *tinfo_last);
30int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg);
31int br_fill_vlan_tunnel_info(struct sk_buff *skb,
32 struct net_bridge_vlan_group *vg);
33
34#ifdef CONFIG_BRIDGE_VLAN_FILTERING
35/* br_vlan_tunnel.c */
36int vlan_tunnel_init(struct net_bridge_vlan_group *vg);
37void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg);
38int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid);
39int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id);
40void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port);
41void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
42 struct net_bridge_vlan *vlan);
43int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
44 struct net_bridge_port *p,
45 struct net_bridge_vlan_group *vg);
46int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
47 struct net_bridge_vlan *vlan);
48#else
49static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
50{
51 return 0;
52}
53
54static inline int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port,
55 u16 vid)
56{
57 return 0;
58}
59
60static inline int nbp_vlan_tunnel_info_add(struct net_bridge_port *port,
61 u16 vid, u32 tun_id)
62{
63 return 0;
64}
65
66static inline void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
67{
68}
69
70static inline void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
71 struct net_bridge_vlan *vlan)
72{
73}
74
75static inline int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
76 struct net_bridge_port *p,
77 struct net_bridge_vlan_group *vg)
78{
79 return 0;
80}
81#endif
82
83#endif
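
The #else half of this new header is the usual config-stub idiom: when CONFIG_BRIDGE_VLAN_FILTERING is disabled, the tunnel helpers collapse into empty static inline functions, so callers in the common bridge paths need no #ifdef of their own and the calls compile away. Reduced to a generic sketch (the names are placeholders, not bridge symbols):

struct my_ctx;

#ifdef CONFIG_MY_FEATURE
int my_feature_setup(struct my_ctx *ctx);	/* real implementation elsewhere */
#else
static inline int my_feature_setup(struct my_ctx *ctx)
{
	return 0;	/* no-op stub; the call site stays unconditional */
}
#endif
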
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 71fd1a4e63cc..8f56c2d1f1a7 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -602,7 +602,7 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
602 br->ageing_time = t; 602 br->ageing_time = t;
603 spin_unlock_bh(&br->lock); 603 spin_unlock_bh(&br->lock);
604 604
605 mod_timer(&br->gc_timer, jiffies); 605 mod_delayed_work(system_long_wq, &br->gc_work, 0);
606 606
607 return 0; 607 return 0;
608} 608}
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 6c1e21411125..08341d2aa9c9 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -57,7 +57,7 @@ void br_stp_enable_bridge(struct net_bridge *br)
57 spin_lock_bh(&br->lock); 57 spin_lock_bh(&br->lock);
58 if (br->stp_enabled == BR_KERNEL_STP) 58 if (br->stp_enabled == BR_KERNEL_STP)
59 mod_timer(&br->hello_timer, jiffies + br->hello_time); 59 mod_timer(&br->hello_timer, jiffies + br->hello_time);
60 mod_timer(&br->gc_timer, jiffies + HZ/10); 60 mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10);
61 61
62 br_config_bpdu_generation(br); 62 br_config_bpdu_generation(br);
63 63
@@ -88,7 +88,7 @@ void br_stp_disable_bridge(struct net_bridge *br)
88 del_timer_sync(&br->hello_timer); 88 del_timer_sync(&br->hello_timer);
89 del_timer_sync(&br->topology_change_timer); 89 del_timer_sync(&br->topology_change_timer);
90 del_timer_sync(&br->tcn_timer); 90 del_timer_sync(&br->tcn_timer);
91 del_timer_sync(&br->gc_timer); 91 cancel_delayed_work_sync(&br->gc_work);
92} 92}
93 93
94/* called under bridge lock */ 94/* called under bridge lock */
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index 7ddb38e0a06e..c98b3e5c140a 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -153,8 +153,6 @@ void br_stp_timer_init(struct net_bridge *br)
153 setup_timer(&br->topology_change_timer, 153 setup_timer(&br->topology_change_timer,
154 br_topology_change_timer_expired, 154 br_topology_change_timer_expired,
155 (unsigned long) br); 155 (unsigned long) br);
156
157 setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
158} 156}
159 157
160void br_stp_port_timer_init(struct net_bridge_port *p) 158void br_stp_port_timer_init(struct net_bridge_port *p)
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index a18148213b08..0b5dd607444c 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -19,6 +19,7 @@
19#include <linux/rtnetlink.h> 19#include <linux/rtnetlink.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
21#include <linux/times.h> 21#include <linux/times.h>
22#include <linux/sched/signal.h>
22 23
23#include "br_private.h" 24#include "br_private.h"
24 25
@@ -263,7 +264,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr,
263 char *buf) 264 char *buf)
264{ 265{
265 struct net_bridge *br = to_bridge(d); 266 struct net_bridge *br = to_bridge(d);
266 return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer)); 267 return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer));
267} 268}
268static DEVICE_ATTR_RO(gc_timer); 269static DEVICE_ATTR_RO(gc_timer);
269 270
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 8bd569695e76..79aee759aba5 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -17,6 +17,7 @@
17#include <linux/if_bridge.h> 17#include <linux/if_bridge.h>
18#include <linux/rtnetlink.h> 18#include <linux/rtnetlink.h>
19#include <linux/spinlock.h> 19#include <linux/spinlock.h>
20#include <linux/sched/signal.h>
20 21
21#include "br_private.h" 22#include "br_private.h"
22 23
@@ -188,6 +189,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
188 store_multicast_router); 189 store_multicast_router);
189 190
190BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE); 191BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE);
192BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST);
191#endif 193#endif
192 194
193static const struct brport_attribute *brport_attrs[] = { 195static const struct brport_attribute *brport_attrs[] = {
@@ -214,6 +216,7 @@ static const struct brport_attribute *brport_attrs[] = {
214#ifdef CONFIG_BRIDGE_IGMP_SNOOPING 216#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
215 &brport_attr_multicast_router, 217 &brport_attr_multicast_router,
216 &brport_attr_multicast_fast_leave, 218 &brport_attr_multicast_fast_leave,
219 &brport_attr_multicast_to_unicast,
217#endif 220#endif
218 &brport_attr_proxyarp, 221 &brport_attr_proxyarp,
219 &brport_attr_proxyarp_wifi, 222 &brport_attr_proxyarp_wifi,
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index b6de4f457161..b838213c408e 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -5,6 +5,7 @@
5#include <net/switchdev.h> 5#include <net/switchdev.h>
6 6
7#include "br_private.h" 7#include "br_private.h"
8#include "br_private_tunnel.h"
8 9
9static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, 10static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
10 const void *ptr) 11 const void *ptr)
@@ -310,6 +311,7 @@ static int __vlan_del(struct net_bridge_vlan *v)
310 } 311 }
311 312
312 if (masterv != v) { 313 if (masterv != v) {
314 vlan_tunnel_info_del(vg, v);
313 rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, 315 rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
314 br_vlan_rht_params); 316 br_vlan_rht_params);
315 __vlan_del_list(v); 317 __vlan_del_list(v);
@@ -325,6 +327,7 @@ static void __vlan_group_free(struct net_bridge_vlan_group *vg)
325{ 327{
326 WARN_ON(!list_empty(&vg->vlan_list)); 328 WARN_ON(!list_empty(&vg->vlan_list));
327 rhashtable_destroy(&vg->vlan_hash); 329 rhashtable_destroy(&vg->vlan_hash);
330 vlan_tunnel_deinit(vg);
328 kfree(vg); 331 kfree(vg);
329} 332}
330 333
@@ -338,6 +341,7 @@ static void __vlan_flush(struct net_bridge_vlan_group *vg)
338} 341}
339 342
340struct sk_buff *br_handle_vlan(struct net_bridge *br, 343struct sk_buff *br_handle_vlan(struct net_bridge *br,
344 const struct net_bridge_port *p,
341 struct net_bridge_vlan_group *vg, 345 struct net_bridge_vlan_group *vg,
342 struct sk_buff *skb) 346 struct sk_buff *skb)
343{ 347{
@@ -378,6 +382,12 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
378 382
379 if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) 383 if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
380 skb->vlan_tci = 0; 384 skb->vlan_tci = 0;
385
386 if (p && (p->flags & BR_VLAN_TUNNEL) &&
387 br_handle_egress_vlan_tunnel(skb, v)) {
388 kfree_skb(skb);
389 return NULL;
390 }
381out: 391out:
382 return skb; 392 return skb;
383} 393}
@@ -613,6 +623,8 @@ int br_vlan_delete(struct net_bridge *br, u16 vid)
613 br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); 623 br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid);
614 br_fdb_delete_by_port(br, NULL, vid, 0); 624 br_fdb_delete_by_port(br, NULL, vid, 0);
615 625
626 vlan_tunnel_info_del(vg, v);
627
616 return __vlan_del(v); 628 return __vlan_del(v);
617} 629}
618 630
@@ -918,6 +930,9 @@ int br_vlan_init(struct net_bridge *br)
918 ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); 930 ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
919 if (ret) 931 if (ret)
920 goto err_rhtbl; 932 goto err_rhtbl;
933 ret = vlan_tunnel_init(vg);
934 if (ret)
935 goto err_tunnel_init;
921 INIT_LIST_HEAD(&vg->vlan_list); 936 INIT_LIST_HEAD(&vg->vlan_list);
922 br->vlan_proto = htons(ETH_P_8021Q); 937 br->vlan_proto = htons(ETH_P_8021Q);
923 br->default_pvid = 1; 938 br->default_pvid = 1;
@@ -932,6 +947,8 @@ out:
932 return ret; 947 return ret;
933 948
934err_vlan_add: 949err_vlan_add:
950 vlan_tunnel_deinit(vg);
951err_tunnel_init:
935 rhashtable_destroy(&vg->vlan_hash); 952 rhashtable_destroy(&vg->vlan_hash);
936err_rhtbl: 953err_rhtbl:
937 kfree(vg); 954 kfree(vg);
@@ -961,6 +978,9 @@ int nbp_vlan_init(struct net_bridge_port *p)
961 ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); 978 ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
962 if (ret) 979 if (ret)
963 goto err_rhtbl; 980 goto err_rhtbl;
981 ret = vlan_tunnel_init(vg);
982 if (ret)
983 goto err_tunnel_init;
964 INIT_LIST_HEAD(&vg->vlan_list); 984 INIT_LIST_HEAD(&vg->vlan_list);
965 rcu_assign_pointer(p->vlgrp, vg); 985 rcu_assign_pointer(p->vlgrp, vg);
966 if (p->br->default_pvid) { 986 if (p->br->default_pvid) {
@@ -976,9 +996,11 @@ out:
976err_vlan_add: 996err_vlan_add:
977 RCU_INIT_POINTER(p->vlgrp, NULL); 997 RCU_INIT_POINTER(p->vlgrp, NULL);
978 synchronize_rcu(); 998 synchronize_rcu();
999 vlan_tunnel_deinit(vg);
1000err_tunnel_init:
979 rhashtable_destroy(&vg->vlan_hash); 1001 rhashtable_destroy(&vg->vlan_hash);
980err_vlan_enabled:
981err_rhtbl: 1002err_rhtbl:
1003err_vlan_enabled:
982 kfree(vg); 1004 kfree(vg);
983 1005
984 goto out; 1006 goto out;
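
br_vlan_init() and nbp_vlan_init() gain a vlan_tunnel_init() step right after the rhashtable setup, and the unwind labels are reshuffled (err_tunnel_init added, err_vlan_enabled moved below err_rhtbl) so that each failure point releases exactly what was set up before it, in reverse order. The shape of that idiom, stripped to a sketch with placeholder resource names:

struct my_obj;
int setup_a(struct my_obj *o), setup_b(struct my_obj *o), setup_c(struct my_obj *o);
void teardown_a(struct my_obj *o), teardown_b(struct my_obj *o);

static int my_init(struct my_obj *o)
{
	int err;

	err = setup_a(o);
	if (err)
		goto err_a;
	err = setup_b(o);		/* the newly inserted step */
	if (err)
		goto err_b;
	err = setup_c(o);
	if (err)
		goto err_c;
	return 0;

err_c:
	teardown_b(o);			/* labels unwind in reverse order */
err_b:
	teardown_a(o);
err_a:
	return err;
}
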
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
new file mode 100644
index 000000000000..6d2c4eed2dc8
--- /dev/null
+++ b/net/bridge/br_vlan_tunnel.c
@@ -0,0 +1,205 @@
1/*
2 * Bridge per vlan tunnel port dst_metadata handling code
3 *
4 * Authors:
5 * Roopa Prabhu <roopa@cumulusnetworks.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/kernel.h>
14#include <linux/netdevice.h>
15#include <linux/rtnetlink.h>
16#include <linux/slab.h>
17#include <net/switchdev.h>
18#include <net/dst_metadata.h>
19
20#include "br_private.h"
21#include "br_private_tunnel.h"
22
23static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg,
24 const void *ptr)
25{
26 const struct net_bridge_vlan *vle = ptr;
27 __be64 tunid = *(__be64 *)arg->key;
28
29 return vle->tinfo.tunnel_id != tunid;
30}
31
32static const struct rhashtable_params br_vlan_tunnel_rht_params = {
33 .head_offset = offsetof(struct net_bridge_vlan, tnode),
34 .key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id),
35 .key_len = sizeof(__be64),
36 .nelem_hint = 3,
37 .locks_mul = 1,
38 .obj_cmpfn = br_vlan_tunid_cmp,
39 .automatic_shrinking = true,
40};
41
42static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl,
43 u64 tunnel_id)
44{
45 return rhashtable_lookup_fast(tbl, &tunnel_id,
46 br_vlan_tunnel_rht_params);
47}
48
49void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
50 struct net_bridge_vlan *vlan)
51{
52 if (!vlan->tinfo.tunnel_dst)
53 return;
54 rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode,
55 br_vlan_tunnel_rht_params);
56 vlan->tinfo.tunnel_id = 0;
57 dst_release(&vlan->tinfo.tunnel_dst->dst);
58 vlan->tinfo.tunnel_dst = NULL;
59}
60
61static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
62 struct net_bridge_vlan *vlan, u32 tun_id)
63{
64 struct metadata_dst *metadata = NULL;
65 __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id));
66 int err;
67
68 if (vlan->tinfo.tunnel_dst)
69 return -EEXIST;
70
71 metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY,
72 key, 0);
73 if (!metadata)
74 return -EINVAL;
75
76 metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE;
77 vlan->tinfo.tunnel_dst = metadata;
78 vlan->tinfo.tunnel_id = key;
79
80 err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode,
81 br_vlan_tunnel_rht_params);
82 if (err)
83 goto out;
84
85 return 0;
86out:
87 dst_release(&vlan->tinfo.tunnel_dst->dst);
88 vlan->tinfo.tunnel_dst = NULL;
89 vlan->tinfo.tunnel_id = 0;
90
91 return err;
92}
93
94/* Must be protected by RTNL.
95 * Must be called with vid in range from 1 to 4094 inclusive.
96 */
97int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id)
98{
99 struct net_bridge_vlan_group *vg;
100 struct net_bridge_vlan *vlan;
101
102 ASSERT_RTNL();
103
104 vg = nbp_vlan_group(port);
105 vlan = br_vlan_find(vg, vid);
106 if (!vlan)
107 return -EINVAL;
108
109 return __vlan_tunnel_info_add(vg, vlan, tun_id);
110}
111
112/* Must be protected by RTNL.
113 * Must be called with vid in range from 1 to 4094 inclusive.
114 */
115int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid)
116{
117 struct net_bridge_vlan_group *vg;
118 struct net_bridge_vlan *v;
119
120 ASSERT_RTNL();
121
122 vg = nbp_vlan_group(port);
123 v = br_vlan_find(vg, vid);
124 if (!v)
125 return -ENOENT;
126
127 vlan_tunnel_info_del(vg, v);
128
129 return 0;
130}
131
132static void __vlan_tunnel_info_flush(struct net_bridge_vlan_group *vg)
133{
134 struct net_bridge_vlan *vlan, *tmp;
135
136 list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist)
137 vlan_tunnel_info_del(vg, vlan);
138}
139
140void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
141{
142 struct net_bridge_vlan_group *vg;
143
144 ASSERT_RTNL();
145
146 vg = nbp_vlan_group(port);
147 __vlan_tunnel_info_flush(vg);
148}
149
150int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
151{
152 return rhashtable_init(&vg->tunnel_hash, &br_vlan_tunnel_rht_params);
153}
154
155void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg)
156{
157 rhashtable_destroy(&vg->tunnel_hash);
158}
159
160int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
161 struct net_bridge_port *p,
162 struct net_bridge_vlan_group *vg)
163{
164 struct ip_tunnel_info *tinfo = skb_tunnel_info(skb);
165 struct net_bridge_vlan *vlan;
166
167 if (!vg || !tinfo)
168 return 0;
169
170 /* if already tagged, ignore */
171 if (skb_vlan_tagged(skb))
172 return 0;
173
174 /* lookup vid, given tunnel id */
175 vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id);
176 if (!vlan)
177 return 0;
178
179 skb_dst_drop(skb);
180
181 __vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid);
182
183 return 0;
184}
185
186int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
187 struct net_bridge_vlan *vlan)
188{
189 int err;
190
191 if (!vlan || !vlan->tinfo.tunnel_id)
192 return 0;
193
194 if (unlikely(!skb_vlan_tag_present(skb)))
195 return 0;
196
197 skb_dst_drop(skb);
198 err = skb_vlan_pop(skb);
199 if (err)
200 return err;
201
202 skb_dst_set(skb, dst_clone(&vlan->tinfo.tunnel_dst->dst));
203
204 return 0;
205}
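
The new file keeps a second rhashtable in each vlan group, keyed by the 64-bit tunnel id, so the ingress path can map a tunnel key back to a vid with a single lookup, while br_handle_egress_vlan_tunnel() attaches the per-vlan metadata dst before the packet leaves a BR_VLAN_TUNNEL port. The rhashtable pattern it relies on, in a trimmed-down form (the struct and field names here are invented for illustration):

#include <linux/rhashtable.h>
#include <linux/types.h>

struct my_entry {
	__be64 tunnel_id;		/* lookup key */
	u16 vid;
	struct rhash_head node;		/* hash table linkage */
};

static const struct rhashtable_params my_rht_params = {
	.head_offset = offsetof(struct my_entry, node),
	.key_offset  = offsetof(struct my_entry, tunnel_id),
	.key_len     = sizeof(__be64),
	.automatic_shrinking = true,
};

/* returns -EEXIST if an entry with this key is already hashed */
static int my_entry_add(struct rhashtable *tbl, struct my_entry *e)
{
	return rhashtable_lookup_insert_fast(tbl, &e->node, my_rht_params);
}

static struct my_entry *my_entry_find(struct rhashtable *tbl, __be64 id)
{
	return rhashtable_lookup_fast(tbl, &id, my_rht_params);
}

The table itself is created with rhashtable_init() and torn down with rhashtable_destroy(), which is what vlan_tunnel_init()/vlan_tunnel_deinit() above wrap.
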
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 9024283d2bca..279527f8b1fe 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -187,7 +187,7 @@ static int ebt_among_mt_check(const struct xt_mtchk_param *par)
187 expected_length += ebt_mac_wormhash_size(wh_src); 187 expected_length += ebt_mac_wormhash_size(wh_src);
188 188
189 if (em->match_size != EBT_ALIGN(expected_length)) { 189 if (em->match_size != EBT_ALIGN(expected_length)) {
190 pr_info("wrong size: %d against expected %d, rounded to %Zd\n", 190 pr_info("wrong size: %d against expected %d, rounded to %zd\n",
191 em->match_size, expected_length, 191 em->match_size, expected_length,
192 EBT_ALIGN(expected_length)); 192 EBT_ALIGN(expected_length));
193 return -EINVAL; 193 return -EINVAL;
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 517e78befcb2..61a9f1be1263 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -105,6 +105,7 @@ static struct xt_match ebt_limit_mt_reg __read_mostly = {
105 .match = ebt_limit_mt, 105 .match = ebt_limit_mt,
106 .checkentry = ebt_limit_mt_check, 106 .checkentry = ebt_limit_mt_check,
107 .matchsize = sizeof(struct ebt_limit_info), 107 .matchsize = sizeof(struct ebt_limit_info),
108 .usersize = offsetof(struct ebt_limit_info, prev),
108#ifdef CONFIG_COMPAT 109#ifdef CONFIG_COMPAT
109 .compatsize = sizeof(struct ebt_compat_limit_info), 110 .compatsize = sizeof(struct ebt_compat_limit_info),
110#endif 111#endif
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index e88bd4827ac1..98b9c8e8615e 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -78,7 +78,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
78 unsigned int bitmask; 78 unsigned int bitmask;
79 79
80 /* FIXME: Disabled from containers until syslog ns is supported */ 80 /* FIXME: Disabled from containers until syslog ns is supported */
81 if (!net_eq(net, &init_net)) 81 if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
82 return; 82 return;
83 83
84 spin_lock_bh(&ebt_log_lock); 84 spin_lock_bh(&ebt_log_lock);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 537e3d506fc2..79b69917f521 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1346,56 +1346,72 @@ static int update_counters(struct net *net, const void __user *user,
1346 hlp.num_counters, user, len); 1346 hlp.num_counters, user, len);
1347} 1347}
1348 1348
1349static inline int ebt_make_matchname(const struct ebt_entry_match *m, 1349static inline int ebt_obj_to_user(char __user *um, const char *_name,
1350 const char *base, char __user *ubase) 1350 const char *data, int entrysize,
1351 int usersize, int datasize)
1351{ 1352{
1352 char __user *hlp = ubase + ((char *)m - base); 1353 char name[EBT_FUNCTION_MAXNAMELEN] = {0};
1353 char name[EBT_FUNCTION_MAXNAMELEN] = {};
1354 1354
1355 /* ebtables expects 32 bytes long names but xt_match names are 29 bytes 1355 /* ebtables expects 32 bytes long names but xt_match names are 29 bytes
1356 * long. Copy 29 bytes and fill remaining bytes with zeroes. 1356 * long. Copy 29 bytes and fill remaining bytes with zeroes.
1357 */ 1357 */
1358 strlcpy(name, m->u.match->name, sizeof(name)); 1358 strlcpy(name, _name, sizeof(name));
1359 if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) 1359 if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) ||
1360 put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) ||
1361 xt_data_to_user(um + entrysize, data, usersize, datasize))
1360 return -EFAULT; 1362 return -EFAULT;
1363
1361 return 0; 1364 return 0;
1362} 1365}
1363 1366
1364static inline int ebt_make_watchername(const struct ebt_entry_watcher *w, 1367static inline int ebt_match_to_user(const struct ebt_entry_match *m,
1365 const char *base, char __user *ubase) 1368 const char *base, char __user *ubase)
1366{ 1369{
1367 char __user *hlp = ubase + ((char *)w - base); 1370 return ebt_obj_to_user(ubase + ((char *)m - base),
1368 char name[EBT_FUNCTION_MAXNAMELEN] = {}; 1371 m->u.match->name, m->data, sizeof(*m),
1372 m->u.match->usersize, m->match_size);
1373}
1369 1374
1370 strlcpy(name, w->u.watcher->name, sizeof(name)); 1375static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w,
1371 if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) 1376 const char *base, char __user *ubase)
1372 return -EFAULT; 1377{
1373 return 0; 1378 return ebt_obj_to_user(ubase + ((char *)w - base),
1379 w->u.watcher->name, w->data, sizeof(*w),
1380 w->u.watcher->usersize, w->watcher_size);
1374} 1381}
1375 1382
1376static inline int ebt_make_names(struct ebt_entry *e, const char *base, 1383static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
1377 char __user *ubase) 1384 char __user *ubase)
1378{ 1385{
1379 int ret; 1386 int ret;
1380 char __user *hlp; 1387 char __user *hlp;
1381 const struct ebt_entry_target *t; 1388 const struct ebt_entry_target *t;
1382 char name[EBT_FUNCTION_MAXNAMELEN] = {};
1383 1389
1384 if (e->bitmask == 0) 1390 if (e->bitmask == 0) {
1391 /* special case !EBT_ENTRY_OR_ENTRIES */
1392 if (copy_to_user(ubase + ((char *)e - base), e,
1393 sizeof(struct ebt_entries)))
1394 return -EFAULT;
1385 return 0; 1395 return 0;
1396 }
1397
1398 if (copy_to_user(ubase + ((char *)e - base), e, sizeof(*e)))
1399 return -EFAULT;
1386 1400
1387 hlp = ubase + (((char *)e + e->target_offset) - base); 1401 hlp = ubase + (((char *)e + e->target_offset) - base);
1388 t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); 1402 t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
1389 1403
1390 ret = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase); 1404 ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase);
1391 if (ret != 0) 1405 if (ret != 0)
1392 return ret; 1406 return ret;
1393 ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase); 1407 ret = EBT_WATCHER_ITERATE(e, ebt_watcher_to_user, base, ubase);
1394 if (ret != 0) 1408 if (ret != 0)
1395 return ret; 1409 return ret;
1396 strlcpy(name, t->u.target->name, sizeof(name)); 1410 ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t),
1397 if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) 1411 t->u.target->usersize, t->target_size);
1398 return -EFAULT; 1412 if (ret != 0)
1413 return ret;
1414
1399 return 0; 1415 return 0;
1400} 1416}
1401 1417
@@ -1475,13 +1491,9 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user,
1475 if (ret) 1491 if (ret)
1476 return ret; 1492 return ret;
1477 1493
1478 if (copy_to_user(tmp.entries, entries, entries_size)) {
1479 BUGPRINT("Couldn't copy entries to userspace\n");
1480 return -EFAULT;
1481 }
1482 /* set the match/watcher/target names right */ 1494 /* set the match/watcher/target names right */
1483 return EBT_ENTRY_ITERATE(entries, entries_size, 1495 return EBT_ENTRY_ITERATE(entries, entries_size,
1484 ebt_make_names, entries, tmp.entries); 1496 ebt_entry_to_user, entries, tmp.entries);
1485} 1497}
1486 1498
1487static int do_ebt_set_ctl(struct sock *sk, 1499static int do_ebt_set_ctl(struct sock *sk,
@@ -1630,8 +1642,10 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr,
1630 if (match->compat_to_user) { 1642 if (match->compat_to_user) {
1631 if (match->compat_to_user(cm->data, m->data)) 1643 if (match->compat_to_user(cm->data, m->data))
1632 return -EFAULT; 1644 return -EFAULT;
1633 } else if (copy_to_user(cm->data, m->data, msize)) 1645 } else {
1646 if (xt_data_to_user(cm->data, m->data, match->usersize, msize))
1634 return -EFAULT; 1647 return -EFAULT;
1648 }
1635 1649
1636 *size -= ebt_compat_entry_padsize() + off; 1650 *size -= ebt_compat_entry_padsize() + off;
1637 *dstptr = cm->data; 1651 *dstptr = cm->data;
@@ -1657,8 +1671,10 @@ static int compat_target_to_user(struct ebt_entry_target *t,
1657 if (target->compat_to_user) { 1671 if (target->compat_to_user) {
1658 if (target->compat_to_user(cm->data, t->data)) 1672 if (target->compat_to_user(cm->data, t->data))
1659 return -EFAULT; 1673 return -EFAULT;
1660 } else if (copy_to_user(cm->data, t->data, tsize)) 1674 } else {
1661 return -EFAULT; 1675 if (xt_data_to_user(cm->data, t->data, target->usersize, tsize))
1676 return -EFAULT;
1677 }
1662 1678
1663 *size -= ebt_compat_entry_padsize() + off; 1679 *size -= ebt_compat_entry_padsize() + off;
1664 *dstptr = cm->data; 1680 *dstptr = cm->data;
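
Two related changes run through the ebtables hunks above: matches, watchers and targets now advertise a usersize (ebt_limit sets .usersize = offsetof(struct ebt_limit_info, prev)), and the dump path is consolidated into ebt_obj_to_user(), which uses xt_data_to_user() so that only the user-visible part of the private data is copied back to userspace while the kernel-internal tail is cleared instead of leaked. Assuming the usual xtables layout of configuration first, runtime state after, the copy boils down to something like this sketch (not the actual xt_data_to_user() body):

#include <linux/types.h>
#include <linux/uaccess.h>

struct my_limit_info {
	u32 avg;		/* user-visible configuration ... */
	u32 burst;
	unsigned long prev;	/* ... kernel-only runtime state starts here */
	u32 credit;
};

static int my_data_to_user(void __user *dst, const void *src,
			   int usersize, int size)
{
	if (copy_to_user(dst, src, usersize))		/* config portion */
		return -EFAULT;
	if (usersize < size &&
	    clear_user(dst + usersize, size - usersize))	/* zero the rest */
		return -EFAULT;
	return 0;
}
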
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 92cbbd2afddb..adcad344c843 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -9,7 +9,7 @@
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched/signal.h>
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/mutex.h> 14#include <linux/mutex.h>
15#include <linux/list.h> 15#include <linux/list.h>
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 3408ed51b611..1816fc9f1ee7 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -44,7 +44,6 @@ enum caif_states {
44 44
45struct chnl_net { 45struct chnl_net {
46 struct cflayer chnl; 46 struct cflayer chnl;
47 struct net_device_stats stats;
48 struct caif_connect_request conn_req; 47 struct caif_connect_request conn_req;
49 struct list_head list_field; 48 struct list_head list_field;
50 struct net_device *netdev; 49 struct net_device *netdev;
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 1108079d934f..5488e4a6ccd0 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -445,6 +445,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
445 * @func: callback function on filter match 445 * @func: callback function on filter match
446 * @data: returned parameter for callback function 446 * @data: returned parameter for callback function
447 * @ident: string for calling module identification 447 * @ident: string for calling module identification
448 * @sk: socket pointer (might be NULL)
448 * 449 *
449 * Description: 450 * Description:
450 * Invokes the callback function with the received sk_buff and the given 451 * Invokes the callback function with the received sk_buff and the given
@@ -468,7 +469,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
468 */ 469 */
469int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, 470int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask,
470 void (*func)(struct sk_buff *, void *), void *data, 471 void (*func)(struct sk_buff *, void *), void *data,
471 char *ident) 472 char *ident, struct sock *sk)
472{ 473{
473 struct receiver *r; 474 struct receiver *r;
474 struct hlist_head *rl; 475 struct hlist_head *rl;
@@ -496,6 +497,7 @@ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask,
496 r->func = func; 497 r->func = func;
497 r->data = data; 498 r->data = data;
498 r->ident = ident; 499 r->ident = ident;
500 r->sk = sk;
499 501
500 hlist_add_head_rcu(&r->list, rl); 502 hlist_add_head_rcu(&r->list, rl);
501 d->entries++; 503 d->entries++;
@@ -520,8 +522,11 @@ EXPORT_SYMBOL(can_rx_register);
520static void can_rx_delete_receiver(struct rcu_head *rp) 522static void can_rx_delete_receiver(struct rcu_head *rp)
521{ 523{
522 struct receiver *r = container_of(rp, struct receiver, rcu); 524 struct receiver *r = container_of(rp, struct receiver, rcu);
525 struct sock *sk = r->sk;
523 526
524 kmem_cache_free(rcv_cache, r); 527 kmem_cache_free(rcv_cache, r);
528 if (sk)
529 sock_put(sk);
525} 530}
526 531
527/** 532/**
@@ -596,8 +601,11 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask,
596 spin_unlock(&can_rcvlists_lock); 601 spin_unlock(&can_rcvlists_lock);
597 602
598 /* schedule the receiver item for deletion */ 603 /* schedule the receiver item for deletion */
599 if (r) 604 if (r) {
605 if (r->sk)
606 sock_hold(r->sk);
600 call_rcu(&r->rcu, can_rx_delete_receiver); 607 call_rcu(&r->rcu, can_rx_delete_receiver);
608 }
601} 609}
602EXPORT_SYMBOL(can_rx_unregister); 610EXPORT_SYMBOL(can_rx_unregister);
603 611
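
can_rx_register() now records the owning socket in each receiver, and can_rx_unregister() takes a reference with sock_hold() before handing the receiver to call_rcu(), dropping it with sock_put() only in the RCU callback once the receiver memory has been freed. That ordering keeps the socket alive for any reader still traversing the receiver list inside an RCU read-side section; the bcm, raw and gw call sites below pass the socket (or NULL for the gateway) accordingly. The same keep-alive-across-a-grace-period pattern in isolation (type and function names are illustrative):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <net/sock.h>

struct my_item {
	struct sock *sk;
	struct rcu_head rcu;
};

static void my_item_free(struct rcu_head *rcu)
{
	struct my_item *it = container_of(rcu, struct my_item, rcu);
	struct sock *sk = it->sk;

	kfree(it);
	if (sk)
		sock_put(sk);		/* release the reference taken below */
}

static void my_item_unpublish(struct my_item *it)
{
	/* caller has already unlinked @it from its RCU-protected list */
	if (it->sk)
		sock_hold(it->sk);	/* keep the socket valid past the grace period */
	call_rcu(&it->rcu, my_item_free);
}
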
diff --git a/net/can/af_can.h b/net/can/af_can.h
index fca0fe9fc45a..b86f5129e838 100644
--- a/net/can/af_can.h
+++ b/net/can/af_can.h
@@ -50,13 +50,14 @@
50 50
51struct receiver { 51struct receiver {
52 struct hlist_node list; 52 struct hlist_node list;
53 struct rcu_head rcu;
54 canid_t can_id; 53 canid_t can_id;
55 canid_t mask; 54 canid_t mask;
56 unsigned long matches; 55 unsigned long matches;
57 void (*func)(struct sk_buff *, void *); 56 void (*func)(struct sk_buff *, void *);
58 void *data; 57 void *data;
59 char *ident; 58 char *ident;
59 struct sock *sk;
60 struct rcu_head rcu;
60}; 61};
61 62
62#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) 63#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 21ac75390e3d..95d13b233c65 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -734,14 +734,23 @@ static struct bcm_op *bcm_find_op(struct list_head *ops,
734 734
735static void bcm_remove_op(struct bcm_op *op) 735static void bcm_remove_op(struct bcm_op *op)
736{ 736{
737 hrtimer_cancel(&op->timer); 737 if (op->tsklet.func) {
738 hrtimer_cancel(&op->thrtimer); 738 while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
739 739 test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
740 if (op->tsklet.func) 740 hrtimer_active(&op->timer)) {
741 tasklet_kill(&op->tsklet); 741 hrtimer_cancel(&op->timer);
742 tasklet_kill(&op->tsklet);
743 }
744 }
742 745
743 if (op->thrtsklet.func) 746 if (op->thrtsklet.func) {
744 tasklet_kill(&op->thrtsklet); 747 while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
748 test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
749 hrtimer_active(&op->thrtimer)) {
750 hrtimer_cancel(&op->thrtimer);
751 tasklet_kill(&op->thrtsklet);
752 }
753 }
745 754
746 if ((op->frames) && (op->frames != &op->sframe)) 755 if ((op->frames) && (op->frames != &op->sframe))
747 kfree(op->frames); 756 kfree(op->frames);
@@ -1216,7 +1225,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
1216 err = can_rx_register(dev, op->can_id, 1225 err = can_rx_register(dev, op->can_id,
1217 REGMASK(op->can_id), 1226 REGMASK(op->can_id),
1218 bcm_rx_handler, op, 1227 bcm_rx_handler, op,
1219 "bcm"); 1228 "bcm", sk);
1220 1229
1221 op->rx_reg_dev = dev; 1230 op->rx_reg_dev = dev;
1222 dev_put(dev); 1231 dev_put(dev);
@@ -1225,7 +1234,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
1225 } else 1234 } else
1226 err = can_rx_register(NULL, op->can_id, 1235 err = can_rx_register(NULL, op->can_id,
1227 REGMASK(op->can_id), 1236 REGMASK(op->can_id),
1228 bcm_rx_handler, op, "bcm"); 1237 bcm_rx_handler, op, "bcm", sk);
1229 if (err) { 1238 if (err) {
1230 /* this bcm rx op is broken -> remove it */ 1239 /* this bcm rx op is broken -> remove it */
1231 list_del(&op->list); 1240 list_del(&op->list);
diff --git a/net/can/gw.c b/net/can/gw.c
index a54ab0c82104..7056a1a2bb70 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -442,7 +442,7 @@ static inline int cgw_register_filter(struct cgw_job *gwj)
442{ 442{
443 return can_rx_register(gwj->src.dev, gwj->ccgw.filter.can_id, 443 return can_rx_register(gwj->src.dev, gwj->ccgw.filter.can_id,
444 gwj->ccgw.filter.can_mask, can_can_gw_rcv, 444 gwj->ccgw.filter.can_mask, can_can_gw_rcv,
445 gwj, "gw"); 445 gwj, "gw", NULL);
446} 446}
447 447
448static inline void cgw_unregister_filter(struct cgw_job *gwj) 448static inline void cgw_unregister_filter(struct cgw_job *gwj)
diff --git a/net/can/raw.c b/net/can/raw.c
index b075f028d7e2..6dc546a06673 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -190,7 +190,7 @@ static int raw_enable_filters(struct net_device *dev, struct sock *sk,
190 for (i = 0; i < count; i++) { 190 for (i = 0; i < count; i++) {
191 err = can_rx_register(dev, filter[i].can_id, 191 err = can_rx_register(dev, filter[i].can_id,
192 filter[i].can_mask, 192 filter[i].can_mask,
193 raw_rcv, sk, "raw"); 193 raw_rcv, sk, "raw", sk);
194 if (err) { 194 if (err) {
195 /* clean up successfully registered filters */ 195 /* clean up successfully registered filters */
196 while (--i >= 0) 196 while (--i >= 0)
@@ -211,7 +211,7 @@ static int raw_enable_errfilter(struct net_device *dev, struct sock *sk,
211 211
212 if (err_mask) 212 if (err_mask)
213 err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG, 213 err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG,
214 raw_rcv, sk, "raw"); 214 raw_rcv, sk, "raw", sk);
215 215
216 return err; 216 return err;
217} 217}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 464e88599b9d..108533859a53 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -230,6 +230,7 @@ enum {
230 Opt_osdkeepalivetimeout, 230 Opt_osdkeepalivetimeout,
231 Opt_mount_timeout, 231 Opt_mount_timeout,
232 Opt_osd_idle_ttl, 232 Opt_osd_idle_ttl,
233 Opt_osd_request_timeout,
233 Opt_last_int, 234 Opt_last_int,
234 /* int args above */ 235 /* int args above */
235 Opt_fsid, 236 Opt_fsid,
@@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
256 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, 257 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
257 {Opt_mount_timeout, "mount_timeout=%d"}, 258 {Opt_mount_timeout, "mount_timeout=%d"},
258 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 259 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
260 {Opt_osd_request_timeout, "osd_request_timeout=%d"},
259 /* int args above */ 261 /* int args above */
260 {Opt_fsid, "fsid=%s"}, 262 {Opt_fsid, "fsid=%s"},
261 {Opt_name, "name=%s"}, 263 {Opt_name, "name=%s"},
@@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
361 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 363 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
362 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; 364 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
363 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; 365 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
366 opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
364 367
365 /* get mon ip(s) */ 368 /* get mon ip(s) */
366 /* ip1[:port1][,ip2[:port2]...] */ 369 /* ip1[:port1][,ip2[:port2]...] */
@@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
473 } 476 }
474 opt->mount_timeout = msecs_to_jiffies(intval * 1000); 477 opt->mount_timeout = msecs_to_jiffies(intval * 1000);
475 break; 478 break;
479 case Opt_osd_request_timeout:
480 /* 0 is "wait forever" (i.e. infinite timeout) */
481 if (intval < 0 || intval > INT_MAX / 1000) {
482 pr_err("osd_request_timeout out of range\n");
483 err = -EINVAL;
484 goto out;
485 }
486 opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
487 break;
476 488
477 case Opt_share: 489 case Opt_share:
478 opt->flags &= ~CEPH_OPT_NOSHARE; 490 opt->flags &= ~CEPH_OPT_NOSHARE;
@@ -557,6 +569,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
557 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) 569 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
558 seq_printf(m, "osdkeepalivetimeout=%d,", 570 seq_printf(m, "osdkeepalivetimeout=%d,",
559 jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000); 571 jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
572 if (opt->osd_request_timeout != CEPH_OSD_REQUEST_TIMEOUT_DEFAULT)
573 seq_printf(m, "osd_request_timeout=%d,",
574 jiffies_to_msecs(opt->osd_request_timeout) / 1000);
560 575
561 /* drop redundant comma */ 576 /* drop redundant comma */
562 if (m->count != pos) 577 if (m->count != pos)
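
The new osd_request_timeout option is parsed as whole seconds; the intval > INT_MAX / 1000 test rejects values whose conversion to milliseconds would overflow the signed int before msecs_to_jiffies() ever sees it, and 0 keeps the "wait forever" behaviour noted in the comment. The guard in isolation (a sketch; the option-table plumbing around it is omitted):

#include <linux/jiffies.h>
#include <linux/kernel.h>

/* convert a user-supplied timeout in seconds to jiffies, rejecting overflow */
static int parse_timeout_secs(int intval, unsigned long *out)
{
	if (intval < 0 || intval > INT_MAX / 1000)
		return -EINVAL;			/* intval * 1000 would overflow */
	*out = msecs_to_jiffies(intval * 1000);
	return 0;
}
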
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index 50f040fdb2a9..b9233b990399 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
69 dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n", 69 dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
70 __func__, lock_name, type, cookie, tag, desc, flags); 70 __func__, lock_name, type, cookie, tag, desc, flags);
71 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock", 71 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
72 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 72 CEPH_OSD_FLAG_WRITE, lock_op_page,
73 lock_op_page, lock_op_buf_size, NULL, NULL); 73 lock_op_buf_size, NULL, NULL);
74 74
75 dout("%s: status %d\n", __func__, ret); 75 dout("%s: status %d\n", __func__, ret);
76 __free_page(lock_op_page); 76 __free_page(lock_op_page);
@@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc,
117 117
118 dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie); 118 dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
119 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock", 119 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
120 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 120 CEPH_OSD_FLAG_WRITE, unlock_op_page,
121 unlock_op_page, unlock_op_buf_size, NULL, NULL); 121 unlock_op_buf_size, NULL, NULL);
122 122
123 dout("%s: status %d\n", __func__, ret); 123 dout("%s: status %d\n", __func__, ret);
124 __free_page(unlock_op_page); 124 __free_page(unlock_op_page);
@@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc,
170 dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name, 170 dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
171 cookie, ENTITY_NAME(*locker)); 171 cookie, ENTITY_NAME(*locker));
172 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock", 172 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
173 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 173 CEPH_OSD_FLAG_WRITE, break_op_page,
174 break_op_page, break_op_buf_size, NULL, NULL); 174 break_op_buf_size, NULL, NULL);
175 175
176 dout("%s: status %d\n", __func__, ret); 176 dout("%s: status %d\n", __func__, ret);
177 __free_page(break_op_page); 177 __free_page(break_op_page);
@@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
278 int get_info_op_buf_size; 278 int get_info_op_buf_size;
279 int name_len = strlen(lock_name); 279 int name_len = strlen(lock_name);
280 struct page *get_info_op_page, *reply_page; 280 struct page *get_info_op_page, *reply_page;
281 size_t reply_len; 281 size_t reply_len = PAGE_SIZE;
282 void *p, *end; 282 void *p, *end;
283 int ret; 283 int ret;
284 284
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 80d7c3a97cb8..5bf94c04f645 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
45 45
46void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) 46void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
47{ 47{
48 kfree(b->h.perm);
49 kfree(b->h.items); 48 kfree(b->h.items);
50 kfree(b); 49 kfree(b);
51} 50}
@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
54{ 53{
55 kfree(b->item_weights); 54 kfree(b->item_weights);
56 kfree(b->sum_weights); 55 kfree(b->sum_weights);
57 kfree(b->h.perm);
58 kfree(b->h.items); 56 kfree(b->h.items);
59 kfree(b); 57 kfree(b);
60} 58}
61 59
62void crush_destroy_bucket_tree(struct crush_bucket_tree *b) 60void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
63{ 61{
64 kfree(b->h.perm);
65 kfree(b->h.items); 62 kfree(b->h.items);
66 kfree(b->node_weights); 63 kfree(b->node_weights);
67 kfree(b); 64 kfree(b);
@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
71{ 68{
72 kfree(b->straws); 69 kfree(b->straws);
73 kfree(b->item_weights); 70 kfree(b->item_weights);
74 kfree(b->h.perm);
75 kfree(b->h.items); 71 kfree(b->h.items);
76 kfree(b); 72 kfree(b);
77} 73}
@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
79void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) 75void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
80{ 76{
81 kfree(b->item_weights); 77 kfree(b->item_weights);
82 kfree(b->h.perm);
83 kfree(b->h.items); 78 kfree(b->h.items);
84 kfree(b); 79 kfree(b);
85} 80}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 130ab407c5ec..b5cd8c21bfdf 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
54 return -1; 54 return -1;
55} 55}
56 56
57
58/* 57/*
59 * bucket choose methods 58 * bucket choose methods
60 * 59 *
@@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
72 * Since this is expensive, we optimize for the r=0 case, which 71 * Since this is expensive, we optimize for the r=0 case, which
73 * captures the vast majority of calls. 72 * captures the vast majority of calls.
74 */ 73 */
75static int bucket_perm_choose(struct crush_bucket *bucket, 74static int bucket_perm_choose(const struct crush_bucket *bucket,
75 struct crush_work_bucket *work,
76 int x, int r) 76 int x, int r)
77{ 77{
78 unsigned int pr = r % bucket->size; 78 unsigned int pr = r % bucket->size;
79 unsigned int i, s; 79 unsigned int i, s;
80 80
81 /* start a new permutation if @x has changed */ 81 /* start a new permutation if @x has changed */
82 if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { 82 if (work->perm_x != (__u32)x || work->perm_n == 0) {
83 dprintk("bucket %d new x=%d\n", bucket->id, x); 83 dprintk("bucket %d new x=%d\n", bucket->id, x);
84 bucket->perm_x = x; 84 work->perm_x = x;
85 85
86 /* optimize common r=0 case */ 86 /* optimize common r=0 case */
87 if (pr == 0) { 87 if (pr == 0) {
88 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % 88 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
89 bucket->size; 89 bucket->size;
90 bucket->perm[0] = s; 90 work->perm[0] = s;
91 bucket->perm_n = 0xffff; /* magic value, see below */ 91 work->perm_n = 0xffff; /* magic value, see below */
92 goto out; 92 goto out;
93 } 93 }
94 94
95 for (i = 0; i < bucket->size; i++) 95 for (i = 0; i < bucket->size; i++)
96 bucket->perm[i] = i; 96 work->perm[i] = i;
97 bucket->perm_n = 0; 97 work->perm_n = 0;
98 } else if (bucket->perm_n == 0xffff) { 98 } else if (work->perm_n == 0xffff) {
99 /* clean up after the r=0 case above */ 99 /* clean up after the r=0 case above */
100 for (i = 1; i < bucket->size; i++) 100 for (i = 1; i < bucket->size; i++)
101 bucket->perm[i] = i; 101 work->perm[i] = i;
102 bucket->perm[bucket->perm[0]] = 0; 102 work->perm[work->perm[0]] = 0;
103 bucket->perm_n = 1; 103 work->perm_n = 1;
104 } 104 }
105 105
106 /* calculate permutation up to pr */ 106 /* calculate permutation up to pr */
107 for (i = 0; i < bucket->perm_n; i++) 107 for (i = 0; i < work->perm_n; i++)
108 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); 108 dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
109 while (bucket->perm_n <= pr) { 109 while (work->perm_n <= pr) {
110 unsigned int p = bucket->perm_n; 110 unsigned int p = work->perm_n;
111 /* no point in swapping the final entry */ 111 /* no point in swapping the final entry */
112 if (p < bucket->size - 1) { 112 if (p < bucket->size - 1) {
113 i = crush_hash32_3(bucket->hash, x, bucket->id, p) % 113 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
114 (bucket->size - p); 114 (bucket->size - p);
115 if (i) { 115 if (i) {
116 unsigned int t = bucket->perm[p + i]; 116 unsigned int t = work->perm[p + i];
117 bucket->perm[p + i] = bucket->perm[p]; 117 work->perm[p + i] = work->perm[p];
118 bucket->perm[p] = t; 118 work->perm[p] = t;
119 } 119 }
120 dprintk(" perm_choose swap %d with %d\n", p, p+i); 120 dprintk(" perm_choose swap %d with %d\n", p, p+i);
121 } 121 }
122 bucket->perm_n++; 122 work->perm_n++;
123 } 123 }
124 for (i = 0; i < bucket->size; i++) 124 for (i = 0; i < bucket->size; i++)
125 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); 125 dprintk(" perm_choose %d: %d\n", i, work->perm[i]);
126 126
127 s = bucket->perm[pr]; 127 s = work->perm[pr];
128out: 128out:
129 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, 129 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
130 bucket->size, x, r, pr, s); 130 bucket->size, x, r, pr, s);
@@ -132,14 +132,14 @@ out:
132} 132}
133 133
134/* uniform */ 134/* uniform */
135static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, 135static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
136 int x, int r) 136 struct crush_work_bucket *work, int x, int r)
137{ 137{
138 return bucket_perm_choose(&bucket->h, x, r); 138 return bucket_perm_choose(&bucket->h, work, x, r);
139} 139}
140 140
141/* list */ 141/* list */
142static int bucket_list_choose(struct crush_bucket_list *bucket, 142static int bucket_list_choose(const struct crush_bucket_list *bucket,
143 int x, int r) 143 int x, int r)
144{ 144{
145 int i; 145 int i;
@@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
155 w *= bucket->sum_weights[i]; 155 w *= bucket->sum_weights[i];
156 w = w >> 16; 156 w = w >> 16;
157 /*dprintk(" scaled %llx\n", w);*/ 157 /*dprintk(" scaled %llx\n", w);*/
158 if (w < bucket->item_weights[i]) 158 if (w < bucket->item_weights[i]) {
159 return bucket->h.items[i]; 159 return bucket->h.items[i];
160 }
160 } 161 }
161 162
162 dprintk("bad list sums for bucket %d\n", bucket->h.id); 163 dprintk("bad list sums for bucket %d\n", bucket->h.id);
@@ -192,7 +193,7 @@ static int terminal(int x)
192 return x & 1; 193 return x & 1;
193} 194}
194 195
195static int bucket_tree_choose(struct crush_bucket_tree *bucket, 196static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
196 int x, int r) 197 int x, int r)
197{ 198{
198 int n; 199 int n;
@@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
224 225
225/* straw */ 226/* straw */
226 227
227static int bucket_straw_choose(struct crush_bucket_straw *bucket, 228static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
228 int x, int r) 229 int x, int r)
229{ 230{
230 __u32 i; 231 __u32 i;
@@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin)
301 * 302 *
302 */ 303 */
303 304
304static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, 305static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
305 int x, int r) 306 int x, int r)
306{ 307{
307 unsigned int i, high = 0; 308 unsigned int i, high = 0;
@@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
344 high_draw = draw; 345 high_draw = draw;
345 } 346 }
346 } 347 }
348
347 return bucket->h.items[high]; 349 return bucket->h.items[high];
348} 350}
349 351
350 352
351static int crush_bucket_choose(struct crush_bucket *in, int x, int r) 353static int crush_bucket_choose(const struct crush_bucket *in,
354 struct crush_work_bucket *work,
355 int x, int r)
352{ 356{
353 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); 357 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
354 BUG_ON(in->size == 0); 358 BUG_ON(in->size == 0);
355 switch (in->alg) { 359 switch (in->alg) {
356 case CRUSH_BUCKET_UNIFORM: 360 case CRUSH_BUCKET_UNIFORM:
357 return bucket_uniform_choose((struct crush_bucket_uniform *)in, 361 return bucket_uniform_choose(
358 x, r); 362 (const struct crush_bucket_uniform *)in,
363 work, x, r);
359 case CRUSH_BUCKET_LIST: 364 case CRUSH_BUCKET_LIST:
360 return bucket_list_choose((struct crush_bucket_list *)in, 365 return bucket_list_choose((const struct crush_bucket_list *)in,
361 x, r); 366 x, r);
362 case CRUSH_BUCKET_TREE: 367 case CRUSH_BUCKET_TREE:
363 return bucket_tree_choose((struct crush_bucket_tree *)in, 368 return bucket_tree_choose((const struct crush_bucket_tree *)in,
364 x, r); 369 x, r);
365 case CRUSH_BUCKET_STRAW: 370 case CRUSH_BUCKET_STRAW:
366 return bucket_straw_choose((struct crush_bucket_straw *)in, 371 return bucket_straw_choose(
367 x, r); 372 (const struct crush_bucket_straw *)in,
373 x, r);
368 case CRUSH_BUCKET_STRAW2: 374 case CRUSH_BUCKET_STRAW2:
369 return bucket_straw2_choose((struct crush_bucket_straw2 *)in, 375 return bucket_straw2_choose(
370 x, r); 376 (const struct crush_bucket_straw2 *)in,
377 x, r);
371 default: 378 default:
372 dprintk("unknown bucket %d alg %d\n", in->id, in->alg); 379 dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
373 return in->items[0]; 380 return in->items[0];
374 } 381 }
375} 382}
376 383
377
378/* 384/*
379 * true if device is marked "out" (failed, fully offloaded) 385 * true if device is marked "out" (failed, fully offloaded)
380 * of the cluster 386 * of the cluster
@@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map,
416 * @parent_r: r value passed from the parent 422 * @parent_r: r value passed from the parent
417 */ 423 */
418static int crush_choose_firstn(const struct crush_map *map, 424static int crush_choose_firstn(const struct crush_map *map,
419 struct crush_bucket *bucket, 425 struct crush_work *work,
426 const struct crush_bucket *bucket,
420 const __u32 *weight, int weight_max, 427 const __u32 *weight, int weight_max,
421 int x, int numrep, int type, 428 int x, int numrep, int type,
422 int *out, int outpos, 429 int *out, int outpos,
@@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map,
434 int rep; 441 int rep;
435 unsigned int ftotal, flocal; 442 unsigned int ftotal, flocal;
436 int retry_descent, retry_bucket, skip_rep; 443 int retry_descent, retry_bucket, skip_rep;
437 struct crush_bucket *in = bucket; 444 const struct crush_bucket *in = bucket;
438 int r; 445 int r;
439 int i; 446 int i;
440 int item = 0; 447 int item = 0;
@@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map,
473 if (local_fallback_retries > 0 && 480 if (local_fallback_retries > 0 &&
474 flocal >= (in->size>>1) && 481 flocal >= (in->size>>1) &&
475 flocal > local_fallback_retries) 482 flocal > local_fallback_retries)
476 item = bucket_perm_choose(in, x, r); 483 item = bucket_perm_choose(
484 in, work->work[-1-in->id],
485 x, r);
477 else 486 else
478 item = crush_bucket_choose(in, x, r); 487 item = crush_bucket_choose(
488 in, work->work[-1-in->id],
489 x, r);
479 if (item >= map->max_devices) { 490 if (item >= map->max_devices) {
480 dprintk(" bad item %d\n", item); 491 dprintk(" bad item %d\n", item);
481 skip_rep = 1; 492 skip_rep = 1;
@@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map,
518 sub_r = r >> (vary_r-1); 529 sub_r = r >> (vary_r-1);
519 else 530 else
520 sub_r = 0; 531 sub_r = 0;
521 if (crush_choose_firstn(map, 532 if (crush_choose_firstn(
522 map->buckets[-1-item], 533 map,
523 weight, weight_max, 534 work,
524 x, stable ? 1 : outpos+1, 0, 535 map->buckets[-1-item],
525 out2, outpos, count, 536 weight, weight_max,
526 recurse_tries, 0, 537 x, stable ? 1 : outpos+1, 0,
527 local_retries, 538 out2, outpos, count,
528 local_fallback_retries, 539 recurse_tries, 0,
529 0, 540 local_retries,
530 vary_r, 541 local_fallback_retries,
531 stable, 542 0,
532 NULL, 543 vary_r,
533 sub_r) <= outpos) 544 stable,
545 NULL,
546 sub_r) <= outpos)
534 /* didn't get leaf */ 547 /* didn't get leaf */
535 reject = 1; 548 reject = 1;
536 } else { 549 } else {
@@ -539,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map,
539 } 552 }
540 } 553 }
541 554
542 if (!reject) { 555 if (!reject && !collide) {
543 /* out? */ 556 /* out? */
544 if (itemtype == 0) 557 if (itemtype == 0)
545 reject = is_out(map, weight, 558 reject = is_out(map, weight,
546 weight_max, 559 weight_max,
547 item, x); 560 item, x);
548 else
549 reject = 0;
550 } 561 }
551 562
552reject: 563reject:
@@ -600,7 +611,8 @@ reject:
600 * 611 *
601 */ 612 */
602static void crush_choose_indep(const struct crush_map *map, 613static void crush_choose_indep(const struct crush_map *map,
603 struct crush_bucket *bucket, 614 struct crush_work *work,
615 const struct crush_bucket *bucket,
604 const __u32 *weight, int weight_max, 616 const __u32 *weight, int weight_max,
605 int x, int left, int numrep, int type, 617 int x, int left, int numrep, int type,
606 int *out, int outpos, 618 int *out, int outpos,
@@ -610,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map,
610 int *out2, 622 int *out2,
611 int parent_r) 623 int parent_r)
612{ 624{
613 struct crush_bucket *in = bucket; 625 const struct crush_bucket *in = bucket;
614 int endpos = outpos + left; 626 int endpos = outpos + left;
615 int rep; 627 int rep;
616 unsigned int ftotal; 628 unsigned int ftotal;
@@ -678,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map,
678 break; 690 break;
679 } 691 }
680 692
681 item = crush_bucket_choose(in, x, r); 693 item = crush_bucket_choose(
694 in, work->work[-1-in->id],
695 x, r);
682 if (item >= map->max_devices) { 696 if (item >= map->max_devices) {
683 dprintk(" bad item %d\n", item); 697 dprintk(" bad item %d\n", item);
684 out[rep] = CRUSH_ITEM_NONE; 698 out[rep] = CRUSH_ITEM_NONE;
@@ -724,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map,
724 738
725 if (recurse_to_leaf) { 739 if (recurse_to_leaf) {
726 if (item < 0) { 740 if (item < 0) {
727 crush_choose_indep(map, 741 crush_choose_indep(
728 map->buckets[-1-item], 742 map,
729 weight, weight_max, 743 work,
730 x, 1, numrep, 0, 744 map->buckets[-1-item],
731 out2, rep, 745 weight, weight_max,
732 recurse_tries, 0, 746 x, 1, numrep, 0,
733 0, NULL, r); 747 out2, rep,
748 recurse_tries, 0,
749 0, NULL, r);
734 if (out2[rep] == CRUSH_ITEM_NONE) { 750 if (out2[rep] == CRUSH_ITEM_NONE) {
735 /* placed nothing; no leaf */ 751 /* placed nothing; no leaf */
736 break; 752 break;
@@ -781,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map,
781#endif 797#endif
782} 798}
783 799
800
801/*
802 * This takes a chunk of memory and sets it up to be a shiny new
803 * working area for a CRUSH placement computation. It must be called
804 * on any newly allocated memory before passing it in to
805 * crush_do_rule. It may be used repeatedly after that, so long as the
806 * map has not changed. If the map /has/ changed, you must make sure
807 * the working size is no smaller than what was allocated and re-run
808 * crush_init_workspace.
809 *
810 * If you do retain the working space between calls to crush, make it
811 * thread-local.
812 */
813void crush_init_workspace(const struct crush_map *map, void *v)
814{
815 struct crush_work *w = v;
816 __s32 b;
817
818 /*
819 * We work by moving through the available space and setting
820 * values and pointers as we go.
821 *
822 * It's a bit like Forth's use of the 'allot' word since we
823 * set the pointer first and then reserve the space for it to
 824 * point to by incrementing the pointer.
825 */
826 v += sizeof(struct crush_work *);
827 w->work = v;
828 v += map->max_buckets * sizeof(struct crush_work_bucket *);
829 for (b = 0; b < map->max_buckets; ++b) {
830 if (!map->buckets[b])
831 continue;
832
833 w->work[b] = v;
834 switch (map->buckets[b]->alg) {
835 default:
836 v += sizeof(struct crush_work_bucket);
837 break;
838 }
839 w->work[b]->perm_x = 0;
840 w->work[b]->perm_n = 0;
841 w->work[b]->perm = v;
842 v += map->buckets[b]->size * sizeof(__u32);
843 }
844 BUG_ON(v - (void *)w != map->working_size);
845}
846
784/** 847/**
785 * crush_do_rule - calculate a mapping with the given input and rule 848 * crush_do_rule - calculate a mapping with the given input and rule
786 * @map: the crush_map 849 * @map: the crush_map
@@ -790,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map,
790 * @result_max: maximum result size 853 * @result_max: maximum result size
791 * @weight: weight vector (for map leaves) 854 * @weight: weight vector (for map leaves)
792 * @weight_max: size of weight vector 855 * @weight_max: size of weight vector
793 * @scratch: scratch vector for private use; must be >= 3 * result_max 856 * @cwin: pointer to at least crush_work_size() bytes of memory
794 */ 857 */
795int crush_do_rule(const struct crush_map *map, 858int crush_do_rule(const struct crush_map *map,
796 int ruleno, int x, int *result, int result_max, 859 int ruleno, int x, int *result, int result_max,
797 const __u32 *weight, int weight_max, 860 const __u32 *weight, int weight_max,
798 int *scratch) 861 void *cwin)
799{ 862{
800 int result_len; 863 int result_len;
801 int *a = scratch; 864 struct crush_work *cw = cwin;
802 int *b = scratch + result_max; 865 int *a = cwin + map->working_size;
803 int *c = scratch + result_max*2; 866 int *b = a + result_max;
867 int *c = b + result_max;
868 int *w = a;
869 int *o = b;
804 int recurse_to_leaf; 870 int recurse_to_leaf;
805 int *w;
806 int wsize = 0; 871 int wsize = 0;
807 int *o;
808 int osize; 872 int osize;
809 int *tmp; 873 int *tmp;
810 struct crush_rule *rule; 874 const struct crush_rule *rule;
811 __u32 step; 875 __u32 step;
812 int i, j; 876 int i, j;
813 int numrep; 877 int numrep;
@@ -835,12 +899,10 @@ int crush_do_rule(const struct crush_map *map,
835 899
836 rule = map->rules[ruleno]; 900 rule = map->rules[ruleno];
837 result_len = 0; 901 result_len = 0;
838 w = a;
839 o = b;
840 902
841 for (step = 0; step < rule->len; step++) { 903 for (step = 0; step < rule->len; step++) {
842 int firstn = 0; 904 int firstn = 0;
843 struct crush_rule_step *curstep = &rule->steps[step]; 905 const struct crush_rule_step *curstep = &rule->steps[step];
844 906
845 switch (curstep->op) { 907 switch (curstep->op) {
846 case CRUSH_RULE_TAKE: 908 case CRUSH_RULE_TAKE:
@@ -936,6 +998,7 @@ int crush_do_rule(const struct crush_map *map,
936 recurse_tries = choose_tries; 998 recurse_tries = choose_tries;
937 osize += crush_choose_firstn( 999 osize += crush_choose_firstn(
938 map, 1000 map,
1001 cw,
939 map->buckets[bno], 1002 map->buckets[bno],
940 weight, weight_max, 1003 weight, weight_max,
941 x, numrep, 1004 x, numrep,
@@ -956,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map,
956 numrep : (result_max-osize)); 1019 numrep : (result_max-osize));
957 crush_choose_indep( 1020 crush_choose_indep(
958 map, 1021 map,
1022 cw,
959 map->buckets[bno], 1023 map->buckets[bno],
960 weight, weight_max, 1024 weight, weight_max,
961 x, out_size, numrep, 1025 x, out_size, numrep,
@@ -997,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map,
997 break; 1061 break;
998 } 1062 }
999 } 1063 }
1064
1000 return result_len; 1065 return result_len;
1001} 1066}
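The mapper changes above replace the old integer scratch vector with a caller-owned workspace: crush_work_size() bytes are allocated once per map, primed with crush_init_workspace(), and handed to every crush_do_rule() call (the three result_max-sized scratch vectors now live past map->working_size inside that same buffer). A minimal caller-side sketch, assuming the crush headers and a one-off allocation; the in-tree caller caches the workspace in the osdmap instead (see osdmap_set_crush() below):

	static int map_pg_sketch(const struct crush_map *map, int ruleno, int x,
				 const __u32 *weight, int weight_max, int *result)
	{
		size_t work_size = crush_work_size(map, CEPH_PG_MAX_SIZE);
		void *workspace = kmalloc(work_size, GFP_NOIO);	/* GFP flag illustrative */
		int len;

		if (!workspace)
			return -ENOMEM;

		crush_init_workspace(map, workspace);	/* required before the first crush_do_rule() */
		len = crush_do_rule(map, ruleno, x, result, CEPH_PG_MAX_SIZE,
				    weight, weight_max, workspace);
		kfree(workspace);
		return len;
	}

Callers that keep the workspace around must either serialize access to it (as do_crush() does with crush_workspace_mutex) or keep it per-thread, since bucket_perm_choose() writes into the per-bucket permutation state.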
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 292e33bd916e..46008d5ac504 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -3,10 +3,12 @@
3 3
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/scatterlist.h> 5#include <linux/scatterlist.h>
6#include <linux/sched.h>
6#include <linux/slab.h> 7#include <linux/slab.h>
7#include <crypto/aes.h> 8#include <crypto/aes.h>
8#include <crypto/skcipher.h> 9#include <crypto/skcipher.h>
9#include <linux/key-type.h> 10#include <linux/key-type.h>
11#include <linux/sched/mm.h>
10 12
11#include <keys/ceph-type.h> 13#include <keys/ceph-type.h>
12#include <keys/user-type.h> 14#include <keys/user-type.h>
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 770c52701efa..f76bb3332613 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -7,6 +7,7 @@
7#include <linux/kthread.h> 7#include <linux/kthread.h>
8#include <linux/net.h> 8#include <linux/net.h>
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/sched/mm.h>
10#include <linux/slab.h> 11#include <linux/slab.h>
11#include <linux/socket.h> 12#include <linux/socket.h>
12#include <linux/string.h> 13#include <linux/string.h>
@@ -469,11 +470,16 @@ static int ceph_tcp_connect(struct ceph_connection *con)
469{ 470{
470 struct sockaddr_storage *paddr = &con->peer_addr.in_addr; 471 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
471 struct socket *sock; 472 struct socket *sock;
473 unsigned int noio_flag;
472 int ret; 474 int ret;
473 475
474 BUG_ON(con->sock); 476 BUG_ON(con->sock);
477
478 /* sock_create_kern() allocates with GFP_KERNEL */
479 noio_flag = memalloc_noio_save();
475 ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family, 480 ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family,
476 SOCK_STREAM, IPPROTO_TCP, &sock); 481 SOCK_STREAM, IPPROTO_TCP, &sock);
482 memalloc_noio_restore(noio_flag);
477 if (ret) 483 if (ret)
478 return ret; 484 return ret;
479 sock->sk->sk_allocation = GFP_NOFS; 485 sock->sk->sk_allocation = GFP_NOFS;
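The memalloc_noio_save()/memalloc_noio_restore() pair scopes the GFP_KERNEL allocations made inside sock_create_kern() so they behave as GFP_NOIO, presumably to keep socket creation from recursing into I/O while the messenger is reconnecting on behalf of reclaim or writeback. The pattern in isolation, with a hypothetical callee standing in for sock_create_kern():

	unsigned int noio_flag;
	int ret;

	noio_flag = memalloc_noio_save();	/* allocations below implicitly lose __GFP_IO */
	ret = make_gfp_kernel_allocation();	/* placeholder for any GFP_KERNEL user */
	memalloc_noio_restore(noio_flag);	/* restore the task's previous allocation mask */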
@@ -520,7 +526,8 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
520 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; 526 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
521 int r; 527 int r;
522 528
523 r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); 529 iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len);
530 r = sock_recvmsg(sock, &msg, msg.msg_flags);
524 if (r == -EAGAIN) 531 if (r == -EAGAIN)
525 r = 0; 532 r = 0;
526 return r; 533 return r;
@@ -529,17 +536,20 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
529static int ceph_tcp_recvpage(struct socket *sock, struct page *page, 536static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
530 int page_offset, size_t length) 537 int page_offset, size_t length)
531{ 538{
532 void *kaddr; 539 struct bio_vec bvec = {
533 int ret; 540 .bv_page = page,
541 .bv_offset = page_offset,
542 .bv_len = length
543 };
544 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
545 int r;
534 546
535 BUG_ON(page_offset + length > PAGE_SIZE); 547 BUG_ON(page_offset + length > PAGE_SIZE);
536 548 iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, &bvec, 1, length);
537 kaddr = kmap(page); 549 r = sock_recvmsg(sock, &msg, msg.msg_flags);
538 BUG_ON(!kaddr); 550 if (r == -EAGAIN)
539 ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length); 551 r = 0;
540 kunmap(page); 552 return r;
541
542 return ret;
543} 553}
544 554
545/* 555/*
@@ -579,18 +589,28 @@ static int __ceph_tcp_sendpage(struct socket *sock, struct page *page,
579static int ceph_tcp_sendpage(struct socket *sock, struct page *page, 589static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
580 int offset, size_t size, bool more) 590 int offset, size_t size, bool more)
581{ 591{
592 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
593 struct bio_vec bvec;
582 int ret; 594 int ret;
583 struct kvec iov;
584 595
585 /* sendpage cannot properly handle pages with page_count == 0, 596 /* sendpage cannot properly handle pages with page_count == 0,
586 * we need to fallback to sendmsg if that's the case */ 597 * we need to fallback to sendmsg if that's the case */
587 if (page_count(page) >= 1) 598 if (page_count(page) >= 1)
588 return __ceph_tcp_sendpage(sock, page, offset, size, more); 599 return __ceph_tcp_sendpage(sock, page, offset, size, more);
589 600
590 iov.iov_base = kmap(page) + offset; 601 bvec.bv_page = page;
591 iov.iov_len = size; 602 bvec.bv_offset = offset;
592 ret = ceph_tcp_sendmsg(sock, &iov, 1, size, more); 603 bvec.bv_len = size;
593 kunmap(page); 604
605 if (more)
606 msg.msg_flags |= MSG_MORE;
607 else
608 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
609
610 iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, &bvec, 1, size);
611 ret = sock_sendmsg(sock, &msg);
612 if (ret == -EAGAIN)
613 ret = 0;
594 614
595 return ret; 615 return ret;
596} 616}
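With the kvec/kmap() fallback gone, both page helpers now describe the page directly with a bio_vec and let the socket layer walk it through the msghdr iterator. Condensed, the send-side fallback above reduces to the following (error handling trimmed; this mirrors the hunk rather than adding new API):

	struct bio_vec bvec = { .bv_page = page, .bv_offset = offset, .bv_len = size };
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
	int ret;

	msg.msg_flags |= more ? MSG_MORE : MSG_EOR;
	iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, &bvec, 1, size);
	ret = sock_sendmsg(sock, &msg);		/* -EAGAIN is treated as "wrote nothing" */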
@@ -3425,7 +3445,7 @@ static void ceph_msg_release(struct kref *kref)
3425struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) 3445struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
3426{ 3446{
3427 dout("%s %p (was %d)\n", __func__, msg, 3447 dout("%s %p (was %d)\n", __func__, msg,
3428 atomic_read(&msg->kref.refcount)); 3448 kref_read(&msg->kref));
3429 kref_get(&msg->kref); 3449 kref_get(&msg->kref);
3430 return msg; 3450 return msg;
3431} 3451}
@@ -3434,7 +3454,7 @@ EXPORT_SYMBOL(ceph_msg_get);
3434void ceph_msg_put(struct ceph_msg *msg) 3454void ceph_msg_put(struct ceph_msg *msg)
3435{ 3455{
3436 dout("%s %p (was %d)\n", __func__, msg, 3456 dout("%s %p (was %d)\n", __func__, msg,
3437 atomic_read(&msg->kref.refcount)); 3457 kref_read(&msg->kref));
3438 kref_put(&msg->kref, ceph_msg_release); 3458 kref_put(&msg->kref, ceph_msg_release);
3439} 3459}
3440EXPORT_SYMBOL(ceph_msg_put); 3460EXPORT_SYMBOL(ceph_msg_put);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 842f049abb86..e15ea9e4c495 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -438,7 +438,7 @@ static void ceph_osdc_release_request(struct kref *kref)
438void ceph_osdc_get_request(struct ceph_osd_request *req) 438void ceph_osdc_get_request(struct ceph_osd_request *req)
439{ 439{
440 dout("%s %p (was %d)\n", __func__, req, 440 dout("%s %p (was %d)\n", __func__, req,
441 atomic_read(&req->r_kref.refcount)); 441 kref_read(&req->r_kref));
442 kref_get(&req->r_kref); 442 kref_get(&req->r_kref);
443} 443}
444EXPORT_SYMBOL(ceph_osdc_get_request); 444EXPORT_SYMBOL(ceph_osdc_get_request);
@@ -447,7 +447,7 @@ void ceph_osdc_put_request(struct ceph_osd_request *req)
447{ 447{
448 if (req) { 448 if (req) {
449 dout("%s %p (was %d)\n", __func__, req, 449 dout("%s %p (was %d)\n", __func__, req,
450 atomic_read(&req->r_kref.refcount)); 450 kref_read(&req->r_kref));
451 kref_put(&req->r_kref, ceph_osdc_release_request); 451 kref_put(&req->r_kref, ceph_osdc_release_request);
452 } 452 }
453} 453}
@@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req)
460 460
461 kref_init(&req->r_kref); 461 kref_init(&req->r_kref);
462 init_completion(&req->r_completion); 462 init_completion(&req->r_completion);
463 init_completion(&req->r_done_completion);
464 RB_CLEAR_NODE(&req->r_node); 463 RB_CLEAR_NODE(&req->r_node);
465 RB_CLEAR_NODE(&req->r_mc_node); 464 RB_CLEAR_NODE(&req->r_mc_node);
466 INIT_LIST_HEAD(&req->r_unsafe_item); 465 INIT_LIST_HEAD(&req->r_unsafe_item);
@@ -487,11 +486,11 @@ static void request_reinit(struct ceph_osd_request *req)
487 struct ceph_msg *reply_msg = req->r_reply; 486 struct ceph_msg *reply_msg = req->r_reply;
488 487
489 dout("%s req %p\n", __func__, req); 488 dout("%s req %p\n", __func__, req);
490 WARN_ON(atomic_read(&req->r_kref.refcount) != 1); 489 WARN_ON(kref_read(&req->r_kref) != 1);
491 request_release_checks(req); 490 request_release_checks(req);
492 491
493 WARN_ON(atomic_read(&request_msg->kref.refcount) != 1); 492 WARN_ON(kref_read(&request_msg->kref) != 1);
494 WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1); 493 WARN_ON(kref_read(&reply_msg->kref) != 1);
495 target_destroy(&req->r_t); 494 target_destroy(&req->r_t);
496 495
497 request_init(req); 496 request_init(req);
@@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
672 BUG_ON(length > previous); 671 BUG_ON(length > previous);
673 672
674 op->extent.length = length; 673 op->extent.length = length;
675 op->indata_len -= previous - length; 674 if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
675 op->indata_len -= previous - length;
676} 676}
677EXPORT_SYMBOL(osd_req_op_extent_update); 677EXPORT_SYMBOL(osd_req_op_extent_update);
678 678
@@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
1636 bool need_send = false; 1636 bool need_send = false;
1637 bool promoted = false; 1637 bool promoted = false;
1638 1638
1639 WARN_ON(req->r_tid || req->r_got_reply); 1639 WARN_ON(req->r_tid);
1640 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); 1640 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
1641 1641
1642again: 1642again:
@@ -1704,18 +1704,13 @@ promote:
1704 1704
1705static void account_request(struct ceph_osd_request *req) 1705static void account_request(struct ceph_osd_request *req)
1706{ 1706{
1707 unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; 1707 WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
1708 1708 WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
1709 if (req->r_flags & CEPH_OSD_FLAG_READ) {
1710 WARN_ON(req->r_flags & mask);
1711 req->r_flags |= CEPH_OSD_FLAG_ACK;
1712 } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
1713 WARN_ON(!(req->r_flags & mask));
1714 else
1715 WARN_ON(1);
1716 1709
1717 WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); 1710 req->r_flags |= CEPH_OSD_FLAG_ONDISK;
1718 atomic_inc(&req->r_osdc->num_requests); 1711 atomic_inc(&req->r_osdc->num_requests);
1712
1713 req->r_start_stamp = jiffies;
1719} 1714}
1720 1715
1721static void submit_request(struct ceph_osd_request *req, bool wrlocked) 1716static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -1749,15 +1744,15 @@ static void finish_request(struct ceph_osd_request *req)
1749 1744
1750static void __complete_request(struct ceph_osd_request *req) 1745static void __complete_request(struct ceph_osd_request *req)
1751{ 1746{
1752 if (req->r_callback) 1747 if (req->r_callback) {
1748 dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
1749 req->r_tid, req->r_callback, req->r_result);
1753 req->r_callback(req); 1750 req->r_callback(req);
1754 else 1751 }
1755 complete_all(&req->r_completion);
1756} 1752}
1757 1753
1758/* 1754/*
1759 * Note that this is open-coded in handle_reply(), which has to deal 1755 * This is open-coded in handle_reply().
1760 * with ack vs commit, dup acks, etc.
1761 */ 1756 */
1762static void complete_request(struct ceph_osd_request *req, int err) 1757static void complete_request(struct ceph_osd_request *req, int err)
1763{ 1758{
@@ -1766,7 +1761,7 @@ static void complete_request(struct ceph_osd_request *req, int err)
1766 req->r_result = err; 1761 req->r_result = err;
1767 finish_request(req); 1762 finish_request(req);
1768 __complete_request(req); 1763 __complete_request(req);
1769 complete_all(&req->r_done_completion); 1764 complete_all(&req->r_completion);
1770 ceph_osdc_put_request(req); 1765 ceph_osdc_put_request(req);
1771} 1766}
1772 1767
@@ -1792,10 +1787,18 @@ static void cancel_request(struct ceph_osd_request *req)
1792 1787
1793 cancel_map_check(req); 1788 cancel_map_check(req);
1794 finish_request(req); 1789 finish_request(req);
1795 complete_all(&req->r_done_completion); 1790 complete_all(&req->r_completion);
1796 ceph_osdc_put_request(req); 1791 ceph_osdc_put_request(req);
1797} 1792}
1798 1793
1794static void abort_request(struct ceph_osd_request *req, int err)
1795{
1796 dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
1797
1798 cancel_map_check(req);
1799 complete_request(req, err);
1800}
1801
1799static void check_pool_dne(struct ceph_osd_request *req) 1802static void check_pool_dne(struct ceph_osd_request *req)
1800{ 1803{
1801 struct ceph_osd_client *osdc = req->r_osdc; 1804 struct ceph_osd_client *osdc = req->r_osdc;
@@ -2169,7 +2172,6 @@ static void linger_commit_cb(struct ceph_osd_request *req)
2169 mutex_lock(&lreq->lock); 2172 mutex_lock(&lreq->lock);
2170 dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, 2173 dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
2171 lreq->linger_id, req->r_result); 2174 lreq->linger_id, req->r_result);
2172 WARN_ON(!__linger_registered(lreq));
2173 linger_reg_commit_complete(lreq, req->r_result); 2175 linger_reg_commit_complete(lreq, req->r_result);
2174 lreq->committed = true; 2176 lreq->committed = true;
2175 2177
@@ -2495,6 +2497,7 @@ static void handle_timeout(struct work_struct *work)
2495 container_of(work, struct ceph_osd_client, timeout_work.work); 2497 container_of(work, struct ceph_osd_client, timeout_work.work);
2496 struct ceph_options *opts = osdc->client->options; 2498 struct ceph_options *opts = osdc->client->options;
2497 unsigned long cutoff = jiffies - opts->osd_keepalive_timeout; 2499 unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
2500 unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
2498 LIST_HEAD(slow_osds); 2501 LIST_HEAD(slow_osds);
2499 struct rb_node *n, *p; 2502 struct rb_node *n, *p;
2500 2503
@@ -2510,15 +2513,23 @@ static void handle_timeout(struct work_struct *work)
2510 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); 2513 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
2511 bool found = false; 2514 bool found = false;
2512 2515
2513 for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { 2516 for (p = rb_first(&osd->o_requests); p; ) {
2514 struct ceph_osd_request *req = 2517 struct ceph_osd_request *req =
2515 rb_entry(p, struct ceph_osd_request, r_node); 2518 rb_entry(p, struct ceph_osd_request, r_node);
2516 2519
2520 p = rb_next(p); /* abort_request() */
2521
2517 if (time_before(req->r_stamp, cutoff)) { 2522 if (time_before(req->r_stamp, cutoff)) {
2518 dout(" req %p tid %llu on osd%d is laggy\n", 2523 dout(" req %p tid %llu on osd%d is laggy\n",
2519 req, req->r_tid, osd->o_osd); 2524 req, req->r_tid, osd->o_osd);
2520 found = true; 2525 found = true;
2521 } 2526 }
2527 if (opts->osd_request_timeout &&
2528 time_before(req->r_start_stamp, expiry_cutoff)) {
2529 pr_err_ratelimited("tid %llu on osd%d timeout\n",
2530 req->r_tid, osd->o_osd);
2531 abort_request(req, -ETIMEDOUT);
2532 }
2522 } 2533 }
2523 for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) { 2534 for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
2524 struct ceph_osd_linger_request *lreq = 2535 struct ceph_osd_linger_request *lreq =
@@ -2538,6 +2549,21 @@ static void handle_timeout(struct work_struct *work)
2538 list_move_tail(&osd->o_keepalive_item, &slow_osds); 2549 list_move_tail(&osd->o_keepalive_item, &slow_osds);
2539 } 2550 }
2540 2551
2552 if (opts->osd_request_timeout) {
2553 for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
2554 struct ceph_osd_request *req =
2555 rb_entry(p, struct ceph_osd_request, r_node);
2556
2557 p = rb_next(p); /* abort_request() */
2558
2559 if (time_before(req->r_start_stamp, expiry_cutoff)) {
2560 pr_err_ratelimited("tid %llu on osd%d timeout\n",
2561 req->r_tid, osdc->homeless_osd.o_osd);
2562 abort_request(req, -ETIMEDOUT);
2563 }
2564 }
2565 }
2566
2541 if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds)) 2567 if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
2542 maybe_request_map(osdc); 2568 maybe_request_map(osdc);
2543 2569
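The new expiry handling hinges on r_start_stamp, which account_request() now records at submission time. Each handle_timeout() pass applies the same check to per-OSD and homeless requests alike; in sketch form (osd_request_timeout is in jiffies here, and zero disables the check):

	unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;

	if (opts->osd_request_timeout &&
	    time_before(req->r_start_stamp, expiry_cutoff))
		abort_request(req, -ETIMEDOUT);	/* cancels the map check and completes with error */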
@@ -2785,31 +2811,8 @@ e_inval:
2785} 2811}
2786 2812
2787/* 2813/*
2788 * We are done with @req if 2814 * Handle MOSDOpReply. Set ->r_result and call the callback if it is
2789 * - @m is a safe reply, or 2815 * specified.
2790 * - @m is an unsafe reply and we didn't want a safe one
2791 */
2792static bool done_request(const struct ceph_osd_request *req,
2793 const struct MOSDOpReply *m)
2794{
2795 return (m->result < 0 ||
2796 (m->flags & CEPH_OSD_FLAG_ONDISK) ||
2797 !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
2798}
2799
2800/*
2801 * handle osd op reply. either call the callback if it is specified,
2802 * or do the completion to wake up the waiting thread.
2803 *
2804 * ->r_unsafe_callback is set? yes no
2805 *
2806 * first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
2807 * any or needed/got safe) r_done_completion r_done_completion
2808 *
2809 * first reply is unsafe r_unsafe_cb(true) (nothing)
2810 *
2811 * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
2812 * r_done_completion r_done_completion
2813 */ 2816 */
2814static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) 2817static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
2815{ 2818{
@@ -2818,7 +2821,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
2818 struct MOSDOpReply m; 2821 struct MOSDOpReply m;
2819 u64 tid = le64_to_cpu(msg->hdr.tid); 2822 u64 tid = le64_to_cpu(msg->hdr.tid);
2820 u32 data_len = 0; 2823 u32 data_len = 0;
2821 bool already_acked;
2822 int ret; 2824 int ret;
2823 int i; 2825 int i;
2824 2826
@@ -2897,50 +2899,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
2897 le32_to_cpu(msg->hdr.data_len), req->r_tid); 2899 le32_to_cpu(msg->hdr.data_len), req->r_tid);
2898 goto fail_request; 2900 goto fail_request;
2899 } 2901 }
2900 dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, 2902 dout("%s req %p tid %llu result %d data_len %u\n", __func__,
2901 req, req->r_tid, req->r_got_reply, m.result, data_len); 2903 req, req->r_tid, m.result, data_len);
2902
2903 already_acked = req->r_got_reply;
2904 if (!already_acked) {
2905 req->r_result = m.result ?: data_len;
2906 req->r_replay_version = m.replay_version; /* struct */
2907 req->r_got_reply = true;
2908 } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
2909 dout("req %p tid %llu dup ack\n", req, req->r_tid);
2910 goto out_unlock_session;
2911 }
2912
2913 if (done_request(req, &m)) {
2914 finish_request(req);
2915 if (req->r_linger) {
2916 WARN_ON(req->r_unsafe_callback);
2917 dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
2918 __complete_request(req);
2919 }
2920 }
2921 2904
2905 /*
2906 * Since we only ever request ONDISK, we should only ever get
2907 * one (type of) reply back.
2908 */
2909 WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
2910 req->r_result = m.result ?: data_len;
2911 finish_request(req);
2922 mutex_unlock(&osd->lock); 2912 mutex_unlock(&osd->lock);
2923 up_read(&osdc->lock); 2913 up_read(&osdc->lock);
2924 2914
2925 if (done_request(req, &m)) { 2915 __complete_request(req);
2926 if (already_acked && req->r_unsafe_callback) { 2916 complete_all(&req->r_completion);
2927 dout("req %p tid %llu safe-cb\n", req, req->r_tid); 2917 ceph_osdc_put_request(req);
2928 req->r_unsafe_callback(req, false);
2929 } else if (!req->r_linger) {
2930 dout("req %p tid %llu cb\n", req, req->r_tid);
2931 __complete_request(req);
2932 }
2933 complete_all(&req->r_done_completion);
2934 ceph_osdc_put_request(req);
2935 } else {
2936 if (req->r_unsafe_callback) {
2937 dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
2938 req->r_unsafe_callback(req, true);
2939 } else {
2940 WARN_ON(1);
2941 }
2942 }
2943
2944 return; 2918 return;
2945 2919
2946fail_request: 2920fail_request:
@@ -3540,7 +3514,7 @@ again:
3540 up_read(&osdc->lock); 3514 up_read(&osdc->lock);
3541 dout("%s waiting on req %p tid %llu last_tid %llu\n", 3515 dout("%s waiting on req %p tid %llu last_tid %llu\n",
3542 __func__, req, req->r_tid, last_tid); 3516 __func__, req, req->r_tid, last_tid);
3543 wait_for_completion(&req->r_done_completion); 3517 wait_for_completion(&req->r_completion);
3544 ceph_osdc_put_request(req); 3518 ceph_osdc_put_request(req);
3545 goto again; 3519 goto again;
3546 } 3520 }
@@ -3599,7 +3573,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
3599 3573
3600 ceph_oid_copy(&lreq->t.base_oid, oid); 3574 ceph_oid_copy(&lreq->t.base_oid, oid);
3601 ceph_oloc_copy(&lreq->t.base_oloc, oloc); 3575 ceph_oloc_copy(&lreq->t.base_oloc, oloc);
3602 lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 3576 lreq->t.flags = CEPH_OSD_FLAG_WRITE;
3603 lreq->mtime = CURRENT_TIME; 3577 lreq->mtime = CURRENT_TIME;
3604 3578
3605 lreq->reg_req = alloc_linger_request(lreq); 3579 lreq->reg_req = alloc_linger_request(lreq);
@@ -3657,7 +3631,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
3657 3631
3658 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); 3632 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
3659 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); 3633 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
3660 req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 3634 req->r_flags = CEPH_OSD_FLAG_WRITE;
3661 req->r_mtime = CURRENT_TIME; 3635 req->r_mtime = CURRENT_TIME;
3662 osd_req_op_watch_init(req, 0, lreq->linger_id, 3636 osd_req_op_watch_init(req, 0, lreq->linger_id,
3663 CEPH_OSD_WATCH_OP_UNWATCH); 3637 CEPH_OSD_WATCH_OP_UNWATCH);
@@ -4022,7 +3996,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
4022 * Execute an OSD class method on an object. 3996 * Execute an OSD class method on an object.
4023 * 3997 *
4024 * @flags: CEPH_OSD_FLAG_* 3998 * @flags: CEPH_OSD_FLAG_*
4025 * @resp_len: out param for reply length 3999 * @resp_len: in/out param for reply length
4026 */ 4000 */
4027int ceph_osdc_call(struct ceph_osd_client *osdc, 4001int ceph_osdc_call(struct ceph_osd_client *osdc,
4028 struct ceph_object_id *oid, 4002 struct ceph_object_id *oid,
@@ -4035,6 +4009,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
4035 struct ceph_osd_request *req; 4009 struct ceph_osd_request *req;
4036 int ret; 4010 int ret;
4037 4011
4012 if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
4013 return -E2BIG;
4014
4038 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); 4015 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
4039 if (!req) 4016 if (!req)
4040 return -ENOMEM; 4017 return -ENOMEM;
@@ -4053,7 +4030,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
4053 0, false, false); 4030 0, false, false);
4054 if (resp_page) 4031 if (resp_page)
4055 osd_req_op_cls_response_data_pages(req, 0, &resp_page, 4032 osd_req_op_cls_response_data_pages(req, 0, &resp_page,
4056 PAGE_SIZE, 0, false, false); 4033 *resp_len, 0, false, false);
4057 4034
4058 ceph_osdc_start_request(osdc, req, false); 4035 ceph_osdc_start_request(osdc, req, false);
4059 ret = ceph_osdc_wait_request(osdc, req); 4036 ret = ceph_osdc_wait_request(osdc, req);
@@ -4220,8 +4197,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
4220 int page_align = off & ~PAGE_MASK; 4197 int page_align = off & ~PAGE_MASK;
4221 4198
4222 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, 4199 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
4223 CEPH_OSD_OP_WRITE, 4200 CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
4224 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
4225 snapc, truncate_seq, truncate_size, 4201 snapc, truncate_seq, truncate_size,
4226 true); 4202 true);
4227 if (IS_ERR(req)) 4203 if (IS_ERR(req))
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index d2436880b305..ffe9e904d4d1 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -153,6 +153,32 @@ bad:
153 return -EINVAL; 153 return -EINVAL;
154} 154}
155 155
156static void crush_finalize(struct crush_map *c)
157{
158 __s32 b;
159
160 /* Space for the array of pointers to per-bucket workspace */
161 c->working_size = sizeof(struct crush_work) +
162 c->max_buckets * sizeof(struct crush_work_bucket *);
163
164 for (b = 0; b < c->max_buckets; b++) {
165 if (!c->buckets[b])
166 continue;
167
168 switch (c->buckets[b]->alg) {
169 default:
170 /*
171 * The base case, permutation variables and
172 * the pointer to the permutation array.
173 */
174 c->working_size += sizeof(struct crush_work_bucket);
175 break;
176 }
177 /* Every bucket has a permutation array. */
178 c->working_size += c->buckets[b]->size * sizeof(__u32);
179 }
180}
181
156static struct crush_map *crush_decode(void *pbyval, void *end) 182static struct crush_map *crush_decode(void *pbyval, void *end)
157{ 183{
158 struct crush_map *c; 184 struct crush_map *c;
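crush_finalize() records the footprint that crush_init_workspace() later carves up, so the two traversals must agree exactly. Schematically, for a map whose populated buckets b hold size_b items:

	working_size = sizeof(struct crush_work)
	             + max_buckets * sizeof(struct crush_work_bucket *)
	             + sum over populated b of (sizeof(struct crush_work_bucket)
	                                        + size_b * sizeof(__u32))

The per-bucket __u32 array is the permutation state that previously lived in b->perm, which is why the kcalloc() of b->perm disappears from crush_decode() in the next hunk.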
@@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
246 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); 272 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
247 if (b->items == NULL) 273 if (b->items == NULL)
248 goto badmem; 274 goto badmem;
249 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
250 if (b->perm == NULL)
251 goto badmem;
252 b->perm_n = 0;
253 275
254 ceph_decode_need(p, end, b->size*sizeof(u32), bad); 276 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
255 for (j = 0; j < b->size; j++) 277 for (j = 0; j < b->size; j++)
@@ -369,6 +391,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
369 c->chooseleaf_stable); 391 c->chooseleaf_stable);
370 392
371done: 393done:
394 crush_finalize(c);
372 dout("crush_decode success\n"); 395 dout("crush_decode success\n");
373 return c; 396 return c;
374 397
@@ -719,7 +742,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
719 map->pool_max = -1; 742 map->pool_max = -1;
720 map->pg_temp = RB_ROOT; 743 map->pg_temp = RB_ROOT;
721 map->primary_temp = RB_ROOT; 744 map->primary_temp = RB_ROOT;
722 mutex_init(&map->crush_scratch_mutex); 745 mutex_init(&map->crush_workspace_mutex);
723 746
724 return map; 747 return map;
725} 748}
@@ -753,6 +776,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
753 kfree(map->osd_weight); 776 kfree(map->osd_weight);
754 kfree(map->osd_addr); 777 kfree(map->osd_addr);
755 kfree(map->osd_primary_affinity); 778 kfree(map->osd_primary_affinity);
779 kfree(map->crush_workspace);
756 kfree(map); 780 kfree(map);
757} 781}
758 782
@@ -808,6 +832,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
808 return 0; 832 return 0;
809} 833}
810 834
835static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
836{
837 void *workspace;
838 size_t work_size;
839
840 if (IS_ERR(crush))
841 return PTR_ERR(crush);
842
843 work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
844 dout("%s work_size %zu bytes\n", __func__, work_size);
845 workspace = kmalloc(work_size, GFP_NOIO);
846 if (!workspace) {
847 crush_destroy(crush);
848 return -ENOMEM;
849 }
850 crush_init_workspace(crush, workspace);
851
852 if (map->crush)
853 crush_destroy(map->crush);
854 kfree(map->crush_workspace);
855 map->crush = crush;
856 map->crush_workspace = workspace;
857 return 0;
858}
859
811#define OSDMAP_WRAPPER_COMPAT_VER 7 860#define OSDMAP_WRAPPER_COMPAT_VER 7
812#define OSDMAP_CLIENT_DATA_COMPAT_VER 1 861#define OSDMAP_CLIENT_DATA_COMPAT_VER 1
813 862
@@ -1214,13 +1263,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
1214 1263
1215 /* crush */ 1264 /* crush */
1216 ceph_decode_32_safe(p, end, len, e_inval); 1265 ceph_decode_32_safe(p, end, len, e_inval);
1217 map->crush = crush_decode(*p, min(*p + len, end)); 1266 err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
1218 if (IS_ERR(map->crush)) { 1267 if (err)
1219 err = PTR_ERR(map->crush);
1220 map->crush = NULL;
1221 goto bad; 1268 goto bad;
1222 }
1223 *p += len;
1224 1269
1225 /* ignore the rest */ 1270 /* ignore the rest */
1226 *p = end; 1271 *p = end;
@@ -1334,7 +1379,6 @@ static int decode_new_up_state_weight(void **p, void *end,
1334 if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && 1379 if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
1335 (xorstate & CEPH_OSD_EXISTS)) { 1380 (xorstate & CEPH_OSD_EXISTS)) {
1336 pr_info("osd%d does not exist\n", osd); 1381 pr_info("osd%d does not exist\n", osd);
1337 map->osd_weight[osd] = CEPH_OSD_IN;
1338 ret = set_primary_affinity(map, osd, 1382 ret = set_primary_affinity(map, osd,
1339 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); 1383 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1340 if (ret) 1384 if (ret)
@@ -1375,7 +1419,6 @@ e_inval:
1375struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 1419struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1376 struct ceph_osdmap *map) 1420 struct ceph_osdmap *map)
1377{ 1421{
1378 struct crush_map *newcrush = NULL;
1379 struct ceph_fsid fsid; 1422 struct ceph_fsid fsid;
1380 u32 epoch = 0; 1423 u32 epoch = 0;
1381 struct ceph_timespec modified; 1424 struct ceph_timespec modified;
@@ -1414,12 +1457,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1414 /* new crush? */ 1457 /* new crush? */
1415 ceph_decode_32_safe(p, end, len, e_inval); 1458 ceph_decode_32_safe(p, end, len, e_inval);
1416 if (len > 0) { 1459 if (len > 0) {
1417 newcrush = crush_decode(*p, min(*p+len, end)); 1460 err = osdmap_set_crush(map,
1418 if (IS_ERR(newcrush)) { 1461 crush_decode(*p, min(*p + len, end)));
1419 err = PTR_ERR(newcrush); 1462 if (err)
1420 newcrush = NULL;
1421 goto bad; 1463 goto bad;
1422 }
1423 *p += len; 1464 *p += len;
1424 } 1465 }
1425 1466
@@ -1439,12 +1480,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1439 1480
1440 map->epoch++; 1481 map->epoch++;
1441 map->modified = modified; 1482 map->modified = modified;
1442 if (newcrush) {
1443 if (map->crush)
1444 crush_destroy(map->crush);
1445 map->crush = newcrush;
1446 newcrush = NULL;
1447 }
1448 1483
1449 /* new_pools */ 1484 /* new_pools */
1450 err = decode_new_pools(p, end, map); 1485 err = decode_new_pools(p, end, map);
@@ -1505,8 +1540,6 @@ bad:
1505 print_hex_dump(KERN_DEBUG, "osdmap: ", 1540 print_hex_dump(KERN_DEBUG, "osdmap: ",
1506 DUMP_PREFIX_OFFSET, 16, 1, 1541 DUMP_PREFIX_OFFSET, 16, 1,
1507 start, end - start, true); 1542 start, end - start, true);
1508 if (newcrush)
1509 crush_destroy(newcrush);
1510 return ERR_PTR(err); 1543 return ERR_PTR(err);
1511} 1544}
1512 1545
@@ -1942,10 +1975,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1942 1975
1943 BUG_ON(result_max > CEPH_PG_MAX_SIZE); 1976 BUG_ON(result_max > CEPH_PG_MAX_SIZE);
1944 1977
1945 mutex_lock(&map->crush_scratch_mutex); 1978 mutex_lock(&map->crush_workspace_mutex);
1946 r = crush_do_rule(map->crush, ruleno, x, result, result_max, 1979 r = crush_do_rule(map->crush, ruleno, x, result, result_max,
1947 weight, weight_max, map->crush_scratch_ary); 1980 weight, weight_max, map->crush_workspace);
1948 mutex_unlock(&map->crush_scratch_mutex); 1981 mutex_unlock(&map->crush_workspace_mutex);
1949 1982
1950 return r; 1983 return r;
1951} 1984}
@@ -1978,8 +2011,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
1978 return; 2011 return;
1979 } 2012 }
1980 2013
1981 len = do_crush(osdmap, ruleno, pps, raw->osds, 2014 if (pi->size > ARRAY_SIZE(raw->osds)) {
1982 min_t(int, pi->size, ARRAY_SIZE(raw->osds)), 2015 pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
2016 pi->id, pi->crush_ruleset, pi->type, pi->size,
2017 ARRAY_SIZE(raw->osds));
2018 return;
2019 }
2020
2021 len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
1983 osdmap->osd_weight, osdmap->max_osd); 2022 osdmap->osd_weight, osdmap->max_osd);
1984 if (len < 0) { 2023 if (len < 0) {
1985 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", 2024 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
index 154683f5f14c..705414e78ae0 100644
--- a/net/ceph/snapshot.c
+++ b/net/ceph/snapshot.c
@@ -18,8 +18,6 @@
18 * 02110-1301, USA. 18 * 02110-1301, USA.
19 */ 19 */
20 20
21#include <stddef.h>
22
23#include <linux/types.h> 21#include <linux/types.h>
24#include <linux/export.h> 22#include <linux/export.h>
25#include <linux/ceph/libceph.h> 23#include <linux/ceph/libceph.h>
diff --git a/net/compat.c b/net/compat.c
index 96c544b05b15..aba929e5250f 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -22,6 +22,7 @@
22#include <linux/filter.h> 22#include <linux/filter.h>
23#include <linux/compat.h> 23#include <linux/compat.h>
24#include <linux/security.h> 24#include <linux/security.h>
25#include <linux/audit.h>
25#include <linux/export.h> 26#include <linux/export.h>
26 27
27#include <net/scm.h> 28#include <net/scm.h>
@@ -90,11 +91,11 @@ int get_compat_msghdr(struct msghdr *kmsg,
90#define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32)) 91#define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32))
91 92
92#define CMSG_COMPAT_DATA(cmsg) \ 93#define CMSG_COMPAT_DATA(cmsg) \
93 ((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)))) 94 ((void __user *)((char __user *)(cmsg) + sizeof(struct compat_cmsghdr)))
94#define CMSG_COMPAT_SPACE(len) \ 95#define CMSG_COMPAT_SPACE(len) \
95 (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len)) 96 (sizeof(struct compat_cmsghdr) + CMSG_COMPAT_ALIGN(len))
96#define CMSG_COMPAT_LEN(len) \ 97#define CMSG_COMPAT_LEN(len) \
97 (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len)) 98 (sizeof(struct compat_cmsghdr) + (len))
98 99
99#define CMSG_COMPAT_FIRSTHDR(msg) \ 100#define CMSG_COMPAT_FIRSTHDR(msg) \
100 (((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? \ 101 (((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? \
@@ -130,6 +131,9 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
130 __kernel_size_t kcmlen, tmp; 131 __kernel_size_t kcmlen, tmp;
131 int err = -EFAULT; 132 int err = -EFAULT;
132 133
134 BUILD_BUG_ON(sizeof(struct compat_cmsghdr) !=
135 CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)));
136
133 kcmlen = 0; 137 kcmlen = 0;
134 kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf; 138 kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf;
135 ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); 139 ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
@@ -141,8 +145,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
141 if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) 145 if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
142 return -EINVAL; 146 return -EINVAL;
143 147
144 tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + 148 tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
145 CMSG_ALIGN(sizeof(struct cmsghdr)));
146 tmp = CMSG_ALIGN(tmp); 149 tmp = CMSG_ALIGN(tmp);
147 kcmlen += tmp; 150 kcmlen += tmp;
148 ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); 151 ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
@@ -168,8 +171,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
168 goto Efault; 171 goto Efault;
169 if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) 172 if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
170 goto Einval; 173 goto Einval;
171 tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + 174 tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
172 CMSG_ALIGN(sizeof(struct cmsghdr)));
173 if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) 175 if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
174 goto Einval; 176 goto Einval;
175 kcmsg->cmsg_len = tmp; 177 kcmsg->cmsg_len = tmp;
@@ -178,7 +180,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
178 __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) || 180 __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
179 copy_from_user(CMSG_DATA(kcmsg), 181 copy_from_user(CMSG_DATA(kcmsg),
180 CMSG_COMPAT_DATA(ucmsg), 182 CMSG_COMPAT_DATA(ucmsg),
181 (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))))) 183 (ucmlen - sizeof(*ucmsg))))
182 goto Efault; 184 goto Efault;
183 185
184 /* Advance. */ 186 /* Advance. */
@@ -781,14 +783,24 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
781 783
782COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) 784COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
783{ 785{
784 int ret; 786 u32 a[AUDITSC_ARGS];
785 u32 a[6]; 787 unsigned int len;
786 u32 a0, a1; 788 u32 a0, a1;
789 int ret;
787 790
788 if (call < SYS_SOCKET || call > SYS_SENDMMSG) 791 if (call < SYS_SOCKET || call > SYS_SENDMMSG)
789 return -EINVAL; 792 return -EINVAL;
790 if (copy_from_user(a, args, nas[call])) 793 len = nas[call];
794 if (len > sizeof(a))
795 return -EINVAL;
796
797 if (copy_from_user(a, args, len))
791 return -EFAULT; 798 return -EFAULT;
799
800 ret = audit_socketcall_compat(len / sizeof(a[0]), a);
801 if (ret)
802 return ret;
803
792 a0 = a[0]; 804 a0 = a[0];
793 a1 = a[1]; 805 a1 = a[1];
794 806
diff --git a/net/core/Makefile b/net/core/Makefile
index f6761b6e3b29..79f9479e9658 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -28,3 +28,4 @@ obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
28obj-$(CONFIG_DST_CACHE) += dst_cache.o 28obj-$(CONFIG_DST_CACHE) += dst_cache.o
29obj-$(CONFIG_HWBM) += hwbm.o 29obj-$(CONFIG_HWBM) += hwbm.o
30obj-$(CONFIG_NET_DEVLINK) += devlink.o 30obj-$(CONFIG_NET_DEVLINK) += devlink.o
31obj-$(CONFIG_GRO_CELLS) += gro_cells.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 662bea587165..f4947e737f34 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -332,7 +332,9 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
332EXPORT_SYMBOL(__skb_free_datagram_locked); 332EXPORT_SYMBOL(__skb_free_datagram_locked);
333 333
334int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, 334int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
335 unsigned int flags) 335 unsigned int flags,
336 void (*destructor)(struct sock *sk,
337 struct sk_buff *skb))
336{ 338{
337 int err = 0; 339 int err = 0;
338 340
@@ -342,6 +344,8 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
342 if (skb == skb_peek(&sk->sk_receive_queue)) { 344 if (skb == skb_peek(&sk->sk_receive_queue)) {
343 __skb_unlink(skb, &sk->sk_receive_queue); 345 __skb_unlink(skb, &sk->sk_receive_queue);
344 atomic_dec(&skb->users); 346 atomic_dec(&skb->users);
347 if (destructor)
348 destructor(sk, skb);
345 err = 0; 349 err = 0;
346 } 350 }
347 spin_unlock_bh(&sk->sk_receive_queue.lock); 351 spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -375,7 +379,7 @@ EXPORT_SYMBOL(__sk_queue_drop_skb);
375 379
376int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) 380int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
377{ 381{
378 int err = __sk_queue_drop_skb(sk, skb, flags); 382 int err = __sk_queue_drop_skb(sk, skb, flags, NULL);
379 383
380 kfree_skb(skb); 384 kfree_skb(skb);
381 sk_mem_reclaim_partial(sk); 385 sk_mem_reclaim_partial(sk);
@@ -394,7 +398,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
394 struct iov_iter *to, int len) 398 struct iov_iter *to, int len)
395{ 399{
396 int start = skb_headlen(skb); 400 int start = skb_headlen(skb);
397 int i, copy = start - offset; 401 int i, copy = start - offset, start_off = offset, n;
398 struct sk_buff *frag_iter; 402 struct sk_buff *frag_iter;
399 403
400 trace_skb_copy_datagram_iovec(skb, len); 404 trace_skb_copy_datagram_iovec(skb, len);
@@ -403,11 +407,12 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
403 if (copy > 0) { 407 if (copy > 0) {
404 if (copy > len) 408 if (copy > len)
405 copy = len; 409 copy = len;
406 if (copy_to_iter(skb->data + offset, copy, to) != copy) 410 n = copy_to_iter(skb->data + offset, copy, to);
411 offset += n;
412 if (n != copy)
407 goto short_copy; 413 goto short_copy;
408 if ((len -= copy) == 0) 414 if ((len -= copy) == 0)
409 return 0; 415 return 0;
410 offset += copy;
411 } 416 }
412 417
413 /* Copy paged appendix. Hmm... why does this look so complicated? */ 418 /* Copy paged appendix. Hmm... why does this look so complicated? */
@@ -421,13 +426,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
421 if ((copy = end - offset) > 0) { 426 if ((copy = end - offset) > 0) {
422 if (copy > len) 427 if (copy > len)
423 copy = len; 428 copy = len;
424 if (copy_page_to_iter(skb_frag_page(frag), 429 n = copy_page_to_iter(skb_frag_page(frag),
425 frag->page_offset + offset - 430 frag->page_offset + offset -
426 start, copy, to) != copy) 431 start, copy, to);
432 offset += n;
433 if (n != copy)
427 goto short_copy; 434 goto short_copy;
428 if (!(len -= copy)) 435 if (!(len -= copy))
429 return 0; 436 return 0;
430 offset += copy;
431 } 437 }
432 start = end; 438 start = end;
433 } 439 }
@@ -459,6 +465,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
459 */ 465 */
460 466
461fault: 467fault:
468 iov_iter_revert(to, offset - start_off);
462 return -EFAULT; 469 return -EFAULT;
463 470
464short_copy: 471short_copy:
@@ -609,7 +616,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
609 __wsum *csump) 616 __wsum *csump)
610{ 617{
611 int start = skb_headlen(skb); 618 int start = skb_headlen(skb);
612 int i, copy = start - offset; 619 int i, copy = start - offset, start_off = offset;
613 struct sk_buff *frag_iter; 620 struct sk_buff *frag_iter;
614 int pos = 0; 621 int pos = 0;
615 int n; 622 int n;
@@ -619,11 +626,11 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
619 if (copy > len) 626 if (copy > len)
620 copy = len; 627 copy = len;
621 n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to); 628 n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
629 offset += n;
622 if (n != copy) 630 if (n != copy)
623 goto fault; 631 goto fault;
624 if ((len -= copy) == 0) 632 if ((len -= copy) == 0)
625 return 0; 633 return 0;
626 offset += copy;
627 pos = copy; 634 pos = copy;
628 } 635 }
629 636
@@ -645,12 +652,12 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
645 offset - start, copy, 652 offset - start, copy,
646 &csum2, to); 653 &csum2, to);
647 kunmap(page); 654 kunmap(page);
655 offset += n;
648 if (n != copy) 656 if (n != copy)
649 goto fault; 657 goto fault;
650 *csump = csum_block_add(*csump, csum2, pos); 658 *csump = csum_block_add(*csump, csum2, pos);
651 if (!(len -= copy)) 659 if (!(len -= copy))
652 return 0; 660 return 0;
653 offset += copy;
654 pos += copy; 661 pos += copy;
655 } 662 }
656 start = end; 663 start = end;
@@ -683,6 +690,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
683 return 0; 690 return 0;
684 691
685fault: 692fault:
693 iov_iter_revert(to, offset - start_off);
686 return -EFAULT; 694 return -EFAULT;
687} 695}
688 696
@@ -767,6 +775,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
767 } 775 }
768 return 0; 776 return 0;
769csum_error: 777csum_error:
778 iov_iter_revert(&msg->msg_iter, chunk);
770 return -EINVAL; 779 return -EINVAL;
771fault: 780fault:
772 return -EFAULT; 781 return -EFAULT;
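Throughout these copy helpers, offset now advances by the byte count actually transferred (n) rather than by the amount requested, so at any error exit offset - start_off is exactly what the iterator consumed and the new iov_iter_revert() calls rewind msg->msg_iter to its position on entry (on csum_error the whole successfully-copied chunk is reverted, since the data was copied but failed verification). In sketch form, every error exit now follows this shape:

	fault:
		/* 'offset' was only advanced by bytes really copied, so this
		 * rewinds the iterator to where the caller handed it in */
		iov_iter_revert(to, offset - start_off);
		return -EFAULT;

Leaving the iterator untouched on failure matters when the same datagram is read again with the same msghdr, e.g. after a checksum failure or with MSG_PEEK.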
diff --git a/net/core/dev.c b/net/core/dev.c
index 7f218e095361..533a6d6f6092 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * NET3 Protocol independent device support routines. 2 * NET3 Protocol independent device support routines.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License 5 * modify it under the terms of the GNU General Public License
@@ -7,7 +7,7 @@
7 * 2 of the License, or (at your option) any later version. 7 * 2 of the License, or (at your option) any later version.
8 * 8 *
9 * Derived from the non IP parts of dev.c 1.0.19 9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro 10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * 13 *
@@ -21,9 +21,9 @@
21 * 21 *
22 * Changes: 22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set 23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called 24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a 25 * before net_dev_init & also removed a
26 * few lines of code in the process. 26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back. 27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant 28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe. 29 * stunts to keep the queue safe.
@@ -36,7 +36,7 @@
36 * Alan Cox : 100 backlog just doesn't cut it when 36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8) 37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager. 38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths. 39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass 40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler 41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before 42 * Alan Cox : Network driver sets packet type before
@@ -46,7 +46,7 @@
46 * Richard Kooijman: Timestamp fixes. 46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR 47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection. 48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close 49 * Alan Cox : Fixed nasty side effect of device close
50 * changes. 50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to 51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address() 52 * set_mac_address()
@@ -67,8 +67,8 @@
67 * Paul Rusty Russell : SIOCSIFNAME 67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code 68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait 69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt 70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling 71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback 72 * - netif_rx() feedback
73 */ 73 */
74 74
@@ -192,7 +192,8 @@ static seqcount_t devnet_rename_seq;
192 192
193static inline void dev_base_seq_inc(struct net *net) 193static inline void dev_base_seq_inc(struct net *net)
194{ 194{
195 while (++net->dev_base_seq == 0); 195 while (++net->dev_base_seq == 0)
196 ;
196} 197}
197 198
198static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) 199static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
@@ -274,8 +275,8 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class 275 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275 * according to dev->type 276 * according to dev->type
276 */ 277 */
277static const unsigned short netdev_lock_type[] = 278static const unsigned short netdev_lock_type[] = {
278 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, 279 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, 280 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, 281 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, 282 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
@@ -291,22 +292,22 @@ static const unsigned short netdev_lock_type[] =
291 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, 292 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; 293 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293 294
294static const char *const netdev_lock_name[] = 295static const char *const netdev_lock_name[] = {
295 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", 296 "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", 297 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", 298 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", 299 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", 300 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", 301 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", 302 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", 303 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", 304 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", 305 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", 306 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", 307 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", 308 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", 309 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; 310 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
310 311
311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; 312static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; 313static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
@@ -352,10 +353,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352#endif 353#endif
353 354
354/******************************************************************************* 355/*******************************************************************************
356 *
357 * Protocol management and registration routines
358 *
359 *******************************************************************************/
355 360
356 Protocol management and registration routines
357
358*******************************************************************************/
359 361
360/* 362/*
361 * Add a protocol ID to the list. Now that the input handler is 363 * Add a protocol ID to the list. Now that the input handler is
@@ -538,10 +540,10 @@ void dev_remove_offload(struct packet_offload *po)
538EXPORT_SYMBOL(dev_remove_offload); 540EXPORT_SYMBOL(dev_remove_offload);
539 541
540/****************************************************************************** 542/******************************************************************************
541 543 *
542 Device Boot-time Settings Routines 544 * Device Boot-time Settings Routines
543 545 *
544*******************************************************************************/ 546 ******************************************************************************/
545 547
546/* Boot time configuration table */ 548/* Boot time configuration table */
547static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; 549static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
@@ -574,13 +576,13 @@ static int netdev_boot_setup_add(char *name, struct ifmap *map)
574} 576}
575 577
576/** 578/**
577 * netdev_boot_setup_check - check boot time settings 579 * netdev_boot_setup_check - check boot time settings
578 * @dev: the netdevice 580 * @dev: the netdevice
579 * 581 *
580 * Check boot time settings for the device. 582 * Check boot time settings for the device.
581 * The found settings are set for the device to be used 583 * The found settings are set for the device to be used
582 * later in the device probing. 584 * later in the device probing.
583 * Returns 0 if no settings found, 1 if they are. 585 * Returns 0 if no settings found, 1 if they are.
584 */ 586 */
585int netdev_boot_setup_check(struct net_device *dev) 587int netdev_boot_setup_check(struct net_device *dev)
586{ 588{
@@ -590,10 +592,10 @@ int netdev_boot_setup_check(struct net_device *dev)
590 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 592 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && 593 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592 !strcmp(dev->name, s[i].name)) { 594 !strcmp(dev->name, s[i].name)) {
593 dev->irq = s[i].map.irq; 595 dev->irq = s[i].map.irq;
594 dev->base_addr = s[i].map.base_addr; 596 dev->base_addr = s[i].map.base_addr;
595 dev->mem_start = s[i].map.mem_start; 597 dev->mem_start = s[i].map.mem_start;
596 dev->mem_end = s[i].map.mem_end; 598 dev->mem_end = s[i].map.mem_end;
597 return 1; 599 return 1;
598 } 600 }
599 } 601 }
@@ -603,14 +605,14 @@ EXPORT_SYMBOL(netdev_boot_setup_check);
603 605
604 606
605/** 607/**
606 * netdev_boot_base - get address from boot time settings 608 * netdev_boot_base - get address from boot time settings
607 * @prefix: prefix for network device 609 * @prefix: prefix for network device
608 * @unit: id for network device 610 * @unit: id for network device
609 * 611 *
610 * Check boot time settings for the base address of device. 612 * Check boot time settings for the base address of device.
611 * The found settings are set for the device to be used 613 * The found settings are set for the device to be used
612 * later in the device probing. 614 * later in the device probing.
613 * Returns 0 if no settings found. 615 * Returns 0 if no settings found.
614 */ 616 */
615unsigned long netdev_boot_base(const char *prefix, int unit) 617unsigned long netdev_boot_base(const char *prefix, int unit)
616{ 618{
@@ -663,10 +665,10 @@ int __init netdev_boot_setup(char *str)
663__setup("netdev=", netdev_boot_setup); 665__setup("netdev=", netdev_boot_setup);
664 666
665/******************************************************************************* 667/*******************************************************************************
666 668 *
667 Device Interface Subroutines 669 * Device Interface Subroutines
668 670 *
669*******************************************************************************/ 671 *******************************************************************************/
670 672
671/** 673/**
672 * dev_get_iflink - get 'iflink' value of a interface 674 * dev_get_iflink - get 'iflink' value of a interface
@@ -737,15 +739,15 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name)
737EXPORT_SYMBOL(__dev_get_by_name); 739EXPORT_SYMBOL(__dev_get_by_name);
738 740
739/** 741/**
740 * dev_get_by_name_rcu - find a device by its name 742 * dev_get_by_name_rcu - find a device by its name
741 * @net: the applicable net namespace 743 * @net: the applicable net namespace
742 * @name: name to find 744 * @name: name to find
743 * 745 *
744 * Find an interface by name. 746 * Find an interface by name.
745 * If the name is found a pointer to the device is returned. 747 * If the name is found a pointer to the device is returned.
746 * If the name is not found then %NULL is returned. 748 * If the name is not found then %NULL is returned.
747 * The reference counters are not incremented so the caller must be 749 * The reference counters are not incremented so the caller must be
748 * careful with locks. The caller must hold RCU lock. 750 * careful with locks. The caller must hold RCU lock.
749 */ 751 */
750 752
751struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) 753struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
@@ -1289,8 +1291,8 @@ void netdev_state_change(struct net_device *dev)
1289EXPORT_SYMBOL(netdev_state_change); 1291EXPORT_SYMBOL(netdev_state_change);
1290 1292
1291/** 1293/**
1292 * netdev_notify_peers - notify network peers about existence of @dev 1294 * netdev_notify_peers - notify network peers about existence of @dev
1293 * @dev: network device 1295 * @dev: network device
1294 * 1296 *
1295 * Generate traffic such that interested network peers are aware of 1297 * Generate traffic such that interested network peers are aware of
1296 * @dev, such as by generating a gratuitous ARP. This may be used when 1298 * @dev, such as by generating a gratuitous ARP. This may be used when
@@ -1302,6 +1304,7 @@ void netdev_notify_peers(struct net_device *dev)
1302{ 1304{
1303 rtnl_lock(); 1305 rtnl_lock();
1304 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); 1306 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1307 call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1305 rtnl_unlock(); 1308 rtnl_unlock();
1306} 1309}
1307EXPORT_SYMBOL(netdev_notify_peers); 1310EXPORT_SYMBOL(netdev_notify_peers);
@@ -1518,17 +1521,17 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1518static int dev_boot_phase = 1; 1521static int dev_boot_phase = 1;
1519 1522
1520/** 1523/**
1521 * register_netdevice_notifier - register a network notifier block 1524 * register_netdevice_notifier - register a network notifier block
1522 * @nb: notifier 1525 * @nb: notifier
1523 * 1526 *
1524 * Register a notifier to be called when network device events occur. 1527 * Register a notifier to be called when network device events occur.
1525 * The notifier passed is linked into the kernel structures and must 1528 * The notifier passed is linked into the kernel structures and must
1526 * not be reused until it has been unregistered. A negative errno code 1529 * not be reused until it has been unregistered. A negative errno code
1527 * is returned on a failure. 1530 * is returned on a failure.
1528 * 1531 *
1529 * When registered all registration and up events are replayed 1532 * When registered all registration and up events are replayed
1530 * to the new notifier to allow device to have a race free 1533 * to the new notifier to allow device to have a race free
1531 * view of the network device list. 1534 * view of the network device list.
1532 */ 1535 */
1533 1536
1534int register_netdevice_notifier(struct notifier_block *nb) 1537int register_netdevice_notifier(struct notifier_block *nb)
@@ -1585,17 +1588,17 @@ outroll:
1585EXPORT_SYMBOL(register_netdevice_notifier); 1588EXPORT_SYMBOL(register_netdevice_notifier);
1586 1589
1587/** 1590/**
1588 * unregister_netdevice_notifier - unregister a network notifier block 1591 * unregister_netdevice_notifier - unregister a network notifier block
1589 * @nb: notifier 1592 * @nb: notifier
1590 * 1593 *
1591 * Unregister a notifier previously registered by 1594 * Unregister a notifier previously registered by
1592 * register_netdevice_notifier(). The notifier is unlinked into the 1595 * register_netdevice_notifier(). The notifier is unlinked into the
1593 * kernel structures and may then be reused. A negative errno code 1596 * kernel structures and may then be reused. A negative errno code
1594 * is returned on a failure. 1597 * is returned on a failure.
1595 * 1598 *
1596 * After unregistering unregister and down device events are synthesized 1599 * After unregistering unregister and down device events are synthesized
1597 * for all devices on the device list to the removed notifier to remove 1600 * for all devices on the device list to the removed notifier to remove
1598 * the need for special case cleanup code. 1601 * the need for special case cleanup code.
1599 */ 1602 */
1600 1603
1601int unregister_netdevice_notifier(struct notifier_block *nb) 1604int unregister_netdevice_notifier(struct notifier_block *nb)
@@ -1695,37 +1698,59 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1695 1698
1696static struct static_key netstamp_needed __read_mostly; 1699static struct static_key netstamp_needed __read_mostly;
1697#ifdef HAVE_JUMP_LABEL 1700#ifdef HAVE_JUMP_LABEL
1698/* We are not allowed to call static_key_slow_dec() from irq context
1699 * If net_disable_timestamp() is called from irq context, defer the
1700 * static_key_slow_dec() calls.
1701 */
1702static atomic_t netstamp_needed_deferred; 1701static atomic_t netstamp_needed_deferred;
1702static atomic_t netstamp_wanted;
1703static void netstamp_clear(struct work_struct *work)
1704{
1705 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1706 int wanted;
1707
1708 wanted = atomic_add_return(deferred, &netstamp_wanted);
1709 if (wanted > 0)
1710 static_key_enable(&netstamp_needed);
1711 else
1712 static_key_disable(&netstamp_needed);
1713}
1714static DECLARE_WORK(netstamp_work, netstamp_clear);
1703#endif 1715#endif
1704 1716
1705void net_enable_timestamp(void) 1717void net_enable_timestamp(void)
1706{ 1718{
1707#ifdef HAVE_JUMP_LABEL 1719#ifdef HAVE_JUMP_LABEL
1708 int deferred = atomic_xchg(&netstamp_needed_deferred, 0); 1720 int wanted;
1709 1721
1710 if (deferred) { 1722 while (1) {
1711 while (--deferred) 1723 wanted = atomic_read(&netstamp_wanted);
1712 static_key_slow_dec(&netstamp_needed); 1724 if (wanted <= 0)
1713 return; 1725 break;
1726 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1727 return;
1714 } 1728 }
1715#endif 1729 atomic_inc(&netstamp_needed_deferred);
1730 schedule_work(&netstamp_work);
1731#else
1716 static_key_slow_inc(&netstamp_needed); 1732 static_key_slow_inc(&netstamp_needed);
1733#endif
1717} 1734}
1718EXPORT_SYMBOL(net_enable_timestamp); 1735EXPORT_SYMBOL(net_enable_timestamp);
1719 1736
1720void net_disable_timestamp(void) 1737void net_disable_timestamp(void)
1721{ 1738{
1722#ifdef HAVE_JUMP_LABEL 1739#ifdef HAVE_JUMP_LABEL
1723 if (in_interrupt()) { 1740 int wanted;
1724 atomic_inc(&netstamp_needed_deferred); 1741
1725 return; 1742 while (1) {
1743 wanted = atomic_read(&netstamp_wanted);
1744 if (wanted <= 1)
1745 break;
1746 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1747 return;
1726 } 1748 }
1727#endif 1749 atomic_dec(&netstamp_needed_deferred);
1750 schedule_work(&netstamp_work);
1751#else
1728 static_key_slow_dec(&netstamp_needed); 1752 static_key_slow_dec(&netstamp_needed);
1753#endif
1729} 1754}
1730EXPORT_SYMBOL(net_disable_timestamp); 1755EXPORT_SYMBOL(net_disable_timestamp);
1731 1756
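The rewritten enable/disable paths above share one shape: bump or drop netstamp_wanted with cmpxchg only while that cannot change the static key, and otherwise defer the toggle to the work item. A hedged userspace sketch of the fast-path loop, using GCC atomics rather than the kernel's atomic_t API:

	#include <stdbool.h>

	static int wanted;	/* stands in for netstamp_wanted */

	/* Fast path: only increment while the count is already positive, i.e.
	 * the key is known to be enabled and no toggle is needed.
	 */
	static bool try_fast_inc(void)
	{
		int old = __atomic_load_n(&wanted, __ATOMIC_RELAXED);

		while (old > 0) {
			/* on failure, old is reloaded with the current value */
			if (__atomic_compare_exchange_n(&wanted, &old, old + 1, false,
							__ATOMIC_SEQ_CST, __ATOMIC_RELAXED))
				return true;
		}
		return false;	/* caller falls back to the deferred work path */
	}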
@@ -2408,28 +2433,6 @@ void netif_schedule_queue(struct netdev_queue *txq)
2408} 2433}
2409EXPORT_SYMBOL(netif_schedule_queue); 2434EXPORT_SYMBOL(netif_schedule_queue);
2410 2435
2411/**
2412 * netif_wake_subqueue - allow sending packets on subqueue
2413 * @dev: network device
2414 * @queue_index: sub queue index
2415 *
2416 * Resume individual transmit queue of a device with multiple transmit queues.
2417 */
2418void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2419{
2420 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2421
2422 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2423 struct Qdisc *q;
2424
2425 rcu_read_lock();
2426 q = rcu_dereference(txq->qdisc);
2427 __netif_schedule(q);
2428 rcu_read_unlock();
2429 }
2430}
2431EXPORT_SYMBOL(netif_wake_subqueue);
2432
2433void netif_tx_wake_queue(struct netdev_queue *dev_queue) 2436void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2434{ 2437{
2435 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { 2438 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
@@ -2523,6 +2526,7 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2523 2526
2524 if (dev->num_tc) { 2527 if (dev->num_tc) {
2525 u8 tc = netdev_get_prio_tc_map(dev, skb->priority); 2528 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2529
2526 qoffset = dev->tc_to_txq[tc].offset; 2530 qoffset = dev->tc_to_txq[tc].offset;
2527 qcount = dev->tc_to_txq[tc].count; 2531 qcount = dev->tc_to_txq[tc].count;
2528 } 2532 }
@@ -2659,9 +2663,10 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
2659static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2663static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2660{ 2664{
2661 if (tx_path) 2665 if (tx_path)
2662 return skb->ip_summed != CHECKSUM_PARTIAL; 2666 return skb->ip_summed != CHECKSUM_PARTIAL &&
2663 else 2667 skb->ip_summed != CHECKSUM_NONE;
2664 return skb->ip_summed == CHECKSUM_NONE; 2668
2669 return skb->ip_summed == CHECKSUM_NONE;
2665} 2670}
2666 2671
2667/** 2672/**
@@ -2680,11 +2685,12 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2680struct sk_buff *__skb_gso_segment(struct sk_buff *skb, 2685struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2681 netdev_features_t features, bool tx_path) 2686 netdev_features_t features, bool tx_path)
2682{ 2687{
2688 struct sk_buff *segs;
2689
2683 if (unlikely(skb_needs_check(skb, tx_path))) { 2690 if (unlikely(skb_needs_check(skb, tx_path))) {
2684 int err; 2691 int err;
2685 2692
2686 skb_warn_bad_offload(skb); 2693 /* We're going to init ->check field in TCP or UDP header */
2687
2688 err = skb_cow_head(skb, 0); 2694 err = skb_cow_head(skb, 0);
2689 if (err < 0) 2695 if (err < 0)
2690 return ERR_PTR(err); 2696 return ERR_PTR(err);
@@ -2712,7 +2718,12 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2712 skb_reset_mac_header(skb); 2718 skb_reset_mac_header(skb);
2713 skb_reset_mac_len(skb); 2719 skb_reset_mac_len(skb);
2714 2720
2715 return skb_mac_gso_segment(skb, features); 2721 segs = skb_mac_gso_segment(skb, features);
2722
2723 if (unlikely(skb_needs_check(skb, tx_path)))
2724 skb_warn_bad_offload(skb);
2725
2726 return segs;
2716} 2727}
2717EXPORT_SYMBOL(__skb_gso_segment); 2728EXPORT_SYMBOL(__skb_gso_segment);
2718 2729
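Read together, the two hunks above change the tx-path predicate so that CHECKSUM_NONE is also trusted before segmentation, and move the bad-offload warning until after skb_mac_gso_segment(). A small stand-alone restatement of the predicate (the enum values mirror the kernel's CHECKSUM_* constants, but treat them as illustrative):

	enum {
		CHECKSUM_NONE = 0,
		CHECKSUM_UNNECESSARY = 1,
		CHECKSUM_COMPLETE = 2,
		CHECKSUM_PARTIAL = 3,
	};

	/* tx: PARTIAL and NONE are both accepted; rx: only NONE needs a check */
	static int needs_check(int ip_summed, int tx_path)
	{
		if (tx_path)
			return ip_summed != CHECKSUM_PARTIAL &&
			       ip_summed != CHECKSUM_NONE;
		return ip_summed == CHECKSUM_NONE;
	}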
@@ -2737,9 +2748,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2737{ 2748{
2738#ifdef CONFIG_HIGHMEM 2749#ifdef CONFIG_HIGHMEM
2739 int i; 2750 int i;
2751
2740 if (!(dev->features & NETIF_F_HIGHDMA)) { 2752 if (!(dev->features & NETIF_F_HIGHDMA)) {
2741 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2753 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2742 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2754 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2755
2743 if (PageHighMem(skb_frag_page(frag))) 2756 if (PageHighMem(skb_frag_page(frag)))
2744 return 1; 2757 return 1;
2745 } 2758 }
@@ -2753,6 +2766,7 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2753 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2766 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2754 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2767 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2755 dma_addr_t addr = page_to_phys(skb_frag_page(frag)); 2768 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2769
2756 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) 2770 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2757 return 1; 2771 return 1;
2758 } 2772 }
@@ -3153,9 +3167,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3153 if (!cl) 3167 if (!cl)
3154 return skb; 3168 return skb;
3155 3169
3156 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set 3170 /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3157 * earlier by the caller.
3158 */
3159 qdisc_bstats_cpu_update(cl->q, skb); 3171 qdisc_bstats_cpu_update(cl->q, skb);
3160 3172
3161 switch (tc_classify(skb, cl, &cl_res, false)) { 3173 switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3230,6 +3242,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3230 if (queue_index < 0 || skb->ooo_okay || 3242 if (queue_index < 0 || skb->ooo_okay ||
3231 queue_index >= dev->real_num_tx_queues) { 3243 queue_index >= dev->real_num_tx_queues) {
3232 int new_index = get_xps_queue(dev, skb); 3244 int new_index = get_xps_queue(dev, skb);
3245
3233 if (new_index < 0) 3246 if (new_index < 0)
3234 new_index = skb_tx_hash(dev, skb); 3247 new_index = skb_tx_hash(dev, skb);
3235 3248
@@ -3259,6 +3272,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3259 3272
3260 if (dev->real_num_tx_queues != 1) { 3273 if (dev->real_num_tx_queues != 1) {
3261 const struct net_device_ops *ops = dev->netdev_ops; 3274 const struct net_device_ops *ops = dev->netdev_ops;
3275
3262 if (ops->ndo_select_queue) 3276 if (ops->ndo_select_queue)
3263 queue_index = ops->ndo_select_queue(dev, skb, accel_priv, 3277 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3264 __netdev_pick_tx); 3278 __netdev_pick_tx);
@@ -3320,7 +3334,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3320 3334
3321 qdisc_pkt_len_init(skb); 3335 qdisc_pkt_len_init(skb);
3322#ifdef CONFIG_NET_CLS_ACT 3336#ifdef CONFIG_NET_CLS_ACT
3323 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 3337 skb->tc_at_ingress = 0;
3324# ifdef CONFIG_NET_EGRESS 3338# ifdef CONFIG_NET_EGRESS
3325 if (static_key_false(&egress_needed)) { 3339 if (static_key_false(&egress_needed)) {
3326 skb = sch_handle_egress(skb, &rc, dev); 3340 skb = sch_handle_egress(skb, &rc, dev);
@@ -3347,16 +3361,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3347 } 3361 }
3348 3362
3349 /* The device has no queue. Common case for software devices: 3363 /* The device has no queue. Common case for software devices:
3350 loopback, all the sorts of tunnels... 3364 * loopback, all the sorts of tunnels...
3351 3365
3352 Really, it is unlikely that netif_tx_lock protection is necessary 3366 * Really, it is unlikely that netif_tx_lock protection is necessary
3353 here. (f.e. loopback and IP tunnels are clean ignoring statistics 3367 * here. (f.e. loopback and IP tunnels are clean ignoring statistics
3354 counters.) 3368 * counters.)
3355 However, it is possible, that they rely on protection 3369 * However, it is possible, that they rely on protection
3356 made by us here. 3370 * made by us here.
3357 3371
3358 Check this and shot the lock. It is not prone from deadlocks. 3372 * Check this and shot the lock. It is not prone from deadlocks.
 3359 Either shot noqueue qdisc, it is even simpler 8) 3373 * Either shot noqueue qdisc, it is even simpler 8)
3360 */ 3374 */
3361 if (dev->flags & IFF_UP) { 3375 if (dev->flags & IFF_UP) {
3362 int cpu = smp_processor_id(); /* ok because BHs are off */ 3376 int cpu = smp_processor_id(); /* ok because BHs are off */
@@ -3418,16 +3432,20 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3418EXPORT_SYMBOL(dev_queue_xmit_accel); 3432EXPORT_SYMBOL(dev_queue_xmit_accel);
3419 3433
3420 3434
3421/*======================================================================= 3435/*************************************************************************
3422 Receiver routines 3436 * Receiver routines
3423 =======================================================================*/ 3437 *************************************************************************/
3424 3438
3425int netdev_max_backlog __read_mostly = 1000; 3439int netdev_max_backlog __read_mostly = 1000;
3426EXPORT_SYMBOL(netdev_max_backlog); 3440EXPORT_SYMBOL(netdev_max_backlog);
3427 3441
3428int netdev_tstamp_prequeue __read_mostly = 1; 3442int netdev_tstamp_prequeue __read_mostly = 1;
3429int netdev_budget __read_mostly = 300; 3443int netdev_budget __read_mostly = 300;
3430int weight_p __read_mostly = 64; /* old backlog weight */ 3444int weight_p __read_mostly = 64; /* old backlog weight */
3445int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
3446int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
3447int dev_rx_weight __read_mostly = 64;
3448int dev_tx_weight __read_mostly = 64;
3431 3449
3432/* Called with irq disabled */ 3450/* Called with irq disabled */
3433static inline void ____napi_schedule(struct softnet_data *sd, 3451static inline void ____napi_schedule(struct softnet_data *sd,
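The new dev_weight_*_bias and dev_rx/tx_weight knobs above default to the old weight_p behaviour (64 with a bias of 1). How they combine is presumably handled by sysctl code outside this diff; a plausible sketch under that assumption:

	static int weight_p = 64;		/* old backlog weight */
	static int dev_weight_rx_bias = 1;	/* assumed multiplier for RX */
	static int dev_weight_tx_bias = 1;	/* assumed multiplier for TX */

	/* assumption: effective weights are weight_p scaled per direction */
	static void recompute_dev_weights(int *rx_weight, int *tx_weight)
	{
		*rx_weight = weight_p * dev_weight_rx_bias;
		*tx_weight = weight_p * dev_weight_tx_bias;
	}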
@@ -3784,6 +3802,7 @@ static int netif_rx_internal(struct sk_buff *skb)
3784#endif 3802#endif
3785 { 3803 {
3786 unsigned int qtail; 3804 unsigned int qtail;
3805
3787 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 3806 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3788 put_cpu(); 3807 put_cpu();
3789 } 3808 }
@@ -3843,6 +3862,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
3843 3862
3844 while (clist) { 3863 while (clist) {
3845 struct sk_buff *skb = clist; 3864 struct sk_buff *skb = clist;
3865
3846 clist = clist->next; 3866 clist = clist->next;
3847 3867
3848 WARN_ON(atomic_read(&skb->users)); 3868 WARN_ON(atomic_read(&skb->users));
@@ -3916,7 +3936,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3916 } 3936 }
3917 3937
3918 qdisc_skb_cb(skb)->pkt_len = skb->len; 3938 qdisc_skb_cb(skb)->pkt_len = skb->len;
3919 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 3939 skb->tc_at_ingress = 1;
3920 qdisc_bstats_cpu_update(cl->q, skb); 3940 qdisc_bstats_cpu_update(cl->q, skb);
3921 3941
3922 switch (tc_classify(skb, cl, &cl_res, false)) { 3942 switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3981,9 +4001,7 @@ int netdev_rx_handler_register(struct net_device *dev,
3981 rx_handler_func_t *rx_handler, 4001 rx_handler_func_t *rx_handler,
3982 void *rx_handler_data) 4002 void *rx_handler_data)
3983{ 4003{
3984 ASSERT_RTNL(); 4004 if (netdev_is_rx_handler_busy(dev))
3985
3986 if (dev->rx_handler)
3987 return -EBUSY; 4005 return -EBUSY;
3988 4006
3989 /* Note: rx_handler_data must be set before rx_handler */ 4007 /* Note: rx_handler_data must be set before rx_handler */
@@ -4089,12 +4107,8 @@ another_round:
4089 goto out; 4107 goto out;
4090 } 4108 }
4091 4109
4092#ifdef CONFIG_NET_CLS_ACT 4110 if (skb_skip_tc_classify(skb))
4093 if (skb->tc_verd & TC_NCLS) { 4111 goto skip_classify;
4094 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4095 goto ncls;
4096 }
4097#endif
4098 4112
4099 if (pfmemalloc) 4113 if (pfmemalloc)
4100 goto skip_taps; 4114 goto skip_taps;
@@ -4122,10 +4136,8 @@ skip_taps:
4122 goto out; 4136 goto out;
4123 } 4137 }
4124#endif 4138#endif
4125#ifdef CONFIG_NET_CLS_ACT 4139 skb_reset_tc(skb);
4126 skb->tc_verd = 0; 4140skip_classify:
4127ncls:
4128#endif
4129 if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 4141 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4130 goto drop; 4142 goto drop;
4131 4143
@@ -4526,6 +4538,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
4526 if (&ptype->list == head) 4538 if (&ptype->list == head)
4527 goto normal; 4539 goto normal;
4528 4540
4541 if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
4542 ret = GRO_CONSUMED;
4543 goto ok;
4544 }
4545
4529 same_flow = NAPI_GRO_CB(skb)->same_flow; 4546 same_flow = NAPI_GRO_CB(skb)->same_flow;
4530 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 4547 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4531 4548
@@ -4621,6 +4638,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4621 case GRO_MERGED_FREE: 4638 case GRO_MERGED_FREE:
4622 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { 4639 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4623 skb_dst_drop(skb); 4640 skb_dst_drop(skb);
4641 secpath_reset(skb);
4624 kmem_cache_free(skbuff_head_cache, skb); 4642 kmem_cache_free(skbuff_head_cache, skb);
4625 } else { 4643 } else {
4626 __kfree_skb(skb); 4644 __kfree_skb(skb);
@@ -4629,6 +4647,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4629 4647
4630 case GRO_HELD: 4648 case GRO_HELD:
4631 case GRO_MERGED: 4649 case GRO_MERGED:
4650 case GRO_CONSUMED:
4632 break; 4651 break;
4633 } 4652 }
4634 4653
@@ -4661,6 +4680,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4661 skb->encapsulation = 0; 4680 skb->encapsulation = 0;
4662 skb_shinfo(skb)->gso_type = 0; 4681 skb_shinfo(skb)->gso_type = 0;
4663 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 4682 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4683 secpath_reset(skb);
4664 4684
4665 napi->skb = skb; 4685 napi->skb = skb;
4666} 4686}
@@ -4699,6 +4719,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
4699 break; 4719 break;
4700 4720
4701 case GRO_MERGED: 4721 case GRO_MERGED:
4722 case GRO_CONSUMED:
4702 break; 4723 break;
4703 } 4724 }
4704 4725
@@ -4835,7 +4856,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
4835 net_rps_action_and_irq_enable(sd); 4856 net_rps_action_and_irq_enable(sd);
4836 } 4857 }
4837 4858
4838 napi->weight = weight_p; 4859 napi->weight = dev_rx_weight;
4839 while (again) { 4860 while (again) {
4840 struct sk_buff *skb; 4861 struct sk_buff *skb;
4841 4862
@@ -4891,6 +4912,39 @@ void __napi_schedule(struct napi_struct *n)
4891EXPORT_SYMBOL(__napi_schedule); 4912EXPORT_SYMBOL(__napi_schedule);
4892 4913
4893/** 4914/**
4915 * napi_schedule_prep - check if napi can be scheduled
4916 * @n: napi context
4917 *
4918 * Test if NAPI routine is already running, and if not mark
4919 * it as running. This is used as a condition variable
4920 * insure only one NAPI poll instance runs. We also make
4921 * sure there is no pending NAPI disable.
4922 */
4923bool napi_schedule_prep(struct napi_struct *n)
4924{
4925 unsigned long val, new;
4926
4927 do {
4928 val = READ_ONCE(n->state);
4929 if (unlikely(val & NAPIF_STATE_DISABLE))
4930 return false;
4931 new = val | NAPIF_STATE_SCHED;
4932
4933 /* Sets STATE_MISSED bit if STATE_SCHED was already set
4934 * This was suggested by Alexander Duyck, as compiler
4935 * emits better code than :
4936 * if (val & NAPIF_STATE_SCHED)
4937 * new |= NAPIF_STATE_MISSED;
4938 */
4939 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
4940 NAPIF_STATE_MISSED;
4941 } while (cmpxchg(&n->state, val, new) != val);
4942
4943 return !(val & NAPIF_STATE_SCHED);
4944}
4945EXPORT_SYMBOL(napi_schedule_prep);
4946
4947/**
4894 * __napi_schedule_irqoff - schedule for receive 4948 * __napi_schedule_irqoff - schedule for receive
4895 * @n: entry to schedule 4949 * @n: entry to schedule
4896 * 4950 *
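The branch-free line in napi_schedule_prep() above relies on the NAPIF_STATE_* values being single bits: dividing by the bit gives 0 or 1, which then scales NAPIF_STATE_MISSED. A tiny self-checking sketch with illustrative bit values:

	#include <assert.h>

	#define STATE_SCHED	(1UL << 0)	/* illustrative stand-ins for NAPIF_STATE_* */
	#define STATE_MISSED	(1UL << 1)

	/* equivalent to: if (val & STATE_SCHED) return STATE_MISSED; return 0; */
	static unsigned long missed_if_sched(unsigned long val)
	{
		return (val & STATE_SCHED) / STATE_SCHED * STATE_MISSED;
	}

	int main(void)
	{
		assert(missed_if_sched(0) == 0);
		assert(missed_if_sched(STATE_SCHED) == STATE_MISSED);
		return 0;
	}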
@@ -4902,26 +4956,9 @@ void __napi_schedule_irqoff(struct napi_struct *n)
4902} 4956}
4903EXPORT_SYMBOL(__napi_schedule_irqoff); 4957EXPORT_SYMBOL(__napi_schedule_irqoff);
4904 4958
4905bool __napi_complete(struct napi_struct *n)
4906{
4907 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4908
4909 /* Some drivers call us directly, instead of calling
4910 * napi_complete_done().
4911 */
4912 if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4913 return false;
4914
4915 list_del_init(&n->poll_list);
4916 smp_mb__before_atomic();
4917 clear_bit(NAPI_STATE_SCHED, &n->state);
4918 return true;
4919}
4920EXPORT_SYMBOL(__napi_complete);
4921
4922bool napi_complete_done(struct napi_struct *n, int work_done) 4959bool napi_complete_done(struct napi_struct *n, int work_done)
4923{ 4960{
4924 unsigned long flags; 4961 unsigned long flags, val, new;
4925 4962
4926 /* 4963 /*
4927 * 1) Don't let napi dequeue from the cpu poll list 4964 * 1) Don't let napi dequeue from the cpu poll list
@@ -4945,14 +4982,33 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
4945 else 4982 else
4946 napi_gro_flush(n, false); 4983 napi_gro_flush(n, false);
4947 } 4984 }
4948 if (likely(list_empty(&n->poll_list))) { 4985 if (unlikely(!list_empty(&n->poll_list))) {
4949 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4950 } else {
4951 /* If n->poll_list is not empty, we need to mask irqs */ 4986 /* If n->poll_list is not empty, we need to mask irqs */
4952 local_irq_save(flags); 4987 local_irq_save(flags);
4953 __napi_complete(n); 4988 list_del_init(&n->poll_list);
4954 local_irq_restore(flags); 4989 local_irq_restore(flags);
4955 } 4990 }
4991
4992 do {
4993 val = READ_ONCE(n->state);
4994
4995 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
4996
4997 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
4998
4999 /* If STATE_MISSED was set, leave STATE_SCHED set,
5000 * because we will call napi->poll() one more time.
5001 * This C code was suggested by Alexander Duyck to help gcc.
5002 */
5003 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5004 NAPIF_STATE_SCHED;
5005 } while (cmpxchg(&n->state, val, new) != val);
5006
5007 if (unlikely(val & NAPIF_STATE_MISSED)) {
5008 __napi_schedule(n);
5009 return false;
5010 }
5011
4956 return true; 5012 return true;
4957} 5013}
4958EXPORT_SYMBOL(napi_complete_done); 5014EXPORT_SYMBOL(napi_complete_done);
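The cmpxchg loop above boils down to: always clear MISSED, clear SCHED only if MISSED was not set, and reschedule when it was. A simplified single-threaded model of that decision (no atomics, illustrative bit values):

	#include <stdbool.h>

	#define STATE_SCHED	(1UL << 0)
	#define STATE_MISSED	(1UL << 1)

	/* returns true when the NAPI instance is really done; false means the
	 * caller keeps SCHED and polls again, mirroring the __napi_schedule()
	 */
	static bool complete_done(unsigned long *state)
	{
		bool missed = *state & STATE_MISSED;

		*state &= ~STATE_MISSED;
		if (missed)
			return false;
		*state &= ~STATE_SCHED;
		return true;
	}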
@@ -4978,6 +5034,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
4978{ 5034{
4979 int rc; 5035 int rc;
4980 5036
5037 /* Busy polling means there is a high chance device driver hard irq
5038 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5039 * set in napi_schedule_prep().
5040 * Since we are about to call napi->poll() once more, we can safely
5041 * clear NAPI_STATE_MISSED.
5042 *
5043 * Note: x86 could use a single "lock and ..." instruction
5044 * to perform these two clear_bit()
5045 */
5046 clear_bit(NAPI_STATE_MISSED, &napi->state);
4981 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); 5047 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
4982 5048
4983 local_bh_disable(); 5049 local_bh_disable();
@@ -4998,7 +5064,6 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
4998{ 5064{
4999 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; 5065 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5000 int (*napi_poll)(struct napi_struct *napi, int budget); 5066 int (*napi_poll)(struct napi_struct *napi, int budget);
5001 int (*busy_poll)(struct napi_struct *dev);
5002 void *have_poll_lock = NULL; 5067 void *have_poll_lock = NULL;
5003 struct napi_struct *napi; 5068 struct napi_struct *napi;
5004 int rc; 5069 int rc;
@@ -5013,17 +5078,10 @@ restart:
5013 if (!napi) 5078 if (!napi)
5014 goto out; 5079 goto out;
5015 5080
5016 /* Note: ndo_busy_poll method is optional in linux-4.5 */
5017 busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
5018
5019 preempt_disable(); 5081 preempt_disable();
5020 for (;;) { 5082 for (;;) {
5021 rc = 0; 5083 rc = 0;
5022 local_bh_disable(); 5084 local_bh_disable();
5023 if (busy_poll) {
5024 rc = busy_poll(napi);
5025 goto count;
5026 }
5027 if (!napi_poll) { 5085 if (!napi_poll) {
5028 unsigned long val = READ_ONCE(napi->state); 5086 unsigned long val = READ_ONCE(napi->state);
5029 5087
@@ -5048,9 +5106,6 @@ count:
5048 LINUX_MIB_BUSYPOLLRXPACKETS, rc); 5106 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5049 local_bh_enable(); 5107 local_bh_enable();
5050 5108
5051 if (rc == LL_FLUSH_FAILED)
5052 break; /* permanent failure */
5053
5054 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || 5109 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5055 busy_loop_timeout(end_time)) 5110 busy_loop_timeout(end_time))
5056 break; 5111 break;
@@ -5124,8 +5179,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5124 struct napi_struct *napi; 5179 struct napi_struct *napi;
5125 5180
5126 napi = container_of(timer, struct napi_struct, timer); 5181 napi = container_of(timer, struct napi_struct, timer);
5127 if (napi->gro_list) 5182
5128 napi_schedule(napi); 5183 /* Note : we use a relaxed variant of napi_schedule_prep() not setting
5184 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5185 */
5186 if (napi->gro_list && !napi_disable_pending(napi) &&
5187 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
5188 __napi_schedule_irqoff(napi);
5129 5189
5130 return HRTIMER_NORESTART; 5190 return HRTIMER_NORESTART;
5131} 5191}
@@ -5711,6 +5771,7 @@ static int netdev_adjacent_sysfs_add(struct net_device *dev,
5711 struct list_head *dev_list) 5771 struct list_head *dev_list)
5712{ 5772{
5713 char linkname[IFNAMSIZ+7]; 5773 char linkname[IFNAMSIZ+7];
5774
5714 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5775 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5715 "upper_%s" : "lower_%s", adj_dev->name); 5776 "upper_%s" : "lower_%s", adj_dev->name);
5716 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 5777 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
@@ -5721,6 +5782,7 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev,
5721 struct list_head *dev_list) 5782 struct list_head *dev_list)
5722{ 5783{
5723 char linkname[IFNAMSIZ+7]; 5784 char linkname[IFNAMSIZ+7];
5785
5724 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5786 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5725 "upper_%s" : "lower_%s", name); 5787 "upper_%s" : "lower_%s", name);
5726 sysfs_remove_link(&(dev->dev.kobj), linkname); 5788 sysfs_remove_link(&(dev->dev.kobj), linkname);
@@ -5990,6 +6052,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
5990 struct net_device *upper_dev) 6052 struct net_device *upper_dev)
5991{ 6053{
5992 struct netdev_notifier_changeupper_info changeupper_info; 6054 struct netdev_notifier_changeupper_info changeupper_info;
6055
5993 ASSERT_RTNL(); 6056 ASSERT_RTNL();
5994 6057
5995 changeupper_info.upper_dev = upper_dev; 6058 changeupper_info.upper_dev = upper_dev;
@@ -6156,50 +6219,6 @@ void netdev_lower_state_changed(struct net_device *lower_dev,
6156} 6219}
6157EXPORT_SYMBOL(netdev_lower_state_changed); 6220EXPORT_SYMBOL(netdev_lower_state_changed);
6158 6221
6159int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6160 struct neighbour *n)
6161{
6162 struct net_device *lower_dev, *stop_dev;
6163 struct list_head *iter;
6164 int err;
6165
6166 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6167 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6168 continue;
6169 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6170 if (err) {
6171 stop_dev = lower_dev;
6172 goto rollback;
6173 }
6174 }
6175 return 0;
6176
6177rollback:
6178 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6179 if (lower_dev == stop_dev)
6180 break;
6181 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6182 continue;
6183 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6184 }
6185 return err;
6186}
6187EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6188
6189void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6190 struct neighbour *n)
6191{
6192 struct net_device *lower_dev;
6193 struct list_head *iter;
6194
6195 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6196 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6197 continue;
6198 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6199 }
6200}
6201EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6202
6203static void dev_change_rx_flags(struct net_device *dev, int flags) 6222static void dev_change_rx_flags(struct net_device *dev, int flags)
6204{ 6223{
6205 const struct net_device_ops *ops = dev->netdev_ops; 6224 const struct net_device_ops *ops = dev->netdev_ops;
@@ -6452,8 +6471,8 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
6452 } 6471 }
6453 6472
6454 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 6473 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6455 is important. Some (broken) drivers set IFF_PROMISC, when 6474 * is important. Some (broken) drivers set IFF_PROMISC, when
6456 IFF_ALLMULTI is requested not asking us and not reporting. 6475 * IFF_ALLMULTI is requested not asking us and not reporting.
6457 */ 6476 */
6458 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 6477 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6459 int inc = (flags & IFF_ALLMULTI) ? 1 : -1; 6478 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
@@ -6738,7 +6757,6 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6738 6757
6739 return err; 6758 return err;
6740} 6759}
6741EXPORT_SYMBOL(dev_change_xdp_fd);
6742 6760
6743/** 6761/**
6744 * dev_new_index - allocate an ifindex 6762 * dev_new_index - allocate an ifindex
@@ -6751,6 +6769,7 @@ EXPORT_SYMBOL(dev_change_xdp_fd);
6751static int dev_new_index(struct net *net) 6769static int dev_new_index(struct net *net)
6752{ 6770{
6753 int ifindex = net->ifindex; 6771 int ifindex = net->ifindex;
6772
6754 for (;;) { 6773 for (;;) {
6755 if (++ifindex <= 0) 6774 if (++ifindex <= 0)
6756 ifindex = 1; 6775 ifindex = 1;
@@ -6817,8 +6836,8 @@ static void rollback_registered_many(struct list_head *head)
6817 6836
6818 6837
6819 /* Notify protocols, that we are about to destroy 6838 /* Notify protocols, that we are about to destroy
6820 this device. They should clean all the things. 6839 * this device. They should clean all the things.
6821 */ 6840 */
6822 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6841 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6823 6842
6824 if (!dev->rtnl_link_ops || 6843 if (!dev->rtnl_link_ops ||
@@ -6976,13 +6995,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
6976 features &= ~dev->gso_partial_features; 6995 features &= ~dev->gso_partial_features;
6977 } 6996 }
6978 6997
6979#ifdef CONFIG_NET_RX_BUSY_POLL
6980 if (dev->netdev_ops->ndo_busy_poll)
6981 features |= NETIF_F_BUSY_POLL;
6982 else
6983#endif
6984 features &= ~NETIF_F_BUSY_POLL;
6985
6986 return features; 6998 return features;
6987} 6999}
6988 7000
@@ -7171,6 +7183,7 @@ void netif_tx_stop_all_queues(struct net_device *dev)
7171 7183
7172 for (i = 0; i < dev->num_tx_queues; i++) { 7184 for (i = 0; i < dev->num_tx_queues; i++) {
7173 struct netdev_queue *txq = netdev_get_tx_queue(dev, i); 7185 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7186
7174 netif_tx_stop_queue(txq); 7187 netif_tx_stop_queue(txq);
7175 } 7188 }
7176} 7189}
@@ -7645,17 +7658,17 @@ void netdev_freemem(struct net_device *dev)
7645} 7658}
7646 7659
7647/** 7660/**
7648 * alloc_netdev_mqs - allocate network device 7661 * alloc_netdev_mqs - allocate network device
7649 * @sizeof_priv: size of private data to allocate space for 7662 * @sizeof_priv: size of private data to allocate space for
7650 * @name: device name format string 7663 * @name: device name format string
7651 * @name_assign_type: origin of device name 7664 * @name_assign_type: origin of device name
7652 * @setup: callback to initialize device 7665 * @setup: callback to initialize device
7653 * @txqs: the number of TX subqueues to allocate 7666 * @txqs: the number of TX subqueues to allocate
7654 * @rxqs: the number of RX subqueues to allocate 7667 * @rxqs: the number of RX subqueues to allocate
7655 * 7668 *
7656 * Allocates a struct net_device with private data area for driver use 7669 * Allocates a struct net_device with private data area for driver use
7657 * and performs basic initialization. Also allocates subqueue structs 7670 * and performs basic initialization. Also allocates subqueue structs
7658 * for each queue on the device. 7671 * for each queue on the device.
7659 */ 7672 */
7660struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, 7673struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7661 unsigned char name_assign_type, 7674 unsigned char name_assign_type,
@@ -7767,13 +7780,13 @@ free_dev:
7767EXPORT_SYMBOL(alloc_netdev_mqs); 7780EXPORT_SYMBOL(alloc_netdev_mqs);
7768 7781
7769/** 7782/**
7770 * free_netdev - free network device 7783 * free_netdev - free network device
7771 * @dev: device 7784 * @dev: device
7772 * 7785 *
7773 * This function does the last stage of destroying an allocated device 7786 * This function does the last stage of destroying an allocated device
7774 * interface. The reference to the device object is released. 7787 * interface. The reference to the device object is released. If this
7775 * If this is the last reference then it will be freed. 7788 * is the last reference then it will be freed.Must be called in process
7776 * Must be called in process context. 7789 * context.
7777 */ 7790 */
7778void free_netdev(struct net_device *dev) 7791void free_netdev(struct net_device *dev)
7779{ 7792{
@@ -7955,12 +7968,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
7955 dev_shutdown(dev); 7968 dev_shutdown(dev);
7956 7969
7957 /* Notify protocols, that we are about to destroy 7970 /* Notify protocols, that we are about to destroy
7958 this device. They should clean all the things. 7971 * this device. They should clean all the things.
7959 7972 *
7960 Note that dev->reg_state stays at NETREG_REGISTERED. 7973 * Note that dev->reg_state stays at NETREG_REGISTERED.
7961 This is wanted because this way 8021q and macvlan know 7974 * This is wanted because this way 8021q and macvlan know
7962 the device is just moving and can keep their slaves up. 7975 * the device is just moving and can keep their slaves up.
7963 */ 7976 */
7964 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7977 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7965 rcu_barrier(); 7978 rcu_barrier();
7966 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7979 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2b5bf9efa720..e9c1e6acfb6d 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1392,9 +1392,9 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
1392 return -EOPNOTSUPP; 1392 return -EOPNOTSUPP;
1393} 1393}
1394 1394
1395static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, 1395static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
1396 enum devlink_command cmd, u32 portid, 1396 enum devlink_command cmd, u32 portid,
1397 u32 seq, int flags) 1397 u32 seq, int flags)
1398{ 1398{
1399 const struct devlink_ops *ops = devlink->ops; 1399 const struct devlink_ops *ops = devlink->ops;
1400 void *hdr; 1400 void *hdr;
@@ -1408,50 +1408,52 @@ static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
1408 1408
1409 err = devlink_nl_put_handle(msg, devlink); 1409 err = devlink_nl_put_handle(msg, devlink);
1410 if (err) 1410 if (err)
1411 goto out; 1411 goto nla_put_failure;
1412 1412
1413 err = ops->eswitch_mode_get(devlink, &mode); 1413 if (ops->eswitch_mode_get) {
1414 if (err) 1414 err = ops->eswitch_mode_get(devlink, &mode);
1415 goto out; 1415 if (err)
1416 err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); 1416 goto nla_put_failure;
1417 if (err) 1417 err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
1418 goto out; 1418 if (err)
1419 goto nla_put_failure;
1420 }
1419 1421
1420 if (ops->eswitch_inline_mode_get) { 1422 if (ops->eswitch_inline_mode_get) {
1421 err = ops->eswitch_inline_mode_get(devlink, &inline_mode); 1423 err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
1422 if (err) 1424 if (err)
1423 goto out; 1425 goto nla_put_failure;
1424 err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, 1426 err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
1425 inline_mode); 1427 inline_mode);
1426 if (err) 1428 if (err)
1427 goto out; 1429 goto nla_put_failure;
1428 } 1430 }
1429 1431
1430 genlmsg_end(msg, hdr); 1432 genlmsg_end(msg, hdr);
1431 return 0; 1433 return 0;
1432 1434
1433out: 1435nla_put_failure:
1434 genlmsg_cancel(msg, hdr); 1436 genlmsg_cancel(msg, hdr);
1435 return err; 1437 return err;
1436} 1438}
1437 1439
1438static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, 1440static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
1439 struct genl_info *info) 1441 struct genl_info *info)
1440{ 1442{
1441 struct devlink *devlink = info->user_ptr[0]; 1443 struct devlink *devlink = info->user_ptr[0];
1442 const struct devlink_ops *ops = devlink->ops; 1444 const struct devlink_ops *ops = devlink->ops;
1443 struct sk_buff *msg; 1445 struct sk_buff *msg;
1444 int err; 1446 int err;
1445 1447
1446 if (!ops || !ops->eswitch_mode_get) 1448 if (!ops)
1447 return -EOPNOTSUPP; 1449 return -EOPNOTSUPP;
1448 1450
1449 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 1451 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1450 if (!msg) 1452 if (!msg)
1451 return -ENOMEM; 1453 return -ENOMEM;
1452 1454
1453 err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, 1455 err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
1454 info->snd_portid, info->snd_seq, 0); 1456 info->snd_portid, info->snd_seq, 0);
1455 1457
1456 if (err) { 1458 if (err) {
1457 nlmsg_free(msg); 1459 nlmsg_free(msg);
@@ -1461,8 +1463,8 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
1461 return genlmsg_reply(msg, info); 1463 return genlmsg_reply(msg, info);
1462} 1464}
1463 1465
1464static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, 1466static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
1465 struct genl_info *info) 1467 struct genl_info *info)
1466{ 1468{
1467 struct devlink *devlink = info->user_ptr[0]; 1469 struct devlink *devlink = info->user_ptr[0];
1468 const struct devlink_ops *ops = devlink->ops; 1470 const struct devlink_ops *ops = devlink->ops;
@@ -1629,15 +1631,15 @@ static const struct genl_ops devlink_nl_ops[] = {
1629 DEVLINK_NL_FLAG_LOCK_PORTS, 1631 DEVLINK_NL_FLAG_LOCK_PORTS,
1630 }, 1632 },
1631 { 1633 {
1632 .cmd = DEVLINK_CMD_ESWITCH_MODE_GET, 1634 .cmd = DEVLINK_CMD_ESWITCH_GET,
1633 .doit = devlink_nl_cmd_eswitch_mode_get_doit, 1635 .doit = devlink_nl_cmd_eswitch_get_doit,
1634 .policy = devlink_nl_policy, 1636 .policy = devlink_nl_policy,
1635 .flags = GENL_ADMIN_PERM, 1637 .flags = GENL_ADMIN_PERM,
1636 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, 1638 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
1637 }, 1639 },
1638 { 1640 {
1639 .cmd = DEVLINK_CMD_ESWITCH_MODE_SET, 1641 .cmd = DEVLINK_CMD_ESWITCH_SET,
1640 .doit = devlink_nl_cmd_eswitch_mode_set_doit, 1642 .doit = devlink_nl_cmd_eswitch_set_doit,
1641 .policy = devlink_nl_policy, 1643 .policy = devlink_nl_policy,
1642 .flags = GENL_ADMIN_PERM, 1644 .flags = GENL_ADMIN_PERM,
1643 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, 1645 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
diff --git a/net/core/dst.c b/net/core/dst.c
index b5cbbe07f786..960e503b5a52 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -190,7 +190,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
190 dst->__use = 0; 190 dst->__use = 0;
191 dst->lastuse = jiffies; 191 dst->lastuse = jiffies;
192 dst->flags = flags; 192 dst->flags = flags;
193 dst->pending_confirm = 0;
194 dst->next = NULL; 193 dst->next = NULL;
195 if (!(flags & DST_NOCOUNT)) 194 if (!(flags & DST_NOCOUNT))
196 dst_entries_add(ops, 1); 195 dst_entries_add(ops, 1);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 236a21e3c878..aecb2c7241b6 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -24,7 +24,7 @@
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/rtnetlink.h> 26#include <linux/rtnetlink.h>
27#include <linux/sched.h> 27#include <linux/sched/signal.h>
28#include <linux/net.h> 28#include <linux/net.h>
29 29
30/* 30/*
@@ -102,7 +102,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
102 [NETIF_F_RXFCS_BIT] = "rx-fcs", 102 [NETIF_F_RXFCS_BIT] = "rx-fcs",
103 [NETIF_F_RXALL_BIT] = "rx-all", 103 [NETIF_F_RXALL_BIT] = "rx-all",
104 [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", 104 [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
105 [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
106 [NETIF_F_HW_TC_BIT] = "hw-tc-offload", 105 [NETIF_F_HW_TC_BIT] = "hw-tc-offload",
107}; 106};
108 107
@@ -1405,9 +1404,12 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
1405 if (regs.len > reglen) 1404 if (regs.len > reglen)
1406 regs.len = reglen; 1405 regs.len = reglen;
1407 1406
1408 regbuf = vzalloc(reglen); 1407 regbuf = NULL;
1409 if (reglen && !regbuf) 1408 if (reglen) {
1410 return -ENOMEM; 1409 regbuf = vzalloc(reglen);
1410 if (!regbuf)
1411 return -ENOMEM;
1412 }
1411 1413
1412 ops->get_regs(dev, &regs, regbuf); 1414 ops->get_regs(dev, &regs, regbuf);
1413 1415
@@ -1817,11 +1819,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
1817 ret = __ethtool_get_sset_count(dev, gstrings.string_set); 1819 ret = __ethtool_get_sset_count(dev, gstrings.string_set);
1818 if (ret < 0) 1820 if (ret < 0)
1819 return ret; 1821 return ret;
1822 if (ret > S32_MAX / ETH_GSTRING_LEN)
1823 return -ENOMEM;
1824 WARN_ON_ONCE(!ret);
1820 1825
1821 gstrings.len = ret; 1826 gstrings.len = ret;
1822 1827 data = vzalloc(gstrings.len * ETH_GSTRING_LEN);
1823 data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER); 1828 if (gstrings.len && !data)
1824 if (!data)
1825 return -ENOMEM; 1829 return -ENOMEM;
1826 1830
1827 __ethtool_get_strings(dev, gstrings.string_set, data); 1831 __ethtool_get_strings(dev, gstrings.string_set, data);
@@ -1830,12 +1834,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
1830 if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) 1834 if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
1831 goto out; 1835 goto out;
1832 useraddr += sizeof(gstrings); 1836 useraddr += sizeof(gstrings);
1833 if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) 1837 if (gstrings.len &&
1838 copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
1834 goto out; 1839 goto out;
1835 ret = 0; 1840 ret = 0;
1836 1841
1837out: 1842out:
1838 kfree(data); 1843 vfree(data);
1839 return ret; 1844 return ret;
1840} 1845}
1841 1846
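The get_strings changes above pair a size guard with a switch to vzalloc(): the count from __ethtool_get_sset_count() is rejected before it can overflow the 32-bit gstrings.len * ETH_GSTRING_LEN product. A userspace sketch of the same check (the macros stand in for the kernel ones):

	#include <stddef.h>

	#define S32_MAX		0x7fffffff
	#define ETH_GSTRING_LEN	32

	/* returns 0 and fills *bytes on success, -1 on error or overflow */
	static int checked_strings_len(int count, size_t *bytes)
	{
		if (count < 0)
			return -1;			/* error from the driver */
		if (count > S32_MAX / ETH_GSTRING_LEN)
			return -1;			/* product would overflow */
		*bytes = (size_t)count * ETH_GSTRING_LEN;
		return 0;
	}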
@@ -1912,14 +1917,15 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
1912 n_stats = ops->get_sset_count(dev, ETH_SS_STATS); 1917 n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
1913 if (n_stats < 0) 1918 if (n_stats < 0)
1914 return n_stats; 1919 return n_stats;
1915 WARN_ON(n_stats == 0); 1920 if (n_stats > S32_MAX / sizeof(u64))
1916 1921 return -ENOMEM;
1922 WARN_ON_ONCE(!n_stats);
1917 if (copy_from_user(&stats, useraddr, sizeof(stats))) 1923 if (copy_from_user(&stats, useraddr, sizeof(stats)))
1918 return -EFAULT; 1924 return -EFAULT;
1919 1925
1920 stats.n_stats = n_stats; 1926 stats.n_stats = n_stats;
1921 data = kmalloc(n_stats * sizeof(u64), GFP_USER); 1927 data = vzalloc(n_stats * sizeof(u64));
1922 if (!data) 1928 if (n_stats && !data)
1923 return -ENOMEM; 1929 return -ENOMEM;
1924 1930
1925 ops->get_ethtool_stats(dev, &stats, data); 1931 ops->get_ethtool_stats(dev, &stats, data);
@@ -1928,12 +1934,12 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
1928 if (copy_to_user(useraddr, &stats, sizeof(stats))) 1934 if (copy_to_user(useraddr, &stats, sizeof(stats)))
1929 goto out; 1935 goto out;
1930 useraddr += sizeof(stats); 1936 useraddr += sizeof(stats);
1931 if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) 1937 if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
1932 goto out; 1938 goto out;
1933 ret = 0; 1939 ret = 0;
1934 1940
1935 out: 1941 out:
1936 kfree(data); 1942 vfree(data);
1937 return ret; 1943 return ret;
1938} 1944}
1939 1945
@@ -1948,17 +1954,18 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
1948 return -EOPNOTSUPP; 1954 return -EOPNOTSUPP;
1949 1955
1950 n_stats = phy_get_sset_count(phydev); 1956 n_stats = phy_get_sset_count(phydev);
1951
1952 if (n_stats < 0) 1957 if (n_stats < 0)
1953 return n_stats; 1958 return n_stats;
1954 WARN_ON(n_stats == 0); 1959 if (n_stats > S32_MAX / sizeof(u64))
1960 return -ENOMEM;
1961 WARN_ON_ONCE(!n_stats);
1955 1962
1956 if (copy_from_user(&stats, useraddr, sizeof(stats))) 1963 if (copy_from_user(&stats, useraddr, sizeof(stats)))
1957 return -EFAULT; 1964 return -EFAULT;
1958 1965
1959 stats.n_stats = n_stats; 1966 stats.n_stats = n_stats;
1960 data = kmalloc_array(n_stats, sizeof(u64), GFP_USER); 1967 data = vzalloc(n_stats * sizeof(u64));
1961 if (!data) 1968 if (n_stats && !data)
1962 return -ENOMEM; 1969 return -ENOMEM;
1963 1970
1964 mutex_lock(&phydev->lock); 1971 mutex_lock(&phydev->lock);
@@ -1969,12 +1976,12 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
1969 if (copy_to_user(useraddr, &stats, sizeof(stats))) 1976 if (copy_to_user(useraddr, &stats, sizeof(stats)))
1970 goto out; 1977 goto out;
1971 useraddr += sizeof(stats); 1978 useraddr += sizeof(stats);
1972 if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) 1979 if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
1973 goto out; 1980 goto out;
1974 ret = 0; 1981 ret = 0;
1975 1982
1976 out: 1983 out:
1977 kfree(data); 1984 vfree(data);
1978 return ret; 1985 return ret;
1979} 1986}
1980 1987
diff --git a/net/core/filter.c b/net/core/filter.c
index 1969b3f118c1..ebaeaf2e46e8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -76,9 +76,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
76 * allow SOCK_MEMALLOC sockets to use it as this socket is 76 * allow SOCK_MEMALLOC sockets to use it as this socket is
77 * helping free memory 77 * helping free memory
78 */ 78 */
79 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) 79 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
80 NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
80 return -ENOMEM; 81 return -ENOMEM;
81 82 }
82 err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); 83 err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
83 if (err) 84 if (err)
84 return err; 85 return err;
@@ -1416,8 +1417,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
1416 .ret_type = RET_INTEGER, 1417 .ret_type = RET_INTEGER,
1417 .arg1_type = ARG_PTR_TO_CTX, 1418 .arg1_type = ARG_PTR_TO_CTX,
1418 .arg2_type = ARG_ANYTHING, 1419 .arg2_type = ARG_ANYTHING,
1419 .arg3_type = ARG_PTR_TO_STACK, 1420 .arg3_type = ARG_PTR_TO_MEM,
1420 .arg4_type = ARG_CONST_STACK_SIZE, 1421 .arg4_type = ARG_CONST_SIZE,
1421 .arg5_type = ARG_ANYTHING, 1422 .arg5_type = ARG_ANYTHING,
1422}; 1423};
1423 1424
@@ -1447,8 +1448,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1447 .ret_type = RET_INTEGER, 1448 .ret_type = RET_INTEGER,
1448 .arg1_type = ARG_PTR_TO_CTX, 1449 .arg1_type = ARG_PTR_TO_CTX,
1449 .arg2_type = ARG_ANYTHING, 1450 .arg2_type = ARG_ANYTHING,
1450 .arg3_type = ARG_PTR_TO_RAW_STACK, 1451 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
1451 .arg4_type = ARG_CONST_STACK_SIZE, 1452 .arg4_type = ARG_CONST_SIZE,
1452}; 1453};
1453 1454
1454BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) 1455BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -1522,10 +1523,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
1522{ 1523{
1523 bool is_pseudo = flags & BPF_F_PSEUDO_HDR; 1524 bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
1524 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; 1525 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
1526 bool do_mforce = flags & BPF_F_MARK_ENFORCE;
1525 __sum16 *ptr; 1527 __sum16 *ptr;
1526 1528
1527 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | 1529 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
1528 BPF_F_HDR_FIELD_MASK))) 1530 BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
1529 return -EINVAL; 1531 return -EINVAL;
1530 if (unlikely(offset > 0xffff || offset & 1)) 1532 if (unlikely(offset > 0xffff || offset & 1))
1531 return -EFAULT; 1533 return -EFAULT;
@@ -1533,7 +1535,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
1533 return -EFAULT; 1535 return -EFAULT;
1534 1536
1535 ptr = (__sum16 *)(skb->data + offset); 1537 ptr = (__sum16 *)(skb->data + offset);
1536 if (is_mmzero && !*ptr) 1538 if (is_mmzero && !do_mforce && !*ptr)
1537 return 0; 1539 return 0;
1538 1540
1539 switch (flags & BPF_F_HDR_FIELD_MASK) { 1541 switch (flags & BPF_F_HDR_FIELD_MASK) {
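The hunks above add a BPF_F_MARK_ENFORCE flag to bpf_l4_csum_replace(): BPF_F_MARK_MANGLED_0 still skips the rewrite when the stored checksum is zero, unless enforcement is requested. A small standalone C sketch of that flag interaction follows; the flag values and the should_skip_update() helper are made up for illustration and do not match the real UAPI constants.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define F_MARK_MANGLED_0	0x1	/* illustrative values, not the UAPI ones */
#define F_MARK_ENFORCE		0x2

static bool should_skip_update(uint64_t flags, uint16_t stored_csum)
{
	bool is_mmzero = flags & F_MARK_MANGLED_0;
	bool do_mforce = flags & F_MARK_ENFORCE;

	/* a zero checksum marks the field as "mangled" and is normally left
	 * alone, unless the program explicitly asks to enforce the update
	 */
	return is_mmzero && !do_mforce && stored_csum == 0;
}

int main(void)
{
	printf("%d\n", should_skip_update(F_MARK_MANGLED_0, 0));			/* 1: skipped */
	printf("%d\n", should_skip_update(F_MARK_MANGLED_0 | F_MARK_ENFORCE, 0));	/* 0: forced */
	return 0;
}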
@@ -1601,10 +1603,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
1601 .gpl_only = false, 1603 .gpl_only = false,
1602 .pkt_access = true, 1604 .pkt_access = true,
1603 .ret_type = RET_INTEGER, 1605 .ret_type = RET_INTEGER,
1604 .arg1_type = ARG_PTR_TO_STACK, 1606 .arg1_type = ARG_PTR_TO_MEM,
1605 .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, 1607 .arg2_type = ARG_CONST_SIZE_OR_ZERO,
1606 .arg3_type = ARG_PTR_TO_STACK, 1608 .arg3_type = ARG_PTR_TO_MEM,
1607 .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, 1609 .arg4_type = ARG_CONST_SIZE_OR_ZERO,
1608 .arg5_type = ARG_ANYTHING, 1610 .arg5_type = ARG_ANYTHING,
1609}; 1611};
1610 1612
@@ -2306,8 +2308,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
2306 .arg1_type = ARG_PTR_TO_CTX, 2308 .arg1_type = ARG_PTR_TO_CTX,
2307 .arg2_type = ARG_CONST_MAP_PTR, 2309 .arg2_type = ARG_CONST_MAP_PTR,
2308 .arg3_type = ARG_ANYTHING, 2310 .arg3_type = ARG_ANYTHING,
2309 .arg4_type = ARG_PTR_TO_STACK, 2311 .arg4_type = ARG_PTR_TO_MEM,
2310 .arg5_type = ARG_CONST_STACK_SIZE, 2312 .arg5_type = ARG_CONST_SIZE,
2311}; 2313};
2312 2314
2313static unsigned short bpf_tunnel_key_af(u64 flags) 2315static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -2377,8 +2379,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
2377 .gpl_only = false, 2379 .gpl_only = false,
2378 .ret_type = RET_INTEGER, 2380 .ret_type = RET_INTEGER,
2379 .arg1_type = ARG_PTR_TO_CTX, 2381 .arg1_type = ARG_PTR_TO_CTX,
2380 .arg2_type = ARG_PTR_TO_RAW_STACK, 2382 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
2381 .arg3_type = ARG_CONST_STACK_SIZE, 2383 .arg3_type = ARG_CONST_SIZE,
2382 .arg4_type = ARG_ANYTHING, 2384 .arg4_type = ARG_ANYTHING,
2383}; 2385};
2384 2386
@@ -2412,8 +2414,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
2412 .gpl_only = false, 2414 .gpl_only = false,
2413 .ret_type = RET_INTEGER, 2415 .ret_type = RET_INTEGER,
2414 .arg1_type = ARG_PTR_TO_CTX, 2416 .arg1_type = ARG_PTR_TO_CTX,
2415 .arg2_type = ARG_PTR_TO_RAW_STACK, 2417 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
2416 .arg3_type = ARG_CONST_STACK_SIZE, 2418 .arg3_type = ARG_CONST_SIZE,
2417}; 2419};
2418 2420
2419static struct metadata_dst __percpu *md_dst; 2421static struct metadata_dst __percpu *md_dst;
@@ -2483,8 +2485,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
2483 .gpl_only = false, 2485 .gpl_only = false,
2484 .ret_type = RET_INTEGER, 2486 .ret_type = RET_INTEGER,
2485 .arg1_type = ARG_PTR_TO_CTX, 2487 .arg1_type = ARG_PTR_TO_CTX,
2486 .arg2_type = ARG_PTR_TO_STACK, 2488 .arg2_type = ARG_PTR_TO_MEM,
2487 .arg3_type = ARG_CONST_STACK_SIZE, 2489 .arg3_type = ARG_CONST_SIZE,
2488 .arg4_type = ARG_ANYTHING, 2490 .arg4_type = ARG_ANYTHING,
2489}; 2491};
2490 2492
@@ -2509,8 +2511,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
2509 .gpl_only = false, 2511 .gpl_only = false,
2510 .ret_type = RET_INTEGER, 2512 .ret_type = RET_INTEGER,
2511 .arg1_type = ARG_PTR_TO_CTX, 2513 .arg1_type = ARG_PTR_TO_CTX,
2512 .arg2_type = ARG_PTR_TO_STACK, 2514 .arg2_type = ARG_PTR_TO_MEM,
2513 .arg3_type = ARG_CONST_STACK_SIZE, 2515 .arg3_type = ARG_CONST_SIZE,
2514}; 2516};
2515 2517
2516static const struct bpf_func_proto * 2518static const struct bpf_func_proto *
@@ -2582,8 +2584,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
2582 if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) 2584 if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
2583 return -EFAULT; 2585 return -EFAULT;
2584 2586
2585 return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, 2587 return bpf_event_output(map, flags, meta, meta_size, xdp->data,
2586 bpf_xdp_copy); 2588 xdp_size, bpf_xdp_copy);
2587} 2589}
2588 2590
2589static const struct bpf_func_proto bpf_xdp_event_output_proto = { 2591static const struct bpf_func_proto bpf_xdp_event_output_proto = {
@@ -2593,12 +2595,12 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
2593 .arg1_type = ARG_PTR_TO_CTX, 2595 .arg1_type = ARG_PTR_TO_CTX,
2594 .arg2_type = ARG_CONST_MAP_PTR, 2596 .arg2_type = ARG_CONST_MAP_PTR,
2595 .arg3_type = ARG_ANYTHING, 2597 .arg3_type = ARG_ANYTHING,
2596 .arg4_type = ARG_PTR_TO_STACK, 2598 .arg4_type = ARG_PTR_TO_MEM,
2597 .arg5_type = ARG_CONST_STACK_SIZE, 2599 .arg5_type = ARG_CONST_SIZE,
2598}; 2600};
2599 2601
2600static const struct bpf_func_proto * 2602static const struct bpf_func_proto *
2601sk_filter_func_proto(enum bpf_func_id func_id) 2603bpf_base_func_proto(enum bpf_func_id func_id)
2602{ 2604{
2603 switch (func_id) { 2605 switch (func_id) {
2604 case BPF_FUNC_map_lookup_elem: 2606 case BPF_FUNC_map_lookup_elem:
@@ -2626,6 +2628,17 @@ sk_filter_func_proto(enum bpf_func_id func_id)
2626} 2628}
2627 2629
2628static const struct bpf_func_proto * 2630static const struct bpf_func_proto *
2631sk_filter_func_proto(enum bpf_func_id func_id)
2632{
2633 switch (func_id) {
2634 case BPF_FUNC_skb_load_bytes:
2635 return &bpf_skb_load_bytes_proto;
2636 default:
2637 return bpf_base_func_proto(func_id);
2638 }
2639}
2640
2641static const struct bpf_func_proto *
2629tc_cls_act_func_proto(enum bpf_func_id func_id) 2642tc_cls_act_func_proto(enum bpf_func_id func_id)
2630{ 2643{
2631 switch (func_id) { 2644 switch (func_id) {
@@ -2680,7 +2693,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
2680 case BPF_FUNC_skb_under_cgroup: 2693 case BPF_FUNC_skb_under_cgroup:
2681 return &bpf_skb_under_cgroup_proto; 2694 return &bpf_skb_under_cgroup_proto;
2682 default: 2695 default:
2683 return sk_filter_func_proto(func_id); 2696 return bpf_base_func_proto(func_id);
2684 } 2697 }
2685} 2698}
2686 2699
@@ -2695,7 +2708,7 @@ xdp_func_proto(enum bpf_func_id func_id)
2695 case BPF_FUNC_xdp_adjust_head: 2708 case BPF_FUNC_xdp_adjust_head:
2696 return &bpf_xdp_adjust_head_proto; 2709 return &bpf_xdp_adjust_head_proto;
2697 default: 2710 default:
2698 return sk_filter_func_proto(func_id); 2711 return bpf_base_func_proto(func_id);
2699 } 2712 }
2700} 2713}
2701 2714
@@ -2706,7 +2719,7 @@ cg_skb_func_proto(enum bpf_func_id func_id)
2706 case BPF_FUNC_skb_load_bytes: 2719 case BPF_FUNC_skb_load_bytes:
2707 return &bpf_skb_load_bytes_proto; 2720 return &bpf_skb_load_bytes_proto;
2708 default: 2721 default:
2709 return sk_filter_func_proto(func_id); 2722 return bpf_base_func_proto(func_id);
2710 } 2723 }
2711} 2724}
2712 2725
@@ -2733,7 +2746,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id)
2733 case BPF_FUNC_skb_under_cgroup: 2746 case BPF_FUNC_skb_under_cgroup:
2734 return &bpf_skb_under_cgroup_proto; 2747 return &bpf_skb_under_cgroup_proto;
2735 default: 2748 default:
2736 return sk_filter_func_proto(func_id); 2749 return bpf_base_func_proto(func_id);
2737 } 2750 }
2738} 2751}
2739 2752
@@ -2776,11 +2789,22 @@ static bool __is_valid_access(int off, int size)
2776{ 2789{
2777 if (off < 0 || off >= sizeof(struct __sk_buff)) 2790 if (off < 0 || off >= sizeof(struct __sk_buff))
2778 return false; 2791 return false;
2792
2779 /* The verifier guarantees that size > 0. */ 2793 /* The verifier guarantees that size > 0. */
2780 if (off % size != 0) 2794 if (off % size != 0)
2781 return false; 2795 return false;
2782 if (size != sizeof(__u32)) 2796
2783 return false; 2797 switch (off) {
2798 case offsetof(struct __sk_buff, cb[0]) ...
2799 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2800 if (off + size >
2801 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32))
2802 return false;
2803 break;
2804 default:
2805 if (size != sizeof(__u32))
2806 return false;
2807 }
2784 2808
2785 return true; 2809 return true;
2786} 2810}
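The widened case ranges above let programs access __sk_buff->cb[] with 1, 2, 4 or 8 byte loads and stores as long as the access stays inside the 20-byte area, instead of restricting every context field to exactly 4 bytes. Below is a hedged userspace sketch of that bounds check; struct ctx and cb_access_ok() are illustrative stand-ins, not the kernel's __sk_buff or verifier code.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct ctx {
	uint32_t len;
	uint32_t cb[5];		/* the 20-byte scratch area */
	uint32_t mark;
};

static bool cb_access_ok(size_t off, size_t size)
{
	size_t cb_start = offsetof(struct ctx, cb[0]);
	size_t cb_end   = offsetof(struct ctx, cb[4]) + sizeof(uint32_t);

	if (off < cb_start || off >= cb_end)
		return false;			/* not a cb[] access at all */
	return off + size <= cb_end;		/* must not run past cb[4] */
}

int main(void)
{
	printf("%d\n", cb_access_ok(offsetof(struct ctx, cb[0]), 8));	/* 1: 8-byte access fits */
	printf("%d\n", cb_access_ok(offsetof(struct ctx, cb[4]), 8));	/* 0: spills past cb[] */
	return 0;
}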
@@ -2799,7 +2823,7 @@ static bool sk_filter_is_valid_access(int off, int size,
2799 if (type == BPF_WRITE) { 2823 if (type == BPF_WRITE) {
2800 switch (off) { 2824 switch (off) {
2801 case offsetof(struct __sk_buff, cb[0]) ... 2825 case offsetof(struct __sk_buff, cb[0]) ...
2802 offsetof(struct __sk_buff, cb[4]): 2826 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2803 break; 2827 break;
2804 default: 2828 default:
2805 return false; 2829 return false;
@@ -2823,7 +2847,7 @@ static bool lwt_is_valid_access(int off, int size,
2823 case offsetof(struct __sk_buff, mark): 2847 case offsetof(struct __sk_buff, mark):
2824 case offsetof(struct __sk_buff, priority): 2848 case offsetof(struct __sk_buff, priority):
2825 case offsetof(struct __sk_buff, cb[0]) ... 2849 case offsetof(struct __sk_buff, cb[0]) ...
2826 offsetof(struct __sk_buff, cb[4]): 2850 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2827 break; 2851 break;
2828 default: 2852 default:
2829 return false; 2853 return false;
@@ -2915,7 +2939,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
2915 case offsetof(struct __sk_buff, tc_index): 2939 case offsetof(struct __sk_buff, tc_index):
2916 case offsetof(struct __sk_buff, priority): 2940 case offsetof(struct __sk_buff, priority):
2917 case offsetof(struct __sk_buff, cb[0]) ... 2941 case offsetof(struct __sk_buff, cb[0]) ...
2918 offsetof(struct __sk_buff, cb[4]): 2942 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2919 case offsetof(struct __sk_buff, tc_classid): 2943 case offsetof(struct __sk_buff, tc_classid):
2920 break; 2944 break;
2921 default: 2945 default:
@@ -2972,32 +2996,33 @@ void bpf_warn_invalid_xdp_action(u32 act)
2972} 2996}
2973EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); 2997EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
2974 2998
2975static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, 2999static u32 bpf_convert_ctx_access(enum bpf_access_type type,
2976 int src_reg, int ctx_off, 3000 const struct bpf_insn *si,
2977 struct bpf_insn *insn_buf, 3001 struct bpf_insn *insn_buf,
2978 struct bpf_prog *prog) 3002 struct bpf_prog *prog)
2979{ 3003{
2980 struct bpf_insn *insn = insn_buf; 3004 struct bpf_insn *insn = insn_buf;
3005 int off;
2981 3006
2982 switch (ctx_off) { 3007 switch (si->off) {
2983 case offsetof(struct __sk_buff, len): 3008 case offsetof(struct __sk_buff, len):
2984 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); 3009 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
2985 3010
2986 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3011 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
2987 offsetof(struct sk_buff, len)); 3012 offsetof(struct sk_buff, len));
2988 break; 3013 break;
2989 3014
2990 case offsetof(struct __sk_buff, protocol): 3015 case offsetof(struct __sk_buff, protocol):
2991 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); 3016 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
2992 3017
2993 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 3018 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
2994 offsetof(struct sk_buff, protocol)); 3019 offsetof(struct sk_buff, protocol));
2995 break; 3020 break;
2996 3021
2997 case offsetof(struct __sk_buff, vlan_proto): 3022 case offsetof(struct __sk_buff, vlan_proto):
2998 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); 3023 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
2999 3024
3000 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 3025 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
3001 offsetof(struct sk_buff, vlan_proto)); 3026 offsetof(struct sk_buff, vlan_proto));
3002 break; 3027 break;
3003 3028
@@ -3005,17 +3030,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
3005 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); 3030 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
3006 3031
3007 if (type == BPF_WRITE) 3032 if (type == BPF_WRITE)
3008 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, 3033 *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
3009 offsetof(struct sk_buff, priority)); 3034 offsetof(struct sk_buff, priority));
3010 else 3035 else
3011 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3036 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3012 offsetof(struct sk_buff, priority)); 3037 offsetof(struct sk_buff, priority));
3013 break; 3038 break;
3014 3039
3015 case offsetof(struct __sk_buff, ingress_ifindex): 3040 case offsetof(struct __sk_buff, ingress_ifindex):
3016 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); 3041 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
3017 3042
3018 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3043 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3019 offsetof(struct sk_buff, skb_iif)); 3044 offsetof(struct sk_buff, skb_iif));
3020 break; 3045 break;
3021 3046
@@ -3023,17 +3048,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
3023 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 3048 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
3024 3049
3025 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 3050 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
3026 dst_reg, src_reg, 3051 si->dst_reg, si->src_reg,
3027 offsetof(struct sk_buff, dev)); 3052 offsetof(struct sk_buff, dev));
3028 *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); 3053 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
3029 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, 3054 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
3030 offsetof(struct net_device, ifindex)); 3055 offsetof(struct net_device, ifindex));
3031 break; 3056 break;
3032 3057
3033 case offsetof(struct __sk_buff, hash): 3058 case offsetof(struct __sk_buff, hash):
3034 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); 3059 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
3035 3060
3036 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3061 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3037 offsetof(struct sk_buff, hash)); 3062 offsetof(struct sk_buff, hash));
3038 break; 3063 break;
3039 3064
@@ -3041,63 +3066,77 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
3041 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); 3066 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
3042 3067
3043 if (type == BPF_WRITE) 3068 if (type == BPF_WRITE)
3044 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, 3069 *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
3045 offsetof(struct sk_buff, mark)); 3070 offsetof(struct sk_buff, mark));
3046 else 3071 else
3047 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3072 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3048 offsetof(struct sk_buff, mark)); 3073 offsetof(struct sk_buff, mark));
3049 break; 3074 break;
3050 3075
3051 case offsetof(struct __sk_buff, pkt_type): 3076 case offsetof(struct __sk_buff, pkt_type):
3052 return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); 3077 return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg,
3078 si->src_reg, insn);
3053 3079
3054 case offsetof(struct __sk_buff, queue_mapping): 3080 case offsetof(struct __sk_buff, queue_mapping):
3055 return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); 3081 return convert_skb_access(SKF_AD_QUEUE, si->dst_reg,
3082 si->src_reg, insn);
3056 3083
3057 case offsetof(struct __sk_buff, vlan_present): 3084 case offsetof(struct __sk_buff, vlan_present):
3058 return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, 3085 return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
3059 dst_reg, src_reg, insn); 3086 si->dst_reg, si->src_reg, insn);
3060 3087
3061 case offsetof(struct __sk_buff, vlan_tci): 3088 case offsetof(struct __sk_buff, vlan_tci):
3062 return convert_skb_access(SKF_AD_VLAN_TAG, 3089 return convert_skb_access(SKF_AD_VLAN_TAG,
3063 dst_reg, src_reg, insn); 3090 si->dst_reg, si->src_reg, insn);
3064 3091
3065 case offsetof(struct __sk_buff, cb[0]) ... 3092 case offsetof(struct __sk_buff, cb[0]) ...
3066 offsetof(struct __sk_buff, cb[4]): 3093 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
3067 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); 3094 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
3095 BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
3096 offsetof(struct qdisc_skb_cb, data)) %
3097 sizeof(__u64));
3068 3098
3069 prog->cb_access = 1; 3099 prog->cb_access = 1;
3070 ctx_off -= offsetof(struct __sk_buff, cb[0]); 3100 off = si->off;
3071 ctx_off += offsetof(struct sk_buff, cb); 3101 off -= offsetof(struct __sk_buff, cb[0]);
3072 ctx_off += offsetof(struct qdisc_skb_cb, data); 3102 off += offsetof(struct sk_buff, cb);
3103 off += offsetof(struct qdisc_skb_cb, data);
3073 if (type == BPF_WRITE) 3104 if (type == BPF_WRITE)
3074 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); 3105 *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
3106 si->src_reg, off);
3075 else 3107 else
3076 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); 3108 *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
3109 si->src_reg, off);
3077 break; 3110 break;
3078 3111
3079 case offsetof(struct __sk_buff, tc_classid): 3112 case offsetof(struct __sk_buff, tc_classid):
3080 ctx_off -= offsetof(struct __sk_buff, tc_classid); 3113 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);
3081 ctx_off += offsetof(struct sk_buff, cb); 3114
3082 ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); 3115 off = si->off;
3116 off -= offsetof(struct __sk_buff, tc_classid);
3117 off += offsetof(struct sk_buff, cb);
3118 off += offsetof(struct qdisc_skb_cb, tc_classid);
3083 if (type == BPF_WRITE) 3119 if (type == BPF_WRITE)
3084 *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); 3120 *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
3121 si->src_reg, off);
3085 else 3122 else
3086 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); 3123 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
3124 si->src_reg, off);
3087 break; 3125 break;
3088 3126
3089 case offsetof(struct __sk_buff, data): 3127 case offsetof(struct __sk_buff, data):
3090 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), 3128 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
3091 dst_reg, src_reg, 3129 si->dst_reg, si->src_reg,
3092 offsetof(struct sk_buff, data)); 3130 offsetof(struct sk_buff, data));
3093 break; 3131 break;
3094 3132
3095 case offsetof(struct __sk_buff, data_end): 3133 case offsetof(struct __sk_buff, data_end):
3096 ctx_off -= offsetof(struct __sk_buff, data_end); 3134 off = si->off;
3097 ctx_off += offsetof(struct sk_buff, cb); 3135 off -= offsetof(struct __sk_buff, data_end);
3098 ctx_off += offsetof(struct bpf_skb_data_end, data_end); 3136 off += offsetof(struct sk_buff, cb);
3099 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, 3137 off += offsetof(struct bpf_skb_data_end, data_end);
3100 ctx_off); 3138 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
3139 si->src_reg, off);
3101 break; 3140 break;
3102 3141
3103 case offsetof(struct __sk_buff, tc_index): 3142 case offsetof(struct __sk_buff, tc_index):
@@ -3105,110 +3144,107 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
3105 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); 3144 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
3106 3145
3107 if (type == BPF_WRITE) 3146 if (type == BPF_WRITE)
3108 *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, 3147 *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
3109 offsetof(struct sk_buff, tc_index)); 3148 offsetof(struct sk_buff, tc_index));
3110 else 3149 else
3111 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 3150 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
3112 offsetof(struct sk_buff, tc_index)); 3151 offsetof(struct sk_buff, tc_index));
3113 break;
3114#else 3152#else
3115 if (type == BPF_WRITE) 3153 if (type == BPF_WRITE)
3116 *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); 3154 *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
3117 else 3155 else
3118 *insn++ = BPF_MOV64_IMM(dst_reg, 0); 3156 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
3119 break;
3120#endif 3157#endif
3158 break;
3121 } 3159 }
3122 3160
3123 return insn - insn_buf; 3161 return insn - insn_buf;
3124} 3162}
3125 3163
3126static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, 3164static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
3127 int dst_reg, int src_reg, 3165 const struct bpf_insn *si,
3128 int ctx_off,
3129 struct bpf_insn *insn_buf, 3166 struct bpf_insn *insn_buf,
3130 struct bpf_prog *prog) 3167 struct bpf_prog *prog)
3131{ 3168{
3132 struct bpf_insn *insn = insn_buf; 3169 struct bpf_insn *insn = insn_buf;
3133 3170
3134 switch (ctx_off) { 3171 switch (si->off) {
3135 case offsetof(struct bpf_sock, bound_dev_if): 3172 case offsetof(struct bpf_sock, bound_dev_if):
3136 BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); 3173 BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
3137 3174
3138 if (type == BPF_WRITE) 3175 if (type == BPF_WRITE)
3139 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, 3176 *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
3140 offsetof(struct sock, sk_bound_dev_if)); 3177 offsetof(struct sock, sk_bound_dev_if));
3141 else 3178 else
3142 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3179 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3143 offsetof(struct sock, sk_bound_dev_if)); 3180 offsetof(struct sock, sk_bound_dev_if));
3144 break; 3181 break;
3145 3182
3146 case offsetof(struct bpf_sock, family): 3183 case offsetof(struct bpf_sock, family):
3147 BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); 3184 BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
3148 3185
3149 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 3186 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
3150 offsetof(struct sock, sk_family)); 3187 offsetof(struct sock, sk_family));
3151 break; 3188 break;
3152 3189
3153 case offsetof(struct bpf_sock, type): 3190 case offsetof(struct bpf_sock, type):
3154 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3191 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3155 offsetof(struct sock, __sk_flags_offset)); 3192 offsetof(struct sock, __sk_flags_offset));
3156 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK); 3193 *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
3157 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT); 3194 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
3158 break; 3195 break;
3159 3196
3160 case offsetof(struct bpf_sock, protocol): 3197 case offsetof(struct bpf_sock, protocol):
3161 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3198 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3162 offsetof(struct sock, __sk_flags_offset)); 3199 offsetof(struct sock, __sk_flags_offset));
3163 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK); 3200 *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
3164 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT); 3201 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
3165 break; 3202 break;
3166 } 3203 }
3167 3204
3168 return insn - insn_buf; 3205 return insn - insn_buf;
3169} 3206}
3170 3207
3171static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, 3208static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
3172 int src_reg, int ctx_off, 3209 const struct bpf_insn *si,
3173 struct bpf_insn *insn_buf, 3210 struct bpf_insn *insn_buf,
3174 struct bpf_prog *prog) 3211 struct bpf_prog *prog)
3175{ 3212{
3176 struct bpf_insn *insn = insn_buf; 3213 struct bpf_insn *insn = insn_buf;
3177 3214
3178 switch (ctx_off) { 3215 switch (si->off) {
3179 case offsetof(struct __sk_buff, ifindex): 3216 case offsetof(struct __sk_buff, ifindex):
3180 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 3217 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
3181 3218
3182 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 3219 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
3183 dst_reg, src_reg, 3220 si->dst_reg, si->src_reg,
3184 offsetof(struct sk_buff, dev)); 3221 offsetof(struct sk_buff, dev));
3185 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, 3222 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
3186 offsetof(struct net_device, ifindex)); 3223 offsetof(struct net_device, ifindex));
3187 break; 3224 break;
3188 default: 3225 default:
3189 return sk_filter_convert_ctx_access(type, dst_reg, src_reg, 3226 return bpf_convert_ctx_access(type, si, insn_buf, prog);
3190 ctx_off, insn_buf, prog);
3191 } 3227 }
3192 3228
3193 return insn - insn_buf; 3229 return insn - insn_buf;
3194} 3230}
3195 3231
3196static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, 3232static u32 xdp_convert_ctx_access(enum bpf_access_type type,
3197 int src_reg, int ctx_off, 3233 const struct bpf_insn *si,
3198 struct bpf_insn *insn_buf, 3234 struct bpf_insn *insn_buf,
3199 struct bpf_prog *prog) 3235 struct bpf_prog *prog)
3200{ 3236{
3201 struct bpf_insn *insn = insn_buf; 3237 struct bpf_insn *insn = insn_buf;
3202 3238
3203 switch (ctx_off) { 3239 switch (si->off) {
3204 case offsetof(struct xdp_md, data): 3240 case offsetof(struct xdp_md, data):
3205 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), 3241 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
3206 dst_reg, src_reg, 3242 si->dst_reg, si->src_reg,
3207 offsetof(struct xdp_buff, data)); 3243 offsetof(struct xdp_buff, data));
3208 break; 3244 break;
3209 case offsetof(struct xdp_md, data_end): 3245 case offsetof(struct xdp_md, data_end):
3210 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), 3246 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
3211 dst_reg, src_reg, 3247 si->dst_reg, si->src_reg,
3212 offsetof(struct xdp_buff, data_end)); 3248 offsetof(struct xdp_buff, data_end));
3213 break; 3249 break;
3214 } 3250 }
@@ -3219,7 +3255,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
3219static const struct bpf_verifier_ops sk_filter_ops = { 3255static const struct bpf_verifier_ops sk_filter_ops = {
3220 .get_func_proto = sk_filter_func_proto, 3256 .get_func_proto = sk_filter_func_proto,
3221 .is_valid_access = sk_filter_is_valid_access, 3257 .is_valid_access = sk_filter_is_valid_access,
3222 .convert_ctx_access = sk_filter_convert_ctx_access, 3258 .convert_ctx_access = bpf_convert_ctx_access,
3223}; 3259};
3224 3260
3225static const struct bpf_verifier_ops tc_cls_act_ops = { 3261static const struct bpf_verifier_ops tc_cls_act_ops = {
@@ -3238,69 +3274,69 @@ static const struct bpf_verifier_ops xdp_ops = {
3238static const struct bpf_verifier_ops cg_skb_ops = { 3274static const struct bpf_verifier_ops cg_skb_ops = {
3239 .get_func_proto = cg_skb_func_proto, 3275 .get_func_proto = cg_skb_func_proto,
3240 .is_valid_access = sk_filter_is_valid_access, 3276 .is_valid_access = sk_filter_is_valid_access,
3241 .convert_ctx_access = sk_filter_convert_ctx_access, 3277 .convert_ctx_access = bpf_convert_ctx_access,
3242}; 3278};
3243 3279
3244static const struct bpf_verifier_ops lwt_inout_ops = { 3280static const struct bpf_verifier_ops lwt_inout_ops = {
3245 .get_func_proto = lwt_inout_func_proto, 3281 .get_func_proto = lwt_inout_func_proto,
3246 .is_valid_access = lwt_is_valid_access, 3282 .is_valid_access = lwt_is_valid_access,
3247 .convert_ctx_access = sk_filter_convert_ctx_access, 3283 .convert_ctx_access = bpf_convert_ctx_access,
3248}; 3284};
3249 3285
3250static const struct bpf_verifier_ops lwt_xmit_ops = { 3286static const struct bpf_verifier_ops lwt_xmit_ops = {
3251 .get_func_proto = lwt_xmit_func_proto, 3287 .get_func_proto = lwt_xmit_func_proto,
3252 .is_valid_access = lwt_is_valid_access, 3288 .is_valid_access = lwt_is_valid_access,
3253 .convert_ctx_access = sk_filter_convert_ctx_access, 3289 .convert_ctx_access = bpf_convert_ctx_access,
3254 .gen_prologue = tc_cls_act_prologue, 3290 .gen_prologue = tc_cls_act_prologue,
3255}; 3291};
3256 3292
3257static const struct bpf_verifier_ops cg_sock_ops = { 3293static const struct bpf_verifier_ops cg_sock_ops = {
3258 .get_func_proto = sk_filter_func_proto, 3294 .get_func_proto = bpf_base_func_proto,
3259 .is_valid_access = sock_filter_is_valid_access, 3295 .is_valid_access = sock_filter_is_valid_access,
3260 .convert_ctx_access = sock_filter_convert_ctx_access, 3296 .convert_ctx_access = sock_filter_convert_ctx_access,
3261}; 3297};
3262 3298
3263static struct bpf_prog_type_list sk_filter_type __read_mostly = { 3299static struct bpf_prog_type_list sk_filter_type __ro_after_init = {
3264 .ops = &sk_filter_ops, 3300 .ops = &sk_filter_ops,
3265 .type = BPF_PROG_TYPE_SOCKET_FILTER, 3301 .type = BPF_PROG_TYPE_SOCKET_FILTER,
3266}; 3302};
3267 3303
3268static struct bpf_prog_type_list sched_cls_type __read_mostly = { 3304static struct bpf_prog_type_list sched_cls_type __ro_after_init = {
3269 .ops = &tc_cls_act_ops, 3305 .ops = &tc_cls_act_ops,
3270 .type = BPF_PROG_TYPE_SCHED_CLS, 3306 .type = BPF_PROG_TYPE_SCHED_CLS,
3271}; 3307};
3272 3308
3273static struct bpf_prog_type_list sched_act_type __read_mostly = { 3309static struct bpf_prog_type_list sched_act_type __ro_after_init = {
3274 .ops = &tc_cls_act_ops, 3310 .ops = &tc_cls_act_ops,
3275 .type = BPF_PROG_TYPE_SCHED_ACT, 3311 .type = BPF_PROG_TYPE_SCHED_ACT,
3276}; 3312};
3277 3313
3278static struct bpf_prog_type_list xdp_type __read_mostly = { 3314static struct bpf_prog_type_list xdp_type __ro_after_init = {
3279 .ops = &xdp_ops, 3315 .ops = &xdp_ops,
3280 .type = BPF_PROG_TYPE_XDP, 3316 .type = BPF_PROG_TYPE_XDP,
3281}; 3317};
3282 3318
3283static struct bpf_prog_type_list cg_skb_type __read_mostly = { 3319static struct bpf_prog_type_list cg_skb_type __ro_after_init = {
3284 .ops = &cg_skb_ops, 3320 .ops = &cg_skb_ops,
3285 .type = BPF_PROG_TYPE_CGROUP_SKB, 3321 .type = BPF_PROG_TYPE_CGROUP_SKB,
3286}; 3322};
3287 3323
3288static struct bpf_prog_type_list lwt_in_type __read_mostly = { 3324static struct bpf_prog_type_list lwt_in_type __ro_after_init = {
3289 .ops = &lwt_inout_ops, 3325 .ops = &lwt_inout_ops,
3290 .type = BPF_PROG_TYPE_LWT_IN, 3326 .type = BPF_PROG_TYPE_LWT_IN,
3291}; 3327};
3292 3328
3293static struct bpf_prog_type_list lwt_out_type __read_mostly = { 3329static struct bpf_prog_type_list lwt_out_type __ro_after_init = {
3294 .ops = &lwt_inout_ops, 3330 .ops = &lwt_inout_ops,
3295 .type = BPF_PROG_TYPE_LWT_OUT, 3331 .type = BPF_PROG_TYPE_LWT_OUT,
3296}; 3332};
3297 3333
3298static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { 3334static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = {
3299 .ops = &lwt_xmit_ops, 3335 .ops = &lwt_xmit_ops,
3300 .type = BPF_PROG_TYPE_LWT_XMIT, 3336 .type = BPF_PROG_TYPE_LWT_XMIT,
3301}; 3337};
3302 3338
3303static struct bpf_prog_type_list cg_sock_type __read_mostly = { 3339static struct bpf_prog_type_list cg_sock_type __ro_after_init = {
3304 .ops = &cg_sock_ops, 3340 .ops = &cg_sock_ops,
3305 .type = BPF_PROG_TYPE_CGROUP_SOCK 3341 .type = BPF_PROG_TYPE_CGROUP_SOCK
3306}; 3342};
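The remaining filter.c changes above split the generic helper lookup into bpf_base_func_proto() and make the per-program-type lookups (tc, xdp, cgroup skb, lwt) fall back to it instead of chaining through the socket-filter table, which now only adds bpf_skb_load_bytes. A toy C sketch of that delegation pattern follows; the enum values and returned strings are invented purely to show the shape of the fallback.

#include <stdio.h>

enum func_id { FUNC_MAP_LOOKUP, FUNC_SKB_LOAD_BYTES, FUNC_UNKNOWN };

static const char *base_func_proto(enum func_id id)
{
	switch (id) {
	case FUNC_MAP_LOOKUP:
		return "map_lookup_elem";	/* shared by every program type */
	default:
		return "unsupported";
	}
}

static const char *sk_filter_func_proto(enum func_id id)
{
	switch (id) {
	case FUNC_SKB_LOAD_BYTES:
		return "skb_load_bytes";	/* socket filters add this one */
	default:
		return base_func_proto(id);	/* everything else from the base */
	}
}

int main(void)
{
	printf("%s\n", sk_filter_func_proto(FUNC_SKB_LOAD_BYTES));
	printf("%s\n", sk_filter_func_proto(FUNC_MAP_LOOKUP));
	return 0;
}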
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 1b7673aac59d..d98d4998213d 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -138,6 +138,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
138 struct flow_dissector_key_control *key_control; 138 struct flow_dissector_key_control *key_control;
139 struct flow_dissector_key_basic *key_basic; 139 struct flow_dissector_key_basic *key_basic;
140 struct flow_dissector_key_addrs *key_addrs; 140 struct flow_dissector_key_addrs *key_addrs;
141 struct flow_dissector_key_arp *key_arp;
141 struct flow_dissector_key_ports *key_ports; 142 struct flow_dissector_key_ports *key_ports;
142 struct flow_dissector_key_icmp *key_icmp; 143 struct flow_dissector_key_icmp *key_icmp;
143 struct flow_dissector_key_tags *key_tags; 144 struct flow_dissector_key_tags *key_tags;
@@ -379,6 +380,62 @@ mpls:
379 380
380 nhoff += FCOE_HEADER_LEN; 381 nhoff += FCOE_HEADER_LEN;
381 goto out_good; 382 goto out_good;
383
384 case htons(ETH_P_ARP):
385 case htons(ETH_P_RARP): {
386 struct {
387 unsigned char ar_sha[ETH_ALEN];
388 unsigned char ar_sip[4];
389 unsigned char ar_tha[ETH_ALEN];
390 unsigned char ar_tip[4];
391 } *arp_eth, _arp_eth;
392 const struct arphdr *arp;
393 struct arphdr _arp;
394
395 arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
396 hlen, &_arp);
397 if (!arp)
398 goto out_bad;
399
400 if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
401 arp->ar_pro != htons(ETH_P_IP) ||
402 arp->ar_hln != ETH_ALEN ||
403 arp->ar_pln != 4 ||
404 (arp->ar_op != htons(ARPOP_REPLY) &&
405 arp->ar_op != htons(ARPOP_REQUEST)))
406 goto out_bad;
407
408 arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
409 sizeof(_arp_eth), data,
410 hlen,
411 &_arp_eth);
412 if (!arp_eth)
413 goto out_bad;
414
415 if (dissector_uses_key(flow_dissector,
416 FLOW_DISSECTOR_KEY_ARP)) {
417
418 key_arp = skb_flow_dissector_target(flow_dissector,
419 FLOW_DISSECTOR_KEY_ARP,
420 target_container);
421
422 memcpy(&key_arp->sip, arp_eth->ar_sip,
423 sizeof(key_arp->sip));
424 memcpy(&key_arp->tip, arp_eth->ar_tip,
425 sizeof(key_arp->tip));
426
427 /* Only store the lower byte of the opcode;
428 * this covers ARPOP_REPLY and ARPOP_REQUEST.
429 */
430 key_arp->op = ntohs(arp->ar_op) & 0xff;
431
432 ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
433 ether_addr_copy(key_arp->tha, arp_eth->ar_tha);
434 }
435
436 goto out_good;
437 }
438
382 default: 439 default:
383 goto out_bad; 440 goto out_bad;
384 } 441 }
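The flow dissector hunk above teaches __skb_flow_dissect() to parse Ethernet/IPv4 ARP and RARP frames: it validates the fixed ARP header, then copies the sender/target addresses and the low byte of the opcode into the new FLOW_DISSECTOR_KEY_ARP key. The sketch below reproduces only the header sanity checks over a plain buffer in userspace; struct arp_fixed and arp_header_ok() are assumptions for illustration, and the numeric constants stand in for ARPHRD_ETHER, ETH_P_IP, ARPOP_REQUEST and ARPOP_REPLY.

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct arp_fixed {			/* fixed part of an Ethernet/IPv4 ARP header */
	uint16_t ar_hrd, ar_pro;
	uint8_t  ar_hln, ar_pln;
	uint16_t ar_op;
} __attribute__((packed));

static bool arp_header_ok(const uint8_t *buf, size_t len)
{
	struct arp_fixed arp;

	if (len < sizeof(arp))
		return false;
	memcpy(&arp, buf, sizeof(arp));	/* avoid unaligned loads from the buffer */
	return arp.ar_hrd == htons(1)		/* ARPHRD_ETHER */ &&
	       arp.ar_pro == htons(0x0800)	/* ETH_P_IP */ &&
	       arp.ar_hln == 6 && arp.ar_pln == 4 &&
	       (arp.ar_op == htons(1) || arp.ar_op == htons(2));
}

int main(void)
{
	uint8_t req[8] = { 0x00, 0x01, 0x08, 0x00, 0x06, 0x04, 0x00, 0x01 };

	printf("valid: %d\n", arp_header_ok(req, sizeof(req)));
	return 0;
}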
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
new file mode 100644
index 000000000000..c98bbfbd26b8
--- /dev/null
+++ b/net/core/gro_cells.c
@@ -0,0 +1,92 @@
1#include <linux/skbuff.h>
2#include <linux/slab.h>
3#include <linux/netdevice.h>
4#include <net/gro_cells.h>
5
6struct gro_cell {
7 struct sk_buff_head napi_skbs;
8 struct napi_struct napi;
9};
10
11int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
12{
13 struct net_device *dev = skb->dev;
14 struct gro_cell *cell;
15
16 if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO))
17 return netif_rx(skb);
18
19 cell = this_cpu_ptr(gcells->cells);
20
21 if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
22 atomic_long_inc(&dev->rx_dropped);
23 kfree_skb(skb);
24 return NET_RX_DROP;
25 }
26
27 __skb_queue_tail(&cell->napi_skbs, skb);
28 if (skb_queue_len(&cell->napi_skbs) == 1)
29 napi_schedule(&cell->napi);
30 return NET_RX_SUCCESS;
31}
32EXPORT_SYMBOL(gro_cells_receive);
33
34/* called under BH context */
35static int gro_cell_poll(struct napi_struct *napi, int budget)
36{
37 struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
38 struct sk_buff *skb;
39 int work_done = 0;
40
41 while (work_done < budget) {
42 skb = __skb_dequeue(&cell->napi_skbs);
43 if (!skb)
44 break;
45 napi_gro_receive(napi, skb);
46 work_done++;
47 }
48
49 if (work_done < budget)
50 napi_complete_done(napi, work_done);
51 return work_done;
52}
53
54int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
55{
56 int i;
57
58 gcells->cells = alloc_percpu(struct gro_cell);
59 if (!gcells->cells)
60 return -ENOMEM;
61
62 for_each_possible_cpu(i) {
63 struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
64
65 __skb_queue_head_init(&cell->napi_skbs);
66
67 set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
68
69 netif_napi_add(dev, &cell->napi, gro_cell_poll,
70 NAPI_POLL_WEIGHT);
71 napi_enable(&cell->napi);
72 }
73 return 0;
74}
75EXPORT_SYMBOL(gro_cells_init);
76
77void gro_cells_destroy(struct gro_cells *gcells)
78{
79 int i;
80
81 if (!gcells->cells)
82 return;
83 for_each_possible_cpu(i) {
84 struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
85
86 netif_napi_del(&cell->napi);
87 __skb_queue_purge(&cell->napi_skbs);
88 }
89 free_percpu(gcells->cells);
90 gcells->cells = NULL;
91}
92EXPORT_SYMBOL(gro_cells_destroy);
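gro_cells.c, added here as a standalone file, keeps one NAPI context and skb queue per CPU: gro_cells_receive() enqueues to the local cell, drops once the queue exceeds netdev_max_backlog, and schedules the poller only when the first packet arrives; gro_cell_poll() then drains up to the budget through napi_gro_receive(). The userspace sketch below mimics only that enqueue/kick/drain bookkeeping, with plain counters standing in for the skb queue and the NAPI state.

#include <stdbool.h>
#include <stdio.h>

#define BACKLOG_LIMIT 1000		/* stand-in for netdev_max_backlog */

struct cell {
	int queued;			/* stand-in for the skb queue length */
	bool poll_pending;		/* stand-in for napi_schedule() having run */
};

static bool cell_receive(struct cell *c)
{
	if (c->queued > BACKLOG_LIMIT)
		return false;		/* drop, like the backlog check above */
	if (++c->queued == 1)
		c->poll_pending = true;	/* only the first packet kicks the poller */
	return true;
}

static int cell_poll(struct cell *c, int budget)
{
	int done = 0;

	while (done < budget && c->queued) {
		c->queued--;		/* would be napi_gro_receive() */
		done++;
	}
	if (done < budget)
		c->poll_pending = false;	/* queue drained: complete the poll */
	return done;
}

int main(void)
{
	struct cell c = { 0 };
	int i;

	for (i = 0; i < 5; i++)
		cell_receive(&c);
	printf("polled %d, pending=%d\n", cell_poll(&c, 64), c.poll_pending);
	return 0;
}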
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index b3eef90b2df9..0cfe7b0216c3 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -237,7 +237,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
237 [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, 237 [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
238}; 238};
239 239
240static int bpf_build_state(struct net_device *dev, struct nlattr *nla, 240static int bpf_build_state(struct nlattr *nla,
241 unsigned int family, const void *cfg, 241 unsigned int family, const void *cfg,
242 struct lwtunnel_state **ts) 242 struct lwtunnel_state **ts)
243{ 243{
@@ -352,7 +352,7 @@ static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
352 0; 352 0;
353} 353}
354 354
355int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) 355static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
356{ 356{
357 /* FIXME: 357 /* FIXME:
358 * The LWT state is currently rebuilt for delete requests which 358 * The LWT state is currently rebuilt for delete requests which
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index c23465005f2f..6df9f8fabf0c 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -101,7 +101,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
101} 101}
102EXPORT_SYMBOL(lwtunnel_encap_del_ops); 102EXPORT_SYMBOL(lwtunnel_encap_del_ops);
103 103
104int lwtunnel_build_state(struct net_device *dev, u16 encap_type, 104int lwtunnel_build_state(u16 encap_type,
105 struct nlattr *encap, unsigned int family, 105 struct nlattr *encap, unsigned int family,
106 const void *cfg, struct lwtunnel_state **lws) 106 const void *cfg, struct lwtunnel_state **lws)
107{ 107{
@@ -116,7 +116,7 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
116 rcu_read_lock(); 116 rcu_read_lock();
117 ops = rcu_dereference(lwtun_encaps[encap_type]); 117 ops = rcu_dereference(lwtun_encaps[encap_type]);
118 if (likely(ops && ops->build_state && try_module_get(ops->owner))) { 118 if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
119 ret = ops->build_state(dev, encap, family, cfg, lws); 119 ret = ops->build_state(encap, family, cfg, lws);
120 if (ret) 120 if (ret)
121 module_put(ops->owner); 121 module_put(ops->owner);
122 } 122 }
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 7bb12e07ffef..4526cbd7e28a 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -860,7 +860,8 @@ static void neigh_probe(struct neighbour *neigh)
860 if (skb) 860 if (skb)
861 skb = skb_clone(skb, GFP_ATOMIC); 861 skb = skb_clone(skb, GFP_ATOMIC);
862 write_unlock(&neigh->lock); 862 write_unlock(&neigh->lock);
863 neigh->ops->solicit(neigh, skb); 863 if (neigh->ops->solicit)
864 neigh->ops->solicit(neigh, skb);
864 atomic_inc(&neigh->probes); 865 atomic_inc(&neigh->probes);
865 kfree_skb(skb); 866 kfree_skb(skb);
866} 867}
@@ -2923,7 +2924,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
2923 return; 2924 return;
2924 2925
2925 set_bit(index, p->data_state); 2926 set_bit(index, p->data_state);
2926 call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); 2927 if (index == NEIGH_VAR_DELAY_PROBE_TIME)
2928 call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
2927 if (!dev) /* NULL dev means this is default value */ 2929 if (!dev) /* NULL dev means this is default value */
2928 neigh_copy_dflt_parms(net, p, index); 2930 neigh_copy_dflt_parms(net, p, index);
2929} 2931}
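The neighbour.c hunks make two guards explicit: neigh_probe() now tolerates neighbour ops that do not implement ->solicit, and the netevent notifier fires only when the variable actually written is NEIGH_VAR_DELAY_PROBE_TIME. A minimal sketch of the first guard, with an invented struct ops, is shown below.

#include <stdio.h>

struct ops {
	void (*solicit)(const char *who);
};

static void say(const char *who)
{
	printf("soliciting %s\n", who);
}

static void probe(const struct ops *ops)
{
	if (ops->solicit)		/* tolerate ops without a solicit method */
		ops->solicit("peer");
	/* the probe counter would still be incremented here */
}

int main(void)
{
	struct ops with = { .solicit = say };
	struct ops without = { 0 };

	probe(&with);
	probe(&without);
	return 0;
}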
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b0c04cf4851d..65ea0ff4017c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -15,6 +15,7 @@
15#include <net/switchdev.h> 15#include <net/switchdev.h>
16#include <linux/if_arp.h> 16#include <linux/if_arp.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/sched/signal.h>
18#include <linux/nsproxy.h> 19#include <linux/nsproxy.h>
19#include <net/sock.h> 20#include <net/sock.h>
20#include <net/net_namespace.h> 21#include <net/net_namespace.h>
@@ -952,7 +953,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
952 while (--i >= new_num) { 953 while (--i >= new_num) {
953 struct kobject *kobj = &dev->_rx[i].kobj; 954 struct kobject *kobj = &dev->_rx[i].kobj;
954 955
955 if (!list_empty(&dev_net(dev)->exit_list)) 956 if (!atomic_read(&dev_net(dev)->count))
956 kobj->uevent_suppress = 1; 957 kobj->uevent_suppress = 1;
957 if (dev->sysfs_rx_queue_group) 958 if (dev->sysfs_rx_queue_group)
958 sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); 959 sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
@@ -1370,7 +1371,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
1370 while (--i >= new_num) { 1371 while (--i >= new_num) {
1371 struct netdev_queue *queue = dev->_tx + i; 1372 struct netdev_queue *queue = dev->_tx + i;
1372 1373
1373 if (!list_empty(&dev_net(dev)->exit_list)) 1374 if (!atomic_read(&dev_net(dev)->count))
1374 queue->kobj.uevent_suppress = 1; 1375 queue->kobj.uevent_suppress = 1;
1375#ifdef CONFIG_BQL 1376#ifdef CONFIG_BQL
1376 sysfs_remove_group(&queue->kobj, &dql_group); 1377 sysfs_remove_group(&queue->kobj, &dql_group);
@@ -1557,7 +1558,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
1557{ 1558{
1558 struct device *dev = &(ndev->dev); 1559 struct device *dev = &(ndev->dev);
1559 1560
1560 if (!list_empty(&dev_net(ndev)->exit_list)) 1561 if (!atomic_read(&dev_net(ndev)->count))
1561 dev_set_uevent_suppress(dev, 1); 1562 dev_set_uevent_suppress(dev, 1);
1562 1563
1563 kobject_get(&dev->kobj); 1564 kobject_get(&dev->kobj);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3c4bbec39713..652468ff65b7 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -16,6 +16,8 @@
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/net_namespace.h> 18#include <linux/net_namespace.h>
19#include <linux/sched/task.h>
20
19#include <net/sock.h> 21#include <net/sock.h>
20#include <net/netlink.h> 22#include <net/netlink.h>
21#include <net/net_namespace.h> 23#include <net/net_namespace.h>
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 11fce17274f6..029a61ac6cdd 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -12,6 +12,8 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/cgroup.h> 13#include <linux/cgroup.h>
14#include <linux/fdtable.h> 14#include <linux/fdtable.h>
15#include <linux/sched/task.h>
16
15#include <net/cls_cgroup.h> 17#include <net/cls_cgroup.h>
16#include <net/sock.h> 18#include <net/sock.h>
17 19
@@ -69,27 +71,17 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
69 return 0; 71 return 0;
70} 72}
71 73
72static void update_classid(struct cgroup_subsys_state *css, void *v) 74static void cgrp_attach(struct cgroup_taskset *tset)
73{ 75{
74 struct css_task_iter it; 76 struct cgroup_subsys_state *css;
75 struct task_struct *p; 77 struct task_struct *p;
76 78
77 css_task_iter_start(css, &it); 79 cgroup_taskset_for_each(p, css, tset) {
78 while ((p = css_task_iter_next(&it))) {
79 task_lock(p); 80 task_lock(p);
80 iterate_fd(p->files, 0, update_classid_sock, v); 81 iterate_fd(p->files, 0, update_classid_sock,
82 (void *)(unsigned long)css_cls_state(css)->classid);
81 task_unlock(p); 83 task_unlock(p);
82 } 84 }
83 css_task_iter_end(&it);
84}
85
86static void cgrp_attach(struct cgroup_taskset *tset)
87{
88 struct cgroup_subsys_state *css;
89
90 cgroup_taskset_first(tset, &css);
91 update_classid(css,
92 (void *)(unsigned long)css_cls_state(css)->classid);
93} 85}
94 86
95static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) 87static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -101,12 +93,22 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
101 u64 value) 93 u64 value)
102{ 94{
103 struct cgroup_cls_state *cs = css_cls_state(css); 95 struct cgroup_cls_state *cs = css_cls_state(css);
96 struct css_task_iter it;
97 struct task_struct *p;
104 98
105 cgroup_sk_alloc_disable(); 99 cgroup_sk_alloc_disable();
106 100
107 cs->classid = (u32)value; 101 cs->classid = (u32)value;
108 102
109 update_classid(css, (void *)(unsigned long)cs->classid); 103 css_task_iter_start(css, &it);
104 while ((p = css_task_iter_next(&it))) {
105 task_lock(p);
106 iterate_fd(p->files, 0, update_classid_sock,
107 (void *)(unsigned long)cs->classid);
108 task_unlock(p);
109 }
110 css_task_iter_end(&it);
111
110 return 0; 112 return 0;
111} 113}
112 114
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 9424673009c1..29be2466970c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -105,15 +105,21 @@ static void queue_process(struct work_struct *work)
105 while ((skb = skb_dequeue(&npinfo->txq))) { 105 while ((skb = skb_dequeue(&npinfo->txq))) {
106 struct net_device *dev = skb->dev; 106 struct net_device *dev = skb->dev;
107 struct netdev_queue *txq; 107 struct netdev_queue *txq;
108 unsigned int q_index;
108 109
109 if (!netif_device_present(dev) || !netif_running(dev)) { 110 if (!netif_device_present(dev) || !netif_running(dev)) {
110 kfree_skb(skb); 111 kfree_skb(skb);
111 continue; 112 continue;
112 } 113 }
113 114
114 txq = skb_get_tx_queue(dev, skb);
115
116 local_irq_save(flags); 115 local_irq_save(flags);
116 /* check if skb->queue_mapping is still valid */
117 q_index = skb_get_queue_mapping(skb);
118 if (unlikely(q_index >= dev->real_num_tx_queues)) {
119 q_index = q_index % dev->real_num_tx_queues;
120 skb_set_queue_mapping(skb, q_index);
121 }
122 txq = netdev_get_tx_queue(dev, q_index);
117 HARD_TX_LOCK(dev, txq, smp_processor_id()); 123 HARD_TX_LOCK(dev, txq, smp_processor_id());
118 if (netif_xmit_frozen_or_stopped(txq) || 124 if (netif_xmit_frozen_or_stopped(txq) ||
119 netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) { 125 netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
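The netpoll change revalidates the queue index saved in skb->queue_mapping before looking up the TX queue, because the device may have been reconfigured with fewer real TX queues since the skb was queued. A tiny sketch of that clamp, with an assumed pick_txq() helper, follows.

#include <stdio.h>

static unsigned int pick_txq(unsigned int saved_index,
			     unsigned int real_num_tx_queues)
{
	/* a stale mapping past the current queue count is folded back in
	 * range; assumes real_num_tx_queues > 0
	 */
	if (saved_index >= real_num_tx_queues)
		saved_index %= real_num_tx_queues;
	return saved_index;
}

int main(void)
{
	printf("%u\n", pick_txq(7, 4));	/* device shrank to 4 queues -> use 3 */
	return 0;
}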
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 2ec86fc552df..0f9275ee5595 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -13,12 +13,15 @@
13 13
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/types.h> 15#include <linux/types.h>
16#include <linux/module.h>
16#include <linux/string.h> 17#include <linux/string.h>
17#include <linux/errno.h> 18#include <linux/errno.h>
18#include <linux/skbuff.h> 19#include <linux/skbuff.h>
19#include <linux/cgroup.h> 20#include <linux/cgroup.h>
20#include <linux/rcupdate.h> 21#include <linux/rcupdate.h>
21#include <linux/atomic.h> 22#include <linux/atomic.h>
23#include <linux/sched/task.h>
24
22#include <net/rtnetlink.h> 25#include <net/rtnetlink.h>
23#include <net/pkt_cls.h> 26#include <net/pkt_cls.h>
24#include <net/sock.h> 27#include <net/sock.h>
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8e69ce472236..96947f5d41e4 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3439,9 +3439,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
3439 /* skb was 'freed' by stack, so clean few 3439 /* skb was 'freed' by stack, so clean few
3440 * bits and reuse it 3440 * bits and reuse it
3441 */ 3441 */
3442#ifdef CONFIG_NET_CLS_ACT 3442 skb_reset_tc(skb);
3443 skb->tc_verd = 0; /* reset reclass/redir ttl */
3444#endif
3445 } while (--burst > 0); 3443 } while (--burst > 0);
3446 goto out; /* Skips xmit_mode M_START_XMIT */ 3444 goto out; /* Skips xmit_mode M_START_XMIT */
3447 } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { 3445 } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 5d26056b6d8f..9b8727c67b58 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -34,8 +34,6 @@
34 * and it will increase in proportion to the memory of machine. 34 * and it will increase in proportion to the memory of machine.
35 * Note : Dont forget somaxconn that may limit backlog too. 35 * Note : Dont forget somaxconn that may limit backlog too.
36 */ 36 */
37int sysctl_max_syn_backlog = 256;
38EXPORT_SYMBOL(sysctl_max_syn_backlog);
39 37
40void reqsk_queue_alloc(struct request_sock_queue *queue) 38void reqsk_queue_alloc(struct request_sock_queue *queue)
41{ 39{
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 75e3ea7bda08..c4e84c558240 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -837,8 +837,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
837static inline int rtnl_vfinfo_size(const struct net_device *dev, 837static inline int rtnl_vfinfo_size(const struct net_device *dev,
838 u32 ext_filter_mask) 838 u32 ext_filter_mask)
839{ 839{
840 if (dev->dev.parent && dev_is_pci(dev->dev.parent) && 840 if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) {
841 (ext_filter_mask & RTEXT_FILTER_VF)) {
842 int num_vfs = dev_num_vf(dev->dev.parent); 841 int num_vfs = dev_num_vf(dev->dev.parent);
843 size_t size = nla_total_size(0); 842 size_t size = nla_total_size(0);
844 size += num_vfs * 843 size += num_vfs *
@@ -877,8 +876,6 @@ static size_t rtnl_port_size(const struct net_device *dev,
877{ 876{
878 size_t port_size = nla_total_size(4) /* PORT_VF */ 877 size_t port_size = nla_total_size(4) /* PORT_VF */
879 + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ 878 + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
880 + nla_total_size(sizeof(struct ifla_port_vsi))
881 /* PORT_VSI_TYPE */
882 + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ 879 + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */
883 + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ 880 + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */
884 + nla_total_size(1) /* PROT_VDP_REQUEST */ 881 + nla_total_size(1) /* PROT_VDP_REQUEST */
@@ -1492,14 +1489,19 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
1492 [IFLA_PORT_VF] = { .type = NLA_U32 }, 1489 [IFLA_PORT_VF] = { .type = NLA_U32 },
1493 [IFLA_PORT_PROFILE] = { .type = NLA_STRING, 1490 [IFLA_PORT_PROFILE] = { .type = NLA_STRING,
1494 .len = PORT_PROFILE_MAX }, 1491 .len = PORT_PROFILE_MAX },
1495 [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
1496 .len = sizeof(struct ifla_port_vsi)},
1497 [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, 1492 [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
1498 .len = PORT_UUID_MAX }, 1493 .len = PORT_UUID_MAX },
1499 [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, 1494 [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING,
1500 .len = PORT_UUID_MAX }, 1495 .len = PORT_UUID_MAX },
1501 [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, 1496 [IFLA_PORT_REQUEST] = { .type = NLA_U8, },
1502 [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, 1497 [IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
1498
1499 /* Unused, but we need to keep it here since user space could
1500 * fill it. It's also broken with regard to NLA_BINARY use in
1501 * combination with structs.
1502 */
1503 [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
1504 .len = sizeof(struct ifla_port_vsi) },
1503}; 1505};
1504 1506
1505static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { 1507static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
@@ -2356,7 +2358,6 @@ struct net_device *rtnl_create_link(struct net *net,
2356 const char *ifname, unsigned char name_assign_type, 2358 const char *ifname, unsigned char name_assign_type,
2357 const struct rtnl_link_ops *ops, struct nlattr *tb[]) 2359 const struct rtnl_link_ops *ops, struct nlattr *tb[])
2358{ 2360{
2359 int err;
2360 struct net_device *dev; 2361 struct net_device *dev;
2361 unsigned int num_tx_queues = 1; 2362 unsigned int num_tx_queues = 1;
2362 unsigned int num_rx_queues = 1; 2363 unsigned int num_rx_queues = 1;
@@ -2371,11 +2372,10 @@ struct net_device *rtnl_create_link(struct net *net,
2371 else if (ops->get_num_rx_queues) 2372 else if (ops->get_num_rx_queues)
2372 num_rx_queues = ops->get_num_rx_queues(); 2373 num_rx_queues = ops->get_num_rx_queues();
2373 2374
2374 err = -ENOMEM;
2375 dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, 2375 dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
2376 ops->setup, num_tx_queues, num_rx_queues); 2376 ops->setup, num_tx_queues, num_rx_queues);
2377 if (!dev) 2377 if (!dev)
2378 goto err; 2378 return ERR_PTR(-ENOMEM);
2379 2379
2380 dev_net_set(dev, net); 2380 dev_net_set(dev, net);
2381 dev->rtnl_link_ops = ops; 2381 dev->rtnl_link_ops = ops;
@@ -2401,9 +2401,6 @@ struct net_device *rtnl_create_link(struct net *net,
2401 dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); 2401 dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
2402 2402
2403 return dev; 2403 return dev;
2404
2405err:
2406 return ERR_PTR(err);
2407} 2404}
2408EXPORT_SYMBOL(rtnl_create_link); 2405EXPORT_SYMBOL(rtnl_create_link);
2409 2406
@@ -2571,7 +2568,7 @@ replay:
2571 return -ENODEV; 2568 return -ENODEV;
2572 } 2569 }
2573 2570
2574 if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) 2571 if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
2575 return -EOPNOTSUPP; 2572 return -EOPNOTSUPP;
2576 2573
2577 if (!ops) { 2574 if (!ops) {
@@ -2653,6 +2650,11 @@ replay:
2653 if (err < 0) 2650 if (err < 0)
2654 goto out_unregister; 2651 goto out_unregister;
2655 } 2652 }
2653 if (tb[IFLA_MASTER]) {
2654 err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
2655 if (err)
2656 goto out_unregister;
2657 }
2656out: 2658out:
2657 if (link_net) 2659 if (link_net)
2658 put_net(link_net); 2660 put_net(link_net);
@@ -3829,6 +3831,39 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
3829 *idxattr = 0; 3831 *idxattr = 0;
3830 } 3832 }
3831 3833
3834 if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) {
3835 struct rtnl_af_ops *af_ops;
3836
3837 *idxattr = IFLA_STATS_AF_SPEC;
3838 attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC);
3839 if (!attr)
3840 goto nla_put_failure;
3841
3842 list_for_each_entry(af_ops, &rtnl_af_ops, list) {
3843 if (af_ops->fill_stats_af) {
3844 struct nlattr *af;
3845 int err;
3846
3847 af = nla_nest_start(skb, af_ops->family);
3848 if (!af)
3849 goto nla_put_failure;
3850
3851 err = af_ops->fill_stats_af(skb, dev);
3852
3853 if (err == -ENODATA)
3854 nla_nest_cancel(skb, af);
3855 else if (err < 0)
3856 goto nla_put_failure;
3857
3858 nla_nest_end(skb, af);
3859 }
3860 }
3861
3862 nla_nest_end(skb, attr);
3863
3864 *idxattr = 0;
3865 }
3866
3832 nlmsg_end(skb, nlh); 3867 nlmsg_end(skb, nlh);
3833 3868
3834 return 0; 3869 return 0;
@@ -3885,6 +3920,23 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
3885 if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) 3920 if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
3886 size += rtnl_get_offload_stats_size(dev); 3921 size += rtnl_get_offload_stats_size(dev);
3887 3922
3923 if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
3924 struct rtnl_af_ops *af_ops;
3925
3926 /* for IFLA_STATS_AF_SPEC */
3927 size += nla_total_size(0);
3928
3929 list_for_each_entry(af_ops, &rtnl_af_ops, list) {
3930 if (af_ops->get_stats_af_size) {
3931 size += nla_total_size(
3932 af_ops->get_stats_af_size(dev));
3933
3934 /* for AF_* */
3935 size += nla_total_size(0);
3936 }
3937 }
3938 }
3939
3888 return size; 3940 return size;
3889} 3941}
3890 3942
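The IFLA_STATS_AF_SPEC hunks above let each address family contribute per-AF statistics through the fill_stats_af/get_stats_af_size callbacks, with -ENODATA cancelling an empty nest. A minimal sketch of what a per-family implementation could look like; the attribute id, the counter and the AF_UNSPEC placeholder are hypothetical, only the callback shapes come from the diff.

#include <linux/socket.h>
#include <net/netlink.h>
#include <net/rtnetlink.h>

/* Hypothetical per-family stats callbacks.  Returning -ENODATA tells the
 * caller above to cancel the per-AF nest instead of failing the dump.
 */
static int example_fill_stats_af(struct sk_buff *skb,
				 const struct net_device *dev)
{
	u32 drops = 0;	/* placeholder; a real AF would read its counters */

	if (!drops)
		return -ENODATA;

	return nla_put_u32(skb, 1 /* hypothetical attribute id */, drops);
}

static size_t example_get_stats_af_size(const struct net_device *dev)
{
	return nla_total_size(sizeof(u32));	/* one u32 attribute */
}

static struct rtnl_af_ops example_af_ops = {
	.family			= AF_UNSPEC,	/* placeholder family */
	.fill_stats_af		= example_fill_stats_af,
	.get_stats_af_size	= example_get_stats_af_size,
};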
diff --git a/net/core/scm.c b/net/core/scm.c
index d8820438ba37..b1ff8a441748 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -14,6 +14,7 @@
14#include <linux/capability.h> 14#include <linux/capability.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/sched/user.h>
17#include <linux/mm.h> 18#include <linux/mm.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/stat.h> 20#include <linux/stat.h>
@@ -71,7 +72,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
71 struct file **fpp; 72 struct file **fpp;
72 int i, num; 73 int i, num;
73 74
74 num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); 75 num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int);
75 76
76 if (num <= 0) 77 if (num <= 0)
77 return 0; 78 return 0;
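For context on the scm_fp_copy() change above (deriving the file-descriptor count from cmsg_len), here is a minimal userspace sketch of how a sender typically builds an SCM_RIGHTS control message with the standard CMSG_* macros; send_one_fd() and its dummy payload byte are illustrative and not part of the patch.

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Sender side of SCM_RIGHTS: cmsg_len is filled with CMSG_LEN(), which
 * is the length the receive path above subtracts the header from to
 * recover the descriptor count.  Error handling omitted.
 */
static ssize_t send_one_fd(int sock, int fd)
{
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	char dummy = 'x';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;

	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = u.buf;
	msg.msg_controllen = sizeof(u.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return sendmsg(sock, &msg, 0);
}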
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 88a8e429fc3e..d28da7d363f1 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -1,3 +1,7 @@
1/*
2 * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
3 */
4
1#include <linux/kernel.h> 5#include <linux/kernel.h>
2#include <linux/init.h> 6#include <linux/init.h>
3#include <linux/cryptohash.h> 7#include <linux/cryptohash.h>
@@ -8,18 +12,20 @@
8#include <linux/ktime.h> 12#include <linux/ktime.h>
9#include <linux/string.h> 13#include <linux/string.h>
10#include <linux/net.h> 14#include <linux/net.h>
11 15#include <linux/siphash.h>
12#include <net/secure_seq.h> 16#include <net/secure_seq.h>
13 17
14#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) 18#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
19#include <linux/in6.h>
15#include <net/tcp.h> 20#include <net/tcp.h>
16#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
17 21
18static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; 22static siphash_key_t net_secret __read_mostly;
23static siphash_key_t ts_secret __read_mostly;
19 24
20static __always_inline void net_secret_init(void) 25static __always_inline void net_secret_init(void)
21{ 26{
22 net_get_random_once(net_secret, sizeof(net_secret)); 27 net_get_random_once(&ts_secret, sizeof(ts_secret));
28 net_get_random_once(&net_secret, sizeof(net_secret));
23} 29}
24#endif 30#endif
25 31
@@ -41,83 +47,98 @@ static u32 seq_scale(u32 seq)
41#endif 47#endif
42 48
43#if IS_ENABLED(CONFIG_IPV6) 49#if IS_ENABLED(CONFIG_IPV6)
50static u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
51{
52 const struct {
53 struct in6_addr saddr;
54 struct in6_addr daddr;
55 } __aligned(SIPHASH_ALIGNMENT) combined = {
56 .saddr = *(struct in6_addr *)saddr,
57 .daddr = *(struct in6_addr *)daddr,
58 };
59
60 if (sysctl_tcp_timestamps != 1)
61 return 0;
62
63 return siphash(&combined, offsetofend(typeof(combined), daddr),
64 &ts_secret);
65}
66
44u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, 67u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
45 __be16 sport, __be16 dport, u32 *tsoff) 68 __be16 sport, __be16 dport, u32 *tsoff)
46{ 69{
47 u32 secret[MD5_MESSAGE_BYTES / 4]; 70 const struct {
48 u32 hash[MD5_DIGEST_WORDS]; 71 struct in6_addr saddr;
49 u32 i; 72 struct in6_addr daddr;
50 73 __be16 sport;
74 __be16 dport;
75 } __aligned(SIPHASH_ALIGNMENT) combined = {
76 .saddr = *(struct in6_addr *)saddr,
77 .daddr = *(struct in6_addr *)daddr,
78 .sport = sport,
79 .dport = dport
80 };
81 u64 hash;
51 net_secret_init(); 82 net_secret_init();
52 memcpy(hash, saddr, 16); 83 hash = siphash(&combined, offsetofend(typeof(combined), dport),
53 for (i = 0; i < 4; i++) 84 &net_secret);
54 secret[i] = net_secret[i] + (__force u32)daddr[i]; 85 *tsoff = secure_tcpv6_ts_off(saddr, daddr);
55 secret[4] = net_secret[4] + 86 return seq_scale(hash);
56 (((__force u16)sport << 16) + (__force u16)dport);
57 for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
58 secret[i] = net_secret[i];
59
60 md5_transform(hash, secret);
61
62 *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
63 return seq_scale(hash[0]);
64} 87}
65EXPORT_SYMBOL(secure_tcpv6_sequence_number); 88EXPORT_SYMBOL(secure_tcpv6_sequence_number);
66 89
67u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, 90u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
68 __be16 dport) 91 __be16 dport)
69{ 92{
70 u32 secret[MD5_MESSAGE_BYTES / 4]; 93 const struct {
71 u32 hash[MD5_DIGEST_WORDS]; 94 struct in6_addr saddr;
72 u32 i; 95 struct in6_addr daddr;
73 96 __be16 dport;
97 } __aligned(SIPHASH_ALIGNMENT) combined = {
98 .saddr = *(struct in6_addr *)saddr,
99 .daddr = *(struct in6_addr *)daddr,
100 .dport = dport
101 };
74 net_secret_init(); 102 net_secret_init();
75 memcpy(hash, saddr, 16); 103 return siphash(&combined, offsetofend(typeof(combined), dport),
76 for (i = 0; i < 4; i++) 104 &net_secret);
77 secret[i] = net_secret[i] + (__force u32) daddr[i];
78 secret[4] = net_secret[4] + (__force u32)dport;
79 for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
80 secret[i] = net_secret[i];
81
82 md5_transform(hash, secret);
83
84 return hash[0];
85} 105}
86EXPORT_SYMBOL(secure_ipv6_port_ephemeral); 106EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
87#endif 107#endif
88 108
89#ifdef CONFIG_INET 109#ifdef CONFIG_INET
110static u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr)
111{
112 if (sysctl_tcp_timestamps != 1)
113 return 0;
114
115 return siphash_2u32((__force u32)saddr, (__force u32)daddr,
116 &ts_secret);
117}
118
119/* secure_tcp_sequence_number(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
120 * but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
121 * it would be easy enough to have the former function use siphash_4u32, passing
122 * the arguments as separate u32.
123 */
90 124
91u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, 125u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
92 __be16 sport, __be16 dport, u32 *tsoff) 126 __be16 sport, __be16 dport, u32 *tsoff)
93{ 127{
94 u32 hash[MD5_DIGEST_WORDS]; 128 u64 hash;
95
96 net_secret_init(); 129 net_secret_init();
97 hash[0] = (__force u32)saddr; 130 hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
98 hash[1] = (__force u32)daddr; 131 (__force u32)sport << 16 | (__force u32)dport,
99 hash[2] = ((__force u16)sport << 16) + (__force u16)dport; 132 &net_secret);
100 hash[3] = net_secret[15]; 133 *tsoff = secure_tcp_ts_off(saddr, daddr);
101 134 return seq_scale(hash);
102 md5_transform(hash, net_secret);
103
104 *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
105 return seq_scale(hash[0]);
106} 135}
107 136
108u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) 137u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
109{ 138{
110 u32 hash[MD5_DIGEST_WORDS];
111
112 net_secret_init(); 139 net_secret_init();
113 hash[0] = (__force u32)saddr; 140 return siphash_3u32((__force u32)saddr, (__force u32)daddr,
114 hash[1] = (__force u32)daddr; 141 (__force u16)dport, &net_secret);
115 hash[2] = (__force u32)dport ^ net_secret[14];
116 hash[3] = net_secret[15];
117
118 md5_transform(hash, net_secret);
119
120 return hash[0];
121} 142}
122EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); 143EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
123#endif 144#endif
@@ -126,21 +147,13 @@ EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
126u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, 147u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
127 __be16 sport, __be16 dport) 148 __be16 sport, __be16 dport)
128{ 149{
129 u32 hash[MD5_DIGEST_WORDS];
130 u64 seq; 150 u64 seq;
131
132 net_secret_init(); 151 net_secret_init();
133 hash[0] = (__force u32)saddr; 152 seq = siphash_3u32((__force u32)saddr, (__force u32)daddr,
134 hash[1] = (__force u32)daddr; 153 (__force u32)sport << 16 | (__force u32)dport,
135 hash[2] = ((__force u16)sport << 16) + (__force u16)dport; 154 &net_secret);
136 hash[3] = net_secret[15];
137
138 md5_transform(hash, net_secret);
139
140 seq = hash[0] | (((u64)hash[1]) << 32);
141 seq += ktime_get_real_ns(); 155 seq += ktime_get_real_ns();
142 seq &= (1ull << 48) - 1; 156 seq &= (1ull << 48) - 1;
143
144 return seq; 157 return seq;
145} 158}
146EXPORT_SYMBOL(secure_dccp_sequence_number); 159EXPORT_SYMBOL(secure_dccp_sequence_number);
@@ -149,26 +162,23 @@ EXPORT_SYMBOL(secure_dccp_sequence_number);
149u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, 162u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
150 __be16 sport, __be16 dport) 163 __be16 sport, __be16 dport)
151{ 164{
152 u32 secret[MD5_MESSAGE_BYTES / 4]; 165 const struct {
153 u32 hash[MD5_DIGEST_WORDS]; 166 struct in6_addr saddr;
167 struct in6_addr daddr;
168 __be16 sport;
169 __be16 dport;
170 } __aligned(SIPHASH_ALIGNMENT) combined = {
171 .saddr = *(struct in6_addr *)saddr,
172 .daddr = *(struct in6_addr *)daddr,
173 .sport = sport,
174 .dport = dport
175 };
154 u64 seq; 176 u64 seq;
155 u32 i;
156
157 net_secret_init(); 177 net_secret_init();
158 memcpy(hash, saddr, 16); 178 seq = siphash(&combined, offsetofend(typeof(combined), dport),
159 for (i = 0; i < 4; i++) 179 &net_secret);
160 secret[i] = net_secret[i] + (__force u32)daddr[i];
161 secret[4] = net_secret[4] +
162 (((__force u16)sport << 16) + (__force u16)dport);
163 for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
164 secret[i] = net_secret[i];
165
166 md5_transform(hash, secret);
167
168 seq = hash[0] | (((u64)hash[1]) << 32);
169 seq += ktime_get_real_ns(); 180 seq += ktime_get_real_ns();
170 seq &= (1ull << 48) - 1; 181 seq &= (1ull << 48) - 1;
171
172 return seq; 182 return seq;
173} 183}
174EXPORT_SYMBOL(secure_dccpv6_sequence_number); 184EXPORT_SYMBOL(secure_dccpv6_sequence_number);
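The secure_seq.c rewrite above replaces the MD5-based mixing with SipHash keyed by net_secret/ts_secret, packing the address tuple into a SIPHASH_ALIGNMENT-aligned struct and hashing only up to offsetofend() of the last member. A minimal sketch of the same pattern with a hypothetical key and helper name (example_secret, example_tuple_hash):

#include <linux/cache.h>
#include <linux/in6.h>
#include <linux/net.h>
#include <linux/siphash.h>
#include <linux/stddef.h>

static siphash_key_t example_secret __read_mostly;

/* Pack the tuple into an aligned struct and hash only the bytes up to
 * the last member, so structure padding never reaches the hash input.
 */
static u64 example_tuple_hash(const struct in6_addr *saddr,
			      const struct in6_addr *daddr,
			      __be16 sport, __be16 dport)
{
	const struct {
		struct in6_addr saddr;
		struct in6_addr daddr;
		__be16 sport;
		__be16 dport;
	} __aligned(SIPHASH_ALIGNMENT) combined = {
		.saddr = *saddr,
		.daddr = *daddr,
		.sport = sport,
		.dport = dport,
	};

	net_get_random_once(&example_secret, sizeof(example_secret));
	return siphash(&combined, offsetofend(typeof(combined), dport),
		       &example_secret);
}

Hashing offsetofend(..., dport) bytes rather than sizeof(combined) is what keeps any trailing padding out of the input, which is why the patch uses the same idiom everywhere.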
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 734c71468b01..f86bf69cfb8d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -271,7 +271,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
271 atomic_set(&fclones->fclone_ref, 1); 271 atomic_set(&fclones->fclone_ref, 1);
272 272
273 fclones->skb2.fclone = SKB_FCLONE_CLONE; 273 fclones->skb2.fclone = SKB_FCLONE_CLONE;
274 fclones->skb2.pfmemalloc = pfmemalloc;
275 } 274 }
276out: 275out:
277 return skb; 276 return skb;
@@ -655,7 +654,7 @@ static void skb_release_head_state(struct sk_buff *skb)
655 skb->destructor(skb); 654 skb->destructor(skb);
656 } 655 }
657#if IS_ENABLED(CONFIG_NF_CONNTRACK) 656#if IS_ENABLED(CONFIG_NF_CONNTRACK)
658 nf_conntrack_put(skb->nfct); 657 nf_conntrack_put(skb_nfct(skb));
659#endif 658#endif
660#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 659#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
661 nf_bridge_put(skb->nf_bridge); 660 nf_bridge_put(skb->nf_bridge);
@@ -878,9 +877,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
878#endif 877#endif
879#ifdef CONFIG_NET_SCHED 878#ifdef CONFIG_NET_SCHED
880 CHECK_SKB_FIELD(tc_index); 879 CHECK_SKB_FIELD(tc_index);
881#ifdef CONFIG_NET_CLS_ACT
882 CHECK_SKB_FIELD(tc_verd);
883#endif
884#endif 880#endif
885 881
886} 882}
@@ -1195,10 +1191,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
1195int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, 1191int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
1196 gfp_t gfp_mask) 1192 gfp_t gfp_mask)
1197{ 1193{
1198 int i; 1194 int i, osize = skb_end_offset(skb);
1199 u8 *data; 1195 int size = osize + nhead + ntail;
1200 int size = nhead + skb_end_offset(skb) + ntail;
1201 long off; 1196 long off;
1197 u8 *data;
1202 1198
1203 BUG_ON(nhead < 0); 1199 BUG_ON(nhead < 0);
1204 1200
@@ -1260,6 +1256,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
1260 skb->hdr_len = 0; 1256 skb->hdr_len = 0;
1261 skb->nohdr = 0; 1257 skb->nohdr = 0;
1262 atomic_set(&skb_shinfo(skb)->dataref, 1); 1258 atomic_set(&skb_shinfo(skb)->dataref, 1);
1259
1260 /* It is not generally safe to change skb->truesize.
 1261 * For the moment, we only really care about the rx path, or
 1262 * the case where the skb is orphaned (not attached to a socket).
1263 */
1264 if (!skb->sk || skb->destructor == sock_edemux)
1265 skb->truesize += size - osize;
1266
1263 return 0; 1267 return 0;
1264 1268
1265nofrags: 1269nofrags:
@@ -3078,22 +3082,32 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
3078 if (sg && csum && (mss != GSO_BY_FRAGS)) { 3082 if (sg && csum && (mss != GSO_BY_FRAGS)) {
3079 if (!(features & NETIF_F_GSO_PARTIAL)) { 3083 if (!(features & NETIF_F_GSO_PARTIAL)) {
3080 struct sk_buff *iter; 3084 struct sk_buff *iter;
3085 unsigned int frag_len;
3081 3086
3082 if (!list_skb || 3087 if (!list_skb ||
3083 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) 3088 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
3084 goto normal; 3089 goto normal;
3085 3090
3086 /* Split the buffer at the frag_list pointer. 3091 /* If we get here then all the required
3087 * This is based on the assumption that all 3092 * GSO features except frag_list are supported.
3088 * buffers in the chain excluding the last 3093 * Try to split the SKB to multiple GSO SKBs
3089 * containing the same amount of data. 3094 * with no frag_list.
3095 * Currently we can do that only when the buffers don't
3096 * have a linear part and all the buffers except
3097 * the last are of the same length.
3090 */ 3098 */
3099 frag_len = list_skb->len;
3091 skb_walk_frags(head_skb, iter) { 3100 skb_walk_frags(head_skb, iter) {
3101 if (frag_len != iter->len && iter->next)
3102 goto normal;
3092 if (skb_headlen(iter)) 3103 if (skb_headlen(iter))
3093 goto normal; 3104 goto normal;
3094 3105
3095 len -= iter->len; 3106 len -= iter->len;
3096 } 3107 }
3108
3109 if (len != frag_len)
3110 goto normal;
3097 } 3111 }
3098 3112
3099 /* GSO partial only requires that we trim off any excess that 3113 /* GSO partial only requires that we trim off any excess that
@@ -3690,6 +3704,15 @@ static void sock_rmem_free(struct sk_buff *skb)
3690 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 3704 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
3691} 3705}
3692 3706
3707static void skb_set_err_queue(struct sk_buff *skb)
3708{
3709 /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
3710 * So, it is safe to (mis)use it to mark skbs on the error queue.
3711 */
3712 skb->pkt_type = PACKET_OUTGOING;
3713 BUILD_BUG_ON(PACKET_OUTGOING == 0);
3714}
3715
3693/* 3716/*
3694 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 3717 * Note: We dont mem charge error packets (no sk_forward_alloc changes)
3695 */ 3718 */
@@ -3703,6 +3726,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
3703 skb->sk = sk; 3726 skb->sk = sk;
3704 skb->destructor = sock_rmem_free; 3727 skb->destructor = sock_rmem_free;
3705 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 3728 atomic_add(skb->truesize, &sk->sk_rmem_alloc);
3729 skb_set_err_queue(skb);
3706 3730
3707 /* before exiting rcu section, make sure dst is refcounted */ 3731 /* before exiting rcu section, make sure dst is refcounted */
3708 skb_dst_force(skb); 3732 skb_dst_force(skb);
@@ -3779,16 +3803,21 @@ EXPORT_SYMBOL(skb_clone_sk);
3779 3803
3780static void __skb_complete_tx_timestamp(struct sk_buff *skb, 3804static void __skb_complete_tx_timestamp(struct sk_buff *skb,
3781 struct sock *sk, 3805 struct sock *sk,
3782 int tstype) 3806 int tstype,
3807 bool opt_stats)
3783{ 3808{
3784 struct sock_exterr_skb *serr; 3809 struct sock_exterr_skb *serr;
3785 int err; 3810 int err;
3786 3811
3812 BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
3813
3787 serr = SKB_EXT_ERR(skb); 3814 serr = SKB_EXT_ERR(skb);
3788 memset(serr, 0, sizeof(*serr)); 3815 memset(serr, 0, sizeof(*serr));
3789 serr->ee.ee_errno = ENOMSG; 3816 serr->ee.ee_errno = ENOMSG;
3790 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 3817 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
3791 serr->ee.ee_info = tstype; 3818 serr->ee.ee_info = tstype;
3819 serr->opt_stats = opt_stats;
3820 serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
3792 if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { 3821 if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
3793 serr->ee.ee_data = skb_shinfo(skb)->tskey; 3822 serr->ee.ee_data = skb_shinfo(skb)->tskey;
3794 if (sk->sk_protocol == IPPROTO_TCP && 3823 if (sk->sk_protocol == IPPROTO_TCP &&
@@ -3824,13 +3853,14 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
3824 if (!skb_may_tx_timestamp(sk, false)) 3853 if (!skb_may_tx_timestamp(sk, false))
3825 return; 3854 return;
3826 3855
3827 /* take a reference to prevent skb_orphan() from freeing the socket */ 3856 /* Take a reference to prevent skb_orphan() from freeing the socket,
3828 sock_hold(sk); 3857 * but only if the socket refcount is not zero.
3829 3858 */
3830 *skb_hwtstamps(skb) = *hwtstamps; 3859 if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
3831 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); 3860 *skb_hwtstamps(skb) = *hwtstamps;
3832 3861 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
3833 sock_put(sk); 3862 sock_put(sk);
3863 }
3834} 3864}
3835EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); 3865EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
3836 3866
@@ -3839,7 +3869,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
3839 struct sock *sk, int tstype) 3869 struct sock *sk, int tstype)
3840{ 3870{
3841 struct sk_buff *skb; 3871 struct sk_buff *skb;
3842 bool tsonly; 3872 bool tsonly, opt_stats = false;
3843 3873
3844 if (!sk) 3874 if (!sk)
3845 return; 3875 return;
@@ -3852,9 +3882,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
3852#ifdef CONFIG_INET 3882#ifdef CONFIG_INET
3853 if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && 3883 if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
3854 sk->sk_protocol == IPPROTO_TCP && 3884 sk->sk_protocol == IPPROTO_TCP &&
3855 sk->sk_type == SOCK_STREAM) 3885 sk->sk_type == SOCK_STREAM) {
3856 skb = tcp_get_timestamping_opt_stats(sk); 3886 skb = tcp_get_timestamping_opt_stats(sk);
3857 else 3887 opt_stats = true;
3888 } else
3858#endif 3889#endif
3859 skb = alloc_skb(0, GFP_ATOMIC); 3890 skb = alloc_skb(0, GFP_ATOMIC);
3860 } else { 3891 } else {
@@ -3873,7 +3904,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
3873 else 3904 else
3874 skb->tstamp = ktime_get_real(); 3905 skb->tstamp = ktime_get_real();
3875 3906
3876 __skb_complete_tx_timestamp(skb, sk, tstype); 3907 __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
3877} 3908}
3878EXPORT_SYMBOL_GPL(__skb_tstamp_tx); 3909EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
3879 3910
@@ -3889,7 +3920,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
3889{ 3920{
3890 struct sock *sk = skb->sk; 3921 struct sock *sk = skb->sk;
3891 struct sock_exterr_skb *serr; 3922 struct sock_exterr_skb *serr;
3892 int err; 3923 int err = 1;
3893 3924
3894 skb->wifi_acked_valid = 1; 3925 skb->wifi_acked_valid = 1;
3895 skb->wifi_acked = acked; 3926 skb->wifi_acked = acked;
@@ -3899,14 +3930,15 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
3899 serr->ee.ee_errno = ENOMSG; 3930 serr->ee.ee_errno = ENOMSG;
3900 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 3931 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
3901 3932
3902 /* take a reference to prevent skb_orphan() from freeing the socket */ 3933 /* Take a reference to prevent skb_orphan() from freeing the socket,
3903 sock_hold(sk); 3934 * but only if the socket refcount is not zero.
3904 3935 */
3905 err = sock_queue_err_skb(sk, skb); 3936 if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
3937 err = sock_queue_err_skb(sk, skb);
3938 sock_put(sk);
3939 }
3906 if (err) 3940 if (err)
3907 kfree_skb(skb); 3941 kfree_skb(skb);
3908
3909 sock_put(sk);
3910} 3942}
3911EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 3943EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
3912 3944
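Both timestamp-completion hunks above switch from an unconditional sock_hold() to atomic_inc_not_zero() on sk_refcnt, so a socket whose refcount has already dropped to zero is never revived. A minimal sketch of the pattern; example_complete() is a hypothetical caller, only the refcount idiom mirrors the diff.

#include <net/sock.h>

/* Only touch the socket if a reference can still be taken: a plain
 * sock_hold() on a zero refcount would revive a socket that is already
 * being destroyed.
 */
static void example_complete(struct sock *sk, struct sk_buff *skb)
{
	int err = 1;	/* treat "socket already gone" as failure */

	if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
		err = sock_queue_err_skb(sk, skb);
		sock_put(sk);
	}
	if (err)
		kfree_skb(skb);
}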
diff --git a/net/core/sock.c b/net/core/sock.c
index 4eca27dc5c94..2c4f574168fb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -197,66 +197,55 @@ EXPORT_SYMBOL(sk_net_capable);
197 197
198/* 198/*
199 * Each address family might have different locking rules, so we have 199 * Each address family might have different locking rules, so we have
200 * one slock key per address family: 200 * one slock key per address family and separate keys for internal and
201 * userspace sockets.
201 */ 202 */
202static struct lock_class_key af_family_keys[AF_MAX]; 203static struct lock_class_key af_family_keys[AF_MAX];
204static struct lock_class_key af_family_kern_keys[AF_MAX];
203static struct lock_class_key af_family_slock_keys[AF_MAX]; 205static struct lock_class_key af_family_slock_keys[AF_MAX];
206static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204 207
205/* 208/*
206 * Make lock validator output more readable. (we pre-construct these 209 * Make lock validator output more readable. (we pre-construct these
207 * strings build-time, so that runtime initialization of socket 210 * strings build-time, so that runtime initialization of socket
208 * locks is fast): 211 * locks is fast):
209 */ 212 */
213
214#define _sock_locks(x) \
215 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
216 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
217 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
218 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
219 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
220 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
221 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
222 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
223 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
224 x "27" , x "28" , x "AF_CAN" , \
225 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
226 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
227 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
228 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
229 x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX"
230
210static const char *const af_family_key_strings[AF_MAX+1] = { 231static const char *const af_family_key_strings[AF_MAX+1] = {
211 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , 232 _sock_locks("sk_lock-")
212 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
213 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
214 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
215 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
216 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
217 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
218 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
219 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
220 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
221 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
222 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
223 "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
224 "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
225 "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
226}; 233};
227static const char *const af_family_slock_key_strings[AF_MAX+1] = { 234static const char *const af_family_slock_key_strings[AF_MAX+1] = {
228 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , 235 _sock_locks("slock-")
229 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
230 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
231 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
232 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
233 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
234 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
235 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
236 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
237 "slock-27" , "slock-28" , "slock-AF_CAN" ,
238 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
239 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
240 "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
241 "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
242 "slock-AF_QIPCRTR", "slock-AF_MAX"
243}; 236};
244static const char *const af_family_clock_key_strings[AF_MAX+1] = { 237static const char *const af_family_clock_key_strings[AF_MAX+1] = {
245 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , 238 _sock_locks("clock-")
246 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", 239};
247 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , 240
248 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" , 241static const char *const af_family_kern_key_strings[AF_MAX+1] = {
249 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , 242 _sock_locks("k-sk_lock-")
250 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , 243};
251 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , 244static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
252 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" , 245 _sock_locks("k-slock-")
253 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , 246};
254 "clock-27" , "clock-28" , "clock-AF_CAN" , 247static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
255 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , 248 _sock_locks("k-clock-")
256 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
257 "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
258 "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
259 "clock-AF_QIPCRTR", "clock-AF_MAX"
260}; 249};
261 250
262/* 251/*
@@ -264,6 +253,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
264 * so split the lock classes by using a per-AF key: 253 * so split the lock classes by using a per-AF key:
265 */ 254 */
266static struct lock_class_key af_callback_keys[AF_MAX]; 255static struct lock_class_key af_callback_keys[AF_MAX];
256static struct lock_class_key af_kern_callback_keys[AF_MAX];
267 257
268/* Take into consideration the size of the struct sk_buff overhead in the 258/* Take into consideration the size of the struct sk_buff overhead in the
269 * determination of these values, since that is non-constant across 259 * determination of these values, since that is non-constant across
@@ -367,7 +357,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
367 if (tv.tv_sec == 0 && tv.tv_usec == 0) 357 if (tv.tv_sec == 0 && tv.tv_usec == 0)
368 return 0; 358 return 0;
369 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) 359 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
370 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); 360 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
371 return 0; 361 return 0;
372} 362}
373 363
@@ -502,6 +492,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
502 492
503 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 493 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
504 sk_tx_queue_clear(sk); 494 sk_tx_queue_clear(sk);
495 sk->sk_dst_pending_confirm = 0;
505 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 496 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
506 dst_release(dst); 497 dst_release(dst);
507 return NULL; 498 return NULL;
@@ -762,11 +753,8 @@ set_rcvbuf:
762 goto set_rcvbuf; 753 goto set_rcvbuf;
763 754
764 case SO_KEEPALIVE: 755 case SO_KEEPALIVE:
765#ifdef CONFIG_INET 756 if (sk->sk_prot->keepalive)
766 if (sk->sk_protocol == IPPROTO_TCP && 757 sk->sk_prot->keepalive(sk, valbool);
767 sk->sk_type == SOCK_STREAM)
768 tcp_set_keepalive(sk, valbool);
769#endif
770 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 758 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
771 break; 759 break;
772 760
@@ -1148,7 +1136,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
1148 v.tm.tv_usec = 0; 1136 v.tm.tv_usec = 0;
1149 } else { 1137 } else {
1150 v.tm.tv_sec = sk->sk_rcvtimeo / HZ; 1138 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1151 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; 1139 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1152 } 1140 }
1153 break; 1141 break;
1154 1142
@@ -1159,7 +1147,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
1159 v.tm.tv_usec = 0; 1147 v.tm.tv_usec = 0;
1160 } else { 1148 } else {
1161 v.tm.tv_sec = sk->sk_sndtimeo / HZ; 1149 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1162 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; 1150 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1163 } 1151 }
1164 break; 1152 break;
1165 1153
@@ -1295,7 +1283,16 @@ lenout:
1295 */ 1283 */
1296static inline void sock_lock_init(struct sock *sk) 1284static inline void sock_lock_init(struct sock *sk)
1297{ 1285{
1298 sock_lock_init_class_and_name(sk, 1286 if (sk->sk_kern_sock)
1287 sock_lock_init_class_and_name(
1288 sk,
1289 af_family_kern_slock_key_strings[sk->sk_family],
1290 af_family_kern_slock_keys + sk->sk_family,
1291 af_family_kern_key_strings[sk->sk_family],
1292 af_family_kern_keys + sk->sk_family);
1293 else
1294 sock_lock_init_class_and_name(
1295 sk,
1299 af_family_slock_key_strings[sk->sk_family], 1296 af_family_slock_key_strings[sk->sk_family],
1300 af_family_slock_keys + sk->sk_family, 1297 af_family_slock_keys + sk->sk_family,
1301 af_family_key_strings[sk->sk_family], 1298 af_family_key_strings[sk->sk_family],
@@ -1401,6 +1398,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1401 * why we need sk_prot_creator -acme 1398 * why we need sk_prot_creator -acme
1402 */ 1399 */
1403 sk->sk_prot = sk->sk_prot_creator = prot; 1400 sk->sk_prot = sk->sk_prot_creator = prot;
1401 sk->sk_kern_sock = kern;
1404 sock_lock_init(sk); 1402 sock_lock_init(sk);
1405 sk->sk_net_refcnt = kern ? 0 : 1; 1403 sk->sk_net_refcnt = kern ? 0 : 1;
1406 if (likely(sk->sk_net_refcnt)) 1404 if (likely(sk->sk_net_refcnt))
@@ -1444,6 +1442,11 @@ static void __sk_destruct(struct rcu_head *head)
1444 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1442 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1445 __func__, atomic_read(&sk->sk_omem_alloc)); 1443 __func__, atomic_read(&sk->sk_omem_alloc));
1446 1444
1445 if (sk->sk_frag.page) {
1446 put_page(sk->sk_frag.page);
1447 sk->sk_frag.page = NULL;
1448 }
1449
1447 if (sk->sk_peer_cred) 1450 if (sk->sk_peer_cred)
1448 put_cred(sk->sk_peer_cred); 1451 put_cred(sk->sk_peer_cred);
1449 put_pid(sk->sk_peer_pid); 1452 put_pid(sk->sk_peer_pid);
@@ -1522,6 +1525,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1522 af_family_clock_key_strings[newsk->sk_family]); 1525 af_family_clock_key_strings[newsk->sk_family]);
1523 1526
1524 newsk->sk_dst_cache = NULL; 1527 newsk->sk_dst_cache = NULL;
1528 newsk->sk_dst_pending_confirm = 0;
1525 newsk->sk_wmem_queued = 0; 1529 newsk->sk_wmem_queued = 0;
1526 newsk->sk_forward_alloc = 0; 1530 newsk->sk_forward_alloc = 0;
1527 atomic_set(&newsk->sk_drops, 0); 1531 atomic_set(&newsk->sk_drops, 0);
@@ -1540,11 +1544,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1540 is_charged = sk_filter_charge(newsk, filter); 1544 is_charged = sk_filter_charge(newsk, filter);
1541 1545
1542 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1546 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1543 /* It is still raw copy of parent, so invalidate 1547 /* We need to make sure that we don't uncharge the new
1544 * destructor and make plain sk_free() */ 1548 * socket if we couldn't charge it in the first place
1545 newsk->sk_destruct = NULL; 1549 * as otherwise we uncharge the parent's filter.
1546 bh_unlock_sock(newsk); 1550 */
1547 sk_free(newsk); 1551 if (!is_charged)
1552 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1553 sk_free_unlock_clone(newsk);
1548 newsk = NULL; 1554 newsk = NULL;
1549 goto out; 1555 goto out;
1550 } 1556 }
@@ -1593,6 +1599,16 @@ out:
1593} 1599}
1594EXPORT_SYMBOL_GPL(sk_clone_lock); 1600EXPORT_SYMBOL_GPL(sk_clone_lock);
1595 1601
1602void sk_free_unlock_clone(struct sock *sk)
1603{
1604 /* It is still a raw copy of the parent, so invalidate the
1605 * destructor and do a plain sk_free() */
1606 sk->sk_destruct = NULL;
1607 bh_unlock_sock(sk);
1608 sk_free(sk);
1609}
1610EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1611
1596void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 1612void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1597{ 1613{
1598 u32 max_segs = 1; 1614 u32 max_segs = 1;
@@ -2272,7 +2288,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2272} 2288}
2273EXPORT_SYMBOL(sock_no_socketpair); 2289EXPORT_SYMBOL(sock_no_socketpair);
2274 2290
2275int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) 2291int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2292 bool kern)
2276{ 2293{
2277 return -EOPNOTSUPP; 2294 return -EOPNOTSUPP;
2278} 2295}
@@ -2476,7 +2493,14 @@ void sock_init_data(struct socket *sock, struct sock *sk)
2476 } 2493 }
2477 2494
2478 rwlock_init(&sk->sk_callback_lock); 2495 rwlock_init(&sk->sk_callback_lock);
2479 lockdep_set_class_and_name(&sk->sk_callback_lock, 2496 if (sk->sk_kern_sock)
2497 lockdep_set_class_and_name(
2498 &sk->sk_callback_lock,
2499 af_kern_callback_keys + sk->sk_family,
2500 af_family_kern_clock_key_strings[sk->sk_family]);
2501 else
2502 lockdep_set_class_and_name(
2503 &sk->sk_callback_lock,
2480 af_callback_keys + sk->sk_family, 2504 af_callback_keys + sk->sk_family,
2481 af_family_clock_key_strings[sk->sk_family]); 2505 af_family_clock_key_strings[sk->sk_family]);
2482 2506
@@ -2774,11 +2798,6 @@ void sk_common_release(struct sock *sk)
2774 2798
2775 sk_refcnt_debug_release(sk); 2799 sk_refcnt_debug_release(sk);
2776 2800
2777 if (sk->sk_frag.page) {
2778 put_page(sk->sk_frag.page);
2779 sk->sk_frag.page = NULL;
2780 }
2781
2782 sock_put(sk); 2801 sock_put(sk);
2783} 2802}
2784EXPORT_SYMBOL(sk_common_release); 2803EXPORT_SYMBOL(sk_common_release);
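The _sock_locks() macro and the new k-* key arrays above give kernel-internal sockets their own lockdep classes, selected at init time by sk_kern_sock. A minimal sketch of the underlying idiom, using a hypothetical lock and placeholder names:

#include <linux/lockdep.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* One lockdep key (and readable name) per class of lock user, so kernel
 * and userspace sockets are validated independently.
 */
static struct lock_class_key example_user_key;
static struct lock_class_key example_kern_key;

static void example_lock_init(spinlock_t *lock, bool kern)
{
	spin_lock_init(lock);
	if (kern)
		lockdep_set_class_and_name(lock, &example_kern_key,
					   "k-example-lock");
	else
		lockdep_set_class_and_name(lock, &example_user_key,
					   "example-lock");
}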
diff --git a/net/core/stream.c b/net/core/stream.c
index f575bcf64af2..20231dbb1da0 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -13,6 +13,7 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/sched/signal.h>
16#include <linux/net.h> 17#include <linux/net.h>
17#include <linux/signal.h> 18#include <linux/signal.h>
18#include <linux/tcp.h> 19#include <linux/tcp.h>
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e4009f62..7f9cc400eca0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
222} 222}
223#endif 223#endif
224 224
225static int proc_do_dev_weight(struct ctl_table *table, int write,
226 void __user *buffer, size_t *lenp, loff_t *ppos)
227{
228 int ret;
229
230 ret = proc_dointvec(table, write, buffer, lenp, ppos);
231 if (ret != 0)
232 return ret;
233
234 dev_rx_weight = weight_p * dev_weight_rx_bias;
235 dev_tx_weight = weight_p * dev_weight_tx_bias;
236
237 return ret;
238}
239
225static int proc_do_rss_key(struct ctl_table *table, int write, 240static int proc_do_rss_key(struct ctl_table *table, int write,
226 void __user *buffer, size_t *lenp, loff_t *ppos) 241 void __user *buffer, size_t *lenp, loff_t *ppos)
227{ 242{
@@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = {
273 .data = &weight_p, 288 .data = &weight_p,
274 .maxlen = sizeof(int), 289 .maxlen = sizeof(int),
275 .mode = 0644, 290 .mode = 0644,
276 .proc_handler = proc_dointvec 291 .proc_handler = proc_do_dev_weight,
292 },
293 {
294 .procname = "dev_weight_rx_bias",
295 .data = &dev_weight_rx_bias,
296 .maxlen = sizeof(int),
297 .mode = 0644,
298 .proc_handler = proc_do_dev_weight,
299 },
300 {
301 .procname = "dev_weight_tx_bias",
302 .data = &dev_weight_tx_bias,
303 .maxlen = sizeof(int),
304 .mode = 0644,
305 .proc_handler = proc_do_dev_weight,
277 }, 306 },
278 { 307 {
279 .procname = "netdev_max_backlog", 308 .procname = "netdev_max_backlog",
@@ -305,6 +334,13 @@ static struct ctl_table net_core_table[] = {
305 .mode = 0600, 334 .mode = 0600,
306 .proc_handler = proc_dointvec, 335 .proc_handler = proc_dointvec,
307 }, 336 },
337 {
338 .procname = "bpf_jit_kallsyms",
339 .data = &bpf_jit_kallsyms,
340 .maxlen = sizeof(int),
341 .mode = 0600,
342 .proc_handler = proc_dointvec,
343 },
308# endif 344# endif
309#endif 345#endif
310 { 346 {
@@ -372,14 +408,16 @@ static struct ctl_table net_core_table[] = {
372 .data = &sysctl_net_busy_poll, 408 .data = &sysctl_net_busy_poll,
373 .maxlen = sizeof(unsigned int), 409 .maxlen = sizeof(unsigned int),
374 .mode = 0644, 410 .mode = 0644,
375 .proc_handler = proc_dointvec 411 .proc_handler = proc_dointvec_minmax,
412 .extra1 = &zero,
376 }, 413 },
377 { 414 {
378 .procname = "busy_read", 415 .procname = "busy_read",
379 .data = &sysctl_net_busy_read, 416 .data = &sysctl_net_busy_read,
380 .maxlen = sizeof(unsigned int), 417 .maxlen = sizeof(unsigned int),
381 .mode = 0644, 418 .mode = 0644,
382 .proc_handler = proc_dointvec 419 .proc_handler = proc_dointvec_minmax,
420 .extra1 = &zero,
383 }, 421 },
384#endif 422#endif
385#ifdef CONFIG_NET_SCHED 423#ifdef CONFIG_NET_SCHED
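proc_do_dev_weight() above shows the usual shape for a sysctl handler that refreshes derived values (dev_rx_weight/dev_tx_weight) after one of the knobs changes. A minimal sketch of that shape with hypothetical variables and table entries; only the proc_dointvec()-then-recompute structure mirrors the patch.

#include <linux/sysctl.h>

static int example_weight = 64;
static int example_bias = 1;
static int example_effective_weight = 64;

/* Run the stock integer handler first, then refresh the derived value. */
static int example_do_weight(struct ctl_table *table, int write,
			     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (ret == 0)
		example_effective_weight = example_weight * example_bias;
	return ret;
}

static struct ctl_table example_table[] = {
	{
		.procname	= "example_weight",
		.data		= &example_weight,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= example_do_weight,
	},
	{ }
};

Registering both knobs with the same handler, as the patch does, keeps the derived value consistent no matter which entry userspace writes.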
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index f053198e730c..5e3a7302f774 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -749,6 +749,7 @@ static void ccid2_hc_tx_exit(struct sock *sk)
749 for (i = 0; i < hc->tx_seqbufc; i++) 749 for (i = 0; i < hc->tx_seqbufc; i++)
750 kfree(hc->tx_seqbuf[i]); 750 kfree(hc->tx_seqbuf[i]);
751 hc->tx_seqbufc = 0; 751 hc->tx_seqbufc = 0;
752 dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
752} 753}
753 754
754static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) 755static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
diff --git a/net/dccp/input.c b/net/dccp/input.c
index ba347184bda9..4a05d7876850 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -577,6 +577,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
577 struct dccp_sock *dp = dccp_sk(sk); 577 struct dccp_sock *dp = dccp_sk(sk);
578 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 578 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
579 const int old_state = sk->sk_state; 579 const int old_state = sk->sk_state;
580 bool acceptable;
580 int queued = 0; 581 int queued = 0;
581 582
582 /* 583 /*
@@ -603,10 +604,16 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
603 */ 604 */
604 if (sk->sk_state == DCCP_LISTEN) { 605 if (sk->sk_state == DCCP_LISTEN) {
605 if (dh->dccph_type == DCCP_PKT_REQUEST) { 606 if (dh->dccph_type == DCCP_PKT_REQUEST) {
606 if (inet_csk(sk)->icsk_af_ops->conn_request(sk, 607 /* It is possible that we process SYN packets from backlog,
607 skb) < 0) 608 * so we need to make sure to disable BH right there.
609 */
610 local_bh_disable();
611 acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0;
612 local_bh_enable();
613 if (!acceptable)
608 return 1; 614 return 1;
609 goto discard; 615 consume_skb(skb);
616 return 0;
610 } 617 }
611 if (dh->dccph_type == DCCP_PKT_RESET) 618 if (dh->dccph_type == DCCP_PKT_RESET)
612 goto discard; 619 goto discard;
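The dccp_rcv_state_process() hunk above wraps the listener's conn_request call in local_bh_disable()/local_bh_enable() because the same code can now be reached from process context via the socket backlog. A minimal sketch of that guard; example_process_request() and its callback are hypothetical.

#include <linux/bottom_half.h>
#include <net/sock.h>

/* Wrap a handler that normally runs in softirq context so it also
 * behaves when reached from process context (socket backlog processing).
 */
static bool example_process_request(struct sock *sk, struct sk_buff *skb,
				    int (*example_handler)(struct sock *,
							   struct sk_buff *))
{
	bool acceptable;

	local_bh_disable();
	acceptable = example_handler(sk, skb) >= 0;
	local_bh_enable();

	return acceptable;
}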
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index d859a5c36e70..b99168b0fabf 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -289,7 +289,8 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
289 289
290 switch (type) { 290 switch (type) {
291 case ICMP_REDIRECT: 291 case ICMP_REDIRECT:
292 dccp_do_redirect(skb, sk); 292 if (!sock_owned_by_user(sk))
293 dccp_do_redirect(skb, sk);
293 goto out; 294 goto out;
294 case ICMP_SOURCE_QUENCH: 295 case ICMP_SOURCE_QUENCH:
295 /* Just silently ignore these. */ 296 /* Just silently ignore these. */
@@ -904,7 +905,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
904 .getsockopt = ip_getsockopt, 905 .getsockopt = ip_getsockopt,
905 .addr2sockaddr = inet_csk_addr2sockaddr, 906 .addr2sockaddr = inet_csk_addr2sockaddr,
906 .sockaddr_len = sizeof(struct sockaddr_in), 907 .sockaddr_len = sizeof(struct sockaddr_in),
907 .bind_conflict = inet_csk_bind_conflict,
908#ifdef CONFIG_COMPAT 908#ifdef CONFIG_COMPAT
909 .compat_setsockopt = compat_ip_setsockopt, 909 .compat_setsockopt = compat_ip_setsockopt,
910 .compat_getsockopt = compat_ip_getsockopt, 910 .compat_getsockopt = compat_ip_getsockopt,
@@ -1018,9 +1018,15 @@ static void __net_exit dccp_v4_exit_net(struct net *net)
1018 inet_ctl_sock_destroy(net->dccp.v4_ctl_sk); 1018 inet_ctl_sock_destroy(net->dccp.v4_ctl_sk);
1019} 1019}
1020 1020
1021static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
1022{
1023 inet_twsk_purge(&dccp_hashinfo, AF_INET);
1024}
1025
1021static struct pernet_operations dccp_v4_ops = { 1026static struct pernet_operations dccp_v4_ops = {
1022 .init = dccp_v4_init_net, 1027 .init = dccp_v4_init_net,
1023 .exit = dccp_v4_exit_net, 1028 .exit = dccp_v4_exit_net,
1029 .exit_batch = dccp_v4_exit_batch,
1024}; 1030};
1025 1031
1026static int __init dccp_v4_init(void) 1032static int __init dccp_v4_init(void)
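dccp_v4_exit_batch() above hooks the pernet exit_batch callback so timewait sockets are purged when namespaces go away. A minimal sketch of the exit_batch shape; the example_* names and the pr_debug body are placeholders, only the hook signature mirrors the diff.

#include <linux/printk.h>
#include <net/net_namespace.h>

/* exit_batch runs once per batch of dying namespaces, which is where
 * protocol-wide cleanup such as purging timewait sockets belongs.
 */
static void __net_exit example_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	list_for_each_entry(net, net_exit_list, exit_list)
		pr_debug("cleaning up netns %p\n", net);
}

static struct pernet_operations example_net_ops = {
	.exit_batch = example_exit_batch,
};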
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c4e879c02186..d9b6a4e403e7 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -122,10 +122,12 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
122 np = inet6_sk(sk); 122 np = inet6_sk(sk);
123 123
124 if (type == NDISC_REDIRECT) { 124 if (type == NDISC_REDIRECT) {
125 struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); 125 if (!sock_owned_by_user(sk)) {
126 struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
126 127
127 if (dst) 128 if (dst)
128 dst->ops->redirect(dst, sk, skb); 129 dst->ops->redirect(dst, sk, skb);
130 }
129 goto out; 131 goto out;
130 } 132 }
131 133
@@ -937,7 +939,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
937 .getsockopt = ipv6_getsockopt, 939 .getsockopt = ipv6_getsockopt,
938 .addr2sockaddr = inet6_csk_addr2sockaddr, 940 .addr2sockaddr = inet6_csk_addr2sockaddr,
939 .sockaddr_len = sizeof(struct sockaddr_in6), 941 .sockaddr_len = sizeof(struct sockaddr_in6),
940 .bind_conflict = inet6_csk_bind_conflict,
941#ifdef CONFIG_COMPAT 942#ifdef CONFIG_COMPAT
942 .compat_setsockopt = compat_ipv6_setsockopt, 943 .compat_setsockopt = compat_ipv6_setsockopt,
943 .compat_getsockopt = compat_ipv6_getsockopt, 944 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -958,7 +959,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
958 .getsockopt = ipv6_getsockopt, 959 .getsockopt = ipv6_getsockopt,
959 .addr2sockaddr = inet6_csk_addr2sockaddr, 960 .addr2sockaddr = inet6_csk_addr2sockaddr,
960 .sockaddr_len = sizeof(struct sockaddr_in6), 961 .sockaddr_len = sizeof(struct sockaddr_in6),
961 .bind_conflict = inet6_csk_bind_conflict,
962#ifdef CONFIG_COMPAT 962#ifdef CONFIG_COMPAT
963 .compat_setsockopt = compat_ipv6_setsockopt, 963 .compat_setsockopt = compat_ipv6_setsockopt,
964 .compat_getsockopt = compat_ipv6_getsockopt, 964 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -1077,9 +1077,15 @@ static void __net_exit dccp_v6_exit_net(struct net *net)
1077 inet_ctl_sock_destroy(net->dccp.v6_ctl_sk); 1077 inet_ctl_sock_destroy(net->dccp.v6_ctl_sk);
1078} 1078}
1079 1079
1080static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list)
1081{
1082 inet_twsk_purge(&dccp_hashinfo, AF_INET6);
1083}
1084
1080static struct pernet_operations dccp_v6_ops = { 1085static struct pernet_operations dccp_v6_ops = {
1081 .init = dccp_v6_init_net, 1086 .init = dccp_v6_init_net,
1082 .exit = dccp_v6_exit_net, 1087 .exit = dccp_v6_exit_net,
1088 .exit_batch = dccp_v6_exit_batch,
1083}; 1089};
1084 1090
1085static int __init dccp_v6_init(void) 1091static int __init dccp_v6_init(void)
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 53eddf99e4f6..abd07a443219 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -119,10 +119,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk,
119 * Activate features: initialise CCIDs, sequence windows etc. 119 * Activate features: initialise CCIDs, sequence windows etc.
120 */ 120 */
121 if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { 121 if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
122 /* It is still raw copy of parent, so invalidate 122 sk_free_unlock_clone(newsk);
123 * destructor and make plain sk_free() */
124 newsk->sk_destruct = NULL;
125 sk_free(newsk);
126 return NULL; 123 return NULL;
127 } 124 }
128 dccp_init_xmit_timers(newsk); 125 dccp_init_xmit_timers(newsk);
@@ -145,6 +142,13 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
145 struct dccp_request_sock *dreq = dccp_rsk(req); 142 struct dccp_request_sock *dreq = dccp_rsk(req);
146 bool own_req; 143 bool own_req;
147 144
145 /* TCP/DCCP listeners became lockless.
146 * DCCP stores complex state in its request_sock, so we need
147 * to protect it ourselves, now that this code runs without being
148 * protected by the parent (listener) lock.
149 */
150 spin_lock_bh(&dreq->dreq_lock);
151
148 /* Check for retransmitted REQUEST */ 152 /* Check for retransmitted REQUEST */
149 if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { 153 if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
150 154
@@ -159,7 +163,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
159 inet_rtx_syn_ack(sk, req); 163 inet_rtx_syn_ack(sk, req);
160 } 164 }
161 /* Network Duplicate, discard packet */ 165 /* Network Duplicate, discard packet */
162 return NULL; 166 goto out;
163 } 167 }
164 168
165 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; 169 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
@@ -185,20 +189,20 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
185 189
186 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, 190 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
187 req, &own_req); 191 req, &own_req);
188 if (!child) 192 if (child) {
189 goto listen_overflow; 193 child = inet_csk_complete_hashdance(sk, child, req, own_req);
190 194 goto out;
191 return inet_csk_complete_hashdance(sk, child, req, own_req); 195 }
192 196
193listen_overflow:
194 dccp_pr_debug("listen_overflow!\n");
195 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; 197 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
196drop: 198drop:
197 if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) 199 if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
198 req->rsk_ops->send_reset(sk, skb); 200 req->rsk_ops->send_reset(sk, skb);
199 201
200 inet_csk_reqsk_queue_drop(sk, req); 202 inet_csk_reqsk_queue_drop(sk, req);
201 return NULL; 203out:
204 spin_unlock_bh(&dreq->dreq_lock);
205 return child;
202} 206}
203 207
204EXPORT_SYMBOL_GPL(dccp_check_req); 208EXPORT_SYMBOL_GPL(dccp_check_req);
@@ -249,6 +253,7 @@ int dccp_reqsk_init(struct request_sock *req,
249{ 253{
250 struct dccp_request_sock *dreq = dccp_rsk(req); 254 struct dccp_request_sock *dreq = dccp_rsk(req);
251 255
256 spin_lock_init(&dreq->dreq_lock);
252 inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport; 257 inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
253 inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport); 258 inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport);
254 inet_rsk(req)->acked = 0; 259 inet_rsk(req)->acked = 0;
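The dreq_lock added above gives each DCCP request its own spinlock now that listeners are lockless, taken with the _bh variants because the code runs in both softirq and process context. A minimal sketch of that per-object locking pattern with a hypothetical request structure:

#include <linux/spinlock.h>

/* Each request carries its own lock; the _bh variants are used because
 * the code runs in both softirq and process context.
 */
struct example_request {
	spinlock_t lock;
	int state;
};

static void example_request_init(struct example_request *req)
{
	spin_lock_init(&req->lock);
	req->state = 0;
}

static int example_request_process(struct example_request *req)
{
	int ret;

	spin_lock_bh(&req->lock);
	ret = req->state;	/* state transitions would happen here */
	spin_unlock_bh(&req->lock);

	return ret;
}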
diff --git a/net/dccp/output.c b/net/dccp/output.c
index b66c84db0766..91a15b3c4915 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -14,6 +14,7 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/skbuff.h> 15#include <linux/skbuff.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/sched/signal.h>
17 18
18#include <net/inet_sock.h> 19#include <net/inet_sock.h>
19#include <net/sock.h> 20#include <net/sock.h>
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index a90ed67027b0..7de5b40a5d0d 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -106,7 +106,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat
106#include <linux/socket.h> 106#include <linux/socket.h>
107#include <linux/in.h> 107#include <linux/in.h>
108#include <linux/kernel.h> 108#include <linux/kernel.h>
109#include <linux/sched.h> 109#include <linux/sched/signal.h>
110#include <linux/timer.h> 110#include <linux/timer.h>
111#include <linux/string.h> 111#include <linux/string.h>
112#include <linux/sockios.h> 112#include <linux/sockios.h>
@@ -1070,7 +1070,8 @@ static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo)
1070 return skb == NULL ? ERR_PTR(err) : skb; 1070 return skb == NULL ? ERR_PTR(err) : skb;
1071} 1071}
1072 1072
1073static int dn_accept(struct socket *sock, struct socket *newsock, int flags) 1073static int dn_accept(struct socket *sock, struct socket *newsock, int flags,
1074 bool kern)
1074{ 1075{
1075 struct sock *sk = sock->sk, *newsk; 1076 struct sock *sk = sock->sk, *newsk;
1076 struct sk_buff *skb = NULL; 1077 struct sk_buff *skb = NULL;
@@ -1099,7 +1100,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
1099 1100
1100 cb = DN_SKB_CB(skb); 1101 cb = DN_SKB_CB(skb);
1101 sk->sk_ack_backlog--; 1102 sk->sk_ack_backlog--;
1102 newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0); 1103 newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern);
1103 if (newsk == NULL) { 1104 if (newsk == NULL) {
1104 release_sock(sk); 1105 release_sock(sk);
1105 kfree_skb(skb); 1106 kfree_skb(skb);
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index ecc28cff08ab..af781010753b 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -37,8 +37,10 @@
37 37
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/cred.h>
40#include <linux/dns_resolver.h> 41#include <linux/dns_resolver.h>
41#include <linux/err.h> 42#include <linux/err.h>
43
42#include <keys/dns_resolver-type.h> 44#include <keys/dns_resolver-type.h>
43#include <keys/user-type.h> 45#include <keys/user-type.h>
44 46
@@ -70,7 +72,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
70 const char *options, char **_result, time64_t *_expiry) 72 const char *options, char **_result, time64_t *_expiry)
71{ 73{
72 struct key *rkey; 74 struct key *rkey;
73 const struct user_key_payload *upayload; 75 struct user_key_payload *upayload;
74 const struct cred *saved_cred; 76 const struct cred *saved_cred;
75 size_t typelen, desclen; 77 size_t typelen, desclen;
76 char *desc, *cp; 78 char *desc, *cp;
@@ -141,7 +143,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
141 if (ret) 143 if (ret)
142 goto put; 144 goto put;
143 145
144 upayload = user_key_payload(rkey); 146 upayload = user_key_payload_locked(rkey);
145 len = upayload->datalen; 147 len = upayload->datalen;
146 148
147 ret = -ENOMEM; 149 ret = -ENOMEM;
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 96e47c539bee..9649238eef40 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -1,12 +1,13 @@
1config HAVE_NET_DSA 1config HAVE_NET_DSA
2 def_bool y 2 def_bool y
3 depends on NETDEVICES && !S390 3 depends on INET && NETDEVICES && !S390
4 4
5# Drivers must select NET_DSA and the appropriate tagging format 5# Drivers must select NET_DSA and the appropriate tagging format
6 6
7config NET_DSA 7config NET_DSA
8 tristate "Distributed Switch Architecture" 8 tristate "Distributed Switch Architecture"
9 depends on HAVE_NET_DSA && NET_SWITCHDEV 9 depends on HAVE_NET_DSA
10 select NET_SWITCHDEV
10 select PHYLIB 11 select PHYLIB
11 ---help--- 12 ---help---
12 Say Y if you want to enable support for the hardware switches supported 13 Say Y if you want to enable support for the hardware switches supported
@@ -14,17 +15,6 @@ config NET_DSA
14 15
15if NET_DSA 16if NET_DSA
16 17
17config NET_DSA_HWMON
18 bool "Distributed Switch Architecture HWMON support"
19 default y
20 depends on HWMON && !(NET_DSA=y && HWMON=m)
21 ---help---
22 Say Y if you want to expose thermal sensor data on switches supported
23 by the Distributed Switch Architecture.
24
25 Some of those switches contain thermal sensors. This data is available
26 via the hwmon sysfs interface and exposes the onboard sensors.
27
28# tagging formats 18# tagging formats
29config NET_DSA_TAG_BRCM 19config NET_DSA_TAG_BRCM
30 bool 20 bool
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index a3380ed0e0be..31d343796251 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,6 @@
1# the core 1# the core
2obj-$(CONFIG_NET_DSA) += dsa_core.o 2obj-$(CONFIG_NET_DSA) += dsa_core.o
3dsa_core-y += dsa.o slave.o dsa2.o 3dsa_core-y += dsa.o slave.o dsa2.o switch.o
4 4
5# tagging formats 5# tagging formats
6dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o 6dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 7899919cd9f0..b6d4f6a23f06 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -9,9 +9,7 @@
9 * (at your option) any later version. 9 * (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/ctype.h>
13#include <linux/device.h> 12#include <linux/device.h>
14#include <linux/hwmon.h>
15#include <linux/list.h> 13#include <linux/list.h>
16#include <linux/platform_device.h> 14#include <linux/platform_device.h>
17#include <linux/slab.h> 15#include <linux/slab.h>
@@ -27,8 +25,6 @@
27#include <linux/gpio/consumer.h> 25#include <linux/gpio/consumer.h>
28#include "dsa_priv.h" 26#include "dsa_priv.h"
29 27
30char dsa_driver_version[] = "0.1";
31
32static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, 28static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
33 struct net_device *dev) 29 struct net_device *dev)
34{ 30{
@@ -64,27 +60,27 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
64static DEFINE_MUTEX(dsa_switch_drivers_mutex); 60static DEFINE_MUTEX(dsa_switch_drivers_mutex);
65static LIST_HEAD(dsa_switch_drivers); 61static LIST_HEAD(dsa_switch_drivers);
66 62
67void register_switch_driver(struct dsa_switch_ops *ops) 63void register_switch_driver(struct dsa_switch_driver *drv)
68{ 64{
69 mutex_lock(&dsa_switch_drivers_mutex); 65 mutex_lock(&dsa_switch_drivers_mutex);
70 list_add_tail(&ops->list, &dsa_switch_drivers); 66 list_add_tail(&drv->list, &dsa_switch_drivers);
71 mutex_unlock(&dsa_switch_drivers_mutex); 67 mutex_unlock(&dsa_switch_drivers_mutex);
72} 68}
73EXPORT_SYMBOL_GPL(register_switch_driver); 69EXPORT_SYMBOL_GPL(register_switch_driver);
74 70
75void unregister_switch_driver(struct dsa_switch_ops *ops) 71void unregister_switch_driver(struct dsa_switch_driver *drv)
76{ 72{
77 mutex_lock(&dsa_switch_drivers_mutex); 73 mutex_lock(&dsa_switch_drivers_mutex);
78 list_del_init(&ops->list); 74 list_del_init(&drv->list);
79 mutex_unlock(&dsa_switch_drivers_mutex); 75 mutex_unlock(&dsa_switch_drivers_mutex);
80} 76}
81EXPORT_SYMBOL_GPL(unregister_switch_driver); 77EXPORT_SYMBOL_GPL(unregister_switch_driver);
82 78
83static struct dsa_switch_ops * 79static const struct dsa_switch_ops *
84dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, 80dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
85 const char **_name, void **priv) 81 const char **_name, void **priv)
86{ 82{
87 struct dsa_switch_ops *ret; 83 const struct dsa_switch_ops *ret;
88 struct list_head *list; 84 struct list_head *list;
89 const char *name; 85 const char *name;
90 86
@@ -93,9 +89,11 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
93 89
94 mutex_lock(&dsa_switch_drivers_mutex); 90 mutex_lock(&dsa_switch_drivers_mutex);
95 list_for_each(list, &dsa_switch_drivers) { 91 list_for_each(list, &dsa_switch_drivers) {
96 struct dsa_switch_ops *ops; 92 const struct dsa_switch_ops *ops;
93 struct dsa_switch_driver *drv;
97 94
98 ops = list_entry(list, struct dsa_switch_ops, list); 95 drv = list_entry(list, struct dsa_switch_driver, list);
96 ops = drv->ops;
99 97
100 name = ops->probe(parent, host_dev, sw_addr, priv); 98 name = ops->probe(parent, host_dev, sw_addr, priv);
101 if (name != NULL) { 99 if (name != NULL) {
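With the change above, register_switch_driver() takes a struct dsa_switch_driver that wraps a const dsa_switch_ops table plus the list linkage. A minimal sketch of how a driver could register under the new shape; example_ops is left empty and all example_* names are hypothetical.

#include <net/dsa.h>

/* Drivers keep their ops table const and hand the core a small mutable
 * wrapper carrying the list linkage.
 */
static const struct dsa_switch_ops example_ops = {
	/* .probe, .setup, ... would be filled in by a real driver */
};

static struct dsa_switch_driver example_driver = {
	.ops = &example_ops,
};

static void example_register(void)
{
	register_switch_driver(&example_driver);
}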
@@ -110,109 +108,11 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
110 return ret; 108 return ret;
111} 109}
112 110
113/* hwmon support ************************************************************/
114
115#ifdef CONFIG_NET_DSA_HWMON
116
117static ssize_t temp1_input_show(struct device *dev,
118 struct device_attribute *attr, char *buf)
119{
120 struct dsa_switch *ds = dev_get_drvdata(dev);
121 int temp, ret;
122
123 ret = ds->ops->get_temp(ds, &temp);
124 if (ret < 0)
125 return ret;
126
127 return sprintf(buf, "%d\n", temp * 1000);
128}
129static DEVICE_ATTR_RO(temp1_input);
130
131static ssize_t temp1_max_show(struct device *dev,
132 struct device_attribute *attr, char *buf)
133{
134 struct dsa_switch *ds = dev_get_drvdata(dev);
135 int temp, ret;
136
137 ret = ds->ops->get_temp_limit(ds, &temp);
138 if (ret < 0)
139 return ret;
140
141 return sprintf(buf, "%d\n", temp * 1000);
142}
143
144static ssize_t temp1_max_store(struct device *dev,
145 struct device_attribute *attr, const char *buf,
146 size_t count)
147{
148 struct dsa_switch *ds = dev_get_drvdata(dev);
149 int temp, ret;
150
151 ret = kstrtoint(buf, 0, &temp);
152 if (ret < 0)
153 return ret;
154
155 ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
156 if (ret < 0)
157 return ret;
158
159 return count;
160}
161static DEVICE_ATTR_RW(temp1_max);
162
163static ssize_t temp1_max_alarm_show(struct device *dev,
164 struct device_attribute *attr, char *buf)
165{
166 struct dsa_switch *ds = dev_get_drvdata(dev);
167 bool alarm;
168 int ret;
169
170 ret = ds->ops->get_temp_alarm(ds, &alarm);
171 if (ret < 0)
172 return ret;
173
174 return sprintf(buf, "%d\n", alarm);
175}
176static DEVICE_ATTR_RO(temp1_max_alarm);
177
178static struct attribute *dsa_hwmon_attrs[] = {
179 &dev_attr_temp1_input.attr, /* 0 */
180 &dev_attr_temp1_max.attr, /* 1 */
181 &dev_attr_temp1_max_alarm.attr, /* 2 */
182 NULL
183};
184
185static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
186 struct attribute *attr, int index)
187{
188 struct device *dev = container_of(kobj, struct device, kobj);
189 struct dsa_switch *ds = dev_get_drvdata(dev);
190 struct dsa_switch_ops *ops = ds->ops;
191 umode_t mode = attr->mode;
192
193 if (index == 1) {
194 if (!ops->get_temp_limit)
195 mode = 0;
196 else if (!ops->set_temp_limit)
197 mode &= ~S_IWUSR;
198 } else if (index == 2 && !ops->get_temp_alarm) {
199 mode = 0;
200 }
201 return mode;
202}
203
204static const struct attribute_group dsa_hwmon_group = {
205 .attrs = dsa_hwmon_attrs,
206 .is_visible = dsa_hwmon_attrs_visible,
207};
208__ATTRIBUTE_GROUPS(dsa_hwmon);
209
210#endif /* CONFIG_NET_DSA_HWMON */
211
212/* basic switch operations **************************************************/ 111/* basic switch operations **************************************************/
213int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, 112int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
214 struct device_node *port_dn, int port) 113 struct dsa_port *dport, int port)
215{ 114{
115 struct device_node *port_dn = dport->dn;
216 struct phy_device *phydev; 116 struct phy_device *phydev;
217 int ret, mode; 117 int ret, mode;
218 118
@@ -242,15 +142,15 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
242 142
243static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev) 143static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev)
244{ 144{
245 struct device_node *port_dn; 145 struct dsa_port *dport;
246 int ret, port; 146 int ret, port;
247 147
248 for (port = 0; port < DSA_MAX_PORTS; port++) { 148 for (port = 0; port < ds->num_ports; port++) {
249 if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) 149 if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
250 continue; 150 continue;
251 151
252 port_dn = ds->ports[port].dn; 152 dport = &ds->ports[port];
253 ret = dsa_cpu_dsa_setup(ds, dev, port_dn, port); 153 ret = dsa_cpu_dsa_setup(ds, dev, dport, port);
254 if (ret) 154 if (ret)
255 return ret; 155 return ret;
256 } 156 }
@@ -308,7 +208,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds)
308 208
309static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) 209static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
310{ 210{
311 struct dsa_switch_ops *ops = ds->ops; 211 const struct dsa_switch_ops *ops = ds->ops;
312 struct dsa_switch_tree *dst = ds->dst; 212 struct dsa_switch_tree *dst = ds->dst;
313 struct dsa_chip_data *cd = ds->cd; 213 struct dsa_chip_data *cd = ds->cd;
314 bool valid_name_found = false; 214 bool valid_name_found = false;
@@ -318,7 +218,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
318 /* 218 /*
319 * Validate supplied switch configuration. 219 * Validate supplied switch configuration.
320 */ 220 */
321 for (i = 0; i < DSA_MAX_PORTS; i++) { 221 for (i = 0; i < ds->num_ports; i++) {
322 char *name; 222 char *name;
323 223
324 name = cd->port_names[i]; 224 name = cd->port_names[i];
@@ -326,13 +226,12 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
326 continue; 226 continue;
327 227
328 if (!strcmp(name, "cpu")) { 228 if (!strcmp(name, "cpu")) {
329 if (dst->cpu_switch != -1) { 229 if (dst->cpu_switch) {
330 netdev_err(dst->master_netdev, 230 netdev_err(dst->master_netdev,
331 "multiple cpu ports?!\n"); 231 "multiple cpu ports?!\n");
332 ret = -EINVAL; 232 return -EINVAL;
333 goto out;
334 } 233 }
335 dst->cpu_switch = index; 234 dst->cpu_switch = ds;
336 dst->cpu_port = i; 235 dst->cpu_port = i;
337 ds->cpu_port_mask |= 1 << i; 236 ds->cpu_port_mask |= 1 << i;
338 } else if (!strcmp(name, "dsa")) { 237 } else if (!strcmp(name, "dsa")) {
@@ -343,10 +242,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
343 valid_name_found = true; 242 valid_name_found = true;
344 } 243 }
345 244
346 if (!valid_name_found && i == DSA_MAX_PORTS) { 245 if (!valid_name_found && i == ds->num_ports)
347 ret = -EINVAL; 246 return -EINVAL;
348 goto out;
349 }
350 247
351 /* Make the built-in MII bus mask match the number of ports, 248 /* Make the built-in MII bus mask match the number of ports,
352 * switch drivers can override this later 249 * switch drivers can override this later
@@ -358,15 +255,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
358 * tagging protocol to the preferred tagging format of this 255 * tagging protocol to the preferred tagging format of this
359 * switch. 256 * switch.
360 */ 257 */
361 if (dst->cpu_switch == index) { 258 if (dst->cpu_switch == ds) {
362 enum dsa_tag_protocol tag_protocol; 259 enum dsa_tag_protocol tag_protocol;
363 260
364 tag_protocol = ops->get_tag_protocol(ds); 261 tag_protocol = ops->get_tag_protocol(ds);
365 dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); 262 dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol);
366 if (IS_ERR(dst->tag_ops)) { 263 if (IS_ERR(dst->tag_ops))
367 ret = PTR_ERR(dst->tag_ops); 264 return PTR_ERR(dst->tag_ops);
368 goto out;
369 }
370 265
371 dst->rcv = dst->tag_ops->rcv; 266 dst->rcv = dst->tag_ops->rcv;
372 } 267 }
@@ -378,85 +273,55 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
378 */ 273 */
379 ret = ops->setup(ds); 274 ret = ops->setup(ds);
380 if (ret < 0) 275 if (ret < 0)
381 goto out; 276 return ret;
277
278 ret = dsa_switch_register_notifier(ds);
279 if (ret)
280 return ret;
382 281
383 if (ops->set_addr) { 282 if (ops->set_addr) {
384 ret = ops->set_addr(ds, dst->master_netdev->dev_addr); 283 ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
385 if (ret < 0) 284 if (ret < 0)
386 goto out; 285 return ret;
387 } 286 }
388 287
389 if (!ds->slave_mii_bus && ops->phy_read) { 288 if (!ds->slave_mii_bus && ops->phy_read) {
390 ds->slave_mii_bus = devm_mdiobus_alloc(parent); 289 ds->slave_mii_bus = devm_mdiobus_alloc(parent);
391 if (!ds->slave_mii_bus) { 290 if (!ds->slave_mii_bus)
392 ret = -ENOMEM; 291 return -ENOMEM;
393 goto out;
394 }
395 dsa_slave_mii_bus_init(ds); 292 dsa_slave_mii_bus_init(ds);
396 293
397 ret = mdiobus_register(ds->slave_mii_bus); 294 ret = mdiobus_register(ds->slave_mii_bus);
398 if (ret < 0) 295 if (ret < 0)
399 goto out; 296 return ret;
400 } 297 }
401 298
402 /* 299 /*
403 * Create network devices for physical switch ports. 300 * Create network devices for physical switch ports.
404 */ 301 */
405 for (i = 0; i < DSA_MAX_PORTS; i++) { 302 for (i = 0; i < ds->num_ports; i++) {
406 ds->ports[i].dn = cd->port_dn[i]; 303 ds->ports[i].dn = cd->port_dn[i];
407 304
408 if (!(ds->enabled_port_mask & (1 << i))) 305 if (!(ds->enabled_port_mask & (1 << i)))
409 continue; 306 continue;
410 307
411 ret = dsa_slave_create(ds, parent, i, cd->port_names[i]); 308 ret = dsa_slave_create(ds, parent, i, cd->port_names[i]);
412 if (ret < 0) { 309 if (ret < 0)
413 netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n", 310 netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n",
414 index, i, cd->port_names[i], ret); 311 index, i, cd->port_names[i], ret);
415 ret = 0;
416 }
417 } 312 }
418 313
419 /* Perform configuration of the CPU and DSA ports */ 314 /* Perform configuration of the CPU and DSA ports */
420 ret = dsa_cpu_dsa_setups(ds, parent); 315 ret = dsa_cpu_dsa_setups(ds, parent);
421 if (ret < 0) { 316 if (ret < 0)
422 netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", 317 netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n",
423 index); 318 index);
424 ret = 0;
425 }
426 319
427 ret = dsa_cpu_port_ethtool_setup(ds); 320 ret = dsa_cpu_port_ethtool_setup(ds);
428 if (ret) 321 if (ret)
429 return ret; 322 return ret;
430 323
431#ifdef CONFIG_NET_DSA_HWMON 324 return 0;
432 /* If the switch provides a temperature sensor,
433 * register with hardware monitoring subsystem.
434 * Treat registration error as non-fatal and ignore it.
435 */
436 if (ops->get_temp) {
437 const char *netname = netdev_name(dst->master_netdev);
438 char hname[IFNAMSIZ + 1];
439 int i, j;
440
441 /* Create valid hwmon 'name' attribute */
442 for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) {
443 if (isalnum(netname[i]))
444 hname[j++] = netname[i];
445 }
446 hname[j] = '\0';
447 scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d",
448 hname, index);
449 ds->hwmon_dev = hwmon_device_register_with_groups(NULL,
450 ds->hwmon_name, ds, dsa_hwmon_groups);
451 if (IS_ERR(ds->hwmon_dev))
452 ds->hwmon_dev = NULL;
453 }
454#endif /* CONFIG_NET_DSA_HWMON */
455
456 return ret;
457
458out:
459 return ret;
460} 325}
461 326
462static struct dsa_switch * 327static struct dsa_switch *
@@ -464,7 +329,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
464 struct device *parent, struct device *host_dev) 329 struct device *parent, struct device *host_dev)
465{ 330{
466 struct dsa_chip_data *cd = dst->pd->chip + index; 331 struct dsa_chip_data *cd = dst->pd->chip + index;
467 struct dsa_switch_ops *ops; 332 const struct dsa_switch_ops *ops;
468 struct dsa_switch *ds; 333 struct dsa_switch *ds;
469 int ret; 334 int ret;
470 const char *name; 335 const char *name;
@@ -486,8 +351,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
486 /* 351 /*
487 * Allocate and initialise switch state. 352 * Allocate and initialise switch state.
488 */ 353 */
489 ds = devm_kzalloc(parent, sizeof(*ds), GFP_KERNEL); 354 ds = dsa_switch_alloc(parent, DSA_MAX_PORTS);
490 if (ds == NULL) 355 if (!ds)
491 return ERR_PTR(-ENOMEM); 356 return ERR_PTR(-ENOMEM);
492 357
493 ds->dst = dst; 358 ds->dst = dst;
@@ -495,7 +360,6 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
495 ds->cd = cd; 360 ds->cd = cd;
496 ds->ops = ops; 361 ds->ops = ops;
497 ds->priv = priv; 362 ds->priv = priv;
498 ds->dev = parent;
499 363
500 ret = dsa_switch_setup_one(ds, parent); 364 ret = dsa_switch_setup_one(ds, parent);
501 if (ret) 365 if (ret)
@@ -504,8 +368,10 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
504 return ds; 368 return ds;
505} 369}
506 370
507void dsa_cpu_dsa_destroy(struct device_node *port_dn) 371void dsa_cpu_dsa_destroy(struct dsa_port *port)
508{ 372{
373 struct device_node *port_dn = port->dn;
374
509 if (of_phy_is_fixed_link(port_dn)) 375 if (of_phy_is_fixed_link(port_dn))
510 of_phy_deregister_fixed_link(port_dn); 376 of_phy_deregister_fixed_link(port_dn);
511} 377}
@@ -514,13 +380,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
514{ 380{
515 int port; 381 int port;
516 382
517#ifdef CONFIG_NET_DSA_HWMON
518 if (ds->hwmon_dev)
519 hwmon_device_unregister(ds->hwmon_dev);
520#endif
521
522 /* Destroy network devices for physical switch ports. */ 383 /* Destroy network devices for physical switch ports. */
523 for (port = 0; port < DSA_MAX_PORTS; port++) { 384 for (port = 0; port < ds->num_ports; port++) {
524 if (!(ds->enabled_port_mask & (1 << port))) 385 if (!(ds->enabled_port_mask & (1 << port)))
525 continue; 386 continue;
526 387
@@ -531,10 +392,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
531 } 392 }
532 393
533 /* Disable configuration of the CPU and DSA ports */ 394 /* Disable configuration of the CPU and DSA ports */
534 for (port = 0; port < DSA_MAX_PORTS; port++) { 395 for (port = 0; port < ds->num_ports; port++) {
535 if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) 396 if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
536 continue; 397 continue;
537 dsa_cpu_dsa_destroy(ds->ports[port].dn); 398 dsa_cpu_dsa_destroy(&ds->ports[port]);
538 399
539 /* Clearing a bit which is not set does no harm */ 400 /* Clearing a bit which is not set does no harm */
540 ds->cpu_port_mask |= ~(1 << port); 401 ds->cpu_port_mask |= ~(1 << port);
@@ -543,6 +404,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
543 404
544 if (ds->slave_mii_bus && ds->ops->phy_read) 405 if (ds->slave_mii_bus && ds->ops->phy_read)
545 mdiobus_unregister(ds->slave_mii_bus); 406 mdiobus_unregister(ds->slave_mii_bus);
407
408 dsa_switch_unregister_notifier(ds);
546} 409}
547 410
548#ifdef CONFIG_PM_SLEEP 411#ifdef CONFIG_PM_SLEEP
@@ -551,7 +414,7 @@ int dsa_switch_suspend(struct dsa_switch *ds)
551 int i, ret = 0; 414 int i, ret = 0;
552 415
553 /* Suspend slave network devices */ 416 /* Suspend slave network devices */
554 for (i = 0; i < DSA_MAX_PORTS; i++) { 417 for (i = 0; i < ds->num_ports; i++) {
555 if (!dsa_is_port_initialized(ds, i)) 418 if (!dsa_is_port_initialized(ds, i))
556 continue; 419 continue;
557 420
@@ -578,7 +441,7 @@ int dsa_switch_resume(struct dsa_switch *ds)
578 return ret; 441 return ret;
579 442
580 /* Resume slave network devices */ 443 /* Resume slave network devices */
581 for (i = 0; i < DSA_MAX_PORTS; i++) { 444 for (i = 0; i < ds->num_ports; i++) {
582 if (!dsa_is_port_initialized(ds, i)) 445 if (!dsa_is_port_initialized(ds, i))
583 continue; 446 continue;
584 447
@@ -629,7 +492,7 @@ struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev)
629} 492}
630EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus); 493EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus);
631 494
632static struct net_device *dev_to_net_device(struct device *dev) 495struct net_device *dsa_dev_to_net_device(struct device *dev)
633{ 496{
634 struct device *d; 497 struct device *d;
635 498
@@ -646,6 +509,7 @@ static struct net_device *dev_to_net_device(struct device *dev)
646 509
647 return NULL; 510 return NULL;
648} 511}
512EXPORT_SYMBOL_GPL(dsa_dev_to_net_device);
649 513
650#ifdef CONFIG_OF 514#ifdef CONFIG_OF
651static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, 515static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
@@ -898,7 +762,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
898 762
899 dst->pd = pd; 763 dst->pd = pd;
900 dst->master_netdev = dev; 764 dst->master_netdev = dev;
901 dst->cpu_switch = -1;
902 dst->cpu_port = -1; 765 dst->cpu_port = -1;
903 766
904 for (i = 0; i < pd->nr_chips; i++) { 767 for (i = 0; i < pd->nr_chips; i++) {
@@ -940,9 +803,6 @@ static int dsa_probe(struct platform_device *pdev)
940 struct dsa_switch_tree *dst; 803 struct dsa_switch_tree *dst;
941 int ret; 804 int ret;
942 805
943 pr_notice_once("Distributed Switch Architecture driver version %s\n",
944 dsa_driver_version);
945
946 if (pdev->dev.of_node) { 806 if (pdev->dev.of_node) {
947 ret = dsa_of_probe(&pdev->dev); 807 ret = dsa_of_probe(&pdev->dev);
948 if (ret) 808 if (ret)
@@ -958,7 +818,7 @@ static int dsa_probe(struct platform_device *pdev)
958 dev = pd->of_netdev; 818 dev = pd->of_netdev;
959 dev_hold(dev); 819 dev_hold(dev);
960 } else { 820 } else {
961 dev = dev_to_net_device(pd->netdev); 821 dev = dsa_dev_to_net_device(pd->netdev);
962 } 822 }
963 if (dev == NULL) { 823 if (dev == NULL) {
964 ret = -EPROBE_DEFER; 824 ret = -EPROBE_DEFER;
@@ -1013,7 +873,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
1013 dsa_switch_destroy(ds); 873 dsa_switch_destroy(ds);
1014 } 874 }
1015 875
1016 dsa_cpu_port_ethtool_restore(dst->ds[0]); 876 dsa_cpu_port_ethtool_restore(dst->cpu_switch);
1017 877
1018 dev_put(dst->master_netdev); 878 dev_put(dst->master_netdev);
1019} 879}
@@ -1050,10 +910,6 @@ static struct packet_type dsa_pack_type __read_mostly = {
1050 .func = dsa_switch_rcv, 910 .func = dsa_switch_rcv,
1051}; 911};
1052 912
1053static struct notifier_block dsa_netdevice_nb __read_mostly = {
1054 .notifier_call = dsa_slave_netdevice_event,
1055};
1056
1057#ifdef CONFIG_PM_SLEEP 913#ifdef CONFIG_PM_SLEEP
1058static int dsa_suspend(struct device *d) 914static int dsa_suspend(struct device *d)
1059{ 915{
@@ -1111,7 +967,9 @@ static int __init dsa_init_module(void)
1111{ 967{
1112 int rc; 968 int rc;
1113 969
1114 register_netdevice_notifier(&dsa_netdevice_nb); 970 rc = dsa_slave_register_notifier();
971 if (rc)
972 return rc;
1115 973
1116 rc = platform_driver_register(&dsa_driver); 974 rc = platform_driver_register(&dsa_driver);
1117 if (rc) 975 if (rc)
@@ -1125,7 +983,7 @@ module_init(dsa_init_module);
1125 983
1126static void __exit dsa_cleanup_module(void) 984static void __exit dsa_cleanup_module(void)
1127{ 985{
1128 unregister_netdevice_notifier(&dsa_netdevice_nb); 986 dsa_slave_unregister_notifier();
1129 dev_remove_pack(&dsa_pack_type); 987 dev_remove_pack(&dsa_pack_type);
1130 platform_driver_unregister(&dsa_driver); 988 platform_driver_unregister(&dsa_driver);
1131} 989}
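
The legacy probe path above now tracks drivers through a struct dsa_switch_driver that wraps a const struct dsa_switch_ops, instead of putting the list linkage inside the (previously writable) ops table. A minimal sketch of what registration looks like on the driver side after this change; the foo_* names and the reduced callback set are placeholders, not part of this patch:

static enum dsa_tag_protocol foo_get_tag_protocol(struct dsa_switch *ds)
{
	return DSA_TAG_PROTO_NONE;
}

static const char *foo_switch_probe(struct device *dsa_dev,
				    struct device *host_dev, int sw_addr,
				    void **priv)
{
	/* detect the chip behind host_dev/sw_addr and fill *priv */
	return "foo-switch";
}

static const struct dsa_switch_ops foo_switch_ops = {
	.get_tag_protocol	= foo_get_tag_protocol,
	.probe			= foo_switch_probe,
	/* .setup, .phy_read, ... go here, all behind a const pointer now */
};

static struct dsa_switch_driver foo_switch_driver = {
	.ops = &foo_switch_ops,
};

static int __init foo_init(void)
{
	register_switch_driver(&foo_switch_driver);
	return 0;
}

static void __exit foo_exit(void)
{
	unregister_switch_driver(&foo_switch_driver);
}
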
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index da3862124545..737be6470c7f 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -57,7 +57,6 @@ static struct dsa_switch_tree *dsa_add_dst(u32 tree)
57 if (!dst) 57 if (!dst)
58 return NULL; 58 return NULL;
59 dst->tree = tree; 59 dst->tree = tree;
60 dst->cpu_switch = -1;
61 INIT_LIST_HEAD(&dst->list); 60 INIT_LIST_HEAD(&dst->list);
62 list_add_tail(&dsa_switch_trees, &dst->list); 61 list_add_tail(&dsa_switch_trees, &dst->list);
63 kref_init(&dst->refcount); 62 kref_init(&dst->refcount);
@@ -79,47 +78,43 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst,
79 kref_put(&dst->refcount, dsa_free_dst); 78 kref_put(&dst->refcount, dsa_free_dst);
80} 79}
81 80
82static bool dsa_port_is_dsa(struct device_node *port) 81/* For platform data configurations, we need to have a valid name argument to
82 * differentiate a disabled port from an enabled one
83 */
84static bool dsa_port_is_valid(struct dsa_port *port)
83{ 85{
84 const char *name; 86 return !!(port->dn || port->name);
85 87}
86 name = of_get_property(port, "label", NULL);
87 if (!name)
88 return false;
89 88
90 if (!strcmp(name, "dsa")) 89static bool dsa_port_is_dsa(struct dsa_port *port)
90{
91 if (port->name && !strcmp(port->name, "dsa"))
91 return true; 92 return true;
92 93 else
93 return false; 94 return !!of_parse_phandle(port->dn, "link", 0);
94} 95}
95 96
96static bool dsa_port_is_cpu(struct device_node *port) 97static bool dsa_port_is_cpu(struct dsa_port *port)
97{ 98{
98 const char *name; 99 if (port->name && !strcmp(port->name, "cpu"))
99
100 name = of_get_property(port, "label", NULL);
101 if (!name)
102 return false;
103
104 if (!strcmp(name, "cpu"))
105 return true; 100 return true;
106 101 else
107 return false; 102 return !!of_parse_phandle(port->dn, "ethernet", 0);
108} 103}
109 104
110static bool dsa_ds_find_port(struct dsa_switch *ds, 105static bool dsa_ds_find_port_dn(struct dsa_switch *ds,
111 struct device_node *port) 106 struct device_node *port)
112{ 107{
113 u32 index; 108 u32 index;
114 109
115 for (index = 0; index < DSA_MAX_PORTS; index++) 110 for (index = 0; index < ds->num_ports; index++)
116 if (ds->ports[index].dn == port) 111 if (ds->ports[index].dn == port)
117 return true; 112 return true;
118 return false; 113 return false;
119} 114}
120 115
121static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst, 116static struct dsa_switch *dsa_dst_find_port_dn(struct dsa_switch_tree *dst,
122 struct device_node *port) 117 struct device_node *port)
123{ 118{
124 struct dsa_switch *ds; 119 struct dsa_switch *ds;
125 u32 index; 120 u32 index;
@@ -129,7 +124,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
129 if (!ds) 124 if (!ds)
130 continue; 125 continue;
131 126
132 if (dsa_ds_find_port(ds, port)) 127 if (dsa_ds_find_port_dn(ds, port))
133 return ds; 128 return ds;
134 } 129 }
135 130
@@ -138,7 +133,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
138 133
139static int dsa_port_complete(struct dsa_switch_tree *dst, 134static int dsa_port_complete(struct dsa_switch_tree *dst,
140 struct dsa_switch *src_ds, 135 struct dsa_switch *src_ds,
141 struct device_node *port, 136 struct dsa_port *port,
142 u32 src_port) 137 u32 src_port)
143{ 138{
144 struct device_node *link; 139 struct device_node *link;
@@ -146,11 +141,11 @@ static int dsa_port_complete(struct dsa_switch_tree *dst,
146 struct dsa_switch *dst_ds; 141 struct dsa_switch *dst_ds;
147 142
148 for (index = 0;; index++) { 143 for (index = 0;; index++) {
149 link = of_parse_phandle(port, "link", index); 144 link = of_parse_phandle(port->dn, "link", index);
150 if (!link) 145 if (!link)
151 break; 146 break;
152 147
153 dst_ds = dsa_dst_find_port(dst, link); 148 dst_ds = dsa_dst_find_port_dn(dst, link);
154 of_node_put(link); 149 of_node_put(link);
155 150
156 if (!dst_ds) 151 if (!dst_ds)
@@ -169,13 +164,13 @@ static int dsa_port_complete(struct dsa_switch_tree *dst,
169 */ 164 */
170static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) 165static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds)
171{ 166{
172 struct device_node *port; 167 struct dsa_port *port;
173 u32 index; 168 u32 index;
174 int err; 169 int err;
175 170
176 for (index = 0; index < DSA_MAX_PORTS; index++) { 171 for (index = 0; index < ds->num_ports; index++) {
177 port = ds->ports[index].dn; 172 port = &ds->ports[index];
178 if (!port) 173 if (!dsa_port_is_valid(port))
179 continue; 174 continue;
180 175
181 if (!dsa_port_is_dsa(port)) 176 if (!dsa_port_is_dsa(port))
@@ -215,7 +210,7 @@ static int dsa_dst_complete(struct dsa_switch_tree *dst)
215 return 0; 210 return 0;
216} 211}
217 212
218static int dsa_dsa_port_apply(struct device_node *port, u32 index, 213static int dsa_dsa_port_apply(struct dsa_port *port, u32 index,
219 struct dsa_switch *ds) 214 struct dsa_switch *ds)
220{ 215{
221 int err; 216 int err;
@@ -230,13 +225,13 @@ static int dsa_dsa_port_apply(struct device_node *port, u32 index,
230 return 0; 225 return 0;
231} 226}
232 227
233static void dsa_dsa_port_unapply(struct device_node *port, u32 index, 228static void dsa_dsa_port_unapply(struct dsa_port *port, u32 index,
234 struct dsa_switch *ds) 229 struct dsa_switch *ds)
235{ 230{
236 dsa_cpu_dsa_destroy(port); 231 dsa_cpu_dsa_destroy(port);
237} 232}
238 233
239static int dsa_cpu_port_apply(struct device_node *port, u32 index, 234static int dsa_cpu_port_apply(struct dsa_port *port, u32 index,
240 struct dsa_switch *ds) 235 struct dsa_switch *ds)
241{ 236{
242 int err; 237 int err;
@@ -253,7 +248,7 @@ static int dsa_cpu_port_apply(struct device_node *port, u32 index,
253 return 0; 248 return 0;
254} 249}
255 250
256static void dsa_cpu_port_unapply(struct device_node *port, u32 index, 251static void dsa_cpu_port_unapply(struct dsa_port *port, u32 index,
257 struct dsa_switch *ds) 252 struct dsa_switch *ds)
258{ 253{
259 dsa_cpu_dsa_destroy(port); 254 dsa_cpu_dsa_destroy(port);
@@ -261,25 +256,29 @@ static void dsa_cpu_port_unapply(struct device_node *port, u32 index,
261 256
262} 257}
263 258
264static int dsa_user_port_apply(struct device_node *port, u32 index, 259static int dsa_user_port_apply(struct dsa_port *port, u32 index,
265 struct dsa_switch *ds) 260 struct dsa_switch *ds)
266{ 261{
267 const char *name; 262 const char *name = port->name;
268 int err; 263 int err;
269 264
270 name = of_get_property(port, "label", NULL); 265 if (port->dn)
266 name = of_get_property(port->dn, "label", NULL);
267 if (!name)
268 name = "eth%d";
271 269
272 err = dsa_slave_create(ds, ds->dev, index, name); 270 err = dsa_slave_create(ds, ds->dev, index, name);
273 if (err) { 271 if (err) {
274 dev_warn(ds->dev, "Failed to create slave %d: %d\n", 272 dev_warn(ds->dev, "Failed to create slave %d: %d\n",
275 index, err); 273 index, err);
274 ds->ports[index].netdev = NULL;
276 return err; 275 return err;
277 } 276 }
278 277
279 return 0; 278 return 0;
280} 279}
281 280
282static void dsa_user_port_unapply(struct device_node *port, u32 index, 281static void dsa_user_port_unapply(struct dsa_port *port, u32 index,
283 struct dsa_switch *ds) 282 struct dsa_switch *ds)
284{ 283{
285 if (ds->ports[index].netdev) { 284 if (ds->ports[index].netdev) {
@@ -291,7 +290,7 @@ static void dsa_user_port_unapply(struct device_node *port, u32 index,
291 290
292static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) 291static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
293{ 292{
294 struct device_node *port; 293 struct dsa_port *port;
295 u32 index; 294 u32 index;
296 int err; 295 int err;
297 296
@@ -306,6 +305,10 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
306 if (err < 0) 305 if (err < 0)
307 return err; 306 return err;
308 307
308 err = dsa_switch_register_notifier(ds);
309 if (err)
310 return err;
311
309 if (ds->ops->set_addr) { 312 if (ds->ops->set_addr) {
310 err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); 313 err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
311 if (err < 0) 314 if (err < 0)
@@ -324,9 +327,9 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
324 return err; 327 return err;
325 } 328 }
326 329
327 for (index = 0; index < DSA_MAX_PORTS; index++) { 330 for (index = 0; index < ds->num_ports; index++) {
328 port = ds->ports[index].dn; 331 port = &ds->ports[index];
329 if (!port) 332 if (!dsa_port_is_valid(port))
330 continue; 333 continue;
331 334
332 if (dsa_port_is_dsa(port)) { 335 if (dsa_port_is_dsa(port)) {
@@ -353,12 +356,12 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
353 356
354static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) 357static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
355{ 358{
356 struct device_node *port; 359 struct dsa_port *port;
357 u32 index; 360 u32 index;
358 361
359 for (index = 0; index < DSA_MAX_PORTS; index++) { 362 for (index = 0; index < ds->num_ports; index++) {
360 port = ds->ports[index].dn; 363 port = &ds->ports[index];
361 if (!port) 364 if (!dsa_port_is_valid(port))
362 continue; 365 continue;
363 366
364 if (dsa_port_is_dsa(port)) { 367 if (dsa_port_is_dsa(port)) {
@@ -376,6 +379,8 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
376 379
377 if (ds->slave_mii_bus && ds->ops->phy_read) 380 if (ds->slave_mii_bus && ds->ops->phy_read)
378 mdiobus_unregister(ds->slave_mii_bus); 381 mdiobus_unregister(ds->slave_mii_bus);
382
383 dsa_switch_unregister_notifier(ds);
379} 384}
380 385
381static int dsa_dst_apply(struct dsa_switch_tree *dst) 386static int dsa_dst_apply(struct dsa_switch_tree *dst)
@@ -394,8 +399,8 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst)
394 return err; 399 return err;
395 } 400 }
396 401
397 if (dst->ds[0]) { 402 if (dst->cpu_switch) {
398 err = dsa_cpu_port_ethtool_setup(dst->ds[0]); 403 err = dsa_cpu_port_ethtool_setup(dst->cpu_switch);
399 if (err) 404 if (err)
400 return err; 405 return err;
401 } 406 }
@@ -435,14 +440,14 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
435 dsa_ds_unapply(dst, ds); 440 dsa_ds_unapply(dst, ds);
436 } 441 }
437 442
438 if (dst->ds[0]) 443 if (dst->cpu_switch)
439 dsa_cpu_port_ethtool_restore(dst->ds[0]); 444 dsa_cpu_port_ethtool_restore(dst->cpu_switch);
440 445
441 pr_info("DSA: tree %d unapplied\n", dst->tree); 446 pr_info("DSA: tree %d unapplied\n", dst->tree);
442 dst->applied = false; 447 dst->applied = false;
443} 448}
444 449
445static int dsa_cpu_parse(struct device_node *port, u32 index, 450static int dsa_cpu_parse(struct dsa_port *port, u32 index,
446 struct dsa_switch_tree *dst, 451 struct dsa_switch_tree *dst,
447 struct dsa_switch *ds) 452 struct dsa_switch *ds)
448{ 453{
@@ -450,11 +455,16 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
450 struct net_device *ethernet_dev; 455 struct net_device *ethernet_dev;
451 struct device_node *ethernet; 456 struct device_node *ethernet;
452 457
453 ethernet = of_parse_phandle(port, "ethernet", 0); 458 if (port->dn) {
454 if (!ethernet) 459 ethernet = of_parse_phandle(port->dn, "ethernet", 0);
455 return -EINVAL; 460 if (!ethernet)
461 return -EINVAL;
462 ethernet_dev = of_find_net_device_by_node(ethernet);
463 } else {
464 ethernet_dev = dsa_dev_to_net_device(ds->cd->netdev[index]);
465 dev_put(ethernet_dev);
466 }
456 467
457 ethernet_dev = of_find_net_device_by_node(ethernet);
458 if (!ethernet_dev) 468 if (!ethernet_dev)
459 return -EPROBE_DEFER; 469 return -EPROBE_DEFER;
460 470
@@ -464,8 +474,8 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
464 if (!dst->master_netdev) 474 if (!dst->master_netdev)
465 dst->master_netdev = ethernet_dev; 475 dst->master_netdev = ethernet_dev;
466 476
467 if (dst->cpu_switch == -1) { 477 if (!dst->cpu_switch) {
468 dst->cpu_switch = ds->index; 478 dst->cpu_switch = ds;
469 dst->cpu_port = index; 479 dst->cpu_port = index;
470 } 480 }
471 481
@@ -483,13 +493,13 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
483 493
484static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) 494static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds)
485{ 495{
486 struct device_node *port; 496 struct dsa_port *port;
487 u32 index; 497 u32 index;
488 int err; 498 int err;
489 499
490 for (index = 0; index < DSA_MAX_PORTS; index++) { 500 for (index = 0; index < ds->num_ports; index++) {
491 port = ds->ports[index].dn; 501 port = &ds->ports[index];
492 if (!port) 502 if (!dsa_port_is_valid(port))
493 continue; 503 continue;
494 504
495 if (dsa_port_is_cpu(port)) { 505 if (dsa_port_is_cpu(port)) {
@@ -541,7 +551,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
541 if (err) 551 if (err)
542 return err; 552 return err;
543 553
544 if (reg >= DSA_MAX_PORTS) 554 if (reg >= ds->num_ports)
545 return -EINVAL; 555 return -EINVAL;
546 556
547 ds->ports[reg].dn = port; 557 ds->ports[reg].dn = port;
@@ -550,14 +560,41 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
550 * to have access to a correct value, just like what 560 * to have access to a correct value, just like what
551 * net/dsa/dsa.c::dsa_switch_setup_one does. 561 * net/dsa/dsa.c::dsa_switch_setup_one does.
552 */ 562 */
553 if (!dsa_port_is_cpu(port)) 563 if (!dsa_port_is_cpu(&ds->ports[reg]))
554 ds->enabled_port_mask |= 1 << reg; 564 ds->enabled_port_mask |= 1 << reg;
555 } 565 }
556 566
557 return 0; 567 return 0;
558} 568}
559 569
560static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index) 570static int dsa_parse_ports(struct dsa_chip_data *cd, struct dsa_switch *ds)
571{
572 bool valid_name_found = false;
573 unsigned int i;
574
575 for (i = 0; i < DSA_MAX_PORTS; i++) {
576 if (!cd->port_names[i])
577 continue;
578
579 ds->ports[i].name = cd->port_names[i];
580
581 /* Initialize enabled_port_mask now for drv->setup()
582 * to have access to a correct value, just like what
583 * net/dsa/dsa.c::dsa_switch_setup_one does.
584 */
585 if (!dsa_port_is_cpu(&ds->ports[i]))
586 ds->enabled_port_mask |= 1 << i;
587
588 valid_name_found = true;
589 }
590
591 if (!valid_name_found && i == DSA_MAX_PORTS)
592 return -EINVAL;
593
594 return 0;
595}
596
597static int dsa_parse_member_dn(struct device_node *np, u32 *tree, u32 *index)
561{ 598{
562 int err; 599 int err;
563 600
@@ -581,6 +618,18 @@ static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index)
581 return 0; 618 return 0;
582} 619}
583 620
621static int dsa_parse_member(struct dsa_chip_data *pd, u32 *tree, u32 *index)
622{
623 if (!pd)
624 return -ENODEV;
625
626 /* We do not support complex trees with dsa_chip_data */
627 *tree = 0;
628 *index = 0;
629
630 return 0;
631}
632
584static struct device_node *dsa_get_ports(struct dsa_switch *ds, 633static struct device_node *dsa_get_ports(struct dsa_switch *ds,
585 struct device_node *np) 634 struct device_node *np)
586{ 635{
@@ -595,23 +644,36 @@ static struct device_node *dsa_get_ports(struct dsa_switch *ds,
595 return ports; 644 return ports;
596} 645}
597 646
598static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) 647static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev)
599{ 648{
600 struct device_node *ports = dsa_get_ports(ds, np); 649 struct dsa_chip_data *pdata = dev->platform_data;
650 struct device_node *np = dev->of_node;
601 struct dsa_switch_tree *dst; 651 struct dsa_switch_tree *dst;
652 struct device_node *ports;
602 u32 tree, index; 653 u32 tree, index;
603 int i, err; 654 int i, err;
604 655
605 err = dsa_parse_member(np, &tree, &index); 656 if (np) {
606 if (err) 657 err = dsa_parse_member_dn(np, &tree, &index);
607 return err; 658 if (err)
659 return err;
608 660
609 if (IS_ERR(ports)) 661 ports = dsa_get_ports(ds, np);
610 return PTR_ERR(ports); 662 if (IS_ERR(ports))
663 return PTR_ERR(ports);
611 664
612 err = dsa_parse_ports_dn(ports, ds); 665 err = dsa_parse_ports_dn(ports, ds);
613 if (err) 666 if (err)
614 return err; 667 return err;
668 } else {
669 err = dsa_parse_member(pdata, &tree, &index);
670 if (err)
671 return err;
672
673 err = dsa_parse_ports(pdata, ds);
674 if (err)
675 return err;
676 }
615 677
616 dst = dsa_get_dst(tree); 678 dst = dsa_get_dst(tree);
617 if (!dst) { 679 if (!dst) {
@@ -627,6 +689,7 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
627 689
628 ds->dst = dst; 690 ds->dst = dst;
629 ds->index = index; 691 ds->index = index;
692 ds->cd = pdata;
630 693
631 /* Initialize the routing table */ 694 /* Initialize the routing table */
632 for (i = 0; i < DSA_MAX_SWITCHES; ++i) 695 for (i = 0; i < DSA_MAX_SWITCHES; ++i)
@@ -650,8 +713,14 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
650 } 713 }
651 714
652 err = dsa_dst_parse(dst); 715 err = dsa_dst_parse(dst);
653 if (err) 716 if (err) {
717 if (err == -EPROBE_DEFER) {
718 dsa_dst_del_ds(dst, ds, ds->index);
719 return err;
720 }
721
654 goto out_del_dst; 722 goto out_del_dst;
723 }
655 724
656 err = dsa_dst_apply(dst); 725 err = dsa_dst_apply(dst);
657 if (err) { 726 if (err) {
@@ -670,12 +739,34 @@ out:
670 return err; 739 return err;
671} 740}
672 741
673int dsa_register_switch(struct dsa_switch *ds, struct device_node *np) 742struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
743{
744 size_t size = sizeof(struct dsa_switch) + n * sizeof(struct dsa_port);
745 struct dsa_switch *ds;
746 int i;
747
748 ds = devm_kzalloc(dev, size, GFP_KERNEL);
749 if (!ds)
750 return NULL;
751
752 ds->dev = dev;
753 ds->num_ports = n;
754
755 for (i = 0; i < ds->num_ports; ++i) {
756 ds->ports[i].index = i;
757 ds->ports[i].ds = ds;
758 }
759
760 return ds;
761}
762EXPORT_SYMBOL_GPL(dsa_switch_alloc);
763
764int dsa_register_switch(struct dsa_switch *ds, struct device *dev)
674{ 765{
675 int err; 766 int err;
676 767
677 mutex_lock(&dsa2_mutex); 768 mutex_lock(&dsa2_mutex);
678 err = _dsa_register_switch(ds, np); 769 err = _dsa_register_switch(ds, dev);
679 mutex_unlock(&dsa2_mutex); 770 mutex_unlock(&dsa2_mutex);
680 771
681 return err; 772 return err;
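
With _dsa_register_switch() now keyed off a struct device, the new binding covers both device tree and platform data users, and dsa_switch_alloc() carves the switch plus its dsa_port array out of a single allocation, each port pre-wired with its index and a backpointer to the switch. A sketch of the resulting driver-side pattern; foo_ops, struct foo_priv and FOO_NUM_PORTS are illustrative, not defined by this patch:

#include <linux/platform_device.h>
#include <linux/slab.h>
#include <net/dsa.h>

#define FOO_NUM_PORTS	7		/* placeholder port count */

struct foo_priv {
	int dummy;			/* chip state would live here */
};

static const struct dsa_switch_ops foo_ops;	/* real callbacks omitted */

static int foo_probe(struct platform_device *pdev)
{
	struct foo_priv *priv;
	struct dsa_switch *ds;

	/* one allocation: struct dsa_switch + FOO_NUM_PORTS struct dsa_port */
	ds = dsa_switch_alloc(&pdev->dev, FOO_NUM_PORTS);
	if (!ds)
		return -ENOMEM;

	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	ds->ops = &foo_ops;
	ds->priv = priv;

	/* works whether pdev->dev carries an of_node or platform_data */
	return dsa_register_switch(ds, &pdev->dev);
}
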
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 6cfd7388834e..0706a511244e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -25,12 +25,8 @@ struct dsa_slave_priv {
25 struct sk_buff * (*xmit)(struct sk_buff *skb, 25 struct sk_buff * (*xmit)(struct sk_buff *skb,
26 struct net_device *dev); 26 struct net_device *dev);
27 27
28 /* 28 /* DSA port data, such as switch, port index, etc. */
29 * Which switch this port is a part of, and the port index 29 struct dsa_port *dp;
30 * for this port.
31 */
32 struct dsa_switch *parent;
33 u8 port;
34 30
35 /* 31 /*
36 * The phylib phy_device pointer for the PHY connected 32 * The phylib phy_device pointer for the PHY connected
@@ -42,17 +38,18 @@ struct dsa_slave_priv {
42 int old_pause; 38 int old_pause;
43 int old_duplex; 39 int old_duplex;
44 40
45 struct net_device *bridge_dev;
46#ifdef CONFIG_NET_POLL_CONTROLLER 41#ifdef CONFIG_NET_POLL_CONTROLLER
47 struct netpoll *netpoll; 42 struct netpoll *netpoll;
48#endif 43#endif
44
45 /* TC context */
46 struct list_head mall_tc_list;
49}; 47};
50 48
51/* dsa.c */ 49/* dsa.c */
52extern char dsa_driver_version[];
53int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, 50int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
54 struct device_node *port_dn, int port); 51 struct dsa_port *dport, int port);
55void dsa_cpu_dsa_destroy(struct device_node *port_dn); 52void dsa_cpu_dsa_destroy(struct dsa_port *dport);
56const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); 53const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
57int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds); 54int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds);
58void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds); 55void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds);
@@ -66,8 +63,12 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
66void dsa_slave_destroy(struct net_device *slave_dev); 63void dsa_slave_destroy(struct net_device *slave_dev);
67int dsa_slave_suspend(struct net_device *slave_dev); 64int dsa_slave_suspend(struct net_device *slave_dev);
68int dsa_slave_resume(struct net_device *slave_dev); 65int dsa_slave_resume(struct net_device *slave_dev);
69int dsa_slave_netdevice_event(struct notifier_block *unused, 66int dsa_slave_register_notifier(void);
70 unsigned long event, void *ptr); 67void dsa_slave_unregister_notifier(void);
68
69/* switch.c */
70int dsa_switch_register_notifier(struct dsa_switch *ds);
71void dsa_switch_unregister_notifier(struct dsa_switch *ds);
71 72
72/* tag_dsa.c */ 73/* tag_dsa.c */
73extern const struct dsa_device_ops dsa_netdev_ops; 74extern const struct dsa_device_ops dsa_netdev_ops;
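
dsa_priv.h now splits the notifier plumbing: dsa_slave_register_notifier() takes over the netdevice notifier that dsa.c used to register directly, and each switch subscribes to a tree-wide event chain through dsa_switch_register_notifier(). The switch side lives in net/dsa/switch.c, which this patch does not show; a minimal sketch of the idea, assuming the tree holds a raw_notifier_head (dst->nh, the chain dsa_slave_notify() fires in slave.c below) and the switch embeds a notifier_block (ds->nb):

static int dsa_switch_event(struct notifier_block *nb,
			    unsigned long event, void *info)
{
	struct dsa_switch *ds = container_of(nb, struct dsa_switch, nb);
	int err = 0;

	switch (event) {
	case DSA_NOTIFIER_BRIDGE_JOIN:
		/* info is a struct dsa_notifier_bridge_info; dispatch to
		 * ds->ops->port_bridge_join() for the affected port
		 */
		break;
	case DSA_NOTIFIER_BRIDGE_LEAVE:
		/* likewise for ds->ops->port_bridge_leave() */
		break;
	default:
		err = -EOPNOTSUPP;
		break;
	}

	/* dsa_slave_notify() turns this back into an errno */
	return notifier_from_errno(err);
}

int dsa_switch_register_notifier(struct dsa_switch *ds)
{
	ds->nb.notifier_call = dsa_switch_event;

	return raw_notifier_chain_register(&ds->dst->nh, &ds->nb);
}

void dsa_switch_unregister_notifier(struct dsa_switch *ds)
{
	raw_notifier_chain_unregister(&ds->dst->nh, &ds->nb);
}
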
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 7d4596110851..c34872e1febc 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -16,12 +16,28 @@
16#include <linux/of_net.h> 16#include <linux/of_net.h>
17#include <linux/of_mdio.h> 17#include <linux/of_mdio.h>
18#include <linux/mdio.h> 18#include <linux/mdio.h>
19#include <linux/list.h>
19#include <net/rtnetlink.h> 20#include <net/rtnetlink.h>
20#include <net/switchdev.h> 21#include <net/switchdev.h>
22#include <net/pkt_cls.h>
23#include <net/tc_act/tc_mirred.h>
21#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
22#include <linux/netpoll.h> 25#include <linux/netpoll.h>
23#include "dsa_priv.h" 26#include "dsa_priv.h"
24 27
28static bool dsa_slave_dev_check(struct net_device *dev);
29
30static int dsa_slave_notify(struct net_device *dev, unsigned long e, void *v)
31{
32 struct dsa_slave_priv *p = netdev_priv(dev);
33 struct raw_notifier_head *nh = &p->dp->ds->dst->nh;
34 int err;
35
36 err = raw_notifier_call_chain(nh, e, v);
37
38 return notifier_to_errno(err);
39}
40
25/* slave mii_bus handling ***************************************************/ 41/* slave mii_bus handling ***************************************************/
26static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) 42static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
27{ 43{
@@ -61,17 +77,20 @@ static int dsa_slave_get_iflink(const struct net_device *dev)
61{ 77{
62 struct dsa_slave_priv *p = netdev_priv(dev); 78 struct dsa_slave_priv *p = netdev_priv(dev);
63 79
64 return p->parent->dst->master_netdev->ifindex; 80 return p->dp->ds->dst->master_netdev->ifindex;
65} 81}
66 82
67static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p) 83static inline bool dsa_port_is_bridged(struct dsa_port *dp)
68{ 84{
69 return !!p->bridge_dev; 85 return !!dp->bridge_dev;
70} 86}
71 87
72static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state) 88static void dsa_slave_set_state(struct net_device *dev, u8 state)
73{ 89{
74 struct dsa_port *dp = &ds->ports[port]; 90 struct dsa_slave_priv *p = netdev_priv(dev);
91 struct dsa_port *dp = p->dp;
92 struct dsa_switch *ds = dp->ds;
93 int port = dp->index;
75 94
76 if (ds->ops->port_stp_state_set) 95 if (ds->ops->port_stp_state_set)
77 ds->ops->port_stp_state_set(ds, port, state); 96 ds->ops->port_stp_state_set(ds, port, state);
@@ -96,9 +115,9 @@ static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
96static int dsa_slave_open(struct net_device *dev) 115static int dsa_slave_open(struct net_device *dev)
97{ 116{
98 struct dsa_slave_priv *p = netdev_priv(dev); 117 struct dsa_slave_priv *p = netdev_priv(dev);
99 struct net_device *master = p->parent->dst->master_netdev; 118 struct net_device *master = p->dp->ds->dst->master_netdev;
100 struct dsa_switch *ds = p->parent; 119 struct dsa_switch *ds = p->dp->ds;
101 u8 stp_state = dsa_port_is_bridged(p) ? 120 u8 stp_state = dsa_port_is_bridged(p->dp) ?
102 BR_STATE_BLOCKING : BR_STATE_FORWARDING; 121 BR_STATE_BLOCKING : BR_STATE_FORWARDING;
103 int err; 122 int err;
104 123
@@ -123,12 +142,12 @@ static int dsa_slave_open(struct net_device *dev)
123 } 142 }
124 143
125 if (ds->ops->port_enable) { 144 if (ds->ops->port_enable) {
126 err = ds->ops->port_enable(ds, p->port, p->phy); 145 err = ds->ops->port_enable(ds, p->dp->index, p->phy);
127 if (err) 146 if (err)
128 goto clear_promisc; 147 goto clear_promisc;
129 } 148 }
130 149
131 dsa_port_set_stp_state(ds, p->port, stp_state); 150 dsa_slave_set_state(dev, stp_state);
132 151
133 if (p->phy) 152 if (p->phy)
134 phy_start(p->phy); 153 phy_start(p->phy);
@@ -151,8 +170,8 @@ out:
151static int dsa_slave_close(struct net_device *dev) 170static int dsa_slave_close(struct net_device *dev)
152{ 171{
153 struct dsa_slave_priv *p = netdev_priv(dev); 172 struct dsa_slave_priv *p = netdev_priv(dev);
154 struct net_device *master = p->parent->dst->master_netdev; 173 struct net_device *master = p->dp->ds->dst->master_netdev;
155 struct dsa_switch *ds = p->parent; 174 struct dsa_switch *ds = p->dp->ds;
156 175
157 if (p->phy) 176 if (p->phy)
158 phy_stop(p->phy); 177 phy_stop(p->phy);
@@ -168,9 +187,9 @@ static int dsa_slave_close(struct net_device *dev)
168 dev_uc_del(master, dev->dev_addr); 187 dev_uc_del(master, dev->dev_addr);
169 188
170 if (ds->ops->port_disable) 189 if (ds->ops->port_disable)
171 ds->ops->port_disable(ds, p->port, p->phy); 190 ds->ops->port_disable(ds, p->dp->index, p->phy);
172 191
173 dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED); 192 dsa_slave_set_state(dev, BR_STATE_DISABLED);
174 193
175 return 0; 194 return 0;
176} 195}
@@ -178,7 +197,7 @@ static int dsa_slave_close(struct net_device *dev)
178static void dsa_slave_change_rx_flags(struct net_device *dev, int change) 197static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
179{ 198{
180 struct dsa_slave_priv *p = netdev_priv(dev); 199 struct dsa_slave_priv *p = netdev_priv(dev);
181 struct net_device *master = p->parent->dst->master_netdev; 200 struct net_device *master = p->dp->ds->dst->master_netdev;
182 201
183 if (change & IFF_ALLMULTI) 202 if (change & IFF_ALLMULTI)
184 dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1); 203 dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
@@ -189,7 +208,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
189static void dsa_slave_set_rx_mode(struct net_device *dev) 208static void dsa_slave_set_rx_mode(struct net_device *dev)
190{ 209{
191 struct dsa_slave_priv *p = netdev_priv(dev); 210 struct dsa_slave_priv *p = netdev_priv(dev);
192 struct net_device *master = p->parent->dst->master_netdev; 211 struct net_device *master = p->dp->ds->dst->master_netdev;
193 212
194 dev_mc_sync(master, dev); 213 dev_mc_sync(master, dev);
195 dev_uc_sync(master, dev); 214 dev_uc_sync(master, dev);
@@ -198,7 +217,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev)
198static int dsa_slave_set_mac_address(struct net_device *dev, void *a) 217static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
199{ 218{
200 struct dsa_slave_priv *p = netdev_priv(dev); 219 struct dsa_slave_priv *p = netdev_priv(dev);
201 struct net_device *master = p->parent->dst->master_netdev; 220 struct net_device *master = p->dp->ds->dst->master_netdev;
202 struct sockaddr *addr = a; 221 struct sockaddr *addr = a;
203 int err; 222 int err;
204 223
@@ -228,16 +247,17 @@ static int dsa_slave_port_vlan_add(struct net_device *dev,
228 struct switchdev_trans *trans) 247 struct switchdev_trans *trans)
229{ 248{
230 struct dsa_slave_priv *p = netdev_priv(dev); 249 struct dsa_slave_priv *p = netdev_priv(dev);
231 struct dsa_switch *ds = p->parent; 250 struct dsa_port *dp = p->dp;
251 struct dsa_switch *ds = dp->ds;
232 252
233 if (switchdev_trans_ph_prepare(trans)) { 253 if (switchdev_trans_ph_prepare(trans)) {
234 if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) 254 if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
235 return -EOPNOTSUPP; 255 return -EOPNOTSUPP;
236 256
237 return ds->ops->port_vlan_prepare(ds, p->port, vlan, trans); 257 return ds->ops->port_vlan_prepare(ds, dp->index, vlan, trans);
238 } 258 }
239 259
240 ds->ops->port_vlan_add(ds, p->port, vlan, trans); 260 ds->ops->port_vlan_add(ds, dp->index, vlan, trans);
241 261
242 return 0; 262 return 0;
243} 263}
@@ -246,12 +266,12 @@ static int dsa_slave_port_vlan_del(struct net_device *dev,
246 const struct switchdev_obj_port_vlan *vlan) 266 const struct switchdev_obj_port_vlan *vlan)
247{ 267{
248 struct dsa_slave_priv *p = netdev_priv(dev); 268 struct dsa_slave_priv *p = netdev_priv(dev);
249 struct dsa_switch *ds = p->parent; 269 struct dsa_switch *ds = p->dp->ds;
250 270
251 if (!ds->ops->port_vlan_del) 271 if (!ds->ops->port_vlan_del)
252 return -EOPNOTSUPP; 272 return -EOPNOTSUPP;
253 273
254 return ds->ops->port_vlan_del(ds, p->port, vlan); 274 return ds->ops->port_vlan_del(ds, p->dp->index, vlan);
255} 275}
256 276
257static int dsa_slave_port_vlan_dump(struct net_device *dev, 277static int dsa_slave_port_vlan_dump(struct net_device *dev,
@@ -259,10 +279,10 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
259 switchdev_obj_dump_cb_t *cb) 279 switchdev_obj_dump_cb_t *cb)
260{ 280{
261 struct dsa_slave_priv *p = netdev_priv(dev); 281 struct dsa_slave_priv *p = netdev_priv(dev);
262 struct dsa_switch *ds = p->parent; 282 struct dsa_switch *ds = p->dp->ds;
263 283
264 if (ds->ops->port_vlan_dump) 284 if (ds->ops->port_vlan_dump)
265 return ds->ops->port_vlan_dump(ds, p->port, vlan, cb); 285 return ds->ops->port_vlan_dump(ds, p->dp->index, vlan, cb);
266 286
267 return -EOPNOTSUPP; 287 return -EOPNOTSUPP;
268} 288}
@@ -272,16 +292,16 @@ static int dsa_slave_port_fdb_add(struct net_device *dev,
272 struct switchdev_trans *trans) 292 struct switchdev_trans *trans)
273{ 293{
274 struct dsa_slave_priv *p = netdev_priv(dev); 294 struct dsa_slave_priv *p = netdev_priv(dev);
275 struct dsa_switch *ds = p->parent; 295 struct dsa_switch *ds = p->dp->ds;
276 296
277 if (switchdev_trans_ph_prepare(trans)) { 297 if (switchdev_trans_ph_prepare(trans)) {
278 if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) 298 if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add)
279 return -EOPNOTSUPP; 299 return -EOPNOTSUPP;
280 300
281 return ds->ops->port_fdb_prepare(ds, p->port, fdb, trans); 301 return ds->ops->port_fdb_prepare(ds, p->dp->index, fdb, trans);
282 } 302 }
283 303
284 ds->ops->port_fdb_add(ds, p->port, fdb, trans); 304 ds->ops->port_fdb_add(ds, p->dp->index, fdb, trans);
285 305
286 return 0; 306 return 0;
287} 307}
@@ -290,11 +310,11 @@ static int dsa_slave_port_fdb_del(struct net_device *dev,
290 const struct switchdev_obj_port_fdb *fdb) 310 const struct switchdev_obj_port_fdb *fdb)
291{ 311{
292 struct dsa_slave_priv *p = netdev_priv(dev); 312 struct dsa_slave_priv *p = netdev_priv(dev);
293 struct dsa_switch *ds = p->parent; 313 struct dsa_switch *ds = p->dp->ds;
294 int ret = -EOPNOTSUPP; 314 int ret = -EOPNOTSUPP;
295 315
296 if (ds->ops->port_fdb_del) 316 if (ds->ops->port_fdb_del)
297 ret = ds->ops->port_fdb_del(ds, p->port, fdb); 317 ret = ds->ops->port_fdb_del(ds, p->dp->index, fdb);
298 318
299 return ret; 319 return ret;
300} 320}
@@ -304,10 +324,10 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev,
304 switchdev_obj_dump_cb_t *cb) 324 switchdev_obj_dump_cb_t *cb)
305{ 325{
306 struct dsa_slave_priv *p = netdev_priv(dev); 326 struct dsa_slave_priv *p = netdev_priv(dev);
307 struct dsa_switch *ds = p->parent; 327 struct dsa_switch *ds = p->dp->ds;
308 328
309 if (ds->ops->port_fdb_dump) 329 if (ds->ops->port_fdb_dump)
310 return ds->ops->port_fdb_dump(ds, p->port, fdb, cb); 330 return ds->ops->port_fdb_dump(ds, p->dp->index, fdb, cb);
311 331
312 return -EOPNOTSUPP; 332 return -EOPNOTSUPP;
313} 333}
@@ -317,16 +337,16 @@ static int dsa_slave_port_mdb_add(struct net_device *dev,
317 struct switchdev_trans *trans) 337 struct switchdev_trans *trans)
318{ 338{
319 struct dsa_slave_priv *p = netdev_priv(dev); 339 struct dsa_slave_priv *p = netdev_priv(dev);
320 struct dsa_switch *ds = p->parent; 340 struct dsa_switch *ds = p->dp->ds;
321 341
322 if (switchdev_trans_ph_prepare(trans)) { 342 if (switchdev_trans_ph_prepare(trans)) {
323 if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) 343 if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
324 return -EOPNOTSUPP; 344 return -EOPNOTSUPP;
325 345
326 return ds->ops->port_mdb_prepare(ds, p->port, mdb, trans); 346 return ds->ops->port_mdb_prepare(ds, p->dp->index, mdb, trans);
327 } 347 }
328 348
329 ds->ops->port_mdb_add(ds, p->port, mdb, trans); 349 ds->ops->port_mdb_add(ds, p->dp->index, mdb, trans);
330 350
331 return 0; 351 return 0;
332} 352}
@@ -335,10 +355,10 @@ static int dsa_slave_port_mdb_del(struct net_device *dev,
335 const struct switchdev_obj_port_mdb *mdb) 355 const struct switchdev_obj_port_mdb *mdb)
336{ 356{
337 struct dsa_slave_priv *p = netdev_priv(dev); 357 struct dsa_slave_priv *p = netdev_priv(dev);
338 struct dsa_switch *ds = p->parent; 358 struct dsa_switch *ds = p->dp->ds;
339 359
340 if (ds->ops->port_mdb_del) 360 if (ds->ops->port_mdb_del)
341 return ds->ops->port_mdb_del(ds, p->port, mdb); 361 return ds->ops->port_mdb_del(ds, p->dp->index, mdb);
342 362
343 return -EOPNOTSUPP; 363 return -EOPNOTSUPP;
344} 364}
@@ -348,10 +368,10 @@ static int dsa_slave_port_mdb_dump(struct net_device *dev,
348 switchdev_obj_dump_cb_t *cb) 368 switchdev_obj_dump_cb_t *cb)
349{ 369{
350 struct dsa_slave_priv *p = netdev_priv(dev); 370 struct dsa_slave_priv *p = netdev_priv(dev);
351 struct dsa_switch *ds = p->parent; 371 struct dsa_switch *ds = p->dp->ds;
352 372
353 if (ds->ops->port_mdb_dump) 373 if (ds->ops->port_mdb_dump)
354 return ds->ops->port_mdb_dump(ds, p->port, mdb, cb); 374 return ds->ops->port_mdb_dump(ds, p->dp->index, mdb, cb);
355 375
356 return -EOPNOTSUPP; 376 return -EOPNOTSUPP;
357} 377}
@@ -371,12 +391,12 @@ static int dsa_slave_stp_state_set(struct net_device *dev,
371 struct switchdev_trans *trans) 391 struct switchdev_trans *trans)
372{ 392{
373 struct dsa_slave_priv *p = netdev_priv(dev); 393 struct dsa_slave_priv *p = netdev_priv(dev);
374 struct dsa_switch *ds = p->parent; 394 struct dsa_switch *ds = p->dp->ds;
375 395
376 if (switchdev_trans_ph_prepare(trans)) 396 if (switchdev_trans_ph_prepare(trans))
377 return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; 397 return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP;
378 398
379 dsa_port_set_stp_state(ds, p->port, attr->u.stp_state); 399 dsa_slave_set_state(dev, attr->u.stp_state);
380 400
381 return 0; 401 return 0;
382} 402}
@@ -386,14 +406,14 @@ static int dsa_slave_vlan_filtering(struct net_device *dev,
386 struct switchdev_trans *trans) 406 struct switchdev_trans *trans)
387{ 407{
388 struct dsa_slave_priv *p = netdev_priv(dev); 408 struct dsa_slave_priv *p = netdev_priv(dev);
389 struct dsa_switch *ds = p->parent; 409 struct dsa_switch *ds = p->dp->ds;
390 410
391 /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ 411 /* bridge skips -EOPNOTSUPP, so skip the prepare phase */
392 if (switchdev_trans_ph_prepare(trans)) 412 if (switchdev_trans_ph_prepare(trans))
393 return 0; 413 return 0;
394 414
395 if (ds->ops->port_vlan_filtering) 415 if (ds->ops->port_vlan_filtering)
396 return ds->ops->port_vlan_filtering(ds, p->port, 416 return ds->ops->port_vlan_filtering(ds, p->dp->index,
397 attr->u.vlan_filtering); 417 attr->u.vlan_filtering);
398 418
399 return 0; 419 return 0;
@@ -404,7 +424,7 @@ static int dsa_fastest_ageing_time(struct dsa_switch *ds,
404{ 424{
405 int i; 425 int i;
406 426
407 for (i = 0; i < DSA_MAX_PORTS; ++i) { 427 for (i = 0; i < ds->num_ports; ++i) {
408 struct dsa_port *dp = &ds->ports[i]; 428 struct dsa_port *dp = &ds->ports[i];
409 429
410 if (dp && dp->ageing_time && dp->ageing_time < ageing_time) 430 if (dp && dp->ageing_time && dp->ageing_time < ageing_time)
@@ -419,7 +439,7 @@ static int dsa_slave_ageing_time(struct net_device *dev,
419 struct switchdev_trans *trans) 439 struct switchdev_trans *trans)
420{ 440{
421 struct dsa_slave_priv *p = netdev_priv(dev); 441 struct dsa_slave_priv *p = netdev_priv(dev);
422 struct dsa_switch *ds = p->parent; 442 struct dsa_switch *ds = p->dp->ds;
423 unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time); 443 unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time);
424 unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); 444 unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies);
425 445
@@ -428,7 +448,7 @@ static int dsa_slave_ageing_time(struct net_device *dev,
428 return 0; 448 return 0;
429 449
430 /* Keep the fastest ageing time in case of multiple bridges */ 450 /* Keep the fastest ageing time in case of multiple bridges */
431 ds->ports[p->port].ageing_time = ageing_time; 451 p->dp->ageing_time = ageing_time;
432 ageing_time = dsa_fastest_ageing_time(ds, ageing_time); 452 ageing_time = dsa_fastest_ageing_time(ds, ageing_time);
433 453
434 if (ds->ops->set_ageing_time) 454 if (ds->ops->set_ageing_time)
@@ -553,39 +573,58 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
553 struct net_device *br) 573 struct net_device *br)
554{ 574{
555 struct dsa_slave_priv *p = netdev_priv(dev); 575 struct dsa_slave_priv *p = netdev_priv(dev);
556 struct dsa_switch *ds = p->parent; 576 struct dsa_notifier_bridge_info info = {
557 int ret = -EOPNOTSUPP; 577 .sw_index = p->dp->ds->index,
578 .port = p->dp->index,
579 .br = br,
580 };
581 int err;
582
583 /* Here the port is already bridged. Reflect the current configuration
584 * so that drivers can program their chips accordingly.
585 */
586 p->dp->bridge_dev = br;
558 587
559 p->bridge_dev = br; 588 err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_JOIN, &info);
560 589
561 if (ds->ops->port_bridge_join) 590 /* The bridging is rolled back on error */
562 ret = ds->ops->port_bridge_join(ds, p->port, br); 591 if (err)
592 p->dp->bridge_dev = NULL;
563 593
564 return ret == -EOPNOTSUPP ? 0 : ret; 594 return err;
565} 595}
566 596
567static void dsa_slave_bridge_port_leave(struct net_device *dev) 597static void dsa_slave_bridge_port_leave(struct net_device *dev,
598 struct net_device *br)
568{ 599{
569 struct dsa_slave_priv *p = netdev_priv(dev); 600 struct dsa_slave_priv *p = netdev_priv(dev);
570 struct dsa_switch *ds = p->parent; 601 struct dsa_notifier_bridge_info info = {
571 602 .sw_index = p->dp->ds->index,
603 .port = p->dp->index,
604 .br = br,
605 };
606 int err;
572 607
573 if (ds->ops->port_bridge_leave) 608 /* Here the port is already unbridged. Reflect the current configuration
574 ds->ops->port_bridge_leave(ds, p->port); 609 * so that drivers can program their chips accordingly.
610 */
611 p->dp->bridge_dev = NULL;
575 612
576 p->bridge_dev = NULL; 613 err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_LEAVE, &info);
614 if (err)
615 netdev_err(dev, "failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n");
577 616
578 /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, 617 /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
579 * so allow it to be in BR_STATE_FORWARDING to be kept functional 618 * so allow it to be in BR_STATE_FORWARDING to be kept functional
580 */ 619 */
581 dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING); 620 dsa_slave_set_state(dev, BR_STATE_FORWARDING);
582} 621}
583 622
584static int dsa_slave_port_attr_get(struct net_device *dev, 623static int dsa_slave_port_attr_get(struct net_device *dev,
585 struct switchdev_attr *attr) 624 struct switchdev_attr *attr)
586{ 625{
587 struct dsa_slave_priv *p = netdev_priv(dev); 626 struct dsa_slave_priv *p = netdev_priv(dev);
588 struct dsa_switch *ds = p->parent; 627 struct dsa_switch *ds = p->dp->ds;
589 628
590 switch (attr->id) { 629 switch (attr->id) {
591 case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: 630 case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
@@ -633,7 +672,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
633 /* Queue the SKB for transmission on the parent interface, but 672 /* Queue the SKB for transmission on the parent interface, but
634 * do not modify its EtherType 673 * do not modify its EtherType
635 */ 674 */
636 nskb->dev = p->parent->dst->master_netdev; 675 nskb->dev = p->dp->ds->dst->master_netdev;
637 dev_queue_xmit(nskb); 676 dev_queue_xmit(nskb);
638 677
639 return NETDEV_TX_OK; 678 return NETDEV_TX_OK;
@@ -645,14 +684,10 @@ dsa_slave_get_link_ksettings(struct net_device *dev,
645 struct ethtool_link_ksettings *cmd) 684 struct ethtool_link_ksettings *cmd)
646{ 685{
647 struct dsa_slave_priv *p = netdev_priv(dev); 686 struct dsa_slave_priv *p = netdev_priv(dev);
648 int err; 687 int err = -EOPNOTSUPP;
649 688
650 err = -EOPNOTSUPP; 689 if (p->phy != NULL)
651 if (p->phy != NULL) { 690 err = phy_ethtool_ksettings_get(p->phy, cmd);
652 err = phy_read_status(p->phy);
653 if (err == 0)
654 err = phy_ethtool_ksettings_get(p->phy, cmd);
655 }
656 691
657 return err; 692 return err;
658} 693}
@@ -673,7 +708,6 @@ static void dsa_slave_get_drvinfo(struct net_device *dev,
673 struct ethtool_drvinfo *drvinfo) 708 struct ethtool_drvinfo *drvinfo)
674{ 709{
675 strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver)); 710 strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
676 strlcpy(drvinfo->version, dsa_driver_version, sizeof(drvinfo->version));
677 strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version)); 711 strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
678 strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); 712 strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
679} 713}
@@ -681,10 +715,10 @@ static void dsa_slave_get_drvinfo(struct net_device *dev,
681static int dsa_slave_get_regs_len(struct net_device *dev) 715static int dsa_slave_get_regs_len(struct net_device *dev)
682{ 716{
683 struct dsa_slave_priv *p = netdev_priv(dev); 717 struct dsa_slave_priv *p = netdev_priv(dev);
684 struct dsa_switch *ds = p->parent; 718 struct dsa_switch *ds = p->dp->ds;
685 719
686 if (ds->ops->get_regs_len) 720 if (ds->ops->get_regs_len)
687 return ds->ops->get_regs_len(ds, p->port); 721 return ds->ops->get_regs_len(ds, p->dp->index);
688 722
689 return -EOPNOTSUPP; 723 return -EOPNOTSUPP;
690} 724}
@@ -693,10 +727,10 @@ static void
693dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) 727dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
694{ 728{
695 struct dsa_slave_priv *p = netdev_priv(dev); 729 struct dsa_slave_priv *p = netdev_priv(dev);
696 struct dsa_switch *ds = p->parent; 730 struct dsa_switch *ds = p->dp->ds;
697 731
698 if (ds->ops->get_regs) 732 if (ds->ops->get_regs)
699 ds->ops->get_regs(ds, p->port, regs, _p); 733 ds->ops->get_regs(ds, p->dp->index, regs, _p);
700} 734}
701 735
702static int dsa_slave_nway_reset(struct net_device *dev) 736static int dsa_slave_nway_reset(struct net_device *dev)
@@ -724,7 +758,7 @@ static u32 dsa_slave_get_link(struct net_device *dev)
724static int dsa_slave_get_eeprom_len(struct net_device *dev) 758static int dsa_slave_get_eeprom_len(struct net_device *dev)
725{ 759{
726 struct dsa_slave_priv *p = netdev_priv(dev); 760 struct dsa_slave_priv *p = netdev_priv(dev);
727 struct dsa_switch *ds = p->parent; 761 struct dsa_switch *ds = p->dp->ds;
728 762
729 if (ds->cd && ds->cd->eeprom_len) 763 if (ds->cd && ds->cd->eeprom_len)
730 return ds->cd->eeprom_len; 764 return ds->cd->eeprom_len;
@@ -739,7 +773,7 @@ static int dsa_slave_get_eeprom(struct net_device *dev,
739 struct ethtool_eeprom *eeprom, u8 *data) 773 struct ethtool_eeprom *eeprom, u8 *data)
740{ 774{
741 struct dsa_slave_priv *p = netdev_priv(dev); 775 struct dsa_slave_priv *p = netdev_priv(dev);
742 struct dsa_switch *ds = p->parent; 776 struct dsa_switch *ds = p->dp->ds;
743 777
744 if (ds->ops->get_eeprom) 778 if (ds->ops->get_eeprom)
745 return ds->ops->get_eeprom(ds, eeprom, data); 779 return ds->ops->get_eeprom(ds, eeprom, data);
@@ -751,7 +785,7 @@ static int dsa_slave_set_eeprom(struct net_device *dev,
751 struct ethtool_eeprom *eeprom, u8 *data) 785 struct ethtool_eeprom *eeprom, u8 *data)
752{ 786{
753 struct dsa_slave_priv *p = netdev_priv(dev); 787 struct dsa_slave_priv *p = netdev_priv(dev);
754 struct dsa_switch *ds = p->parent; 788 struct dsa_switch *ds = p->dp->ds;
755 789
756 if (ds->ops->set_eeprom) 790 if (ds->ops->set_eeprom)
757 return ds->ops->set_eeprom(ds, eeprom, data); 791 return ds->ops->set_eeprom(ds, eeprom, data);
@@ -763,7 +797,7 @@ static void dsa_slave_get_strings(struct net_device *dev,
763 uint32_t stringset, uint8_t *data) 797 uint32_t stringset, uint8_t *data)
764{ 798{
765 struct dsa_slave_priv *p = netdev_priv(dev); 799 struct dsa_slave_priv *p = netdev_priv(dev);
766 struct dsa_switch *ds = p->parent; 800 struct dsa_switch *ds = p->dp->ds;
767 801
768 if (stringset == ETH_SS_STATS) { 802 if (stringset == ETH_SS_STATS) {
769 int len = ETH_GSTRING_LEN; 803 int len = ETH_GSTRING_LEN;
@@ -773,7 +807,7 @@ static void dsa_slave_get_strings(struct net_device *dev,
773 strncpy(data + 2 * len, "rx_packets", len); 807 strncpy(data + 2 * len, "rx_packets", len);
774 strncpy(data + 3 * len, "rx_bytes", len); 808 strncpy(data + 3 * len, "rx_bytes", len);
775 if (ds->ops->get_strings) 809 if (ds->ops->get_strings)
776 ds->ops->get_strings(ds, p->port, data + 4 * len); 810 ds->ops->get_strings(ds, p->dp->index, data + 4 * len);
777 } 811 }
778} 812}
779 813
@@ -782,7 +816,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
782 uint64_t *data) 816 uint64_t *data)
783{ 817{
784 struct dsa_switch_tree *dst = dev->dsa_ptr; 818 struct dsa_switch_tree *dst = dev->dsa_ptr;
785 struct dsa_switch *ds = dst->ds[0]; 819 struct dsa_switch *ds = dst->cpu_switch;
786 s8 cpu_port = dst->cpu_port; 820 s8 cpu_port = dst->cpu_port;
787 int count = 0; 821 int count = 0;
788 822
@@ -799,7 +833,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
799static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) 833static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
800{ 834{
801 struct dsa_switch_tree *dst = dev->dsa_ptr; 835 struct dsa_switch_tree *dst = dev->dsa_ptr;
802 struct dsa_switch *ds = dst->ds[0]; 836 struct dsa_switch *ds = dst->cpu_switch;
803 int count = 0; 837 int count = 0;
804 838
805 if (dst->master_ethtool_ops.get_sset_count) 839 if (dst->master_ethtool_ops.get_sset_count)
@@ -815,7 +849,7 @@ static void dsa_cpu_port_get_strings(struct net_device *dev,
815 uint32_t stringset, uint8_t *data) 849 uint32_t stringset, uint8_t *data)
816{ 850{
817 struct dsa_switch_tree *dst = dev->dsa_ptr; 851 struct dsa_switch_tree *dst = dev->dsa_ptr;
818 struct dsa_switch *ds = dst->ds[0]; 852 struct dsa_switch *ds = dst->cpu_switch;
819 s8 cpu_port = dst->cpu_port; 853 s8 cpu_port = dst->cpu_port;
820 int len = ETH_GSTRING_LEN; 854 int len = ETH_GSTRING_LEN;
821 int mcount = 0, count; 855 int mcount = 0, count;
@@ -854,20 +888,20 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
854 uint64_t *data) 888 uint64_t *data)
855{ 889{
856 struct dsa_slave_priv *p = netdev_priv(dev); 890 struct dsa_slave_priv *p = netdev_priv(dev);
857 struct dsa_switch *ds = p->parent; 891 struct dsa_switch *ds = p->dp->ds;
858 892
859 data[0] = dev->stats.tx_packets; 893 data[0] = dev->stats.tx_packets;
860 data[1] = dev->stats.tx_bytes; 894 data[1] = dev->stats.tx_bytes;
861 data[2] = dev->stats.rx_packets; 895 data[2] = dev->stats.rx_packets;
862 data[3] = dev->stats.rx_bytes; 896 data[3] = dev->stats.rx_bytes;
863 if (ds->ops->get_ethtool_stats) 897 if (ds->ops->get_ethtool_stats)
864 ds->ops->get_ethtool_stats(ds, p->port, data + 4); 898 ds->ops->get_ethtool_stats(ds, p->dp->index, data + 4);
865} 899}
866 900
867static int dsa_slave_get_sset_count(struct net_device *dev, int sset) 901static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
868{ 902{
869 struct dsa_slave_priv *p = netdev_priv(dev); 903 struct dsa_slave_priv *p = netdev_priv(dev);
870 struct dsa_switch *ds = p->parent; 904 struct dsa_switch *ds = p->dp->ds;
871 905
872 if (sset == ETH_SS_STATS) { 906 if (sset == ETH_SS_STATS) {
873 int count; 907 int count;
@@ -885,20 +919,20 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
885static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) 919static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
886{ 920{
887 struct dsa_slave_priv *p = netdev_priv(dev); 921 struct dsa_slave_priv *p = netdev_priv(dev);
888 struct dsa_switch *ds = p->parent; 922 struct dsa_switch *ds = p->dp->ds;
889 923
890 if (ds->ops->get_wol) 924 if (ds->ops->get_wol)
891 ds->ops->get_wol(ds, p->port, w); 925 ds->ops->get_wol(ds, p->dp->index, w);
892} 926}
893 927
894static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) 928static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
895{ 929{
896 struct dsa_slave_priv *p = netdev_priv(dev); 930 struct dsa_slave_priv *p = netdev_priv(dev);
897 struct dsa_switch *ds = p->parent; 931 struct dsa_switch *ds = p->dp->ds;
898 int ret = -EOPNOTSUPP; 932 int ret = -EOPNOTSUPP;
899 933
900 if (ds->ops->set_wol) 934 if (ds->ops->set_wol)
901 ret = ds->ops->set_wol(ds, p->port, w); 935 ret = ds->ops->set_wol(ds, p->dp->index, w);
902 936
903 return ret; 937 return ret;
904} 938}
@@ -906,13 +940,13 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
906static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) 940static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
907{ 941{
908 struct dsa_slave_priv *p = netdev_priv(dev); 942 struct dsa_slave_priv *p = netdev_priv(dev);
909 struct dsa_switch *ds = p->parent; 943 struct dsa_switch *ds = p->dp->ds;
910 int ret; 944 int ret;
911 945
912 if (!ds->ops->set_eee) 946 if (!ds->ops->set_eee)
913 return -EOPNOTSUPP; 947 return -EOPNOTSUPP;
914 948
915 ret = ds->ops->set_eee(ds, p->port, p->phy, e); 949 ret = ds->ops->set_eee(ds, p->dp->index, p->phy, e);
916 if (ret) 950 if (ret)
917 return ret; 951 return ret;
918 952
@@ -925,13 +959,13 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
925static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) 959static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
926{ 960{
927 struct dsa_slave_priv *p = netdev_priv(dev); 961 struct dsa_slave_priv *p = netdev_priv(dev);
928 struct dsa_switch *ds = p->parent; 962 struct dsa_switch *ds = p->dp->ds;
929 int ret; 963 int ret;
930 964
931 if (!ds->ops->get_eee) 965 if (!ds->ops->get_eee)
932 return -EOPNOTSUPP; 966 return -EOPNOTSUPP;
933 967
934 ret = ds->ops->get_eee(ds, p->port, e); 968 ret = ds->ops->get_eee(ds, p->dp->index, e);
935 if (ret) 969 if (ret)
936 return ret; 970 return ret;
937 971
@@ -946,7 +980,7 @@ static int dsa_slave_netpoll_setup(struct net_device *dev,
946 struct netpoll_info *ni) 980 struct netpoll_info *ni)
947{ 981{
948 struct dsa_slave_priv *p = netdev_priv(dev); 982 struct dsa_slave_priv *p = netdev_priv(dev);
949 struct dsa_switch *ds = p->parent; 983 struct dsa_switch *ds = p->dp->ds;
950 struct net_device *master = ds->dst->master_netdev; 984 struct net_device *master = ds->dst->master_netdev;
951 struct netpoll *netpoll; 985 struct netpoll *netpoll;
952 int err = 0; 986 int err = 0;
@@ -984,6 +1018,144 @@ static void dsa_slave_poll_controller(struct net_device *dev)
984} 1018}
985#endif 1019#endif
986 1020
1021static int dsa_slave_get_phys_port_name(struct net_device *dev,
1022 char *name, size_t len)
1023{
1024 struct dsa_slave_priv *p = netdev_priv(dev);
1025
1026 if (snprintf(name, len, "p%d", p->dp->index) >= len)
1027 return -EINVAL;
1028
1029 return 0;
1030}
1031
1032static struct dsa_mall_tc_entry *
1033dsa_slave_mall_tc_entry_find(struct dsa_slave_priv *p,
1034 unsigned long cookie)
1035{
1036 struct dsa_mall_tc_entry *mall_tc_entry;
1037
1038 list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
1039 if (mall_tc_entry->cookie == cookie)
1040 return mall_tc_entry;
1041
1042 return NULL;
1043}
1044
1045static int dsa_slave_add_cls_matchall(struct net_device *dev,
1046 __be16 protocol,
1047 struct tc_cls_matchall_offload *cls,
1048 bool ingress)
1049{
1050 struct dsa_slave_priv *p = netdev_priv(dev);
1051 struct dsa_mall_tc_entry *mall_tc_entry;
1052 struct dsa_switch *ds = p->dp->ds;
1053 struct net *net = dev_net(dev);
1054 struct dsa_slave_priv *to_p;
1055 struct net_device *to_dev;
1056 const struct tc_action *a;
1057 int err = -EOPNOTSUPP;
1058 LIST_HEAD(actions);
1059 int ifindex;
1060
1061 if (!ds->ops->port_mirror_add)
1062 return err;
1063
1064 if (!tc_single_action(cls->exts))
1065 return err;
1066
1067 tcf_exts_to_list(cls->exts, &actions);
1068 a = list_first_entry(&actions, struct tc_action, list);
1069
1070 if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
1071 struct dsa_mall_mirror_tc_entry *mirror;
1072
1073 ifindex = tcf_mirred_ifindex(a);
1074 to_dev = __dev_get_by_index(net, ifindex);
1075 if (!to_dev)
1076 return -EINVAL;
1077
1078 if (!dsa_slave_dev_check(to_dev))
1079 return -EOPNOTSUPP;
1080
1081 mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
1082 if (!mall_tc_entry)
1083 return -ENOMEM;
1084
1085 mall_tc_entry->cookie = cls->cookie;
1086 mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
1087 mirror = &mall_tc_entry->mirror;
1088
1089 to_p = netdev_priv(to_dev);
1090
1091 mirror->to_local_port = to_p->dp->index;
1092 mirror->ingress = ingress;
1093
1094 err = ds->ops->port_mirror_add(ds, p->dp->index, mirror,
1095 ingress);
1096 if (err) {
1097 kfree(mall_tc_entry);
1098 return err;
1099 }
1100
1101 list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
1102 }
1103
1104 return 0;
1105}
1106
1107static void dsa_slave_del_cls_matchall(struct net_device *dev,
1108 struct tc_cls_matchall_offload *cls)
1109{
1110 struct dsa_slave_priv *p = netdev_priv(dev);
1111 struct dsa_mall_tc_entry *mall_tc_entry;
1112 struct dsa_switch *ds = p->dp->ds;
1113
1114 if (!ds->ops->port_mirror_del)
1115 return;
1116
1117 mall_tc_entry = dsa_slave_mall_tc_entry_find(p, cls->cookie);
1118 if (!mall_tc_entry)
1119 return;
1120
1121 list_del(&mall_tc_entry->list);
1122
1123 switch (mall_tc_entry->type) {
1124 case DSA_PORT_MALL_MIRROR:
1125 ds->ops->port_mirror_del(ds, p->dp->index,
1126 &mall_tc_entry->mirror);
1127 break;
1128 default:
1129 WARN_ON(1);
1130 }
1131
1132 kfree(mall_tc_entry);
1133}
1134
1135static int dsa_slave_setup_tc(struct net_device *dev, u32 handle,
1136 __be16 protocol, struct tc_to_netdev *tc)
1137{
1138 bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS);
1139 int ret = -EOPNOTSUPP;
1140
1141 switch (tc->type) {
1142 case TC_SETUP_MATCHALL:
1143 switch (tc->cls_mall->command) {
1144 case TC_CLSMATCHALL_REPLACE:
1145 return dsa_slave_add_cls_matchall(dev, protocol,
1146 tc->cls_mall,
1147 ingress);
1148 case TC_CLSMATCHALL_DESTROY:
1149 dsa_slave_del_cls_matchall(dev, tc->cls_mall);
1150 return 0;
1151 }
1152 default:
1153 break;
1154 }
1155
1156 return ret;
1157}
1158
987void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops) 1159void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
988{ 1160{
989 ops->get_sset_count = dsa_cpu_port_get_sset_count; 1161 ops->get_sset_count = dsa_cpu_port_get_sset_count;
@@ -991,6 +1163,30 @@ void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
991 ops->get_strings = dsa_cpu_port_get_strings; 1163 ops->get_strings = dsa_cpu_port_get_strings;
992} 1164}
993 1165
1166static int dsa_slave_get_rxnfc(struct net_device *dev,
1167 struct ethtool_rxnfc *nfc, u32 *rule_locs)
1168{
1169 struct dsa_slave_priv *p = netdev_priv(dev);
1170 struct dsa_switch *ds = p->dp->ds;
1171
1172 if (!ds->ops->get_rxnfc)
1173 return -EOPNOTSUPP;
1174
1175 return ds->ops->get_rxnfc(ds, p->dp->index, nfc, rule_locs);
1176}
1177
1178static int dsa_slave_set_rxnfc(struct net_device *dev,
1179 struct ethtool_rxnfc *nfc)
1180{
1181 struct dsa_slave_priv *p = netdev_priv(dev);
1182 struct dsa_switch *ds = p->dp->ds;
1183
1184 if (!ds->ops->set_rxnfc)
1185 return -EOPNOTSUPP;
1186
1187 return ds->ops->set_rxnfc(ds, p->dp->index, nfc);
1188}
1189
994static const struct ethtool_ops dsa_slave_ethtool_ops = { 1190static const struct ethtool_ops dsa_slave_ethtool_ops = {
995 .get_drvinfo = dsa_slave_get_drvinfo, 1191 .get_drvinfo = dsa_slave_get_drvinfo,
996 .get_regs_len = dsa_slave_get_regs_len, 1192 .get_regs_len = dsa_slave_get_regs_len,
@@ -1009,6 +1205,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
1009 .get_eee = dsa_slave_get_eee, 1205 .get_eee = dsa_slave_get_eee,
1010 .get_link_ksettings = dsa_slave_get_link_ksettings, 1206 .get_link_ksettings = dsa_slave_get_link_ksettings,
1011 .set_link_ksettings = dsa_slave_set_link_ksettings, 1207 .set_link_ksettings = dsa_slave_set_link_ksettings,
1208 .get_rxnfc = dsa_slave_get_rxnfc,
1209 .set_rxnfc = dsa_slave_set_rxnfc,
1012}; 1210};
1013 1211
1014static const struct net_device_ops dsa_slave_netdev_ops = { 1212static const struct net_device_ops dsa_slave_netdev_ops = {
@@ -1031,6 +1229,8 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
1031 .ndo_bridge_getlink = switchdev_port_bridge_getlink, 1229 .ndo_bridge_getlink = switchdev_port_bridge_getlink,
1032 .ndo_bridge_setlink = switchdev_port_bridge_setlink, 1230 .ndo_bridge_setlink = switchdev_port_bridge_setlink,
1033 .ndo_bridge_dellink = switchdev_port_bridge_dellink, 1231 .ndo_bridge_dellink = switchdev_port_bridge_dellink,
1232 .ndo_get_phys_port_name = dsa_slave_get_phys_port_name,
1233 .ndo_setup_tc = dsa_slave_setup_tc,
1034}; 1234};
1035 1235
1036static const struct switchdev_ops dsa_slave_switchdev_ops = { 1236static const struct switchdev_ops dsa_slave_switchdev_ops = {
@@ -1048,7 +1248,7 @@ static struct device_type dsa_type = {
1048static void dsa_slave_adjust_link(struct net_device *dev) 1248static void dsa_slave_adjust_link(struct net_device *dev)
1049{ 1249{
1050 struct dsa_slave_priv *p = netdev_priv(dev); 1250 struct dsa_slave_priv *p = netdev_priv(dev);
1051 struct dsa_switch *ds = p->parent; 1251 struct dsa_switch *ds = p->dp->ds;
1052 unsigned int status_changed = 0; 1252 unsigned int status_changed = 0;
1053 1253
1054 if (p->old_link != p->phy->link) { 1254 if (p->old_link != p->phy->link) {
@@ -1067,7 +1267,7 @@ static void dsa_slave_adjust_link(struct net_device *dev)
1067 } 1267 }
1068 1268
1069 if (ds->ops->adjust_link && status_changed) 1269 if (ds->ops->adjust_link && status_changed)
1070 ds->ops->adjust_link(ds, p->port, p->phy); 1270 ds->ops->adjust_link(ds, p->dp->index, p->phy);
1071 1271
1072 if (status_changed) 1272 if (status_changed)
1073 phy_print_status(p->phy); 1273 phy_print_status(p->phy);
@@ -1081,9 +1281,9 @@ static int dsa_slave_fixed_link_update(struct net_device *dev,
1081 1281
1082 if (dev) { 1282 if (dev) {
1083 p = netdev_priv(dev); 1283 p = netdev_priv(dev);
1084 ds = p->parent; 1284 ds = p->dp->ds;
1085 if (ds->ops->fixed_link_update) 1285 if (ds->ops->fixed_link_update)
1086 ds->ops->fixed_link_update(ds, p->port, status); 1286 ds->ops->fixed_link_update(ds, p->dp->index, status);
1087 } 1287 }
1088 1288
1089 return 0; 1289 return 0;
@@ -1094,7 +1294,7 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
1094 struct net_device *slave_dev, 1294 struct net_device *slave_dev,
1095 int addr) 1295 int addr)
1096{ 1296{
1097 struct dsa_switch *ds = p->parent; 1297 struct dsa_switch *ds = p->dp->ds;
1098 1298
1099 p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr); 1299 p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr);
1100 if (!p->phy) { 1300 if (!p->phy) {
@@ -1112,13 +1312,13 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
1112static int dsa_slave_phy_setup(struct dsa_slave_priv *p, 1312static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
1113 struct net_device *slave_dev) 1313 struct net_device *slave_dev)
1114{ 1314{
1115 struct dsa_switch *ds = p->parent; 1315 struct dsa_switch *ds = p->dp->ds;
1116 struct device_node *phy_dn, *port_dn; 1316 struct device_node *phy_dn, *port_dn;
1117 bool phy_is_fixed = false; 1317 bool phy_is_fixed = false;
1118 u32 phy_flags = 0; 1318 u32 phy_flags = 0;
1119 int mode, ret; 1319 int mode, ret;
1120 1320
1121 port_dn = ds->ports[p->port].dn; 1321 port_dn = p->dp->dn;
1122 mode = of_get_phy_mode(port_dn); 1322 mode = of_get_phy_mode(port_dn);
1123 if (mode < 0) 1323 if (mode < 0)
1124 mode = PHY_INTERFACE_MODE_NA; 1324 mode = PHY_INTERFACE_MODE_NA;
@@ -1139,7 +1339,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
1139 } 1339 }
1140 1340
1141 if (ds->ops->get_phy_flags) 1341 if (ds->ops->get_phy_flags)
1142 phy_flags = ds->ops->get_phy_flags(ds, p->port); 1342 phy_flags = ds->ops->get_phy_flags(ds, p->dp->index);
1143 1343
1144 if (phy_dn) { 1344 if (phy_dn) {
1145 int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); 1345 int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn);
@@ -1174,9 +1374,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
1174 * MDIO bus instead 1374 * MDIO bus instead
1175 */ 1375 */
1176 if (!p->phy) { 1376 if (!p->phy) {
1177 ret = dsa_slave_phy_connect(p, slave_dev, p->port); 1377 ret = dsa_slave_phy_connect(p, slave_dev, p->dp->index);
1178 if (ret) { 1378 if (ret) {
1179 netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret); 1379 netdev_err(slave_dev, "failed to connect to port %d: %d\n",
1380 p->dp->index, ret);
1180 if (phy_is_fixed) 1381 if (phy_is_fixed)
1181 of_phy_deregister_fixed_link(port_dn); 1382 of_phy_deregister_fixed_link(port_dn);
1182 return ret; 1383 return ret;
@@ -1246,7 +1447,8 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
1246 if (slave_dev == NULL) 1447 if (slave_dev == NULL)
1247 return -ENOMEM; 1448 return -ENOMEM;
1248 1449
1249 slave_dev->features = master->vlan_features; 1450 slave_dev->features = master->vlan_features | NETIF_F_HW_TC;
1451 slave_dev->hw_features |= NETIF_F_HW_TC;
1250 slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; 1452 slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
1251 eth_hw_addr_inherit(slave_dev, master); 1453 eth_hw_addr_inherit(slave_dev, master);
1252 slave_dev->priv_flags |= IFF_NO_QUEUE; 1454 slave_dev->priv_flags |= IFF_NO_QUEUE;
@@ -1264,8 +1466,8 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
1264 slave_dev->vlan_features = master->vlan_features; 1466 slave_dev->vlan_features = master->vlan_features;
1265 1467
1266 p = netdev_priv(slave_dev); 1468 p = netdev_priv(slave_dev);
1267 p->parent = ds; 1469 p->dp = &ds->ports[port];
1268 p->port = port; 1470 INIT_LIST_HEAD(&p->mall_tc_list);
1269 p->xmit = dst->tag_ops->xmit; 1471 p->xmit = dst->tag_ops->xmit;
1270 1472
1271 p->old_pause = -1; 1473 p->old_pause = -1;
@@ -1298,10 +1500,9 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
1298void dsa_slave_destroy(struct net_device *slave_dev) 1500void dsa_slave_destroy(struct net_device *slave_dev)
1299{ 1501{
1300 struct dsa_slave_priv *p = netdev_priv(slave_dev); 1502 struct dsa_slave_priv *p = netdev_priv(slave_dev);
1301 struct dsa_switch *ds = p->parent;
1302 struct device_node *port_dn; 1503 struct device_node *port_dn;
1303 1504
1304 port_dn = ds->ports[p->port].dn; 1505 port_dn = p->dp->dn;
1305 1506
1306 netif_carrier_off(slave_dev); 1507 netif_carrier_off(slave_dev);
1307 if (p->phy) { 1508 if (p->phy) {
@@ -1319,46 +1520,52 @@ static bool dsa_slave_dev_check(struct net_device *dev)
1319 return dev->netdev_ops == &dsa_slave_netdev_ops; 1520 return dev->netdev_ops == &dsa_slave_netdev_ops;
1320} 1521}
1321 1522
1322static int dsa_slave_port_upper_event(struct net_device *dev, 1523static int dsa_slave_changeupper(struct net_device *dev,
1323 unsigned long event, void *ptr) 1524 struct netdev_notifier_changeupper_info *info)
1324{ 1525{
1325 struct netdev_notifier_changeupper_info *info = ptr; 1526 int err = NOTIFY_DONE;
1326 struct net_device *upper = info->upper_dev;
1327 int err = 0;
1328 1527
1329 switch (event) { 1528 if (netif_is_bridge_master(info->upper_dev)) {
1330 case NETDEV_CHANGEUPPER: 1529 if (info->linking) {
1331 if (netif_is_bridge_master(upper)) { 1530 err = dsa_slave_bridge_port_join(dev, info->upper_dev);
1332 if (info->linking) 1531 err = notifier_from_errno(err);
1333 err = dsa_slave_bridge_port_join(dev, upper); 1532 } else {
1334 else 1533 dsa_slave_bridge_port_leave(dev, info->upper_dev);
1335 dsa_slave_bridge_port_leave(dev); 1534 err = NOTIFY_OK;
1336 } 1535 }
1337
1338 break;
1339 } 1536 }
1340 1537
1341 return notifier_from_errno(err); 1538 return err;
1342} 1539}
1343 1540
1344static int dsa_slave_port_event(struct net_device *dev, unsigned long event, 1541static int dsa_slave_netdevice_event(struct notifier_block *nb,
1345 void *ptr) 1542 unsigned long event, void *ptr)
1346{ 1543{
1347 switch (event) { 1544 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1348 case NETDEV_CHANGEUPPER: 1545
1349 return dsa_slave_port_upper_event(dev, event, ptr); 1546 if (dev->netdev_ops != &dsa_slave_netdev_ops)
1350 } 1547 return NOTIFY_DONE;
1548
1549 if (event == NETDEV_CHANGEUPPER)
1550 return dsa_slave_changeupper(dev, ptr);
1351 1551
1352 return NOTIFY_DONE; 1552 return NOTIFY_DONE;
1353} 1553}
1354 1554
1355int dsa_slave_netdevice_event(struct notifier_block *unused, 1555static struct notifier_block dsa_slave_nb __read_mostly = {
1356 unsigned long event, void *ptr) 1556 .notifier_call = dsa_slave_netdevice_event,
1557};
1558
1559int dsa_slave_register_notifier(void)
1357{ 1560{
1358 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 1561 return register_netdevice_notifier(&dsa_slave_nb);
1562}
1359 1563
1360 if (dsa_slave_dev_check(dev)) 1564void dsa_slave_unregister_notifier(void)
1361 return dsa_slave_port_event(dev, event, ptr); 1565{
1566 int err;
1362 1567
1363 return NOTIFY_DONE; 1568 err = unregister_netdevice_notifier(&dsa_slave_nb);
1569 if (err)
1570 pr_err("DSA: failed to unregister slave notifier (%d)\n", err);
1364} 1571}
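
For reference, the new .ndo_get_phys_port_name callback added in the slave.c hunks above names each front-panel port after its port index ("p0", "p1", ...) and rejects truncation by checking the snprintf() return value. A minimal user-space sketch of that same check (the port index and buffer sizes below are made-up illustration values):

#include <stdio.h>

/* Mirrors the truncation check in dsa_slave_get_phys_port_name():
 * snprintf() returns the number of characters it would have written,
 * so a return value >= len means the buffer was too small.
 */
static int format_port_name(char *name, size_t len, int port_index)
{
	if (snprintf(name, len, "p%d", port_index) >= (int)len)
		return -1;	/* would have been truncated */
	return 0;
}

int main(void)
{
	char name[8];

	if (format_port_name(name, sizeof(name), 42) == 0)
		printf("port name: %s\n", name);	/* prints "p42" */

	/* A two-byte buffer cannot hold "p42" plus the terminator. */
	if (format_port_name(name, 2, 42) != 0)
		printf("buffer too small for p42\n");

	return 0;
}
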
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
new file mode 100644
index 000000000000..6456dacf9ae9
--- /dev/null
+++ b/net/dsa/switch.c
@@ -0,0 +1,85 @@
1/*
2 * Handling of a single switch chip, part of a switch fabric
3 *
4 * Copyright (c) 2017 Vivien Didelot <vivien.didelot@savoirfairelinux.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 */
11
12#include <linux/netdevice.h>
13#include <linux/notifier.h>
14#include <net/dsa.h>
15
16static int dsa_switch_bridge_join(struct dsa_switch *ds,
17 struct dsa_notifier_bridge_info *info)
18{
19 if (ds->index == info->sw_index && ds->ops->port_bridge_join)
20 return ds->ops->port_bridge_join(ds, info->port, info->br);
21
22 if (ds->index != info->sw_index)
23 dev_dbg(ds->dev, "crosschip DSA port %d.%d bridged to %s\n",
24 info->sw_index, info->port, netdev_name(info->br));
25
26 return 0;
27}
28
29static int dsa_switch_bridge_leave(struct dsa_switch *ds,
30 struct dsa_notifier_bridge_info *info)
31{
32 if (ds->index == info->sw_index && ds->ops->port_bridge_leave)
33 ds->ops->port_bridge_leave(ds, info->port, info->br);
34
35 if (ds->index != info->sw_index)
36 dev_dbg(ds->dev, "crosschip DSA port %d.%d unbridged from %s\n",
37 info->sw_index, info->port, netdev_name(info->br));
38
39 return 0;
40}
41
42static int dsa_switch_event(struct notifier_block *nb,
43 unsigned long event, void *info)
44{
45 struct dsa_switch *ds = container_of(nb, struct dsa_switch, nb);
46 int err;
47
48 switch (event) {
49 case DSA_NOTIFIER_BRIDGE_JOIN:
50 err = dsa_switch_bridge_join(ds, info);
51 break;
52 case DSA_NOTIFIER_BRIDGE_LEAVE:
53 err = dsa_switch_bridge_leave(ds, info);
54 break;
55 default:
56 err = -EOPNOTSUPP;
57 break;
58 }
59
60 /* Non-switchdev operations cannot be rolled back. If a DSA driver
61 * returns an error during the chained call, switch chips may be in an
62 * inconsistent state.
63 */
64 if (err)
65 dev_dbg(ds->dev, "breaking chain for DSA event %lu (%d)\n",
66 event, err);
67
68 return notifier_from_errno(err);
69}
70
71int dsa_switch_register_notifier(struct dsa_switch *ds)
72{
73 ds->nb.notifier_call = dsa_switch_event;
74
75 return raw_notifier_chain_register(&ds->dst->nh, &ds->nb);
76}
77
78void dsa_switch_unregister_notifier(struct dsa_switch *ds)
79{
80 int err;
81
82 err = raw_notifier_chain_unregister(&ds->dst->nh, &ds->nb);
83 if (err)
84 dev_err(ds->dev, "failed to unregister notifier (%d)\n", err);
85}
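
The new switch.c file above moves cross-chip operations onto a per-tree notifier chain: the slave device emits DSA_NOTIFIER_BRIDGE_JOIN/LEAVE, every switch in the fabric receives it, and the first handler that fails stops the chain so the caller can roll back (as dsa_slave_bridge_port_join() does). The following is only a simplified user-space model of that idea, not the kernel's raw_notifier_chain_register()/call_chain() API:

#include <stdio.h>

#define MAX_HANDLERS 8

typedef int (*event_handler)(unsigned long event, void *info);

static event_handler chain[MAX_HANDLERS];
static int nhandlers;

static void chain_register(event_handler fn)
{
	if (nhandlers < MAX_HANDLERS)
		chain[nhandlers++] = fn;
}

static int chain_call(unsigned long event, void *info)
{
	int i, err;

	for (i = 0; i < nhandlers; i++) {
		err = chain[i](event, info);
		if (err)	/* stop the chain, propagate the error */
			return err;
	}
	return 0;
}

static int switch0_handler(unsigned long event, void *info)
{
	printf("switch 0: handled event %lu\n", event);
	return 0;
}

static int switch1_handler(unsigned long event, void *info)
{
	printf("switch 1: event %lu refused\n", event);
	return -1;
}

int main(void)
{
	chain_register(switch0_handler);
	chain_register(switch1_handler);

	if (chain_call(1 /* e.g. a bridge-join event */, NULL))
		printf("a handler failed, caller rolls back\n");

	return 0;
}
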
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 21bffde6e4bf..5d925b6b2bb1 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -80,9 +80,9 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev
80 ((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK); 80 ((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK);
81 brcm_tag[1] = 0; 81 brcm_tag[1] = 0;
82 brcm_tag[2] = 0; 82 brcm_tag[2] = 0;
83 if (p->port == 8) 83 if (p->dp->index == 8)
84 brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; 84 brcm_tag[2] = BRCM_IG_DSTMAP2_MASK;
85 brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK; 85 brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK;
86 86
87 return skb; 87 return skb;
88 88
@@ -102,7 +102,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
102 if (unlikely(dst == NULL)) 102 if (unlikely(dst == NULL))
103 goto out_drop; 103 goto out_drop;
104 104
105 ds = dst->ds[0]; 105 ds = dst->cpu_switch;
106 106
107 skb = skb_unshare(skb, GFP_ATOMIC); 107 skb = skb_unshare(skb, GFP_ATOMIC);
108 if (skb == NULL) 108 if (skb == NULL)
@@ -121,13 +121,14 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
121 /* We should never see a reserved reason code without knowing how to 121 /* We should never see a reserved reason code without knowing how to
122 * handle it 122 * handle it
123 */ 123 */
124 WARN_ON(brcm_tag[2] & BRCM_EG_RC_RSVD); 124 if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD))
125 goto out_drop;
125 126
126 /* Locate which port this is coming from */ 127 /* Locate which port this is coming from */
127 source_port = brcm_tag[3] & BRCM_EG_PID_MASK; 128 source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
128 129
129 /* Validate port against switch setup, either the port is totally */ 130 /* Validate port against switch setup, either the port is totally */
130 if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) 131 if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
131 goto out_drop; 132 goto out_drop;
132 133
133 /* Remove Broadcom tag and update checksum */ 134 /* Remove Broadcom tag and update checksum */
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index bce79ffe342b..72579ceea381 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -33,8 +33,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
33 * Construct tagged FROM_CPU DSA tag from 802.1q tag. 33 * Construct tagged FROM_CPU DSA tag from 802.1q tag.
34 */ 34 */
35 dsa_header = skb->data + 2 * ETH_ALEN; 35 dsa_header = skb->data + 2 * ETH_ALEN;
36 dsa_header[0] = 0x60 | p->parent->index; 36 dsa_header[0] = 0x60 | p->dp->ds->index;
37 dsa_header[1] = p->port << 3; 37 dsa_header[1] = p->dp->index << 3;
38 38
39 /* 39 /*
40 * Move CFI field from byte 2 to byte 1. 40 * Move CFI field from byte 2 to byte 1.
@@ -54,8 +54,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
54 * Construct untagged FROM_CPU DSA tag. 54 * Construct untagged FROM_CPU DSA tag.
55 */ 55 */
56 dsa_header = skb->data + 2 * ETH_ALEN; 56 dsa_header = skb->data + 2 * ETH_ALEN;
57 dsa_header[0] = 0x40 | p->parent->index; 57 dsa_header[0] = 0x40 | p->dp->ds->index;
58 dsa_header[1] = p->port << 3; 58 dsa_header[1] = p->dp->index << 3;
59 dsa_header[2] = 0x00; 59 dsa_header[2] = 0x00;
60 dsa_header[3] = 0x00; 60 dsa_header[3] = 0x00;
61 } 61 }
@@ -114,7 +114,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
114 if (!ds) 114 if (!ds)
115 goto out_drop; 115 goto out_drop;
116 116
117 if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) 117 if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
118 goto out_drop; 118 goto out_drop;
119 119
120 /* 120 /*
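
The (E)DSA tag bytes built in dsa_xmit() and edsa_xmit() above pack the originating switch index into the low bits of the first header byte (0x60 for a tagged FROM_CPU frame, 0x40 for untagged) and the destination port into bits 7:3 of the second byte. A small user-space demo with made-up switch and port numbers:

#include <stdio.h>

int main(void)
{
	unsigned int sw_index = 1;	/* example switch index in the tree */
	unsigned int port = 5;		/* example destination port */
	unsigned char hdr[2];

	hdr[0] = 0x40 | sw_index;	/* untagged FROM_CPU, switch 1 */
	hdr[1] = port << 3;		/* port 5 -> 0x28 */

	printf("DSA header: %02x %02x\n", hdr[0], hdr[1]);
	return 0;
}
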
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 6c1720e88537..648c051817a1 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -42,8 +42,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
42 edsa_header[1] = ETH_P_EDSA & 0xff; 42 edsa_header[1] = ETH_P_EDSA & 0xff;
43 edsa_header[2] = 0x00; 43 edsa_header[2] = 0x00;
44 edsa_header[3] = 0x00; 44 edsa_header[3] = 0x00;
45 edsa_header[4] = 0x60 | p->parent->index; 45 edsa_header[4] = 0x60 | p->dp->ds->index;
46 edsa_header[5] = p->port << 3; 46 edsa_header[5] = p->dp->index << 3;
47 47
48 /* 48 /*
49 * Move CFI field from byte 6 to byte 5. 49 * Move CFI field from byte 6 to byte 5.
@@ -67,8 +67,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
67 edsa_header[1] = ETH_P_EDSA & 0xff; 67 edsa_header[1] = ETH_P_EDSA & 0xff;
68 edsa_header[2] = 0x00; 68 edsa_header[2] = 0x00;
69 edsa_header[3] = 0x00; 69 edsa_header[3] = 0x00;
70 edsa_header[4] = 0x40 | p->parent->index; 70 edsa_header[4] = 0x40 | p->dp->ds->index;
71 edsa_header[5] = p->port << 3; 71 edsa_header[5] = p->dp->index << 3;
72 edsa_header[6] = 0x00; 72 edsa_header[6] = 0x00;
73 edsa_header[7] = 0x00; 73 edsa_header[7] = 0x00;
74 } 74 }
@@ -127,7 +127,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
127 if (!ds) 127 if (!ds)
128 goto out_drop; 128 goto out_drop;
129 129
130 if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) 130 if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
131 goto out_drop; 131 goto out_drop;
132 132
133 /* 133 /*
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 0c90cacee7aa..30240f343aea 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -54,7 +54,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
54 /* Set the version field, and set destination port information */ 54 /* Set the version field, and set destination port information */
55 hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | 55 hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
56 QCA_HDR_XMIT_FROM_CPU | 56 QCA_HDR_XMIT_FROM_CPU |
57 BIT(p->port); 57 BIT(p->dp->index);
58 58
59 *phdr = htons(hdr); 59 *phdr = htons(hdr);
60 60
@@ -104,7 +104,7 @@ static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
104 /* This protocol doesn't support cascading multiple switches so it's 104 /* This protocol doesn't support cascading multiple switches so it's
105 * safe to assume the switch is first in the tree 105 * safe to assume the switch is first in the tree
106 */ 106 */
107 ds = dst->ds[0]; 107 ds = dst->cpu_switch;
108 if (!ds) 108 if (!ds)
109 goto out_drop; 109 goto out_drop;
110 110
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 5e3903eb1afa..26f977176978 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -50,7 +50,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
50 50
51 trailer = skb_put(nskb, 4); 51 trailer = skb_put(nskb, 4);
52 trailer[0] = 0x80; 52 trailer[0] = 0x80;
53 trailer[1] = 1 << p->port; 53 trailer[1] = 1 << p->dp->index;
54 trailer[2] = 0x10; 54 trailer[2] = 0x10;
55 trailer[3] = 0x00; 55 trailer[3] = 0x00;
56 56
@@ -67,7 +67,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
67 67
68 if (unlikely(dst == NULL)) 68 if (unlikely(dst == NULL))
69 goto out_drop; 69 goto out_drop;
70 ds = dst->ds[0]; 70 ds = dst->cpu_switch;
71 71
72 skb = skb_unshare(skb, GFP_ATOMIC); 72 skb = skb_unshare(skb, GFP_ATOMIC);
73 if (skb == NULL) 73 if (skb == NULL)
@@ -82,7 +82,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
82 goto out_drop; 82 goto out_drop;
83 83
84 source_port = trailer[1] & 7; 84 source_port = trailer[1] & 7;
85 if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) 85 if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
86 goto out_drop; 86 goto out_drop;
87 87
88 pskb_trim_rcsum(skb, skb->len - 4); 88 pskb_trim_rcsum(skb, skb->len - 4);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 8c5a479681ca..1446810047f5 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -356,6 +356,7 @@ void ether_setup(struct net_device *dev)
356 dev->header_ops = &eth_header_ops; 356 dev->header_ops = &eth_header_ops;
357 dev->type = ARPHRD_ETHER; 357 dev->type = ARPHRD_ETHER;
358 dev->hard_header_len = ETH_HLEN; 358 dev->hard_header_len = ETH_HLEN;
359 dev->min_header_len = ETH_HLEN;
359 dev->mtu = ETH_DATA_LEN; 360 dev->mtu = ETH_DATA_LEN;
360 dev->min_mtu = ETH_MIN_MTU; 361 dev->min_mtu = ETH_MIN_MTU;
361 dev->max_mtu = ETH_DATA_LEN; 362 dev->max_mtu = ETH_DATA_LEN;
@@ -392,6 +393,34 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
392} 393}
393EXPORT_SYMBOL(alloc_etherdev_mqs); 394EXPORT_SYMBOL(alloc_etherdev_mqs);
394 395
396static void devm_free_netdev(struct device *dev, void *res)
397{
398 free_netdev(*(struct net_device **)res);
399}
400
401struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
402 unsigned int txqs, unsigned int rxqs)
403{
404 struct net_device **dr;
405 struct net_device *netdev;
406
407 dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL);
408 if (!dr)
409 return NULL;
410
411 netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs);
412 if (!netdev) {
413 devres_free(dr);
414 return NULL;
415 }
416
417 *dr = netdev;
418 devres_add(dev, dr);
419
420 return netdev;
421}
422EXPORT_SYMBOL(devm_alloc_etherdev_mqs);
423
395ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) 424ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
396{ 425{
397 return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr); 426 return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr);
@@ -446,7 +475,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
446out_unlock: 475out_unlock:
447 rcu_read_unlock(); 476 rcu_read_unlock();
448out: 477out:
449 NAPI_GRO_CB(skb)->flush |= flush; 478 skb_gro_flush_final(skb, pp, flush);
450 479
451 return pp; 480 return pp;
452} 481}
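
devm_alloc_etherdev_mqs(), added to eth.c above, ties the net_device's lifetime to the owning struct device via devres, so the driver no longer needs free_netdev() on its error and teardown paths. A hypothetical probe() sketch, assuming the declaration lands in <linux/etherdevice.h> (the header change is outside this net/-only diff) and using placeholder names such as foo_priv:

#include <linux/etherdevice.h>
#include <linux/platform_device.h>

struct foo_priv {
	struct net_device *ndev;
};

static int foo_probe(struct platform_device *pdev)
{
	struct net_device *ndev;

	/* One TX and one RX queue; freed automatically on driver unbind. */
	ndev = devm_alloc_etherdev_mqs(&pdev->dev, sizeof(struct foo_priv), 1, 1);
	if (!ndev)
		return -ENOMEM;

	/* ... set netdev_ops, MAC address, etc. ... */

	/* Registration is not device-managed: the driver's remove() must
	 * still call unregister_netdev().
	 */
	return register_netdev(ndev);
}
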
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index fc65b145f6e7..c73160fb11e7 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -395,7 +395,7 @@ static struct device_type hsr_type = {
395 395
396void hsr_dev_setup(struct net_device *dev) 396void hsr_dev_setup(struct net_device *dev)
397{ 397{
398 random_ether_addr(dev->dev_addr); 398 eth_hw_addr_random(dev);
399 399
400 ether_setup(dev); 400 ether_setup(dev);
401 dev->min_mtu = 0; 401 dev->min_mtu = 0;
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index f5b60388d02f..56080da4aa77 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -12,6 +12,7 @@
12#include "hsr_slave.h" 12#include "hsr_slave.h"
13#include <linux/etherdevice.h> 13#include <linux/etherdevice.h>
14#include <linux/if_arp.h> 14#include <linux/if_arp.h>
15#include <linux/if_vlan.h>
15#include "hsr_main.h" 16#include "hsr_main.h"
16#include "hsr_device.h" 17#include "hsr_device.h"
17#include "hsr_forward.h" 18#include "hsr_forward.h"
@@ -81,7 +82,7 @@ static int hsr_check_dev_ok(struct net_device *dev)
81 return -EINVAL; 82 return -EINVAL;
82 } 83 }
83 84
84 if (dev->priv_flags & IFF_802_1Q_VLAN) { 85 if (is_vlan_dev(dev)) {
85 netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n"); 86 netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n");
86 return -EINVAL; 87 return -EINVAL;
87 } 88 }
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index e0bd013a1e5e..eedba7670b51 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -279,7 +279,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
279 pr_debug("name = %s, mtu = %u\n", dev->name, mtu); 279 pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
280 280
281 if (size > mtu) { 281 if (size > mtu) {
282 pr_debug("size = %Zu, mtu = %u\n", size, mtu); 282 pr_debug("size = %zu, mtu = %u\n", size, mtu);
283 err = -EMSGSIZE; 283 err = -EMSGSIZE;
284 goto out_dev; 284 goto out_dev;
285 } 285 }
@@ -645,7 +645,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
645 pr_debug("name = %s, mtu = %u\n", dev->name, mtu); 645 pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
646 646
647 if (size > mtu) { 647 if (size > mtu) {
648 pr_debug("size = %Zu, mtu = %u\n", size, mtu); 648 pr_debug("size = %zu, mtu = %u\n", size, mtu);
649 err = -EMSGSIZE; 649 err = -EMSGSIZE;
650 goto out_dev; 650 goto out_dev;
651 } 651 }
diff --git a/net/ife/Kconfig b/net/ife/Kconfig
new file mode 100644
index 000000000000..31e48b652c7c
--- /dev/null
+++ b/net/ife/Kconfig
@@ -0,0 +1,16 @@
1#
2# IFE subsystem configuration
3#
4
5menuconfig NET_IFE
6 depends on NET
7 tristate "Inter-FE based on IETF ForCES InterFE LFB"
8 default n
9 help
10 Say Y here to add support of IFE encapsulation protocol
11 For details refer to netdev01 paper:
12 "Distributing Linux Traffic Control Classifier-Action Subsystem"
13 Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
14
15 To compile this support as a module, choose M here: the module will
16 be called ife.
diff --git a/net/ife/Makefile b/net/ife/Makefile
new file mode 100644
index 000000000000..2a90d97746cc
--- /dev/null
+++ b/net/ife/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the IFE encapsulation protocol
3#
4
5obj-$(CONFIG_NET_IFE) += ife.o
diff --git a/net/ife/ife.c b/net/ife/ife.c
new file mode 100644
index 000000000000..f360341c72eb
--- /dev/null
+++ b/net/ife/ife.c
@@ -0,0 +1,142 @@
1/*
2 * net/ife/ife.c - Inter-FE protocol based on ForCES WG InterFE LFB
3 * Copyright (c) 2015 Jamal Hadi Salim <jhs@mojatatu.com>
4 * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
5 *
6 * Refer to: draft-ietf-forces-interfelfb-03 and netdev01 paper:
7 * "Distributing Linux Traffic Control Classifier-Action Subsystem"
8 * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation.
13 */
14
15#include <linux/types.h>
16#include <linux/kernel.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/skbuff.h>
20#include <linux/rtnetlink.h>
21#include <linux/module.h>
22#include <linux/init.h>
23#include <net/net_namespace.h>
24#include <net/netlink.h>
25#include <net/pkt_sched.h>
26#include <linux/etherdevice.h>
27#include <net/ife.h>
28
29struct ifeheadr {
30 __be16 metalen;
31 u8 tlv_data[];
32};
33
34void *ife_encode(struct sk_buff *skb, u16 metalen)
35{
36 /* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
37 * where ORIGDATA = original ethernet header ...
38 */
39 int hdrm = metalen + IFE_METAHDRLEN;
40 int total_push = hdrm + skb->dev->hard_header_len;
41 struct ifeheadr *ifehdr;
42 struct ethhdr *iethh; /* inner ether header */
43 int skboff = 0;
44 int err;
45
46 err = skb_cow_head(skb, total_push);
47 if (unlikely(err))
48 return NULL;
49
50 iethh = (struct ethhdr *) skb->data;
51
52 __skb_push(skb, total_push);
53 memcpy(skb->data, iethh, skb->dev->hard_header_len);
54 skb_reset_mac_header(skb);
55 skboff += skb->dev->hard_header_len;
56
57 /* total metadata length */
58 ifehdr = (struct ifeheadr *) (skb->data + skboff);
59 metalen += IFE_METAHDRLEN;
60 ifehdr->metalen = htons(metalen);
61
62 return ifehdr->tlv_data;
63}
64EXPORT_SYMBOL_GPL(ife_encode);
65
66void *ife_decode(struct sk_buff *skb, u16 *metalen)
67{
68 struct ifeheadr *ifehdr;
69 int total_pull;
70 u16 ifehdrln;
71
72 ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
73 ifehdrln = ntohs(ifehdr->metalen);
74 total_pull = skb->dev->hard_header_len + ifehdrln;
75
76 if (unlikely(ifehdrln < 2))
77 return NULL;
78
79 if (unlikely(!pskb_may_pull(skb, total_pull)))
80 return NULL;
81
82 skb_set_mac_header(skb, total_pull);
83 __skb_pull(skb, total_pull);
84 *metalen = ifehdrln - IFE_METAHDRLEN;
85
86 return &ifehdr->tlv_data;
87}
88EXPORT_SYMBOL_GPL(ife_decode);
89
90struct meta_tlvhdr {
91 __be16 type;
92 __be16 len;
93};
94
95/* Caller takes care of presenting data in network order
96 */
97void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen)
98{
99 struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
100
101 *dlen = ntohs(tlv->len) - NLA_HDRLEN;
102 *attrtype = ntohs(tlv->type);
103
104 if (totlen)
105 *totlen = nla_total_size(*dlen);
106
107 return skbdata + sizeof(struct meta_tlvhdr);
108}
109EXPORT_SYMBOL_GPL(ife_tlv_meta_decode);
110
111void *ife_tlv_meta_next(void *skbdata)
112{
113 struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
114 u16 tlvlen = ntohs(tlv->len);
115
116 tlvlen = NLA_ALIGN(tlvlen);
117
118 return skbdata + tlvlen;
119}
120EXPORT_SYMBOL_GPL(ife_tlv_meta_next);
121
122/* Caller takes care of presenting data in network order
123 */
124int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
125{
126 __be32 *tlv = (__be32 *) (skbdata);
127 u16 totlen = nla_total_size(dlen); /*alignment + hdr */
128 char *dptr = (char *) tlv + NLA_HDRLEN;
129 u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
130
131 *tlv = htonl(htlv);
132 memset(dptr, 0, totlen - NLA_HDRLEN);
133 memcpy(dptr, dval, dlen);
134
135 return totlen;
136}
137EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
138
139MODULE_AUTHOR("Jamal Hadi Salim <jhs@mojatatu.com>");
140MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
141MODULE_DESCRIPTION("Inter-FE LFB action");
142MODULE_LICENSE("GPL");
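
ife_tlv_meta_encode() above lays metadata out as netlink-style attributes: a 4-byte header carrying type and length, followed by the payload padded to a 4-byte boundary. A user-space demo of that length math, with the alignment constants redefined locally to mirror the kernel's netlink attribute layout and arbitrary example type/payload values:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define NLA_ALIGNTO		4
#define NLA_ALIGN(len)		(((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1))
#define NLA_HDRLEN		4
#define nla_total_size(payload)	NLA_ALIGN(NLA_HDRLEN + (payload))

int main(void)
{
	uint16_t attrtype = 2;		/* example metadata id */
	uint16_t dlen = 6;		/* example payload length */
	uint32_t hdr_host = (uint32_t)attrtype << 16 | (dlen + NLA_HDRLEN);
	uint32_t hdr_wire = htonl(hdr_host);

	printf("TLV header (host order): 0x%08x\n", hdr_host);
	printf("TLV header (wire order): 0x%08x\n", hdr_wire);
	/* 4-byte header + 6 bytes of data, padded up to 12 bytes total */
	printf("total size on the wire:  %u bytes\n", nla_total_size(dlen));
	return 0;
}
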
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6e7baaf814c6..91a2557942fa 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -187,6 +187,7 @@ config NET_IPGRE_DEMUX
187config NET_IP_TUNNEL 187config NET_IP_TUNNEL
188 tristate 188 tristate
189 select DST_CACHE 189 select DST_CACHE
190 select GRO_CELLS
190 default n 191 default n
191 192
192config NET_IPGRE 193config NET_IPGRE
@@ -360,6 +361,19 @@ config INET_ESP
360 361
361 If unsure, say Y. 362 If unsure, say Y.
362 363
364config INET_ESP_OFFLOAD
365 tristate "IP: ESP transformation offload"
366 depends on INET_ESP
367 select XFRM_OFFLOAD
368 default n
369 ---help---
370 Support for ESP transformation offload. This makes sense
371 only if this system really does IPsec and want to do it
372 with high throughput. A typical desktop system does not
373 need it, even if it does IPsec.
374
375 If unsure, say N.
376
363config INET_IPCOMP 377config INET_IPCOMP
364 tristate "IP: IPComp transformation" 378 tristate "IP: IPComp transformation"
365 select INET_XFRM_TUNNEL 379 select INET_XFRM_TUNNEL
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 48af58a5686e..c6d4238ff94a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_NET_IPVTI) += ip_vti.o
29obj-$(CONFIG_SYN_COOKIES) += syncookies.o 29obj-$(CONFIG_SYN_COOKIES) += syncookies.o
30obj-$(CONFIG_INET_AH) += ah4.o 30obj-$(CONFIG_INET_AH) += ah4.o
31obj-$(CONFIG_INET_ESP) += esp4.o 31obj-$(CONFIG_INET_ESP) += esp4.o
32obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o
32obj-$(CONFIG_INET_IPCOMP) += ipcomp.o 33obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
33obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o 34obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
34obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o 35obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f75069883f2b..6b1fc6e4278e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -479,7 +479,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
479 479
480 snum = ntohs(addr->sin_port); 480 snum = ntohs(addr->sin_port);
481 err = -EACCES; 481 err = -EACCES;
482 if (snum && snum < PROT_SOCK && 482 if (snum && snum < inet_prot_sock(net) &&
483 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) 483 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
484 goto out; 484 goto out;
485 485
@@ -570,19 +570,30 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
570 * TCP 'magic' in here. 570 * TCP 'magic' in here.
571 */ 571 */
572int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, 572int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
573 int addr_len, int flags) 573 int addr_len, int flags, int is_sendmsg)
574{ 574{
575 struct sock *sk = sock->sk; 575 struct sock *sk = sock->sk;
576 int err; 576 int err;
577 long timeo; 577 long timeo;
578 578
579 if (addr_len < sizeof(uaddr->sa_family)) 579 /*
580 return -EINVAL; 580 * uaddr can be NULL and addr_len can be 0 if:
581 * sk is a TCP fastopen active socket and
582 * TCP_FASTOPEN_CONNECT sockopt is set and
583 * we already have a valid cookie for this socket.
584 * In this case, user can call write() after connect().
585 * write() will invoke tcp_sendmsg_fastopen() which calls
586 * __inet_stream_connect().
587 */
588 if (uaddr) {
589 if (addr_len < sizeof(uaddr->sa_family))
590 return -EINVAL;
581 591
582 if (uaddr->sa_family == AF_UNSPEC) { 592 if (uaddr->sa_family == AF_UNSPEC) {
583 err = sk->sk_prot->disconnect(sk, flags); 593 err = sk->sk_prot->disconnect(sk, flags);
584 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; 594 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
585 goto out; 595 goto out;
596 }
586 } 597 }
587 598
588 switch (sock->state) { 599 switch (sock->state) {
@@ -593,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
593 err = -EISCONN; 604 err = -EISCONN;
594 goto out; 605 goto out;
595 case SS_CONNECTING: 606 case SS_CONNECTING:
596 err = -EALREADY; 607 if (inet_sk(sk)->defer_connect)
608 err = is_sendmsg ? -EINPROGRESS : -EISCONN;
609 else
610 err = -EALREADY;
597 /* Fall out of switch with err, set for this state */ 611 /* Fall out of switch with err, set for this state */
598 break; 612 break;
599 case SS_UNCONNECTED: 613 case SS_UNCONNECTED:
@@ -607,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
607 621
608 sock->state = SS_CONNECTING; 622 sock->state = SS_CONNECTING;
609 623
624 if (!err && inet_sk(sk)->defer_connect)
625 goto out;
626
610 /* Just entered SS_CONNECTING state; the only 627 /* Just entered SS_CONNECTING state; the only
611 * difference is that return value in non-blocking 628 * difference is that return value in non-blocking
612 * case is EINPROGRESS, rather than EALREADY. 629 * case is EINPROGRESS, rather than EALREADY.
@@ -662,7 +679,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
662 int err; 679 int err;
663 680
664 lock_sock(sock->sk); 681 lock_sock(sock->sk);
665 err = __inet_stream_connect(sock, uaddr, addr_len, flags); 682 err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
666 release_sock(sock->sk); 683 release_sock(sock->sk);
667 return err; 684 return err;
668} 685}
@@ -672,11 +689,12 @@ EXPORT_SYMBOL(inet_stream_connect);
672 * Accept a pending connection. The TCP layer now gives BSD semantics. 689 * Accept a pending connection. The TCP layer now gives BSD semantics.
673 */ 690 */
674 691
675int inet_accept(struct socket *sock, struct socket *newsock, int flags) 692int inet_accept(struct socket *sock, struct socket *newsock, int flags,
693 bool kern)
676{ 694{
677 struct sock *sk1 = sock->sk; 695 struct sock *sk1 = sock->sk;
678 int err = -EINVAL; 696 int err = -EINVAL;
679 struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err); 697 struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern);
680 698
681 if (!sk2) 699 if (!sk2)
682 goto do_err; 700 goto do_err;
@@ -1406,7 +1424,7 @@ out_unlock:
1406 rcu_read_unlock(); 1424 rcu_read_unlock();
1407 1425
1408out: 1426out:
1409 NAPI_GRO_CB(skb)->flush |= flush; 1427 skb_gro_flush_final(skb, pp, flush);
1410 1428
1411 return pp; 1429 return pp;
1412} 1430}
@@ -1470,8 +1488,10 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
1470 int proto = iph->protocol; 1488 int proto = iph->protocol;
1471 int err = -ENOSYS; 1489 int err = -ENOSYS;
1472 1490
1473 if (skb->encapsulation) 1491 if (skb->encapsulation) {
1492 skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP));
1474 skb_set_inner_network_header(skb, nhoff); 1493 skb_set_inner_network_header(skb, nhoff);
1494 }
1475 1495
1476 csum_replace2(&iph->check, iph->tot_len, newlen); 1496 csum_replace2(&iph->check, iph->tot_len, newlen);
1477 iph->tot_len = newlen; 1497 iph->tot_len = newlen;
@@ -1700,6 +1720,9 @@ static __net_init int inet_init_net(struct net *net)
1700 net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; 1720 net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
1701 net->ipv4.sysctl_ip_dynaddr = 0; 1721 net->ipv4.sysctl_ip_dynaddr = 0;
1702 net->ipv4.sysctl_ip_early_demux = 1; 1722 net->ipv4.sysctl_ip_early_demux = 1;
1723#ifdef CONFIG_SYSCTL
1724 net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
1725#endif
1703 1726
1704 return 0; 1727 return 0;
1705} 1728}
@@ -1831,8 +1854,6 @@ static int __init inet_init(void)
1831 1854
1832 ip_init(); 1855 ip_init();
1833 1856
1834 tcp_v4_init();
1835
1836 /* Setup TCP slab cache for open requests. */ 1857 /* Setup TCP slab cache for open requests. */
1837 tcp_init(); 1858 tcp_init();
1838 1859
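
The __inet_stream_connect() comment above describes the deferred-connect flow enabled by the TCP_FASTOPEN_CONNECT socket option: connect() returns immediately without sending a SYN, and the first write() invokes tcp_sendmsg_fastopen(), carrying the data in the SYN when a Fast Open cookie is already cached. An illustrative user-space sketch; the destination address is a placeholder, error handling is abbreviated, and the fallback #define assumes the uapi value from <linux/tcp.h>:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>

#ifndef TCP_FASTOPEN_CONNECT
#define TCP_FASTOPEN_CONNECT 30	/* assumed uapi value if libc headers lack it */
#endif

int main(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET };
	const char req[] = "GET / HTTP/1.0\r\n\r\n";
	int one = 1;
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return 1;

	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT, &one, sizeof(one));

	dst.sin_port = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address */

	/* Returns immediately; the SYN is deferred until the first write. */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");

	if (write(fd, req, strlen(req)) < 0)	/* data may ride in the SYN */
		perror("write");

	close(fd);
	return 0;
}
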
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index f2a71025a770..22377c8ff14b 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -270,6 +270,9 @@ static void ah_input_done(struct crypto_async_request *base, int err)
270 int ihl = ip_hdrlen(skb); 270 int ihl = ip_hdrlen(skb);
271 int ah_hlen = (ah->hdrlen + 2) << 2; 271 int ah_hlen = (ah->hdrlen + 2) << 2;
272 272
273 if (err)
274 goto out;
275
273 work_iph = AH_SKB_CB(skb)->tmp; 276 work_iph = AH_SKB_CB(skb)->tmp;
274 auth_data = ah_tmp_auth(work_iph, ihl); 277 auth_data = ah_tmp_auth(work_iph, ihl);
275 icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); 278 icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 89a8cac4726a..51b27ae09fbd 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1263,7 +1263,7 @@ void __init arp_init(void)
1263/* 1263/*
1264 * ax25 -> ASCII conversion 1264 * ax25 -> ASCII conversion
1265 */ 1265 */
1266static char *ax2asc2(ax25_address *a, char *buf) 1266static void ax2asc2(ax25_address *a, char *buf)
1267{ 1267{
1268 char c, *s; 1268 char c, *s;
1269 int n; 1269 int n;
@@ -1285,10 +1285,10 @@ static char *ax2asc2(ax25_address *a, char *buf)
1285 *s++ = n + '0'; 1285 *s++ = n + '0';
1286 *s++ = '\0'; 1286 *s++ = '\0';
1287 1287
1288 if (*buf == '\0' || *buf == '-') 1288 if (*buf == '\0' || *buf == '-') {
1289 return "*"; 1289 buf[0] = '*';
1290 1290 buf[1] = '\0';
1291 return buf; 1291 }
1292} 1292}
1293#endif /* CONFIG_AX25 */ 1293#endif /* CONFIG_AX25 */
1294 1294
@@ -1322,7 +1322,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
1322 } 1322 }
1323#endif 1323#endif
1324 sprintf(tbuf, "%pI4", n->primary_key); 1324 sprintf(tbuf, "%pI4", n->primary_key);
1325 seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", 1325 seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n",
1326 tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); 1326 tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
1327 read_unlock(&n->lock); 1327 read_unlock(&n->lock);
1328} 1328}
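
The ax2asc2() change above stops returning the literal "*" and instead writes it into the caller's buffer, while the /proc/net/arp formatter switches from "%s" to a fixed-width "%-17s", so the hardware-address column lines up whether it holds a real address or the "*" placeholder. A standalone illustration of the field-width effect follows; the printf calls and sample strings are illustrative only, not part of the patch:

/* Hedged sketch: why a fixed "%-17s" field keeps the columns aligned.
 * The addresses below are made up; nothing here is kernel code.
 */
#include <stdio.h>

int main(void)
{
	const char *hw_full = "00:11:22:33:44:55";
	const char *hw_none = "*";

	/* old style: "%s" lets the next column drift with the string length */
	printf("%-16s %s %s\n", "192.168.0.1", hw_full, "eth0");
	printf("%-16s %s %s\n", "192.168.0.2", hw_none, "eth0");

	/* new style: "%-17s" pads both entries to the same width */
	printf("%-16s %-17s %s\n", "192.168.0.1", hw_full, "eth0");
	printf("%-16s %-17s %s\n", "192.168.0.2", hw_none, "eth0");
	return 0;
}
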
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 72d6f056d863..ae206163c273 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1587,6 +1587,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
1587 goto validate_return_locked; 1587 goto validate_return_locked;
1588 } 1588 }
1589 1589
1590 if (opt_iter + 1 == opt_len) {
1591 err_offset = opt_iter;
1592 goto validate_return_locked;
1593 }
1590 tag_len = tag[1]; 1594 tag_len = tag[1];
1591 if (tag_len > (opt_len - opt_iter)) { 1595 if (tag_len > (opt_len - opt_iter)) {
1592 err_offset = opt_iter + 1; 1596 err_offset = opt_iter + 1;
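
The added check guards the CIPSO tag walk against a truncated option: when only the tag-type byte is left (opt_iter + 1 == opt_len), reading tag[1] for the length would run past the option buffer, so the walk now bails out with the error offset instead. A minimal userspace sketch of the same bounds rule; parse_tags() and its arguments are illustrative names, not kernel symbols:

/* Hedged sketch: walking type/length tags the way cipso_v4_validate does,
 * refusing to read the length byte when only the type byte remains.
 */
#include <stdio.h>

static int parse_tags(const unsigned char *opt, int opt_len)
{
	int opt_iter = 0;

	while (opt_iter < opt_len) {
		/* need both type and length before trusting opt[opt_iter + 1] */
		if (opt_iter + 1 == opt_len)
			return opt_iter;        /* truncated tag: error offset */

		int tag_len = opt[opt_iter + 1];
		if (tag_len < 2 || tag_len > opt_len - opt_iter)
			return opt_iter + 1;    /* bad length: error offset */

		opt_iter += tag_len;
	}
	return -1;                              /* no error */
}

int main(void)
{
	unsigned char truncated[] = { 0x01 };              /* type byte only */
	unsigned char ok[]        = { 0x01, 0x04, 0, 0 };  /* one 4-byte tag */

	printf("%d %d\n", parse_tags(truncated, 1), parse_tags(ok, 4));
	return 0;
}
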
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 4cd2ee8857d2..cebedd545e5e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -32,6 +32,7 @@
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/types.h> 33#include <linux/types.h>
34#include <linux/kernel.h> 34#include <linux/kernel.h>
35#include <linux/sched/signal.h>
35#include <linux/string.h> 36#include <linux/string.h>
36#include <linux/mm.h> 37#include <linux/mm.h>
37#include <linux/socket.h> 38#include <linux/socket.h>
@@ -65,8 +66,6 @@
65#include <net/net_namespace.h> 66#include <net/net_namespace.h>
66#include <net/addrconf.h> 67#include <net/addrconf.h>
67 68
68#include "fib_lookup.h"
69
70static struct ipv4_devconf ipv4_devconf = { 69static struct ipv4_devconf ipv4_devconf = {
71 .data = { 70 .data = {
72 [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, 71 [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 20fb25e3027b..b1e24446e297 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -18,6 +18,8 @@
18#include <net/protocol.h> 18#include <net/protocol.h>
19#include <net/udp.h> 19#include <net/udp.h>
20 20
21#include <linux/highmem.h>
22
21struct esp_skb_cb { 23struct esp_skb_cb {
22 struct xfrm_skb_cb xfrm; 24 struct xfrm_skb_cb xfrm;
23 void *tmp; 25 void *tmp;
@@ -92,11 +94,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
92 __alignof__(struct scatterlist)); 94 __alignof__(struct scatterlist));
93} 95}
94 96
97static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
98{
99 struct esp_output_extra *extra = esp_tmp_extra(tmp);
100 struct crypto_aead *aead = x->data;
101 int extralen = 0;
102 u8 *iv;
103 struct aead_request *req;
104 struct scatterlist *sg;
105
106 if (x->props.flags & XFRM_STATE_ESN)
107 extralen += sizeof(*extra);
108
109 extra = esp_tmp_extra(tmp);
110 iv = esp_tmp_iv(aead, tmp, extralen);
111 req = esp_tmp_req(aead, iv);
112
113 /* Unref skb_frag_pages in the src scatterlist if necessary.
114 * Skip the first sg which comes from skb->data.
115 */
116 if (req->src != req->dst)
117 for (sg = sg_next(req->src); sg; sg = sg_next(sg))
118 put_page(sg_page(sg));
119}
120
95static void esp_output_done(struct crypto_async_request *base, int err) 121static void esp_output_done(struct crypto_async_request *base, int err)
96{ 122{
97 struct sk_buff *skb = base->data; 123 struct sk_buff *skb = base->data;
124 void *tmp;
125 struct dst_entry *dst = skb_dst(skb);
126 struct xfrm_state *x = dst->xfrm;
98 127
99 kfree(ESP_SKB_CB(skb)->tmp); 128 tmp = ESP_SKB_CB(skb)->tmp;
129 esp_ssg_unref(x, tmp);
130 kfree(tmp);
100 xfrm_output_resume(skb, err); 131 xfrm_output_resume(skb, err);
101} 132}
102 133
@@ -120,6 +151,29 @@ static void esp_output_restore_header(struct sk_buff *skb)
120 sizeof(__be32)); 151 sizeof(__be32));
121} 152}
122 153
154static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
155 struct ip_esp_hdr *esph,
156 struct esp_output_extra *extra)
157{
158 struct xfrm_state *x = skb_dst(skb)->xfrm;
159
160 /* For ESN we move the header forward by 4 bytes to
161 * accommodate the high bits. We will move it back after
162 * encryption.
163 */
164 if ((x->props.flags & XFRM_STATE_ESN)) {
165 extra->esphoff = (unsigned char *)esph -
166 skb_transport_header(skb);
167 esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
168 extra->seqhi = esph->spi;
169 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
170 }
171
172 esph->spi = x->id.spi;
173
174 return esph;
175}
176
123static void esp_output_done_esn(struct crypto_async_request *base, int err) 177static void esp_output_done_esn(struct crypto_async_request *base, int err)
124{ 178{
125 struct sk_buff *skb = base->data; 179 struct sk_buff *skb = base->data;
@@ -128,18 +182,36 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
128 esp_output_done(base, err); 182 esp_output_done(base, err);
129} 183}
130 184
185static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
186{
187 /* Fill padding... */
188 if (tfclen) {
189 memset(tail, 0, tfclen);
190 tail += tfclen;
191 }
192 do {
193 int i;
194 for (i = 0; i < plen - 2; i++)
195 tail[i] = i + 1;
196 } while (0);
197 tail[plen - 2] = plen - 2;
198 tail[plen - 1] = proto;
199}
200
131static int esp_output(struct xfrm_state *x, struct sk_buff *skb) 201static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
132{ 202{
133 int err;
134 struct esp_output_extra *extra; 203 struct esp_output_extra *extra;
204 int err = -ENOMEM;
135 struct ip_esp_hdr *esph; 205 struct ip_esp_hdr *esph;
136 struct crypto_aead *aead; 206 struct crypto_aead *aead;
137 struct aead_request *req; 207 struct aead_request *req;
138 struct scatterlist *sg; 208 struct scatterlist *sg, *dsg;
139 struct sk_buff *trailer; 209 struct sk_buff *trailer;
210 struct page *page;
140 void *tmp; 211 void *tmp;
141 u8 *iv; 212 u8 *iv;
142 u8 *tail; 213 u8 *tail;
214 u8 *vaddr;
143 int blksize; 215 int blksize;
144 int clen; 216 int clen;
145 int alen; 217 int alen;
@@ -149,7 +221,9 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
149 int nfrags; 221 int nfrags;
150 int assoclen; 222 int assoclen;
151 int extralen; 223 int extralen;
224 int tailen;
152 __be64 seqno; 225 __be64 seqno;
226 __u8 proto = *skb_mac_header(skb);
153 227
154 /* skb is pure payload to encrypt */ 228 /* skb is pure payload to encrypt */
155 229
@@ -169,12 +243,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
169 blksize = ALIGN(crypto_aead_blocksize(aead), 4); 243 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
170 clen = ALIGN(skb->len + 2 + tfclen, blksize); 244 clen = ALIGN(skb->len + 2 + tfclen, blksize);
171 plen = clen - skb->len - tfclen; 245 plen = clen - skb->len - tfclen;
172 246 tailen = tfclen + plen + alen;
173 err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
174 if (err < 0)
175 goto error;
176 nfrags = err;
177
178 assoclen = sizeof(*esph); 247 assoclen = sizeof(*esph);
179 extralen = 0; 248 extralen = 0;
180 249
@@ -183,35 +252,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
183 assoclen += sizeof(__be32); 252 assoclen += sizeof(__be32);
184 } 253 }
185 254
186 tmp = esp_alloc_tmp(aead, nfrags, extralen);
187 if (!tmp) {
188 err = -ENOMEM;
189 goto error;
190 }
191
192 extra = esp_tmp_extra(tmp);
193 iv = esp_tmp_iv(aead, tmp, extralen);
194 req = esp_tmp_req(aead, iv);
195 sg = esp_req_sg(aead, req);
196
197 /* Fill padding... */
198 tail = skb_tail_pointer(trailer);
199 if (tfclen) {
200 memset(tail, 0, tfclen);
201 tail += tfclen;
202 }
203 do {
204 int i;
205 for (i = 0; i < plen - 2; i++)
206 tail[i] = i + 1;
207 } while (0);
208 tail[plen - 2] = plen - 2;
209 tail[plen - 1] = *skb_mac_header(skb);
210 pskb_put(skb, trailer, clen - skb->len + alen);
211
212 skb_push(skb, -skb_network_offset(skb));
213 esph = ip_esp_hdr(skb);
214 *skb_mac_header(skb) = IPPROTO_ESP; 255 *skb_mac_header(skb) = IPPROTO_ESP;
256 esph = ip_esp_hdr(skb);
215 257
216 /* this is non-NULL only with UDP Encapsulation */ 258 /* this is non-NULL only with UDP Encapsulation */
217 if (x->encap) { 259 if (x->encap) {
@@ -230,7 +272,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
230 uh = (struct udphdr *)esph; 272 uh = (struct udphdr *)esph;
231 uh->source = sport; 273 uh->source = sport;
232 uh->dest = dport; 274 uh->dest = dport;
233 uh->len = htons(skb->len - skb_transport_offset(skb)); 275 uh->len = htons(skb->len + tailen
276 - skb_transport_offset(skb));
234 uh->check = 0; 277 uh->check = 0;
235 278
236 switch (encap_type) { 279 switch (encap_type) {
@@ -248,31 +291,148 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
248 *skb_mac_header(skb) = IPPROTO_UDP; 291 *skb_mac_header(skb) = IPPROTO_UDP;
249 } 292 }
250 293
251 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); 294 if (!skb_cloned(skb)) {
295 if (tailen <= skb_availroom(skb)) {
296 nfrags = 1;
297 trailer = skb;
298 tail = skb_tail_pointer(trailer);
252 299
253 aead_request_set_callback(req, 0, esp_output_done, skb); 300 goto skip_cow;
301 } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
302 && !skb_has_frag_list(skb)) {
303 int allocsize;
304 struct sock *sk = skb->sk;
305 struct page_frag *pfrag = &x->xfrag;
254 306
255 /* For ESN we move the header forward by 4 bytes to 307 allocsize = ALIGN(tailen, L1_CACHE_BYTES);
256 * accomodate the high bits. We will move it back after 308
257 * encryption. 309 spin_lock_bh(&x->lock);
258 */ 310
259 if ((x->props.flags & XFRM_STATE_ESN)) { 311 if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
260 extra->esphoff = (unsigned char *)esph - 312 spin_unlock_bh(&x->lock);
261 skb_transport_header(skb); 313 goto cow;
262 esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); 314 }
263 extra->seqhi = esph->spi; 315
264 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); 316 page = pfrag->page;
265 aead_request_set_callback(req, 0, esp_output_done_esn, skb); 317 get_page(page);
318
319 vaddr = kmap_atomic(page);
320
321 tail = vaddr + pfrag->offset;
322
323 esp_output_fill_trailer(tail, tfclen, plen, proto);
324
325 kunmap_atomic(vaddr);
326
327 nfrags = skb_shinfo(skb)->nr_frags;
328
329 __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
330 tailen);
331 skb_shinfo(skb)->nr_frags = ++nfrags;
332
333 pfrag->offset = pfrag->offset + allocsize;
334 nfrags++;
335
336 skb->len += tailen;
337 skb->data_len += tailen;
338 skb->truesize += tailen;
339 if (sk)
340 atomic_add(tailen, &sk->sk_wmem_alloc);
341
342 skb_push(skb, -skb_network_offset(skb));
343
344 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
345 esph->spi = x->id.spi;
346
347 tmp = esp_alloc_tmp(aead, nfrags + 2, extralen);
348 if (!tmp) {
349 spin_unlock_bh(&x->lock);
350 err = -ENOMEM;
351 goto error;
352 }
353
354 extra = esp_tmp_extra(tmp);
355 iv = esp_tmp_iv(aead, tmp, extralen);
356 req = esp_tmp_req(aead, iv);
357 sg = esp_req_sg(aead, req);
358 dsg = &sg[nfrags];
359
360 esph = esp_output_set_extra(skb, esph, extra);
361
362 sg_init_table(sg, nfrags);
363 skb_to_sgvec(skb, sg,
364 (unsigned char *)esph - skb->data,
365 assoclen + ivlen + clen + alen);
366
367 allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
368
369 if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
370 spin_unlock_bh(&x->lock);
371 err = -ENOMEM;
372 goto error;
373 }
374
375 skb_shinfo(skb)->nr_frags = 1;
376
377 page = pfrag->page;
378 get_page(page);
379 /* replace page frags in skb with new page */
380 __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
381 pfrag->offset = pfrag->offset + allocsize;
382
383 sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
384 skb_to_sgvec(skb, dsg,
385 (unsigned char *)esph - skb->data,
386 assoclen + ivlen + clen + alen);
387
388 spin_unlock_bh(&x->lock);
389
390 goto skip_cow2;
391 }
266 } 392 }
267 393
394cow:
395 err = skb_cow_data(skb, tailen, &trailer);
396 if (err < 0)
397 goto error;
398 nfrags = err;
399 tail = skb_tail_pointer(trailer);
400 esph = ip_esp_hdr(skb);
401
402skip_cow:
403 esp_output_fill_trailer(tail, tfclen, plen, proto);
404
405 pskb_put(skb, trailer, clen - skb->len + alen);
406 skb_push(skb, -skb_network_offset(skb));
407 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
268 esph->spi = x->id.spi; 408 esph->spi = x->id.spi;
269 409
410 tmp = esp_alloc_tmp(aead, nfrags, extralen);
411 if (!tmp) {
412 err = -ENOMEM;
413 goto error;
414 }
415
416 extra = esp_tmp_extra(tmp);
417 iv = esp_tmp_iv(aead, tmp, extralen);
418 req = esp_tmp_req(aead, iv);
419 sg = esp_req_sg(aead, req);
420 dsg = sg;
421
422 esph = esp_output_set_extra(skb, esph, extra);
423
270 sg_init_table(sg, nfrags); 424 sg_init_table(sg, nfrags);
271 skb_to_sgvec(skb, sg, 425 skb_to_sgvec(skb, sg,
272 (unsigned char *)esph - skb->data, 426 (unsigned char *)esph - skb->data,
273 assoclen + ivlen + clen + alen); 427 assoclen + ivlen + clen + alen);
274 428
275 aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); 429skip_cow2:
430 if ((x->props.flags & XFRM_STATE_ESN))
431 aead_request_set_callback(req, 0, esp_output_done_esn, skb);
432 else
433 aead_request_set_callback(req, 0, esp_output_done, skb);
434
435 aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
276 aead_request_set_ad(req, assoclen); 436 aead_request_set_ad(req, assoclen);
277 437
278 seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + 438 seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -298,6 +458,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
298 esp_output_restore_header(skb); 458 esp_output_restore_header(skb);
299 } 459 }
300 460
461 if (sg != dsg)
462 esp_ssg_unref(x, tmp);
301 kfree(tmp); 463 kfree(tmp);
302 464
303error: 465error:
@@ -401,6 +563,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
401 __skb_pull(skb, 4); 563 __skb_pull(skb, 4);
402} 564}
403 565
566static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
567{
568 struct xfrm_state *x = xfrm_input_state(skb);
569 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
570
571 /* For ESN we move the header forward by 4 bytes to
572 * accommodate the high bits. We will move it back after
573 * decryption.
574 */
575 if ((x->props.flags & XFRM_STATE_ESN)) {
576 esph = (void *)skb_push(skb, 4);
577 *seqhi = esph->spi;
578 esph->spi = esph->seq_no;
579 esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
580 }
581}
582
404static void esp_input_done_esn(struct crypto_async_request *base, int err) 583static void esp_input_done_esn(struct crypto_async_request *base, int err)
405{ 584{
406 struct sk_buff *skb = base->data; 585 struct sk_buff *skb = base->data;
@@ -437,12 +616,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
437 if (elen <= 0) 616 if (elen <= 0)
438 goto out; 617 goto out;
439 618
440 err = skb_cow_data(skb, 0, &trailer);
441 if (err < 0)
442 goto out;
443
444 nfrags = err;
445
446 assoclen = sizeof(*esph); 619 assoclen = sizeof(*esph);
447 seqhilen = 0; 620 seqhilen = 0;
448 621
@@ -451,6 +624,26 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
451 assoclen += seqhilen; 624 assoclen += seqhilen;
452 } 625 }
453 626
627 if (!skb_cloned(skb)) {
628 if (!skb_is_nonlinear(skb)) {
629 nfrags = 1;
630
631 goto skip_cow;
632 } else if (!skb_has_frag_list(skb)) {
633 nfrags = skb_shinfo(skb)->nr_frags;
634 nfrags++;
635
636 goto skip_cow;
637 }
638 }
639
640 err = skb_cow_data(skb, 0, &trailer);
641 if (err < 0)
642 goto out;
643
644 nfrags = err;
645
646skip_cow:
454 err = -ENOMEM; 647 err = -ENOMEM;
455 tmp = esp_alloc_tmp(aead, nfrags, seqhilen); 648 tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
456 if (!tmp) 649 if (!tmp)
@@ -462,26 +655,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
462 req = esp_tmp_req(aead, iv); 655 req = esp_tmp_req(aead, iv);
463 sg = esp_req_sg(aead, req); 656 sg = esp_req_sg(aead, req);
464 657
465 skb->ip_summed = CHECKSUM_NONE; 658 esp_input_set_header(skb, seqhi);
466 659
467 esph = (struct ip_esp_hdr *)skb->data; 660 sg_init_table(sg, nfrags);
661 skb_to_sgvec(skb, sg, 0, skb->len);
468 662
469 aead_request_set_callback(req, 0, esp_input_done, skb); 663 skb->ip_summed = CHECKSUM_NONE;
470 664
471 /* For ESN we move the header forward by 4 bytes to 665 if ((x->props.flags & XFRM_STATE_ESN))
472 * accomodate the high bits. We will move it back after
473 * decryption.
474 */
475 if ((x->props.flags & XFRM_STATE_ESN)) {
476 esph = (void *)skb_push(skb, 4);
477 *seqhi = esph->spi;
478 esph->spi = esph->seq_no;
479 esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
480 aead_request_set_callback(req, 0, esp_input_done_esn, skb); 666 aead_request_set_callback(req, 0, esp_input_done_esn, skb);
481 } 667 else
482 668 aead_request_set_callback(req, 0, esp_input_done, skb);
483 sg_init_table(sg, nfrags);
484 skb_to_sgvec(skb, sg, 0, skb->len);
485 669
486 aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); 670 aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
487 aead_request_set_ad(req, assoclen); 671 aead_request_set_ad(req, assoclen);
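
The esp_output() rework factors the trailer construction into esp_output_fill_trailer(): optional TFC padding, the self-describing pad bytes 1, 2, 3, and so on, the pad-length byte, and the next-header byte, which is the RFC 4303 trailer layout. A standalone userspace rendering of that fill, assuming the caller has already sized the buffer for tfclen + plen (illustrative only, not part of the patch):

/* Hedged sketch of the ESP trailer layout filled by esp_output_fill_trailer():
 * [ tfclen zero bytes ][ pad: 1,2,...,plen-2 ][ pad length ][ next header ].
 */
#include <stdio.h>
#include <string.h>

static void fill_trailer(unsigned char *tail, int tfclen, int plen,
			 unsigned char proto)
{
	int i;

	memset(tail, 0, tfclen);        /* TFC padding, if any */
	tail += tfclen;
	for (i = 0; i < plen - 2; i++)  /* monotonically increasing pad bytes */
		tail[i] = i + 1;
	tail[plen - 2] = plen - 2;      /* pad length */
	tail[plen - 1] = proto;         /* next header byte */
}

int main(void)
{
	unsigned char buf[16];

	fill_trailer(buf, 0, 6, 4);     /* 4 pad bytes, pad len 4, proto 4 */
	for (int i = 0; i < 6; i++)
		printf("%02x ", buf[i]);
	printf("\n");                   /* expected: 01 02 03 04 04 04 */
	return 0;
}
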
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
new file mode 100644
index 000000000000..1de442632406
--- /dev/null
+++ b/net/ipv4/esp4_offload.c
@@ -0,0 +1,106 @@
1/*
2 * IPV4 GSO/GRO offload support
3 * Linux INET implementation
4 *
5 * Copyright (C) 2016 secunet Security Networks AG
6 * Author: Steffen Klassert <steffen.klassert@secunet.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * ESP GRO support
13 */
14
15#include <linux/skbuff.h>
16#include <linux/init.h>
17#include <net/protocol.h>
18#include <crypto/aead.h>
19#include <crypto/authenc.h>
20#include <linux/err.h>
21#include <linux/module.h>
22#include <net/ip.h>
23#include <net/xfrm.h>
24#include <net/esp.h>
25#include <linux/scatterlist.h>
26#include <linux/kernel.h>
27#include <linux/slab.h>
28#include <linux/spinlock.h>
29#include <net/udp.h>
30
31static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
32 struct sk_buff *skb)
33{
34 int offset = skb_gro_offset(skb);
35 struct xfrm_offload *xo;
36 struct xfrm_state *x;
37 __be32 seq;
38 __be32 spi;
39 int err;
40
41 skb_pull(skb, offset);
42
43 if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
44 goto out;
45
46 err = secpath_set(skb);
47 if (err)
48 goto out;
49
50 if (skb->sp->len == XFRM_MAX_DEPTH)
51 goto out;
52
53 x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
54 (xfrm_address_t *)&ip_hdr(skb)->daddr,
55 spi, IPPROTO_ESP, AF_INET);
56 if (!x)
57 goto out;
58
59 skb->sp->xvec[skb->sp->len++] = x;
60 skb->sp->olen++;
61
62 xo = xfrm_offload(skb);
63 if (!xo) {
64 xfrm_state_put(x);
65 goto out;
66 }
67 xo->flags |= XFRM_GRO;
68
69 XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
70 XFRM_SPI_SKB_CB(skb)->family = AF_INET;
71 XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
72 XFRM_SPI_SKB_CB(skb)->seq = seq;
73
74 /* We don't need to handle errors from xfrm_input, it does all
75 * the error handling and frees the resources on error. */
76 xfrm_input(skb, IPPROTO_ESP, spi, -2);
77
78 return ERR_PTR(-EINPROGRESS);
79out:
80 skb_push(skb, offset);
81 NAPI_GRO_CB(skb)->same_flow = 0;
82 NAPI_GRO_CB(skb)->flush = 1;
83
84 return NULL;
85}
86
87static const struct net_offload esp4_offload = {
88 .callbacks = {
89 .gro_receive = esp4_gro_receive,
90 },
91};
92
93static int __init esp4_offload_init(void)
94{
95 return inet_add_offload(&esp4_offload, IPPROTO_ESP);
96}
97
98static void __exit esp4_offload_exit(void)
99{
100 inet_del_offload(&esp4_offload, IPPROTO_ESP);
101}
102
103module_init(esp4_offload_init);
104module_exit(esp4_offload_exit);
105MODULE_LICENSE("GPL");
106MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7db2ad2e82d3..8f2133ffc2ff 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -319,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
319 int ret, no_addr; 319 int ret, no_addr;
320 struct fib_result res; 320 struct fib_result res;
321 struct flowi4 fl4; 321 struct flowi4 fl4;
322 struct net *net; 322 struct net *net = dev_net(dev);
323 bool dev_match; 323 bool dev_match;
324 324
325 fl4.flowi4_oif = 0; 325 fl4.flowi4_oif = 0;
@@ -332,6 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
332 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 332 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
333 fl4.flowi4_tun_key.tun_id = 0; 333 fl4.flowi4_tun_key.tun_id = 0;
334 fl4.flowi4_flags = 0; 334 fl4.flowi4_flags = 0;
335 fl4.flowi4_uid = sock_net_uid(net, NULL);
335 336
336 no_addr = idev->ifa_list == NULL; 337 no_addr = idev->ifa_list == NULL;
337 338
@@ -339,13 +340,12 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
339 340
340 trace_fib_validate_source(dev, &fl4); 341 trace_fib_validate_source(dev, &fl4);
341 342
342 net = dev_net(dev);
343 if (fib_lookup(net, &fl4, &res, 0)) 343 if (fib_lookup(net, &fl4, &res, 0))
344 goto last_resort; 344 goto last_resort;
345 if (res.type != RTN_UNICAST && 345 if (res.type != RTN_UNICAST &&
346 (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) 346 (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
347 goto e_inval; 347 goto e_inval;
348 if (!rpf && !fib_num_tclassid_users(dev_net(dev)) && 348 if (!rpf && !fib_num_tclassid_users(net) &&
349 (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) 349 (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
350 goto last_resort; 350 goto last_resort;
351 fib_combine_itag(itag, &res); 351 fib_combine_itag(itag, &res);
@@ -622,6 +622,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
622 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 622 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
623 [RTA_ENCAP] = { .type = NLA_NESTED }, 623 [RTA_ENCAP] = { .type = NLA_NESTED },
624 [RTA_UID] = { .type = NLA_U32 }, 624 [RTA_UID] = { .type = NLA_U32 },
625 [RTA_MARK] = { .type = NLA_U32 },
625}; 626};
626 627
627static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, 628static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -1082,7 +1083,8 @@ static void nl_fib_input(struct sk_buff *skb)
1082 1083
1083 net = sock_net(skb->sk); 1084 net = sock_net(skb->sk);
1084 nlh = nlmsg_hdr(skb); 1085 nlh = nlmsg_hdr(skb);
1085 if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || 1086 if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
1087 skb->len < nlh->nlmsg_len ||
1086 nlmsg_len(nlh) < sizeof(*frn)) 1088 nlmsg_len(nlh) < sizeof(*frn))
1087 return; 1089 return;
1088 1090
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 9a375b908d01..317026a39cfa 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -471,7 +471,6 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
471static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, 471static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
472 int remaining, struct fib_config *cfg) 472 int remaining, struct fib_config *cfg)
473{ 473{
474 struct net *net = cfg->fc_nlinfo.nl_net;
475 int ret; 474 int ret;
476 475
477 change_nexthops(fi) { 476 change_nexthops(fi) {
@@ -503,16 +502,14 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
503 nla = nla_find(attrs, attrlen, RTA_ENCAP); 502 nla = nla_find(attrs, attrlen, RTA_ENCAP);
504 if (nla) { 503 if (nla) {
505 struct lwtunnel_state *lwtstate; 504 struct lwtunnel_state *lwtstate;
506 struct net_device *dev = NULL;
507 struct nlattr *nla_entype; 505 struct nlattr *nla_entype;
508 506
509 nla_entype = nla_find(attrs, attrlen, 507 nla_entype = nla_find(attrs, attrlen,
510 RTA_ENCAP_TYPE); 508 RTA_ENCAP_TYPE);
511 if (!nla_entype) 509 if (!nla_entype)
512 goto err_inval; 510 goto err_inval;
513 if (cfg->fc_oif) 511
514 dev = __dev_get_by_index(net, cfg->fc_oif); 512 ret = lwtunnel_build_state(nla_get_u16(
515 ret = lwtunnel_build_state(dev, nla_get_u16(
516 nla_entype), 513 nla_entype),
517 nla, AF_INET, cfg, 514 nla, AF_INET, cfg,
518 &lwtstate); 515 &lwtstate);
@@ -597,21 +594,18 @@ static inline void fib_add_weight(struct fib_info *fi,
597 594
598#endif /* CONFIG_IP_ROUTE_MULTIPATH */ 595#endif /* CONFIG_IP_ROUTE_MULTIPATH */
599 596
600static int fib_encap_match(struct net *net, u16 encap_type, 597static int fib_encap_match(u16 encap_type,
601 struct nlattr *encap, 598 struct nlattr *encap,
602 int oif, const struct fib_nh *nh, 599 const struct fib_nh *nh,
603 const struct fib_config *cfg) 600 const struct fib_config *cfg)
604{ 601{
605 struct lwtunnel_state *lwtstate; 602 struct lwtunnel_state *lwtstate;
606 struct net_device *dev = NULL;
607 int ret, result = 0; 603 int ret, result = 0;
608 604
609 if (encap_type == LWTUNNEL_ENCAP_NONE) 605 if (encap_type == LWTUNNEL_ENCAP_NONE)
610 return 0; 606 return 0;
611 607
612 if (oif) 608 ret = lwtunnel_build_state(encap_type, encap,
613 dev = __dev_get_by_index(net, oif);
614 ret = lwtunnel_build_state(dev, encap_type, encap,
615 AF_INET, cfg, &lwtstate); 609 AF_INET, cfg, &lwtstate);
616 if (!ret) { 610 if (!ret) {
617 result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); 611 result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
@@ -623,7 +617,6 @@ static int fib_encap_match(struct net *net, u16 encap_type,
623 617
624int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) 618int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
625{ 619{
626 struct net *net = cfg->fc_nlinfo.nl_net;
627#ifdef CONFIG_IP_ROUTE_MULTIPATH 620#ifdef CONFIG_IP_ROUTE_MULTIPATH
628 struct rtnexthop *rtnh; 621 struct rtnexthop *rtnh;
629 int remaining; 622 int remaining;
@@ -634,9 +627,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
634 627
635 if (cfg->fc_oif || cfg->fc_gw) { 628 if (cfg->fc_oif || cfg->fc_gw) {
636 if (cfg->fc_encap) { 629 if (cfg->fc_encap) {
637 if (fib_encap_match(net, cfg->fc_encap_type, 630 if (fib_encap_match(cfg->fc_encap_type,
638 cfg->fc_encap, cfg->fc_oif, 631 cfg->fc_encap, fi->fib_nh, cfg))
639 fi->fib_nh, cfg))
640 return 1; 632 return 1;
641 } 633 }
642 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && 634 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
@@ -1093,13 +1085,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
1093 1085
1094 if (cfg->fc_encap) { 1086 if (cfg->fc_encap) {
1095 struct lwtunnel_state *lwtstate; 1087 struct lwtunnel_state *lwtstate;
1096 struct net_device *dev = NULL;
1097 1088
1098 if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) 1089 if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
1099 goto err_inval; 1090 goto err_inval;
1100 if (cfg->fc_oif) 1091 err = lwtunnel_build_state(cfg->fc_encap_type,
1101 dev = __dev_get_by_index(net, cfg->fc_oif);
1102 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1103 cfg->fc_encap, AF_INET, cfg, 1092 cfg->fc_encap, AF_INET, cfg,
1104 &lwtstate); 1093 &lwtstate);
1105 if (err) 1094 if (err)
@@ -1366,6 +1355,36 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
1366 return ret; 1355 return ret;
1367} 1356}
1368 1357
1358static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
1359 enum fib_event_type event_type)
1360{
1361 struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
1362 struct fib_nh_notifier_info info = {
1363 .fib_nh = fib_nh,
1364 };
1365
1366 switch (event_type) {
1367 case FIB_EVENT_NH_ADD:
1368 if (fib_nh->nh_flags & RTNH_F_DEAD)
1369 break;
1370 if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
1371 fib_nh->nh_flags & RTNH_F_LINKDOWN)
1372 break;
1373 return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
1374 &info.info);
1375 case FIB_EVENT_NH_DEL:
1376 if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
1377 fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
1378 (fib_nh->nh_flags & RTNH_F_DEAD))
1379 return call_fib_notifiers(dev_net(fib_nh->nh_dev),
1380 event_type, &info.info);
1381 default:
1382 break;
1383 }
1384
1385 return NOTIFY_DONE;
1386}
1387
1369/* Event force Flags Description 1388/* Event force Flags Description
1370 * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host 1389 * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host
1371 * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host 1390 * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
@@ -1407,6 +1426,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
1407 nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; 1426 nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
1408 break; 1427 break;
1409 } 1428 }
1429 call_fib_nh_notifiers(nexthop_nh,
1430 FIB_EVENT_NH_DEL);
1410 dead++; 1431 dead++;
1411 } 1432 }
1412#ifdef CONFIG_IP_ROUTE_MULTIPATH 1433#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1437,7 +1458,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
1437} 1458}
1438 1459
1439/* Must be invoked inside of an RCU protected region. */ 1460/* Must be invoked inside of an RCU protected region. */
1440void fib_select_default(const struct flowi4 *flp, struct fib_result *res) 1461static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
1441{ 1462{
1442 struct fib_info *fi = NULL, *last_resort = NULL; 1463 struct fib_info *fi = NULL, *last_resort = NULL;
1443 struct hlist_head *fa_head = res->fa_head; 1464 struct hlist_head *fa_head = res->fa_head;
@@ -1561,6 +1582,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
1561 continue; 1582 continue;
1562 alive++; 1583 alive++;
1563 nexthop_nh->nh_flags &= ~nh_flags; 1584 nexthop_nh->nh_flags &= ~nh_flags;
1585 call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
1564 } endfor_nexthops(fi) 1586 } endfor_nexthops(fi)
1565 1587
1566 if (alive > 0) { 1588 if (alive > 0) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2919d1a10cfd..2f0d8233950f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -124,7 +124,7 @@ static void fib_notify(struct net *net, struct notifier_block *nb,
124static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, 124static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
125 enum fib_event_type event_type, u32 dst, 125 enum fib_event_type event_type, u32 dst,
126 int dst_len, struct fib_info *fi, 126 int dst_len, struct fib_info *fi,
127 u8 tos, u8 type, u32 tb_id, u32 nlflags) 127 u8 tos, u8 type, u32 tb_id)
128{ 128{
129 struct fib_entry_notifier_info info = { 129 struct fib_entry_notifier_info info = {
130 .dst = dst, 130 .dst = dst,
@@ -133,7 +133,6 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
133 .tos = tos, 133 .tos = tos,
134 .type = type, 134 .type = type,
135 .tb_id = tb_id, 135 .tb_id = tb_id,
136 .nlflags = nlflags,
137 }; 136 };
138 return call_fib_notifier(nb, net, event_type, &info.info); 137 return call_fib_notifier(nb, net, event_type, &info.info);
139} 138}
@@ -197,7 +196,7 @@ int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
197static int call_fib_entry_notifiers(struct net *net, 196static int call_fib_entry_notifiers(struct net *net,
198 enum fib_event_type event_type, u32 dst, 197 enum fib_event_type event_type, u32 dst,
199 int dst_len, struct fib_info *fi, 198 int dst_len, struct fib_info *fi,
200 u8 tos, u8 type, u32 tb_id, u32 nlflags) 199 u8 tos, u8 type, u32 tb_id)
201{ 200{
202 struct fib_entry_notifier_info info = { 201 struct fib_entry_notifier_info info = {
203 .dst = dst, 202 .dst = dst,
@@ -206,7 +205,6 @@ static int call_fib_entry_notifiers(struct net *net,
206 .tos = tos, 205 .tos = tos,
207 .type = type, 206 .type = type,
208 .tb_id = tb_id, 207 .tb_id = tb_id,
209 .nlflags = nlflags,
210 }; 208 };
211 return call_fib_notifiers(net, event_type, &info.info); 209 return call_fib_notifiers(net, event_type, &info.info);
212} 210}
@@ -1198,6 +1196,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
1198int fib_table_insert(struct net *net, struct fib_table *tb, 1196int fib_table_insert(struct net *net, struct fib_table *tb,
1199 struct fib_config *cfg) 1197 struct fib_config *cfg)
1200{ 1198{
1199 enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
1201 struct trie *t = (struct trie *)tb->tb_data; 1200 struct trie *t = (struct trie *)tb->tb_data;
1202 struct fib_alias *fa, *new_fa; 1201 struct fib_alias *fa, *new_fa;
1203 struct key_vector *l, *tp; 1202 struct key_vector *l, *tp;
@@ -1295,6 +1294,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
1295 new_fa->tb_id = tb->tb_id; 1294 new_fa->tb_id = tb->tb_id;
1296 new_fa->fa_default = -1; 1295 new_fa->fa_default = -1;
1297 1296
1297 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
1298 key, plen, fi,
1299 new_fa->fa_tos, cfg->fc_type,
1300 tb->tb_id);
1301 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1302 tb->tb_id, &cfg->fc_nlinfo, nlflags);
1303
1298 hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); 1304 hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1299 1305
1300 alias_free_mem_rcu(fa); 1306 alias_free_mem_rcu(fa);
@@ -1303,13 +1309,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
1303 if (state & FA_S_ACCESSED) 1309 if (state & FA_S_ACCESSED)
1304 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1310 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1305 1311
1306 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
1307 key, plen, fi,
1308 new_fa->fa_tos, cfg->fc_type,
1309 tb->tb_id, cfg->fc_nlflags);
1310 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1311 tb->tb_id, &cfg->fc_nlinfo, nlflags);
1312
1313 goto succeeded; 1312 goto succeeded;
1314 } 1313 }
1315 /* Error if we find a perfect match which 1314 /* Error if we find a perfect match which
@@ -1319,10 +1318,12 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
1319 if (fa_match) 1318 if (fa_match)
1320 goto out; 1319 goto out;
1321 1320
1322 if (cfg->fc_nlflags & NLM_F_APPEND) 1321 if (cfg->fc_nlflags & NLM_F_APPEND) {
1322 event = FIB_EVENT_ENTRY_APPEND;
1323 nlflags |= NLM_F_APPEND; 1323 nlflags |= NLM_F_APPEND;
1324 else 1324 } else {
1325 fa = fa_first; 1325 fa = fa_first;
1326 }
1326 } 1327 }
1327 err = -ENOENT; 1328 err = -ENOENT;
1328 if (!(cfg->fc_nlflags & NLM_F_CREATE)) 1329 if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1351,8 +1352,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
1351 tb->tb_num_default++; 1352 tb->tb_num_default++;
1352 1353
1353 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1354 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1354 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos, 1355 call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type,
1355 cfg->fc_type, tb->tb_id, cfg->fc_nlflags); 1356 tb->tb_id);
1356 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, 1357 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
1357 &cfg->fc_nlinfo, nlflags); 1358 &cfg->fc_nlinfo, nlflags);
1358succeeded: 1359succeeded:
@@ -1653,8 +1654,8 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
1653 return -ESRCH; 1654 return -ESRCH;
1654 1655
1655 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, 1656 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
1656 fa_to_delete->fa_info, tos, cfg->fc_type, 1657 fa_to_delete->fa_info, tos,
1657 tb->tb_id, 0); 1658 fa_to_delete->fa_type, tb->tb_id);
1658 rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, 1659 rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
1659 &cfg->fc_nlinfo, 0); 1660 &cfg->fc_nlinfo, 0);
1660 1661
@@ -1963,7 +1964,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
1963 hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { 1964 hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1964 struct fib_info *fi = fa->fa_info; 1965 struct fib_info *fi = fa->fa_info;
1965 1966
1966 if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) { 1967 if (!fi || !(fi->fib_flags & RTNH_F_DEAD) ||
1968 tb->tb_id != fa->tb_id) {
1967 slen = fa->fa_slen; 1969 slen = fa->fa_slen;
1968 continue; 1970 continue;
1969 } 1971 }
@@ -1972,7 +1974,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
1972 n->key, 1974 n->key,
1973 KEYLENGTH - fa->fa_slen, 1975 KEYLENGTH - fa->fa_slen,
1974 fi, fa->fa_tos, fa->fa_type, 1976 fi, fa->fa_tos, fa->fa_type,
1975 tb->tb_id, 0); 1977 tb->tb_id);
1976 hlist_del_rcu(&fa->fa_list); 1978 hlist_del_rcu(&fa->fa_list);
1977 fib_release_info(fa->fa_info); 1979 fib_release_info(fa->fa_info);
1978 alias_free_mem_rcu(fa); 1980 alias_free_mem_rcu(fa);
@@ -2012,7 +2014,7 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l,
2012 2014
2013 call_fib_entry_notifier(nb, net, event_type, l->key, 2015 call_fib_entry_notifier(nb, net, event_type, l->key,
2014 KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, 2016 KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
2015 fa->fa_type, fa->tb_id, 0); 2017 fa->fa_type, fa->tb_id);
2016 } 2018 }
2017} 2019}
2018 2020
@@ -2386,7 +2388,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2386 2388
2387 seq_printf(seq, 2389 seq_printf(seq,
2388 "Basic info: size of leaf:" 2390 "Basic info: size of leaf:"
2389 " %Zd bytes, size of tnode: %Zd bytes.\n", 2391 " %zd bytes, size of tnode: %zd bytes.\n",
2390 LEAF_SIZE, TNODE_SIZE(0)); 2392 LEAF_SIZE, TNODE_SIZE(0));
2391 2393
2392 for (h = 0; h < FIB_TABLE_HASHSZ; h++) { 2394 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0777ea949223..fc310db2708b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net)
209 return *this_cpu_ptr(net->ipv4.icmp_sk); 209 return *this_cpu_ptr(net->ipv4.icmp_sk);
210} 210}
211 211
212/* Called with BH disabled */
212static inline struct sock *icmp_xmit_lock(struct net *net) 213static inline struct sock *icmp_xmit_lock(struct net *net)
213{ 214{
214 struct sock *sk; 215 struct sock *sk;
215 216
216 local_bh_disable();
217
218 sk = icmp_sk(net); 217 sk = icmp_sk(net);
219 218
220 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { 219 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
221 /* This can happen if the output path signals a 220 /* This can happen if the output path signals a
222 * dst_link_failure() for an outgoing ICMP packet. 221 * dst_link_failure() for an outgoing ICMP packet.
223 */ 222 */
224 local_bh_enable();
225 return NULL; 223 return NULL;
226 } 224 }
227 return sk; 225 return sk;
@@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
229 227
230static inline void icmp_xmit_unlock(struct sock *sk) 228static inline void icmp_xmit_unlock(struct sock *sk)
231{ 229{
232 spin_unlock_bh(&sk->sk_lock.slock); 230 spin_unlock(&sk->sk_lock.slock);
233} 231}
234 232
235int sysctl_icmp_msgs_per_sec __read_mostly = 1000; 233int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
@@ -282,6 +280,33 @@ bool icmp_global_allow(void)
282} 280}
283EXPORT_SYMBOL(icmp_global_allow); 281EXPORT_SYMBOL(icmp_global_allow);
284 282
283static bool icmpv4_mask_allow(struct net *net, int type, int code)
284{
285 if (type > NR_ICMP_TYPES)
286 return true;
287
288 /* Don't limit PMTU discovery. */
289 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
290 return true;
291
292 /* Limit if icmp type is enabled in ratemask. */
293 if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
294 return true;
295
296 return false;
297}
298
299static bool icmpv4_global_allow(struct net *net, int type, int code)
300{
301 if (icmpv4_mask_allow(net, type, code))
302 return true;
303
304 if (icmp_global_allow())
305 return true;
306
307 return false;
308}
309
285/* 310/*
286 * Send an ICMP frame. 311 * Send an ICMP frame.
287 */ 312 */
@@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
290 struct flowi4 *fl4, int type, int code) 315 struct flowi4 *fl4, int type, int code)
291{ 316{
292 struct dst_entry *dst = &rt->dst; 317 struct dst_entry *dst = &rt->dst;
318 struct inet_peer *peer;
293 bool rc = true; 319 bool rc = true;
320 int vif;
294 321
295 if (type > NR_ICMP_TYPES) 322 if (icmpv4_mask_allow(net, type, code))
296 goto out;
297
298 /* Don't limit PMTU discovery. */
299 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
300 goto out; 323 goto out;
301 324
302 /* No rate limit on loopback */ 325 /* No rate limit on loopback */
303 if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) 326 if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
304 goto out; 327 goto out;
305 328
306 /* Limit if icmp type is enabled in ratemask. */ 329 vif = l3mdev_master_ifindex(dst->dev);
307 if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) 330 peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
308 goto out; 331 rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
309 332 if (peer)
310 rc = false; 333 inet_putpeer(peer);
311 if (icmp_global_allow()) {
312 int vif = l3mdev_master_ifindex(dst->dev);
313 struct inet_peer *peer;
314
315 peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
316 rc = inet_peer_xrlim_allow(peer,
317 net->ipv4.sysctl_icmp_ratelimit);
318 if (peer)
319 inet_putpeer(peer);
320 }
321out: 334out:
322 return rc; 335 return rc;
323} 336}
@@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
396 struct inet_sock *inet; 409 struct inet_sock *inet;
397 __be32 daddr, saddr; 410 __be32 daddr, saddr;
398 u32 mark = IP4_REPLY_MARK(net, skb->mark); 411 u32 mark = IP4_REPLY_MARK(net, skb->mark);
412 int type = icmp_param->data.icmph.type;
413 int code = icmp_param->data.icmph.code;
399 414
400 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) 415 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
401 return; 416 return;
402 417
418 /* Needed by both icmp_global_allow and icmp_xmit_lock */
419 local_bh_disable();
420
421 /* global icmp_msgs_per_sec */
422 if (!icmpv4_global_allow(net, type, code))
423 goto out_bh_enable;
424
403 sk = icmp_xmit_lock(net); 425 sk = icmp_xmit_lock(net);
404 if (!sk) 426 if (!sk)
405 return; 427 goto out_bh_enable;
406 inet = inet_sk(sk); 428 inet = inet_sk(sk);
407 429
408 icmp_param->data.icmph.checksum = 0; 430 icmp_param->data.icmph.checksum = 0;
@@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
433 rt = ip_route_output_key(net, &fl4); 455 rt = ip_route_output_key(net, &fl4);
434 if (IS_ERR(rt)) 456 if (IS_ERR(rt))
435 goto out_unlock; 457 goto out_unlock;
436 if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, 458 if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
437 icmp_param->data.icmph.code))
438 icmp_push_reply(icmp_param, &fl4, &ipc, &rt); 459 icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
439 ip_rt_put(rt); 460 ip_rt_put(rt);
440out_unlock: 461out_unlock:
441 icmp_xmit_unlock(sk); 462 icmp_xmit_unlock(sk);
463out_bh_enable:
464 local_bh_enable();
442} 465}
443 466
444#ifdef CONFIG_IP_ROUTE_MULTIPATH 467#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
571{ 594{
572 struct iphdr *iph; 595 struct iphdr *iph;
573 int room; 596 int room;
574 struct icmp_bxm *icmp_param; 597 struct icmp_bxm icmp_param;
575 struct rtable *rt = skb_rtable(skb_in); 598 struct rtable *rt = skb_rtable(skb_in);
576 struct ipcm_cookie ipc; 599 struct ipcm_cookie ipc;
577 struct flowi4 fl4; 600 struct flowi4 fl4;
@@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
648 } 671 }
649 } 672 }
650 673
651 icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC); 674 /* Needed by both icmp_global_allow and icmp_xmit_lock */
652 if (!icmp_param) 675 local_bh_disable();
653 return; 676
677 /* Check global sysctl_icmp_msgs_per_sec ratelimit */
678 if (!icmpv4_global_allow(net, type, code))
679 goto out_bh_enable;
654 680
655 sk = icmp_xmit_lock(net); 681 sk = icmp_xmit_lock(net);
656 if (!sk) 682 if (!sk)
657 goto out_free; 683 goto out_bh_enable;
658 684
659 /* 685 /*
660 * Construct source address and options. 686 * Construct source address and options.
@@ -681,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
681 iph->tos; 707 iph->tos;
682 mark = IP4_REPLY_MARK(net, skb_in->mark); 708 mark = IP4_REPLY_MARK(net, skb_in->mark);
683 709
684 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) 710 if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
685 goto out_unlock; 711 goto out_unlock;
686 712
687 713
@@ -689,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
689 * Prepare data for ICMP header. 715 * Prepare data for ICMP header.
690 */ 716 */
691 717
692 icmp_param->data.icmph.type = type; 718 icmp_param.data.icmph.type = type;
693 icmp_param->data.icmph.code = code; 719 icmp_param.data.icmph.code = code;
694 icmp_param->data.icmph.un.gateway = info; 720 icmp_param.data.icmph.un.gateway = info;
695 icmp_param->data.icmph.checksum = 0; 721 icmp_param.data.icmph.checksum = 0;
696 icmp_param->skb = skb_in; 722 icmp_param.skb = skb_in;
697 icmp_param->offset = skb_network_offset(skb_in); 723 icmp_param.offset = skb_network_offset(skb_in);
698 inet_sk(sk)->tos = tos; 724 inet_sk(sk)->tos = tos;
699 sk->sk_mark = mark; 725 sk->sk_mark = mark;
700 ipc.addr = iph->saddr; 726 ipc.addr = iph->saddr;
701 ipc.opt = &icmp_param->replyopts.opt; 727 ipc.opt = &icmp_param.replyopts.opt;
702 ipc.tx_flags = 0; 728 ipc.tx_flags = 0;
703 ipc.ttl = 0; 729 ipc.ttl = 0;
704 ipc.tos = -1; 730 ipc.tos = -1;
705 731
706 rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, 732 rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
707 type, code, icmp_param); 733 type, code, &icmp_param);
708 if (IS_ERR(rt)) 734 if (IS_ERR(rt))
709 goto out_unlock; 735 goto out_unlock;
710 736
737 /* peer icmp_ratelimit */
711 if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) 738 if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
712 goto ende; 739 goto ende;
713 740
@@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
716 room = dst_mtu(&rt->dst); 743 room = dst_mtu(&rt->dst);
717 if (room > 576) 744 if (room > 576)
718 room = 576; 745 room = 576;
719 room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen; 746 room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
720 room -= sizeof(struct icmphdr); 747 room -= sizeof(struct icmphdr);
721 748
722 icmp_param->data_len = skb_in->len - icmp_param->offset; 749 icmp_param.data_len = skb_in->len - icmp_param.offset;
723 if (icmp_param->data_len > room) 750 if (icmp_param.data_len > room)
724 icmp_param->data_len = room; 751 icmp_param.data_len = room;
725 icmp_param->head_len = sizeof(struct icmphdr); 752 icmp_param.head_len = sizeof(struct icmphdr);
726 753
727 icmp_push_reply(icmp_param, &fl4, &ipc, &rt); 754 icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
728ende: 755ende:
729 ip_rt_put(rt); 756 ip_rt_put(rt);
730out_unlock: 757out_unlock:
731 icmp_xmit_unlock(sk); 758 icmp_xmit_unlock(sk);
732out_free: 759out_bh_enable:
733 kfree(icmp_param); 760 local_bh_enable();
734out:; 761out:;
735} 762}
736EXPORT_SYMBOL(icmp_send); 763EXPORT_SYMBOL(icmp_send);
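
Taken together, the icmp.c changes split rate limiting into three layers consulted in order with bottom halves disabled: icmpv4_mask_allow() exempts out-of-range types, PMTU discovery, and types not enabled in sysctl_icmp_ratemask; icmpv4_global_allow() applies the global sysctl_icmp_msgs_per_sec budget before the per-CPU ICMP socket is locked; and icmpv4_xrlim_allow() applies the per-peer icmp_ratelimit after the route lookup. A compressed sketch of that decision order, with stub predicates standing in for the kernel helpers (these stubs are not the kernel implementations):

/* Hedged sketch of the layered ICMP rate-limit checks introduced above.
 * mask_allow()/global_allow()/peer_allow() are stand-ins for
 * icmpv4_mask_allow(), icmp_global_allow() and inet_peer_xrlim_allow().
 */
#include <stdbool.h>
#include <stdio.h>

static bool mask_allow(int type) { return type > 18; } /* e.g. type out of range */
static bool global_allow(void)   { return true; }      /* global budget available */
static bool peer_allow(void)     { return true; }      /* per-peer bucket allows */

static bool icmp_may_send(int type)
{
	/* Cheap exemptions first: unmasked or unknown types skip all limits. */
	if (mask_allow(type))
		return true;
	/* Then the global per-second budget, checked before taking the socket. */
	if (!global_allow())
		return false;
	/* Finally the per-destination token bucket, after the route lookup. */
	return peer_allow();
}

int main(void)
{
	printf("type 3 allowed: %d\n", icmp_may_send(3));
	printf("type 42 allowed: %d\n", icmp_may_send(42));
	return 0;
}
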
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5b15459955f8..44fd86de2823 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1172,6 +1172,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
1172 psf->sf_crcount = im->crcount; 1172 psf->sf_crcount = im->crcount;
1173 } 1173 }
1174 in_dev_put(pmc->interface); 1174 in_dev_put(pmc->interface);
1175 kfree(pmc);
1175 } 1176 }
1176 spin_unlock_bh(&im->lock); 1177 spin_unlock_bh(&im->lock);
1177} 1178}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 19ea045c50ed..5e313c1ac94f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -31,6 +31,86 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
31EXPORT_SYMBOL(inet_csk_timer_bug_msg); 31EXPORT_SYMBOL(inet_csk_timer_bug_msg);
32#endif 32#endif
33 33
34#if IS_ENABLED(CONFIG_IPV6)
35/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
36 * only, and any IPv4 addresses if not IPv6 only
37 * match_wildcard == false: addresses must be exactly the same, i.e.
38 * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
39 * and 0.0.0.0 equals to 0.0.0.0 only
40 */
41static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
42 const struct in6_addr *sk2_rcv_saddr6,
43 __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
44 bool sk1_ipv6only, bool sk2_ipv6only,
45 bool match_wildcard)
46{
47 int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
48 int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
49
50 /* if both are mapped, treat as IPv4 */
51 if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
52 if (!sk2_ipv6only) {
53 if (sk1_rcv_saddr == sk2_rcv_saddr)
54 return 1;
55 if (!sk1_rcv_saddr || !sk2_rcv_saddr)
56 return match_wildcard;
57 }
58 return 0;
59 }
60
61 if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
62 return 1;
63
64 if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
65 !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
66 return 1;
67
68 if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
69 !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
70 return 1;
71
72 if (sk2_rcv_saddr6 &&
73 ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
74 return 1;
75
76 return 0;
77}
78#endif
79
80/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
81 * match_wildcard == false: addresses must be exactly the same, i.e.
82 * 0.0.0.0 only equals to 0.0.0.0
83 */
84static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
85 bool sk2_ipv6only, bool match_wildcard)
86{
87 if (!sk2_ipv6only) {
88 if (sk1_rcv_saddr == sk2_rcv_saddr)
89 return 1;
90 if (!sk1_rcv_saddr || !sk2_rcv_saddr)
91 return match_wildcard;
92 }
93 return 0;
94}
95
96int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
97 bool match_wildcard)
98{
99#if IS_ENABLED(CONFIG_IPV6)
100 if (sk->sk_family == AF_INET6)
101 return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
102 inet6_rcv_saddr(sk2),
103 sk->sk_rcv_saddr,
104 sk2->sk_rcv_saddr,
105 ipv6_only_sock(sk),
106 ipv6_only_sock(sk2),
107 match_wildcard);
108#endif
109 return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
110 ipv6_only_sock(sk2), match_wildcard);
111}
112EXPORT_SYMBOL(inet_rcv_saddr_equal);
113
34void inet_get_local_port_range(struct net *net, int *low, int *high) 114void inet_get_local_port_range(struct net *net, int *low, int *high)
35{ 115{
36 unsigned int seq; 116 unsigned int seq;
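
ipv4_rcv_saddr_equal() above is the core of the new bind-conflict address rule: identical addresses always compare equal, a 0.0.0.0 binding matches any address only when match_wildcard is true, and an IPv6-only peer never matches an IPv4 address. A userspace transcription of the helper with a few spot checks, using uint32_t and bool in place of the kernel's __be32 (illustrative only):

/* Hedged sketch: the ipv4_rcv_saddr_equal() rule from the hunk above,
 * transcribed for userspace with uint32_t instead of __be32.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static int rcv_saddr_equal(uint32_t a, uint32_t b, bool b_ipv6only,
			   bool match_wildcard)
{
	if (!b_ipv6only) {
		if (a == b)
			return 1;
		if (!a || !b)               /* one side bound to 0.0.0.0 */
			return match_wildcard;
	}
	return 0;
}

int main(void)
{
	uint32_t any = 0, addr = 0x0100007f; /* 127.0.0.1; byte order irrelevant here */

	assert(rcv_saddr_equal(addr, addr, false, false) == 1); /* exact match */
	assert(rcv_saddr_equal(any,  addr, false, true)  == 1); /* wildcard allowed */
	assert(rcv_saddr_equal(any,  addr, false, false) == 0); /* wildcard not allowed */
	assert(rcv_saddr_equal(addr, addr, true,  true)  == 0); /* peer is IPv6-only */
	return 0;
}
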
@@ -44,9 +124,9 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
44} 124}
45EXPORT_SYMBOL(inet_get_local_port_range); 125EXPORT_SYMBOL(inet_get_local_port_range);
46 126
47int inet_csk_bind_conflict(const struct sock *sk, 127static int inet_csk_bind_conflict(const struct sock *sk,
48 const struct inet_bind_bucket *tb, bool relax, 128 const struct inet_bind_bucket *tb,
49 bool reuseport_ok) 129 bool relax, bool reuseport_ok)
50{ 130{
51 struct sock *sk2; 131 struct sock *sk2;
52 bool reuse = sk->sk_reuse; 132 bool reuse = sk->sk_reuse;
@@ -62,7 +142,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
62 142
63 sk_for_each_bound(sk2, &tb->owners) { 143 sk_for_each_bound(sk2, &tb->owners) {
64 if (sk != sk2 && 144 if (sk != sk2 &&
65 !inet_v6_ipv6only(sk2) &&
66 (!sk->sk_bound_dev_if || 145 (!sk->sk_bound_dev_if ||
67 !sk2->sk_bound_dev_if || 146 !sk2->sk_bound_dev_if ||
68 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 147 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
@@ -72,54 +151,34 @@ int inet_csk_bind_conflict(const struct sock *sk,
72 rcu_access_pointer(sk->sk_reuseport_cb) || 151 rcu_access_pointer(sk->sk_reuseport_cb) ||
73 (sk2->sk_state != TCP_TIME_WAIT && 152 (sk2->sk_state != TCP_TIME_WAIT &&
74 !uid_eq(uid, sock_i_uid(sk2))))) { 153 !uid_eq(uid, sock_i_uid(sk2))))) {
75 154 if (inet_rcv_saddr_equal(sk, sk2, true))
76 if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
77 sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
78 break; 155 break;
79 } 156 }
80 if (!relax && reuse && sk2->sk_reuse && 157 if (!relax && reuse && sk2->sk_reuse &&
81 sk2->sk_state != TCP_LISTEN) { 158 sk2->sk_state != TCP_LISTEN) {
82 159 if (inet_rcv_saddr_equal(sk, sk2, true))
83 if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
84 sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
85 break; 160 break;
86 } 161 }
87 } 162 }
88 } 163 }
89 return sk2 != NULL; 164 return sk2 != NULL;
90} 165}
91EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
92 166
93/* Obtain a reference to a local port for the given sock, 167/*
94 * if snum is zero it means select any available local port. 168 * Find an open port number for the socket. Returns with the
95 * We try to allocate an odd port (and leave even ports for connect()) 169 * inet_bind_hashbucket lock held.
96 */ 170 */
97int inet_csk_get_port(struct sock *sk, unsigned short snum) 171static struct inet_bind_hashbucket *
172inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
98{ 173{
99 bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
100 struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; 174 struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
101 int ret = 1, attempts = 5, port = snum; 175 int port = 0;
102 int smallest_size = -1, smallest_port;
103 struct inet_bind_hashbucket *head; 176 struct inet_bind_hashbucket *head;
104 struct net *net = sock_net(sk); 177 struct net *net = sock_net(sk);
105 int i, low, high, attempt_half; 178 int i, low, high, attempt_half;
106 struct inet_bind_bucket *tb; 179 struct inet_bind_bucket *tb;
107 kuid_t uid = sock_i_uid(sk);
108 u32 remaining, offset; 180 u32 remaining, offset;
109 bool reuseport_ok = !!snum;
110 181
111 if (port) {
112have_port:
113 head = &hinfo->bhash[inet_bhashfn(net, port,
114 hinfo->bhash_size)];
115 spin_lock_bh(&head->lock);
116 inet_bind_bucket_for_each(tb, &head->chain)
117 if (net_eq(ib_net(tb), net) && tb->port == port)
118 goto tb_found;
119
120 goto tb_not_found;
121 }
122again:
123 attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; 182 attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
124other_half_scan: 183other_half_scan:
125 inet_get_local_port_range(net, &low, &high); 184 inet_get_local_port_range(net, &low, &high);
@@ -143,8 +202,6 @@ other_half_scan:
143 * We do the opposite to not pollute connect() users. 202 * We do the opposite to not pollute connect() users.
144 */ 203 */
145 offset |= 1U; 204 offset |= 1U;
146 smallest_size = -1;
147 smallest_port = low; /* avoid compiler warning */
148 205
149other_parity_scan: 206other_parity_scan:
150 port = low + offset; 207 port = low + offset;
@@ -158,30 +215,17 @@ other_parity_scan:
158 spin_lock_bh(&head->lock); 215 spin_lock_bh(&head->lock);
159 inet_bind_bucket_for_each(tb, &head->chain) 216 inet_bind_bucket_for_each(tb, &head->chain)
160 if (net_eq(ib_net(tb), net) && tb->port == port) { 217 if (net_eq(ib_net(tb), net) && tb->port == port) {
161 if (((tb->fastreuse > 0 && reuse) || 218 if (!inet_csk_bind_conflict(sk, tb, false, false))
162 (tb->fastreuseport > 0 && 219 goto success;
163 sk->sk_reuseport &&
164 !rcu_access_pointer(sk->sk_reuseport_cb) &&
165 uid_eq(tb->fastuid, uid))) &&
166 (tb->num_owners < smallest_size || smallest_size == -1)) {
167 smallest_size = tb->num_owners;
168 smallest_port = port;
169 }
170 if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false,
171 reuseport_ok))
172 goto tb_found;
173 goto next_port; 220 goto next_port;
174 } 221 }
175 goto tb_not_found; 222 tb = NULL;
223 goto success;
176next_port: 224next_port:
177 spin_unlock_bh(&head->lock); 225 spin_unlock_bh(&head->lock);
178 cond_resched(); 226 cond_resched();
179 } 227 }
180 228
181 if (smallest_size != -1) {
182 port = smallest_port;
183 goto have_port;
184 }
185 offset--; 229 offset--;
186 if (!(offset & 1)) 230 if (!(offset & 1))
187 goto other_parity_scan; 231 goto other_parity_scan;
@@ -191,8 +235,74 @@ next_port:
191 attempt_half = 2; 235 attempt_half = 2;
192 goto other_half_scan; 236 goto other_half_scan;
193 } 237 }
194 return ret; 238 return NULL;
239success:
240 *port_ret = port;
241 *tb_ret = tb;
242 return head;
243}
195 244
245static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
246 struct sock *sk)
247{
248 kuid_t uid = sock_i_uid(sk);
249
250 if (tb->fastreuseport <= 0)
251 return 0;
252 if (!sk->sk_reuseport)
253 return 0;
254 if (rcu_access_pointer(sk->sk_reuseport_cb))
255 return 0;
256 if (!uid_eq(tb->fastuid, uid))
257 return 0;
258 /* We only need to check the rcv_saddr if this tb was once marked
259 * without fastreuseport and then was reset, as we can only know that
260 * the fast_*rcv_saddr doesn't have any conflicts with the socks on the
261 * owners list.
262 */
263 if (tb->fastreuseport == FASTREUSEPORT_ANY)
264 return 1;
265#if IS_ENABLED(CONFIG_IPV6)
266 if (tb->fast_sk_family == AF_INET6)
267 return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
268 &sk->sk_v6_rcv_saddr,
269 tb->fast_rcv_saddr,
270 sk->sk_rcv_saddr,
271 tb->fast_ipv6_only,
272 ipv6_only_sock(sk), true);
273#endif
274 return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
275 ipv6_only_sock(sk), true);
276}
277
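
sk_reuseport_match() above is the fast path that lets a new SO_REUSEPORT listener skip the full walk of the bucket's owners: the bucket caches the owning uid and, once it has been downgraded to FASTREUSEPORT_STRICT, the last bound receive address, so a later bind only compares against those cached values. A cut-down model of the decision over plain structs (stand-in types, not the kernel structures):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    enum { FASTREUSEPORT_ANY = 1, FASTREUSEPORT_STRICT = 2 };

    /* Field names mirror the patch, but these are illustrative types only. */
    struct bucket {
        int      fastreuseport;          /* 0, ANY or STRICT */
        uint32_t fastuid;
        uint32_t fast_rcv_saddr;
    };

    struct sockctx {
        bool     reuseport;
        uint32_t uid;
        uint32_t rcv_saddr;
    };

    static bool reuseport_fast_match(const struct bucket *tb, const struct sockctx *sk)
    {
        if (tb->fastreuseport <= 0 || !sk->reuseport)
            return false;
        if (tb->fastuid != sk->uid)
            return false;
        /* ANY: every current owner joined via this fast path, so no address
         * comparison is needed.  STRICT: the cached address must match too.
         */
        if (tb->fastreuseport == FASTREUSEPORT_ANY)
            return true;
        return tb->fast_rcv_saddr == sk->rcv_saddr;
    }

    int main(void)
    {
        struct bucket tb = { FASTREUSEPORT_STRICT, 1000, 0x7f000001 };
        struct sockctx sk = { true, 1000, 0x7f000001 };

        printf("fast path: %s\n", reuseport_fast_match(&tb, &sk) ? "yes" : "no");
        return 0;
    }
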
278/* Obtain a reference to a local port for the given sock,
279 * if snum is zero it means select any available local port.
280 * We try to allocate an odd port (and leave even ports for connect())
281 */
282int inet_csk_get_port(struct sock *sk, unsigned short snum)
283{
284 bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
285 struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
286 int ret = 1, port = snum;
287 struct inet_bind_hashbucket *head;
288 struct net *net = sock_net(sk);
289 struct inet_bind_bucket *tb = NULL;
290 kuid_t uid = sock_i_uid(sk);
291
292 if (!port) {
293 head = inet_csk_find_open_port(sk, &tb, &port);
294 if (!head)
295 return ret;
296 if (!tb)
297 goto tb_not_found;
298 goto success;
299 }
300 head = &hinfo->bhash[inet_bhashfn(net, port,
301 hinfo->bhash_size)];
302 spin_lock_bh(&head->lock);
303 inet_bind_bucket_for_each(tb, &head->chain)
304 if (net_eq(ib_net(tb), net) && tb->port == port)
305 goto tb_found;
196tb_not_found: 306tb_not_found:
197 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 307 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
198 net, head, port); 308 net, head, port);
@@ -203,39 +313,54 @@ tb_found:
203 if (sk->sk_reuse == SK_FORCE_REUSE) 313 if (sk->sk_reuse == SK_FORCE_REUSE)
204 goto success; 314 goto success;
205 315
206 if (((tb->fastreuse > 0 && reuse) || 316 if ((tb->fastreuse > 0 && reuse) ||
207 (tb->fastreuseport > 0 && 317 sk_reuseport_match(tb, sk))
208 !rcu_access_pointer(sk->sk_reuseport_cb) &&
209 sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
210 smallest_size == -1)
211 goto success; 318 goto success;
212 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true, 319 if (inet_csk_bind_conflict(sk, tb, true, true))
213 reuseport_ok)) {
214 if ((reuse ||
215 (tb->fastreuseport > 0 &&
216 sk->sk_reuseport &&
217 !rcu_access_pointer(sk->sk_reuseport_cb) &&
218 uid_eq(tb->fastuid, uid))) &&
219 !snum && smallest_size != -1 && --attempts >= 0) {
220 spin_unlock_bh(&head->lock);
221 goto again;
222 }
223 goto fail_unlock; 320 goto fail_unlock;
321 }
322success:
323 if (!hlist_empty(&tb->owners)) {
324 tb->fastreuse = reuse;
325 if (sk->sk_reuseport) {
326 tb->fastreuseport = FASTREUSEPORT_ANY;
327 tb->fastuid = uid;
328 tb->fast_rcv_saddr = sk->sk_rcv_saddr;
329 tb->fast_ipv6_only = ipv6_only_sock(sk);
330#if IS_ENABLED(CONFIG_IPV6)
331 tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
332#endif
333 } else {
334 tb->fastreuseport = 0;
224 } 335 }
336 } else {
225 if (!reuse) 337 if (!reuse)
226 tb->fastreuse = 0; 338 tb->fastreuse = 0;
227 if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
228 tb->fastreuseport = 0;
229 } else {
230 tb->fastreuse = reuse;
231 if (sk->sk_reuseport) { 339 if (sk->sk_reuseport) {
232 tb->fastreuseport = 1; 340 /* We didn't match or we don't have fastreuseport set on
233 tb->fastuid = uid; 341 * the tb, but we have sk_reuseport set on this socket
342 * and we know that there are no bind conflicts with
343 * this socket in this tb, so reset our tb's reuseport
344 * settings so that any subsequent sockets that match
345 * our current socket will be put on the fast path.
346 *
347 * If we reset we need to set FASTREUSEPORT_STRICT so we
348 * do extra checking for all subsequent sk_reuseport
349 * socks.
350 */
351 if (!sk_reuseport_match(tb, sk)) {
352 tb->fastreuseport = FASTREUSEPORT_STRICT;
353 tb->fastuid = uid;
354 tb->fast_rcv_saddr = sk->sk_rcv_saddr;
355 tb->fast_ipv6_only = ipv6_only_sock(sk);
356#if IS_ENABLED(CONFIG_IPV6)
357 tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
358#endif
359 }
234 } else { 360 } else {
235 tb->fastreuseport = 0; 361 tb->fastreuseport = 0;
236 } 362 }
237 } 363 }
238success:
239 if (!inet_csk(sk)->icsk_bind_hash) 364 if (!inet_csk(sk)->icsk_bind_hash)
240 inet_bind_hash(sk, tb, port); 365 inet_bind_hash(sk, tb, port);
241 WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); 366 WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
@@ -299,7 +424,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
299/* 424/*
300 * This will accept the next outstanding connection. 425 * This will accept the next outstanding connection.
301 */ 426 */
302struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) 427struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
303{ 428{
304 struct inet_connection_sock *icsk = inet_csk(sk); 429 struct inet_connection_sock *icsk = inet_csk(sk);
305 struct request_sock_queue *queue = &icsk->icsk_accept_queue; 430 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
@@ -711,9 +836,8 @@ void inet_csk_destroy_sock(struct sock *sk)
711 836
712 sk_refcnt_debug_release(sk); 837 sk_refcnt_debug_release(sk);
713 838
714 local_bh_disable();
715 percpu_counter_dec(sk->sk_prot->orphan_count); 839 percpu_counter_dec(sk->sk_prot->orphan_count);
716 local_bh_enable(); 840
717 sock_put(sk); 841 sock_put(sk);
718} 842}
719EXPORT_SYMBOL(inet_csk_destroy_sock); 843EXPORT_SYMBOL(inet_csk_destroy_sock);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 4dea33e5f295..3828b3a805cd 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -215,7 +215,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
215 } 215 }
216 216
217 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 217 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
218 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 218 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
219 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 219 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
220 r->idiag_timer = 1; 220 r->idiag_timer = 1;
221 r->idiag_retrans = icsk->icsk_retransmits; 221 r->idiag_retrans = icsk->icsk_retransmits;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ca97835bfec4..8bea74298173 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -73,7 +73,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
73 tb->port = snum; 73 tb->port = snum;
74 tb->fastreuse = 0; 74 tb->fastreuse = 0;
75 tb->fastreuseport = 0; 75 tb->fastreuseport = 0;
76 tb->num_owners = 0;
77 INIT_HLIST_HEAD(&tb->owners); 76 INIT_HLIST_HEAD(&tb->owners);
78 hlist_add_head(&tb->node, &head->chain); 77 hlist_add_head(&tb->node, &head->chain);
79 } 78 }
@@ -96,7 +95,6 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
96{ 95{
97 inet_sk(sk)->inet_num = snum; 96 inet_sk(sk)->inet_num = snum;
98 sk_add_bind_node(sk, &tb->owners); 97 sk_add_bind_node(sk, &tb->owners);
99 tb->num_owners++;
100 inet_csk(sk)->icsk_bind_hash = tb; 98 inet_csk(sk)->icsk_bind_hash = tb;
101} 99}
102 100
@@ -114,7 +112,6 @@ static void __inet_put_port(struct sock *sk)
114 spin_lock(&head->lock); 112 spin_lock(&head->lock);
115 tb = inet_csk(sk)->icsk_bind_hash; 113 tb = inet_csk(sk)->icsk_bind_hash;
116 __sk_del_bind_node(sk); 114 __sk_del_bind_node(sk);
117 tb->num_owners--;
118 inet_csk(sk)->icsk_bind_hash = NULL; 115 inet_csk(sk)->icsk_bind_hash = NULL;
119 inet_sk(sk)->inet_num = 0; 116 inet_sk(sk)->inet_num = 0;
120 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 117 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
@@ -435,10 +432,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
435EXPORT_SYMBOL_GPL(inet_ehash_nolisten); 432EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
436 433
437static int inet_reuseport_add_sock(struct sock *sk, 434static int inet_reuseport_add_sock(struct sock *sk,
438 struct inet_listen_hashbucket *ilb, 435 struct inet_listen_hashbucket *ilb)
439 int (*saddr_same)(const struct sock *sk1,
440 const struct sock *sk2,
441 bool match_wildcard))
442{ 436{
443 struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 437 struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
444 struct sock *sk2; 438 struct sock *sk2;
@@ -451,7 +445,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
451 sk2->sk_bound_dev_if == sk->sk_bound_dev_if && 445 sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
452 inet_csk(sk2)->icsk_bind_hash == tb && 446 inet_csk(sk2)->icsk_bind_hash == tb &&
453 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 447 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
454 saddr_same(sk, sk2, false)) 448 inet_rcv_saddr_equal(sk, sk2, false))
455 return reuseport_add_sock(sk, sk2); 449 return reuseport_add_sock(sk, sk2);
456 } 450 }
457 451
@@ -461,10 +455,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
461 return 0; 455 return 0;
462} 456}
463 457
464int __inet_hash(struct sock *sk, struct sock *osk, 458int __inet_hash(struct sock *sk, struct sock *osk)
465 int (*saddr_same)(const struct sock *sk1,
466 const struct sock *sk2,
467 bool match_wildcard))
468{ 459{
469 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 460 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
470 struct inet_listen_hashbucket *ilb; 461 struct inet_listen_hashbucket *ilb;
@@ -479,7 +470,7 @@ int __inet_hash(struct sock *sk, struct sock *osk,
479 470
480 spin_lock(&ilb->lock); 471 spin_lock(&ilb->lock);
481 if (sk->sk_reuseport) { 472 if (sk->sk_reuseport) {
482 err = inet_reuseport_add_sock(sk, ilb, saddr_same); 473 err = inet_reuseport_add_sock(sk, ilb);
483 if (err) 474 if (err)
484 goto unlock; 475 goto unlock;
485 } 476 }
@@ -503,7 +494,7 @@ int inet_hash(struct sock *sk)
503 494
504 if (sk->sk_state != TCP_CLOSE) { 495 if (sk->sk_state != TCP_CLOSE) {
505 local_bh_disable(); 496 local_bh_disable();
506 err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal); 497 err = __inet_hash(sk, NULL);
507 local_bh_enable(); 498 local_bh_enable();
508 } 499 }
509 500
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index ddcd56c08d14..f8aff2c71cde 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -257,8 +257,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
257} 257}
258EXPORT_SYMBOL_GPL(__inet_twsk_schedule); 258EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
259 259
260void inet_twsk_purge(struct inet_hashinfo *hashinfo, 260void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
261 struct inet_timewait_death_row *twdr, int family)
262{ 261{
263 struct inet_timewait_sock *tw; 262 struct inet_timewait_sock *tw;
264 struct sock *sk; 263 struct sock *sk;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index bbe7f72db9c1..b3cdeec85f1f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -198,6 +198,7 @@ static void ip_expire(unsigned long arg)
198 qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); 198 qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
199 net = container_of(qp->q.net, struct net, ipv4.frags); 199 net = container_of(qp->q.net, struct net, ipv4.frags);
200 200
201 rcu_read_lock();
201 spin_lock(&qp->q.lock); 202 spin_lock(&qp->q.lock);
202 203
203 if (qp->q.flags & INET_FRAG_COMPLETE) 204 if (qp->q.flags & INET_FRAG_COMPLETE)
@@ -207,7 +208,7 @@ static void ip_expire(unsigned long arg)
207 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); 208 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
208 209
209 if (!inet_frag_evicting(&qp->q)) { 210 if (!inet_frag_evicting(&qp->q)) {
210 struct sk_buff *head = qp->q.fragments; 211 struct sk_buff *clone, *head = qp->q.fragments;
211 const struct iphdr *iph; 212 const struct iphdr *iph;
212 int err; 213 int err;
213 214
@@ -216,32 +217,40 @@ static void ip_expire(unsigned long arg)
216 if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) 217 if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
217 goto out; 218 goto out;
218 219
219 rcu_read_lock();
220 head->dev = dev_get_by_index_rcu(net, qp->iif); 220 head->dev = dev_get_by_index_rcu(net, qp->iif);
221 if (!head->dev) 221 if (!head->dev)
222 goto out_rcu_unlock; 222 goto out;
223
223 224
224 /* skb has no dst, perform route lookup again */ 225 /* skb has no dst, perform route lookup again */
225 iph = ip_hdr(head); 226 iph = ip_hdr(head);
226 err = ip_route_input_noref(head, iph->daddr, iph->saddr, 227 err = ip_route_input_noref(head, iph->daddr, iph->saddr,
227 iph->tos, head->dev); 228 iph->tos, head->dev);
228 if (err) 229 if (err)
229 goto out_rcu_unlock; 230 goto out;
230 231
231 /* Only an end host needs to send an ICMP 232 /* Only an end host needs to send an ICMP
232 * "Fragment Reassembly Timeout" message, per RFC792. 233 * "Fragment Reassembly Timeout" message, per RFC792.
233 */ 234 */
234 if (frag_expire_skip_icmp(qp->user) && 235 if (frag_expire_skip_icmp(qp->user) &&
235 (skb_rtable(head)->rt_type != RTN_LOCAL)) 236 (skb_rtable(head)->rt_type != RTN_LOCAL))
236 goto out_rcu_unlock; 237 goto out;
238
239 clone = skb_clone(head, GFP_ATOMIC);
237 240
238 /* Send an ICMP "Fragment Reassembly Timeout" message. */ 241 /* Send an ICMP "Fragment Reassembly Timeout" message. */
239 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); 242 if (clone) {
240out_rcu_unlock: 243 spin_unlock(&qp->q.lock);
241 rcu_read_unlock(); 244 icmp_send(clone, ICMP_TIME_EXCEEDED,
245 ICMP_EXC_FRAGTIME, 0);
246 consume_skb(clone);
247 goto out_rcu_unlock;
248 }
242 } 249 }
243out: 250out:
244 spin_unlock(&qp->q.lock); 251 spin_unlock(&qp->q.lock);
252out_rcu_unlock:
253 rcu_read_unlock();
245 ipq_put(qp); 254 ipq_put(qp);
246} 255}
247 256
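
The ip_expire() rework above follows a common pattern: icmp_send() may take its own locks and do non-trivial work, so the timer handler now clones the head skb while qp->q.lock is held, drops the lock, and sends the ICMP error from the clone. A generic userspace illustration of the same copy-under-the-lock, work-outside-the-lock idea (a pthread mutex standing in for the queue spinlock):

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
    static char queue_head[64] = "timed-out fragment";

    /* Stand-in for icmp_send(): slow work that must not run under q_lock. */
    static void send_time_exceeded(const char *pkt)
    {
        printf("ICMP time exceeded for: %s\n", pkt);
    }

    static void expire(void)
    {
        char clone[64];

        pthread_mutex_lock(&q_lock);
        /* Take a private copy while the queue is stable... */
        strncpy(clone, queue_head, sizeof(clone) - 1);
        clone[sizeof(clone) - 1] = '\0';
        pthread_mutex_unlock(&q_lock);

        /* ...then do the potentially slow send without holding the lock. */
        send_time_exceeded(clone);
    }

    int main(void)
    {
        expire();
        return 0;
    }
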
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b67719f45953..7a3fd25e8913 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -222,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
222 if (unlikely(!neigh)) 222 if (unlikely(!neigh))
223 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); 223 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
224 if (!IS_ERR(neigh)) { 224 if (!IS_ERR(neigh)) {
225 int res = dst_neigh_output(dst, neigh, skb); 225 int res;
226
227 sock_confirm_neigh(skb, neigh);
228 res = neigh_output(neigh, skb);
226 229
227 rcu_read_unlock_bh(); 230 rcu_read_unlock_bh();
228 return res; 231 return res;
@@ -886,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk,
886 889
887 skb->csum = 0; 890 skb->csum = 0;
888 891
892 if (flags & MSG_CONFIRM)
893 skb_set_dst_pending_confirm(skb, 1);
894
889 __skb_queue_tail(queue, skb); 895 __skb_queue_tail(queue, skb);
890 } else if (skb_is_gso(skb)) { 896 } else if (skb_is_gso(skb)) {
891 goto append; 897 goto append;
@@ -960,7 +966,7 @@ static int __ip_append_data(struct sock *sk,
960 cork->length += length; 966 cork->length += length;
961 if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && 967 if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
962 (sk->sk_protocol == IPPROTO_UDP) && 968 (sk->sk_protocol == IPPROTO_UDP) &&
963 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && 969 (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
964 (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { 970 (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
965 err = ip_ufo_append_data(sk, queue, getfrag, from, length, 971 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
966 hh_len, fragheaderlen, transhdrlen, 972 hh_len, fragheaderlen, transhdrlen,
@@ -1086,6 +1092,9 @@ alloc_new_skb:
1086 exthdrlen = 0; 1092 exthdrlen = 0;
1087 csummode = CHECKSUM_NONE; 1093 csummode = CHECKSUM_NONE;
1088 1094
1095 if ((flags & MSG_CONFIRM) && !skb_prev)
1096 skb_set_dst_pending_confirm(skb, 1);
1097
1089 /* 1098 /*
1090 * Put the packet on the pending queue. 1099 * Put the packet on the pending queue.
1091 */ 1100 */
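
skb_set_dst_pending_confirm() is driven by the MSG_CONFIRM flag that applications pass to send()/sendto(): the patch records the request on the skb so the neighbour entry is confirmed once the destination is known to be valid. A small userspace example of a UDP sender asking for that confirmation (the destination address and port are placeholders):

    #include <arpa/inet.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
        struct sockaddr_in dst = { 0 };
        const char msg[] = "ping";
        int fd;

        fd = socket(AF_INET, SOCK_DGRAM, 0);
        if (fd < 0) {
            perror("socket");
            return 1;
        }

        dst.sin_family = AF_INET;
        dst.sin_port = htons(9);                 /* discard port, demo only */
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

        /* MSG_CONFIRM tells the stack the peer is known reachable, so the
         * ARP/ND entry can be confirmed instead of re-probed.
         */
        if (sendto(fd, msg, sizeof(msg), MSG_CONFIRM,
                   (struct sockaddr *)&dst, sizeof(dst)) < 0)
            perror("sendto");

        close(fd);
        return 0;
    }
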
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 53ae0c6315ad..1d46d05efb0f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -116,10 +116,10 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
116 if (skb->ip_summed != CHECKSUM_COMPLETE) 116 if (skb->ip_summed != CHECKSUM_COMPLETE)
117 return; 117 return;
118 118
119 if (offset != 0) 119 if (offset != 0) {
120 csum = csum_sub(csum, 120 int tend_off = skb_transport_offset(skb) + tlen;
121 csum_partial(skb_transport_header(skb) + tlen, 121 csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0));
122 offset, 0)); 122 }
123 123
124 put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum); 124 put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
125} 125}
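
The IP_CHECKSUM cmsg fix above replaces a csum_partial() over linear data with skb_checksum(), which also walks paged fragments, and then subtracts the sum of the bytes that precede the requested offset. The arithmetic relies on the usual property of the 16-bit one's-complement sum that subtracting a partial sum is the same as adding its complement; a self-contained demonstration:

    #include <stdint.h>
    #include <stdio.h>

    /* RFC 1071 style 16-bit one's-complement sum (not folded or inverted). */
    static uint32_t csum_acc(const uint8_t *data, size_t len, uint32_t sum)
    {
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
            sum += (uint32_t)data[i] << 8 | data[i + 1];
        if (len & 1)
            sum += (uint32_t)data[len - 1] << 8;
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return sum;
    }

    /* Subtracting a partial sum is adding its one's complement. */
    static uint32_t csum_sub(uint32_t sum, uint32_t part)
    {
        sum += (~part) & 0xffff;
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return sum;
    }

    int main(void)
    {
        uint8_t pkt[] = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88 };
        uint32_t whole  = csum_acc(pkt, sizeof(pkt), 0);
        uint32_t prefix = csum_acc(pkt, 4, 0);
        uint32_t tail   = csum_acc(pkt + 4, sizeof(pkt) - 4, 0);

        /* Removing the first 4 bytes from the running sum leaves the sum of
         * the remaining bytes, which is what the cmsg path relies on.
         */
        printf("%s\n", csum_sub(whole, prefix) == tail ? "match" : "mismatch");
        return 0;
    }
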
@@ -272,7 +272,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
272 continue; 272 continue;
273 switch (cmsg->cmsg_type) { 273 switch (cmsg->cmsg_type) {
274 case IP_RETOPTS: 274 case IP_RETOPTS:
275 err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); 275 err = cmsg->cmsg_len - sizeof(struct cmsghdr);
276 276
277 /* Our caller is responsible for freeing ipc->opt */ 277 /* Our caller is responsible for freeing ipc->opt */
278 err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), 278 err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
@@ -488,16 +488,15 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk,
488 return false; 488 return false;
489 489
490 /* Support IP_PKTINFO on tstamp packets if requested, to correlate 490 /* Support IP_PKTINFO on tstamp packets if requested, to correlate
491 * timestamp with egress dev. Not possible for packets without dev 491 * timestamp with egress dev. Not possible for packets without iif
492 * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). 492 * or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
493 */ 493 */
494 if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || 494 info = PKTINFO_SKB_CB(skb);
495 (!skb->dev)) 495 if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) ||
496 !info->ipi_ifindex)
496 return false; 497 return false;
497 498
498 info = PKTINFO_SKB_CB(skb);
499 info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; 499 info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
500 info->ipi_ifindex = skb->dev->ifindex;
501 return true; 500 return true;
502} 501}
503 502
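
The timestamping change above stops dereferencing skb->dev (which may already be gone when the error-queue message is read) and instead reports the interface index stored in the pktinfo control block. For context, this is the same IP_PKTINFO ancillary record a normal receiver sees; a minimal example that prints the delivering ifindex for one datagram (the port number is arbitrary and error handling is trimmed):

    #define _GNU_SOURCE
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <unistd.h>

    int main(void)
    {
        struct sockaddr_in addr = { 0 };
        char data[512], ctrl[256];
        struct iovec iov = { data, sizeof(data) };
        struct msghdr msg = { 0 };
        struct cmsghdr *cm;
        int fd, one = 1;

        fd = socket(AF_INET, SOCK_DGRAM, 0);
        addr.sin_family = AF_INET;
        addr.sin_port = htons(5000);
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        bind(fd, (struct sockaddr *)&addr, sizeof(addr));
        setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &one, sizeof(one));

        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = ctrl;
        msg.msg_controllen = sizeof(ctrl);

        /* Blocks until one datagram arrives on port 5000. */
        if (recvmsg(fd, &msg, 0) < 0) {
            perror("recvmsg");
            return 1;
        }
        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
            if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_PKTINFO) {
                struct in_pktinfo pi;

                memcpy(&pi, CMSG_DATA(cm), sizeof(pi));
                printf("received on ifindex %d\n", pi.ipi_ifindex);
            }
        }
        close(fd);
        return 0;
    }
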
@@ -591,6 +590,7 @@ static bool setsockopt_needs_rtnl(int optname)
591 case MCAST_LEAVE_GROUP: 590 case MCAST_LEAVE_GROUP:
592 case MCAST_LEAVE_SOURCE_GROUP: 591 case MCAST_LEAVE_SOURCE_GROUP:
593 case MCAST_UNBLOCK_SOURCE: 592 case MCAST_UNBLOCK_SOURCE:
593 case IP_ROUTER_ALERT:
594 return true; 594 return true;
595 } 595 }
596 return false; 596 return false;
@@ -843,6 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
843 { 843 {
844 struct ip_mreqn mreq; 844 struct ip_mreqn mreq;
845 struct net_device *dev = NULL; 845 struct net_device *dev = NULL;
846 int midx;
846 847
847 if (sk->sk_type == SOCK_STREAM) 848 if (sk->sk_type == SOCK_STREAM)
848 goto e_inval; 849 goto e_inval;
@@ -887,11 +888,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
887 err = -EADDRNOTAVAIL; 888 err = -EADDRNOTAVAIL;
888 if (!dev) 889 if (!dev)
889 break; 890 break;
891
892 midx = l3mdev_master_ifindex(dev);
893
890 dev_put(dev); 894 dev_put(dev);
891 895
892 err = -EINVAL; 896 err = -EINVAL;
893 if (sk->sk_bound_dev_if && 897 if (sk->sk_bound_dev_if &&
894 mreq.imr_ifindex != sk->sk_bound_dev_if) 898 mreq.imr_ifindex != sk->sk_bound_dev_if &&
899 (!midx || midx != sk->sk_bound_dev_if))
895 break; 900 break;
896 901
897 inet->mc_index = mreq.imr_ifindex; 902 inet->mc_index = mreq.imr_ifindex;
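
The extra midx test above lets IP_MULTICAST_IF name an interface that is enslaved to the L3 master (VRF) device the socket is bound to, instead of rejecting anything other than the bound device itself. From userspace the option is set with a struct ip_mreqn carrying the interface index; a minimal example (the interface name is a placeholder):

    #define _GNU_SOURCE
    #include <net/if.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
        struct ip_mreqn mreq;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0) {
            perror("socket");
            return 1;
        }

        memset(&mreq, 0, sizeof(mreq));
        /* With the patched check, a socket bound to a VRF master may name
         * one of the VRF's slave interfaces here.
         */
        mreq.imr_ifindex = if_nametoindex("eth0");   /* placeholder name */

        if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF, &mreq, sizeof(mreq)) < 0)
            perror("setsockopt(IP_MULTICAST_IF)");

        close(fd);
        return 0;
    }
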
@@ -1238,7 +1243,14 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
1238 pktinfo->ipi_ifindex = 0; 1243 pktinfo->ipi_ifindex = 0;
1239 pktinfo->ipi_spec_dst.s_addr = 0; 1244 pktinfo->ipi_spec_dst.s_addr = 0;
1240 } 1245 }
1241 skb_dst_drop(skb); 1246 /* We need to keep the dst for __ip_options_echo()
1247 * We could restrict the test to opt.ts_needtime || opt.srr,
1248 * but the following is good enough as IP options are not often used.
1249 */
1250 if (unlikely(IPCB(skb)->opt.optlen))
1251 skb_dst_force(skb);
1252 else
1253 skb_dst_drop(skb);
1242} 1254}
1243 1255
1244int ip_setsockopt(struct sock *sk, int level, 1256int ip_setsockopt(struct sock *sk, int level,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 0fd1976ab63b..a31f47ccaad9 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -188,8 +188,8 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
188EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); 188EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
189 189
190/* Often modified stats are per cpu, other are shared (netdev->stats) */ 190/* Often modified stats are per cpu, other are shared (netdev->stats) */
191struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, 191void ip_tunnel_get_stats64(struct net_device *dev,
192 struct rtnl_link_stats64 *tot) 192 struct rtnl_link_stats64 *tot)
193{ 193{
194 int i; 194 int i;
195 195
@@ -214,8 +214,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
214 tot->rx_bytes += rx_bytes; 214 tot->rx_bytes += rx_bytes;
215 tot->tx_bytes += tx_bytes; 215 tot->tx_bytes += tx_bytes;
216 } 216 }
217
218 return tot;
219} 217}
220EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); 218EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
221 219
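
ip_tunnel_get_stats64() now fills the caller-supplied rtnl_link_stats64 and returns void, matching the updated ndo_get_stats64 signature; the body is still a plain sum of per-CPU counters. A simplified model of that aggregation, without the u64_stats seqcount retry loop the kernel needs on 32-bit:

    #include <stdint.h>
    #include <stdio.h>

    #define NCPUS 4

    struct pcpu_stats {
        uint64_t rx_packets;
        uint64_t tx_packets;
    };

    /* Fill the totals struct in place (void-returning style) by summing the
     * per-CPU counters.
     */
    static void get_stats64(const struct pcpu_stats *percpu, int ncpus,
                            struct pcpu_stats *tot)
    {
        int cpu;

        tot->rx_packets = 0;
        tot->tx_packets = 0;
        for (cpu = 0; cpu < ncpus; cpu++) {
            tot->rx_packets += percpu[cpu].rx_packets;
            tot->tx_packets += percpu[cpu].tx_packets;
        }
    }

    int main(void)
    {
        struct pcpu_stats percpu[NCPUS] = { {1, 2}, {3, 4}, {5, 6}, {7, 8} };
        struct pcpu_stats tot;

        get_stats64(percpu, NCPUS, &tot);
        printf("rx=%llu tx=%llu\n",
               (unsigned long long)tot.rx_packets,
               (unsigned long long)tot.tx_packets);
        return 0;
    }
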
@@ -228,7 +226,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
228 [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, 226 [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
229}; 227};
230 228
231static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, 229static int ip_tun_build_state(struct nlattr *attr,
232 unsigned int family, const void *cfg, 230 unsigned int family, const void *cfg,
233 struct lwtunnel_state **ts) 231 struct lwtunnel_state **ts)
234{ 232{
@@ -325,7 +323,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
325 [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, 323 [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
326}; 324};
327 325
328static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, 326static int ip6_tun_build_state(struct nlattr *attr,
329 unsigned int family, const void *cfg, 327 unsigned int family, const void *cfg,
330 struct lwtunnel_state **ts) 328 struct lwtunnel_state **ts)
331{ 329{
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index fd9f34bbd740..dfb2ab2dd3c8 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -306,7 +306,7 @@ static void __init ic_close_devs(void)
306 while ((d = next)) { 306 while ((d = next)) {
307 next = d->next; 307 next = d->next;
308 dev = d->dev; 308 dev = d->dev;
309 if ((!ic_dev || dev != ic_dev->dev) && !netdev_uses_dsa(dev)) { 309 if (d != ic_dev && !netdev_uses_dsa(dev)) {
310 pr_debug("IP-Config: Downing %s\n", dev->name); 310 pr_debug("IP-Config: Downing %s\n", dev->name);
311 dev_change_flags(dev, d->flags); 311 dev_change_flags(dev, d->flags);
312 } 312 }
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index efc1e76d4977..b036e85e093b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -299,10 +299,29 @@ static void __net_exit ipmr_rules_exit(struct net *net)
299} 299}
300#endif 300#endif
301 301
302static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
303 const void *ptr)
304{
305 const struct mfc_cache_cmp_arg *cmparg = arg->key;
306 struct mfc_cache *c = (struct mfc_cache *)ptr;
307
308 return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
309 cmparg->mfc_origin != c->mfc_origin;
310}
311
312static const struct rhashtable_params ipmr_rht_params = {
313 .head_offset = offsetof(struct mfc_cache, mnode),
314 .key_offset = offsetof(struct mfc_cache, cmparg),
315 .key_len = sizeof(struct mfc_cache_cmp_arg),
316 .nelem_hint = 3,
317 .locks_mul = 1,
318 .obj_cmpfn = ipmr_hash_cmp,
319 .automatic_shrinking = true,
320};
321
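
The rhltable above replaces the fixed MFC_LINES hash array: entries are keyed by the (mcastgrp, origin) pair and ipmr_hash_cmp() follows the rhashtable convention of returning zero on a match and nonzero otherwise. A plain-array model of the same keyed lookup (stand-in types and a linear search instead of a hash table):

    #include <stdint.h>
    #include <stdio.h>

    /* Key used by the multicast forwarding cache lookups in the patch. */
    struct mfc_key {
        uint32_t mcastgrp;
        uint32_t origin;
    };

    struct mfc_entry {
        struct mfc_key key;
        int            parent;    /* incoming vif */
    };

    /* rhashtable-style compare: 0 on match, nonzero on mismatch. */
    static int mfc_cmp(const struct mfc_key *a, const struct mfc_key *b)
    {
        return a->mcastgrp != b->mcastgrp || a->origin != b->origin;
    }

    static const struct mfc_entry *mfc_find(const struct mfc_entry *tbl, int n,
                                            const struct mfc_key *key)
    {
        int i;

        for (i = 0; i < n; i++)
            if (!mfc_cmp(&tbl[i].key, key))
                return &tbl[i];
        return NULL;
    }

    int main(void)
    {
        struct mfc_entry tbl[] = {
            { { 0xe0000001, 0x0a000001 }, 1 },
            { { 0xe0000002, 0x0a000002 }, 2 },
        };
        struct mfc_key key = { 0xe0000002, 0x0a000002 };
        const struct mfc_entry *e = mfc_find(tbl, 2, &key);

        printf("parent vif: %d\n", e ? e->parent : -1);
        return 0;
    }
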
302static struct mr_table *ipmr_new_table(struct net *net, u32 id) 322static struct mr_table *ipmr_new_table(struct net *net, u32 id)
303{ 323{
304 struct mr_table *mrt; 324 struct mr_table *mrt;
305 unsigned int i;
306 325
307 /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ 326 /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
308 if (id != RT_TABLE_DEFAULT && id >= 1000000000) 327 if (id != RT_TABLE_DEFAULT && id >= 1000000000)
@@ -318,10 +337,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
318 write_pnet(&mrt->net, net); 337 write_pnet(&mrt->net, net);
319 mrt->id = id; 338 mrt->id = id;
320 339
321 /* Forwarding cache */ 340 rhltable_init(&mrt->mfc_hash, &ipmr_rht_params);
322 for (i = 0; i < MFC_LINES; i++) 341 INIT_LIST_HEAD(&mrt->mfc_cache_list);
323 INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
324
325 INIT_LIST_HEAD(&mrt->mfc_unres_queue); 342 INIT_LIST_HEAD(&mrt->mfc_unres_queue);
326 343
327 setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, 344 setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
@@ -338,6 +355,7 @@ static void ipmr_free_table(struct mr_table *mrt)
338{ 355{
339 del_timer_sync(&mrt->ipmr_expire_timer); 356 del_timer_sync(&mrt->ipmr_expire_timer);
340 mroute_clean_tables(mrt, true); 357 mroute_clean_tables(mrt, true);
358 rhltable_destroy(&mrt->mfc_hash);
341 kfree(mrt); 359 kfree(mrt);
342} 360}
343 361
@@ -839,13 +857,17 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
839 __be32 origin, 857 __be32 origin,
840 __be32 mcastgrp) 858 __be32 mcastgrp)
841{ 859{
842 int line = MFC_HASH(mcastgrp, origin); 860 struct mfc_cache_cmp_arg arg = {
861 .mfc_mcastgrp = mcastgrp,
862 .mfc_origin = origin
863 };
864 struct rhlist_head *tmp, *list;
843 struct mfc_cache *c; 865 struct mfc_cache *c;
844 866
845 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) { 867 list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
846 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) 868 rhl_for_each_entry_rcu(c, tmp, list, mnode)
847 return c; 869 return c;
848 } 870
849 return NULL; 871 return NULL;
850} 872}
851 873
@@ -853,13 +875,16 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
853static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt, 875static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
854 int vifi) 876 int vifi)
855{ 877{
856 int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY)); 878 struct mfc_cache_cmp_arg arg = {
879 .mfc_mcastgrp = htonl(INADDR_ANY),
880 .mfc_origin = htonl(INADDR_ANY)
881 };
882 struct rhlist_head *tmp, *list;
857 struct mfc_cache *c; 883 struct mfc_cache *c;
858 884
859 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) 885 list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
860 if (c->mfc_origin == htonl(INADDR_ANY) && 886 rhl_for_each_entry_rcu(c, tmp, list, mnode)
861 c->mfc_mcastgrp == htonl(INADDR_ANY) && 887 if (c->mfc_un.res.ttls[vifi] < 255)
862 c->mfc_un.res.ttls[vifi] < 255)
863 return c; 888 return c;
864 889
865 return NULL; 890 return NULL;
@@ -869,29 +894,51 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
869static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, 894static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
870 __be32 mcastgrp, int vifi) 895 __be32 mcastgrp, int vifi)
871{ 896{
872 int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY)); 897 struct mfc_cache_cmp_arg arg = {
898 .mfc_mcastgrp = mcastgrp,
899 .mfc_origin = htonl(INADDR_ANY)
900 };
901 struct rhlist_head *tmp, *list;
873 struct mfc_cache *c, *proxy; 902 struct mfc_cache *c, *proxy;
874 903
875 if (mcastgrp == htonl(INADDR_ANY)) 904 if (mcastgrp == htonl(INADDR_ANY))
876 goto skip; 905 goto skip;
877 906
878 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) 907 list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
879 if (c->mfc_origin == htonl(INADDR_ANY) && 908 rhl_for_each_entry_rcu(c, tmp, list, mnode) {
880 c->mfc_mcastgrp == mcastgrp) { 909 if (c->mfc_un.res.ttls[vifi] < 255)
881 if (c->mfc_un.res.ttls[vifi] < 255) 910 return c;
882 return c; 911
883 912 /* It's ok if the vifi is part of the static tree */
884 /* It's ok if the vifi is part of the static tree */ 913 proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent);
885 proxy = ipmr_cache_find_any_parent(mrt, 914 if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
886 c->mfc_parent); 915 return c;
887 if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) 916 }
888 return c;
889 }
890 917
891skip: 918skip:
892 return ipmr_cache_find_any_parent(mrt, vifi); 919 return ipmr_cache_find_any_parent(mrt, vifi);
893} 920}
894 921
922/* Look for a (S,G,iif) entry if parent != -1 */
923static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
924 __be32 origin, __be32 mcastgrp,
925 int parent)
926{
927 struct mfc_cache_cmp_arg arg = {
928 .mfc_mcastgrp = mcastgrp,
929 .mfc_origin = origin,
930 };
931 struct rhlist_head *tmp, *list;
932 struct mfc_cache *c;
933
934 list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
935 rhl_for_each_entry_rcu(c, tmp, list, mnode)
936 if (parent == -1 || parent == c->mfc_parent)
937 return c;
938
939 return NULL;
940}
941
895/* Allocate a multicast cache entry */ 942/* Allocate a multicast cache entry */
896static struct mfc_cache *ipmr_cache_alloc(void) 943static struct mfc_cache *ipmr_cache_alloc(void)
897{ 944{
@@ -1028,10 +1075,10 @@ static int ipmr_cache_report(struct mr_table *mrt,
1028static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, 1075static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
1029 struct sk_buff *skb) 1076 struct sk_buff *skb)
1030{ 1077{
1078 const struct iphdr *iph = ip_hdr(skb);
1079 struct mfc_cache *c;
1031 bool found = false; 1080 bool found = false;
1032 int err; 1081 int err;
1033 struct mfc_cache *c;
1034 const struct iphdr *iph = ip_hdr(skb);
1035 1082
1036 spin_lock_bh(&mfc_unres_lock); 1083 spin_lock_bh(&mfc_unres_lock);
1037 list_for_each_entry(c, &mrt->mfc_unres_queue, list) { 1084 list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
@@ -1095,46 +1142,39 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
1095 1142
1096static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) 1143static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
1097{ 1144{
1098 int line; 1145 struct mfc_cache *c;
1099 struct mfc_cache *c, *next;
1100 1146
1101 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 1147 /* The entries are added/deleted only under RTNL */
1148 rcu_read_lock();
1149 c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
1150 mfc->mfcc_mcastgrp.s_addr, parent);
1151 rcu_read_unlock();
1152 if (!c)
1153 return -ENOENT;
1154 rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
1155 list_del_rcu(&c->list);
1156 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1157 ipmr_cache_free(c);
1102 1158
1103 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { 1159 return 0;
1104 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1105 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
1106 (parent == -1 || parent == c->mfc_parent)) {
1107 list_del_rcu(&c->list);
1108 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1109 ipmr_cache_free(c);
1110 return 0;
1111 }
1112 }
1113 return -ENOENT;
1114} 1160}
1115 1161
1116static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, 1162static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1117 struct mfcctl *mfc, int mrtsock, int parent) 1163 struct mfcctl *mfc, int mrtsock, int parent)
1118{ 1164{
1119 bool found = false;
1120 int line;
1121 struct mfc_cache *uc, *c; 1165 struct mfc_cache *uc, *c;
1166 bool found;
1167 int ret;
1122 1168
1123 if (mfc->mfcc_parent >= MAXVIFS) 1169 if (mfc->mfcc_parent >= MAXVIFS)
1124 return -ENFILE; 1170 return -ENFILE;
1125 1171
1126 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 1172 /* The entries are added/deleted only under RTNL */
1127 1173 rcu_read_lock();
1128 list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { 1174 c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
1129 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1175 mfc->mfcc_mcastgrp.s_addr, parent);
1130 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && 1176 rcu_read_unlock();
1131 (parent == -1 || parent == c->mfc_parent)) { 1177 if (c) {
1132 found = true;
1133 break;
1134 }
1135 }
1136
1137 if (found) {
1138 write_lock_bh(&mrt_lock); 1178 write_lock_bh(&mrt_lock);
1139 c->mfc_parent = mfc->mfcc_parent; 1179 c->mfc_parent = mfc->mfcc_parent;
1140 ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); 1180 ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
@@ -1160,8 +1200,14 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1160 if (!mrtsock) 1200 if (!mrtsock)
1161 c->mfc_flags |= MFC_STATIC; 1201 c->mfc_flags |= MFC_STATIC;
1162 1202
1163 list_add_rcu(&c->list, &mrt->mfc_cache_array[line]); 1203 ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
1164 1204 ipmr_rht_params);
1205 if (ret) {
1206 pr_err("ipmr: rhtable insert error %d\n", ret);
1207 ipmr_cache_free(c);
1208 return ret;
1209 }
1210 list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
1165 /* Check to see if we resolved a queued list. If so we 1211 /* Check to see if we resolved a queued list. If so we
1166 * need to send on the frames and tidy up. 1212 * need to send on the frames and tidy up.
1167 */ 1213 */
@@ -1191,9 +1237,9 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1191/* Close the multicast socket, and clear the vif tables etc */ 1237/* Close the multicast socket, and clear the vif tables etc */
1192static void mroute_clean_tables(struct mr_table *mrt, bool all) 1238static void mroute_clean_tables(struct mr_table *mrt, bool all)
1193{ 1239{
1194 int i; 1240 struct mfc_cache *c, *tmp;
1195 LIST_HEAD(list); 1241 LIST_HEAD(list);
1196 struct mfc_cache *c, *next; 1242 int i;
1197 1243
1198 /* Shut down all active vif entries */ 1244 /* Shut down all active vif entries */
1199 for (i = 0; i < mrt->maxvif; i++) { 1245 for (i = 0; i < mrt->maxvif; i++) {
@@ -1204,19 +1250,18 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
1204 unregister_netdevice_many(&list); 1250 unregister_netdevice_many(&list);
1205 1251
1206 /* Wipe the cache */ 1252 /* Wipe the cache */
1207 for (i = 0; i < MFC_LINES; i++) { 1253 list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
1208 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { 1254 if (!all && (c->mfc_flags & MFC_STATIC))
1209 if (!all && (c->mfc_flags & MFC_STATIC)) 1255 continue;
1210 continue; 1256 rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
1211 list_del_rcu(&c->list); 1257 list_del_rcu(&c->list);
1212 mroute_netlink_event(mrt, c, RTM_DELROUTE); 1258 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1213 ipmr_cache_free(c); 1259 ipmr_cache_free(c);
1214 }
1215 } 1260 }
1216 1261
1217 if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { 1262 if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
1218 spin_lock_bh(&mfc_unres_lock); 1263 spin_lock_bh(&mfc_unres_lock);
1219 list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { 1264 list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
1220 list_del(&c->list); 1265 list_del(&c->list);
1221 mroute_netlink_event(mrt, c, RTM_DELROUTE); 1266 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1222 ipmr_destroy_unres(mrt, c); 1267 ipmr_destroy_unres(mrt, c);
@@ -1233,7 +1278,7 @@ static void mrtsock_destruct(struct sock *sk)
1233 struct net *net = sock_net(sk); 1278 struct net *net = sock_net(sk);
1234 struct mr_table *mrt; 1279 struct mr_table *mrt;
1235 1280
1236 rtnl_lock(); 1281 ASSERT_RTNL();
1237 ipmr_for_each_table(mrt, net) { 1282 ipmr_for_each_table(mrt, net) {
1238 if (sk == rtnl_dereference(mrt->mroute_sk)) { 1283 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1239 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1284 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1244,7 +1289,6 @@ static void mrtsock_destruct(struct sock *sk)
1244 mroute_clean_tables(mrt, false); 1289 mroute_clean_tables(mrt, false);
1245 } 1290 }
1246 } 1291 }
1247 rtnl_unlock();
1248} 1292}
1249 1293
1250/* Socket options and virtual interface manipulation. The whole 1294/* Socket options and virtual interface manipulation. The whole
@@ -1308,13 +1352,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
1308 if (sk != rcu_access_pointer(mrt->mroute_sk)) { 1352 if (sk != rcu_access_pointer(mrt->mroute_sk)) {
1309 ret = -EACCES; 1353 ret = -EACCES;
1310 } else { 1354 } else {
1311 /* We need to unlock here because mrtsock_destruct takes
1312 * care of rtnl itself and we can't change that due to
1313 * the IP_ROUTER_ALERT setsockopt which runs without it.
1314 */
1315 rtnl_unlock();
1316 ret = ip_ra_control(sk, 0, NULL); 1355 ret = ip_ra_control(sk, 0, NULL);
1317 goto out; 1356 goto out_unlock;
1318 } 1357 }
1319 break; 1358 break;
1320 case MRT_ADD_VIF: 1359 case MRT_ADD_VIF:
@@ -1425,7 +1464,6 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
1425 } 1464 }
1426out_unlock: 1465out_unlock:
1427 rtnl_unlock(); 1466 rtnl_unlock();
1428out:
1429 return ret; 1467 return ret;
1430} 1468}
1431 1469
@@ -1791,9 +1829,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
1791 struct sk_buff *skb, struct mfc_cache *cache, 1829 struct sk_buff *skb, struct mfc_cache *cache,
1792 int local) 1830 int local)
1793{ 1831{
1832 int true_vifi = ipmr_find_vif(mrt, skb->dev);
1794 int psend = -1; 1833 int psend = -1;
1795 int vif, ct; 1834 int vif, ct;
1796 int true_vifi = ipmr_find_vif(mrt, skb->dev);
1797 1835
1798 vif = cache->mfc_parent; 1836 vif = cache->mfc_parent;
1799 cache->mfc_un.res.pkt++; 1837 cache->mfc_un.res.pkt++;
@@ -2091,8 +2129,10 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2091 int ct; 2129 int ct;
2092 2130
2093 /* If cache is unresolved, don't try to parse IIF and OIF */ 2131 /* If cache is unresolved, don't try to parse IIF and OIF */
2094 if (c->mfc_parent >= MAXVIFS) 2132 if (c->mfc_parent >= MAXVIFS) {
2133 rtm->rtm_flags |= RTNH_F_UNRESOLVED;
2095 return -ENOENT; 2134 return -ENOENT;
2135 }
2096 2136
2097 if (VIF_EXISTS(mrt, c->mfc_parent) && 2137 if (VIF_EXISTS(mrt, c->mfc_parent) &&
2098 nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) 2138 nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
@@ -2134,7 +2174,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2134 2174
2135int ipmr_get_route(struct net *net, struct sk_buff *skb, 2175int ipmr_get_route(struct net *net, struct sk_buff *skb,
2136 __be32 saddr, __be32 daddr, 2176 __be32 saddr, __be32 daddr,
2137 struct rtmsg *rtm, int nowait, u32 portid) 2177 struct rtmsg *rtm, u32 portid)
2138{ 2178{
2139 struct mfc_cache *cache; 2179 struct mfc_cache *cache;
2140 struct mr_table *mrt; 2180 struct mr_table *mrt;
@@ -2158,11 +2198,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
2158 struct net_device *dev; 2198 struct net_device *dev;
2159 int vif = -1; 2199 int vif = -1;
2160 2200
2161 if (nowait) {
2162 rcu_read_unlock();
2163 return -EAGAIN;
2164 }
2165
2166 dev = skb->dev; 2201 dev = skb->dev;
2167 read_lock(&mrt_lock); 2202 read_lock(&mrt_lock);
2168 if (dev) 2203 if (dev)
@@ -2296,34 +2331,30 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2296 struct mr_table *mrt; 2331 struct mr_table *mrt;
2297 struct mfc_cache *mfc; 2332 struct mfc_cache *mfc;
2298 unsigned int t = 0, s_t; 2333 unsigned int t = 0, s_t;
2299 unsigned int h = 0, s_h;
2300 unsigned int e = 0, s_e; 2334 unsigned int e = 0, s_e;
2301 2335
2302 s_t = cb->args[0]; 2336 s_t = cb->args[0];
2303 s_h = cb->args[1]; 2337 s_e = cb->args[1];
2304 s_e = cb->args[2];
2305 2338
2306 rcu_read_lock(); 2339 rcu_read_lock();
2307 ipmr_for_each_table(mrt, net) { 2340 ipmr_for_each_table(mrt, net) {
2308 if (t < s_t) 2341 if (t < s_t)
2309 goto next_table; 2342 goto next_table;
2310 if (t > s_t) 2343 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
2311 s_h = 0; 2344 if (e < s_e)
2312 for (h = s_h; h < MFC_LINES; h++) { 2345 goto next_entry;
2313 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) { 2346 if (ipmr_fill_mroute(mrt, skb,
2314 if (e < s_e) 2347 NETLINK_CB(cb->skb).portid,
2315 goto next_entry; 2348 cb->nlh->nlmsg_seq,
2316 if (ipmr_fill_mroute(mrt, skb, 2349 mfc, RTM_NEWROUTE,
2317 NETLINK_CB(cb->skb).portid, 2350 NLM_F_MULTI) < 0)
2318 cb->nlh->nlmsg_seq, 2351 goto done;
2319 mfc, RTM_NEWROUTE,
2320 NLM_F_MULTI) < 0)
2321 goto done;
2322next_entry: 2352next_entry:
2323 e++; 2353 e++;
2324 }
2325 e = s_e = 0;
2326 } 2354 }
2355 e = 0;
2356 s_e = 0;
2357
2327 spin_lock_bh(&mfc_unres_lock); 2358 spin_lock_bh(&mfc_unres_lock);
2328 list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { 2359 list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
2329 if (e < s_e) 2360 if (e < s_e)
@@ -2340,16 +2371,15 @@ next_entry2:
2340 e++; 2371 e++;
2341 } 2372 }
2342 spin_unlock_bh(&mfc_unres_lock); 2373 spin_unlock_bh(&mfc_unres_lock);
2343 e = s_e = 0; 2374 e = 0;
2344 s_h = 0; 2375 s_e = 0;
2345next_table: 2376next_table:
2346 t++; 2377 t++;
2347 } 2378 }
2348done: 2379done:
2349 rcu_read_unlock(); 2380 rcu_read_unlock();
2350 2381
2351 cb->args[2] = e; 2382 cb->args[1] = e;
2352 cb->args[1] = h;
2353 cb->args[0] = t; 2383 cb->args[0] = t;
2354 2384
2355 return skb->len; 2385 return skb->len;
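
With a single flat mfc_cache_list the netlink dump no longer needs the per-bucket index, so the resume cursor shrinks to a table counter in cb->args[0] and an entry counter in cb->args[1]. The pattern is ordinary resumable iteration; a small sketch with a saved cursor:

    #include <stdio.h>

    #define BATCH 3

    /* Emit up to BATCH entries starting at *cursor and remember where we
     * stopped, so the next call resumes there -- the same idea as the
     * cb->args cursor in the netlink dump, reduced to one flat list.
     */
    static int dump_some(const int *entries, int n, int *cursor)
    {
        int emitted = 0;

        while (*cursor < n && emitted < BATCH) {
            printf("entry %d\n", entries[*cursor]);
            (*cursor)++;
            emitted++;
        }
        return *cursor < n;      /* nonzero: call again to continue */
    }

    int main(void)
    {
        int entries[] = { 10, 11, 12, 13, 14, 15, 16 };
        int cursor = 0;

        while (dump_some(entries, 7, &cursor))
            ;
        return 0;
    }
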
@@ -2559,7 +2589,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
2559 const char *name = vif->dev ? vif->dev->name : "none"; 2589 const char *name = vif->dev ? vif->dev->name : "none";
2560 2590
2561 seq_printf(seq, 2591 seq_printf(seq,
2562 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", 2592 "%2zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
2563 vif - mrt->vif_table, 2593 vif - mrt->vif_table,
2564 name, vif->bytes_in, vif->pkt_in, 2594 name, vif->bytes_in, vif->pkt_in,
2565 vif->bytes_out, vif->pkt_out, 2595 vif->bytes_out, vif->pkt_out,
@@ -2593,10 +2623,8 @@ struct ipmr_mfc_iter {
2593 struct seq_net_private p; 2623 struct seq_net_private p;
2594 struct mr_table *mrt; 2624 struct mr_table *mrt;
2595 struct list_head *cache; 2625 struct list_head *cache;
2596 int ct;
2597}; 2626};
2598 2627
2599
2600static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, 2628static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2601 struct ipmr_mfc_iter *it, loff_t pos) 2629 struct ipmr_mfc_iter *it, loff_t pos)
2602{ 2630{
@@ -2604,12 +2632,10 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2604 struct mfc_cache *mfc; 2632 struct mfc_cache *mfc;
2605 2633
2606 rcu_read_lock(); 2634 rcu_read_lock();
2607 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { 2635 it->cache = &mrt->mfc_cache_list;
2608 it->cache = &mrt->mfc_cache_array[it->ct]; 2636 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
2609 list_for_each_entry_rcu(mfc, it->cache, list) 2637 if (pos-- == 0)
2610 if (pos-- == 0) 2638 return mfc;
2611 return mfc;
2612 }
2613 rcu_read_unlock(); 2639 rcu_read_unlock();
2614 2640
2615 spin_lock_bh(&mfc_unres_lock); 2641 spin_lock_bh(&mfc_unres_lock);
@@ -2636,17 +2662,16 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2636 2662
2637 it->mrt = mrt; 2663 it->mrt = mrt;
2638 it->cache = NULL; 2664 it->cache = NULL;
2639 it->ct = 0;
2640 return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) 2665 return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
2641 : SEQ_START_TOKEN; 2666 : SEQ_START_TOKEN;
2642} 2667}
2643 2668
2644static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2669static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2645{ 2670{
2646 struct mfc_cache *mfc = v;
2647 struct ipmr_mfc_iter *it = seq->private; 2671 struct ipmr_mfc_iter *it = seq->private;
2648 struct net *net = seq_file_net(seq); 2672 struct net *net = seq_file_net(seq);
2649 struct mr_table *mrt = it->mrt; 2673 struct mr_table *mrt = it->mrt;
2674 struct mfc_cache *mfc = v;
2650 2675
2651 ++*pos; 2676 ++*pos;
2652 2677
@@ -2659,19 +2684,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2659 if (it->cache == &mrt->mfc_unres_queue) 2684 if (it->cache == &mrt->mfc_unres_queue)
2660 goto end_of_list; 2685 goto end_of_list;
2661 2686
2662 BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
2663
2664 while (++it->ct < MFC_LINES) {
2665 it->cache = &mrt->mfc_cache_array[it->ct];
2666 if (list_empty(it->cache))
2667 continue;
2668 return list_first_entry(it->cache, struct mfc_cache, list);
2669 }
2670
2671 /* exhausted cache_array, show unresolved */ 2687 /* exhausted cache_array, show unresolved */
2672 rcu_read_unlock(); 2688 rcu_read_unlock();
2673 it->cache = &mrt->mfc_unres_queue; 2689 it->cache = &mrt->mfc_unres_queue;
2674 it->ct = 0;
2675 2690
2676 spin_lock_bh(&mfc_unres_lock); 2691 spin_lock_bh(&mfc_unres_lock);
2677 if (!list_empty(it->cache)) 2692 if (!list_empty(it->cache))
@@ -2691,7 +2706,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2691 2706
2692 if (it->cache == &mrt->mfc_unres_queue) 2707 if (it->cache == &mrt->mfc_unres_queue)
2693 spin_unlock_bh(&mfc_unres_lock); 2708 spin_unlock_bh(&mfc_unres_lock);
2694 else if (it->cache == &mrt->mfc_cache_array[it->ct]) 2709 else if (it->cache == &mrt->mfc_cache_list)
2695 rcu_read_unlock(); 2710 rcu_read_unlock();
2696} 2711}
2697 2712
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index b3cc1335adbc..c0cc6aa8cfaa 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -23,7 +23,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
23 struct rtable *rt; 23 struct rtable *rt;
24 struct flowi4 fl4 = {}; 24 struct flowi4 fl4 = {};
25 __be32 saddr = iph->saddr; 25 __be32 saddr = iph->saddr;
26 __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; 26 const struct sock *sk = skb_to_full_sk(skb);
27 __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0;
27 struct net_device *dev = skb_dst(skb)->dev; 28 struct net_device *dev = skb_dst(skb)->dev;
28 unsigned int hh_len; 29 unsigned int hh_len;
29 30
@@ -40,7 +41,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
40 fl4.daddr = iph->daddr; 41 fl4.daddr = iph->daddr;
41 fl4.saddr = saddr; 42 fl4.saddr = saddr;
42 fl4.flowi4_tos = RT_TOS(iph->tos); 43 fl4.flowi4_tos = RT_TOS(iph->tos);
43 fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; 44 fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
44 if (!fl4.flowi4_oif) 45 if (!fl4.flowi4_oif)
45 fl4.flowi4_oif = l3mdev_master_ifindex(dev); 46 fl4.flowi4_oif = l3mdev_master_ifindex(dev);
46 fl4.flowi4_mark = skb->mark; 47 fl4.flowi4_mark = skb->mark;
@@ -61,7 +62,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
61 xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { 62 xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
62 struct dst_entry *dst = skb_dst(skb); 63 struct dst_entry *dst = skb_dst(skb);
63 skb_dst_set(skb, NULL); 64 skb_dst_set(skb, NULL);
64 dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); 65 dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0);
65 if (IS_ERR(dst)) 66 if (IS_ERR(dst))
66 return PTR_ERR(dst); 67 return PTR_ERR(dst);
67 skb_dst_set(skb, dst); 68 skb_dst_set(skb, dst);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index a467e1236c43..6241a81fd7f5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -677,11 +677,6 @@ static int copy_entries_to_user(unsigned int total_size,
677 return PTR_ERR(counters); 677 return PTR_ERR(counters);
678 678
679 loc_cpu_entry = private->entries; 679 loc_cpu_entry = private->entries;
680 /* ... then copy entire thing ... */
681 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
682 ret = -EFAULT;
683 goto free_counters;
684 }
685 680
686 /* FIXME: use iterator macros --RR */ 681 /* FIXME: use iterator macros --RR */
687 /* ... then go back and fix counters and names */ 682 /* ... then go back and fix counters and names */
@@ -689,6 +684,10 @@ static int copy_entries_to_user(unsigned int total_size,
689 const struct xt_entry_target *t; 684 const struct xt_entry_target *t;
690 685
691 e = (struct arpt_entry *)(loc_cpu_entry + off); 686 e = (struct arpt_entry *)(loc_cpu_entry + off);
687 if (copy_to_user(userptr + off, e, sizeof(*e))) {
688 ret = -EFAULT;
689 goto free_counters;
690 }
692 if (copy_to_user(userptr + off 691 if (copy_to_user(userptr + off
693 + offsetof(struct arpt_entry, counters), 692 + offsetof(struct arpt_entry, counters),
694 &counters[num], 693 &counters[num],
@@ -698,11 +697,7 @@ static int copy_entries_to_user(unsigned int total_size,
698 } 697 }
699 698
700 t = arpt_get_target_c(e); 699 t = arpt_get_target_c(e);
701 if (copy_to_user(userptr + off + e->target_offset 700 if (xt_target_to_user(t, userptr + off + e->target_offset)) {
702 + offsetof(struct xt_entry_target,
703 u.user.name),
704 t->u.kernel.target->name,
705 strlen(t->u.kernel.target->name)+1) != 0) {
706 ret = -EFAULT; 701 ret = -EFAULT;
707 goto free_counters; 702 goto free_counters;
708 } 703 }
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 91656a1d8fbd..384b85713e06 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -826,10 +826,6 @@ copy_entries_to_user(unsigned int total_size,
826 return PTR_ERR(counters); 826 return PTR_ERR(counters);
827 827
828 loc_cpu_entry = private->entries; 828 loc_cpu_entry = private->entries;
829 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
830 ret = -EFAULT;
831 goto free_counters;
832 }
833 829
834 /* FIXME: use iterator macros --RR */ 830 /* FIXME: use iterator macros --RR */
835 /* ... then go back and fix counters and names */ 831 /* ... then go back and fix counters and names */
@@ -839,6 +835,10 @@ copy_entries_to_user(unsigned int total_size,
839 const struct xt_entry_target *t; 835 const struct xt_entry_target *t;
840 836
841 e = (struct ipt_entry *)(loc_cpu_entry + off); 837 e = (struct ipt_entry *)(loc_cpu_entry + off);
838 if (copy_to_user(userptr + off, e, sizeof(*e))) {
839 ret = -EFAULT;
840 goto free_counters;
841 }
842 if (copy_to_user(userptr + off 842 if (copy_to_user(userptr + off
843 + offsetof(struct ipt_entry, counters), 843 + offsetof(struct ipt_entry, counters),
844 &counters[num], 844 &counters[num],
@@ -852,23 +852,14 @@ copy_entries_to_user(unsigned int total_size,
852 i += m->u.match_size) { 852 i += m->u.match_size) {
853 m = (void *)e + i; 853 m = (void *)e + i;
854 854
855 if (copy_to_user(userptr + off + i 855 if (xt_match_to_user(m, userptr + off + i)) {
856 + offsetof(struct xt_entry_match,
857 u.user.name),
858 m->u.kernel.match->name,
859 strlen(m->u.kernel.match->name)+1)
860 != 0) {
861 ret = -EFAULT; 856 ret = -EFAULT;
862 goto free_counters; 857 goto free_counters;
863 } 858 }
864 } 859 }
865 860
866 t = ipt_get_target_c(e); 861 t = ipt_get_target_c(e);
867 if (copy_to_user(userptr + off + e->target_offset 862 if (xt_target_to_user(t, userptr + off + e->target_offset)) {
868 + offsetof(struct xt_entry_target,
869 u.user.name),
870 t->u.kernel.target->name,
871 strlen(t->u.kernel.target->name)+1) != 0) {
872 ret = -EFAULT; 863 ret = -EFAULT;
873 goto free_counters; 864 goto free_counters;
874 } 865 }
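
Both copy_entries_to_user() variants stop copying the whole rule blob in one pass; each entry is copied separately and the match/target payloads go through xt_match_to_user()/xt_target_to_user(), which expose only the user-visible part of each extension (bounded by the new .usersize fields) rather than the kernel-resolved pointers stored in the same slots. A userspace sketch of that bounded copy-out idea over a plain struct (not the xtables helpers themselves):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* A rule extension as the kernel sees it: the tail holds kernel-only
     * state (here a pointer) that must never reach userspace.
     */
    struct ext {
        char     name[16];
        uint32_t flags;
        void    *kernel_only;    /* e.g. a resolved ops pointer */
    };

    /* Copy only the first 'usersize' bytes and zero-fill the remainder of
     * the user-visible slot, mimicking a .usersize-aware copy-out.
     */
    static void copy_ext_to_user(char *dst, const struct ext *src, size_t usersize)
    {
        memcpy(dst, src, usersize);
        memset(dst + usersize, 0, sizeof(*src) - usersize);
    }

    int main(void)
    {
        struct ext e = { "demo", 0x1, (void *)&e };
        char out[sizeof(e)];
        int i, clean = 1;

        copy_ext_to_user(out, &e, offsetof(struct ext, kernel_only));
        for (i = offsetof(struct ext, kernel_only); i < (int)sizeof(e); i++)
            if (out[i])
                clean = 0;
        printf("kernel-only bytes zeroed: %s\n", clean ? "yes" : "no");
        return 0;
    }
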
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0a783cd73faf..9b8841316e7b 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -461,7 +461,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
461 461
462 clusterip_config_put(cipinfo->config); 462 clusterip_config_put(cipinfo->config);
463 463
464 nf_ct_netns_get(par->net, par->family); 464 nf_ct_netns_put(par->net, par->family);
465} 465}
466 466
467#ifdef CONFIG_COMPAT 467#ifdef CONFIG_COMPAT
@@ -485,6 +485,7 @@ static struct xt_target clusterip_tg_reg __read_mostly = {
485 .checkentry = clusterip_tg_check, 485 .checkentry = clusterip_tg_check,
486 .destroy = clusterip_tg_destroy, 486 .destroy = clusterip_tg_destroy,
487 .targetsize = sizeof(struct ipt_clusterip_tgt_info), 487 .targetsize = sizeof(struct ipt_clusterip_tgt_info),
488 .usersize = offsetof(struct ipt_clusterip_tgt_info, config),
488#ifdef CONFIG_COMPAT 489#ifdef CONFIG_COMPAT
489 .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), 490 .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
490#endif /* CONFIG_COMPAT */ 491#endif /* CONFIG_COMPAT */
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 30c0de53e254..3240a2614e82 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -57,8 +57,7 @@ synproxy_send_tcp(struct net *net,
57 goto free_nskb; 57 goto free_nskb;
58 58
59 if (nfct) { 59 if (nfct) {
60 nskb->nfct = nfct; 60 nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
61 nskb->nfctinfo = ctinfo;
62 nf_conntrack_get(nfct); 61 nf_conntrack_get(nfct);
63 } 62 }
64 63
@@ -107,8 +106,8 @@ synproxy_send_client_synack(struct net *net,
107 106
108 synproxy_build_options(nth, opts); 107 synproxy_build_options(nth, opts);
109 108
110 synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, 109 synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
111 niph, nth, tcp_hdr_size); 110 IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
112} 111}
113 112
114static void 113static void
@@ -230,8 +229,8 @@ synproxy_send_client_ack(struct net *net,
230 229
231 synproxy_build_options(nth, opts); 230 synproxy_build_options(nth, opts);
232 231
233 synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, 232 synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
234 niph, nth, tcp_hdr_size); 233 IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
235} 234}
236 235
237static bool 236static bool
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index fcfd071f4705..2e14ed11a35c 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -165,6 +165,10 @@ static unsigned int ipv4_conntrack_local(void *priv,
165 if (skb->len < sizeof(struct iphdr) || 165 if (skb->len < sizeof(struct iphdr) ||
166 ip_hdrlen(skb) < sizeof(struct iphdr)) 166 ip_hdrlen(skb) < sizeof(struct iphdr))
167 return NF_ACCEPT; 167 return NF_ACCEPT;
168
169 if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */
170 return NF_ACCEPT;
171
168 return nf_conntrack_in(state->net, PF_INET, state->hook, skb); 172 return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
169} 173}
170 174
@@ -235,7 +239,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
235 } 239 }
236 240
237 if ((unsigned int) *len < sizeof(struct sockaddr_in)) { 241 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
238 pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n", 242 pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
239 *len, sizeof(struct sockaddr_in)); 243 *len, sizeof(struct sockaddr_in));
240 return -EINVAL; 244 return -EINVAL;
241 } 245 }
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index d075b3cf2400..73c591d8a9a8 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -128,16 +128,16 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
128/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ 128/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
129static int 129static int
130icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, 130icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
131 enum ip_conntrack_info *ctinfo,
132 unsigned int hooknum) 131 unsigned int hooknum)
133{ 132{
134 struct nf_conntrack_tuple innertuple, origtuple; 133 struct nf_conntrack_tuple innertuple, origtuple;
135 const struct nf_conntrack_l4proto *innerproto; 134 const struct nf_conntrack_l4proto *innerproto;
136 const struct nf_conntrack_tuple_hash *h; 135 const struct nf_conntrack_tuple_hash *h;
137 const struct nf_conntrack_zone *zone; 136 const struct nf_conntrack_zone *zone;
137 enum ip_conntrack_info ctinfo;
138 struct nf_conntrack_zone tmp; 138 struct nf_conntrack_zone tmp;
139 139
140 NF_CT_ASSERT(skb->nfct == NULL); 140 NF_CT_ASSERT(!skb_nfct(skb));
141 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 141 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
142 142
143 /* Are they talking about one of our connections? */ 143 /* Are they talking about one of our connections? */
@@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
160 return -NF_ACCEPT; 160 return -NF_ACCEPT;
161 } 161 }
162 162
163 *ctinfo = IP_CT_RELATED; 163 ctinfo = IP_CT_RELATED;
164 164
165 h = nf_conntrack_find_get(net, zone, &innertuple); 165 h = nf_conntrack_find_get(net, zone, &innertuple);
166 if (!h) { 166 if (!h) {
@@ -169,11 +169,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
169 } 169 }
170 170
171 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) 171 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
172 *ctinfo += IP_CT_IS_REPLY; 172 ctinfo += IP_CT_IS_REPLY;
173 173
174 /* Update skb to refer to this connection */ 174 /* Update skb to refer to this connection */
175 skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; 175 nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
176 skb->nfctinfo = *ctinfo;
177 return NF_ACCEPT; 176 return NF_ACCEPT;
178} 177}
179 178
@@ -181,7 +180,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
181static int 180static int
182icmp_error(struct net *net, struct nf_conn *tmpl, 181icmp_error(struct net *net, struct nf_conn *tmpl,
183 struct sk_buff *skb, unsigned int dataoff, 182 struct sk_buff *skb, unsigned int dataoff,
184 enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) 183 u8 pf, unsigned int hooknum)
185{ 184{
186 const struct icmphdr *icmph; 185 const struct icmphdr *icmph;
187 struct icmphdr _ih; 186 struct icmphdr _ih;
@@ -225,7 +224,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
225 icmph->type != ICMP_REDIRECT) 224 icmph->type != ICMP_REDIRECT)
226 return NF_ACCEPT; 225 return NF_ACCEPT;
227 226
228 return icmp_error_message(net, tmpl, skb, ctinfo, hooknum); 227 return icmp_error_message(net, tmpl, skb, hooknum);
229} 228}
230 229
231#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 230#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 49bd6a54404f..346bf7ccac08 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
45{ 45{
46 u16 zone_id = NF_CT_DEFAULT_ZONE_ID; 46 u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
47#if IS_ENABLED(CONFIG_NF_CONNTRACK) 47#if IS_ENABLED(CONFIG_NF_CONNTRACK)
48 if (skb->nfct) { 48 if (skb_nfct(skb)) {
49 enum ip_conntrack_info ctinfo; 49 enum ip_conntrack_info ctinfo;
50 const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 50 const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
51 51
@@ -75,7 +75,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
75#if !IS_ENABLED(CONFIG_NF_NAT) 75#if !IS_ENABLED(CONFIG_NF_NAT)
76 /* Previously seen (loopback)? Ignore. Do this before 76 /* Previously seen (loopback)? Ignore. Do this before
77 fragment check. */ 77 fragment check. */
78 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) 78 if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
79 return NF_ACCEPT; 79 return NF_ACCEPT;
80#endif 80#endif
81#endif 81#endif
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index cf986e1c7bbd..f0dbff05fc28 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -68,10 +68,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
68 68
69#if IS_ENABLED(CONFIG_NF_CONNTRACK) 69#if IS_ENABLED(CONFIG_NF_CONNTRACK)
70 /* Avoid counting cloned packets towards the original connection. */ 70 /* Avoid counting cloned packets towards the original connection. */
71 nf_conntrack_put(skb->nfct); 71 nf_reset(skb);
72 skb->nfct = &nf_ct_untracked_get()->ct_general; 72 nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
73 skb->nfctinfo = IP_CT_NEW; 73 nf_conntrack_get(skb_nfct(skb));
74 nf_conntrack_get(skb->nfct);
75#endif 74#endif
76 /* 75 /*
77 * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential 76 * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index b24795e2ee6d..2f3895ddc275 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -69,7 +69,7 @@ static void dump_arp_packet(struct nf_log_buf *m,
69 69
70 ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); 70 ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
71 if (ap == NULL) { 71 if (ap == NULL) {
72 nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]", 72 nf_log_buf_add(m, " INCOMPLETE [%zu bytes]",
73 skb->len - sizeof(_arph)); 73 skb->len - sizeof(_arph));
74 return; 74 return;
75 } 75 }
@@ -87,7 +87,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
87 struct nf_log_buf *m; 87 struct nf_log_buf *m;
88 88
89 /* FIXME: Disabled from containers until syslog ns is supported */ 89 /* FIXME: Disabled from containers until syslog ns is supported */
90 if (!net_eq(net, &init_net)) 90 if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
91 return; 91 return;
92 92
93 m = nf_log_buf_open(); 93 m = nf_log_buf_open();
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 856648966f4c..c83a9963269b 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -319,7 +319,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
319 struct nf_log_buf *m; 319 struct nf_log_buf *m;
320 320
321 /* FIXME: Disabled from containers until syslog ns is supported */ 321 /* FIXME: Disabled from containers until syslog ns is supported */
322 if (!net_eq(net, &init_net)) 322 if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
323 return; 323 return;
324 324
325 m = nf_log_buf_open(); 325 m = nf_log_buf_open();
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index f8aad03d674b..6f5e8d01b876 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -255,11 +255,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
255 /* maniptype == SRC for postrouting. */ 255 /* maniptype == SRC for postrouting. */
256 enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); 256 enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
257 257
258 /* We never see fragments: conntrack defrags on pre-routing
259 * and local-out, and nf_nat_out protects post-routing.
260 */
261 NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
262
263 ct = nf_ct_get(skb, &ctinfo); 258 ct = nf_ct_get(skb, &ctinfo);
264 /* Can't track? It's not due to stress, or conntrack would 259 /* Can't track? It's not due to stress, or conntrack would
265 * have dropped it. Hence it's the user's responsibilty to 260 * have dropped it. Hence it's the user's responsibilty to
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index c9b52c361da2..53e49f5011d3 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1260,16 +1260,6 @@ static const struct nf_conntrack_expect_policy snmp_exp_policy = {
1260 .timeout = 180, 1260 .timeout = 180,
1261}; 1261};
1262 1262
1263static struct nf_conntrack_helper snmp_helper __read_mostly = {
1264 .me = THIS_MODULE,
1265 .help = help,
1266 .expect_policy = &snmp_exp_policy,
1267 .name = "snmp",
1268 .tuple.src.l3num = AF_INET,
1269 .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
1270 .tuple.dst.protonum = IPPROTO_UDP,
1271};
1272
1273static struct nf_conntrack_helper snmp_trap_helper __read_mostly = { 1263static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
1274 .me = THIS_MODULE, 1264 .me = THIS_MODULE,
1275 .help = help, 1265 .help = help,
@@ -1288,22 +1278,16 @@ static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
1288 1278
1289static int __init nf_nat_snmp_basic_init(void) 1279static int __init nf_nat_snmp_basic_init(void)
1290{ 1280{
1291 int ret = 0;
1292
1293 BUG_ON(nf_nat_snmp_hook != NULL); 1281 BUG_ON(nf_nat_snmp_hook != NULL);
1294 RCU_INIT_POINTER(nf_nat_snmp_hook, help); 1282 RCU_INIT_POINTER(nf_nat_snmp_hook, help);
1295 1283
1296 ret = nf_conntrack_helper_register(&snmp_trap_helper); 1284 return nf_conntrack_helper_register(&snmp_trap_helper);
1297 if (ret < 0) {
1298 nf_conntrack_helper_unregister(&snmp_helper);
1299 return ret;
1300 }
1301 return ret;
1302} 1285}
1303 1286
1304static void __exit nf_nat_snmp_basic_fini(void) 1287static void __exit nf_nat_snmp_basic_fini(void)
1305{ 1288{
1306 RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); 1289 RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
1290 synchronize_rcu();
1307 nf_conntrack_helper_unregister(&snmp_trap_helper); 1291 nf_conntrack_helper_unregister(&snmp_trap_helper);
1308} 1292}
1309 1293
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index a0ea8aad1bf1..f18677277119 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -26,10 +26,10 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
26 memset(&range, 0, sizeof(range)); 26 memset(&range, 0, sizeof(range));
27 range.flags = priv->flags; 27 range.flags = priv->flags;
28 if (priv->sreg_proto_min) { 28 if (priv->sreg_proto_min) {
29 range.min_proto.all = 29 range.min_proto.all = (__force __be16)nft_reg_load16(
30 *(__be16 *)&regs->data[priv->sreg_proto_min]; 30 &regs->data[priv->sreg_proto_min]);
31 range.max_proto.all = 31 range.max_proto.all = (__force __be16)nft_reg_load16(
32 *(__be16 *)&regs->data[priv->sreg_proto_max]; 32 &regs->data[priv->sreg_proto_max]);
33 } 33 }
34 regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt), 34 regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt),
35 &range, nft_out(pkt)); 35 &range, nft_out(pkt));
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
index 1650ed23c15d..5120be1d3118 100644
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -26,10 +26,10 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr,
26 26
27 memset(&mr, 0, sizeof(mr)); 27 memset(&mr, 0, sizeof(mr));
28 if (priv->sreg_proto_min) { 28 if (priv->sreg_proto_min) {
29 mr.range[0].min.all = 29 mr.range[0].min.all = (__force __be16)nft_reg_load16(
30 *(__be16 *)&regs->data[priv->sreg_proto_min]; 30 &regs->data[priv->sreg_proto_min]);
31 mr.range[0].max.all = 31 mr.range[0].max.all = (__force __be16)nft_reg_load16(
32 *(__be16 *)&regs->data[priv->sreg_proto_max]; 32 &regs->data[priv->sreg_proto_max]);
33 mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 33 mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
34 } 34 }
35 35
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 86cca610f4c2..ccfbce13a633 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -156,17 +156,18 @@ int ping_hash(struct sock *sk)
156void ping_unhash(struct sock *sk) 156void ping_unhash(struct sock *sk)
157{ 157{
158 struct inet_sock *isk = inet_sk(sk); 158 struct inet_sock *isk = inet_sk(sk);
159
159 pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); 160 pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
161 write_lock_bh(&ping_table.lock);
160 if (sk_hashed(sk)) { 162 if (sk_hashed(sk)) {
161 write_lock_bh(&ping_table.lock);
162 hlist_nulls_del(&sk->sk_nulls_node); 163 hlist_nulls_del(&sk->sk_nulls_node);
163 sk_nulls_node_init(&sk->sk_nulls_node); 164 sk_nulls_node_init(&sk->sk_nulls_node);
164 sock_put(sk); 165 sock_put(sk);
165 isk->inet_num = 0; 166 isk->inet_num = 0;
166 isk->inet_sport = 0; 167 isk->inet_sport = 0;
167 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 168 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
168 write_unlock_bh(&ping_table.lock);
169 } 169 }
170 write_unlock_bh(&ping_table.lock);
170} 171}
171EXPORT_SYMBOL_GPL(ping_unhash); 172EXPORT_SYMBOL_GPL(ping_unhash);
172 173
@@ -433,9 +434,9 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
433 goto out; 434 goto out;
434 } 435 }
435 436
436 pr_debug("after bind(): num = %d, dif = %d\n", 437 pr_debug("after bind(): num = %hu, dif = %d\n",
437 (int)isk->inet_num, 438 isk->inet_num,
438 (int)sk->sk_bound_dev_if); 439 sk->sk_bound_dev_if);
439 440
440 err = 0; 441 err = 0;
441 if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) 442 if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
@@ -642,6 +643,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
642{ 643{
643 struct sk_buff *skb = skb_peek(&sk->sk_write_queue); 644 struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
644 645
646 if (!skb)
647 return 0;
645 pfh->wcheck = csum_partial((char *)&pfh->icmph, 648 pfh->wcheck = csum_partial((char *)&pfh->icmph,
646 sizeof(struct icmphdr), pfh->wcheck); 649 sizeof(struct icmphdr), pfh->wcheck);
647 pfh->icmph.checksum = csum_fold(pfh->wcheck); 650 pfh->icmph.checksum = csum_fold(pfh->wcheck);
@@ -848,7 +851,8 @@ out:
848 return err; 851 return err;
849 852
850do_confirm: 853do_confirm:
851 dst_confirm(&rt->dst); 854 if (msg->msg_flags & MSG_PROBE)
855 dst_confirm_neigh(&rt->dst, &fl4.daddr);
852 if (!(msg->msg_flags & MSG_PROBE) || len) 856 if (!(msg->msg_flags & MSG_PROBE) || len)
853 goto back_from_confirm; 857 goto back_from_confirm;
854 err = 0; 858 err = 0;
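
The ping.c changes above (locking in ping_unhash, the NULL-skb check in ping_v4_push_pending_frames, the MSG_PROBE confirm path) are exercised from userspace through unprivileged "ping sockets". A minimal sender is sketched below under the assumption that net.ipv4.ping_group_range covers the caller's gid; the destination address is a placeholder, and the kernel fills in the ICMP id and checksum.

/* Unprivileged ICMP echo via a SOCK_DGRAM/IPPROTO_ICMP ping socket. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/ip_icmp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in dst = { .sin_family = AF_INET };
        struct icmphdr icmp;
        int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);

        if (fd < 0) {
                perror("ping socket (check net.ipv4.ping_group_range)");
                return 1;
        }
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

        memset(&icmp, 0, sizeof(icmp));
        icmp.type = ICMP_ECHO;                 /* id/checksum set by kernel */
        icmp.un.echo.sequence = htons(1);

        if (sendto(fd, &icmp, sizeof(icmp), 0,
                   (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("sendto");

        close(fd);
        return 0;
}
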
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 7143ca1a6af9..69cf49e8356d 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -57,15 +57,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
57 unsigned int frag_mem; 57 unsigned int frag_mem;
58 int orphans, sockets; 58 int orphans, sockets;
59 59
60 local_bh_disable();
61 orphans = percpu_counter_sum_positive(&tcp_orphan_count); 60 orphans = percpu_counter_sum_positive(&tcp_orphan_count);
62 sockets = proto_sockets_allocated_sum_positive(&tcp_prot); 61 sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
63 local_bh_enable();
64 62
65 socket_seq_show(seq); 63 socket_seq_show(seq);
66 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", 64 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
67 sock_prot_inuse_get(net, &tcp_prot), orphans, 65 sock_prot_inuse_get(net, &tcp_prot), orphans,
68 atomic_read(&tcp_death_row.tw_count), sockets, 66 atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
69 proto_memory_allocated(&tcp_prot)); 67 proto_memory_allocated(&tcp_prot));
70 seq_printf(seq, "UDP: inuse %d mem %ld\n", 68 seq_printf(seq, "UDP: inuse %d mem %ld\n",
71 sock_prot_inuse_get(net, &udp_prot), 69 sock_prot_inuse_get(net, &udp_prot),
@@ -264,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = {
264 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), 262 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
265 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), 263 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
266 SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), 264 SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
265 SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP),
267 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), 266 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
268 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), 267 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
269 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), 268 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4e49e5cb001c..9d943974de2b 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -383,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
383 383
384 sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); 384 sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
385 385
386 if (flags & MSG_CONFIRM)
387 skb_set_dst_pending_confirm(skb, 1);
388
386 skb->transport_header = skb->network_header; 389 skb->transport_header = skb->network_header;
387 err = -EFAULT; 390 err = -EFAULT;
388 if (memcpy_from_msg(iph, msg, length)) 391 if (memcpy_from_msg(iph, msg, length))
@@ -666,7 +669,8 @@ out:
666 return len; 669 return len;
667 670
668do_confirm: 671do_confirm:
669 dst_confirm(&rt->dst); 672 if (msg->msg_flags & MSG_PROBE)
673 dst_confirm_neigh(&rt->dst, &fl4.daddr);
670 if (!(msg->msg_flags & MSG_PROBE) || len) 674 if (!(msg->msg_flags & MSG_PROBE) || len)
671 goto back_from_confirm; 675 goto back_from_confirm;
672 err = 0; 676 err = 0;
@@ -678,7 +682,9 @@ static void raw_close(struct sock *sk, long timeout)
678 /* 682 /*
679 * Raw sockets may have direct kernel references. Kill them. 683 * Raw sockets may have direct kernel references. Kill them.
680 */ 684 */
685 rtnl_lock();
681 ip_ra_control(sk, 0, NULL); 686 ip_ra_control(sk, 0, NULL);
687 rtnl_unlock();
682 688
683 sk_common_release(sk); 689 sk_common_release(sk);
684} 690}
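
The raw.c hunk that sets the pending-confirm flag for MSG_CONFIRM corresponds to the documented send(2) behaviour: a datagram or raw sender that has just heard from the peer can pass MSG_CONFIRM so the stack confirms the neighbour entry instead of re-probing it. A minimal UDP example is sketched below; the address and payload are placeholders.

/* Resend to a peer we just heard from, confirming its neighbour entry. */
#define _GNU_SOURCE            /* MSG_CONFIRM */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in dst = { .sin_family = AF_INET,
                                   .sin_port   = htons(5000) };
        const char payload[] = "keepalive";
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

        /* MSG_CONFIRM is only valid on SOCK_DGRAM and SOCK_RAW sockets. */
        if (sendto(fd, payload, sizeof(payload), MSG_CONFIRM,
                   (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("sendto");

        close(fd);
        return 0;
}
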
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 709ffe67d1de..acd69cfe2951 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -154,6 +154,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, 154static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
155 struct sk_buff *skb, 155 struct sk_buff *skb,
156 const void *daddr); 156 const void *daddr);
157static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
157 158
158static struct dst_ops ipv4_dst_ops = { 159static struct dst_ops ipv4_dst_ops = {
159 .family = AF_INET, 160 .family = AF_INET,
@@ -168,6 +169,7 @@ static struct dst_ops ipv4_dst_ops = {
168 .redirect = ip_do_redirect, 169 .redirect = ip_do_redirect,
169 .local_out = __ip_local_out, 170 .local_out = __ip_local_out,
170 .neigh_lookup = ipv4_neigh_lookup, 171 .neigh_lookup = ipv4_neigh_lookup,
172 .confirm_neigh = ipv4_confirm_neigh,
171}; 173};
172 174
173#define ECN_OR_COST(class) TC_PRIO_##class 175#define ECN_OR_COST(class) TC_PRIO_##class
@@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
461 return neigh_create(&arp_tbl, pkey, dev); 463 return neigh_create(&arp_tbl, pkey, dev);
462} 464}
463 465
466static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
467{
468 struct net_device *dev = dst->dev;
469 const __be32 *pkey = daddr;
470 const struct rtable *rt;
471
472 rt = (const struct rtable *)dst;
473 if (rt->rt_gateway)
474 pkey = (const __be32 *)&rt->rt_gateway;
475 else if (!daddr ||
476 (rt->rt_flags &
477 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
478 return;
479
480 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
481}
482
464#define IP_IDENTS_SZ 2048u 483#define IP_IDENTS_SZ 2048u
465 484
466static atomic_t *ip_idents __read_mostly; 485static atomic_t *ip_idents __read_mostly;
@@ -1758,7 +1777,6 @@ standard_hash:
1758 1777
1759static int ip_mkroute_input(struct sk_buff *skb, 1778static int ip_mkroute_input(struct sk_buff *skb,
1760 struct fib_result *res, 1779 struct fib_result *res,
1761 const struct flowi4 *fl4,
1762 struct in_device *in_dev, 1780 struct in_device *in_dev,
1763 __be32 daddr, __be32 saddr, u32 tos) 1781 __be32 daddr, __be32 saddr, u32 tos)
1764{ 1782{
@@ -1858,6 +1876,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1858 fl4.flowi4_flags = 0; 1876 fl4.flowi4_flags = 0;
1859 fl4.daddr = daddr; 1877 fl4.daddr = daddr;
1860 fl4.saddr = saddr; 1878 fl4.saddr = saddr;
1879 fl4.flowi4_uid = sock_net_uid(net, NULL);
1861 err = fib_lookup(net, &fl4, &res, 0); 1880 err = fib_lookup(net, &fl4, &res, 0);
1862 if (err != 0) { 1881 if (err != 0) {
1863 if (!IN_DEV_FORWARD(in_dev)) 1882 if (!IN_DEV_FORWARD(in_dev))
@@ -1883,7 +1902,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1883 if (res.type != RTN_UNICAST) 1902 if (res.type != RTN_UNICAST)
1884 goto martian_destination; 1903 goto martian_destination;
1885 1904
1886 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); 1905 err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos);
1887out: return err; 1906out: return err;
1888 1907
1889brd_input: 1908brd_input:
@@ -1990,6 +2009,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1990{ 2009{
1991 int res; 2010 int res;
1992 2011
2012 tos &= IPTOS_RT_MASK;
1993 rcu_read_lock(); 2013 rcu_read_lock();
1994 2014
1995 /* Multicast recognition logic is moved from route cache to here. 2015 /* Multicast recognition logic is moved from route cache to here.
@@ -2454,7 +2474,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
2454 2474
2455static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, 2475static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2456 struct flowi4 *fl4, struct sk_buff *skb, u32 portid, 2476 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2457 u32 seq, int event, int nowait, unsigned int flags) 2477 u32 seq, int event)
2458{ 2478{
2459 struct rtable *rt = skb_rtable(skb); 2479 struct rtable *rt = skb_rtable(skb);
2460 struct rtmsg *r; 2480 struct rtmsg *r;
@@ -2463,7 +2483,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2463 u32 error; 2483 u32 error;
2464 u32 metrics[RTAX_MAX]; 2484 u32 metrics[RTAX_MAX];
2465 2485
2466 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); 2486 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0);
2467 if (!nlh) 2487 if (!nlh)
2468 return -EMSGSIZE; 2488 return -EMSGSIZE;
2469 2489
@@ -2541,18 +2561,12 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2541 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2561 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2542 int err = ipmr_get_route(net, skb, 2562 int err = ipmr_get_route(net, skb,
2543 fl4->saddr, fl4->daddr, 2563 fl4->saddr, fl4->daddr,
2544 r, nowait, portid); 2564 r, portid);
2545 2565
2546 if (err <= 0) { 2566 if (err <= 0) {
2547 if (!nowait) { 2567 if (err == 0)
2548 if (err == 0) 2568 return 0;
2549 return 0; 2569 goto nla_put_failure;
2550 goto nla_put_failure;
2551 } else {
2552 if (err == -EMSGSIZE)
2553 goto nla_put_failure;
2554 error = err;
2555 }
2556 } 2570 }
2557 } else 2571 } else
2558#endif 2572#endif
@@ -2606,7 +2620,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2606 skb_reset_network_header(skb); 2620 skb_reset_network_header(skb);
2607 2621
2608 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ 2622 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2609 ip_hdr(skb)->protocol = IPPROTO_ICMP; 2623 ip_hdr(skb)->protocol = IPPROTO_UDP;
2610 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); 2624 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2611 2625
2612 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; 2626 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
@@ -2638,9 +2652,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2638 skb->protocol = htons(ETH_P_IP); 2652 skb->protocol = htons(ETH_P_IP);
2639 skb->dev = dev; 2653 skb->dev = dev;
2640 skb->mark = mark; 2654 skb->mark = mark;
2641 local_bh_disable();
2642 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); 2655 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2643 local_bh_enable();
2644 2656
2645 rt = skb_rtable(skb); 2657 rt = skb_rtable(skb);
2646 if (err == 0 && rt->dst.error) 2658 if (err == 0 && rt->dst.error)
@@ -2665,7 +2677,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2665 2677
2666 err = rt_fill_info(net, dst, src, table_id, &fl4, skb, 2678 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2667 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 2679 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2668 RTM_NEWROUTE, 0, 0); 2680 RTM_NEWROUTE);
2669 if (err < 0) 2681 if (err < 0)
2670 goto errout_free; 2682 goto errout_free;
2671 2683
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 3e88467d70ee..496b97e17aaf 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -13,13 +13,13 @@
13#include <linux/tcp.h> 13#include <linux/tcp.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/cryptohash.h> 16#include <linux/siphash.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/export.h> 18#include <linux/export.h>
19#include <net/tcp.h> 19#include <net/tcp.h>
20#include <net/route.h> 20#include <net/route.h>
21 21
22static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; 22static siphash_key_t syncookie_secret[2] __read_mostly;
23 23
24#define COOKIEBITS 24 /* Upper bits store count */ 24#define COOKIEBITS 24 /* Upper bits store count */
25#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) 25#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -48,24 +48,13 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
48#define TSBITS 6 48#define TSBITS 6
49#define TSMASK (((__u32)1 << TSBITS) - 1) 49#define TSMASK (((__u32)1 << TSBITS) - 1)
50 50
51static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
52
53static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, 51static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
54 u32 count, int c) 52 u32 count, int c)
55{ 53{
56 __u32 *tmp;
57
58 net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); 54 net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
59 55 return siphash_4u32((__force u32)saddr, (__force u32)daddr,
60 tmp = this_cpu_ptr(ipv4_cookie_scratch); 56 (__force u32)sport << 16 | (__force u32)dport,
61 memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); 57 count, &syncookie_secret[c]);
62 tmp[0] = (__force u32)saddr;
63 tmp[1] = (__force u32)daddr;
64 tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
65 tmp[3] = count;
66 sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
67
68 return tmp[17];
69} 58}
70 59
71 60
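
The syncookies.c change above drops the per-cpu SHA scratch buffer and hashes the four 32-bit inputs directly with a keyed PRF (siphash_4u32 over a per-boot secret). The sketch below only mirrors that call structure so it can be compiled standalone; mix64() is a stand-in mixer, NOT SipHash, and must not be used for real cookies.

/* Structural model of the new cookie_hash(): keyed hash of four u32 inputs. */
#include <stdint.h>
#include <stdio.h>

struct hash_key { uint64_t k0, k1; };          /* models siphash_key_t */

static uint64_t mix64(uint64_t x)              /* toy mixer, not SipHash */
{
        x ^= x >> 33; x *= 0xff51afd7ed558ccdULL;
        x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL;
        return x ^ (x >> 33);
}

static uint32_t cookie_hash(uint32_t saddr, uint32_t daddr,
                            uint16_t sport, uint16_t dport,
                            uint32_t count, const struct hash_key *key)
{
        uint64_t h = key->k0;

        h = mix64(h ^ saddr);
        h = mix64(h ^ daddr);
        h = mix64(h ^ ((uint32_t)sport << 16 | dport)); /* same packing as the diff */
        h = mix64(h ^ count ^ key->k1);
        return (uint32_t)h;
}

int main(void)
{
        struct hash_key key = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };

        printf("cookie hash: %08x\n",
               cookie_hash(0xc0000201, 0xc0000202, 12345, 443, 7, &key));
        return 0;
}
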
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b2fa498b15d1..d6880a6149ee 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -35,6 +35,8 @@ static int ip_local_port_range_min[] = { 1, 1 };
35static int ip_local_port_range_max[] = { 65535, 65535 }; 35static int ip_local_port_range_max[] = { 65535, 65535 };
36static int tcp_adv_win_scale_min = -31; 36static int tcp_adv_win_scale_min = -31;
37static int tcp_adv_win_scale_max = 31; 37static int tcp_adv_win_scale_max = 31;
38static int ip_privileged_port_min;
39static int ip_privileged_port_max = 65535;
38static int ip_ttl_min = 1; 40static int ip_ttl_min = 1;
39static int ip_ttl_max = 255; 41static int ip_ttl_max = 255;
40static int tcp_syn_retries_min = 1; 42static int tcp_syn_retries_min = 1;
@@ -79,7 +81,12 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
79 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 81 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
80 82
81 if (write && ret == 0) { 83 if (write && ret == 0) {
82 if (range[1] < range[0]) 84 /* Ensure that the upper limit is not smaller than the lower,
85 * and that the lower does not encroach upon the privileged
86 * port limit.
87 */
88 if ((range[1] < range[0]) ||
89 (range[0] < net->ipv4.sysctl_ip_prot_sock))
83 ret = -EINVAL; 90 ret = -EINVAL;
84 else 91 else
85 set_local_port_range(net, range); 92 set_local_port_range(net, range);
@@ -88,6 +95,40 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
88 return ret; 95 return ret;
89} 96}
90 97
98/* Validate changes from /proc interface. */
99static int ipv4_privileged_ports(struct ctl_table *table, int write,
100 void __user *buffer, size_t *lenp, loff_t *ppos)
101{
102 struct net *net = container_of(table->data, struct net,
103 ipv4.sysctl_ip_prot_sock);
104 int ret;
105 int pports;
106 int range[2];
107 struct ctl_table tmp = {
108 .data = &pports,
109 .maxlen = sizeof(pports),
110 .mode = table->mode,
111 .extra1 = &ip_privileged_port_min,
112 .extra2 = &ip_privileged_port_max,
113 };
114
115 pports = net->ipv4.sysctl_ip_prot_sock;
116
117 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
118
119 if (write && ret == 0) {
120 inet_get_local_port_range(net, &range[0], &range[1]);
121 /* Ensure that the local port range doesn't overlap with the
122 * privileged port range.
123 */
124 if (range[0] < pports)
125 ret = -EINVAL;
126 else
127 net->ipv4.sysctl_ip_prot_sock = pports;
128 }
129
130 return ret;
131}
91 132
92static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) 133static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
93{ 134{
@@ -290,13 +331,6 @@ static struct ctl_table ipv4_table[] = {
290 .proc_handler = proc_dointvec 331 .proc_handler = proc_dointvec
291 }, 332 },
292 { 333 {
293 .procname = "tcp_max_tw_buckets",
294 .data = &tcp_death_row.sysctl_max_tw_buckets,
295 .maxlen = sizeof(int),
296 .mode = 0644,
297 .proc_handler = proc_dointvec
298 },
299 {
300 .procname = "tcp_fastopen", 334 .procname = "tcp_fastopen",
301 .data = &sysctl_tcp_fastopen, 335 .data = &sysctl_tcp_fastopen,
302 .maxlen = sizeof(int), 336 .maxlen = sizeof(int),
@@ -310,13 +344,6 @@ static struct ctl_table ipv4_table[] = {
310 .proc_handler = proc_tcp_fastopen_key, 344 .proc_handler = proc_tcp_fastopen_key,
311 }, 345 },
312 { 346 {
313 .procname = "tcp_tw_recycle",
314 .data = &tcp_death_row.sysctl_tw_recycle,
315 .maxlen = sizeof(int),
316 .mode = 0644,
317 .proc_handler = proc_dointvec
318 },
319 {
320 .procname = "tcp_abort_on_overflow", 347 .procname = "tcp_abort_on_overflow",
321 .data = &sysctl_tcp_abort_on_overflow, 348 .data = &sysctl_tcp_abort_on_overflow,
322 .maxlen = sizeof(int), 349 .maxlen = sizeof(int),
@@ -338,13 +365,6 @@ static struct ctl_table ipv4_table[] = {
338 .proc_handler = proc_dointvec 365 .proc_handler = proc_dointvec
339 }, 366 },
340 { 367 {
341 .procname = "tcp_max_syn_backlog",
342 .data = &sysctl_max_syn_backlog,
343 .maxlen = sizeof(int),
344 .mode = 0644,
345 .proc_handler = proc_dointvec
346 },
347 {
348 .procname = "inet_peer_threshold", 368 .procname = "inet_peer_threshold",
349 .data = &inet_peer_threshold, 369 .data = &inet_peer_threshold,
350 .maxlen = sizeof(int), 370 .maxlen = sizeof(int),
@@ -558,13 +578,6 @@ static struct ctl_table ipv4_table[] = {
558 .proc_handler = proc_dointvec 578 .proc_handler = proc_dointvec
559 }, 579 },
560 { 580 {
561 .procname = "tcp_thin_dupack",
562 .data = &sysctl_tcp_thin_dupack,
563 .maxlen = sizeof(int),
564 .mode = 0644,
565 .proc_handler = proc_dointvec
566 },
567 {
568 .procname = "tcp_early_retrans", 581 .procname = "tcp_early_retrans",
569 .data = &sysctl_tcp_early_retrans, 582 .data = &sysctl_tcp_early_retrans,
570 .maxlen = sizeof(int), 583 .maxlen = sizeof(int),
@@ -960,6 +973,27 @@ static struct ctl_table ipv4_net_table[] = {
960 .mode = 0644, 973 .mode = 0644,
961 .proc_handler = proc_dointvec 974 .proc_handler = proc_dointvec
962 }, 975 },
976 {
977 .procname = "tcp_max_tw_buckets",
978 .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
979 .maxlen = sizeof(int),
980 .mode = 0644,
981 .proc_handler = proc_dointvec
982 },
983 {
984 .procname = "tcp_tw_recycle",
985 .data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle,
986 .maxlen = sizeof(int),
987 .mode = 0644,
988 .proc_handler = proc_dointvec
989 },
990 {
991 .procname = "tcp_max_syn_backlog",
992 .data = &init_net.ipv4.sysctl_max_syn_backlog,
993 .maxlen = sizeof(int),
994 .mode = 0644,
995 .proc_handler = proc_dointvec
996 },
963#ifdef CONFIG_IP_ROUTE_MULTIPATH 997#ifdef CONFIG_IP_ROUTE_MULTIPATH
964 { 998 {
965 .procname = "fib_multipath_use_neigh", 999 .procname = "fib_multipath_use_neigh",
@@ -971,6 +1005,24 @@ static struct ctl_table ipv4_net_table[] = {
971 .extra2 = &one, 1005 .extra2 = &one,
972 }, 1006 },
973#endif 1007#endif
1008 {
1009 .procname = "ip_unprivileged_port_start",
1010 .maxlen = sizeof(int),
1011 .data = &init_net.ipv4.sysctl_ip_prot_sock,
1012 .mode = 0644,
1013 .proc_handler = ipv4_privileged_ports,
1014 },
1015#ifdef CONFIG_NET_L3_MASTER_DEV
1016 {
1017 .procname = "udp_l3mdev_accept",
1018 .data = &init_net.ipv4.sysctl_udp_l3mdev_accept,
1019 .maxlen = sizeof(int),
1020 .mode = 0644,
1021 .proc_handler = proc_dointvec_minmax,
1022 .extra1 = &zero,
1023 .extra2 = &one,
1024 },
1025#endif
974 { } 1026 { }
975}; 1027};
976 1028
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4a044964da66..40ba4249a586 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk)
406 tp->mss_cache = TCP_MSS_DEFAULT; 406 tp->mss_cache = TCP_MSS_DEFAULT;
407 407
408 tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; 408 tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
409 tcp_enable_early_retrans(tp);
410 tcp_assign_congestion_control(sk); 409 tcp_assign_congestion_control(sk);
411 410
412 tp->tsoffset = 0; 411 tp->tsoffset = 0;
@@ -421,15 +420,13 @@ void tcp_init_sock(struct sock *sk)
421 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 420 sk->sk_sndbuf = sysctl_tcp_wmem[1];
422 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 421 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
423 422
424 local_bh_disable();
425 sk_sockets_allocated_inc(sk); 423 sk_sockets_allocated_inc(sk);
426 local_bh_enable();
427} 424}
428EXPORT_SYMBOL(tcp_init_sock); 425EXPORT_SYMBOL(tcp_init_sock);
429 426
430static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) 427static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
431{ 428{
432 if (tsflags) { 429 if (tsflags && skb) {
433 struct skb_shared_info *shinfo = skb_shinfo(skb); 430 struct skb_shared_info *shinfo = skb_shinfo(skb);
434 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 431 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
435 432
@@ -536,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
536 533
537 if (tp->urg_data & TCP_URG_VALID) 534 if (tp->urg_data & TCP_URG_VALID)
538 mask |= POLLPRI; 535 mask |= POLLPRI;
536 } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
537 /* Active TCP fastopen socket with defer_connect
538 * Return POLLOUT so application can call write()
539 * in order for kernel to generate SYN+data
540 */
541 mask |= POLLOUT | POLLWRNORM;
539 } 542 }
540 /* This barrier is coupled with smp_wmb() in tcp_reset() */ 543 /* This barrier is coupled with smp_wmb() in tcp_reset() */
541 smp_rmb(); 544 smp_rmb();
@@ -770,6 +773,12 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
770 ret = -EAGAIN; 773 ret = -EAGAIN;
771 break; 774 break;
772 } 775 }
776 /* if __tcp_splice_read() got nothing while we have
777 * an skb in receive queue, we do not want to loop.
778 * This might happen with URG data.
779 */
780 if (!skb_queue_empty(&sk->sk_receive_queue))
781 break;
773 sk_wait_data(sk, &timeo, NULL); 782 sk_wait_data(sk, &timeo, NULL);
774 if (signal_pending(current)) { 783 if (signal_pending(current)) {
775 ret = sock_intr_errno(timeo); 784 ret = sock_intr_errno(timeo);
@@ -958,10 +967,8 @@ new_segment:
958 copied += copy; 967 copied += copy;
959 offset += copy; 968 offset += copy;
960 size -= copy; 969 size -= copy;
961 if (!size) { 970 if (!size)
962 tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
963 goto out; 971 goto out;
964 }
965 972
966 if (skb->len < size_goal || (flags & MSG_OOB)) 973 if (skb->len < size_goal || (flags & MSG_OOB))
967 continue; 974 continue;
@@ -987,8 +994,11 @@ wait_for_memory:
987 } 994 }
988 995
989out: 996out:
990 if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) 997 if (copied) {
991 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); 998 tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
999 if (!(flags & MSG_SENDPAGE_NOTLAST))
1000 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1001 }
992 return copied; 1002 return copied;
993 1003
994do_error: 1004do_error:
@@ -1073,6 +1083,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1073 int *copied, size_t size) 1083 int *copied, size_t size)
1074{ 1084{
1075 struct tcp_sock *tp = tcp_sk(sk); 1085 struct tcp_sock *tp = tcp_sk(sk);
1086 struct inet_sock *inet = inet_sk(sk);
1076 int err, flags; 1087 int err, flags;
1077 1088
1078 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) 1089 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
@@ -1087,11 +1098,26 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1087 tp->fastopen_req->data = msg; 1098 tp->fastopen_req->data = msg;
1088 tp->fastopen_req->size = size; 1099 tp->fastopen_req->size = size;
1089 1100
1101 if (inet->defer_connect) {
1102 err = tcp_connect(sk);
1103 /* Same failure procedure as in tcp_v4/6_connect */
1104 if (err) {
1105 tcp_set_state(sk, TCP_CLOSE);
1106 inet->inet_dport = 0;
1107 sk->sk_route_caps = 0;
1108 }
1109 }
1090 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; 1110 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1091 err = __inet_stream_connect(sk->sk_socket, msg->msg_name, 1111 err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1092 msg->msg_namelen, flags); 1112 msg->msg_namelen, flags, 1);
1093 *copied = tp->fastopen_req->copied; 1113 /* fastopen_req could already be freed in __inet_stream_connect
1094 tcp_free_fastopen_req(tp); 1114 * if the connection times out or gets rst
1115 */
1116 if (tp->fastopen_req) {
1117 *copied = tp->fastopen_req->copied;
1118 tcp_free_fastopen_req(tp);
1119 inet->defer_connect = 0;
1120 }
1095 return err; 1121 return err;
1096} 1122}
1097 1123
@@ -1109,7 +1135,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1109 lock_sock(sk); 1135 lock_sock(sk);
1110 1136
1111 flags = msg->msg_flags; 1137 flags = msg->msg_flags;
1112 if (flags & MSG_FASTOPEN) { 1138 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
1113 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); 1139 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
1114 if (err == -EINPROGRESS && copied_syn > 0) 1140 if (err == -EINPROGRESS && copied_syn > 0)
1115 goto out; 1141 goto out;
@@ -1267,7 +1293,7 @@ new_segment:
1267 } else { 1293 } else {
1268 skb_fill_page_desc(skb, i, pfrag->page, 1294 skb_fill_page_desc(skb, i, pfrag->page,
1269 pfrag->offset, copy); 1295 pfrag->offset, copy);
1270 get_page(pfrag->page); 1296 page_ref_inc(pfrag->page);
1271 } 1297 }
1272 pfrag->offset += copy; 1298 pfrag->offset += copy;
1273 } 1299 }
@@ -1281,7 +1307,6 @@ new_segment:
1281 1307
1282 copied += copy; 1308 copied += copy;
1283 if (!msg_data_left(msg)) { 1309 if (!msg_data_left(msg)) {
1284 tcp_tx_timestamp(sk, sockc.tsflags, skb);
1285 if (unlikely(flags & MSG_EOR)) 1310 if (unlikely(flags & MSG_EOR))
1286 TCP_SKB_CB(skb)->eor = 1; 1311 TCP_SKB_CB(skb)->eor = 1;
1287 goto out; 1312 goto out;
@@ -1312,8 +1337,10 @@ wait_for_memory:
1312 } 1337 }
1313 1338
1314out: 1339out:
1315 if (copied) 1340 if (copied) {
1341 tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
1316 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); 1342 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1343 }
1317out_nopush: 1344out_nopush:
1318 release_sock(sk); 1345 release_sock(sk);
1319 return copied + copied_syn; 1346 return copied + copied_syn;
@@ -2295,6 +2322,11 @@ int tcp_disconnect(struct sock *sk, int flags)
2295 tcp_init_send_head(sk); 2322 tcp_init_send_head(sk);
2296 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); 2323 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2297 __sk_dst_reset(sk); 2324 __sk_dst_reset(sk);
2325 tcp_saved_syn_free(tp);
2326
2327 /* Clean up fastopen related fields */
2328 tcp_free_fastopen_req(tp);
2329 inet->defer_connect = 0;
2298 2330
2299 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); 2331 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2300 2332
@@ -2473,11 +2505,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2473 case TCP_THIN_DUPACK: 2505 case TCP_THIN_DUPACK:
2474 if (val < 0 || val > 1) 2506 if (val < 0 || val > 1)
2475 err = -EINVAL; 2507 err = -EINVAL;
2476 else {
2477 tp->thin_dupack = val;
2478 if (tp->thin_dupack)
2479 tcp_disable_early_retrans(tp);
2480 }
2481 break; 2508 break;
2482 2509
2483 case TCP_REPAIR: 2510 case TCP_REPAIR:
@@ -2662,6 +2689,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2662 err = -EINVAL; 2689 err = -EINVAL;
2663 } 2690 }
2664 break; 2691 break;
2692 case TCP_FASTOPEN_CONNECT:
2693 if (val > 1 || val < 0) {
2694 err = -EINVAL;
2695 } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
2696 if (sk->sk_state == TCP_CLOSE)
2697 tp->fastopen_connect = val;
2698 else
2699 err = -EINVAL;
2700 } else {
2701 err = -EOPNOTSUPP;
2702 }
2703 break;
2665 case TCP_TIMESTAMP: 2704 case TCP_TIMESTAMP:
2666 if (!tp->repair) 2705 if (!tp->repair)
2667 err = -EPERM; 2706 err = -EPERM;
@@ -2732,7 +2771,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2732{ 2771{
2733 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ 2772 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
2734 const struct inet_connection_sock *icsk = inet_csk(sk); 2773 const struct inet_connection_sock *icsk = inet_csk(sk);
2735 u32 now = tcp_time_stamp, intv; 2774 u32 now, intv;
2736 u64 rate64; 2775 u64 rate64;
2737 bool slow; 2776 bool slow;
2738 u32 rate; 2777 u32 rate;
@@ -2764,6 +2803,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2764 info->tcpi_sacked = sk->sk_max_ack_backlog; 2803 info->tcpi_sacked = sk->sk_max_ack_backlog;
2765 return; 2804 return;
2766 } 2805 }
2806
2807 slow = lock_sock_fast(sk);
2808
2767 info->tcpi_ca_state = icsk->icsk_ca_state; 2809 info->tcpi_ca_state = icsk->icsk_ca_state;
2768 info->tcpi_retransmits = icsk->icsk_retransmits; 2810 info->tcpi_retransmits = icsk->icsk_retransmits;
2769 info->tcpi_probes = icsk->icsk_probes_out; 2811 info->tcpi_probes = icsk->icsk_probes_out;
@@ -2798,6 +2840,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2798 info->tcpi_retrans = tp->retrans_out; 2840 info->tcpi_retrans = tp->retrans_out;
2799 info->tcpi_fackets = tp->fackets_out; 2841 info->tcpi_fackets = tp->fackets_out;
2800 2842
2843 now = tcp_time_stamp;
2801 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); 2844 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2802 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); 2845 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2803 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); 2846 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
@@ -2814,15 +2857,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2814 2857
2815 info->tcpi_total_retrans = tp->total_retrans; 2858 info->tcpi_total_retrans = tp->total_retrans;
2816 2859
2817 slow = lock_sock_fast(sk);
2818
2819 info->tcpi_bytes_acked = tp->bytes_acked; 2860 info->tcpi_bytes_acked = tp->bytes_acked;
2820 info->tcpi_bytes_received = tp->bytes_received; 2861 info->tcpi_bytes_received = tp->bytes_received;
2821 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); 2862 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
2822 tcp_get_info_chrono_stats(tp, info); 2863 tcp_get_info_chrono_stats(tp, info);
2823 2864
2824 unlock_sock_fast(sk, slow);
2825
2826 info->tcpi_segs_out = tp->segs_out; 2865 info->tcpi_segs_out = tp->segs_out;
2827 info->tcpi_segs_in = tp->segs_in; 2866 info->tcpi_segs_in = tp->segs_in;
2828 2867
@@ -2838,6 +2877,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2838 do_div(rate64, intv); 2877 do_div(rate64, intv);
2839 info->tcpi_delivery_rate = rate64; 2878 info->tcpi_delivery_rate = rate64;
2840 } 2879 }
2880 unlock_sock_fast(sk, slow);
2841} 2881}
2842EXPORT_SYMBOL_GPL(tcp_get_info); 2882EXPORT_SYMBOL_GPL(tcp_get_info);
2843 2883
@@ -2847,7 +2887,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
2847 struct sk_buff *stats; 2887 struct sk_buff *stats;
2848 struct tcp_info info; 2888 struct tcp_info info;
2849 2889
2850 stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); 2890 stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
2851 if (!stats) 2891 if (!stats)
2852 return NULL; 2892 return NULL;
2853 2893
@@ -2858,6 +2898,10 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
2858 info.tcpi_rwnd_limited, TCP_NLA_PAD); 2898 info.tcpi_rwnd_limited, TCP_NLA_PAD);
2859 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, 2899 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
2860 info.tcpi_sndbuf_limited, TCP_NLA_PAD); 2900 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
2901 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
2902 tp->data_segs_out, TCP_NLA_PAD);
2903 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
2904 tp->total_retrans, TCP_NLA_PAD);
2861 return stats; 2905 return stats;
2862} 2906}
2863 2907
@@ -2967,8 +3011,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2967 case TCP_THIN_LINEAR_TIMEOUTS: 3011 case TCP_THIN_LINEAR_TIMEOUTS:
2968 val = tp->thin_lto; 3012 val = tp->thin_lto;
2969 break; 3013 break;
3014
2970 case TCP_THIN_DUPACK: 3015 case TCP_THIN_DUPACK:
2971 val = tp->thin_dupack; 3016 val = 0;
2972 break; 3017 break;
2973 3018
2974 case TCP_REPAIR: 3019 case TCP_REPAIR:
@@ -3021,6 +3066,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
3021 val = icsk->icsk_accept_queue.fastopenq.max_qlen; 3066 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
3022 break; 3067 break;
3023 3068
3069 case TCP_FASTOPEN_CONNECT:
3070 val = tp->fastopen_connect;
3071 break;
3072
3024 case TCP_TIMESTAMP: 3073 case TCP_TIMESTAMP:
3025 val = tcp_time_stamp + tp->tsoffset; 3074 val = tcp_time_stamp + tp->tsoffset;
3026 break; 3075 break;
@@ -3334,6 +3383,7 @@ void __init tcp_init(void)
3334 3383
3335 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); 3384 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
3336 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); 3385 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
3386 inet_hashinfo_init(&tcp_hashinfo);
3337 tcp_hashinfo.bind_bucket_cachep = 3387 tcp_hashinfo.bind_bucket_cachep =
3338 kmem_cache_create("tcp_bind_bucket", 3388 kmem_cache_create("tcp_bind_bucket",
3339 sizeof(struct inet_bind_bucket), 0, 3389 sizeof(struct inet_bind_bucket), 0,
@@ -3377,10 +3427,7 @@ void __init tcp_init(void)
3377 3427
3378 3428
3379 cnt = tcp_hashinfo.ehash_mask + 1; 3429 cnt = tcp_hashinfo.ehash_mask + 1;
3380
3381 tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3382 sysctl_tcp_max_orphans = cnt / 2; 3430 sysctl_tcp_max_orphans = cnt / 2;
3383 sysctl_max_syn_backlog = max(128, cnt / 256);
3384 3431
3385 tcp_init_mem(); 3432 tcp_init_mem();
3386 /* Set per-socket limits to no more than 1/128 the pressure threshold */ 3433 /* Set per-socket limits to no more than 1/128 the pressure threshold */
@@ -3399,6 +3446,7 @@ void __init tcp_init(void)
3399 pr_info("Hash tables configured (established %u bind %u)\n", 3446 pr_info("Hash tables configured (established %u bind %u)\n",
3400 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3447 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3401 3448
3449 tcp_v4_init();
3402 tcp_metrics_init(); 3450 tcp_metrics_init();
3403 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); 3451 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3404 tcp_tasklet_init(); 3452 tcp_tasklet_init();
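
The tcp.c and tcp_fastopen.c changes above add the client-side TCP_FASTOPEN_CONNECT option: with it set, connect() can return immediately and the SYN is deferred until the first write(), carrying data when a Fast Open cookie for the peer is already cached. A minimal client is sketched below; the server address is a placeholder and the fallback define matches the value in linux/tcp.h for toolchains that lack the macro.

/* TCP Fast Open client using the defer-connect option. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef TCP_FASTOPEN_CONNECT
#define TCP_FASTOPEN_CONNECT 30
#endif

int main(void)
{
        struct sockaddr_in srv = { .sin_family = AF_INET,
                                   .sin_port   = htons(80) };
        const char req[] = "GET / HTTP/1.0\r\n\r\n";
        int one = 1;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        inet_pton(AF_INET, "192.0.2.10", &srv.sin_addr);

        if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT,
                       &one, sizeof(one)) < 0)
                perror("TCP_FASTOPEN_CONNECT");

        /* With a cached TFO cookie this returns without emitting a SYN. */
        if (connect(fd, (struct sockaddr *)&srv, sizeof(srv)) < 0)
                perror("connect");

        /* The first write triggers the SYN, with data in it when possible. */
        if (write(fd, req, strlen(req)) < 0)
                perror("write");

        close(fd);
        return 0;
}
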
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 35b280361cb2..50a0f3e51d5b 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -27,6 +27,8 @@
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/random.h> 28#include <linux/random.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/sched/clock.h>
31
30#include <net/tcp.h> 32#include <net/tcp.h>
31 33
32#define HYSTART_ACK_TRAIN 1 34#define HYSTART_ACK_TRAIN 1
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index dd2560c83a85..8ea4e9787f82 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -326,3 +326,57 @@ fastopen:
326 *foc = valid_foc; 326 *foc = valid_foc;
327 return NULL; 327 return NULL;
328} 328}
329
330bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
331 struct tcp_fastopen_cookie *cookie)
332{
333 unsigned long last_syn_loss = 0;
334 int syn_loss = 0;
335
336 tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
337
338 /* Recurring FO SYN losses: no cookie or data in SYN */
339 if (syn_loss > 1 &&
340 time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
341 cookie->len = -1;
342 return false;
343 }
344 if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
345 cookie->len = -1;
346 return true;
347 }
348 return cookie->len > 0;
349}
350
351/* This function checks if we want to defer sending SYN until the first
352 * write(). We defer under the following conditions:
353 * 1. fastopen_connect sockopt is set
354 * 2. we have a valid cookie
355 * Return value: return true if we want to defer until application writes data
356 * return false if we want to send out SYN immediately
357 */
358bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
359{
360 struct tcp_fastopen_cookie cookie = { .len = 0 };
361 struct tcp_sock *tp = tcp_sk(sk);
362 u16 mss;
363
364 if (tp->fastopen_connect && !tp->fastopen_req) {
365 if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
366 inet_sk(sk)->defer_connect = 1;
367 return true;
368 }
369
370 /* Alloc fastopen_req in order for FO option to be included
371 * in SYN
372 */
373 tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
374 sk->sk_allocation);
375 if (tp->fastopen_req)
376 tp->fastopen_req->cookie = cookie;
377 else
378 *err = -ENOBUFS;
379 }
380 return false;
381}
382EXPORT_SYMBOL(tcp_fastopen_defer_connect);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 41dcbd568cbe..659d1baefb2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,7 +79,7 @@
79int sysctl_tcp_timestamps __read_mostly = 1; 79int sysctl_tcp_timestamps __read_mostly = 1;
80int sysctl_tcp_window_scaling __read_mostly = 1; 80int sysctl_tcp_window_scaling __read_mostly = 1;
81int sysctl_tcp_sack __read_mostly = 1; 81int sysctl_tcp_sack __read_mostly = 1;
82int sysctl_tcp_fack __read_mostly = 1; 82int sysctl_tcp_fack __read_mostly;
83int sysctl_tcp_max_reordering __read_mostly = 300; 83int sysctl_tcp_max_reordering __read_mostly = 300;
84int sysctl_tcp_dsack __read_mostly = 1; 84int sysctl_tcp_dsack __read_mostly = 1;
85int sysctl_tcp_app_win __read_mostly = 31; 85int sysctl_tcp_app_win __read_mostly = 31;
@@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
95int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 95int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
96int sysctl_tcp_frto __read_mostly = 2; 96int sysctl_tcp_frto __read_mostly = 2;
97int sysctl_tcp_min_rtt_wlen __read_mostly = 300; 97int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
98
99int sysctl_tcp_thin_dupack __read_mostly;
100
101int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 98int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
102int sysctl_tcp_early_retrans __read_mostly = 3; 99int sysctl_tcp_early_retrans __read_mostly = 3;
103int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; 100int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
@@ -129,7 +126,8 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
129#define REXMIT_LOST 1 /* retransmit packets marked lost */ 126#define REXMIT_LOST 1 /* retransmit packets marked lost */
130#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ 127#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
131 128
132static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb) 129static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
130 unsigned int len)
133{ 131{
134 static bool __once __read_mostly; 132 static bool __once __read_mostly;
135 133
@@ -140,8 +138,9 @@ static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
140 138
141 rcu_read_lock(); 139 rcu_read_lock();
142 dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); 140 dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
143 pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", 141 if (!dev || len >= dev->mtu)
144 dev ? dev->name : "Unknown driver"); 142 pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
143 dev ? dev->name : "Unknown driver");
145 rcu_read_unlock(); 144 rcu_read_unlock();
146 } 145 }
147} 146}
@@ -164,8 +163,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
164 if (len >= icsk->icsk_ack.rcv_mss) { 163 if (len >= icsk->icsk_ack.rcv_mss) {
165 icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, 164 icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
166 tcp_sk(sk)->advmss); 165 tcp_sk(sk)->advmss);
167 if (unlikely(icsk->icsk_ack.rcv_mss != len)) 166 /* Account for possibly-removed options */
168 tcp_gro_dev_warn(sk, skb); 167 if (unlikely(len > icsk->icsk_ack.rcv_mss +
168 MAX_TCP_OPTION_SPACE))
169 tcp_gro_dev_warn(sk, skb, len);
169 } else { 170 } else {
170 /* Otherwise, we make more careful check taking into account, 171 /* Otherwise, we make more careful check taking into account,
171 * that SACKs block is variable. 172 * that SACKs block is variable.
@@ -877,22 +878,11 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
877 const int ts) 878 const int ts)
878{ 879{
879 struct tcp_sock *tp = tcp_sk(sk); 880 struct tcp_sock *tp = tcp_sk(sk);
880 if (metric > tp->reordering) { 881 int mib_idx;
881 int mib_idx;
882 882
883 if (metric > tp->reordering) {
883 tp->reordering = min(sysctl_tcp_max_reordering, metric); 884 tp->reordering = min(sysctl_tcp_max_reordering, metric);
884 885
885 /* This exciting event is worth to be remembered. 8) */
886 if (ts)
887 mib_idx = LINUX_MIB_TCPTSREORDER;
888 else if (tcp_is_reno(tp))
889 mib_idx = LINUX_MIB_TCPRENOREORDER;
890 else if (tcp_is_fack(tp))
891 mib_idx = LINUX_MIB_TCPFACKREORDER;
892 else
893 mib_idx = LINUX_MIB_TCPSACKREORDER;
894
895 NET_INC_STATS(sock_net(sk), mib_idx);
896#if FASTRETRANS_DEBUG > 1 886#if FASTRETRANS_DEBUG > 1
897 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", 887 pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
898 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, 888 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
@@ -904,9 +894,19 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
904 tcp_disable_fack(tp); 894 tcp_disable_fack(tp);
905 } 895 }
906 896
907 if (metric > 0)
908 tcp_disable_early_retrans(tp);
909 tp->rack.reord = 1; 897 tp->rack.reord = 1;
898
899 /* This exciting event is worth to be remembered. 8) */
900 if (ts)
901 mib_idx = LINUX_MIB_TCPTSREORDER;
902 else if (tcp_is_reno(tp))
903 mib_idx = LINUX_MIB_TCPRENOREORDER;
904 else if (tcp_is_fack(tp))
905 mib_idx = LINUX_MIB_TCPFACKREORDER;
906 else
907 mib_idx = LINUX_MIB_TCPSACKREORDER;
908
909 NET_INC_STATS(sock_net(sk), mib_idx);
910} 910}
911 911
912/* This must be called before lost_out is incremented */ 912/* This must be called before lost_out is incremented */
@@ -916,10 +916,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
916 before(TCP_SKB_CB(skb)->seq, 916 before(TCP_SKB_CB(skb)->seq,
917 TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) 917 TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
918 tp->retransmit_skb_hint = skb; 918 tp->retransmit_skb_hint = skb;
919
920 if (!tp->lost_out ||
921 after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
922 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
923} 919}
924 920
925/* Sum the number of packets on the wire we have marked as lost. 921/* Sum the number of packets on the wire we have marked as lost.
@@ -1135,6 +1131,7 @@ struct tcp_sacktag_state {
1135 */ 1131 */
1136 struct skb_mstamp first_sackt; 1132 struct skb_mstamp first_sackt;
1137 struct skb_mstamp last_sackt; 1133 struct skb_mstamp last_sackt;
1134 struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
1138 struct rate_sample *rate; 1135 struct rate_sample *rate;
1139 int flag; 1136 int flag;
1140}; 1137};
@@ -1217,7 +1214,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
1217 return sacked; 1214 return sacked;
1218 1215
1219 if (!(sacked & TCPCB_SACKED_ACKED)) { 1216 if (!(sacked & TCPCB_SACKED_ACKED)) {
1220 tcp_rack_advance(tp, xmit_time, sacked); 1217 tcp_rack_advance(tp, sacked, end_seq,
1218 xmit_time, &state->ack_time);
1221 1219
1222 if (sacked & TCPCB_SACKED_RETRANS) { 1220 if (sacked & TCPCB_SACKED_RETRANS) {
1223 /* If the segment is not tagged as lost, 1221 /* If the segment is not tagged as lost,
@@ -1982,7 +1980,6 @@ void tcp_enter_loss(struct sock *sk)
1982 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; 1980 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1983 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1981 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1984 tp->lost_out += tcp_skb_pcount(skb); 1982 tp->lost_out += tcp_skb_pcount(skb);
1985 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
1986 } 1983 }
1987 } 1984 }
1988 tcp_verify_left_out(tp); 1985 tcp_verify_left_out(tp);
@@ -2001,6 +1998,11 @@ void tcp_enter_loss(struct sock *sk)
2001 /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous 1998 /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
2002 * loss recovery is underway except recurring timeout(s) on 1999 * loss recovery is underway except recurring timeout(s) on
2003 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing 2000 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
2001 *
2002 * In theory F-RTO can be used repeatedly during loss recovery.
2003 * In practice this interacts badly with broken middle-boxes that
2004 * falsely raise the receive window, which results in repeated
2005 * timeouts and stop-and-go behavior.
2004 */ 2006 */
2005 tp->frto = sysctl_tcp_frto && 2007 tp->frto = sysctl_tcp_frto &&
2006 (new_recovery || icsk->icsk_retransmits) && 2008 (new_recovery || icsk->icsk_retransmits) &&
@@ -2056,30 +2058,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
2056 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; 2058 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
2057} 2059}
2058 2060
2059static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2060{
2061 struct tcp_sock *tp = tcp_sk(sk);
2062 unsigned long delay;
2063
2064 /* Delay early retransmit and entering fast recovery for
2065 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
2066 * available, or RTO is scheduled to fire first.
2067 */
2068 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
2069 (flag & FLAG_ECE) || !tp->srtt_us)
2070 return false;
2071
2072 delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
2073 msecs_to_jiffies(2));
2074
2075 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2076 return false;
2077
2078 inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
2079 TCP_RTO_MAX);
2080 return true;
2081}
2082
2083/* Linux NewReno/SACK/FACK/ECN state machine. 2061/* Linux NewReno/SACK/FACK/ECN state machine.
2084 * -------------------------------------- 2062 * --------------------------------------
2085 * 2063 *
@@ -2127,10 +2105,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2127 * F.e. after RTO, when all the queue is considered as lost, 2105 * F.e. after RTO, when all the queue is considered as lost,
2128 * lost_out = packets_out and in_flight = retrans_out. 2106 * lost_out = packets_out and in_flight = retrans_out.
2129 * 2107 *
2130 * Essentially, we have now two algorithms counting 2108 * Essentially, we have now a few algorithms detecting
2131 * lost packets. 2109 * lost packets.
2132 * 2110 *
2133 * FACK: It is the simplest heuristics. As soon as we decided 2111 * If the receiver supports SACK:
2112 *
2113 * RFC6675/3517: It is the conventional algorithm. A packet is
2114 * considered lost if the number of higher sequence packets
 2115  *	SACKed is greater than or equal to the DUPACK threshold
2116 * (reordering). This is implemented in tcp_mark_head_lost and
2117 * tcp_update_scoreboard.
2118 *
2119 * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
2120 * (2017-) that checks timing instead of counting DUPACKs.
2121 * Essentially a packet is considered lost if it's not S/ACKed
2122 * after RTT + reordering_window, where both metrics are
2123 * dynamically measured and adjusted. This is implemented in
2124 * tcp_rack_mark_lost.
2125 *
 2126  * FACK (Disabled by default. Subsumed by RACK):
2127 * It is the simplest heuristics. As soon as we decided
2134 * that something is lost, we decide that _all_ not SACKed 2128 * that something is lost, we decide that _all_ not SACKed
2135 * packets until the most forward SACK are lost. I.e. 2129 * packets until the most forward SACK are lost. I.e.
2136 * lost_out = fackets_out - sacked_out and left_out = fackets_out. 2130 * lost_out = fackets_out - sacked_out and left_out = fackets_out.
@@ -2139,16 +2133,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2139 * takes place. We use FACK by default until reordering 2133 * takes place. We use FACK by default until reordering
2140 * is suspected on the path to this destination. 2134 * is suspected on the path to this destination.
2141 * 2135 *
2142 * NewReno: when Recovery is entered, we assume that one segment 2136 * If the receiver does not support SACK:
2137 *
2138 * NewReno (RFC6582): in Recovery we assume that one segment
2143 * is lost (classic Reno). While we are in Recovery and 2139 * is lost (classic Reno). While we are in Recovery and
2144 * a partial ACK arrives, we assume that one more packet 2140 * a partial ACK arrives, we assume that one more packet
 2145  * is lost (NewReno). These heuristics are the same in NewReno 2141  * is lost (NewReno). These heuristics are the same in NewReno
2146 * and SACK. 2142 * and SACK.
2147 * 2143 *
2148 * Imagine, that's all! Forget about all this shamanism about CWND inflation
2149 * deflation etc. CWND is real congestion window, never inflated, changes
2150 * only according to classic VJ rules.
2151 *
2152 * Really tricky (and requiring careful tuning) part of algorithm 2144 * Really tricky (and requiring careful tuning) part of algorithm
2153 * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). 2145 * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
2154 * The first determines the moment _when_ we should reduce CWND and, 2146 * The first determines the moment _when_ we should reduce CWND and,
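
The RACK description in the comment block above boils down to a timing test: a packet that is still not S/ACKed roughly one RTT plus a reordering window after it was sent is presumed lost. The sketch below shows only that criterion; the helper name, the parameters and the microsecond values are invented for illustration and are not the kernel's tcp_rack_mark_lost() logic.

#include <stdbool.h>
#include <stdio.h>

/* Presume a packet lost once the time elapsed since its transmission
 * exceeds the smoothed RTT plus a reordering window (all microseconds).
 */
static bool rack_lost(unsigned long long xmit_us,
		      unsigned long long now_us,
		      unsigned long long rtt_us,
		      unsigned long long reo_wnd_us)
{
	return now_us - xmit_us > rtt_us + reo_wnd_us;
}

int main(void)
{
	unsigned long long rtt = 40000, reo_wnd = rtt / 4;

	/* Sent 30 ms ago with a 40 ms RTT: not lost yet. */
	printf("%d\n", rack_lost(0, 30000, rtt, reo_wnd)); /* 0 */
	/* Sent 60 ms ago: past RTT + reordering window, mark it lost. */
	printf("%d\n", rack_lost(0, 60000, rtt, reo_wnd)); /* 1 */
	return 0;
}
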
@@ -2176,8 +2168,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2176static bool tcp_time_to_recover(struct sock *sk, int flag) 2168static bool tcp_time_to_recover(struct sock *sk, int flag)
2177{ 2169{
2178 struct tcp_sock *tp = tcp_sk(sk); 2170 struct tcp_sock *tp = tcp_sk(sk);
2179 __u32 packets_out;
2180 int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
2181 2171
2182 /* Trick#1: The loss is proven. */ 2172 /* Trick#1: The loss is proven. */
2183 if (tp->lost_out) 2173 if (tp->lost_out)
@@ -2187,39 +2177,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2187 if (tcp_dupack_heuristics(tp) > tp->reordering) 2177 if (tcp_dupack_heuristics(tp) > tp->reordering)
2188 return true; 2178 return true;
2189 2179
2190 /* Trick#4: It is still not OK... But will it be useful to delay
2191 * recovery more?
2192 */
2193 packets_out = tp->packets_out;
2194 if (packets_out <= tp->reordering &&
2195 tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
2196 !tcp_may_send_now(sk)) {
2197 /* We have nothing to send. This connection is limited
2198 * either by receiver window or by application.
2199 */
2200 return true;
2201 }
2202
2203 /* If a thin stream is detected, retransmit after first
2204 * received dupack. Employ only if SACK is supported in order
2205 * to avoid possible corner-case series of spurious retransmissions
2206 * Use only if there are no unsent data.
2207 */
2208 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
2209 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
2210 tcp_is_sack(tp) && !tcp_send_head(sk))
2211 return true;
2212
2213 /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
2214 * retransmissions due to small network reorderings, we implement
2215 * Mitigation A.3 in the RFC and delay the retransmission for a short
2216 * interval if appropriate.
2217 */
2218 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
2219 (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
2220 !tcp_may_send_now(sk))
2221 return !tcp_pause_early_retransmit(sk, flag);
2222
2223 return false; 2180 return false;
2224} 2181}
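
With the extra heuristics removed, tcp_time_to_recover() above keeps only two triggers: packets already marked lost, or a duplicate-ACK/SACK count above the reordering degree. A condensed sketch of those two tests, using a toy struct rather than tcp_sock and modelling only the non-FACK branch of tcp_dupack_heuristics():

#include <stdbool.h>
#include <stdio.h>

struct toy_tp {
	unsigned int lost_out;    /* packets already marked lost */
	unsigned int sacked_out;  /* SACKed segments (dupacks for Reno) */
	unsigned int reordering;  /* current reordering estimate */
};

static bool toy_time_to_recover(const struct toy_tp *tp)
{
	if (tp->lost_out)                        /* Trick#1: loss is proven */
		return true;
	if (tp->sacked_out + 1 > tp->reordering) /* Trick#2: dupack heuristic */
		return true;
	return false;
}

int main(void)
{
	struct toy_tp tp = { .lost_out = 0, .sacked_out = 3, .reordering = 3 };

	printf("%d\n", toy_time_to_recover(&tp)); /* 1: 3 + 1 > 3 */
	tp.sacked_out = 2;
	printf("%d\n", toy_time_to_recover(&tp)); /* 0: keep waiting */
	return 0;
}
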
2225 2182
@@ -2521,8 +2478,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
2521 tcp_ecn_queue_cwr(tp); 2478 tcp_ecn_queue_cwr(tp);
2522} 2479}
2523 2480
2524static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, 2481void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
2525 int flag)
2526{ 2482{
2527 struct tcp_sock *tp = tcp_sk(sk); 2483 struct tcp_sock *tp = tcp_sk(sk);
2528 int sndcnt = 0; 2484 int sndcnt = 0;
@@ -2690,7 +2646,7 @@ void tcp_simple_retransmit(struct sock *sk)
2690} 2646}
2691EXPORT_SYMBOL(tcp_simple_retransmit); 2647EXPORT_SYMBOL(tcp_simple_retransmit);
2692 2648
2693static void tcp_enter_recovery(struct sock *sk, bool ece_ack) 2649void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2694{ 2650{
2695 struct tcp_sock *tp = tcp_sk(sk); 2651 struct tcp_sock *tp = tcp_sk(sk);
2696 int mib_idx; 2652 int mib_idx;
@@ -2726,14 +2682,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2726 tcp_try_undo_loss(sk, false)) 2682 tcp_try_undo_loss(sk, false))
2727 return; 2683 return;
2728 2684
2729 if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ 2685 /* The ACK (s)acks some never-retransmitted data meaning not all
2730 /* Step 3.b. A timeout is spurious if not all data are 2686 * the data packets before the timeout were lost. Therefore we
2731 * lost, i.e., never-retransmitted data are (s)acked. 2687 * undo the congestion window and state. This is essentially
2732 */ 2688 * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
2733         if ((flag & FLAG_ORIG_SACK_ACKED) &&      2689  * a retransmitted skb is permanently marked, we can apply such an
2734 tcp_try_undo_loss(sk, true)) 2690 * operation even if F-RTO was not used.
2735 return; 2691 */
2692 if ((flag & FLAG_ORIG_SACK_ACKED) &&
2693 tcp_try_undo_loss(sk, tp->undo_marker))
2694 return;
2736 2695
2696 if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
2737 if (after(tp->snd_nxt, tp->high_seq)) { 2697 if (after(tp->snd_nxt, tp->high_seq)) {
2738 if (flag & FLAG_DATA_SACKED || is_dupack) 2698 if (flag & FLAG_DATA_SACKED || is_dupack)
2739 tp->frto = 0; /* Step 3.a. loss was real */ 2699 tp->frto = 0; /* Step 3.a. loss was real */
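
The rewritten block above treats a timeout as spurious when the ACK (s)acks data that was never retransmitted, and undoes cwnd and state whether or not F-RTO was armed, as long as an undo marker from the loss episode is still present. A toy sketch of that decision; the flag bit and helper name are chosen for this sketch only.

#include <stdbool.h>
#include <stdio.h>

#define FLAG_ORIG_SACK_ACKED 0x200	/* bit value chosen for this sketch */

/* Undo the loss state when never-retransmitted data was (s)acked and the
 * undo marker from entering loss recovery is still set.
 */
static bool undo_spurious_timeout(int ack_flag, unsigned int undo_marker)
{
	return (ack_flag & FLAG_ORIG_SACK_ACKED) && undo_marker;
}

int main(void)
{
	printf("%d\n", undo_spurious_timeout(FLAG_ORIG_SACK_ACKED, 12345)); /* 1 */
	printf("%d\n", undo_spurious_timeout(0, 12345));                    /* 0 */
	return 0;
}
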
@@ -2800,6 +2760,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
2800 return false; 2760 return false;
2801} 2761}
2802 2762
2763static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
2764 const struct skb_mstamp *ack_time)
2765{
2766 struct tcp_sock *tp = tcp_sk(sk);
2767
2768 /* Use RACK to detect loss */
2769 if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
2770 u32 prior_retrans = tp->retrans_out;
2771
2772 tcp_rack_mark_lost(sk, ack_time);
2773 if (prior_retrans > tp->retrans_out)
2774 *ack_flag |= FLAG_LOST_RETRANS;
2775 }
2776}
2777
2803/* Process an event, which can update packets-in-flight not trivially. 2778/* Process an event, which can update packets-in-flight not trivially.
2804 * Main goal of this function is to calculate new estimate for left_out, 2779 * Main goal of this function is to calculate new estimate for left_out,
2805 * taking into account both packets sitting in receiver's buffer and 2780 * taking into account both packets sitting in receiver's buffer and
@@ -2813,7 +2788,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
2813 * tcp_xmit_retransmit_queue(). 2788 * tcp_xmit_retransmit_queue().
2814 */ 2789 */
2815static void tcp_fastretrans_alert(struct sock *sk, const int acked, 2790static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2816 bool is_dupack, int *ack_flag, int *rexmit) 2791 bool is_dupack, int *ack_flag, int *rexmit,
2792 const struct skb_mstamp *ack_time)
2817{ 2793{
2818 struct inet_connection_sock *icsk = inet_csk(sk); 2794 struct inet_connection_sock *icsk = inet_csk(sk);
2819 struct tcp_sock *tp = tcp_sk(sk); 2795 struct tcp_sock *tp = tcp_sk(sk);
@@ -2864,13 +2840,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2864 } 2840 }
2865 } 2841 }
2866 2842
2867 /* Use RACK to detect loss */
2868 if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
2869 tcp_rack_mark_lost(sk)) {
2870 flag |= FLAG_LOST_RETRANS;
2871 *ack_flag |= FLAG_LOST_RETRANS;
2872 }
2873
2874 /* E. Process state. */ 2843 /* E. Process state. */
2875 switch (icsk->icsk_ca_state) { 2844 switch (icsk->icsk_ca_state) {
2876 case TCP_CA_Recovery: 2845 case TCP_CA_Recovery:
@@ -2888,11 +2857,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2888 tcp_try_keep_open(sk); 2857 tcp_try_keep_open(sk);
2889 return; 2858 return;
2890 } 2859 }
2860 tcp_rack_identify_loss(sk, ack_flag, ack_time);
2891 break; 2861 break;
2892 case TCP_CA_Loss: 2862 case TCP_CA_Loss:
2893 tcp_process_loss(sk, flag, is_dupack, rexmit); 2863 tcp_process_loss(sk, flag, is_dupack, rexmit);
2894 if (icsk->icsk_ca_state != TCP_CA_Open && 2864 tcp_rack_identify_loss(sk, ack_flag, ack_time);
2895 !(flag & FLAG_LOST_RETRANS)) 2865 if (!(icsk->icsk_ca_state == TCP_CA_Open ||
2866 (*ack_flag & FLAG_LOST_RETRANS)))
2896 return; 2867 return;
2897 /* Change state if cwnd is undone or retransmits are lost */ 2868 /* Change state if cwnd is undone or retransmits are lost */
2898 default: 2869 default:
@@ -2906,6 +2877,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2906 if (icsk->icsk_ca_state <= TCP_CA_Disorder) 2877 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
2907 tcp_try_undo_dsack(sk); 2878 tcp_try_undo_dsack(sk);
2908 2879
2880 tcp_rack_identify_loss(sk, ack_flag, ack_time);
2909 if (!tcp_time_to_recover(sk, flag)) { 2881 if (!tcp_time_to_recover(sk, flag)) {
2910 tcp_try_to_open(sk, flag); 2882 tcp_try_to_open(sk, flag);
2911 return; 2883 return;
@@ -3024,7 +2996,7 @@ void tcp_rearm_rto(struct sock *sk)
3024 } else { 2996 } else {
3025 u32 rto = inet_csk(sk)->icsk_rto; 2997 u32 rto = inet_csk(sk)->icsk_rto;
3026 /* Offset the time elapsed after installing regular RTO */ 2998 /* Offset the time elapsed after installing regular RTO */
3027 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 2999 if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
3028 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 3000 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3029 struct sk_buff *skb = tcp_write_queue_head(sk); 3001 struct sk_buff *skb = tcp_write_queue_head(sk);
3030 const u32 rto_time_stamp = 3002 const u32 rto_time_stamp =
@@ -3041,24 +3013,6 @@ void tcp_rearm_rto(struct sock *sk)
3041 } 3013 }
3042} 3014}
3043 3015
3044/* This function is called when the delayed ER timer fires. TCP enters
3045 * fast recovery and performs fast-retransmit.
3046 */
3047void tcp_resume_early_retransmit(struct sock *sk)
3048{
3049 struct tcp_sock *tp = tcp_sk(sk);
3050
3051 tcp_rearm_rto(sk);
3052
3053 /* Stop if ER is disabled after the delayed ER timer is scheduled */
3054 if (!tp->do_early_retrans)
3055 return;
3056
3057 tcp_enter_recovery(sk, false);
3058 tcp_update_scoreboard(sk, 1);
3059 tcp_xmit_retransmit_queue(sk);
3060}
3061
3062/* If we get here, the whole TSO packet has not been acked. */ 3016/* If we get here, the whole TSO packet has not been acked. */
3063static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) 3017static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3064{ 3018{
@@ -3101,11 +3055,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
3101 */ 3055 */
3102static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, 3056static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3103 u32 prior_snd_una, int *acked, 3057 u32 prior_snd_una, int *acked,
3104 struct tcp_sacktag_state *sack, 3058 struct tcp_sacktag_state *sack)
3105 struct skb_mstamp *now)
3106{ 3059{
3107 const struct inet_connection_sock *icsk = inet_csk(sk); 3060 const struct inet_connection_sock *icsk = inet_csk(sk);
3108 struct skb_mstamp first_ackt, last_ackt; 3061 struct skb_mstamp first_ackt, last_ackt;
3062 struct skb_mstamp *now = &sack->ack_time;
3109 struct tcp_sock *tp = tcp_sk(sk); 3063 struct tcp_sock *tp = tcp_sk(sk);
3110 u32 prior_sacked = tp->sacked_out; 3064 u32 prior_sacked = tp->sacked_out;
3111 u32 reord = tp->packets_out; 3065 u32 reord = tp->packets_out;
@@ -3165,7 +3119,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3165 } else if (tcp_is_sack(tp)) { 3119 } else if (tcp_is_sack(tp)) {
3166 tp->delivered += acked_pcount; 3120 tp->delivered += acked_pcount;
3167 if (!tcp_skb_spurious_retrans(tp, skb)) 3121 if (!tcp_skb_spurious_retrans(tp, skb))
3168 tcp_rack_advance(tp, &skb->skb_mstamp, sacked); 3122 tcp_rack_advance(tp, sacked, scb->end_seq,
3123 &skb->skb_mstamp,
3124 &sack->ack_time);
3169 } 3125 }
3170 if (sacked & TCPCB_LOST) 3126 if (sacked & TCPCB_LOST)
3171 tp->lost_out -= acked_pcount; 3127 tp->lost_out -= acked_pcount;
@@ -3595,7 +3551,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3595 u32 lost = tp->lost; 3551 u32 lost = tp->lost;
3596 int acked = 0; /* Number of packets newly acked */ 3552 int acked = 0; /* Number of packets newly acked */
3597 int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ 3553 int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
3598 struct skb_mstamp now;
3599 3554
3600 sack_state.first_sackt.v64 = 0; 3555 sack_state.first_sackt.v64 = 0;
3601 sack_state.rate = &rs; 3556 sack_state.rate = &rs;
@@ -3621,10 +3576,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3621 if (after(ack, tp->snd_nxt)) 3576 if (after(ack, tp->snd_nxt))
3622 goto invalid_ack; 3577 goto invalid_ack;
3623 3578
3624 skb_mstamp_get(&now); 3579 skb_mstamp_get(&sack_state.ack_time);
3625 3580
3626 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 3581 if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3627 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3628 tcp_rearm_rto(sk); 3582 tcp_rearm_rto(sk);
3629 3583
3630 if (after(ack, prior_snd_una)) { 3584 if (after(ack, prior_snd_una)) {
@@ -3689,34 +3643,34 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3689 3643
3690 /* See if we can take anything off of the retransmit queue. */ 3644 /* See if we can take anything off of the retransmit queue. */
3691 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, 3645 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
3692 &sack_state, &now); 3646 &sack_state);
3693 3647
3694 if (tcp_ack_is_dubious(sk, flag)) { 3648 if (tcp_ack_is_dubious(sk, flag)) {
3695 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3649 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3696 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3650 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
3651 &sack_state.ack_time);
3697 } 3652 }
3698 if (tp->tlp_high_seq) 3653 if (tp->tlp_high_seq)
3699 tcp_process_tlp_ack(sk, ack, flag); 3654 tcp_process_tlp_ack(sk, ack, flag);
3700 3655
3701 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { 3656 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3702 struct dst_entry *dst = __sk_dst_get(sk); 3657 sk_dst_confirm(sk);
3703 if (dst)
3704 dst_confirm(dst);
3705 }
3706 3658
3707 if (icsk->icsk_pending == ICSK_TIME_RETRANS) 3659 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3708 tcp_schedule_loss_probe(sk); 3660 tcp_schedule_loss_probe(sk);
3709 delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ 3661 delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
3710 lost = tp->lost - lost; /* freshly marked lost */ 3662 lost = tp->lost - lost; /* freshly marked lost */
3711 tcp_rate_gen(sk, delivered, lost, &now, &rs); 3663 tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
3712 tcp_cong_control(sk, ack, delivered, flag, &rs); 3664 sack_state.rate);
3665 tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
3713 tcp_xmit_recovery(sk, rexmit); 3666 tcp_xmit_recovery(sk, rexmit);
3714 return 1; 3667 return 1;
3715 3668
3716no_queue: 3669no_queue:
3717 /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3670 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3718 if (flag & FLAG_DSACKING_ACK) 3671 if (flag & FLAG_DSACKING_ACK)
3719 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3672 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
3673 &sack_state.ack_time);
3720 /* If this ack opens up a zero window, clear backoff. It was 3674 /* If this ack opens up a zero window, clear backoff. It was
3721 * being used to time the probes, and is probably far higher than 3675 * being used to time the probes, and is probably far higher than
3722 * it needs to be for normal retransmission. 3676 * it needs to be for normal retransmission.
@@ -3737,9 +3691,11 @@ old_ack:
3737 * If data was DSACKed, see if we can undo a cwnd reduction. 3691 * If data was DSACKed, see if we can undo a cwnd reduction.
3738 */ 3692 */
3739 if (TCP_SKB_CB(skb)->sacked) { 3693 if (TCP_SKB_CB(skb)->sacked) {
3694 skb_mstamp_get(&sack_state.ack_time);
3740 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3695 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3741 &sack_state); 3696 &sack_state);
3742 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3697 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
3698 &sack_state.ack_time);
3743 tcp_xmit_recovery(sk, rexmit); 3699 tcp_xmit_recovery(sk, rexmit);
3744 } 3700 }
3745 3701
@@ -4557,6 +4513,7 @@ add_sack:
4557end: 4513end:
4558 if (skb) { 4514 if (skb) {
4559 tcp_grow_window(sk, skb); 4515 tcp_grow_window(sk, skb);
4516 skb_condense(skb);
4560 skb_set_owner_r(skb, sk); 4517 skb_set_owner_r(skb, sk);
4561 } 4518 }
4562} 4519}
@@ -5249,6 +5206,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
5249 return err; 5206 return err;
5250} 5207}
5251 5208
5209/* Accept RST for rcv_nxt - 1 after a FIN.
5210 * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
5211 * FIN is sent followed by a RST packet. The RST is sent with the same
5212 * sequence number as the FIN, and thus according to RFC 5961 a challenge
5213 * ACK should be sent. However, Mac OSX rate limits replies to challenge
5214 * ACKs on the closed socket. In addition middleboxes can drop either the
5215 * challenge ACK or a subsequent RST.
5216 */
5217static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
5218{
5219 struct tcp_sock *tp = tcp_sk(sk);
5220
5221 return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
5222 (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
5223 TCPF_CLOSING));
5224}
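
tcp_reset_check() above accepts an RST that carries the FIN's sequence number (rcv_nxt - 1) once the connection has already exchanged a FIN. Below is a simplified stand-alone model of that rule, with a toy state enum standing in for the kernel's TCPF_* masks and example sequence numbers.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum toy_state { CLOSE_WAIT, LAST_ACK, CLOSING, ESTABLISHED };

/* Accept an RST at rcv_nxt - 1 only on a socket that has seen a FIN. */
static bool toy_reset_check(uint32_t seq, uint32_t rcv_nxt, enum toy_state st)
{
	bool fin_state = (st == CLOSE_WAIT || st == LAST_ACK || st == CLOSING);

	return seq == rcv_nxt - 1 && fin_state;
}

int main(void)
{
	/* RST carrying the FIN's sequence number on a half-closed socket. */
	printf("%d\n", toy_reset_check(999, 1000, CLOSE_WAIT));  /* 1 */
	/* Same sequence number, but the connection is still established. */
	printf("%d\n", toy_reset_check(999, 1000, ESTABLISHED)); /* 0 */
	return 0;
}
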
5225
5252/* Does PAWS and seqno based validation of an incoming segment, flags will 5226/* Does PAWS and seqno based validation of an incoming segment, flags will
5253 * play significant role here. 5227 * play significant role here.
5254 */ 5228 */
@@ -5287,20 +5261,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5287 LINUX_MIB_TCPACKSKIPPEDSEQ, 5261 LINUX_MIB_TCPACKSKIPPEDSEQ,
5288 &tp->last_oow_ack_time)) 5262 &tp->last_oow_ack_time))
5289 tcp_send_dupack(sk, skb); 5263 tcp_send_dupack(sk, skb);
5264 } else if (tcp_reset_check(sk, skb)) {
5265 tcp_reset(sk);
5290 } 5266 }
5291 goto discard; 5267 goto discard;
5292 } 5268 }
5293 5269
5294 /* Step 2: check RST bit */ 5270 /* Step 2: check RST bit */
5295 if (th->rst) { 5271 if (th->rst) {
5296 /* RFC 5961 3.2 (extend to match against SACK too if available): 5272 /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
5297 * If seq num matches RCV.NXT or the right-most SACK block, 5273 * FIN and SACK too if available):
5274 * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
5275 * the right-most SACK block,
5298 * then 5276 * then
5299 * RESET the connection 5277 * RESET the connection
5300 * else 5278 * else
5301 * Send a challenge ACK 5279 * Send a challenge ACK
5302 */ 5280 */
5303 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { 5281 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
5282 tcp_reset_check(sk, skb)) {
5304 rst_seq_match = true; 5283 rst_seq_match = true;
5305 } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { 5284 } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
5306 struct tcp_sack_block *sp = &tp->selective_acks[0]; 5285 struct tcp_sack_block *sp = &tp->selective_acks[0];
@@ -5571,6 +5550,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5571 struct inet_connection_sock *icsk = inet_csk(sk); 5550 struct inet_connection_sock *icsk = inet_csk(sk);
5572 5551
5573 tcp_set_state(sk, TCP_ESTABLISHED); 5552 tcp_set_state(sk, TCP_ESTABLISHED);
5553 icsk->icsk_ack.lrcvtime = tcp_time_stamp;
5574 5554
5575 if (skb) { 5555 if (skb) {
5576 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); 5556 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
@@ -5789,7 +5769,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5789 * to stand against the temptation 8) --ANK 5769 * to stand against the temptation 8) --ANK
5790 */ 5770 */
5791 inet_csk_schedule_ack(sk); 5771 inet_csk_schedule_ack(sk);
5792 icsk->icsk_ack.lrcvtime = tcp_time_stamp;
5793 tcp_enter_quickack_mode(sk); 5772 tcp_enter_quickack_mode(sk);
5794 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5773 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5795 TCP_DELACK_MAX, TCP_RTO_MAX); 5774 TCP_DELACK_MAX, TCP_RTO_MAX);
@@ -5916,9 +5895,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
5916 if (th->syn) { 5895 if (th->syn) {
5917 if (th->fin) 5896 if (th->fin)
5918 goto discard; 5897 goto discard;
5919 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) 5898 /* It is possible that we process SYN packets from backlog,
5920 return 1; 5899 * so we need to make sure to disable BH right there.
5900 */
5901 local_bh_disable();
5902 acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
5903 local_bh_enable();
5921 5904
5905 if (!acceptable)
5906 return 1;
5922 consume_skb(skb); 5907 consume_skb(skb);
5923 return 0; 5908 return 0;
5924 } 5909 }
@@ -6022,7 +6007,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
6022 break; 6007 break;
6023 6008
6024 case TCP_FIN_WAIT1: { 6009 case TCP_FIN_WAIT1: {
6025 struct dst_entry *dst;
6026 int tmo; 6010 int tmo;
6027 6011
6028 /* If we enter the TCP_FIN_WAIT1 state and we are a 6012 /* If we enter the TCP_FIN_WAIT1 state and we are a
@@ -6049,9 +6033,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
6049 tcp_set_state(sk, TCP_FIN_WAIT2); 6033 tcp_set_state(sk, TCP_FIN_WAIT2);
6050 sk->sk_shutdown |= SEND_SHUTDOWN; 6034 sk->sk_shutdown |= SEND_SHUTDOWN;
6051 6035
6052 dst = __sk_dst_get(sk); 6036 sk_dst_confirm(sk);
6053 if (dst)
6054 dst_confirm(dst);
6055 6037
6056 if (!sock_flag(sk, SOCK_DEAD)) { 6038 if (!sock_flag(sk, SOCK_DEAD)) {
6057 /* Wake up lingering close() */ 6039 /* Wake up lingering close() */
@@ -6363,7 +6345,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6363 * timewait bucket, so that all the necessary checks 6345 * timewait bucket, so that all the necessary checks
6364 * are made in the function processing timewait state. 6346 * are made in the function processing timewait state.
6365 */ 6347 */
6366 if (tcp_death_row.sysctl_tw_recycle) { 6348 if (net->ipv4.tcp_death_row.sysctl_tw_recycle) {
6367 bool strict; 6349 bool strict;
6368 6350
6369 dst = af_ops->route_req(sk, &fl, req, &strict); 6351 dst = af_ops->route_req(sk, &fl, req, &strict);
@@ -6377,8 +6359,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6377 } 6359 }
6378 /* Kill the following clause, if you dislike this way. */ 6360 /* Kill the following clause, if you dislike this way. */
6379 else if (!net->ipv4.sysctl_tcp_syncookies && 6361 else if (!net->ipv4.sysctl_tcp_syncookies &&
6380 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 6362 (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6381 (sysctl_max_syn_backlog >> 2)) && 6363 (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
6382 !tcp_peer_is_proven(req, dst, false, 6364 !tcp_peer_is_proven(req, dst, false,
6383 tmp_opt.saw_tstamp)) { 6365 tmp_opt.saw_tstamp)) {
6384 /* Without syncookies last quarter of 6366 /* Without syncookies last quarter of
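
The clause above (now reading the per-netns sysctl) drops requests from unproven peers once fewer than a quarter of the SYN backlog slots remain and syncookies are off. The arithmetic of that pressure test, sketched with example numbers:

#include <stdbool.h>
#include <stdio.h>

/* True when fewer than a quarter of the SYN backlog slots are still free. */
static bool syn_backlog_pressure(unsigned int max_syn_backlog,
				 unsigned int queue_len)
{
	return max_syn_backlog - queue_len < (max_syn_backlog >> 2);
}

int main(void)
{
	printf("%d\n", syn_backlog_pressure(256, 100)); /* 0: 156 slots free */
	printf("%d\n", syn_backlog_pressure(256, 200)); /* 1: only 56 free */
	return 0;
}
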
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fe9da4fb96bf..575e19dcc017 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -145,7 +145,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 struct flowi4 *fl4; 145 struct flowi4 *fl4;
146 struct rtable *rt; 146 struct rtable *rt;
147 int err; 147 int err;
148 u32 seq;
148 struct ip_options_rcu *inet_opt; 149 struct ip_options_rcu *inet_opt;
150 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
149 151
150 if (addr_len < sizeof(struct sockaddr_in)) 152 if (addr_len < sizeof(struct sockaddr_in))
151 return -EINVAL; 153 return -EINVAL;
@@ -196,7 +198,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
196 tp->write_seq = 0; 198 tp->write_seq = 0;
197 } 199 }
198 200
199 if (tcp_death_row.sysctl_tw_recycle && 201 if (tcp_death_row->sysctl_tw_recycle &&
200 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) 202 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
201 tcp_fetch_timewait_stamp(sk, &rt->dst); 203 tcp_fetch_timewait_stamp(sk, &rt->dst);
202 204
@@ -215,7 +217,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
215 * complete initialization after this. 217 * complete initialization after this.
216 */ 218 */
217 tcp_set_state(sk, TCP_SYN_SENT); 219 tcp_set_state(sk, TCP_SYN_SENT);
218 err = inet_hash_connect(&tcp_death_row, sk); 220 err = inet_hash_connect(tcp_death_row, sk);
219 if (err) 221 if (err)
220 goto failure; 222 goto failure;
221 223
@@ -231,19 +233,27 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
231 /* OK, now commit destination to socket. */ 233 /* OK, now commit destination to socket. */
232 sk->sk_gso_type = SKB_GSO_TCPV4; 234 sk->sk_gso_type = SKB_GSO_TCPV4;
233 sk_setup_caps(sk, &rt->dst); 235 sk_setup_caps(sk, &rt->dst);
236 rt = NULL;
234 237
235 if (!tp->write_seq && likely(!tp->repair)) 238 if (likely(!tp->repair)) {
236 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, 239 seq = secure_tcp_sequence_number(inet->inet_saddr,
237 inet->inet_daddr, 240 inet->inet_daddr,
238 inet->inet_sport, 241 inet->inet_sport,
239 usin->sin_port, 242 usin->sin_port,
240 &tp->tsoffset); 243 &tp->tsoffset);
244 if (!tp->write_seq)
245 tp->write_seq = seq;
246 }
241 247
242 inet->inet_id = tp->write_seq ^ jiffies; 248 inet->inet_id = tp->write_seq ^ jiffies;
243 249
250 if (tcp_fastopen_defer_connect(sk, &err))
251 return err;
252 if (err)
253 goto failure;
254
244 err = tcp_connect(sk); 255 err = tcp_connect(sk);
245 256
246 rt = NULL;
247 if (err) 257 if (err)
248 goto failure; 258 goto failure;
249 259
@@ -269,10 +279,13 @@ EXPORT_SYMBOL(tcp_v4_connect);
269 */ 279 */
270void tcp_v4_mtu_reduced(struct sock *sk) 280void tcp_v4_mtu_reduced(struct sock *sk)
271{ 281{
272 struct dst_entry *dst;
273 struct inet_sock *inet = inet_sk(sk); 282 struct inet_sock *inet = inet_sk(sk);
274 u32 mtu = tcp_sk(sk)->mtu_info; 283 struct dst_entry *dst;
284 u32 mtu;
275 285
286 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
287 return;
288 mtu = tcp_sk(sk)->mtu_info;
276 dst = inet_csk_update_pmtu(sk, mtu); 289 dst = inet_csk_update_pmtu(sk, mtu);
277 if (!dst) 290 if (!dst)
278 return; 291 return;
@@ -418,7 +431,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
418 431
419 switch (type) { 432 switch (type) {
420 case ICMP_REDIRECT: 433 case ICMP_REDIRECT:
421 do_redirect(icmp_skb, sk); 434 if (!sock_owned_by_user(sk))
435 do_redirect(icmp_skb, sk);
422 goto out; 436 goto out;
423 case ICMP_SOURCE_QUENCH: 437 case ICMP_SOURCE_QUENCH:
424 /* Just silently ignore these. */ 438 /* Just silently ignore these. */
@@ -1318,10 +1332,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1318 tcp_ca_openreq_child(newsk, dst); 1332 tcp_ca_openreq_child(newsk, dst);
1319 1333
1320 tcp_sync_mss(newsk, dst_mtu(dst)); 1334 tcp_sync_mss(newsk, dst_mtu(dst));
1321 newtp->advmss = dst_metric_advmss(dst); 1335 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1322 if (tcp_sk(sk)->rx_opt.user_mss &&
1323 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1324 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1325 1336
1326 tcp_initialize_rcv_mss(newsk); 1337 tcp_initialize_rcv_mss(newsk);
1327 1338
@@ -1555,8 +1566,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1555 * It has been noticed pure SACK packets were sometimes dropped 1566 * It has been noticed pure SACK packets were sometimes dropped
1556 * (if cooked by drivers without copybreak feature). 1567 * (if cooked by drivers without copybreak feature).
1557 */ 1568 */
1558 if (!skb->data_len) 1569 skb_condense(skb);
1559 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
1560 1570
1561 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1571 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1562 bh_unlock_sock(sk); 1572 bh_unlock_sock(sk);
@@ -1816,7 +1826,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
1816 .getsockopt = ip_getsockopt, 1826 .getsockopt = ip_getsockopt,
1817 .addr2sockaddr = inet_csk_addr2sockaddr, 1827 .addr2sockaddr = inet_csk_addr2sockaddr,
1818 .sockaddr_len = sizeof(struct sockaddr_in), 1828 .sockaddr_len = sizeof(struct sockaddr_in),
1819 .bind_conflict = inet_csk_bind_conflict,
1820#ifdef CONFIG_COMPAT 1829#ifdef CONFIG_COMPAT
1821 .compat_setsockopt = compat_ip_setsockopt, 1830 .compat_setsockopt = compat_ip_setsockopt,
1822 .compat_getsockopt = compat_ip_getsockopt, 1831 .compat_getsockopt = compat_ip_getsockopt,
@@ -1887,9 +1896,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
1887 tcp_free_fastopen_req(tp); 1896 tcp_free_fastopen_req(tp);
1888 tcp_saved_syn_free(tp); 1897 tcp_saved_syn_free(tp);
1889 1898
1890 local_bh_disable();
1891 sk_sockets_allocated_dec(sk); 1899 sk_sockets_allocated_dec(sk);
1892 local_bh_enable();
1893} 1900}
1894EXPORT_SYMBOL(tcp_v4_destroy_sock); 1901EXPORT_SYMBOL(tcp_v4_destroy_sock);
1895 1902
@@ -2228,7 +2235,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2228 int state; 2235 int state;
2229 2236
2230 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2237 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2231 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 2238 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2232 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2239 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2233 timer_active = 1; 2240 timer_active = 1;
2234 timer_expires = icsk->icsk_timeout; 2241 timer_expires = icsk->icsk_timeout;
@@ -2375,6 +2382,7 @@ struct proto tcp_prot = {
2375 .shutdown = tcp_shutdown, 2382 .shutdown = tcp_shutdown,
2376 .setsockopt = tcp_setsockopt, 2383 .setsockopt = tcp_setsockopt,
2377 .getsockopt = tcp_getsockopt, 2384 .getsockopt = tcp_getsockopt,
2385 .keepalive = tcp_set_keepalive,
2378 .recvmsg = tcp_recvmsg, 2386 .recvmsg = tcp_recvmsg,
2379 .sendmsg = tcp_sendmsg, 2387 .sendmsg = tcp_sendmsg,
2380 .sendpage = tcp_sendpage, 2388 .sendpage = tcp_sendpage,
@@ -2418,7 +2426,7 @@ static void __net_exit tcp_sk_exit(struct net *net)
2418 2426
2419static int __net_init tcp_sk_init(struct net *net) 2427static int __net_init tcp_sk_init(struct net *net)
2420{ 2428{
2421 int res, cpu; 2429 int res, cpu, cnt;
2422 2430
2423 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2431 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2424 if (!net->ipv4.tcp_sk) 2432 if (!net->ipv4.tcp_sk)
@@ -2457,6 +2465,13 @@ static int __net_init tcp_sk_init(struct net *net)
2457 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2465 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2458 net->ipv4.sysctl_tcp_tw_reuse = 0; 2466 net->ipv4.sysctl_tcp_tw_reuse = 0;
2459 2467
2468 cnt = tcp_hashinfo.ehash_mask + 1;
2469 net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
2470 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2471 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2472
2473 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2474
2460 return 0; 2475 return 0;
2461fail: 2476fail:
2462 tcp_sk_exit(net); 2477 tcp_sk_exit(net);
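
The new per-netns defaults above are derived from the size of the established-connections hash table (ehash_mask + 1). A quick sketch of the arithmetic with an example table size; the real cnt depends on memory at boot, so the numbers below are only illustrative.

#include <stdio.h>

static unsigned int max_u(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned int cnt = 65536;	/* example value of ehash_mask + 1 */

	printf("max_tw_buckets  = %u\n", (cnt + 1) / 2);         /* 32768 */
	printf("max_syn_backlog = %u\n", max_u(128, cnt / 256)); /* 256 */
	return 0;
}
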
@@ -2466,7 +2481,7 @@ fail:
2466 2481
2467static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2482static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2468{ 2483{
2469 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET); 2484 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2470} 2485}
2471 2486
2472static struct pernet_operations __net_initdata tcp_sk_ops = { 2487static struct pernet_operations __net_initdata tcp_sk_ops = {
@@ -2477,7 +2492,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
2477 2492
2478void __init tcp_v4_init(void) 2493void __init tcp_v4_init(void)
2479{ 2494{
2480 inet_hashinfo_init(&tcp_hashinfo);
2481 if (register_pernet_subsys(&tcp_sk_ops)) 2495 if (register_pernet_subsys(&tcp_sk_ops))
2482 panic("Failed to create the TCP control socket.\n"); 2496 panic("Failed to create the TCP control socket.\n");
2483} 2497}
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index ba8f02d0f283..0f46e5fe31ad 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk)
375 u32 val; 375 u32 val;
376 int m; 376 int m;
377 377
378 sk_dst_confirm(sk);
378 if (sysctl_tcp_nometrics_save || !dst) 379 if (sysctl_tcp_nometrics_save || !dst)
379 return; 380 return;
380 381
381 if (dst->flags & DST_HOST)
382 dst_confirm(dst);
383
384 rcu_read_lock(); 382 rcu_read_lock();
385 if (icsk->icsk_backoff || !tp->srtt_us) { 383 if (icsk->icsk_backoff || !tp->srtt_us) {
386 /* This session failed to estimate rtt. Why? 384 /* This session failed to estimate rtt. Why?
@@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk)
493 struct tcp_metrics_block *tm; 491 struct tcp_metrics_block *tm;
494 u32 val, crtt = 0; /* cached RTT scaled by 8 */ 492 u32 val, crtt = 0; /* cached RTT scaled by 8 */
495 493
494 sk_dst_confirm(sk);
496 if (!dst) 495 if (!dst)
497 goto reset; 496 goto reset;
498 497
499 dst_confirm(dst);
500
501 rcu_read_lock(); 498 rcu_read_lock();
502 tm = tcp_get_metrics(sk, dst, true); 499 tm = tcp_get_metrics(sk, dst, true);
503 if (!tm) { 500 if (!tm) {
@@ -522,7 +519,6 @@ void tcp_init_metrics(struct sock *sk)
522 val = tcp_metric_get(tm, TCP_METRIC_REORDERING); 519 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
523 if (val && tp->reordering != val) { 520 if (val && tp->reordering != val) {
524 tcp_disable_fack(tp); 521 tcp_disable_fack(tp);
525 tcp_disable_early_retrans(tp);
526 tp->reordering = val; 522 tp->reordering = val;
527 } 523 }
528 524
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 28ce5ee831f5..65c0f3d13eca 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -29,12 +29,6 @@
29 29
30int sysctl_tcp_abort_on_overflow __read_mostly; 30int sysctl_tcp_abort_on_overflow __read_mostly;
31 31
32struct inet_timewait_death_row tcp_death_row = {
33 .sysctl_max_tw_buckets = NR_FILE * 2,
34 .hashinfo = &tcp_hashinfo,
35};
36EXPORT_SYMBOL_GPL(tcp_death_row);
37
38static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 32static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
39{ 33{
40 if (seq == s_win) 34 if (seq == s_win)
@@ -100,13 +94,15 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
100 struct tcp_options_received tmp_opt; 94 struct tcp_options_received tmp_opt;
101 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 95 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
102 bool paws_reject = false; 96 bool paws_reject = false;
97 struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row;
103 98
104 tmp_opt.saw_tstamp = 0; 99 tmp_opt.saw_tstamp = 0;
105 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 100 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
106 tcp_parse_options(skb, &tmp_opt, 0, NULL); 101 tcp_parse_options(skb, &tmp_opt, 0, NULL);
107 102
108 if (tmp_opt.saw_tstamp) { 103 if (tmp_opt.saw_tstamp) {
109 tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; 104 if (tmp_opt.rcv_tsecr)
105 tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
110 tmp_opt.ts_recent = tcptw->tw_ts_recent; 106 tmp_opt.ts_recent = tcptw->tw_ts_recent;
111 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 107 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
112 paws_reject = tcp_paws_reject(&tmp_opt, th->rst); 108 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
@@ -153,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
153 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 149 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
154 } 150 }
155 151
156 if (tcp_death_row.sysctl_tw_recycle && 152 if (tcp_death_row->sysctl_tw_recycle &&
157 tcptw->tw_ts_recent_stamp && 153 tcptw->tw_ts_recent_stamp &&
158 tcp_tw_remember_stamp(tw)) 154 tcp_tw_remember_stamp(tw))
159 inet_twsk_reschedule(tw, tw->tw_timeout); 155 inet_twsk_reschedule(tw, tw->tw_timeout);
@@ -264,11 +260,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
264 const struct tcp_sock *tp = tcp_sk(sk); 260 const struct tcp_sock *tp = tcp_sk(sk);
265 struct inet_timewait_sock *tw; 261 struct inet_timewait_sock *tw;
266 bool recycle_ok = false; 262 bool recycle_ok = false;
263 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
267 264
268 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 265 if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
269 recycle_ok = tcp_remember_stamp(sk); 266 recycle_ok = tcp_remember_stamp(sk);
270 267
271 tw = inet_twsk_alloc(sk, &tcp_death_row, state); 268 tw = inet_twsk_alloc(sk, tcp_death_row, state);
272 269
273 if (tw) { 270 if (tw) {
274 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 271 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
@@ -364,15 +361,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
364{ 361{
365 struct inet_request_sock *ireq = inet_rsk(req); 362 struct inet_request_sock *ireq = inet_rsk(req);
366 const struct tcp_sock *tp = tcp_sk(sk_listener); 363 const struct tcp_sock *tp = tcp_sk(sk_listener);
367 u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
368 int full_space = tcp_full_space(sk_listener); 364 int full_space = tcp_full_space(sk_listener);
369 int mss = dst_metric_advmss(dst);
370 u32 window_clamp; 365 u32 window_clamp;
371 __u8 rcv_wscale; 366 __u8 rcv_wscale;
367 int mss;
372 368
373 if (user_mss && user_mss < mss) 369 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
374 mss = user_mss;
375
376 window_clamp = READ_ONCE(tp->window_clamp); 370 window_clamp = READ_ONCE(tp->window_clamp);
377 /* Set this up on the first call only */ 371 /* Set this up on the first call only */
378 req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); 372 req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
@@ -466,13 +460,13 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
466 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); 460 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
467 minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); 461 minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U);
468 newicsk->icsk_rto = TCP_TIMEOUT_INIT; 462 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
463 newicsk->icsk_ack.lrcvtime = tcp_time_stamp;
469 464
470 newtp->packets_out = 0; 465 newtp->packets_out = 0;
471 newtp->retrans_out = 0; 466 newtp->retrans_out = 0;
472 newtp->sacked_out = 0; 467 newtp->sacked_out = 0;
473 newtp->fackets_out = 0; 468 newtp->fackets_out = 0;
474 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 469 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
475 tcp_enable_early_retrans(newtp);
476 newtp->tlp_high_seq = 0; 470 newtp->tlp_high_seq = 0;
477 newtp->lsndtime = treq->snt_synack.stamp_jiffies; 471 newtp->lsndtime = treq->snt_synack.stamp_jiffies;
478 newsk->sk_txhash = treq->txhash; 472 newsk->sk_txhash = treq->txhash;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1d5331a1b1dc..c3c082ed3879 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -76,16 +76,15 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
76 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; 76 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
77 77
78 tp->packets_out += tcp_skb_pcount(skb); 78 tp->packets_out += tcp_skb_pcount(skb);
79 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 79 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
80 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
81 tcp_rearm_rto(sk); 80 tcp_rearm_rto(sk);
82 }
83 81
84 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, 82 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
85 tcp_skb_pcount(skb)); 83 tcp_skb_pcount(skb));
86} 84}
87 85
 88/* SND.NXT, if window was not shrunk. 86/* SND.NXT, if window was not shrunk or the amount shrunk was less than one
87 * window scaling factor due to loss of precision.
89 * If window has been shrunk, what should we make? It is not clear at all. 88 * If window has been shrunk, what should we make? It is not clear at all.
90 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( 89 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
91 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already 90 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
@@ -95,7 +94,9 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk)
95{ 94{
96 const struct tcp_sock *tp = tcp_sk(sk); 95 const struct tcp_sock *tp = tcp_sk(sk);
97 96
98 if (!before(tcp_wnd_end(tp), tp->snd_nxt)) 97 if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
98 (tp->rx_opt.wscale_ok &&
99 ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
99 return tp->snd_nxt; 100 return tp->snd_nxt;
100 else 101 else
101 return tcp_wnd_end(tp); 102 return tcp_wnd_end(tp);
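
The relaxed test above keeps using SND.NXT when the window appears to have shrunk by less than one receive-window-scale unit, treating that much as precision lost to window scaling rather than a real shrink. A small sketch of the comparison, using plain 32-bit arithmetic (no before()/after() wraparound handling) and example values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool snd_nxt_acceptable(uint32_t snd_nxt, uint32_t wnd_end,
			       bool wscale_ok, uint8_t rcv_wscale)
{
	if (!(snd_nxt > wnd_end))	/* window was not shrunk at all */
		return true;
	/* Tolerate an overshoot smaller than one window-scale unit. */
	return wscale_ok && (snd_nxt - wnd_end) < (1u << rcv_wscale);
}

int main(void)
{
	/* 100 bytes past the window end with a 2^7 = 128 byte scale unit:
	 * within the precision slack, SND.NXT is still usable. */
	printf("%d\n", snd_nxt_acceptable(10100, 10000, true, 7)); /* 1 */
	/* 200 bytes past the end exceeds the 128-byte slack. */
	printf("%d\n", snd_nxt_acceptable(10200, 10000, true, 7)); /* 0 */
	return 0;
}
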
@@ -966,6 +967,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
966 */ 967 */
967 skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); 968 skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
968 969
970 /* If we had to use memory reserve to allocate this skb,
971 * this might cause drops if packet is looped back :
972 * Other socket might not have SOCK_MEMALLOC.
973 * Packets not looped back do not care about pfmemalloc.
974 */
975 skb->pfmemalloc = 0;
976
969 skb_push(skb, tcp_header_size); 977 skb_push(skb, tcp_header_size);
970 skb_reset_transport_header(skb); 978 skb_reset_transport_header(skb);
971 979
@@ -975,6 +983,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
975 skb_set_hash_from_sk(skb, sk); 983 skb_set_hash_from_sk(skb, sk);
976 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 984 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
977 985
986 skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
987
978 /* Build TCP header and checksum it. */ 988 /* Build TCP header and checksum it. */
979 th = (struct tcphdr *)skb->data; 989 th = (struct tcphdr *)skb->data;
980 th->source = inet->inet_sport; 990 th->source = inet->inet_sport;
@@ -2289,8 +2299,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
2289 u32 timeout, tlp_time_stamp, rto_time_stamp; 2299 u32 timeout, tlp_time_stamp, rto_time_stamp;
2290 u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); 2300 u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
2291 2301
2292 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
2293 return false;
2294 /* No consecutive loss probes. */ 2302 /* No consecutive loss probes. */
2295 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { 2303 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
2296 tcp_rearm_rto(sk); 2304 tcp_rearm_rto(sk);
@@ -2309,8 +2317,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
2309 /* Schedule a loss probe in 2*RTT for SACK capable connections 2317 /* Schedule a loss probe in 2*RTT for SACK capable connections
2310 * in Open state, that are either limited by cwnd or application. 2318 * in Open state, that are either limited by cwnd or application.
2311 */ 2319 */
2312 if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || 2320 if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
2313 !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) 2321 !tp->packets_out || !tcp_is_sack(tp) ||
2322 icsk->icsk_ca_state != TCP_CA_Open)
2314 return false; 2323 return false;
2315 2324
2316 if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && 2325 if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
@@ -2518,9 +2527,11 @@ u32 __tcp_select_window(struct sock *sk)
2518 int full_space = min_t(int, tp->window_clamp, allowed_space); 2527 int full_space = min_t(int, tp->window_clamp, allowed_space);
2519 int window; 2528 int window;
2520 2529
2521 if (mss > full_space) 2530 if (unlikely(mss > full_space)) {
2522 mss = full_space; 2531 mss = full_space;
2523 2532 if (mss <= 0)
2533 return 0;
2534 }
2524 if (free_space < (full_space >> 1)) { 2535 if (free_space < (full_space >> 1)) {
2525 icsk->icsk_ack.quick = 0; 2536 icsk->icsk_ack.quick = 0;
2526 2537
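
The clamp added above caps the MSS at the space we can actually advertise and falls back to a zero window when even that space is gone. A toy model of just that branch; the later rounding of the advertised window is omitted.

#include <stdio.h>

static int toy_select_window(int mss, int full_space)
{
	if (mss > full_space) {
		mss = full_space;
		if (mss <= 0)
			return 0;	/* nothing sane to advertise */
	}
	return mss;			/* stand-in for the later window math */
}

int main(void)
{
	printf("%d\n", toy_select_window(1460, 65535)); /* 1460 */
	printf("%d\n", toy_select_window(1460, 512));   /* 512 */
	printf("%d\n", toy_select_window(1460, 0));     /* 0 */
	return 0;
}
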
@@ -2774,6 +2785,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2774 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) 2785 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
2775 tcp_ecn_clear_syn(sk, skb); 2786 tcp_ecn_clear_syn(sk, skb);
2776 2787
2788 /* Update global and local TCP statistics. */
2789 segs = tcp_skb_pcount(skb);
2790 TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
2791 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2792 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
2793 tp->total_retrans += segs;
2794
2777 /* make sure skb->data is aligned on arches that require it 2795 /* make sure skb->data is aligned on arches that require it
2778 * and check if ack-trimming & collapsing extended the headroom 2796 * and check if ack-trimming & collapsing extended the headroom
2779 * beyond what csum_start can cover. 2797 * beyond what csum_start can cover.
@@ -2791,14 +2809,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2791 } 2809 }
2792 2810
2793 if (likely(!err)) { 2811 if (likely(!err)) {
2794 segs = tcp_skb_pcount(skb);
2795
2796 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; 2812 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
2797 /* Update global TCP statistics. */ 2813 } else if (err != -EBUSY) {
2798 TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); 2814 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2799 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2800 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
2801 tp->total_retrans += segs;
2802 } 2815 }
2803 return err; 2816 return err;
2804} 2817}
@@ -2821,8 +2834,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2821 if (!tp->retrans_stamp) 2834 if (!tp->retrans_stamp)
2822 tp->retrans_stamp = tcp_skb_timestamp(skb); 2835 tp->retrans_stamp = tcp_skb_timestamp(skb);
2823 2836
2824 } else if (err != -EBUSY) {
2825 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2826 } 2837 }
2827 2838
2828 if (tp->undo_retrans < 0) 2839 if (tp->undo_retrans < 0)
@@ -2831,36 +2842,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2831 return err; 2842 return err;
2832} 2843}
2833 2844
2834/* Check if we forward retransmits are possible in the current
2835 * window/congestion state.
2836 */
2837static bool tcp_can_forward_retransmit(struct sock *sk)
2838{
2839 const struct inet_connection_sock *icsk = inet_csk(sk);
2840 const struct tcp_sock *tp = tcp_sk(sk);
2841
2842 /* Forward retransmissions are possible only during Recovery. */
2843 if (icsk->icsk_ca_state != TCP_CA_Recovery)
2844 return false;
2845
2846 /* No forward retransmissions in Reno are possible. */
2847 if (tcp_is_reno(tp))
2848 return false;
2849
2850 /* Yeah, we have to make difficult choice between forward transmission
2851 * and retransmission... Both ways have their merits...
2852 *
2853 * For now we do not retransmit anything, while we have some new
2854 * segments to send. In the other cases, follow rule 3 for
2855 * NextSeg() specified in RFC3517.
2856 */
2857
2858 if (tcp_may_send_now(sk))
2859 return false;
2860
2861 return true;
2862}
2863
2864/* This gets called after a retransmit timeout, and the initially 2845/* This gets called after a retransmit timeout, and the initially
2865 * retransmitted data is acknowledged. It tries to continue 2846 * retransmitted data is acknowledged. It tries to continue
2866 * resending the rest of the retransmit queue, until either 2847 * resending the rest of the retransmit queue, until either
@@ -2875,24 +2856,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2875 struct tcp_sock *tp = tcp_sk(sk); 2856 struct tcp_sock *tp = tcp_sk(sk);
2876 struct sk_buff *skb; 2857 struct sk_buff *skb;
2877 struct sk_buff *hole = NULL; 2858 struct sk_buff *hole = NULL;
2878 u32 max_segs, last_lost; 2859 u32 max_segs;
2879 int mib_idx; 2860 int mib_idx;
2880 int fwd_rexmitting = 0;
2881 2861
2882 if (!tp->packets_out) 2862 if (!tp->packets_out)
2883 return; 2863 return;
2884 2864
2885 if (!tp->lost_out)
2886 tp->retransmit_high = tp->snd_una;
2887
2888 if (tp->retransmit_skb_hint) { 2865 if (tp->retransmit_skb_hint) {
2889 skb = tp->retransmit_skb_hint; 2866 skb = tp->retransmit_skb_hint;
2890 last_lost = TCP_SKB_CB(skb)->end_seq;
2891 if (after(last_lost, tp->retransmit_high))
2892 last_lost = tp->retransmit_high;
2893 } else { 2867 } else {
2894 skb = tcp_write_queue_head(sk); 2868 skb = tcp_write_queue_head(sk);
2895 last_lost = tp->snd_una;
2896 } 2869 }
2897 2870
2898 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); 2871 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2915,31 +2888,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2915 */ 2888 */
2916 segs = min_t(int, segs, max_segs); 2889 segs = min_t(int, segs, max_segs);
2917 2890
2918 if (fwd_rexmitting) { 2891 if (tp->retrans_out >= tp->lost_out) {
2919begin_fwd: 2892 break;
2920 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
2921 break;
2922 mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
2923
2924 } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
2925 tp->retransmit_high = last_lost;
2926 if (!tcp_can_forward_retransmit(sk))
2927 break;
2928 /* Backtrack if necessary to non-L'ed skb */
2929 if (hole) {
2930 skb = hole;
2931 hole = NULL;
2932 }
2933 fwd_rexmitting = 1;
2934 goto begin_fwd;
2935
2936 } else if (!(sacked & TCPCB_LOST)) { 2893 } else if (!(sacked & TCPCB_LOST)) {
2937 if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) 2894 if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
2938 hole = skb; 2895 hole = skb;
2939 continue; 2896 continue;
2940 2897
2941 } else { 2898 } else {
2942 last_lost = TCP_SKB_CB(skb)->end_seq;
2943 if (icsk->icsk_ca_state != TCP_CA_Loss) 2899 if (icsk->icsk_ca_state != TCP_CA_Loss)
2944 mib_idx = LINUX_MIB_TCPFASTRETRANS; 2900 mib_idx = LINUX_MIB_TCPFASTRETRANS;
2945 else 2901 else
@@ -2960,7 +2916,8 @@ begin_fwd:
2960 if (tcp_in_cwnd_reduction(sk)) 2916 if (tcp_in_cwnd_reduction(sk))
2961 tp->prr_out += tcp_skb_pcount(skb); 2917 tp->prr_out += tcp_skb_pcount(skb);
2962 2918
2963 if (skb == tcp_write_queue_head(sk)) 2919 if (skb == tcp_write_queue_head(sk) &&
2920 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
2964 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2921 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2965 inet_csk(sk)->icsk_rto, 2922 inet_csk(sk)->icsk_rto,
2966 TCP_RTO_MAX); 2923 TCP_RTO_MAX);
@@ -3042,6 +2999,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
3042{ 2999{
3043 struct sk_buff *skb; 3000 struct sk_buff *skb;
3044 3001
3002 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
3003
3045 /* NOTE: No TCP options attached and we never retransmit this. */ 3004 /* NOTE: No TCP options attached and we never retransmit this. */
3046 skb = alloc_skb(MAX_TCP_HEADER, priority); 3005 skb = alloc_skb(MAX_TCP_HEADER, priority);
3047 if (!skb) { 3006 if (!skb) {
@@ -3057,8 +3016,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
3057 /* Send it off. */ 3016 /* Send it off. */
3058 if (tcp_transmit_skb(sk, skb, 0, priority)) 3017 if (tcp_transmit_skb(sk, skb, 0, priority))
3059 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); 3018 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3060
3061 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
3062} 3019}
3063 3020
3064/* Send a crossed SYN-ACK during socket establishment. 3021/* Send a crossed SYN-ACK during socket establishment.
@@ -3117,7 +3074,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3117 struct sk_buff *skb; 3074 struct sk_buff *skb;
3118 int tcp_header_size; 3075 int tcp_header_size;
3119 struct tcphdr *th; 3076 struct tcphdr *th;
3120 u16 user_mss;
3121 int mss; 3077 int mss;
3122 3078
3123 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); 3079 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
@@ -3147,10 +3103,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3147 } 3103 }
3148 skb_dst_set(skb, dst); 3104 skb_dst_set(skb, dst);
3149 3105
3150 mss = dst_metric_advmss(dst); 3106 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3151 user_mss = READ_ONCE(tp->rx_opt.user_mss);
3152 if (user_mss && user_mss < mss)
3153 mss = user_mss;
3154 3107
3155 memset(&opts, 0, sizeof(opts)); 3108 memset(&opts, 0, sizeof(opts));
3156#ifdef CONFIG_SYN_COOKIES 3109#ifdef CONFIG_SYN_COOKIES
@@ -3256,9 +3209,7 @@ static void tcp_connect_init(struct sock *sk)
3256 3209
3257 if (!tp->window_clamp) 3210 if (!tp->window_clamp)
3258 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 3211 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
3259 tp->advmss = dst_metric_advmss(dst); 3212 tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3260 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
3261 tp->advmss = tp->rx_opt.user_mss;
3262 3213
3263 tcp_initialize_rcv_mss(sk); 3214 tcp_initialize_rcv_mss(sk);
3264 3215
@@ -3324,31 +3275,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3324{ 3275{
3325 struct tcp_sock *tp = tcp_sk(sk); 3276 struct tcp_sock *tp = tcp_sk(sk);
3326 struct tcp_fastopen_request *fo = tp->fastopen_req; 3277 struct tcp_fastopen_request *fo = tp->fastopen_req;
3327 int syn_loss = 0, space, err = 0; 3278 int space, err = 0;
3328 unsigned long last_syn_loss = 0;
3329 struct sk_buff *syn_data; 3279 struct sk_buff *syn_data;
3330 3280
3331 tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ 3281 tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
3332 tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, 3282 if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
3333 &syn_loss, &last_syn_loss);
3334 /* Recurring FO SYN losses: revert to regular handshake temporarily */
3335 if (syn_loss > 1 &&
3336 time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
3337 fo->cookie.len = -1;
3338 goto fallback;
3339 }
3340
3341 if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
3342 fo->cookie.len = -1;
3343 else if (fo->cookie.len <= 0)
3344 goto fallback; 3283 goto fallback;
3345 3284
3346 /* MSS for SYN-data is based on cached MSS and bounded by PMTU and 3285 /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
3347 * user-MSS. Reserve maximum option space for middleboxes that add 3286 * user-MSS. Reserve maximum option space for middleboxes that add
3348 * private TCP options. The cost is reduced data space in SYN :( 3287 * private TCP options. The cost is reduced data space in SYN :(
3349 */ 3288 */
3350 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) 3289 tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
3351 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; 3290
3352 space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - 3291 space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
3353 MAX_TCP_OPTION_SPACE; 3292 MAX_TCP_OPTION_SPACE;
3354 3293
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index e36df4fcfeba..d8acbd9f477a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,9 +1,33 @@
1#include <linux/tcp.h> 1#include <linux/tcp.h>
2#include <net/tcp.h> 2#include <net/tcp.h>
3 3
4int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; 4int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
5 5
6/* Marks a packet lost, if some packet sent later has been (s)acked. 6static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
7{
8 struct tcp_sock *tp = tcp_sk(sk);
9
10 tcp_skb_mark_lost_uncond_verify(tp, skb);
11 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
12 /* Account for retransmits that are lost again */
13 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
14 tp->retrans_out -= tcp_skb_pcount(skb);
15 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
16 tcp_skb_pcount(skb));
17 }
18}
19
20static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
21 const struct skb_mstamp *t2,
22 u32 seq1, u32 seq2)
23{
24 return skb_mstamp_after(t1, t2) ||
25 (t1->v64 == t2->v64 && after(seq1, seq2));
26}
27
28/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
29 *
30 * Marks a packet lost, if some packet sent later has been (s)acked.
7 * The underlying idea is similar to the traditional dupthresh and FACK 31 * The underlying idea is similar to the traditional dupthresh and FACK
8 * but they look at different metrics: 32 * but they look at different metrics:
9 * 33 *
@@ -16,31 +40,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
16 * is being more resilient to reordering by simply allowing some 40 * is being more resilient to reordering by simply allowing some
17 * "settling delay", instead of tweaking the dupthresh. 41 * "settling delay", instead of tweaking the dupthresh.
18 * 42 *
19 * The current version is only used after recovery starts but can be 43 * When tcp_rack_detect_loss() detects some packets are lost and we
20 * easily extended to detect the first loss. 44 * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
45 * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
46 * make us enter the CA_Recovery state.
21 */ 47 */
22int tcp_rack_mark_lost(struct sock *sk) 48static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
49 u32 *reo_timeout)
23{ 50{
24 struct tcp_sock *tp = tcp_sk(sk); 51 struct tcp_sock *tp = tcp_sk(sk);
25 struct sk_buff *skb; 52 struct sk_buff *skb;
26 u32 reo_wnd, prior_retrans = tp->retrans_out; 53 u32 reo_wnd;
27
28 if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
29 return 0;
30
31 /* Reset the advanced flag to avoid unnecessary queue scanning */
32 tp->rack.advanced = 0;
33 54
55 *reo_timeout = 0;
34 /* To be more reordering resilient, allow min_rtt/4 settling delay 56 /* To be more reordering resilient, allow min_rtt/4 settling delay
35 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed 57 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
36 * RTT because reordering is often a path property and less related 58 * RTT because reordering is often a path property and less related
37 * to queuing or delayed ACKs. 59 * to queuing or delayed ACKs.
38 *
39 * TODO: measure and adapt to the observed reordering delay, and
40 * use a timer to retransmit like the delayed early retransmit.
41 */ 60 */
42 reo_wnd = 1000; 61 reo_wnd = 1000;
43 if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) 62 if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
44 reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); 63 reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
45 64
46 tcp_for_write_queue(skb, sk) { 65 tcp_for_write_queue(skb, sk) {
@@ -54,20 +73,29 @@ int tcp_rack_mark_lost(struct sock *sk)
54 scb->sacked & TCPCB_SACKED_ACKED) 73 scb->sacked & TCPCB_SACKED_ACKED)
55 continue; 74 continue;
56 75
57 if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { 76 if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp,
77 tp->rack.end_seq, scb->end_seq)) {
78 /* Step 3 in draft-cheng-tcpm-rack-00.txt:
79 * A packet is lost if its elapsed time is beyond
80 * the recent RTT plus the reordering window.
81 */
82 u32 elapsed = skb_mstamp_us_delta(now,
83 &skb->skb_mstamp);
84 s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
58 85
59 if (skb_mstamp_us_delta(&tp->rack.mstamp, 86 if (remaining < 0) {
60 &skb->skb_mstamp) <= reo_wnd) 87 tcp_rack_mark_skb_lost(sk, skb);
61 continue; 88 continue;
62
63 /* skb is lost if packet sent later is sacked */
64 tcp_skb_mark_lost_uncond_verify(tp, skb);
65 if (scb->sacked & TCPCB_SACKED_RETRANS) {
66 scb->sacked &= ~TCPCB_SACKED_RETRANS;
67 tp->retrans_out -= tcp_skb_pcount(skb);
68 NET_INC_STATS(sock_net(sk),
69 LINUX_MIB_TCPLOSTRETRANSMIT);
70 } 89 }
90
91 /* Skip ones marked lost but not yet retransmitted */
92 if ((scb->sacked & TCPCB_LOST) &&
93 !(scb->sacked & TCPCB_SACKED_RETRANS))
94 continue;
95
96 /* Record maximum wait time (+1 to avoid 0) */
97 *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
98
71 } else if (!(scb->sacked & TCPCB_RETRANS)) { 99 } else if (!(scb->sacked & TCPCB_RETRANS)) {
72 /* Original data are sent sequentially so stop early 100 /* Original data are sent sequentially so stop early
73 * b/c the rest are all sent after rack_sent 101 * b/c the rest are all sent after rack_sent
@@ -75,20 +103,43 @@ int tcp_rack_mark_lost(struct sock *sk)
75 break; 103 break;
76 } 104 }
77 } 105 }
78 return prior_retrans - tp->retrans_out;
79} 106}
80 107
81/* Record the most recently (re)sent time among the (s)acked packets */ 108void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
82void tcp_rack_advance(struct tcp_sock *tp, 109{
83 const struct skb_mstamp *xmit_time, u8 sacked) 110 struct tcp_sock *tp = tcp_sk(sk);
111 u32 timeout;
112
113 if (!tp->rack.advanced)
114 return;
115
116 /* Reset the advanced flag to avoid unnecessary queue scanning */
117 tp->rack.advanced = 0;
118 tcp_rack_detect_loss(sk, now, &timeout);
119 if (timeout) {
120 timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
121 inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
122 timeout, inet_csk(sk)->icsk_rto);
123 }
124}
125
126/* Record the most recently (re)sent time among the (s)acked packets
127 * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
128 * draft-cheng-tcpm-rack-00.txt
129 */
130void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
131 const struct skb_mstamp *xmit_time,
132 const struct skb_mstamp *ack_time)
84{ 133{
134 u32 rtt_us;
135
85 if (tp->rack.mstamp.v64 && 136 if (tp->rack.mstamp.v64 &&
86 !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) 137 !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp,
138 end_seq, tp->rack.end_seq))
87 return; 139 return;
88 140
141 rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
89 if (sacked & TCPCB_RETRANS) { 142 if (sacked & TCPCB_RETRANS) {
90 struct skb_mstamp now;
91
92 /* If the sacked packet was retransmitted, it's ambiguous 143 /* If the sacked packet was retransmitted, it's ambiguous
93 * whether the retransmission or the original (or the prior 144 * whether the retransmission or the original (or the prior
94 * retransmission) was sacked. 145 * retransmission) was sacked.
@@ -99,11 +150,35 @@ void tcp_rack_advance(struct tcp_sock *tp,
99 * so it's at least one RTT (i.e., retransmission is at least 150 * so it's at least one RTT (i.e., retransmission is at least
100 * an RTT later). 151 * an RTT later).
101 */ 152 */
102 skb_mstamp_get(&now); 153 if (rtt_us < tcp_min_rtt(tp))
103 if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
104 return; 154 return;
105 } 155 }
106 156 tp->rack.rtt_us = rtt_us;
107 tp->rack.mstamp = *xmit_time; 157 tp->rack.mstamp = *xmit_time;
158 tp->rack.end_seq = end_seq;
108 tp->rack.advanced = 1; 159 tp->rack.advanced = 1;
109} 160}
161
162/* We have waited long enough to accommodate reordering. Mark the expired
163 * packets lost and retransmit them.
164 */
165void tcp_rack_reo_timeout(struct sock *sk)
166{
167 struct tcp_sock *tp = tcp_sk(sk);
168 struct skb_mstamp now;
169 u32 timeout, prior_inflight;
170
171 skb_mstamp_get(&now);
172 prior_inflight = tcp_packets_in_flight(tp);
173 tcp_rack_detect_loss(sk, &now, &timeout);
174 if (prior_inflight != tcp_packets_in_flight(tp)) {
175 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
176 tcp_enter_recovery(sk, false);
177 if (!inet_csk(sk)->icsk_ca_ops->cong_control)
178 tcp_cwnd_reduction(sk, 1, 0);
179 }
180 tcp_xmit_retransmit_queue(sk);
181 }
182 if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
183 tcp_rearm_rto(sk);
184}
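
For reference while reading the tcp_recovery.c hunks above, here is a minimal stand-alone sketch (not part of the patch) of the per-packet decision tcp_rack_detect_loss() now makes: a packet is only a candidate if something sent after it has already been (s)acked, it is declared lost once its elapsed time exceeds the RACK RTT plus the reordering window, and otherwise the remaining wait feeds the new reordering timeout. Timestamps here are plain microsecond counters instead of struct skb_mstamp, and every name outside the diff is invented for illustration.

/* Hypothetical stand-alone model of the per-packet test in
 * tcp_rack_detect_loss().  Times are microseconds; "rack" holds the send
 * time, end sequence and RTT recorded by tcp_rack_advance() for the most
 * recently (s)acked packet.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct rack_state {
	uint64_t xmit_time;	/* RACK.mstamp  */
	uint32_t end_seq;	/* RACK.end_seq */
	uint32_t rtt_us;	/* RACK.rtt_us  */
};

/* "Sent after" test: later timestamp wins; equal timestamps fall back to
 * the higher end sequence, as tcp_rack_sent_after() does in the patch.
 */
static bool rack_sent_after(uint64_t t1, uint64_t t2, uint32_t s1, uint32_t s2)
{
	return t1 > t2 || (t1 == t2 && (int32_t)(s1 - s2) > 0);
}

/* Returns true if the packet should be marked lost now.  Otherwise
 * *timeout_us holds the remaining wait before it would expire
 * (0 means it is not yet eligible at all).
 */
static bool rack_packet_lost(const struct rack_state *rack,
			     uint64_t skb_xmit_time, uint32_t skb_end_seq,
			     uint64_t now_us, uint32_t reo_wnd_us,
			     uint32_t *timeout_us)
{
	*timeout_us = 0;

	/* Only packets sent before the most recently (s)acked one qualify */
	if (!rack_sent_after(rack->xmit_time, skb_xmit_time,
			     rack->end_seq, skb_end_seq))
		return false;

	uint64_t elapsed = now_us - skb_xmit_time;
	int64_t remaining = (int64_t)rack->rtt_us + reo_wnd_us - (int64_t)elapsed;

	if (remaining < 0)
		return true;		/* beyond RTT + reordering window */

	*timeout_us = (uint32_t)remaining + 1;	/* +1 to avoid 0, as in the patch */
	return false;
}

int main(void)
{
	/* A later packet (seq up to 3000, sent at t=101ms) was (s)acked with a
	 * 40ms RTT; the probe below was sent at t=100ms and it is now t=151ms.
	 */
	struct rack_state rack = { .xmit_time = 101000, .end_seq = 3000,
				   .rtt_us = 40000 };
	uint32_t timeout;

	/* 51ms elapsed > 40ms RTT + 10ms reo_wnd -> lost (prints 1) */
	printf("lost=%d timeout=%u\n",
	       rack_packet_lost(&rack, 100000, 2000, 151000, 10000, &timeout),
	       timeout);
	return 0;
}

The real function additionally skips packets already marked lost but not yet retransmitted, and arms the new ICSK_TIME_REO_TIMEOUT timer with the largest remaining wait it found.
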
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3705075f42c3..b2ab411c6d37 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -249,7 +249,8 @@ void tcp_delack_timer_handler(struct sock *sk)
249 249
250 sk_mem_reclaim_partial(sk); 250 sk_mem_reclaim_partial(sk);
251 251
252 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) 252 if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
253 !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
253 goto out; 254 goto out;
254 255
255 if (time_after(icsk->icsk_ack.timeout, jiffies)) { 256 if (time_after(icsk->icsk_ack.timeout, jiffies)) {
@@ -552,7 +553,8 @@ void tcp_write_timer_handler(struct sock *sk)
552 struct inet_connection_sock *icsk = inet_csk(sk); 553 struct inet_connection_sock *icsk = inet_csk(sk);
553 int event; 554 int event;
554 555
555 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) 556 if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
557 !icsk->icsk_pending)
556 goto out; 558 goto out;
557 559
558 if (time_after(icsk->icsk_timeout, jiffies)) { 560 if (time_after(icsk->icsk_timeout, jiffies)) {
@@ -563,8 +565,8 @@ void tcp_write_timer_handler(struct sock *sk)
563 event = icsk->icsk_pending; 565 event = icsk->icsk_pending;
564 566
565 switch (event) { 567 switch (event) {
566 case ICSK_TIME_EARLY_RETRANS: 568 case ICSK_TIME_REO_TIMEOUT:
567 tcp_resume_early_retransmit(sk); 569 tcp_rack_reo_timeout(sk);
568 break; 570 break;
569 case ICSK_TIME_LOSS_PROBE: 571 case ICSK_TIME_LOSS_PROBE:
570 tcp_send_loss_probe(sk); 572 tcp_send_loss_probe(sk);
@@ -617,6 +619,7 @@ void tcp_set_keepalive(struct sock *sk, int val)
617 else if (!val) 619 else if (!val)
618 inet_csk_delete_keepalive_timer(sk); 620 inet_csk_delete_keepalive_timer(sk);
619} 621}
622EXPORT_SYMBOL_GPL(tcp_set_keepalive);
620 623
621 624
622static void tcp_keepalive_timer (unsigned long data) 625static void tcp_keepalive_timer (unsigned long data)
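
A side note on the state checks the tcp_timer.c hunks switch to: because each TCPF_* mask is (1 << TCP_*), several socket states can be tested with a single AND instead of a chain of comparisons. A tiny stand-alone illustration follows; the state values mirror the kernel's enum but are hardcoded so the example builds on its own.

#include <stdio.h>

enum { TCP_ESTABLISHED = 1, TCP_CLOSE = 7, TCP_LISTEN = 10 };
#define TCPF_CLOSE	(1 << TCP_CLOSE)
#define TCPF_LISTEN	(1 << TCP_LISTEN)

/* Same shape as the new check in tcp_delack_timer_handler() and
 * tcp_write_timer_handler(): bail out for CLOSE and LISTEN sockets.
 */
static int timer_should_bail(int sk_state)
{
	return ((1 << sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) != 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       timer_should_bail(TCP_CLOSE),		/* 1 */
	       timer_should_bail(TCP_LISTEN),		/* 1 */
	       timer_should_bail(TCP_ESTABLISHED));	/* 0 */
	return 0;
}
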
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1307a7c2e544..ea6e4cff9faf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -134,14 +134,21 @@ EXPORT_SYMBOL(udp_memory_allocated);
134#define MAX_UDP_PORTS 65536 134#define MAX_UDP_PORTS 65536
135#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) 135#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
136 136
137/* IPCB reference means this can not be used from early demux */
138static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
139{
140#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
141 if (!net->ipv4.sysctl_udp_l3mdev_accept &&
142 skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
143 return true;
144#endif
145 return false;
146}
147
137static int udp_lib_lport_inuse(struct net *net, __u16 num, 148static int udp_lib_lport_inuse(struct net *net, __u16 num,
138 const struct udp_hslot *hslot, 149 const struct udp_hslot *hslot,
139 unsigned long *bitmap, 150 unsigned long *bitmap,
140 struct sock *sk, 151 struct sock *sk, unsigned int log)
141 int (*saddr_comp)(const struct sock *sk1,
142 const struct sock *sk2,
143 bool match_wildcard),
144 unsigned int log)
145{ 152{
146 struct sock *sk2; 153 struct sock *sk2;
147 kuid_t uid = sock_i_uid(sk); 154 kuid_t uid = sock_i_uid(sk);
@@ -153,13 +160,18 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
153 (!sk2->sk_reuse || !sk->sk_reuse) && 160 (!sk2->sk_reuse || !sk->sk_reuse) &&
154 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || 161 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
155 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 162 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
156 (!sk2->sk_reuseport || !sk->sk_reuseport || 163 inet_rcv_saddr_equal(sk, sk2, true)) {
157 rcu_access_pointer(sk->sk_reuseport_cb) || 164 if (sk2->sk_reuseport && sk->sk_reuseport &&
158 !uid_eq(uid, sock_i_uid(sk2))) && 165 !rcu_access_pointer(sk->sk_reuseport_cb) &&
159 saddr_comp(sk, sk2, true)) { 166 uid_eq(uid, sock_i_uid(sk2))) {
160 if (!bitmap) 167 if (!bitmap)
161 return 1; 168 return 0;
162 __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); 169 } else {
170 if (!bitmap)
171 return 1;
172 __set_bit(udp_sk(sk2)->udp_port_hash >> log,
173 bitmap);
174 }
163 } 175 }
164 } 176 }
165 return 0; 177 return 0;
@@ -171,10 +183,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
171 */ 183 */
172static int udp_lib_lport_inuse2(struct net *net, __u16 num, 184static int udp_lib_lport_inuse2(struct net *net, __u16 num,
173 struct udp_hslot *hslot2, 185 struct udp_hslot *hslot2,
174 struct sock *sk, 186 struct sock *sk)
175 int (*saddr_comp)(const struct sock *sk1,
176 const struct sock *sk2,
177 bool match_wildcard))
178{ 187{
179 struct sock *sk2; 188 struct sock *sk2;
180 kuid_t uid = sock_i_uid(sk); 189 kuid_t uid = sock_i_uid(sk);
@@ -188,11 +197,14 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
188 (!sk2->sk_reuse || !sk->sk_reuse) && 197 (!sk2->sk_reuse || !sk->sk_reuse) &&
189 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || 198 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
190 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 199 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
191 (!sk2->sk_reuseport || !sk->sk_reuseport || 200 inet_rcv_saddr_equal(sk, sk2, true)) {
192 rcu_access_pointer(sk->sk_reuseport_cb) || 201 if (sk2->sk_reuseport && sk->sk_reuseport &&
193 !uid_eq(uid, sock_i_uid(sk2))) && 202 !rcu_access_pointer(sk->sk_reuseport_cb) &&
194 saddr_comp(sk, sk2, true)) { 203 uid_eq(uid, sock_i_uid(sk2))) {
195 res = 1; 204 res = 0;
205 } else {
206 res = 1;
207 }
196 break; 208 break;
197 } 209 }
198 } 210 }
@@ -200,10 +212,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
200 return res; 212 return res;
201} 213}
202 214
203static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, 215static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
204 int (*saddr_same)(const struct sock *sk1,
205 const struct sock *sk2,
206 bool match_wildcard))
207{ 216{
208 struct net *net = sock_net(sk); 217 struct net *net = sock_net(sk);
209 kuid_t uid = sock_i_uid(sk); 218 kuid_t uid = sock_i_uid(sk);
@@ -217,7 +226,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
217 (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && 226 (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
218 (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 227 (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
219 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 228 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
220 (*saddr_same)(sk, sk2, false)) { 229 inet_rcv_saddr_equal(sk, sk2, false)) {
221 return reuseport_add_sock(sk, sk2); 230 return reuseport_add_sock(sk, sk2);
222 } 231 }
223 } 232 }
@@ -233,14 +242,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
233 * 242 *
234 * @sk: socket struct in question 243 * @sk: socket struct in question
235 * @snum: port number to look up 244 * @snum: port number to look up
236 * @saddr_comp: AF-dependent comparison of bound local IP addresses
237 * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, 245 * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
238 * with NULL address 246 * with NULL address
239 */ 247 */
240int udp_lib_get_port(struct sock *sk, unsigned short snum, 248int udp_lib_get_port(struct sock *sk, unsigned short snum,
241 int (*saddr_comp)(const struct sock *sk1,
242 const struct sock *sk2,
243 bool match_wildcard),
244 unsigned int hash2_nulladdr) 249 unsigned int hash2_nulladdr)
245{ 250{
246 struct udp_hslot *hslot, *hslot2; 251 struct udp_hslot *hslot, *hslot2;
@@ -269,7 +274,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
269 bitmap_zero(bitmap, PORTS_PER_CHAIN); 274 bitmap_zero(bitmap, PORTS_PER_CHAIN);
270 spin_lock_bh(&hslot->lock); 275 spin_lock_bh(&hslot->lock);
271 udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, 276 udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
272 saddr_comp, udptable->log); 277 udptable->log);
273 278
274 snum = first; 279 snum = first;
275 /* 280 /*
@@ -285,6 +290,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
285 snum += rand; 290 snum += rand;
286 } while (snum != first); 291 } while (snum != first);
287 spin_unlock_bh(&hslot->lock); 292 spin_unlock_bh(&hslot->lock);
293 cond_resched();
288 } while (++first != last); 294 } while (++first != last);
289 goto fail; 295 goto fail;
290 } else { 296 } else {
@@ -301,12 +307,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
301 if (hslot->count < hslot2->count) 307 if (hslot->count < hslot2->count)
302 goto scan_primary_hash; 308 goto scan_primary_hash;
303 309
304 exist = udp_lib_lport_inuse2(net, snum, hslot2, 310 exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
305 sk, saddr_comp);
306 if (!exist && (hash2_nulladdr != slot2)) { 311 if (!exist && (hash2_nulladdr != slot2)) {
307 hslot2 = udp_hashslot2(udptable, hash2_nulladdr); 312 hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
308 exist = udp_lib_lport_inuse2(net, snum, hslot2, 313 exist = udp_lib_lport_inuse2(net, snum, hslot2,
309 sk, saddr_comp); 314 sk);
310 } 315 }
311 if (exist) 316 if (exist)
312 goto fail_unlock; 317 goto fail_unlock;
@@ -314,8 +319,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
314 goto found; 319 goto found;
315 } 320 }
316scan_primary_hash: 321scan_primary_hash:
317 if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 322 if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
318 saddr_comp, 0))
319 goto fail_unlock; 323 goto fail_unlock;
320 } 324 }
321found: 325found:
@@ -324,7 +328,7 @@ found:
324 udp_sk(sk)->udp_portaddr_hash ^= snum; 328 udp_sk(sk)->udp_portaddr_hash ^= snum;
325 if (sk_unhashed(sk)) { 329 if (sk_unhashed(sk)) {
326 if (sk->sk_reuseport && 330 if (sk->sk_reuseport &&
327 udp_reuseport_add_sock(sk, hslot, saddr_comp)) { 331 udp_reuseport_add_sock(sk, hslot)) {
328 inet_sk(sk)->inet_num = 0; 332 inet_sk(sk)->inet_num = 0;
329 udp_sk(sk)->udp_port_hash = 0; 333 udp_sk(sk)->udp_port_hash = 0;
330 udp_sk(sk)->udp_portaddr_hash ^= snum; 334 udp_sk(sk)->udp_portaddr_hash ^= snum;
@@ -356,24 +360,6 @@ fail:
356} 360}
357EXPORT_SYMBOL(udp_lib_get_port); 361EXPORT_SYMBOL(udp_lib_get_port);
358 362
359/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
360 * match_wildcard == false: addresses must be exactly the same, i.e.
361 * 0.0.0.0 only equals to 0.0.0.0
362 */
363int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
364 bool match_wildcard)
365{
366 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
367
368 if (!ipv6_only_sock(sk2)) {
369 if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)
370 return 1;
371 if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr)
372 return match_wildcard;
373 }
374 return 0;
375}
376
377static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, 363static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
378 unsigned int port) 364 unsigned int port)
379{ 365{
@@ -389,12 +375,13 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
389 375
390 /* precompute partial secondary hash */ 376 /* precompute partial secondary hash */
391 udp_sk(sk)->udp_portaddr_hash = hash2_partial; 377 udp_sk(sk)->udp_portaddr_hash = hash2_partial;
392 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); 378 return udp_lib_get_port(sk, snum, hash2_nulladdr);
393} 379}
394 380
395static int compute_score(struct sock *sk, struct net *net, 381static int compute_score(struct sock *sk, struct net *net,
396 __be32 saddr, __be16 sport, 382 __be32 saddr, __be16 sport,
397 __be32 daddr, unsigned short hnum, int dif) 383 __be32 daddr, unsigned short hnum, int dif,
384 bool exact_dif)
398{ 385{
399 int score; 386 int score;
400 struct inet_sock *inet; 387 struct inet_sock *inet;
@@ -425,7 +412,7 @@ static int compute_score(struct sock *sk, struct net *net,
425 score += 4; 412 score += 4;
426 } 413 }
427 414
428 if (sk->sk_bound_dev_if) { 415 if (sk->sk_bound_dev_if || exact_dif) {
429 if (sk->sk_bound_dev_if != dif) 416 if (sk->sk_bound_dev_if != dif)
430 return -1; 417 return -1;
431 score += 4; 418 score += 4;
@@ -450,7 +437,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
450/* called with rcu_read_lock() */ 437/* called with rcu_read_lock() */
451static struct sock *udp4_lib_lookup2(struct net *net, 438static struct sock *udp4_lib_lookup2(struct net *net,
452 __be32 saddr, __be16 sport, 439 __be32 saddr, __be16 sport,
453 __be32 daddr, unsigned int hnum, int dif, 440 __be32 daddr, unsigned int hnum, int dif, bool exact_dif,
454 struct udp_hslot *hslot2, 441 struct udp_hslot *hslot2,
455 struct sk_buff *skb) 442 struct sk_buff *skb)
456{ 443{
@@ -462,7 +449,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
462 badness = 0; 449 badness = 0;
463 udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { 450 udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
464 score = compute_score(sk, net, saddr, sport, 451 score = compute_score(sk, net, saddr, sport,
465 daddr, hnum, dif); 452 daddr, hnum, dif, exact_dif);
466 if (score > badness) { 453 if (score > badness) {
467 reuseport = sk->sk_reuseport; 454 reuseport = sk->sk_reuseport;
468 if (reuseport) { 455 if (reuseport) {
@@ -497,6 +484,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
497 unsigned short hnum = ntohs(dport); 484 unsigned short hnum = ntohs(dport);
498 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); 485 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
499 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; 486 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
487 bool exact_dif = udp_lib_exact_dif_match(net, skb);
500 int score, badness, matches = 0, reuseport = 0; 488 int score, badness, matches = 0, reuseport = 0;
501 u32 hash = 0; 489 u32 hash = 0;
502 490
@@ -509,7 +497,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
509 497
510 result = udp4_lib_lookup2(net, saddr, sport, 498 result = udp4_lib_lookup2(net, saddr, sport,
511 daddr, hnum, dif, 499 daddr, hnum, dif,
512 hslot2, skb); 500 exact_dif, hslot2, skb);
513 if (!result) { 501 if (!result) {
514 unsigned int old_slot2 = slot2; 502 unsigned int old_slot2 = slot2;
515 hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); 503 hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
@@ -524,7 +512,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
524 512
525 result = udp4_lib_lookup2(net, saddr, sport, 513 result = udp4_lib_lookup2(net, saddr, sport,
526 daddr, hnum, dif, 514 daddr, hnum, dif,
527 hslot2, skb); 515 exact_dif, hslot2, skb);
528 } 516 }
529 return result; 517 return result;
530 } 518 }
@@ -533,7 +521,7 @@ begin:
533 badness = 0; 521 badness = 0;
534 sk_for_each_rcu(sk, &hslot->head) { 522 sk_for_each_rcu(sk, &hslot->head) {
535 score = compute_score(sk, net, saddr, sport, 523 score = compute_score(sk, net, saddr, sport,
536 daddr, hnum, dif); 524 daddr, hnum, dif, exact_dif);
537 if (score > badness) { 525 if (score > badness) {
538 reuseport = sk->sk_reuseport; 526 reuseport = sk->sk_reuseport;
539 if (reuseport) { 527 if (reuseport) {
@@ -1113,7 +1101,8 @@ out:
1113 return err; 1101 return err;
1114 1102
1115do_confirm: 1103do_confirm:
1116 dst_confirm(&rt->dst); 1104 if (msg->msg_flags & MSG_PROBE)
1105 dst_confirm_neigh(&rt->dst, &fl4->daddr);
1117 if (!(msg->msg_flags&MSG_PROBE) || len) 1106 if (!(msg->msg_flags&MSG_PROBE) || len)
1118 goto back_from_confirm; 1107 goto back_from_confirm;
1119 err = 0; 1108 err = 0;
@@ -1501,7 +1490,7 @@ try_again:
1501 return err; 1490 return err;
1502 1491
1503csum_copy_err: 1492csum_copy_err:
1504 if (!__sk_queue_drop_skb(sk, skb, flags)) { 1493 if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
1505 UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); 1494 UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
1506 UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 1495 UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1507 } 1496 }
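
The udp.c hunks above move the SO_REUSEPORT conditions out of the collision test and into its body: once two sockets are known to collide on address, port and device, the bind only fails when they cannot share the port through SO_REUSEPORT. A rough stand-alone sketch of that decision is below; all struct and helper names are invented for the example.

#include <stdbool.h>
#include <stdio.h>

struct fake_sock {
	bool reuseport;			/* SO_REUSEPORT set            */
	bool in_reuseport_group;	/* sk->sk_reuseport_cb != NULL */
	unsigned int uid;
};

/* Returns true if "other" blocks "sk" from binding the port, assuming the
 * two sockets already collide on address, port and bound device.
 */
static bool udp_port_conflict(const struct fake_sock *sk,
			      const struct fake_sock *other)
{
	if (other->reuseport && sk->reuseport &&
	    !sk->in_reuseport_group && sk->uid == other->uid)
		return false;	/* they can share via SO_REUSEPORT */
	return true;
}

int main(void)
{
	struct fake_sock a = { .reuseport = true,  .uid = 1000 };
	struct fake_sock b = { .reuseport = true,  .uid = 1000 };
	struct fake_sock c = { .reuseport = false, .uid = 1000 };

	printf("%d %d\n", udp_port_conflict(&a, &b),	/* 0: shareable     */
			  udp_port_conflict(&a, &c));	/* 1: real conflict */
	return 0;
}

In the bitmap scan this distinction lets the shareable case return early without marking the port as taken, which is what the restructured udp_lib_lport_inuse() above expresses.
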
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 62e1e72db461..1fc684111ce6 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -40,6 +40,7 @@ drop:
40 40
41int xfrm4_transport_finish(struct sk_buff *skb, int async) 41int xfrm4_transport_finish(struct sk_buff *skb, int async)
42{ 42{
43 struct xfrm_offload *xo = xfrm_offload(skb);
43 struct iphdr *iph = ip_hdr(skb); 44 struct iphdr *iph = ip_hdr(skb);
44 45
45 iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; 46 iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
@@ -53,6 +54,11 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
53 iph->tot_len = htons(skb->len); 54 iph->tot_len = htons(skb->len);
54 ip_send_check(iph); 55 ip_send_check(iph);
55 56
57 if (xo && (xo->flags & XFRM_GRO)) {
58 skb_mac_header_rebuild(skb);
59 return 0;
60 }
61
56 NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, 62 NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
57 dev_net(skb->dev), NULL, skb, skb->dev, NULL, 63 dev_net(skb->dev), NULL, skb, skb->dev, NULL,
58 xfrm4_rcv_encap_finish); 64 xfrm4_rcv_encap_finish);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index fd840c7d75ea..4acc0508c5eb 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -43,6 +43,7 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
43static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) 43static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
44{ 44{
45 int ihl = skb->data - skb_transport_header(skb); 45 int ihl = skb->data - skb_transport_header(skb);
46 struct xfrm_offload *xo = xfrm_offload(skb);
46 47
47 if (skb->transport_header != skb->network_header) { 48 if (skb->transport_header != skb->network_header) {
48 memmove(skb_transport_header(skb), 49 memmove(skb_transport_header(skb),
@@ -50,7 +51,8 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
50 skb->network_header = skb->transport_header; 51 skb->network_header = skb->transport_header;
51 } 52 }
52 ip_hdr(skb)->tot_len = htons(skb->len + ihl); 53 ip_hdr(skb)->tot_len = htons(skb->len + ihl);
53 skb_reset_transport_header(skb); 54 if (!xo || !(xo->flags & XFRM_GRO))
55 skb_reset_transport_header(skb);
54 return 0; 56 return 0;
55} 57}
56 58
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6a7ff6957535..71b4ecc195c7 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
17#include <net/ip.h> 17#include <net/ip.h>
18#include <net/l3mdev.h> 18#include <net/l3mdev.h>
19 19
20static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
21
22static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, 20static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
23 int tos, int oif, 21 int tos, int oif,
24 const xfrm_address_t *saddr, 22 const xfrm_address_t *saddr,
@@ -219,7 +217,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
219{ 217{
220 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); 218 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
221 219
222 xfrm4_policy_afinfo.garbage_collect(net); 220 xfrm_garbage_collect_deferred(net);
223 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); 221 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
224} 222}
225 223
@@ -271,8 +269,7 @@ static struct dst_ops xfrm4_dst_ops_template = {
271 .gc_thresh = INT_MAX, 269 .gc_thresh = INT_MAX,
272}; 270};
273 271
274static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { 272static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
275 .family = AF_INET,
276 .dst_ops = &xfrm4_dst_ops_template, 273 .dst_ops = &xfrm4_dst_ops_template,
277 .dst_lookup = xfrm4_dst_lookup, 274 .dst_lookup = xfrm4_dst_lookup,
278 .get_saddr = xfrm4_get_saddr, 275 .get_saddr = xfrm4_get_saddr,
@@ -376,7 +373,7 @@ static struct pernet_operations __net_initdata xfrm4_net_ops = {
376 373
377static void __init xfrm4_policy_init(void) 374static void __init xfrm4_policy_init(void)
378{ 375{
379 xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); 376 xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET);
380} 377}
381 378
382void __init xfrm4_init(void) 379void __init xfrm4_init(void)
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index dccefa9d84cf..8dd0e6ab8606 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -188,9 +188,8 @@ static const struct net_protocol ipcomp4_protocol = {
188 .netns_ok = 1, 188 .netns_ok = 1,
189}; 189};
190 190
191static struct xfrm_input_afinfo xfrm4_input_afinfo = { 191static const struct xfrm_input_afinfo xfrm4_input_afinfo = {
192 .family = AF_INET, 192 .family = AF_INET,
193 .owner = THIS_MODULE,
194 .callback = xfrm4_rcv_cb, 193 .callback = xfrm4_rcv_cb,
195}; 194};
196 195
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 542074c00c78..d6660a8c0ea5 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -90,11 +90,3 @@ void __init xfrm4_state_init(void)
90{ 90{
91 xfrm_state_register_afinfo(&xfrm4_state_afinfo); 91 xfrm_state_register_afinfo(&xfrm4_state_afinfo);
92} 92}
93
94#if 0
95void __exit xfrm4_state_fini(void)
96{
97 xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
98}
99#endif /* 0 */
100
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ec1267e2bd1f..e2afe677a9d9 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -75,6 +75,19 @@ config INET6_ESP
75 75
76 If unsure, say Y. 76 If unsure, say Y.
77 77
78config INET6_ESP_OFFLOAD
79 tristate "IPv6: ESP transformation offload"
80 depends on INET6_ESP
81 select XFRM_OFFLOAD
82 default n
83 ---help---
84 Support for ESP transformation offload. This makes sense
 85	  only if this system really does IPsec and wants to do it
86 with high throughput. A typical desktop system does not
87 need it, even if it does IPsec.
88
89 If unsure, say N.
90
78config INET6_IPCOMP 91config INET6_IPCOMP
79 tristate "IPv6: IPComp transformation" 92 tristate "IPv6: IPComp transformation"
80 select INET6_XFRM_TUNNEL 93 select INET6_XFRM_TUNNEL
@@ -208,6 +221,7 @@ config IPV6_TUNNEL
208 tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" 221 tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)"
209 select INET6_TUNNEL 222 select INET6_TUNNEL
210 select DST_CACHE 223 select DST_CACHE
224 select GRO_CELLS
211 ---help--- 225 ---help---
212 Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in 226 Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
213 RFC 2473. 227 RFC 2473.
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index a9e9fec387ce..217e9ff0e24b 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -30,6 +30,7 @@ ipv6-objs += $(ipv6-y)
30 30
31obj-$(CONFIG_INET6_AH) += ah6.o 31obj-$(CONFIG_INET6_AH) += ah6.o
32obj-$(CONFIG_INET6_ESP) += esp6.o 32obj-$(CONFIG_INET6_ESP) += esp6.o
33obj-$(CONFIG_INET6_ESP_OFFLOAD) += esp6_offload.o
33obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o 34obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
34obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o 35obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
35obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o 36obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f60e88e56255..80ce478c4851 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -43,6 +43,7 @@
43#include <linux/errno.h> 43#include <linux/errno.h>
44#include <linux/types.h> 44#include <linux/types.h>
45#include <linux/kernel.h> 45#include <linux/kernel.h>
46#include <linux/sched/signal.h>
46#include <linux/socket.h> 47#include <linux/socket.h>
47#include <linux/sockios.h> 48#include <linux/sockios.h>
48#include <linux/net.h> 49#include <linux/net.h>
@@ -243,6 +244,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
243 .seg6_require_hmac = 0, 244 .seg6_require_hmac = 0,
244#endif 245#endif
245 .enhanced_dad = 1, 246 .enhanced_dad = 1,
247 .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
246}; 248};
247 249
248static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { 250static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -294,6 +296,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
294 .seg6_require_hmac = 0, 296 .seg6_require_hmac = 0,
295#endif 297#endif
296 .enhanced_dad = 1, 298 .enhanced_dad = 1,
299 .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
297}; 300};
298 301
299/* Check if a valid qdisc is available */ 302/* Check if a valid qdisc is available */
@@ -386,9 +389,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
386 memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); 389 memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
387 390
388 if (ndev->cnf.stable_secret.initialized) 391 if (ndev->cnf.stable_secret.initialized)
389 ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; 392 ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
390 else 393 else
391 ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64; 394 ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode;
392 395
393 ndev->cnf.mtu6 = dev->mtu; 396 ndev->cnf.mtu6 = dev->mtu;
394 ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); 397 ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -2144,12 +2147,14 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
2144 case ARPHRD_SIT: 2147 case ARPHRD_SIT:
2145 return addrconf_ifid_sit(eui, dev); 2148 return addrconf_ifid_sit(eui, dev);
2146 case ARPHRD_IPGRE: 2149 case ARPHRD_IPGRE:
2150 case ARPHRD_TUNNEL:
2147 return addrconf_ifid_gre(eui, dev); 2151 return addrconf_ifid_gre(eui, dev);
2148 case ARPHRD_6LOWPAN: 2152 case ARPHRD_6LOWPAN:
2149 return addrconf_ifid_eui64(eui, dev); 2153 return addrconf_ifid_eui64(eui, dev);
2150 case ARPHRD_IEEE1394: 2154 case ARPHRD_IEEE1394:
2151 return addrconf_ifid_ieee1394(eui, dev); 2155 return addrconf_ifid_ieee1394(eui, dev);
2152 case ARPHRD_TUNNEL6: 2156 case ARPHRD_TUNNEL6:
2157 case ARPHRD_IP6GRE:
2153 return addrconf_ifid_ip6tnl(eui, dev); 2158 return addrconf_ifid_ip6tnl(eui, dev);
2154 } 2159 }
2155 return -1; 2160 return -1;
@@ -2387,8 +2392,8 @@ static void manage_tempaddrs(struct inet6_dev *idev,
2387 2392
2388static bool is_addr_mode_generate_stable(struct inet6_dev *idev) 2393static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
2389{ 2394{
2390 return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY || 2395 return idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
2391 idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; 2396 idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
2392} 2397}
2393 2398
2394int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, 2399int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
@@ -3152,7 +3157,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
3152 3157
3153 ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); 3158 ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
3154 3159
3155 switch (idev->addr_gen_mode) { 3160 switch (idev->cnf.addr_gen_mode) {
3156 case IN6_ADDR_GEN_MODE_RANDOM: 3161 case IN6_ADDR_GEN_MODE_RANDOM:
3157 ipv6_gen_mode_random_init(idev); 3162 ipv6_gen_mode_random_init(idev);
3158 /* fallthrough */ 3163 /* fallthrough */
@@ -3193,6 +3198,9 @@ static void addrconf_dev_config(struct net_device *dev)
3193 (dev->type != ARPHRD_IEEE1394) && 3198 (dev->type != ARPHRD_IEEE1394) &&
3194 (dev->type != ARPHRD_TUNNEL6) && 3199 (dev->type != ARPHRD_TUNNEL6) &&
3195 (dev->type != ARPHRD_6LOWPAN) && 3200 (dev->type != ARPHRD_6LOWPAN) &&
3201 (dev->type != ARPHRD_IP6GRE) &&
3202 (dev->type != ARPHRD_IPGRE) &&
3203 (dev->type != ARPHRD_TUNNEL) &&
3196 (dev->type != ARPHRD_NONE)) { 3204 (dev->type != ARPHRD_NONE)) {
3197 /* Alas, we support only Ethernet autoconfiguration. */ 3205 /* Alas, we support only Ethernet autoconfiguration. */
3198 return; 3206 return;
@@ -3204,8 +3212,8 @@ static void addrconf_dev_config(struct net_device *dev)
3204 3212
3205 /* this device type has no EUI support */ 3213 /* this device type has no EUI support */
3206 if (dev->type == ARPHRD_NONE && 3214 if (dev->type == ARPHRD_NONE &&
3207 idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) 3215 idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
3208 idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM; 3216 idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
3209 3217
3210 addrconf_addr_gen(idev, false); 3218 addrconf_addr_gen(idev, false);
3211} 3219}
@@ -3386,9 +3394,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
3386 } 3394 }
3387 3395
3388 if (idev) { 3396 if (idev) {
3389 if (idev->if_flags & IF_READY) 3397 if (idev->if_flags & IF_READY) {
3390 /* device is already configured. */ 3398 /* device is already configured -
3399 * but resend MLD reports, we might
3400 * have roamed and need to update
3401 * multicast snooping switches
3402 */
3403 ipv6_mc_up(idev);
3391 break; 3404 break;
3405 }
3392 idev->if_flags |= IF_READY; 3406 idev->if_flags |= IF_READY;
3393 } 3407 }
3394 3408
@@ -3612,14 +3626,19 @@ restart:
3612 INIT_LIST_HEAD(&del_list); 3626 INIT_LIST_HEAD(&del_list);
3613 list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { 3627 list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
3614 struct rt6_info *rt = NULL; 3628 struct rt6_info *rt = NULL;
3629 bool keep;
3615 3630
3616 addrconf_del_dad_work(ifa); 3631 addrconf_del_dad_work(ifa);
3617 3632
3633 keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
3634 !addr_is_local(&ifa->addr);
3635 if (!keep)
3636 list_move(&ifa->if_list, &del_list);
3637
3618 write_unlock_bh(&idev->lock); 3638 write_unlock_bh(&idev->lock);
3619 spin_lock_bh(&ifa->lock); 3639 spin_lock_bh(&ifa->lock);
3620 3640
3621 if (keep_addr && (ifa->flags & IFA_F_PERMANENT) && 3641 if (keep) {
3622 !addr_is_local(&ifa->addr)) {
3623 /* set state to skip the notifier below */ 3642 /* set state to skip the notifier below */
3624 state = INET6_IFADDR_STATE_DEAD; 3643 state = INET6_IFADDR_STATE_DEAD;
3625 ifa->state = 0; 3644 ifa->state = 0;
@@ -3631,8 +3650,6 @@ restart:
3631 } else { 3650 } else {
3632 state = ifa->state; 3651 state = ifa->state;
3633 ifa->state = INET6_IFADDR_STATE_DEAD; 3652 ifa->state = INET6_IFADDR_STATE_DEAD;
3634
3635 list_move(&ifa->if_list, &del_list);
3636 } 3653 }
3637 3654
3638 spin_unlock_bh(&ifa->lock); 3655 spin_unlock_bh(&ifa->lock);
@@ -4009,6 +4026,12 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id)
4009 4026
4010 if (bump_id) 4027 if (bump_id)
4011 rt_genid_bump_ipv6(dev_net(dev)); 4028 rt_genid_bump_ipv6(dev_net(dev));
4029
4030 /* Make sure that a new temporary address will be created
4031 * before this temporary address becomes deprecated.
4032 */
4033 if (ifp->flags & IFA_F_TEMPORARY)
4034 addrconf_verify_rtnl();
4012} 4035}
4013 4036
4014static void addrconf_dad_run(struct inet6_dev *idev) 4037static void addrconf_dad_run(struct inet6_dev *idev)
@@ -4888,6 +4911,13 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
4888 struct net *net = dev_net(ifa->idev->dev); 4911 struct net *net = dev_net(ifa->idev->dev);
4889 int err = -ENOBUFS; 4912 int err = -ENOBUFS;
4890 4913
4914 /* Don't send DELADDR notification for TENTATIVE address,
4915 * since NEWADDR notification is sent only after removing
4916 * TENTATIVE flag.
4917 */
4918 if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR)
4919 return;
4920
4891 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); 4921 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
4892 if (!skb) 4922 if (!skb)
4893 goto errout; 4923 goto errout;
@@ -4975,6 +5005,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
4975 array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac; 5005 array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
4976#endif 5006#endif
4977 array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; 5007 array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
5008 array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode;
4978} 5009}
4979 5010
4980static inline size_t inet6_ifla6_size(void) 5011static inline size_t inet6_ifla6_size(void)
@@ -5086,7 +5117,7 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
5086 if (!nla) 5117 if (!nla)
5087 goto nla_put_failure; 5118 goto nla_put_failure;
5088 5119
5089 if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode)) 5120 if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode))
5090 goto nla_put_failure; 5121 goto nla_put_failure;
5091 5122
5092 read_lock_bh(&idev->lock); 5123 read_lock_bh(&idev->lock);
@@ -5204,6 +5235,26 @@ static int inet6_validate_link_af(const struct net_device *dev,
5204 return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy); 5235 return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy);
5205} 5236}
5206 5237
5238static int check_addr_gen_mode(int mode)
5239{
5240 if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
5241 mode != IN6_ADDR_GEN_MODE_NONE &&
5242 mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
5243 mode != IN6_ADDR_GEN_MODE_RANDOM)
5244 return -EINVAL;
5245 return 1;
5246}
5247
5248static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
5249 int mode)
5250{
5251 if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
5252 !idev->cnf.stable_secret.initialized &&
5253 !net->ipv6.devconf_dflt->stable_secret.initialized)
5254 return -EINVAL;
5255 return 1;
5256}
5257
5207static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) 5258static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
5208{ 5259{
5209 int err = -EINVAL; 5260 int err = -EINVAL;
@@ -5225,18 +5276,11 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
5225 if (tb[IFLA_INET6_ADDR_GEN_MODE]) { 5276 if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
5226 u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); 5277 u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
5227 5278
5228 if (mode != IN6_ADDR_GEN_MODE_EUI64 && 5279 if (check_addr_gen_mode(mode) < 0 ||
5229 mode != IN6_ADDR_GEN_MODE_NONE && 5280 check_stable_privacy(idev, dev_net(dev), mode) < 0)
5230 mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
5231 mode != IN6_ADDR_GEN_MODE_RANDOM)
5232 return -EINVAL; 5281 return -EINVAL;
5233 5282
5234 if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY && 5283 idev->cnf.addr_gen_mode = mode;
5235 !idev->cnf.stable_secret.initialized &&
5236 !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized)
5237 return -EINVAL;
5238
5239 idev->addr_gen_mode = mode;
5240 err = 0; 5284 err = 0;
5241 } 5285 }
5242 5286
@@ -5643,6 +5687,55 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
5643 return ret; 5687 return ret;
5644} 5688}
5645 5689
5690static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
5691 void __user *buffer, size_t *lenp,
5692 loff_t *ppos)
5693{
5694 int ret = 0;
5695 int new_val;
5696 struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
5697 struct net *net = (struct net *)ctl->extra2;
5698
5699 if (!rtnl_trylock())
5700 return restart_syscall();
5701
5702 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5703
5704 if (write) {
5705 new_val = *((int *)ctl->data);
5706
5707 if (check_addr_gen_mode(new_val) < 0) {
5708 ret = -EINVAL;
5709 goto out;
5710 }
5711
5712 /* request for default */
5713 if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) {
5714 ipv6_devconf_dflt.addr_gen_mode = new_val;
5715
5716 /* request for individual net device */
5717 } else {
5718 if (!idev)
5719 goto out;
5720
5721 if (check_stable_privacy(idev, net, new_val) < 0) {
5722 ret = -EINVAL;
5723 goto out;
5724 }
5725
5726 if (idev->cnf.addr_gen_mode != new_val) {
5727 idev->cnf.addr_gen_mode = new_val;
5728 addrconf_dev_config(idev->dev);
5729 }
5730 }
5731 }
5732
5733out:
5734 rtnl_unlock();
5735
5736 return ret;
5737}
5738
5646static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, 5739static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
5647 void __user *buffer, size_t *lenp, 5740 void __user *buffer, size_t *lenp,
5648 loff_t *ppos) 5741 loff_t *ppos)
@@ -5693,14 +5786,14 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
5693 struct inet6_dev *idev = __in6_dev_get(dev); 5786 struct inet6_dev *idev = __in6_dev_get(dev);
5694 5787
5695 if (idev) { 5788 if (idev) {
5696 idev->addr_gen_mode = 5789 idev->cnf.addr_gen_mode =
5697 IN6_ADDR_GEN_MODE_STABLE_PRIVACY; 5790 IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
5698 } 5791 }
5699 } 5792 }
5700 } else { 5793 } else {
5701 struct inet6_dev *idev = ctl->extra1; 5794 struct inet6_dev *idev = ctl->extra1;
5702 5795
5703 idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; 5796 idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
5704 } 5797 }
5705 5798
5706out: 5799out:
@@ -6088,6 +6181,13 @@ static const struct ctl_table addrconf_sysctl[] = {
6088 .proc_handler = proc_dointvec, 6181 .proc_handler = proc_dointvec,
6089 }, 6182 },
6090 { 6183 {
6184 .procname = "addr_gen_mode",
6185 .data = &ipv6_devconf.addr_gen_mode,
6186 .maxlen = sizeof(int),
6187 .mode = 0644,
6188 .proc_handler = addrconf_sysctl_addr_gen_mode,
6189 },
6190 {
6091 /* sentinel */ 6191 /* sentinel */
6092 } 6192 }
6093}; 6193};
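
The new "addr_gen_mode" sysctl handler added to addrconf.c reuses the same two checks as the netlink path: the mode must be one of the known IN6_ADDR_GEN_MODE_* values, and stable-privacy mode requires a configured stable secret. A small stand-alone sketch of that validation follows; the constants mirror IN6_ADDR_GEN_MODE_*, everything else is invented.

#include <stdbool.h>
#include <stdio.h>

enum {
	ADDR_GEN_EUI64 = 0,
	ADDR_GEN_NONE = 1,
	ADDR_GEN_STABLE_PRIVACY = 2,
	ADDR_GEN_RANDOM = 3,
};

static bool mode_is_valid(int mode)
{
	return mode == ADDR_GEN_EUI64 || mode == ADDR_GEN_NONE ||
	       mode == ADDR_GEN_STABLE_PRIVACY || mode == ADDR_GEN_RANDOM;
}

/* Same decision as check_addr_gen_mode() + check_stable_privacy() above:
 * reject unknown modes, and reject stable-privacy when neither the device
 * nor the per-netns default has a stable secret configured.
 */
static int set_addr_gen_mode(int mode, bool dev_secret_set, bool net_secret_set)
{
	if (!mode_is_valid(mode))
		return -1;	/* -EINVAL in the patch */
	if (mode == ADDR_GEN_STABLE_PRIVACY &&
	    !dev_secret_set && !net_secret_set)
		return -1;
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       set_addr_gen_mode(ADDR_GEN_RANDOM, false, false),	  /*  0 */
	       set_addr_gen_mode(ADDR_GEN_STABLE_PRIVACY, false, false), /* -1 */
	       set_addr_gen_mode(ADDR_GEN_STABLE_PRIVACY, true, false)); /*  0 */
	return 0;
}
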
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index aa42123bc301..a9a9553ee63d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -302,7 +302,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
302 return -EINVAL; 302 return -EINVAL;
303 303
304 snum = ntohs(addr->sin6_port); 304 snum = ntohs(addr->sin6_port);
305 if (snum && snum < PROT_SOCK && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) 305 if (snum && snum < inet_prot_sock(net) &&
306 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
306 return -EACCES; 307 return -EACCES;
307 308
308 lock_sock(sk); 309 lock_sock(sk);
@@ -919,12 +920,12 @@ static int __init inet6_init(void)
919 err = register_pernet_subsys(&inet6_net_ops); 920 err = register_pernet_subsys(&inet6_net_ops);
920 if (err) 921 if (err)
921 goto register_pernet_fail; 922 goto register_pernet_fail;
922 err = icmpv6_init();
923 if (err)
924 goto icmp_fail;
925 err = ip6_mr_init(); 923 err = ip6_mr_init();
926 if (err) 924 if (err)
927 goto ipmr_fail; 925 goto ipmr_fail;
926 err = icmpv6_init();
927 if (err)
928 goto icmp_fail;
928 err = ndisc_init(); 929 err = ndisc_init();
929 if (err) 930 if (err)
930 goto ndisc_fail; 931 goto ndisc_fail;
@@ -1060,10 +1061,10 @@ igmp_fail:
1060 ndisc_cleanup(); 1061 ndisc_cleanup();
1061ndisc_fail: 1062ndisc_fail:
1062 ip6_mr_cleanup(); 1063 ip6_mr_cleanup();
1063ipmr_fail:
1064 icmpv6_cleanup();
1065icmp_fail: 1064icmp_fail:
1066 unregister_pernet_subsys(&inet6_net_ops); 1065 unregister_pernet_subsys(&inet6_net_ops);
1066ipmr_fail:
1067 icmpv6_cleanup();
1067register_pernet_fail: 1068register_pernet_fail:
1068 sock_unregister(PF_INET6); 1069 sock_unregister(PF_INET6);
1069 rtnl_unregister_all(PF_INET6); 1070 rtnl_unregister_all(PF_INET6);
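
The af_inet6.c hunk swaps the order of ip6_mr_init() and icmpv6_init() and, correspondingly, swaps their error labels so the unwind still runs in reverse order of initialisation. A generic stand-alone illustration of that idiom (subsystem names are placeholders, not the kernel's):

#include <stdio.h>

static int init_mroute(void) { return 0; }
static int init_icmp(void)   { return -1; }	/* pretend this one fails */
static void cleanup_mroute(void) { puts("cleanup mroute"); }

static int subsys_init(void)
{
	int err;

	err = init_mroute();
	if (err)
		goto mroute_fail;
	err = init_icmp();
	if (err)
		goto icmp_fail;
	return 0;

	/* unwind strictly in reverse order of initialisation */
icmp_fail:
	cleanup_mroute();
mroute_fail:
	return err;
}

int main(void)
{
	printf("init: %d\n", subsys_init());	/* "cleanup mroute", then -1 */
	return 0;
}

Keeping the labels in reverse init order is what guarantees a failure path only undoes work that actually succeeded, which is the point of reordering the labels in inet6_init().
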
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 189eb10b742d..dda6035e3b84 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -474,6 +474,9 @@ static void ah6_input_done(struct crypto_async_request *base, int err)
474 int hdr_len = skb_network_header_len(skb); 474 int hdr_len = skb_network_header_len(skb);
475 int ah_hlen = (ah->hdrlen + 2) << 2; 475 int ah_hlen = (ah->hdrlen + 2) << 2;
476 476
477 if (err)
478 goto out;
479
477 work_iph = AH_SKB_CB(skb)->tmp; 480 work_iph = AH_SKB_CB(skb)->tmp;
478 auth_data = ah_tmp_auth(work_iph, hdr_len); 481 auth_data = ah_tmp_auth(work_iph, hdr_len);
479 icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); 482 icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index a3eaafd87100..e011122ebd43 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -167,18 +167,22 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
167 if (np->sndflow) 167 if (np->sndflow)
168 fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; 168 fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
169 169
170 addr_type = ipv6_addr_type(&usin->sin6_addr); 170 if (ipv6_addr_any(&usin->sin6_addr)) {
171
172 if (addr_type == IPV6_ADDR_ANY) {
173 /* 171 /*
174 * connect to self 172 * connect to self
175 */ 173 */
176 usin->sin6_addr.s6_addr[15] = 0x01; 174 if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
175 ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
176 &usin->sin6_addr);
177 else
178 usin->sin6_addr = in6addr_loopback;
177 } 179 }
178 180
181 addr_type = ipv6_addr_type(&usin->sin6_addr);
182
179 daddr = &usin->sin6_addr; 183 daddr = &usin->sin6_addr;
180 184
181 if (addr_type == IPV6_ADDR_MAPPED) { 185 if (addr_type & IPV6_ADDR_MAPPED) {
182 struct sockaddr_in sin; 186 struct sockaddr_in sin;
183 187
184 if (__ipv6_only_sock(sk)) { 188 if (__ipv6_only_sock(sk)) {
@@ -401,9 +405,6 @@ static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr)
401 * At one point, excluding local errors was a quick test to identify icmp/icmp6 405 * At one point, excluding local errors was a quick test to identify icmp/icmp6
402 * errors. This is no longer true, but the test remained, so the v6 stack, 406 * errors. This is no longer true, but the test remained, so the v6 stack,
403 * unlike v4, also honors cmsg requests on all wifi and timestamp errors. 407 * unlike v4, also honors cmsg requests on all wifi and timestamp errors.
404 *
405 * Timestamp code paths do not initialize the fields expected by cmsg:
406 * the PKTINFO fields in skb->cb[]. Fill those in here.
407 */ 408 */
408static bool ip6_datagram_support_cmsg(struct sk_buff *skb, 409static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
409 struct sock_exterr_skb *serr) 410 struct sock_exterr_skb *serr)
@@ -415,14 +416,9 @@ static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
415 if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) 416 if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL)
416 return false; 417 return false;
417 418
418 if (!skb->dev) 419 if (!IP6CB(skb)->iif)
419 return false; 420 return false;
420 421
421 if (skb->protocol == htons(ETH_P_IPV6))
422 IP6CB(skb)->iif = skb->dev->ifindex;
423 else
424 PKTINFO_SKB_CB(skb)->ipi_ifindex = skb->dev->ifindex;
425
426 return true; 422 return true;
427} 423}
428 424
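
The datagram.c change replaces the old trick of flipping the last byte of an unspecified destination with an explicit loopback choice: the IPv4-mapped loopback when the socket's source address is itself v4-mapped, and ::1 otherwise. A stand-alone sketch using ordinary inet headers rather than kernel types:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Pick the "connect to self" destination for an unspecified address. */
static struct in6_addr pick_self_addr(bool local_is_v4mapped)
{
	struct in6_addr a;

	if (local_is_v4mapped) {
		/* ::ffff:127.0.0.1 */
		memset(&a, 0, sizeof(a));
		a.s6_addr[10] = 0xff;
		a.s6_addr[11] = 0xff;
		a.s6_addr[12] = 127;
		a.s6_addr[15] = 1;
	} else {
		a = in6addr_loopback;	/* ::1 */
	}
	return a;
}

int main(void)
{
	char buf[INET6_ADDRSTRLEN];
	struct in6_addr a = pick_self_addr(true);
	struct in6_addr b = pick_self_addr(false);

	printf("%s\n", inet_ntop(AF_INET6, &a, buf, sizeof(buf)));
	printf("%s\n", inet_ntop(AF_INET6, &b, buf, sizeof(buf)));
	return 0;
}
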
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index cbcdd5db31f4..ff54faa75631 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -44,6 +44,8 @@
44#include <net/protocol.h> 44#include <net/protocol.h>
45#include <linux/icmpv6.h> 45#include <linux/icmpv6.h>
46 46
47#include <linux/highmem.h>
48
47struct esp_skb_cb { 49struct esp_skb_cb {
48 struct xfrm_skb_cb xfrm; 50 struct xfrm_skb_cb xfrm;
49 void *tmp; 51 void *tmp;
@@ -114,11 +116,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
114 __alignof__(struct scatterlist)); 116 __alignof__(struct scatterlist));
115} 117}
116 118
119static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
120{
121 __be32 *seqhi;
122 struct crypto_aead *aead = x->data;
123 int seqhilen = 0;
124 u8 *iv;
125 struct aead_request *req;
126 struct scatterlist *sg;
127
128 if (x->props.flags & XFRM_STATE_ESN)
129 seqhilen += sizeof(__be32);
130
131 seqhi = esp_tmp_seqhi(tmp);
132 iv = esp_tmp_iv(aead, tmp, seqhilen);
133 req = esp_tmp_req(aead, iv);
134
135 /* Unref skb_frag_pages in the src scatterlist if necessary.
136 * Skip the first sg which comes from skb->data.
137 */
138 if (req->src != req->dst)
139 for (sg = sg_next(req->src); sg; sg = sg_next(sg))
140 put_page(sg_page(sg));
141}
142
117static void esp_output_done(struct crypto_async_request *base, int err) 143static void esp_output_done(struct crypto_async_request *base, int err)
118{ 144{
119 struct sk_buff *skb = base->data; 145 struct sk_buff *skb = base->data;
146 void *tmp;
147 struct dst_entry *dst = skb_dst(skb);
148 struct xfrm_state *x = dst->xfrm;
120 149
121 kfree(ESP_SKB_CB(skb)->tmp); 150 tmp = ESP_SKB_CB(skb)->tmp;
151 esp_ssg_unref(x, tmp);
152 kfree(tmp);
122 xfrm_output_resume(skb, err); 153 xfrm_output_resume(skb, err);
123} 154}
124 155
@@ -138,6 +169,27 @@ static void esp_output_restore_header(struct sk_buff *skb)
138 esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); 169 esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
139} 170}
140 171
172static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
173 struct ip_esp_hdr *esph,
174 __be32 *seqhi)
175{
176 struct xfrm_state *x = skb_dst(skb)->xfrm;
177
178 /* For ESN we move the header forward by 4 bytes to
179 * accommodate the high bits. We will move it back after
180 * encryption.
181 */
182 if ((x->props.flags & XFRM_STATE_ESN)) {
183 esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
184 *seqhi = esph->spi;
185 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
186 }
187
188 esph->spi = x->id.spi;
189
190 return esph;
191}
192
141static void esp_output_done_esn(struct crypto_async_request *base, int err) 193static void esp_output_done_esn(struct crypto_async_request *base, int err)
142{ 194{
143 struct sk_buff *skb = base->data; 195 struct sk_buff *skb = base->data;
@@ -146,14 +198,31 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
146 esp_output_done(base, err); 198 esp_output_done(base, err);
147} 199}
148 200
201static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
202{
203 /* Fill padding... */
204 if (tfclen) {
205 memset(tail, 0, tfclen);
206 tail += tfclen;
207 }
208 do {
209 int i;
210 for (i = 0; i < plen - 2; i++)
211 tail[i] = i + 1;
212 } while (0);
213 tail[plen - 2] = plen - 2;
214 tail[plen - 1] = proto;
215}
216
149static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) 217static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
150{ 218{
151 int err; 219 int err;
152 struct ip_esp_hdr *esph; 220 struct ip_esp_hdr *esph;
153 struct crypto_aead *aead; 221 struct crypto_aead *aead;
154 struct aead_request *req; 222 struct aead_request *req;
155 struct scatterlist *sg; 223 struct scatterlist *sg, *dsg;
156 struct sk_buff *trailer; 224 struct sk_buff *trailer;
225 struct page *page;
157 void *tmp; 226 void *tmp;
158 int blksize; 227 int blksize;
159 int clen; 228 int clen;
@@ -164,10 +233,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
164 int nfrags; 233 int nfrags;
165 int assoclen; 234 int assoclen;
166 int seqhilen; 235 int seqhilen;
236 int tailen;
167 u8 *iv; 237 u8 *iv;
168 u8 *tail; 238 u8 *tail;
239 u8 *vaddr;
169 __be32 *seqhi; 240 __be32 *seqhi;
170 __be64 seqno; 241 __be64 seqno;
242 __u8 proto = *skb_mac_header(skb);
171 243
172 /* skb is pure payload to encrypt */ 244 /* skb is pure payload to encrypt */
173 aead = x->data; 245 aead = x->data;
@@ -186,11 +258,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
186 blksize = ALIGN(crypto_aead_blocksize(aead), 4); 258 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
187 clen = ALIGN(skb->len + 2 + tfclen, blksize); 259 clen = ALIGN(skb->len + 2 + tfclen, blksize);
188 plen = clen - skb->len - tfclen; 260 plen = clen - skb->len - tfclen;
189 261 tailen = tfclen + plen + alen;
190 err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
191 if (err < 0)
192 goto error;
193 nfrags = err;
194 262
195 assoclen = sizeof(*esph); 263 assoclen = sizeof(*esph);
196 seqhilen = 0; 264 seqhilen = 0;
@@ -200,59 +268,152 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
200 assoclen += seqhilen; 268 assoclen += seqhilen;
201 } 269 }
202 270
203 tmp = esp_alloc_tmp(aead, nfrags, seqhilen); 271 *skb_mac_header(skb) = IPPROTO_ESP;
204 if (!tmp) { 272 esph = ip_esp_hdr(skb);
205 err = -ENOMEM; 273
206 goto error; 274 if (!skb_cloned(skb)) {
275 if (tailen <= skb_availroom(skb)) {
276 nfrags = 1;
277 trailer = skb;
278 tail = skb_tail_pointer(trailer);
279
280 goto skip_cow;
281 } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
282 && !skb_has_frag_list(skb)) {
283 int allocsize;
284 struct sock *sk = skb->sk;
285 struct page_frag *pfrag = &x->xfrag;
286
287 allocsize = ALIGN(tailen, L1_CACHE_BYTES);
288
289 spin_lock_bh(&x->lock);
290
291 if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
292 spin_unlock_bh(&x->lock);
293 goto cow;
294 }
295
296 page = pfrag->page;
297 get_page(page);
298
299 vaddr = kmap_atomic(page);
300
301 tail = vaddr + pfrag->offset;
302
303 esp_output_fill_trailer(tail, tfclen, plen, proto);
304
305 kunmap_atomic(vaddr);
306
307 nfrags = skb_shinfo(skb)->nr_frags;
308
309 __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
310 tailen);
311 skb_shinfo(skb)->nr_frags = ++nfrags;
312
313 pfrag->offset = pfrag->offset + allocsize;
314 nfrags++;
315
316 skb->len += tailen;
317 skb->data_len += tailen;
318 skb->truesize += tailen;
319 if (sk)
320 atomic_add(tailen, &sk->sk_wmem_alloc);
321
322 skb_push(skb, -skb_network_offset(skb));
323
324 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
325 esph->spi = x->id.spi;
326
327 tmp = esp_alloc_tmp(aead, nfrags + 2, seqhilen);
328 if (!tmp) {
329 spin_unlock_bh(&x->lock);
330 err = -ENOMEM;
331 goto error;
332 }
333 seqhi = esp_tmp_seqhi(tmp);
334 iv = esp_tmp_iv(aead, tmp, seqhilen);
335 req = esp_tmp_req(aead, iv);
336 sg = esp_req_sg(aead, req);
337 dsg = &sg[nfrags];
338
339 esph = esp_output_set_esn(skb, esph, seqhi);
340
341 sg_init_table(sg, nfrags);
342 skb_to_sgvec(skb, sg,
343 (unsigned char *)esph - skb->data,
344 assoclen + ivlen + clen + alen);
345
346 allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
347
348 if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
349 spin_unlock_bh(&x->lock);
350 err = -ENOMEM;
351 goto error;
352 }
353
354 skb_shinfo(skb)->nr_frags = 1;
355
356 page = pfrag->page;
357 get_page(page);
358 /* replace page frags in skb with new page */
359 __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
360 pfrag->offset = pfrag->offset + allocsize;
361
362 sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
363 skb_to_sgvec(skb, dsg,
364 (unsigned char *)esph - skb->data,
365 assoclen + ivlen + clen + alen);
366
367 spin_unlock_bh(&x->lock);
368
369 goto skip_cow2;
370 }
207 } 371 }
208 372
209 seqhi = esp_tmp_seqhi(tmp); 373cow:
210 iv = esp_tmp_iv(aead, tmp, seqhilen); 374 err = skb_cow_data(skb, tailen, &trailer);
211 req = esp_tmp_req(aead, iv); 375 if (err < 0)
212 sg = esp_req_sg(aead, req); 376 goto error;
377 nfrags = err;
213 378
214 /* Fill padding... */
215 tail = skb_tail_pointer(trailer); 379 tail = skb_tail_pointer(trailer);
216 if (tfclen) { 380 esph = ip_esp_hdr(skb);
217 memset(tail, 0, tfclen);
218 tail += tfclen;
219 }
220 do {
221 int i;
222 for (i = 0; i < plen - 2; i++)
223 tail[i] = i + 1;
224 } while (0);
225 tail[plen - 2] = plen - 2;
226 tail[plen - 1] = *skb_mac_header(skb);
227 pskb_put(skb, trailer, clen - skb->len + alen);
228 381
382skip_cow:
383 esp_output_fill_trailer(tail, tfclen, plen, proto);
384
385 pskb_put(skb, trailer, clen - skb->len + alen);
229 skb_push(skb, -skb_network_offset(skb)); 386 skb_push(skb, -skb_network_offset(skb));
230 esph = ip_esp_hdr(skb);
231 *skb_mac_header(skb) = IPPROTO_ESP;
232 387
233 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); 388 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
389 esph->spi = x->id.spi;
234 390
235 aead_request_set_callback(req, 0, esp_output_done, skb); 391 tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
236 392 if (!tmp) {
237 /* For ESN we move the header forward by 4 bytes to 393 err = -ENOMEM;
238 * accomodate the high bits. We will move it back after 394 goto error;
239 * encryption.
240 */
241 if ((x->props.flags & XFRM_STATE_ESN)) {
242 esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
243 *seqhi = esph->spi;
244 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
245 aead_request_set_callback(req, 0, esp_output_done_esn, skb);
246 } 395 }
247 396
248 esph->spi = x->id.spi; 397 seqhi = esp_tmp_seqhi(tmp);
398 iv = esp_tmp_iv(aead, tmp, seqhilen);
399 req = esp_tmp_req(aead, iv);
400 sg = esp_req_sg(aead, req);
401 dsg = sg;
402
403 esph = esp_output_set_esn(skb, esph, seqhi);
249 404
250 sg_init_table(sg, nfrags); 405 sg_init_table(sg, nfrags);
251 skb_to_sgvec(skb, sg, 406 skb_to_sgvec(skb, sg,
252 (unsigned char *)esph - skb->data, 407 (unsigned char *)esph - skb->data,
253 assoclen + ivlen + clen + alen); 408 assoclen + ivlen + clen + alen);
254 409
255 aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); 410skip_cow2:
411 if ((x->props.flags & XFRM_STATE_ESN))
412 aead_request_set_callback(req, 0, esp_output_done_esn, skb);
413 else
414 aead_request_set_callback(req, 0, esp_output_done, skb);
415
416 aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
256 aead_request_set_ad(req, assoclen); 417 aead_request_set_ad(req, assoclen);
257 418
258 seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + 419 seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -278,6 +439,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
278 esp_output_restore_header(skb); 439 esp_output_restore_header(skb);
279 } 440 }
280 441
442 if (sg != dsg)
443 esp_ssg_unref(x, tmp);
281 kfree(tmp); 444 kfree(tmp);
282 445
283error: 446error:
@@ -343,6 +506,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
343 __skb_pull(skb, 4); 506 __skb_pull(skb, 4);
344} 507}
345 508
509static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
510{
511 struct xfrm_state *x = xfrm_input_state(skb);
512 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
513
514 /* For ESN we move the header forward by 4 bytes to
515 * accommodate the high bits. We will move it back after
516 * decryption.
517 */
518 if ((x->props.flags & XFRM_STATE_ESN)) {
519 esph = (void *)skb_push(skb, 4);
520 *seqhi = esph->spi;
521 esph->spi = esph->seq_no;
522 esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
523 }
524}
525
346static void esp_input_done_esn(struct crypto_async_request *base, int err) 526static void esp_input_done_esn(struct crypto_async_request *base, int err)
347{ 527{
348 struct sk_buff *skb = base->data; 528 struct sk_buff *skb = base->data;
@@ -378,14 +558,6 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
378 goto out; 558 goto out;
379 } 559 }
380 560
381 nfrags = skb_cow_data(skb, 0, &trailer);
382 if (nfrags < 0) {
383 ret = -EINVAL;
384 goto out;
385 }
386
387 ret = -ENOMEM;
388
389 assoclen = sizeof(*esph); 561 assoclen = sizeof(*esph);
390 seqhilen = 0; 562 seqhilen = 0;
391 563
@@ -394,6 +566,27 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
394 assoclen += seqhilen; 566 assoclen += seqhilen;
395 } 567 }
396 568
569 if (!skb_cloned(skb)) {
570 if (!skb_is_nonlinear(skb)) {
571 nfrags = 1;
572
573 goto skip_cow;
574 } else if (!skb_has_frag_list(skb)) {
575 nfrags = skb_shinfo(skb)->nr_frags;
576 nfrags++;
577
578 goto skip_cow;
579 }
580 }
581
582 nfrags = skb_cow_data(skb, 0, &trailer);
583 if (nfrags < 0) {
584 ret = -EINVAL;
585 goto out;
586 }
587
588skip_cow:
589 ret = -ENOMEM;
397 tmp = esp_alloc_tmp(aead, nfrags, seqhilen); 590 tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
398 if (!tmp) 591 if (!tmp)
399 goto out; 592 goto out;
@@ -404,26 +597,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
404 req = esp_tmp_req(aead, iv); 597 req = esp_tmp_req(aead, iv);
405 sg = esp_req_sg(aead, req); 598 sg = esp_req_sg(aead, req);
406 599
407 skb->ip_summed = CHECKSUM_NONE; 600 esp_input_set_header(skb, seqhi);
408 601
409 esph = (struct ip_esp_hdr *)skb->data; 602 sg_init_table(sg, nfrags);
603 skb_to_sgvec(skb, sg, 0, skb->len);
410 604
411 aead_request_set_callback(req, 0, esp_input_done, skb); 605 skb->ip_summed = CHECKSUM_NONE;
412 606
413 /* For ESN we move the header forward by 4 bytes to 607 if ((x->props.flags & XFRM_STATE_ESN))
414 * accomodate the high bits. We will move it back after
415 * decryption.
416 */
417 if ((x->props.flags & XFRM_STATE_ESN)) {
418 esph = (void *)skb_push(skb, 4);
419 *seqhi = esph->spi;
420 esph->spi = esph->seq_no;
421 esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
422 aead_request_set_callback(req, 0, esp_input_done_esn, skb); 608 aead_request_set_callback(req, 0, esp_input_done_esn, skb);
423 } 609 else
424 610 aead_request_set_callback(req, 0, esp_input_done, skb);
425 sg_init_table(sg, nfrags);
426 skb_to_sgvec(skb, sg, 0, skb->len);
427 611
428 aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); 612 aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
429 aead_request_set_ad(req, assoclen); 613 aead_request_set_ad(req, assoclen);
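
The ESP output path above now builds the RFC 4303 trailer through the shared esp_output_fill_trailer() helper, both for the in-place (skip_cow) case and for the new page-fragment fast path, with the next-header byte saved from the mac header before it is overwritten with IPPROTO_ESP. A standalone userspace sketch of the trailer layout it produces (buffer size and the protocol value are made up for illustration):

/* Userspace sketch of the trailer that esp_output_fill_trailer() writes
 * (RFC 4303 ESP trailer): optional TFC padding (zeroes), then
 * self-describing pad bytes 1,2,3,..., then Pad Length, then Next Header.
 */
#include <stdio.h>
#include <string.h>

static void fill_trailer(unsigned char *tail, int tfclen, int plen,
			 unsigned char proto)
{
	int i;

	if (tfclen) {			/* traffic-flow confidentiality pad */
		memset(tail, 0, tfclen);
		tail += tfclen;
	}
	for (i = 0; i < plen - 2; i++)	/* monotonically increasing pad */
		tail[i] = i + 1;
	tail[plen - 2] = plen - 2;	/* Pad Length field */
	tail[plen - 1] = proto;		/* Next Header field */
}

int main(void)
{
	unsigned char buf[32];
	int tfclen = 4, plen = 6, i;

	fill_trailer(buf, tfclen, plen, 41);	/* 41 = IPPROTO_IPV6, example */
	for (i = 0; i < tfclen + plen; i++)
		printf("%02x ", buf[i]);
	printf("\n");
	return 0;
}
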
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
new file mode 100644
index 000000000000..d914eb93204a
--- /dev/null
+++ b/net/ipv6/esp6_offload.c
@@ -0,0 +1,108 @@
1/*
2 * IPV6 GSO/GRO offload support
3 * Linux INET implementation
4 *
5 * Copyright (C) 2016 secunet Security Networks AG
6 * Author: Steffen Klassert <steffen.klassert@secunet.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * ESP GRO support
13 */
14
15#include <linux/skbuff.h>
16#include <linux/init.h>
17#include <net/protocol.h>
18#include <crypto/aead.h>
19#include <crypto/authenc.h>
20#include <linux/err.h>
21#include <linux/module.h>
22#include <net/ip.h>
23#include <net/xfrm.h>
24#include <net/esp.h>
25#include <linux/scatterlist.h>
26#include <linux/kernel.h>
27#include <linux/slab.h>
28#include <linux/spinlock.h>
29#include <net/ip6_route.h>
30#include <net/ipv6.h>
31#include <linux/icmpv6.h>
32
33static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
34 struct sk_buff *skb)
35{
36 int offset = skb_gro_offset(skb);
37 struct xfrm_offload *xo;
38 struct xfrm_state *x;
39 __be32 seq;
40 __be32 spi;
41 int err;
42
43 skb_pull(skb, offset);
44
45 if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
46 goto out;
47
48 err = secpath_set(skb);
49 if (err)
50 goto out;
51
52 if (skb->sp->len == XFRM_MAX_DEPTH)
53 goto out;
54
55 x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
56 (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
57 spi, IPPROTO_ESP, AF_INET6);
58 if (!x)
59 goto out;
60
61 skb->sp->xvec[skb->sp->len++] = x;
62 skb->sp->olen++;
63
64 xo = xfrm_offload(skb);
65 if (!xo) {
66 xfrm_state_put(x);
67 goto out;
68 }
69 xo->flags |= XFRM_GRO;
70
71 XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
72 XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
73 XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
74 XFRM_SPI_SKB_CB(skb)->seq = seq;
75
76 /* We don't need to handle errors from xfrm_input, it does all
77 * the error handling and frees the resources on error. */
78 xfrm_input(skb, IPPROTO_ESP, spi, -2);
79
80 return ERR_PTR(-EINPROGRESS);
81out:
82 skb_push(skb, offset);
83 NAPI_GRO_CB(skb)->same_flow = 0;
84 NAPI_GRO_CB(skb)->flush = 1;
85
86 return NULL;
87}
88
89static const struct net_offload esp6_offload = {
90 .callbacks = {
91 .gro_receive = esp6_gro_receive,
92 },
93};
94
95static int __init esp6_offload_init(void)
96{
97 return inet6_add_offload(&esp6_offload, IPPROTO_ESP);
98}
99
100static void __exit esp6_offload_exit(void)
101{
102 inet6_del_offload(&esp6_offload, IPPROTO_ESP);
103}
104
105module_init(esp6_offload_init);
106module_exit(esp6_offload_exit);
107MODULE_LICENSE("GPL");
108MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
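
The new GRO hook only needs the fixed part of the ESP header, SPI followed by sequence number (both 32-bit big-endian), to look up the xfrm state before handing the packet to xfrm_input() and returning ERR_PTR(-EINPROGRESS) so the GRO layer treats it as consumed. A userspace sketch of that 8-byte parse, roughly what xfrm_parse_spi() amounts to for IPPROTO_ESP (packet bytes invented for the example):

/* Userspace sketch: the fixed ESP header is two big-endian 32-bit
 * words, SPI then sequence number, which is all the GRO hook needs for
 * the state lookup.  Not kernel code; the packet bytes are made up.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct esp_hdr {
	uint32_t spi;
	uint32_t seq_no;
};

int main(void)
{
	/* First 8 payload bytes of a hypothetical ESP packet. */
	unsigned char pkt[] = { 0x00, 0x00, 0x10, 0x01,   /* SPI 0x1001 */
				0x00, 0x00, 0x00, 0x2a }; /* seq 42     */
	struct esp_hdr esph;

	memcpy(&esph, pkt, sizeof(esph));
	printf("spi=0x%x seq=%u\n", ntohl(esph.spi), ntohl(esph.seq_no));
	return 0;
}
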
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index e4198502fd98..25192a3b0cd7 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -327,7 +327,6 @@ static int ipv6_srh_rcv(struct sk_buff *skb)
327 struct ipv6_sr_hdr *hdr; 327 struct ipv6_sr_hdr *hdr;
328 struct inet6_dev *idev; 328 struct inet6_dev *idev;
329 struct in6_addr *addr; 329 struct in6_addr *addr;
330 bool cleanup = false;
331 int accept_seg6; 330 int accept_seg6;
332 331
333 hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); 332 hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
@@ -351,11 +350,7 @@ static int ipv6_srh_rcv(struct sk_buff *skb)
351#endif 350#endif
352 351
353looped_back: 352looped_back:
354 if (hdr->segments_left > 0) { 353 if (hdr->segments_left == 0) {
355 if (hdr->nexthdr != NEXTHDR_IPV6 && hdr->segments_left == 1 &&
356 sr_has_cleanup(hdr))
357 cleanup = true;
358 } else {
359 if (hdr->nexthdr == NEXTHDR_IPV6) { 354 if (hdr->nexthdr == NEXTHDR_IPV6) {
360 int offset = (hdr->hdrlen + 1) << 3; 355 int offset = (hdr->hdrlen + 1) << 3;
361 356
@@ -393,7 +388,6 @@ looped_back:
393 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, 388 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
394 ((&hdr->segments_left) - 389 ((&hdr->segments_left) -
395 skb_network_header(skb))); 390 skb_network_header(skb)));
396 kfree_skb(skb);
397 return -1; 391 return -1;
398 } 392 }
399 393
@@ -418,21 +412,6 @@ looped_back:
418 412
419 ipv6_hdr(skb)->daddr = *addr; 413 ipv6_hdr(skb)->daddr = *addr;
420 414
421 if (cleanup) {
422 int srhlen = (hdr->hdrlen + 1) << 3;
423 int nh = hdr->nexthdr;
424
425 skb_pull_rcsum(skb, sizeof(struct ipv6hdr) + srhlen);
426 memmove(skb_network_header(skb) + srhlen,
427 skb_network_header(skb),
428 (unsigned char *)hdr - skb_network_header(skb));
429 skb->network_header += srhlen;
430 ipv6_hdr(skb)->nexthdr = nh;
431 ipv6_hdr(skb)->payload_len = htons(skb->len -
432 sizeof(struct ipv6hdr));
433 skb_push_rcsum(skb, sizeof(struct ipv6hdr));
434 }
435
436 skb_dst_drop(skb); 415 skb_dst_drop(skb);
437 416
438 ip6_route_input(skb); 417 ip6_route_input(skb);
@@ -453,13 +432,8 @@ looped_back:
453 } 432 }
454 ipv6_hdr(skb)->hop_limit--; 433 ipv6_hdr(skb)->hop_limit--;
455 434
456 /* be sure that srh is still present before reinjecting */ 435 skb_pull(skb, sizeof(struct ipv6hdr));
457 if (!cleanup) { 436 goto looped_back;
458 skb_pull(skb, sizeof(struct ipv6hdr));
459 goto looped_back;
460 }
461 skb_set_transport_header(skb, sizeof(struct ipv6hdr));
462 IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
463 } 437 }
464 438
465 dst_input(skb); 439 dst_input(skb);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 3036f665e6c8..230b5aac9f03 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -110,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = {
110 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, 110 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
111}; 111};
112 112
113/* Called with BH disabled */
113static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) 114static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
114{ 115{
115 struct sock *sk; 116 struct sock *sk;
116 117
117 local_bh_disable();
118
119 sk = icmpv6_sk(net); 118 sk = icmpv6_sk(net);
120 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { 119 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
121 /* This can happen if the output path (f.e. SIT or 120 /* This can happen if the output path (f.e. SIT or
122 * ip6ip6 tunnel) signals dst_link_failure() for an 121 * ip6ip6 tunnel) signals dst_link_failure() for an
123 * outgoing ICMP6 packet. 122 * outgoing ICMP6 packet.
124 */ 123 */
125 local_bh_enable();
126 return NULL; 124 return NULL;
127 } 125 }
128 return sk; 126 return sk;
@@ -130,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
130 128
131static __inline__ void icmpv6_xmit_unlock(struct sock *sk) 129static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
132{ 130{
133 spin_unlock_bh(&sk->sk_lock.slock); 131 spin_unlock(&sk->sk_lock.slock);
134} 132}
135 133
136/* 134/*
@@ -168,6 +166,30 @@ static bool is_ineligible(const struct sk_buff *skb)
168 return false; 166 return false;
169} 167}
170 168
169static bool icmpv6_mask_allow(int type)
170{
171 /* Informational messages are not limited. */
172 if (type & ICMPV6_INFOMSG_MASK)
173 return true;
174
175 /* Do not limit pmtu discovery, it would break it. */
176 if (type == ICMPV6_PKT_TOOBIG)
177 return true;
178
179 return false;
180}
181
182static bool icmpv6_global_allow(int type)
183{
184 if (icmpv6_mask_allow(type))
185 return true;
186
187 if (icmp_global_allow())
188 return true;
189
190 return false;
191}
192
171/* 193/*
172 * Check the ICMP output rate limit 194 * Check the ICMP output rate limit
173 */ 195 */
@@ -178,12 +200,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
178 struct dst_entry *dst; 200 struct dst_entry *dst;
179 bool res = false; 201 bool res = false;
180 202
181 /* Informational messages are not limited. */ 203 if (icmpv6_mask_allow(type))
182 if (type & ICMPV6_INFOMSG_MASK)
183 return true;
184
185 /* Do not limit pmtu discovery, it would break it. */
186 if (type == ICMPV6_PKT_TOOBIG)
187 return true; 204 return true;
188 205
189 /* 206 /*
@@ -200,20 +217,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
200 } else { 217 } else {
201 struct rt6_info *rt = (struct rt6_info *)dst; 218 struct rt6_info *rt = (struct rt6_info *)dst;
202 int tmo = net->ipv6.sysctl.icmpv6_time; 219 int tmo = net->ipv6.sysctl.icmpv6_time;
220 struct inet_peer *peer;
203 221
204 /* Give more bandwidth to wider prefixes. */ 222 /* Give more bandwidth to wider prefixes. */
205 if (rt->rt6i_dst.plen < 128) 223 if (rt->rt6i_dst.plen < 128)
206 tmo >>= ((128 - rt->rt6i_dst.plen)>>5); 224 tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
207 225
208 if (icmp_global_allow()) { 226 peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
209 struct inet_peer *peer; 227 res = inet_peer_xrlim_allow(peer, tmo);
210 228 if (peer)
211 peer = inet_getpeer_v6(net->ipv6.peers, 229 inet_putpeer(peer);
212 &fl6->daddr, 1);
213 res = inet_peer_xrlim_allow(peer, tmo);
214 if (peer)
215 inet_putpeer(peer);
216 }
217 } 230 }
218 dst_release(dst); 231 dst_release(dst);
219 return res; 232 return res;
@@ -474,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
474 return; 487 return;
475 } 488 }
476 489
490 /* Needed by both icmp_global_allow and icmpv6_xmit_lock */
491 local_bh_disable();
492
493 /* Check global sysctl_icmp_msgs_per_sec ratelimit */
494 if (!icmpv6_global_allow(type))
495 goto out_bh_enable;
496
477 mip6_addr_swap(skb); 497 mip6_addr_swap(skb);
478 498
479 memset(&fl6, 0, sizeof(fl6)); 499 memset(&fl6, 0, sizeof(fl6));
@@ -492,7 +512,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
492 512
493 sk = icmpv6_xmit_lock(net); 513 sk = icmpv6_xmit_lock(net);
494 if (!sk) 514 if (!sk)
495 return; 515 goto out_bh_enable;
516
496 sk->sk_mark = mark; 517 sk->sk_mark = mark;
497 np = inet6_sk(sk); 518 np = inet6_sk(sk);
498 519
@@ -552,6 +573,8 @@ out_dst_release:
552 dst_release(dst); 573 dst_release(dst);
553out: 574out:
554 icmpv6_xmit_unlock(sk); 575 icmpv6_xmit_unlock(sk);
576out_bh_enable:
577 local_bh_enable();
555} 578}
556 579
557/* Slightly more convenient version of icmp6_send. 580/* Slightly more convenient version of icmp6_send.
@@ -665,9 +688,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
665 fl6.flowi6_uid = sock_net_uid(net, NULL); 688 fl6.flowi6_uid = sock_net_uid(net, NULL);
666 security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); 689 security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
667 690
691 local_bh_disable();
668 sk = icmpv6_xmit_lock(net); 692 sk = icmpv6_xmit_lock(net);
669 if (!sk) 693 if (!sk)
670 return; 694 goto out_bh_enable;
671 sk->sk_mark = mark; 695 sk->sk_mark = mark;
672 np = inet6_sk(sk); 696 np = inet6_sk(sk);
673 697
@@ -709,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
709 dst_release(dst); 733 dst_release(dst);
710out: 734out:
711 icmpv6_xmit_unlock(sk); 735 icmpv6_xmit_unlock(sk);
736out_bh_enable:
737 local_bh_enable();
712} 738}
713 739
714void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) 740void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
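
The rate-limit rework above splits the exemption test into icmpv6_mask_allow(), checks the global sysctl_icmp_msgs_per_sec budget up front, and moves local_bh_disable() into the callers so that both the global check and the icmpv6_xmit_lock() trylock run with BH disabled, released through the new out_bh_enable label. A userspace model of the exemption rule (constants follow the ICMPv6 numbering, but the function is a simplified stand-in, not the kernel implementation):

/* Userspace model of icmpv6_mask_allow(): ICMPv6 types with the high
 * bit set (>= 128) are informational and never rate limited, and Packet
 * Too Big (type 2) is exempted so PMTU discovery keeps working.
 */
#include <stdbool.h>
#include <stdio.h>

#define INFOMSG_MASK	0x80	/* types 128..255 are informational */
#define PKT_TOOBIG	2
#define DEST_UNREACH	1
#define ECHO_REQUEST	128

static bool mask_allow(int type)
{
	if (type & INFOMSG_MASK)
		return true;	/* echo, ND, MLD, ... */
	if (type == PKT_TOOBIG)
		return true;	/* needed for path MTU discovery */
	return false;		/* error messages: apply rate limits */
}

int main(void)
{
	printf("echo request exempt   : %d\n", mask_allow(ECHO_REQUEST));
	printf("packet too big exempt : %d\n", mask_allow(PKT_TOOBIG));
	printf("dest unreach exempt   : %d\n", mask_allow(DEST_UNREACH));
	return 0;
}
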
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 13b5e85fe0d5..ce1aae4a7fc8 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -115,7 +115,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
115 [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, 115 [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
116}; 116};
117 117
118static int ila_build_state(struct net_device *dev, struct nlattr *nla, 118static int ila_build_state(struct nlattr *nla,
119 unsigned int family, const void *cfg, 119 unsigned int family, const void *cfg,
120 struct lwtunnel_state **ts) 120 struct lwtunnel_state **ts)
121{ 121{
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 75c308239243..9a31d13bf180 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -28,46 +28,6 @@
28#include <net/inet6_connection_sock.h> 28#include <net/inet6_connection_sock.h>
29#include <net/sock_reuseport.h> 29#include <net/sock_reuseport.h>
30 30
31int inet6_csk_bind_conflict(const struct sock *sk,
32 const struct inet_bind_bucket *tb, bool relax,
33 bool reuseport_ok)
34{
35 const struct sock *sk2;
36 bool reuse = !!sk->sk_reuse;
37 bool reuseport = !!sk->sk_reuseport && reuseport_ok;
38 kuid_t uid = sock_i_uid((struct sock *)sk);
39
40 /* We must walk the whole port owner list in this case. -DaveM */
41 /*
42 * See comment in inet_csk_bind_conflict about sock lookup
43 * vs net namespaces issues.
44 */
45 sk_for_each_bound(sk2, &tb->owners) {
46 if (sk != sk2 &&
47 (!sk->sk_bound_dev_if ||
48 !sk2->sk_bound_dev_if ||
49 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
50 if ((!reuse || !sk2->sk_reuse ||
51 sk2->sk_state == TCP_LISTEN) &&
52 (!reuseport || !sk2->sk_reuseport ||
53 rcu_access_pointer(sk->sk_reuseport_cb) ||
54 (sk2->sk_state != TCP_TIME_WAIT &&
55 !uid_eq(uid,
56 sock_i_uid((struct sock *)sk2))))) {
57 if (ipv6_rcv_saddr_equal(sk, sk2, true))
58 break;
59 }
60 if (!relax && reuse && sk2->sk_reuse &&
61 sk2->sk_state != TCP_LISTEN &&
62 ipv6_rcv_saddr_equal(sk, sk2, true))
63 break;
64 }
65 }
66
67 return sk2 != NULL;
68}
69EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
70
71struct dst_entry *inet6_csk_route_req(const struct sock *sk, 31struct dst_entry *inet6_csk_route_req(const struct sock *sk,
72 struct flowi6 *fl6, 32 struct flowi6 *fl6,
73 const struct request_sock *req, 33 const struct request_sock *req,
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 02761c9fe43e..d0900918a19e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -268,54 +268,10 @@ int inet6_hash(struct sock *sk)
268 268
269 if (sk->sk_state != TCP_CLOSE) { 269 if (sk->sk_state != TCP_CLOSE) {
270 local_bh_disable(); 270 local_bh_disable();
271 err = __inet_hash(sk, NULL, ipv6_rcv_saddr_equal); 271 err = __inet_hash(sk, NULL);
272 local_bh_enable(); 272 local_bh_enable();
273 } 273 }
274 274
275 return err; 275 return err;
276} 276}
277EXPORT_SYMBOL_GPL(inet6_hash); 277EXPORT_SYMBOL_GPL(inet6_hash);
278
279/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
280 * only, and any IPv4 addresses if not IPv6 only
281 * match_wildcard == false: addresses must be exactly the same, i.e.
282 * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
283 * and 0.0.0.0 equals to 0.0.0.0 only
284 */
285int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
286 bool match_wildcard)
287{
288 const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
289 int sk2_ipv6only = inet_v6_ipv6only(sk2);
290 int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
291 int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
292
293 /* if both are mapped, treat as IPv4 */
294 if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
295 if (!sk2_ipv6only) {
296 if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
297 return 1;
298 if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
299 return match_wildcard;
300 }
301 return 0;
302 }
303
304 if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
305 return 1;
306
307 if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
308 !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
309 return 1;
310
311 if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
312 !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
313 return 1;
314
315 if (sk2_rcv_saddr6 &&
316 ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
317 return 1;
318
319 return 0;
320}
321EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal);
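
The removed comment spells out the match_wildcard contract that the relocated ipv6_rcv_saddr_equal() keeps: with match_wildcard set, :: can stand for any address (subject to the v6-only and v4-mapped exclusions), otherwise addresses must be exactly equal. A heavily simplified userspace model of just the wildcard rule, with the v6-only flag and the real address compare deliberately omitted:

/* Simplified model of the rcv_saddr matching rules described above.
 * Addresses are reduced to three kinds; a real implementation would
 * also honour the v6-only socket flag and compare the actual bytes.
 */
#include <stdbool.h>
#include <stdio.h>

enum kind { ADDR_ANY, ADDR_MAPPED, ADDR_SPECIFIC };

static bool saddr_match(enum kind a, enum kind b, bool match_wildcard)
{
	if (a == ADDR_ANY && b == ADDR_ANY)
		return true;		/* :: always equals :: */
	if (a == ADDR_ANY || b == ADDR_ANY)
		return match_wildcard;	/* wildcard matches the rest */
	return a == b;			/* stand-in for the real compare */
}

int main(void)
{
	printf(":: vs 2001:db8::1, wildcard on : %d\n",
	       saddr_match(ADDR_ANY, ADDR_SPECIFIC, true));
	printf(":: vs 2001:db8::1, wildcard off: %d\n",
	       saddr_match(ADDR_ANY, ADDR_SPECIFIC, false));
	printf("mapped vs mapped               : %d\n",
	       saddr_match(ADDR_MAPPED, ADDR_MAPPED, true));
	return 0;
}
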
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ef5485204522..d4bf2c68a545 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -318,6 +318,16 @@ static int fib6_dump_node(struct fib6_walker *w)
318 w->leaf = rt; 318 w->leaf = rt;
319 return 1; 319 return 1;
320 } 320 }
321
322 /* Multipath routes are dumped in one route with the
323 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
324 * last sibling of this route (no need to dump the
325 * sibling routes again)
326 */
327 if (rt->rt6i_nsiblings)
328 rt = list_last_entry(&rt->rt6i_siblings,
329 struct rt6_info,
330 rt6i_siblings);
321 } 331 }
322 w->leaf = NULL; 332 w->leaf = NULL;
323 return 0; 333 return 0;
@@ -746,6 +756,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
746 u16 nlflags = NLM_F_EXCL; 756 u16 nlflags = NLM_F_EXCL;
747 int err; 757 int err;
748 758
759 if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
760 nlflags |= NLM_F_APPEND;
761
749 ins = &fn->leaf; 762 ins = &fn->leaf;
750 763
751 for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { 764 for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
@@ -868,7 +881,8 @@ add:
868 *ins = rt; 881 *ins = rt;
869 rt->rt6i_node = fn; 882 rt->rt6i_node = fn;
870 atomic_inc(&rt->rt6i_ref); 883 atomic_inc(&rt->rt6i_ref);
871 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 884 if (!info->skip_notify)
885 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
872 info->nl_net->ipv6.rt6_stats->fib_rt_entries++; 886 info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
873 887
874 if (!(fn->fn_flags & RTN_RTINFO)) { 888 if (!(fn->fn_flags & RTN_RTINFO)) {
@@ -894,7 +908,8 @@ add:
894 rt->rt6i_node = fn; 908 rt->rt6i_node = fn;
895 rt->dst.rt6_next = iter->dst.rt6_next; 909 rt->dst.rt6_next = iter->dst.rt6_next;
896 atomic_inc(&rt->rt6i_ref); 910 atomic_inc(&rt->rt6i_ref);
897 inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); 911 if (!info->skip_notify)
912 inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
898 if (!(fn->fn_flags & RTN_RTINFO)) { 913 if (!(fn->fn_flags & RTN_RTINFO)) {
899 info->nl_net->ipv6.rt6_stats->fib_route_nodes++; 914 info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
900 fn->fn_flags |= RTN_RTINFO; 915 fn->fn_flags |= RTN_RTINFO;
@@ -908,6 +923,8 @@ add:
908 ins = &rt->dst.rt6_next; 923 ins = &rt->dst.rt6_next;
909 iter = *ins; 924 iter = *ins;
910 while (iter) { 925 while (iter) {
926 if (iter->rt6i_metric > rt->rt6i_metric)
927 break;
911 if (rt6_qualify_for_ecmp(iter)) { 928 if (rt6_qualify_for_ecmp(iter)) {
912 *ins = iter->dst.rt6_next; 929 *ins = iter->dst.rt6_next;
913 fib6_purge_rt(iter, fn, info->nl_net); 930 fib6_purge_rt(iter, fn, info->nl_net);
@@ -1439,7 +1456,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
1439 1456
1440 fib6_purge_rt(rt, fn, net); 1457 fib6_purge_rt(rt, fn, net);
1441 1458
1442 inet6_rt_notify(RTM_DELROUTE, rt, info, 0); 1459 if (!info->skip_notify)
1460 inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
1443 rt6_release(rt); 1461 rt6_release(rt);
1444} 1462}
1445 1463
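
The fib6_dump_node() hunk above emits an ECMP group once, as a single route carrying RTA_MULTIPATH, and then jumps the walker to the last sibling so the remaining group members are not dumped again. A userspace model of that skip over an invented route table:

/* Userspace model of the dump-skip added in fib6_dump_node(): the head
 * of an ECMP group is emitted once (conceptually with all nexthops in
 * RTA_MULTIPATH) and the walk then jumps past its siblings.
 */
#include <stdio.h>

struct route {
	const char *prefix;
	int nsiblings;	/* other members of the same ECMP group */
};

int main(void)
{
	struct route tbl[] = {
		{ "2001:db8:1::/64", 2 },  /* ECMP head, 2 siblings follow */
		{ "2001:db8:1::/64", 0 },  /* sibling: skipped in dump     */
		{ "2001:db8:1::/64", 0 },  /* sibling: skipped in dump     */
		{ "2001:db8:2::/64", 0 },  /* ordinary single-path route   */
	};
	int n = sizeof(tbl) / sizeof(tbl[0]);

	for (int i = 0; i < n; i++) {
		printf("dump %s (%d nexthops)\n",
		       tbl[i].prefix, tbl[i].nsiblings + 1);
		i += tbl[i].nsiblings;	/* jump to the last sibling */
	}
	return 0;
}
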
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 558631860d91..6fcb7cb49bb2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -367,35 +367,37 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
367 367
368 368
369static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, 369static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
370 u8 type, u8 code, int offset, __be32 info) 370 u8 type, u8 code, int offset, __be32 info)
371{ 371{
372 const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; 372 const struct gre_base_hdr *greh;
373 __be16 *p = (__be16 *)(skb->data + offset); 373 const struct ipv6hdr *ipv6h;
374 int grehlen = offset + 4; 374 int grehlen = sizeof(*greh);
375 struct ip6_tnl *t; 375 struct ip6_tnl *t;
376 int key_off = 0;
376 __be16 flags; 377 __be16 flags;
378 __be32 key;
377 379
378 flags = p[0]; 380 if (!pskb_may_pull(skb, offset + grehlen))
379 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { 381 return;
380 if (flags&(GRE_VERSION|GRE_ROUTING)) 382 greh = (const struct gre_base_hdr *)(skb->data + offset);
381 return; 383 flags = greh->flags;
382 if (flags&GRE_KEY) { 384 if (flags & (GRE_VERSION | GRE_ROUTING))
383 grehlen += 4; 385 return;
384 if (flags&GRE_CSUM) 386 if (flags & GRE_CSUM)
385 grehlen += 4; 387 grehlen += 4;
386 } 388 if (flags & GRE_KEY) {
389 key_off = grehlen + offset;
390 grehlen += 4;
387 } 391 }
388 392
389 /* If only 8 bytes returned, keyed message will be dropped here */ 393 if (!pskb_may_pull(skb, offset + grehlen))
390 if (!pskb_may_pull(skb, grehlen))
391 return; 394 return;
392 ipv6h = (const struct ipv6hdr *)skb->data; 395 ipv6h = (const struct ipv6hdr *)skb->data;
393 p = (__be16 *)(skb->data + offset); 396 greh = (const struct gre_base_hdr *)(skb->data + offset);
397 key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
394 398
395 t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, 399 t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
396 flags & GRE_KEY ? 400 key, greh->protocol);
397 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
398 p[1]);
399 if (!t) 401 if (!t)
400 return; 402 return;
401 403
@@ -484,11 +486,6 @@ drop:
484 return 0; 486 return 0;
485} 487}
486 488
487struct ipv6_tel_txoption {
488 struct ipv6_txoptions ops;
489 __u8 dst_opt[8];
490};
491
492static int gre_handle_offloads(struct sk_buff *skb, bool csum) 489static int gre_handle_offloads(struct sk_buff *skb, bool csum)
493{ 490{
494 return iptunnel_handle_offloads(skb, 491 return iptunnel_handle_offloads(skb,
@@ -1001,6 +998,9 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
1001 dev->flags |= IFF_NOARP; 998 dev->flags |= IFF_NOARP;
1002 dev->addr_len = sizeof(struct in6_addr); 999 dev->addr_len = sizeof(struct in6_addr);
1003 netif_keep_dst(dev); 1000 netif_keep_dst(dev);
1001 /* This perm addr will be used as interface identifier by IPv6 */
1002 dev->addr_assign_type = NET_ADDR_RANDOM;
1003 eth_random_addr(dev->perm_addr);
1004} 1004}
1005 1005
1006static int ip6gre_tunnel_init_common(struct net_device *dev) 1006static int ip6gre_tunnel_init_common(struct net_device *dev)
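
The rewritten ip6gre_err() validates the returned GRE header in two steps: pull the 4-byte base header, reject routing/version bits, then extend the length by 4 bytes for a checksum and by 4 for a key while remembering where the key sits, and only then pull again and read the key for the tunnel lookup. A userspace sketch of that length/offset computation (GRE flag values given in host order purely for illustration):

/* Userspace sketch of the header-length/key-offset computation in the
 * new ip6gre_err(): 4-byte GRE base header, plus 4 if the checksum bit
 * is set, plus 4 for the key, whose offset is recorded for the lookup.
 */
#include <stdio.h>

#define GRE_CSUM	0x8000
#define GRE_ROUTING	0x4000
#define GRE_KEY		0x2000
#define GRE_VERSION	0x0007

int main(void)
{
	unsigned short flags = GRE_CSUM | GRE_KEY;
	int grehlen = 4;	/* sizeof(struct gre_base_hdr) */
	int key_off = 0;

	if (flags & (GRE_VERSION | GRE_ROUTING)) {
		printf("unsupported GRE packet\n");
		return 0;
	}
	if (flags & GRE_CSUM)
		grehlen += 4;
	if (flags & GRE_KEY) {
		key_off = grehlen;	/* key sits after base + csum */
		grehlen += 4;
	}
	printf("header length %d, key at offset %d\n", grehlen, key_off);
	return 0;
}
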
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index aacfb4bce153..c45b12b4431c 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -122,11 +122,14 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
122 max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); 122 max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
123 /* 123 /*
124 * RFC4291 2.5.3 124 * RFC4291 2.5.3
125 * The loopback address must not be used as the source address in IPv6
126 * packets that are sent outside of a single node. [..]
125 * A packet received on an interface with a destination address 127 * A packet received on an interface with a destination address
126 * of loopback must be dropped. 128 * of loopback must be dropped.
127 */ 129 */
128 if (!(dev->flags & IFF_LOOPBACK) && 130 if ((ipv6_addr_loopback(&hdr->saddr) ||
129 ipv6_addr_loopback(&hdr->daddr)) 131 ipv6_addr_loopback(&hdr->daddr)) &&
132 !(dev->flags & IFF_LOOPBACK))
130 goto err; 133 goto err;
131 134
132 /* RFC4291 Errata ID: 3480 135 /* RFC4291 Errata ID: 3480
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index fc7b4017ba24..93e58a5e1837 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -253,7 +253,7 @@ out_unlock:
253 rcu_read_unlock(); 253 rcu_read_unlock();
254 254
255out: 255out:
256 NAPI_GRO_CB(skb)->flush |= flush; 256 skb_gro_flush_final(skb, pp, flush);
257 257
258 return pp; 258 return pp;
259} 259}
@@ -294,8 +294,10 @@ static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
294 struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); 294 struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
295 int err = -ENOSYS; 295 int err = -ENOSYS;
296 296
297 if (skb->encapsulation) 297 if (skb->encapsulation) {
298 skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
298 skb_set_inner_network_header(skb, nhoff); 299 skb_set_inner_network_header(skb, nhoff);
300 }
299 301
300 iph->payload_len = htons(skb->len - nhoff - sizeof(*iph)); 302 iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
301 303
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 2c0df09e9036..58f6288e9ba5 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -119,7 +119,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
119 if (unlikely(!neigh)) 119 if (unlikely(!neigh))
120 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); 120 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
121 if (!IS_ERR(neigh)) { 121 if (!IS_ERR(neigh)) {
122 ret = dst_neigh_output(dst, neigh, skb); 122 sock_confirm_neigh(skb, neigh);
123 ret = neigh_output(neigh, skb);
123 rcu_read_unlock_bh(); 124 rcu_read_unlock_bh();
124 return ret; 125 return ret;
125 } 126 }
@@ -767,13 +768,14 @@ slow_path:
767 * Fragment the datagram. 768 * Fragment the datagram.
768 */ 769 */
769 770
770 *prevhdr = NEXTHDR_FRAGMENT;
771 troom = rt->dst.dev->needed_tailroom; 771 troom = rt->dst.dev->needed_tailroom;
772 772
773 /* 773 /*
774 * Keep copying data until we run out. 774 * Keep copying data until we run out.
775 */ 775 */
776 while (left > 0) { 776 while (left > 0) {
777 u8 *fragnexthdr_offset;
778
777 len = left; 779 len = left;
778 /* IF: it doesn't fit, use 'mtu' - the data space left */ 780 /* IF: it doesn't fit, use 'mtu' - the data space left */
779 if (len > mtu) 781 if (len > mtu)
@@ -818,6 +820,10 @@ slow_path:
818 */ 820 */
819 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); 821 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
820 822
823 fragnexthdr_offset = skb_network_header(frag);
824 fragnexthdr_offset += prevhdr - skb_network_header(skb);
825 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
826
821 /* 827 /*
822 * Build fragment header. 828 * Build fragment header.
823 */ 829 */
@@ -1021,6 +1027,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1021 } 1027 }
1022 } 1028 }
1023#endif 1029#endif
1030 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1031 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1032 err = -EAFNOSUPPORT;
1033 goto out_err_release;
1034 }
1024 1035
1025 return 0; 1036 return 0;
1026 1037
@@ -1144,6 +1155,9 @@ static inline int ip6_ufo_append_data(struct sock *sk,
1144 skb->protocol = htons(ETH_P_IPV6); 1155 skb->protocol = htons(ETH_P_IPV6);
1145 skb->csum = 0; 1156 skb->csum = 0;
1146 1157
1158 if (flags & MSG_CONFIRM)
1159 skb_set_dst_pending_confirm(skb, 1);
1160
1147 __skb_queue_tail(queue, skb); 1161 __skb_queue_tail(queue, skb);
1148 } else if (skb_is_gso(skb)) { 1162 } else if (skb_is_gso(skb)) {
1149 goto append; 1163 goto append;
@@ -1344,7 +1358,7 @@ emsgsize:
1344 */ 1358 */
1345 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1359 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1346 headersize == sizeof(struct ipv6hdr) && 1360 headersize == sizeof(struct ipv6hdr) &&
1347 length < mtu - headersize && 1361 length <= mtu - headersize &&
1348 !(flags & MSG_MORE) && 1362 !(flags & MSG_MORE) &&
1349 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1363 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1350 csummode = CHECKSUM_PARTIAL; 1364 csummode = CHECKSUM_PARTIAL;
@@ -1376,7 +1390,7 @@ emsgsize:
1376 if ((((length + fragheaderlen) > mtu) || 1390 if ((((length + fragheaderlen) > mtu) ||
1377 (skb && skb_is_gso(skb))) && 1391 (skb && skb_is_gso(skb))) &&
1378 (sk->sk_protocol == IPPROTO_UDP) && 1392 (sk->sk_protocol == IPPROTO_UDP) &&
1379 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && 1393 (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
1380 (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { 1394 (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1381 err = ip6_ufo_append_data(sk, queue, getfrag, from, length, 1395 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1382 hh_len, fragheaderlen, exthdrlen, 1396 hh_len, fragheaderlen, exthdrlen,
@@ -1516,6 +1530,9 @@ alloc_new_skb:
1516 exthdrlen = 0; 1530 exthdrlen = 0;
1517 dst_exthdrlen = 0; 1531 dst_exthdrlen = 0;
1518 1532
1533 if ((flags & MSG_CONFIRM) && !skb_prev)
1534 skb_set_dst_pending_confirm(skb, 1);
1535
1519 /* 1536 /*
1520 * Put the packet on the pending queue 1537 * Put the packet on the pending queue
1521 */ 1538 */
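
In the fragmentation slow path above, the original packet's next-header byte is no longer patched in place before copying; instead each fragment's private copy of the header block is patched at the same relative offset, computed by rebasing prevhdr from the original buffer onto the fragment's copy. A userspace sketch of that pointer rebasing (buffer contents are illustrative):

/* Userspace sketch of the per-fragment offset translation: prevhdr
 * points into the original header block, and the same byte must be
 * patched in each fragment's copy, so the pointer is rebased as
 * copy + (prevhdr - original).
 */
#include <stdio.h>
#include <string.h>

#define NEXTHDR_FRAGMENT 44	/* IPv6 fragment header protocol number */

int main(void)
{
	/* byte 6 is the nexthdr field; 58 (ICMPv6) used as an example */
	unsigned char orig_hdrs[16] = { 0x60, 0, 0, 0, 0, 0, 58 };
	unsigned char frag_hdrs[16];
	unsigned char *prevhdr = &orig_hdrs[6];
	unsigned char *patch;

	memcpy(frag_hdrs, orig_hdrs, sizeof(frag_hdrs));

	/* Rebase the pointer into the fragment's copy, then patch it. */
	patch = frag_hdrs + (prevhdr - orig_hdrs);
	*patch = NEXTHDR_FRAGMENT;

	printf("original nexthdr: %d, fragment nexthdr: %d\n",
	       orig_hdrs[6], frag_hdrs[6]);
	return 0;
}
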
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index ff8ee06491c3..75fac933c209 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -441,7 +441,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
441 if (i + sizeof(*tel) > optlen) 441 if (i + sizeof(*tel) > optlen)
442 break; 442 break;
443 443
444 tel = (struct ipv6_tlv_tnl_enc_lim *) skb->data + off + i; 444 tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i);
445 /* return index of option if found and valid */ 445 /* return index of option if found and valid */
446 if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && 446 if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT &&
447 tel->length == 1) 447 tel->length == 1)
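
The one-character fix above is a cast-precedence bug: without the parentheses the offset is scaled by sizeof(struct ipv6_tlv_tnl_enc_lim) instead of being applied in bytes, so the TLV walk lands on the wrong data. A small userspace demonstration, using a stand-in TLV struct:

/* Userspace demonstration of the precedence bug: a cast binds tighter
 * than '+', so '(struct tlv *)data + off' advances by
 * off * sizeof(struct tlv), while '(struct tlv *)(data + off)' advances
 * by off bytes, which is what TLV walking needs.
 */
#include <stdio.h>

struct tlv {
	unsigned char type;
	unsigned char length;
	unsigned char value[6];
};

int main(void)
{
	unsigned char data[64];
	int off = 3;

	struct tlv *wrong = (struct tlv *)data + off;	/* data + 24 bytes */
	struct tlv *right = (struct tlv *)(data + off);	/* data + 3 bytes  */

	printf("wrong offset: %td bytes\n", (unsigned char *)wrong - data);
	printf("right offset: %td bytes\n", (unsigned char *)right - data);
	return 0;
}
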
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index d82042c8d8fd..3d8a3b63b4fd 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -49,6 +49,7 @@
49#include <net/xfrm.h> 49#include <net/xfrm.h>
50#include <net/net_namespace.h> 50#include <net/net_namespace.h>
51#include <net/netns/generic.h> 51#include <net/netns/generic.h>
52#include <linux/etherdevice.h>
52 53
53#define IP6_VTI_HASH_SIZE_SHIFT 5 54#define IP6_VTI_HASH_SIZE_SHIFT 5
54#define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT) 55#define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT)
@@ -484,11 +485,15 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
484 if (!skb->ignore_df && skb->len > mtu) { 485 if (!skb->ignore_df && skb->len > mtu) {
485 skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu); 486 skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu);
486 487
487 if (skb->protocol == htons(ETH_P_IPV6)) 488 if (skb->protocol == htons(ETH_P_IPV6)) {
489 if (mtu < IPV6_MIN_MTU)
490 mtu = IPV6_MIN_MTU;
491
488 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 492 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
489 else 493 } else {
490 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 494 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
491 htonl(mtu)); 495 htonl(mtu));
496 }
492 497
493 return -EMSGSIZE; 498 return -EMSGSIZE;
494 } 499 }
@@ -692,6 +697,10 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
692 u->link = p->link; 697 u->link = p->link;
693 u->i_key = p->i_key; 698 u->i_key = p->i_key;
694 u->o_key = p->o_key; 699 u->o_key = p->o_key;
700 if (u->i_key)
701 u->i_flags |= GRE_KEY;
702 if (u->o_key)
703 u->o_flags |= GRE_KEY;
695 u->proto = p->proto; 704 u->proto = p->proto;
696 705
697 memcpy(u->name, p->name, sizeof(u->name)); 706 memcpy(u->name, p->name, sizeof(u->name));
@@ -842,6 +851,9 @@ static void vti6_dev_setup(struct net_device *dev)
842 dev->flags |= IFF_NOARP; 851 dev->flags |= IFF_NOARP;
843 dev->addr_len = sizeof(struct in6_addr); 852 dev->addr_len = sizeof(struct in6_addr);
844 netif_keep_dst(dev); 853 netif_keep_dst(dev);
854 /* This perm addr will be used as interface identifier by IPv6 */
855 dev->addr_assign_type = NET_ADDR_RANDOM;
856 eth_random_addr(dev->perm_addr);
845} 857}
846 858
847/** 859/**
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 604d8953c775..bf34d0950752 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -774,7 +774,8 @@ failure:
774 * Delete a VIF entry 774 * Delete a VIF entry
775 */ 775 */
776 776
777static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head) 777static int mif6_delete(struct mr6_table *mrt, int vifi, int notify,
778 struct list_head *head)
778{ 779{
779 struct mif_device *v; 780 struct mif_device *v;
780 struct net_device *dev; 781 struct net_device *dev;
@@ -820,7 +821,7 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
820 dev->ifindex, &in6_dev->cnf); 821 dev->ifindex, &in6_dev->cnf);
821 } 822 }
822 823
823 if (v->flags & MIFF_REGISTER) 824 if ((v->flags & MIFF_REGISTER) && !notify)
824 unregister_netdevice_queue(dev, head); 825 unregister_netdevice_queue(dev, head);
825 826
826 dev_put(dev); 827 dev_put(dev);
@@ -1331,7 +1332,6 @@ static int ip6mr_device_event(struct notifier_block *this,
1331 struct mr6_table *mrt; 1332 struct mr6_table *mrt;
1332 struct mif_device *v; 1333 struct mif_device *v;
1333 int ct; 1334 int ct;
1334 LIST_HEAD(list);
1335 1335
1336 if (event != NETDEV_UNREGISTER) 1336 if (event != NETDEV_UNREGISTER)
1337 return NOTIFY_DONE; 1337 return NOTIFY_DONE;
@@ -1340,10 +1340,9 @@ static int ip6mr_device_event(struct notifier_block *this,
1340 v = &mrt->vif6_table[0]; 1340 v = &mrt->vif6_table[0];
1341 for (ct = 0; ct < mrt->maxvif; ct++, v++) { 1341 for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1342 if (v->dev == dev) 1342 if (v->dev == dev)
1343 mif6_delete(mrt, ct, &list); 1343 mif6_delete(mrt, ct, 1, NULL);
1344 } 1344 }
1345 } 1345 }
1346 unregister_netdevice_many(&list);
1347 1346
1348 return NOTIFY_DONE; 1347 return NOTIFY_DONE;
1349} 1348}
@@ -1552,7 +1551,7 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all)
1552 for (i = 0; i < mrt->maxvif; i++) { 1551 for (i = 0; i < mrt->maxvif; i++) {
1553 if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC)) 1552 if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC))
1554 continue; 1553 continue;
1555 mif6_delete(mrt, i, &list); 1554 mif6_delete(mrt, i, 0, &list);
1556 } 1555 }
1557 unregister_netdevice_many(&list); 1556 unregister_netdevice_many(&list);
1558 1557
@@ -1666,6 +1665,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
1666 struct net *net = sock_net(sk); 1665 struct net *net = sock_net(sk);
1667 struct mr6_table *mrt; 1666 struct mr6_table *mrt;
1668 1667
1668 if (sk->sk_type != SOCK_RAW ||
1669 inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
1670 return -EOPNOTSUPP;
1671
1669 mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); 1672 mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
1670 if (!mrt) 1673 if (!mrt)
1671 return -ENOENT; 1674 return -ENOENT;
@@ -1677,9 +1680,6 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
1677 1680
1678 switch (optname) { 1681 switch (optname) {
1679 case MRT6_INIT: 1682 case MRT6_INIT:
1680 if (sk->sk_type != SOCK_RAW ||
1681 inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
1682 return -EOPNOTSUPP;
1683 if (optlen < sizeof(int)) 1683 if (optlen < sizeof(int))
1684 return -EINVAL; 1684 return -EINVAL;
1685 1685
@@ -1706,7 +1706,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
1706 if (copy_from_user(&mifi, optval, sizeof(mifi_t))) 1706 if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
1707 return -EFAULT; 1707 return -EFAULT;
1708 rtnl_lock(); 1708 rtnl_lock();
1709 ret = mif6_delete(mrt, mifi, NULL); 1709 ret = mif6_delete(mrt, mifi, 0, NULL);
1710 rtnl_unlock(); 1710 rtnl_unlock();
1711 return ret; 1711 return ret;
1712 1712
@@ -1815,6 +1815,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
1815 struct net *net = sock_net(sk); 1815 struct net *net = sock_net(sk);
1816 struct mr6_table *mrt; 1816 struct mr6_table *mrt;
1817 1817
1818 if (sk->sk_type != SOCK_RAW ||
1819 inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
1820 return -EOPNOTSUPP;
1821
1818 mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); 1822 mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
1819 if (!mrt) 1823 if (!mrt)
1820 return -ENOENT; 1824 return -ENOENT;
@@ -2243,8 +2247,10 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
2243 int ct; 2247 int ct;
2244 2248
2245 /* If cache is unresolved, don't try to parse IIF and OIF */ 2249 /* If cache is unresolved, don't try to parse IIF and OIF */
2246 if (c->mf6c_parent >= MAXMIFS) 2250 if (c->mf6c_parent >= MAXMIFS) {
2251 rtm->rtm_flags |= RTNH_F_UNRESOLVED;
2247 return -ENOENT; 2252 return -ENOENT;
2253 }
2248 2254
2249 if (MIF_EXISTS(mrt, c->mf6c_parent) && 2255 if (MIF_EXISTS(mrt, c->mf6c_parent) &&
2250 nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0) 2256 nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0)
@@ -2286,7 +2292,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
2286} 2292}
2287 2293
2288int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, 2294int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
2289 int nowait, u32 portid) 2295 u32 portid)
2290{ 2296{
2291 int err; 2297 int err;
2292 struct mr6_table *mrt; 2298 struct mr6_table *mrt;
@@ -2313,11 +2319,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
2313 struct net_device *dev; 2319 struct net_device *dev;
2314 int vif; 2320 int vif;
2315 2321
2316 if (nowait) {
2317 read_unlock(&mrt_lock);
2318 return -EAGAIN;
2319 }
2320
2321 dev = skb->dev; 2322 dev = skb->dev;
2322 if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { 2323 if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
2323 read_unlock(&mrt_lock); 2324 read_unlock(&mrt_lock);
@@ -2355,7 +2356,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
2355 return err; 2356 return err;
2356 } 2357 }
2357 2358
2358 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) 2359 if (rtm->rtm_flags & RTM_F_NOTIFY)
2359 cache->mfc_flags |= MFC_NOTIFY; 2360 cache->mfc_flags |= MFC_NOTIFY;
2360 2361
2361 err = __ip6mr_fill_mroute(mrt, skb, cache, rtm); 2362 err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index ee97c44e2aa0..a531ba032b85 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -595,16 +595,24 @@ done:
595 595
596 if (val) { 596 if (val) {
597 struct net_device *dev; 597 struct net_device *dev;
598 int midx;
598 599
599 if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) 600 rcu_read_lock();
600 goto e_inval;
601 601
602 dev = dev_get_by_index(net, val); 602 dev = dev_get_by_index_rcu(net, val);
603 if (!dev) { 603 if (!dev) {
604 rcu_read_unlock();
604 retv = -ENODEV; 605 retv = -ENODEV;
605 break; 606 break;
606 } 607 }
607 dev_put(dev); 608 midx = l3mdev_master_ifindex_rcu(dev);
609
610 rcu_read_unlock();
611
612 if (sk->sk_bound_dev_if &&
613 sk->sk_bound_dev_if != val &&
614 (!midx || midx != sk->sk_bound_dev_if))
615 goto e_inval;
608 } 616 }
609 np->mcast_oif = val; 617 np->mcast_oif = val;
610 retv = 0; 618 retv = 0;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 7139fffd61b6..1bdc703cb966 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -779,6 +779,7 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
779 psf->sf_crcount = im->mca_crcount; 779 psf->sf_crcount = im->mca_crcount;
780 } 780 }
781 in6_dev_put(pmc->idev); 781 in6_dev_put(pmc->idev);
782 kfree(pmc);
782 } 783 }
783 spin_unlock_bh(&im->mca_lock); 784 spin_unlock_bh(&im->mca_lock);
784} 785}
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 25a022d41a70..1e15c54fd5e2 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -855,10 +855,6 @@ copy_entries_to_user(unsigned int total_size,
855 return PTR_ERR(counters); 855 return PTR_ERR(counters);
856 856
857 loc_cpu_entry = private->entries; 857 loc_cpu_entry = private->entries;
858 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
859 ret = -EFAULT;
860 goto free_counters;
861 }
862 858
863 /* FIXME: use iterator macros --RR */ 859 /* FIXME: use iterator macros --RR */
864 /* ... then go back and fix counters and names */ 860 /* ... then go back and fix counters and names */
@@ -868,6 +864,10 @@ copy_entries_to_user(unsigned int total_size,
868 const struct xt_entry_target *t; 864 const struct xt_entry_target *t;
869 865
870 e = (struct ip6t_entry *)(loc_cpu_entry + off); 866 e = (struct ip6t_entry *)(loc_cpu_entry + off);
867 if (copy_to_user(userptr + off, e, sizeof(*e))) {
868 ret = -EFAULT;
869 goto free_counters;
870 }
871 if (copy_to_user(userptr + off 871 if (copy_to_user(userptr + off
872 + offsetof(struct ip6t_entry, counters), 872 + offsetof(struct ip6t_entry, counters),
873 &counters[num], 873 &counters[num],
@@ -881,23 +881,14 @@ copy_entries_to_user(unsigned int total_size,
881 i += m->u.match_size) { 881 i += m->u.match_size) {
882 m = (void *)e + i; 882 m = (void *)e + i;
883 883
884 if (copy_to_user(userptr + off + i 884 if (xt_match_to_user(m, userptr + off + i)) {
885 + offsetof(struct xt_entry_match,
886 u.user.name),
887 m->u.kernel.match->name,
888 strlen(m->u.kernel.match->name)+1)
889 != 0) {
890 ret = -EFAULT; 885 ret = -EFAULT;
891 goto free_counters; 886 goto free_counters;
892 } 887 }
893 } 888 }
894 889
895 t = ip6t_get_target_c(e); 890 t = ip6t_get_target_c(e);
896 if (copy_to_user(userptr + off + e->target_offset 891 if (xt_target_to_user(t, userptr + off + e->target_offset)) {
897 + offsetof(struct xt_entry_target,
898 u.user.name),
899 t->u.kernel.target->name,
900 strlen(t->u.kernel.target->name)+1) != 0) {
901 ret = -EFAULT; 892 ret = -EFAULT;
902 goto free_counters; 893 goto free_counters;
903 } 894 }
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
index 590f767db5d4..a379d2f79b19 100644
--- a/net/ipv6/netfilter/ip6t_NPT.c
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -112,6 +112,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
112 .table = "mangle", 112 .table = "mangle",
113 .target = ip6t_snpt_tg, 113 .target = ip6t_snpt_tg,
114 .targetsize = sizeof(struct ip6t_npt_tginfo), 114 .targetsize = sizeof(struct ip6t_npt_tginfo),
115 .usersize = offsetof(struct ip6t_npt_tginfo, adjustment),
115 .checkentry = ip6t_npt_checkentry, 116 .checkentry = ip6t_npt_checkentry,
116 .family = NFPROTO_IPV6, 117 .family = NFPROTO_IPV6,
117 .hooks = (1 << NF_INET_LOCAL_IN) | 118 .hooks = (1 << NF_INET_LOCAL_IN) |
@@ -123,6 +124,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
123 .table = "mangle", 124 .table = "mangle",
124 .target = ip6t_dnpt_tg, 125 .target = ip6t_dnpt_tg,
125 .targetsize = sizeof(struct ip6t_npt_tginfo), 126 .targetsize = sizeof(struct ip6t_npt_tginfo),
127 .usersize = offsetof(struct ip6t_npt_tginfo, adjustment),
126 .checkentry = ip6t_npt_checkentry, 128 .checkentry = ip6t_npt_checkentry,
127 .family = NFPROTO_IPV6, 129 .family = NFPROTO_IPV6,
128 .hooks = (1 << NF_INET_PRE_ROUTING) | 130 .hooks = (1 << NF_INET_PRE_ROUTING) |
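
The new .usersize fields above cap how much of the per-rule target data is copied back to userspace when rules are dumped; everything past offsetof(struct ip6t_npt_tginfo, adjustment) is treated as kernel-private and is not exposed. A userspace model of the idea, using a stand-in struct rather than the real ip6t_npt_tginfo layout:

/* Userspace model of a 'usersize' smaller than the full target size:
 * only the user-visible head of the per-rule data is copied out, and
 * the kernel-private tail (here 'adjustment') comes back as zeroes.
 */
#include <stdio.h>
#include <stddef.h>
#include <string.h>

struct npt_tginfo {
	unsigned char src_pfx_len;	/* user-visible configuration */
	unsigned char dst_pfx_len;	/* user-visible configuration */
	unsigned short adjustment;	/* kernel-computed, private   */
};

int main(void)
{
	struct npt_tginfo kernel = { 56, 56, 0xbeef };
	struct npt_tginfo user;
	size_t usersize = offsetof(struct npt_tginfo, adjustment);

	memset(&user, 0, sizeof(user));
	memcpy(&user, &kernel, usersize);	/* copy only the public head */

	printf("src=%u dst=%u adjustment=0x%x\n",
	       user.src_pfx_len, user.dst_pfx_len, user.adjustment);
	return 0;
}
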
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 98c8dd38575a..4ef1ddd4bbbd 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -71,8 +71,7 @@ synproxy_send_tcp(struct net *net,
71 skb_dst_set(nskb, dst); 71 skb_dst_set(nskb, dst);
72 72
73 if (nfct) { 73 if (nfct) {
74 nskb->nfct = nfct; 74 nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
75 nskb->nfctinfo = ctinfo;
76 nf_conntrack_get(nfct); 75 nf_conntrack_get(nfct);
77 } 76 }
78 77
@@ -121,8 +120,8 @@ synproxy_send_client_synack(struct net *net,
121 120
122 synproxy_build_options(nth, opts); 121 synproxy_build_options(nth, opts);
123 122
124 synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, 123 synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
125 niph, nth, tcp_hdr_size); 124 IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
126} 125}
127 126
128static void 127static void
@@ -244,8 +243,8 @@ synproxy_send_client_ack(struct net *net,
244 243
245 synproxy_build_options(nth, opts); 244 synproxy_build_options(nth, opts);
246 245
247 synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, 246 synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
248 niph, nth, tcp_hdr_size); 247 IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
249} 248}
250 249
251static bool 250static bool
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index f5a61bc3ec2b..d2c2ccbfbe72 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -145,15 +145,15 @@ static int
145icmpv6_error_message(struct net *net, struct nf_conn *tmpl, 145icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
146 struct sk_buff *skb, 146 struct sk_buff *skb,
147 unsigned int icmp6off, 147 unsigned int icmp6off,
148 enum ip_conntrack_info *ctinfo,
149 unsigned int hooknum) 148 unsigned int hooknum)
150{ 149{
151 struct nf_conntrack_tuple intuple, origtuple; 150 struct nf_conntrack_tuple intuple, origtuple;
152 const struct nf_conntrack_tuple_hash *h; 151 const struct nf_conntrack_tuple_hash *h;
153 const struct nf_conntrack_l4proto *inproto; 152 const struct nf_conntrack_l4proto *inproto;
153 enum ip_conntrack_info ctinfo;
154 struct nf_conntrack_zone tmp; 154 struct nf_conntrack_zone tmp;
155 155
156 NF_CT_ASSERT(skb->nfct == NULL); 156 NF_CT_ASSERT(!skb_nfct(skb));
157 157
158 /* Are they talking about one of our connections? */ 158 /* Are they talking about one of our connections? */
159 if (!nf_ct_get_tuplepr(skb, 159 if (!nf_ct_get_tuplepr(skb,
@@ -176,7 +176,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
176 return -NF_ACCEPT; 176 return -NF_ACCEPT;
177 } 177 }
178 178
179 *ctinfo = IP_CT_RELATED; 179 ctinfo = IP_CT_RELATED;
180 180
181 h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), 181 h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp),
182 &intuple); 182 &intuple);
@@ -185,19 +185,18 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
185 return -NF_ACCEPT; 185 return -NF_ACCEPT;
186 } else { 186 } else {
187 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) 187 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
188 *ctinfo += IP_CT_IS_REPLY; 188 ctinfo += IP_CT_IS_REPLY;
189 } 189 }
190 190
191 /* Update skb to refer to this connection */ 191 /* Update skb to refer to this connection */
192 skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; 192 nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
193 skb->nfctinfo = *ctinfo;
194 return NF_ACCEPT; 193 return NF_ACCEPT;
195} 194}
196 195
197static int 196static int
198icmpv6_error(struct net *net, struct nf_conn *tmpl, 197icmpv6_error(struct net *net, struct nf_conn *tmpl,
199 struct sk_buff *skb, unsigned int dataoff, 198 struct sk_buff *skb, unsigned int dataoff,
200 enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) 199 u8 pf, unsigned int hooknum)
201{ 200{
202 const struct icmp6hdr *icmp6h; 201 const struct icmp6hdr *icmp6h;
203 struct icmp6hdr _ih; 202 struct icmp6hdr _ih;
@@ -222,9 +221,8 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
222 type = icmp6h->icmp6_type - 130; 221 type = icmp6h->icmp6_type - 130;
223 if (type >= 0 && type < sizeof(noct_valid_new) && 222 if (type >= 0 && type < sizeof(noct_valid_new) &&
224 noct_valid_new[type]) { 223 noct_valid_new[type]) {
225 skb->nfct = &nf_ct_untracked_get()->ct_general; 224 nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
226 skb->nfctinfo = IP_CT_NEW; 225 nf_conntrack_get(skb_nfct(skb));
227 nf_conntrack_get(skb->nfct);
228 return NF_ACCEPT; 226 return NF_ACCEPT;
229 } 227 }
230 228
@@ -232,7 +230,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
232 if (icmp6h->icmp6_type >= 128) 230 if (icmp6h->icmp6_type >= 128)
233 return NF_ACCEPT; 231 return NF_ACCEPT;
234 232
235 return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum); 233 return icmpv6_error_message(net, tmpl, skb, dataoff, hooknum);
236} 234}
237 235
238#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 236#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 9948b5ce52da..986d4ca38832 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -589,6 +589,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
589 hdr = ipv6_hdr(skb); 589 hdr = ipv6_hdr(skb);
590 fhdr = (struct frag_hdr *)skb_transport_header(skb); 590 fhdr = (struct frag_hdr *)skb_transport_header(skb);
591 591
592 skb_orphan(skb);
592 fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, 593 fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
593 skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); 594 skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
594 if (fq == NULL) { 595 if (fq == NULL) {
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 8e0bdd058787..ada60d1a991b 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -37,7 +37,7 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
37{ 37{
38 u16 zone_id = NF_CT_DEFAULT_ZONE_ID; 38 u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
39#if IS_ENABLED(CONFIG_NF_CONNTRACK) 39#if IS_ENABLED(CONFIG_NF_CONNTRACK)
40 if (skb->nfct) { 40 if (skb_nfct(skb)) {
41 enum ip_conntrack_info ctinfo; 41 enum ip_conntrack_info ctinfo;
42 const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 42 const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
43 43
@@ -61,7 +61,7 @@ static unsigned int ipv6_defrag(void *priv,
61 61
62#if IS_ENABLED(CONFIG_NF_CONNTRACK) 62#if IS_ENABLED(CONFIG_NF_CONNTRACK)
63 /* Previously seen (loopback)? */ 63 /* Previously seen (loopback)? */
64 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) 64 if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
65 return NF_ACCEPT; 65 return NF_ACCEPT;
66#endif 66#endif
67 67
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 4a84b5ad9ecb..888ecd106e5f 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -57,10 +57,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
57 return; 57 return;
58 58
59#if IS_ENABLED(CONFIG_NF_CONNTRACK) 59#if IS_ENABLED(CONFIG_NF_CONNTRACK)
60 nf_conntrack_put(skb->nfct); 60 nf_reset(skb);
61 skb->nfct = &nf_ct_untracked_get()->ct_general; 61 nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
62 skb->nfctinfo = IP_CT_NEW; 62 nf_conntrack_get(skb_nfct(skb));
63 nf_conntrack_get(skb->nfct);
64#endif 63#endif
65 if (hooknum == NF_INET_PRE_ROUTING || 64 if (hooknum == NF_INET_PRE_ROUTING ||
66 hooknum == NF_INET_LOCAL_IN) { 65 hooknum == NF_INET_LOCAL_IN) {
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 57d86066a13b..97c724224da7 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -64,7 +64,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
64 nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); 64 nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
65 65
66 /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ 66 /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
67 nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", 67 nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
68 ntohs(ih->payload_len) + sizeof(struct ipv6hdr), 68 ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
69 (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, 69 (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
70 ih->hop_limit, 70 ih->hop_limit,
@@ -351,7 +351,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
351 struct nf_log_buf *m; 351 struct nf_log_buf *m;
352 352
353 /* FIXME: Disabled from containers until syslog ns is supported */ 353 /* FIXME: Disabled from containers until syslog ns is supported */
354 if (!net_eq(net, &init_net)) 354 if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
355 return; 355 return;
356 356
357 m = nf_log_buf_open(); 357 m = nf_log_buf_open();
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index 6c5b5b1830a7..4146536e9c15 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -27,10 +27,10 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
27 memset(&range, 0, sizeof(range)); 27 memset(&range, 0, sizeof(range));
28 range.flags = priv->flags; 28 range.flags = priv->flags;
29 if (priv->sreg_proto_min) { 29 if (priv->sreg_proto_min) {
30 range.min_proto.all = 30 range.min_proto.all = (__force __be16)nft_reg_load16(
31 *(__be16 *)&regs->data[priv->sreg_proto_min]; 31 &regs->data[priv->sreg_proto_min]);
32 range.max_proto.all = 32 range.max_proto.all = (__force __be16)nft_reg_load16(
33 *(__be16 *)&regs->data[priv->sreg_proto_max]; 33 &regs->data[priv->sreg_proto_max]);
34 } 34 }
35 regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, 35 regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range,
36 nft_out(pkt)); 36 nft_out(pkt));
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c
index f5ac080fc084..a27e424f690d 100644
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -26,10 +26,10 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr,
26 26
27 memset(&range, 0, sizeof(range)); 27 memset(&range, 0, sizeof(range));
28 if (priv->sreg_proto_min) { 28 if (priv->sreg_proto_min) {
29 range.min_proto.all = 29 range.min_proto.all = (__force __be16)nft_reg_load16(
30 *(__be16 *)&regs->data[priv->sreg_proto_min], 30 &regs->data[priv->sreg_proto_min]);
31 range.max_proto.all = 31 range.max_proto.all = (__force __be16)nft_reg_load16(
32 *(__be16 *)&regs->data[priv->sreg_proto_max], 32 &regs->data[priv->sreg_proto_max]);
33 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 33 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
34 } 34 }
35 35
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index e1f8b34d7a2e..9b522fa90e6d 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -126,12 +126,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
126 return PTR_ERR(dst); 126 return PTR_ERR(dst);
127 rt = (struct rt6_info *) dst; 127 rt = (struct rt6_info *) dst;
128 128
129 np = inet6_sk(sk);
130 if (!np) {
131 err = -EBADF;
132 goto dst_err_out;
133 }
134
135 if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) 129 if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
136 fl6.flowi6_oif = np->mcast_oif; 130 fl6.flowi6_oif = np->mcast_oif;
137 else if (!fl6.flowi6_oif) 131 else if (!fl6.flowi6_oif)
@@ -166,7 +160,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
166 } 160 }
167 release_sock(sk); 161 release_sock(sk);
168 162
169dst_err_out:
170 dst_release(dst); 163 dst_release(dst);
171 164
172 if (err) 165 if (err)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ea89073c8247..f174e76e6505 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -654,6 +654,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
654 654
655 skb->ip_summed = CHECKSUM_NONE; 655 skb->ip_summed = CHECKSUM_NONE;
656 656
657 if (flags & MSG_CONFIRM)
658 skb_set_dst_pending_confirm(skb, 1);
659
657 skb->transport_header = skb->network_header; 660 skb->transport_header = skb->network_header;
658 err = memcpy_from_msg(iph, msg, length); 661 err = memcpy_from_msg(iph, msg, length);
659 if (err) 662 if (err)
@@ -934,7 +937,8 @@ out:
934 txopt_put(opt_to_free); 937 txopt_put(opt_to_free);
935 return err < 0 ? err : len; 938 return err < 0 ? err : len;
936do_confirm: 939do_confirm:
937 dst_confirm(dst); 940 if (msg->msg_flags & MSG_PROBE)
941 dst_confirm_neigh(dst, &fl6.daddr);
938 if (!(msg->msg_flags & MSG_PROBE) || len) 942 if (!(msg->msg_flags & MSG_PROBE) || len)
939 goto back_from_confirm; 943 goto back_from_confirm;
940 err = 0; 944 err = 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7ea85370c11c..fb174b590fd3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -98,6 +98,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98 struct sk_buff *skb); 98 struct sk_buff *skb);
99static void rt6_dst_from_metrics_check(struct rt6_info *rt); 99static void rt6_dst_from_metrics_check(struct rt6_info *rt);
100static int rt6_score_route(struct rt6_info *rt, int oif, int strict); 100static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101static size_t rt6_nlmsg_size(struct rt6_info *rt);
102static int rt6_fill_node(struct net *net,
103 struct sk_buff *skb, struct rt6_info *rt,
104 struct in6_addr *dst, struct in6_addr *src,
105 int iif, int type, u32 portid, u32 seq,
106 unsigned int flags);
101 107
102#ifdef CONFIG_IPV6_ROUTE_INFO 108#ifdef CONFIG_IPV6_ROUTE_INFO
103static struct rt6_info *rt6_add_route_info(struct net *net, 109static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -217,6 +223,21 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
217 return neigh_create(&nd_tbl, daddr, dst->dev); 223 return neigh_create(&nd_tbl, daddr, dst->dev);
218} 224}
219 225
226static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
227{
228 struct net_device *dev = dst->dev;
229 struct rt6_info *rt = (struct rt6_info *)dst;
230
231 daddr = choose_neigh_daddr(rt, NULL, daddr);
232 if (!daddr)
233 return;
234 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
235 return;
236 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
237 return;
238 __ipv6_confirm_neigh(dev, daddr);
239}
240
220static struct dst_ops ip6_dst_ops_template = { 241static struct dst_ops ip6_dst_ops_template = {
221 .family = AF_INET6, 242 .family = AF_INET6,
222 .gc = ip6_dst_gc, 243 .gc = ip6_dst_gc,
@@ -233,6 +254,7 @@ static struct dst_ops ip6_dst_ops_template = {
233 .redirect = rt6_do_redirect, 254 .redirect = rt6_do_redirect,
234 .local_out = __ip6_local_out, 255 .local_out = __ip6_local_out,
235 .neigh_lookup = ip6_neigh_lookup, 256 .neigh_lookup = ip6_neigh_lookup,
257 .confirm_neigh = ip6_confirm_neigh,
236}; 258};
237 259
238static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) 260static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
@@ -1359,6 +1381,7 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1359static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 1381static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1360 const struct ipv6hdr *iph, u32 mtu) 1382 const struct ipv6hdr *iph, u32 mtu)
1361{ 1383{
1384 const struct in6_addr *daddr, *saddr;
1362 struct rt6_info *rt6 = (struct rt6_info *)dst; 1385 struct rt6_info *rt6 = (struct rt6_info *)dst;
1363 1386
1364 if (rt6->rt6i_flags & RTF_LOCAL) 1387 if (rt6->rt6i_flags & RTF_LOCAL)
@@ -1367,26 +1390,26 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1367 if (dst_metric_locked(dst, RTAX_MTU)) 1390 if (dst_metric_locked(dst, RTAX_MTU))
1368 return; 1391 return;
1369 1392
1370 dst_confirm(dst); 1393 if (iph) {
1394 daddr = &iph->daddr;
1395 saddr = &iph->saddr;
1396 } else if (sk) {
1397 daddr = &sk->sk_v6_daddr;
1398 saddr = &inet6_sk(sk)->saddr;
1399 } else {
1400 daddr = NULL;
1401 saddr = NULL;
1402 }
1403 dst_confirm_neigh(dst, daddr);
1371 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 1404 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1372 if (mtu >= dst_mtu(dst)) 1405 if (mtu >= dst_mtu(dst))
1373 return; 1406 return;
1374 1407
1375 if (!rt6_cache_allowed_for_pmtu(rt6)) { 1408 if (!rt6_cache_allowed_for_pmtu(rt6)) {
1376 rt6_do_update_pmtu(rt6, mtu); 1409 rt6_do_update_pmtu(rt6, mtu);
1377 } else { 1410 } else if (daddr) {
1378 const struct in6_addr *daddr, *saddr;
1379 struct rt6_info *nrt6; 1411 struct rt6_info *nrt6;
1380 1412
1381 if (iph) {
1382 daddr = &iph->daddr;
1383 saddr = &iph->saddr;
1384 } else if (sk) {
1385 daddr = &sk->sk_v6_daddr;
1386 saddr = &inet6_sk(sk)->saddr;
1387 } else {
1388 return;
1389 }
1390 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); 1413 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1391 if (nrt6) { 1414 if (nrt6) {
1392 rt6_do_update_pmtu(nrt6, mtu); 1415 rt6_do_update_pmtu(nrt6, mtu);
@@ -1831,6 +1854,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1831 int addr_type; 1854 int addr_type;
1832 int err = -EINVAL; 1855 int err = -EINVAL;
1833 1856
1857 /* RTF_PCPU is an internal flag; can not be set by userspace */
1858 if (cfg->fc_flags & RTF_PCPU)
1859 goto out;
1860
1834 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) 1861 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1835 goto out; 1862 goto out;
1836#ifndef CONFIG_IPV6_SUBTREES 1863#ifndef CONFIG_IPV6_SUBTREES
@@ -1897,7 +1924,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1897 if (cfg->fc_encap) { 1924 if (cfg->fc_encap) {
1898 struct lwtunnel_state *lwtstate; 1925 struct lwtunnel_state *lwtstate;
1899 1926
1900 err = lwtunnel_build_state(dev, cfg->fc_encap_type, 1927 err = lwtunnel_build_state(cfg->fc_encap_type,
1901 cfg->fc_encap, AF_INET6, cfg, 1928 cfg->fc_encap, AF_INET6, cfg,
1902 &lwtstate); 1929 &lwtstate);
1903 if (err) 1930 if (err)
@@ -2143,6 +2170,58 @@ int ip6_del_rt(struct rt6_info *rt)
2143 return __ip6_del_rt(rt, &info); 2170 return __ip6_del_rt(rt, &info);
2144} 2171}
2145 2172
2173static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2174{
2175 struct nl_info *info = &cfg->fc_nlinfo;
2176 struct net *net = info->nl_net;
2177 struct sk_buff *skb = NULL;
2178 struct fib6_table *table;
2179 int err = -ENOENT;
2180
2181 if (rt == net->ipv6.ip6_null_entry)
2182 goto out_put;
2183 table = rt->rt6i_table;
2184 write_lock_bh(&table->tb6_lock);
2185
2186 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2187 struct rt6_info *sibling, *next_sibling;
2188
2189 /* prefer to send a single notification with all hops */
2190 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2191 if (skb) {
2192 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2193
2194 if (rt6_fill_node(net, skb, rt,
2195 NULL, NULL, 0, RTM_DELROUTE,
2196 info->portid, seq, 0) < 0) {
2197 kfree_skb(skb);
2198 skb = NULL;
2199 } else
2200 info->skip_notify = 1;
2201 }
2202
2203 list_for_each_entry_safe(sibling, next_sibling,
2204 &rt->rt6i_siblings,
2205 rt6i_siblings) {
2206 err = fib6_del(sibling, info);
2207 if (err)
2208 goto out_unlock;
2209 }
2210 }
2211
2212 err = fib6_del(rt, info);
2213out_unlock:
2214 write_unlock_bh(&table->tb6_lock);
2215out_put:
2216 ip6_rt_put(rt);
2217
2218 if (skb) {
2219 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2220 info->nlh, gfp_any());
2221 }
2222 return err;
2223}
2224
2146static int ip6_route_del(struct fib6_config *cfg) 2225static int ip6_route_del(struct fib6_config *cfg)
2147{ 2226{
2148 struct fib6_table *table; 2227 struct fib6_table *table;
@@ -2179,7 +2258,11 @@ static int ip6_route_del(struct fib6_config *cfg)
2179 dst_hold(&rt->dst); 2258 dst_hold(&rt->dst);
2180 read_unlock_bh(&table->tb6_lock); 2259 read_unlock_bh(&table->tb6_lock);
2181 2260
2182 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 2261 /* if gateway was specified only delete the one hop */
2262 if (cfg->fc_flags & RTF_GATEWAY)
2263 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2264
2265 return __ip6_del_rt_siblings(rt, cfg);
2183 } 2266 }
2184 } 2267 }
2185 read_unlock_bh(&table->tb6_lock); 2268 read_unlock_bh(&table->tb6_lock);
@@ -2258,7 +2341,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
2258 * Look, redirects are sent only in response to data packets, 2341 * Look, redirects are sent only in response to data packets,
2259 * so that this nexthop apparently is reachable. --ANK 2342 * so that this nexthop apparently is reachable. --ANK
2260 */ 2343 */
2261 dst_confirm(&rt->dst); 2344 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2262 2345
2263 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 2346 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2264 if (!neigh) 2347 if (!neigh)
@@ -2634,6 +2717,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2634 rt->dst.output = ip6_output; 2717 rt->dst.output = ip6_output;
2635 rt->rt6i_idev = idev; 2718 rt->rt6i_idev = idev;
2636 2719
2720 rt->rt6i_protocol = RTPROT_KERNEL;
2637 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; 2721 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2638 if (anycast) 2722 if (anycast)
2639 rt->rt6i_flags |= RTF_ANYCAST; 2723 rt->rt6i_flags |= RTF_ANYCAST;
@@ -2711,13 +2795,16 @@ struct arg_dev_net {
2711 struct net *net; 2795 struct net *net;
2712}; 2796};
2713 2797
2798/* called with write lock held for table with rt */
2714static int fib6_ifdown(struct rt6_info *rt, void *arg) 2799static int fib6_ifdown(struct rt6_info *rt, void *arg)
2715{ 2800{
2716 const struct arg_dev_net *adn = arg; 2801 const struct arg_dev_net *adn = arg;
2717 const struct net_device *dev = adn->dev; 2802 const struct net_device *dev = adn->dev;
2718 2803
2719 if ((rt->dst.dev == dev || !dev) && 2804 if ((rt->dst.dev == dev || !dev) &&
2720 rt != adn->net->ipv6.ip6_null_entry) 2805 rt != adn->net->ipv6.ip6_null_entry &&
2806 (rt->rt6i_nsiblings == 0 ||
2807 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2721 return -1; 2808 return -1;
2722 2809
2723 return 0; 2810 return 0;
@@ -2812,6 +2899,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2812 [RTA_ENCAP] = { .type = NLA_NESTED }, 2899 [RTA_ENCAP] = { .type = NLA_NESTED },
2813 [RTA_EXPIRES] = { .type = NLA_U32 }, 2900 [RTA_EXPIRES] = { .type = NLA_U32 },
2814 [RTA_UID] = { .type = NLA_U32 }, 2901 [RTA_UID] = { .type = NLA_U32 },
2902 [RTA_MARK] = { .type = NLA_U32 },
2815}; 2903};
2816 2904
2817static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 2905static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2948,7 +3036,7 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2948 struct rt6_nh *nh; 3036 struct rt6_nh *nh;
2949 3037
2950 list_for_each_entry(nh, rt6_nh_list, next) { 3038 list_for_each_entry(nh, rt6_nh_list, next) {
2951 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n", 3039 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
2952 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 3040 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2953 nh->r_cfg.fc_ifindex); 3041 nh->r_cfg.fc_ifindex);
2954 } 3042 }
@@ -2987,13 +3075,37 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list,
2987 return 0; 3075 return 0;
2988} 3076}
2989 3077
3078static void ip6_route_mpath_notify(struct rt6_info *rt,
3079 struct rt6_info *rt_last,
3080 struct nl_info *info,
3081 __u16 nlflags)
3082{
3083 /* if this is an APPEND route, then rt points to the first route
3084 * inserted and rt_last points to last route inserted. Userspace
3085 * wants a consistent dump of the route which starts at the first
3086 * nexthop. Since sibling routes are always added at the end of
3087 * the list, find the first sibling of the last route appended
3088 */
3089 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3090 rt = list_first_entry(&rt_last->rt6i_siblings,
3091 struct rt6_info,
3092 rt6i_siblings);
3093 }
3094
3095 if (rt)
3096 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3097}
3098
2990static int ip6_route_multipath_add(struct fib6_config *cfg) 3099static int ip6_route_multipath_add(struct fib6_config *cfg)
2991{ 3100{
3101 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3102 struct nl_info *info = &cfg->fc_nlinfo;
2992 struct fib6_config r_cfg; 3103 struct fib6_config r_cfg;
2993 struct rtnexthop *rtnh; 3104 struct rtnexthop *rtnh;
2994 struct rt6_info *rt; 3105 struct rt6_info *rt;
2995 struct rt6_nh *err_nh; 3106 struct rt6_nh *err_nh;
2996 struct rt6_nh *nh, *nh_safe; 3107 struct rt6_nh *nh, *nh_safe;
3108 __u16 nlflags;
2997 int remaining; 3109 int remaining;
2998 int attrlen; 3110 int attrlen;
2999 int err = 1; 3111 int err = 1;
@@ -3002,6 +3114,10 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
3002 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 3114 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3003 LIST_HEAD(rt6_nh_list); 3115 LIST_HEAD(rt6_nh_list);
3004 3116
3117 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3118 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3119 nlflags |= NLM_F_APPEND;
3120
3005 remaining = cfg->fc_mp_len; 3121 remaining = cfg->fc_mp_len;
3006 rtnh = (struct rtnexthop *)cfg->fc_mp; 3122 rtnh = (struct rtnexthop *)cfg->fc_mp;
3007 3123
@@ -3044,9 +3160,20 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
3044 rtnh = rtnh_next(rtnh, &remaining); 3160 rtnh = rtnh_next(rtnh, &remaining);
3045 } 3161 }
3046 3162
3163 /* for add and replace send one notification with all nexthops.
3164 * Skip the notification in fib6_add_rt2node and send one with
3165 * the full route when done
3166 */
3167 info->skip_notify = 1;
3168
3047 err_nh = NULL; 3169 err_nh = NULL;
3048 list_for_each_entry(nh, &rt6_nh_list, next) { 3170 list_for_each_entry(nh, &rt6_nh_list, next) {
3049 err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc); 3171 rt_last = nh->rt6_info;
3172 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc);
3173 /* save reference to first route for notification */
3174 if (!rt_notif && !err)
3175 rt_notif = nh->rt6_info;
3176
3050 /* nh->rt6_info is used or freed at this point, reset to NULL*/ 3177 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3051 nh->rt6_info = NULL; 3178 nh->rt6_info = NULL;
3052 if (err) { 3179 if (err) {
@@ -3068,9 +3195,18 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
3068 nhn++; 3195 nhn++;
3069 } 3196 }
3070 3197
3198 /* success ... tell user about new route */
3199 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3071 goto cleanup; 3200 goto cleanup;
3072 3201
3073add_errout: 3202add_errout:
3203 /* send notification for routes that were added so that
3204 * the delete notifications sent by ip6_route_del are
3205 * coherent
3206 */
3207 if (rt_notif)
3208 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3209
3074 /* Delete routes that were already added */ 3210 /* Delete routes that were already added */
3075 list_for_each_entry(nh, &rt6_nh_list, next) { 3211 list_for_each_entry(nh, &rt6_nh_list, next) {
3076 if (err_nh == nh) 3212 if (err_nh == nh)
@@ -3138,8 +3274,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3138 3274
3139 if (cfg.fc_mp) 3275 if (cfg.fc_mp)
3140 return ip6_route_multipath_del(&cfg); 3276 return ip6_route_multipath_del(&cfg);
3141 else 3277 else {
3278 cfg.fc_delete_all_nh = 1;
3142 return ip6_route_del(&cfg); 3279 return ip6_route_del(&cfg);
3280 }
3143} 3281}
3144 3282
3145static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) 3283static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -3157,8 +3295,19 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3157 return ip6_route_add(&cfg); 3295 return ip6_route_add(&cfg);
3158} 3296}
3159 3297
3160static inline size_t rt6_nlmsg_size(struct rt6_info *rt) 3298static size_t rt6_nlmsg_size(struct rt6_info *rt)
3161{ 3299{
3300 int nexthop_len = 0;
3301
3302 if (rt->rt6i_nsiblings) {
3303 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3304 + NLA_ALIGN(sizeof(struct rtnexthop))
3305 + nla_total_size(16) /* RTA_GATEWAY */
3306 + lwtunnel_get_encap_size(rt->dst.lwtstate);
3307
3308 nexthop_len *= rt->rt6i_nsiblings;
3309 }
3310
3162 return NLMSG_ALIGN(sizeof(struct rtmsg)) 3311 return NLMSG_ALIGN(sizeof(struct rtmsg))
3163 + nla_total_size(16) /* RTA_SRC */ 3312 + nla_total_size(16) /* RTA_SRC */
3164 + nla_total_size(16) /* RTA_DST */ 3313 + nla_total_size(16) /* RTA_DST */
@@ -3172,14 +3321,71 @@ static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3172 + nla_total_size(sizeof(struct rta_cacheinfo)) 3321 + nla_total_size(sizeof(struct rta_cacheinfo))
3173 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 3322 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3174 + nla_total_size(1) /* RTA_PREF */ 3323 + nla_total_size(1) /* RTA_PREF */
3175 + lwtunnel_get_encap_size(rt->dst.lwtstate); 3324 + lwtunnel_get_encap_size(rt->dst.lwtstate)
3325 + nexthop_len;
3326}
3327
3328static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3329 unsigned int *flags, bool skip_oif)
3330{
3331 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3332 *flags |= RTNH_F_LINKDOWN;
3333 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3334 *flags |= RTNH_F_DEAD;
3335 }
3336
3337 if (rt->rt6i_flags & RTF_GATEWAY) {
3338 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3339 goto nla_put_failure;
3340 }
3341
3342 /* not needed for multipath encoding b/c it has a rtnexthop struct */
3343 if (!skip_oif && rt->dst.dev &&
3344 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3345 goto nla_put_failure;
3346
3347 if (rt->dst.lwtstate &&
3348 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3349 goto nla_put_failure;
3350
3351 return 0;
3352
3353nla_put_failure:
3354 return -EMSGSIZE;
3355}
3356
3357/* add multipath next hop */
3358static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3359{
3360 struct rtnexthop *rtnh;
3361 unsigned int flags = 0;
3362
3363 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3364 if (!rtnh)
3365 goto nla_put_failure;
3366
3367 rtnh->rtnh_hops = 0;
3368 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3369
3370 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3371 goto nla_put_failure;
3372
3373 rtnh->rtnh_flags = flags;
3374
3375 /* length of rtnetlink header + attributes */
3376 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3377
3378 return 0;
3379
3380nla_put_failure:
3381 return -EMSGSIZE;
3176} 3382}
3177 3383
3178static int rt6_fill_node(struct net *net, 3384static int rt6_fill_node(struct net *net,
3179 struct sk_buff *skb, struct rt6_info *rt, 3385 struct sk_buff *skb, struct rt6_info *rt,
3180 struct in6_addr *dst, struct in6_addr *src, 3386 struct in6_addr *dst, struct in6_addr *src,
3181 int iif, int type, u32 portid, u32 seq, 3387 int iif, int type, u32 portid, u32 seq,
3182 int prefix, int nowait, unsigned int flags) 3388 unsigned int flags)
3183{ 3389{
3184 u32 metrics[RTAX_MAX]; 3390 u32 metrics[RTAX_MAX];
3185 struct rtmsg *rtm; 3391 struct rtmsg *rtm;
@@ -3187,13 +3393,6 @@ static int rt6_fill_node(struct net *net,
3187 long expires; 3393 long expires;
3188 u32 table; 3394 u32 table;
3189 3395
3190 if (prefix) { /* user wants prefix routes only */
3191 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3192 /* success since this is not a prefix route */
3193 return 1;
3194 }
3195 }
3196
3197 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 3396 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3198 if (!nlh) 3397 if (!nlh)
3199 return -EMSGSIZE; 3398 return -EMSGSIZE;
@@ -3228,16 +3427,13 @@ static int rt6_fill_node(struct net *net,
3228 } 3427 }
3229 else if (rt->rt6i_flags & RTF_LOCAL) 3428 else if (rt->rt6i_flags & RTF_LOCAL)
3230 rtm->rtm_type = RTN_LOCAL; 3429 rtm->rtm_type = RTN_LOCAL;
3430 else if (rt->rt6i_flags & RTF_ANYCAST)
3431 rtm->rtm_type = RTN_ANYCAST;
3231 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) 3432 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3232 rtm->rtm_type = RTN_LOCAL; 3433 rtm->rtm_type = RTN_LOCAL;
3233 else 3434 else
3234 rtm->rtm_type = RTN_UNICAST; 3435 rtm->rtm_type = RTN_UNICAST;
3235 rtm->rtm_flags = 0; 3436 rtm->rtm_flags = 0;
3236 if (!netif_carrier_ok(rt->dst.dev)) {
3237 rtm->rtm_flags |= RTNH_F_LINKDOWN;
3238 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3239 rtm->rtm_flags |= RTNH_F_DEAD;
3240 }
3241 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 3437 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3242 rtm->rtm_protocol = rt->rt6i_protocol; 3438 rtm->rtm_protocol = rt->rt6i_protocol;
3243 if (rt->rt6i_flags & RTF_DYNAMIC) 3439 if (rt->rt6i_flags & RTF_DYNAMIC)
@@ -3271,19 +3467,12 @@ static int rt6_fill_node(struct net *net,
3271 if (iif) { 3467 if (iif) {
3272#ifdef CONFIG_IPV6_MROUTE 3468#ifdef CONFIG_IPV6_MROUTE
3273 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { 3469 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3274 int err = ip6mr_get_route(net, skb, rtm, nowait, 3470 int err = ip6mr_get_route(net, skb, rtm, portid);
3275 portid); 3471
3276 3472 if (err == 0)
3277 if (err <= 0) { 3473 return 0;
3278 if (!nowait) { 3474 if (err < 0)
3279 if (err == 0) 3475 goto nla_put_failure;
3280 return 0;
3281 goto nla_put_failure;
3282 } else {
3283 if (err == -EMSGSIZE)
3284 goto nla_put_failure;
3285 }
3286 }
3287 } else 3476 } else
3288#endif 3477#endif
3289 if (nla_put_u32(skb, RTA_IIF, iif)) 3478 if (nla_put_u32(skb, RTA_IIF, iif))
@@ -3308,17 +3497,35 @@ static int rt6_fill_node(struct net *net,
3308 if (rtnetlink_put_metrics(skb, metrics) < 0) 3497 if (rtnetlink_put_metrics(skb, metrics) < 0)
3309 goto nla_put_failure; 3498 goto nla_put_failure;
3310 3499
3311 if (rt->rt6i_flags & RTF_GATEWAY) {
3312 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3313 goto nla_put_failure;
3314 }
3315
3316 if (rt->dst.dev &&
3317 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3318 goto nla_put_failure;
3319 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) 3500 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3320 goto nla_put_failure; 3501 goto nla_put_failure;
3321 3502
3503 /* For multipath routes, walk the siblings list and add
3504 * each as a nexthop within RTA_MULTIPATH.
3505 */
3506 if (rt->rt6i_nsiblings) {
3507 struct rt6_info *sibling, *next_sibling;
3508 struct nlattr *mp;
3509
3510 mp = nla_nest_start(skb, RTA_MULTIPATH);
3511 if (!mp)
3512 goto nla_put_failure;
3513
3514 if (rt6_add_nexthop(skb, rt) < 0)
3515 goto nla_put_failure;
3516
3517 list_for_each_entry_safe(sibling, next_sibling,
3518 &rt->rt6i_siblings, rt6i_siblings) {
3519 if (rt6_add_nexthop(skb, sibling) < 0)
3520 goto nla_put_failure;
3521 }
3522
3523 nla_nest_end(skb, mp);
3524 } else {
3525 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3526 goto nla_put_failure;
3527 }
3528
3322 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; 3529 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3323 3530
3324 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) 3531 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
@@ -3327,8 +3534,6 @@ static int rt6_fill_node(struct net *net,
3327 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) 3534 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3328 goto nla_put_failure; 3535 goto nla_put_failure;
3329 3536
3330 if (lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3331 goto nla_put_failure;
3332 3537
3333 nlmsg_end(skb, nlh); 3538 nlmsg_end(skb, nlh);
3334 return 0; 3539 return 0;
@@ -3341,18 +3546,26 @@ nla_put_failure:
3341int rt6_dump_route(struct rt6_info *rt, void *p_arg) 3546int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3342{ 3547{
3343 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 3548 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3344 int prefix; 3549 struct net *net = arg->net;
3550
3551 if (rt == net->ipv6.ip6_null_entry)
3552 return 0;
3345 3553
3346 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 3554 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3347 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 3555 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3348 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3349 } else
3350 prefix = 0;
3351 3556
3352 return rt6_fill_node(arg->net, 3557 /* user wants prefix routes only */
3558 if (rtm->rtm_flags & RTM_F_PREFIX &&
3559 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3560 /* success since this is not a prefix route */
3561 return 1;
3562 }
3563 }
3564
3565 return rt6_fill_node(net,
3353 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 3566 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3354 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, 3567 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3355 prefix, 0, NLM_F_MULTI); 3568 NLM_F_MULTI);
3356} 3569}
3357 3570
3358static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) 3571static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
@@ -3426,6 +3639,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3426 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); 3639 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3427 } 3640 }
3428 3641
3642 if (rt == net->ipv6.ip6_null_entry) {
3643 err = rt->dst.error;
3644 ip6_rt_put(rt);
3645 goto errout;
3646 }
3647
3429 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 3648 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3430 if (!skb) { 3649 if (!skb) {
3431 ip6_rt_put(rt); 3650 ip6_rt_put(rt);
@@ -3433,17 +3652,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3433 goto errout; 3652 goto errout;
3434 } 3653 }
3435 3654
3436 /* Reserve room for dummy headers, this skb can pass
3437 through good chunk of routing engine.
3438 */
3439 skb_reset_mac_header(skb);
3440 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3441
3442 skb_dst_set(skb, &rt->dst); 3655 skb_dst_set(skb, &rt->dst);
3443 3656
3444 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, 3657 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3445 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 3658 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3446 nlh->nlmsg_seq, 0, 0, 0); 3659 nlh->nlmsg_seq, 0);
3447 if (err < 0) { 3660 if (err < 0) {
3448 kfree_skb(skb); 3661 kfree_skb(skb);
3449 goto errout; 3662 goto errout;
@@ -3470,7 +3683,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3470 goto errout; 3683 goto errout;
3471 3684
3472 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, 3685 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3473 event, info->portid, seq, 0, 0, nlm_flags); 3686 event, info->portid, seq, nlm_flags);
3474 if (err < 0) { 3687 if (err < 0) {
3475 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 3688 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3476 WARN_ON(err == -EMSGSIZE); 3689 WARN_ON(err == -EMSGSIZE);
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index a855eb325b03..5f44ffed2576 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -53,6 +53,9 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
53 struct sr6_tlv *tlv; 53 struct sr6_tlv *tlv;
54 unsigned int tlv_len; 54 unsigned int tlv_len;
55 55
56 if (trailing < sizeof(*tlv))
57 return false;
58
56 tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset); 59 tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset);
57 tlv_len = sizeof(*tlv) + tlv->len; 60 tlv_len = sizeof(*tlv) + tlv->len;
58 61
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 03a064803626..f950cb53d5e3 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -45,7 +45,7 @@
45#include <net/seg6_hmac.h> 45#include <net/seg6_hmac.h>
46#include <linux/random.h> 46#include <linux/random.h>
47 47
48static char * __percpu *hmac_ring; 48static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring);
49 49
50static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) 50static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
51{ 51{
@@ -174,7 +174,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
174 * hash function (RadioGatun) with up to 1216 bits 174 * hash function (RadioGatun) with up to 1216 bits
175 */ 175 */
176 176
177 /* saddr(16) + first_seg(1) + cleanup(1) + keyid(4) + seglist(16n) */ 177 /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */
178 plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; 178 plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16;
179 179
180 /* this limit allows for 14 segments */ 180 /* this limit allows for 14 segments */
@@ -186,13 +186,13 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
186 * 186 *
187 * 1. Source IPv6 address (128 bits) 187 * 1. Source IPv6 address (128 bits)
188 * 2. first_segment value (8 bits) 188 * 2. first_segment value (8 bits)
189 * 3. cleanup flag (8 bits: highest bit is cleanup value, others are 0) 189 * 3. Flags (8 bits)
190 * 4. HMAC Key ID (32 bits) 190 * 4. HMAC Key ID (32 bits)
191 * 5. All segments in the segments list (n * 128 bits) 191 * 5. All segments in the segments list (n * 128 bits)
192 */ 192 */
193 193
194 local_bh_disable(); 194 local_bh_disable();
195 ring = *this_cpu_ptr(hmac_ring); 195 ring = this_cpu_ptr(hmac_ring);
196 off = ring; 196 off = ring;
197 197
198 /* source address */ 198 /* source address */
@@ -202,8 +202,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
202 /* first_segment value */ 202 /* first_segment value */
203 *off++ = hdr->first_segment; 203 *off++ = hdr->first_segment;
204 204
205 /* cleanup flag */ 205 /* flags */
206 *off++ = !!(sr_has_cleanup(hdr)) << 7; 206 *off++ = hdr->flags;
207 207
208 /* HMAC Key ID */ 208 /* HMAC Key ID */
209 memcpy(off, &hmackeyid, 4); 209 memcpy(off, &hmackeyid, 4);
@@ -353,27 +353,6 @@ out:
353} 353}
354EXPORT_SYMBOL(seg6_push_hmac); 354EXPORT_SYMBOL(seg6_push_hmac);
355 355
356static int seg6_hmac_init_ring(void)
357{
358 int i;
359
360 hmac_ring = alloc_percpu(char *);
361
362 if (!hmac_ring)
363 return -ENOMEM;
364
365 for_each_possible_cpu(i) {
366 char *ring = kzalloc(SEG6_HMAC_RING_SIZE, GFP_KERNEL);
367
368 if (!ring)
369 return -ENOMEM;
370
371 *per_cpu_ptr(hmac_ring, i) = ring;
372 }
373
374 return 0;
375}
376
377static int seg6_hmac_init_algo(void) 356static int seg6_hmac_init_algo(void)
378{ 357{
379 struct seg6_hmac_algo *algo; 358 struct seg6_hmac_algo *algo;
@@ -410,7 +389,8 @@ static int seg6_hmac_init_algo(void)
410 return -ENOMEM; 389 return -ENOMEM;
411 390
412 for_each_possible_cpu(cpu) { 391 for_each_possible_cpu(cpu) {
413 shash = kzalloc(shsize, GFP_KERNEL); 392 shash = kzalloc_node(shsize, GFP_KERNEL,
393 cpu_to_node(cpu));
414 if (!shash) 394 if (!shash)
415 return -ENOMEM; 395 return -ENOMEM;
416 *per_cpu_ptr(algo->shashs, cpu) = shash; 396 *per_cpu_ptr(algo->shashs, cpu) = shash;
@@ -422,16 +402,7 @@ static int seg6_hmac_init_algo(void)
422 402
423int __init seg6_hmac_init(void) 403int __init seg6_hmac_init(void)
424{ 404{
425 int ret; 405 return seg6_hmac_init_algo();
426
427 ret = seg6_hmac_init_ring();
428 if (ret < 0)
429 goto out;
430
431 ret = seg6_hmac_init_algo();
432
433out:
434 return ret;
435} 406}
436EXPORT_SYMBOL(seg6_hmac_init); 407EXPORT_SYMBOL(seg6_hmac_init);
437 408
@@ -450,13 +421,6 @@ void seg6_hmac_exit(void)
450 struct seg6_hmac_algo *algo = NULL; 421 struct seg6_hmac_algo *algo = NULL;
451 int i, alg_count, cpu; 422 int i, alg_count, cpu;
452 423
453 for_each_possible_cpu(i) {
454 char *ring = *per_cpu_ptr(hmac_ring, i);
455
456 kfree(ring);
457 }
458 free_percpu(hmac_ring);
459
460 alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo); 424 alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
461 for (i = 0; i < alg_count; i++) { 425 for (i = 0; i < alg_count; i++) {
462 algo = &hmac_algos[i]; 426 algo = &hmac_algos[i];
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index c46f8cbf5ab5..85582257d3af 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -55,8 +55,8 @@ static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = {
55 [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY }, 55 [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY },
56}; 56};
57 57
58int nla_put_srh(struct sk_buff *skb, int attrtype, 58static int nla_put_srh(struct sk_buff *skb, int attrtype,
59 struct seg6_iptunnel_encap *tuninfo) 59 struct seg6_iptunnel_encap *tuninfo)
60{ 60{
61 struct seg6_iptunnel_encap *data; 61 struct seg6_iptunnel_encap *data;
62 struct nlattr *nla; 62 struct nlattr *nla;
@@ -235,7 +235,7 @@ static int seg6_do_srh(struct sk_buff *skb)
235 return 0; 235 return 0;
236} 236}
237 237
238int seg6_input(struct sk_buff *skb) 238static int seg6_input(struct sk_buff *skb)
239{ 239{
240 int err; 240 int err;
241 241
@@ -251,7 +251,7 @@ int seg6_input(struct sk_buff *skb)
251 return dst_input(skb); 251 return dst_input(skb);
252} 252}
253 253
254int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 254static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
255{ 255{
256 struct dst_entry *orig_dst = skb_dst(skb); 256 struct dst_entry *orig_dst = skb_dst(skb);
257 struct dst_entry *dst = NULL; 257 struct dst_entry *dst = NULL;
@@ -303,7 +303,7 @@ drop:
303 return err; 303 return err;
304} 304}
305 305
306static int seg6_build_state(struct net_device *dev, struct nlattr *nla, 306static int seg6_build_state(struct nlattr *nla,
307 unsigned int family, const void *cfg, 307 unsigned int family, const void *cfg,
308 struct lwtunnel_state **ts) 308 struct lwtunnel_state **ts)
309{ 309{
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index fad992ad4bc8..99853c6e33a8 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1380,6 +1380,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
1380 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); 1380 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1381 if (err) { 1381 if (err) {
1382 free_percpu(dev->tstats); 1382 free_percpu(dev->tstats);
1383 dev->tstats = NULL;
1383 return err; 1384 return err;
1384 } 1385 }
1385 1386
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index a4d49760bf43..895ff650db43 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -16,7 +16,7 @@
16 16
17#include <linux/tcp.h> 17#include <linux/tcp.h>
18#include <linux/random.h> 18#include <linux/random.h>
19#include <linux/cryptohash.h> 19#include <linux/siphash.h>
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <net/ipv6.h> 21#include <net/ipv6.h>
22#include <net/tcp.h> 22#include <net/tcp.h>
@@ -24,7 +24,7 @@
24#define COOKIEBITS 24 /* Upper bits store count */ 24#define COOKIEBITS 24 /* Upper bits store count */
25#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) 25#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
26 26
27static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; 27static siphash_key_t syncookie6_secret[2] __read_mostly;
28 28
29/* RFC 2460, Section 8.3: 29/* RFC 2460, Section 8.3:
30 * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] 30 * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
@@ -41,30 +41,27 @@ static __u16 const msstab[] = {
41 9000 - 60, 41 9000 - 60,
42}; 42};
43 43
44static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); 44static u32 cookie_hash(const struct in6_addr *saddr,
45 45 const struct in6_addr *daddr,
46static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
47 __be16 sport, __be16 dport, u32 count, int c) 46 __be16 sport, __be16 dport, u32 count, int c)
48{ 47{
49 __u32 *tmp; 48 const struct {
49 struct in6_addr saddr;
50 struct in6_addr daddr;
51 u32 count;
52 __be16 sport;
53 __be16 dport;
54 } __aligned(SIPHASH_ALIGNMENT) combined = {
55 .saddr = *saddr,
56 .daddr = *daddr,
57 .count = count,
58 .sport = sport,
59 .dport = dport
60 };
50 61
51 net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret)); 62 net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret));
52 63 return siphash(&combined, offsetofend(typeof(combined), dport),
53 tmp = this_cpu_ptr(ipv6_cookie_scratch); 64 &syncookie6_secret[c]);
54
55 /*
56 * we have 320 bits of information to hash, copy in the remaining
57 * 192 bits required for sha_transform, from the syncookie6_secret
58 * and overwrite the digest with the secret
59 */
60 memcpy(tmp + 10, syncookie6_secret[c], 44);
61 memcpy(tmp, saddr, 16);
62 memcpy(tmp + 4, daddr, 16);
63 tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
64 tmp[9] = count;
65 sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
66
67 return tmp[17];
68} 65}
69 66
70static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, 67static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index cb8929681dc7..49fa2e8c3fa9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -122,7 +122,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
122 struct flowi6 fl6; 122 struct flowi6 fl6;
123 struct dst_entry *dst; 123 struct dst_entry *dst;
124 int addr_type; 124 int addr_type;
125 u32 seq;
125 int err; 126 int err;
127 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
126 128
127 if (addr_len < SIN6_LEN_RFC2133) 129 if (addr_len < SIN6_LEN_RFC2133)
128 return -EINVAL; 130 return -EINVAL;
@@ -148,8 +150,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
148 * connect() to INADDR_ANY means loopback (BSD'ism). 150 * connect() to INADDR_ANY means loopback (BSD'ism).
149 */ 151 */
150 152
151 if (ipv6_addr_any(&usin->sin6_addr)) 153 if (ipv6_addr_any(&usin->sin6_addr)) {
152 usin->sin6_addr.s6_addr[15] = 0x1; 154 if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
155 ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
156 &usin->sin6_addr);
157 else
158 usin->sin6_addr = in6addr_loopback;
159 }
153 160
154 addr_type = ipv6_addr_type(&usin->sin6_addr); 161 addr_type = ipv6_addr_type(&usin->sin6_addr);
155 162
@@ -188,7 +195,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
188 * TCP over IPv4 195 * TCP over IPv4
189 */ 196 */
190 197
191 if (addr_type == IPV6_ADDR_MAPPED) { 198 if (addr_type & IPV6_ADDR_MAPPED) {
192 u32 exthdrlen = icsk->icsk_ext_hdr_len; 199 u32 exthdrlen = icsk->icsk_ext_hdr_len;
193 struct sockaddr_in sin; 200 struct sockaddr_in sin;
194 201
@@ -258,7 +265,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
258 sk->sk_gso_type = SKB_GSO_TCPV6; 265 sk->sk_gso_type = SKB_GSO_TCPV6;
259 ip6_dst_store(sk, dst, NULL, NULL); 266 ip6_dst_store(sk, dst, NULL, NULL);
260 267
261 if (tcp_death_row.sysctl_tw_recycle && 268 if (tcp_death_row->sysctl_tw_recycle &&
262 !tp->rx_opt.ts_recent_stamp && 269 !tp->rx_opt.ts_recent_stamp &&
263 ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr)) 270 ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr))
264 tcp_fetch_timewait_stamp(sk, dst); 271 tcp_fetch_timewait_stamp(sk, dst);
@@ -273,18 +280,26 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
273 inet->inet_dport = usin->sin6_port; 280 inet->inet_dport = usin->sin6_port;
274 281
275 tcp_set_state(sk, TCP_SYN_SENT); 282 tcp_set_state(sk, TCP_SYN_SENT);
276 err = inet6_hash_connect(&tcp_death_row, sk); 283 err = inet6_hash_connect(tcp_death_row, sk);
277 if (err) 284 if (err)
278 goto late_failure; 285 goto late_failure;
279 286
280 sk_set_txhash(sk); 287 sk_set_txhash(sk);
281 288
282 if (!tp->write_seq && likely(!tp->repair)) 289 if (likely(!tp->repair)) {
283 tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, 290 seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
284 sk->sk_v6_daddr.s6_addr32, 291 sk->sk_v6_daddr.s6_addr32,
285 inet->inet_sport, 292 inet->inet_sport,
286 inet->inet_dport, 293 inet->inet_dport,
287 &tp->tsoffset); 294 &tp->tsoffset);
295 if (!tp->write_seq)
296 tp->write_seq = seq;
297 }
298
299 if (tcp_fastopen_defer_connect(sk, &err))
300 return err;
301 if (err)
302 goto late_failure;
288 303
289 err = tcp_connect(sk); 304 err = tcp_connect(sk);
290 if (err) 305 if (err)
@@ -294,7 +309,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
294 309
295late_failure: 310late_failure:
296 tcp_set_state(sk, TCP_CLOSE); 311 tcp_set_state(sk, TCP_CLOSE);
297 __sk_dst_reset(sk);
298failure: 312failure:
299 inet->inet_dport = 0; 313 inet->inet_dport = 0;
300 sk->sk_route_caps = 0; 314 sk->sk_route_caps = 0;
@@ -377,10 +391,12 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
377 np = inet6_sk(sk); 391 np = inet6_sk(sk);
378 392
379 if (type == NDISC_REDIRECT) { 393 if (type == NDISC_REDIRECT) {
380 struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); 394 if (!sock_owned_by_user(sk)) {
395 struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
381 396
382 if (dst) 397 if (dst)
383 dst->ops->redirect(dst, sk, skb); 398 dst->ops->redirect(dst, sk, skb);
399 }
384 goto out; 400 goto out;
385 } 401 }
386 402
@@ -991,6 +1007,16 @@ drop:
991 return 0; /* don't send reset */ 1007 return 0; /* don't send reset */
992} 1008}
993 1009
1010static void tcp_v6_restore_cb(struct sk_buff *skb)
1011{
1012 /* We need to move header back to the beginning if xfrm6_policy_check()
1013 * and tcp_v6_fill_cb() are going to be called again.
1014 * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
1015 */
1016 memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
1017 sizeof(struct inet6_skb_parm));
1018}
1019
994static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1020static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
995 struct request_sock *req, 1021 struct request_sock *req,
996 struct dst_entry *dst, 1022 struct dst_entry *dst,
@@ -1142,10 +1168,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
1142 tcp_ca_openreq_child(newsk, dst); 1168 tcp_ca_openreq_child(newsk, dst);
1143 1169
1144 tcp_sync_mss(newsk, dst_mtu(dst)); 1170 tcp_sync_mss(newsk, dst_mtu(dst));
1145 newtp->advmss = dst_metric_advmss(dst); 1171 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1146 if (tcp_sk(sk)->rx_opt.user_mss &&
1147 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1148 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1149 1172
1150 tcp_initialize_rcv_mss(newsk); 1173 tcp_initialize_rcv_mss(newsk);
1151 1174
@@ -1182,8 +1205,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
1182 sk_gfp_mask(sk, GFP_ATOMIC)); 1205 sk_gfp_mask(sk, GFP_ATOMIC));
1183 consume_skb(ireq->pktopts); 1206 consume_skb(ireq->pktopts);
1184 ireq->pktopts = NULL; 1207 ireq->pktopts = NULL;
1185 if (newnp->pktoptions) 1208 if (newnp->pktoptions) {
1209 tcp_v6_restore_cb(newnp->pktoptions);
1186 skb_set_owner_r(newnp->pktoptions, newsk); 1210 skb_set_owner_r(newnp->pktoptions, newsk);
1211 }
1187 } 1212 }
1188 } 1213 }
1189 1214
@@ -1198,16 +1223,6 @@ out:
1198 return NULL; 1223 return NULL;
1199} 1224}
1200 1225
1201static void tcp_v6_restore_cb(struct sk_buff *skb)
1202{
1203 /* We need to move header back to the beginning if xfrm6_policy_check()
1204 * and tcp_v6_fill_cb() are going to be called again.
1205 * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
1206 */
1207 memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
1208 sizeof(struct inet6_skb_parm));
1209}
1210
1211/* The socket must have it's spinlock held when we get 1226/* The socket must have it's spinlock held when we get
1212 * here, unless it is a TCP_LISTEN socket. 1227 * here, unless it is a TCP_LISTEN socket.
1213 * 1228 *
@@ -1620,7 +1635,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
1620 .getsockopt = ipv6_getsockopt, 1635 .getsockopt = ipv6_getsockopt,
1621 .addr2sockaddr = inet6_csk_addr2sockaddr, 1636 .addr2sockaddr = inet6_csk_addr2sockaddr,
1622 .sockaddr_len = sizeof(struct sockaddr_in6), 1637 .sockaddr_len = sizeof(struct sockaddr_in6),
1623 .bind_conflict = inet6_csk_bind_conflict,
1624#ifdef CONFIG_COMPAT 1638#ifdef CONFIG_COMPAT
1625 .compat_setsockopt = compat_ipv6_setsockopt, 1639 .compat_setsockopt = compat_ipv6_setsockopt,
1626 .compat_getsockopt = compat_ipv6_getsockopt, 1640 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -1651,7 +1665,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
1651 .getsockopt = ipv6_getsockopt, 1665 .getsockopt = ipv6_getsockopt,
1652 .addr2sockaddr = inet6_csk_addr2sockaddr, 1666 .addr2sockaddr = inet6_csk_addr2sockaddr,
1653 .sockaddr_len = sizeof(struct sockaddr_in6), 1667 .sockaddr_len = sizeof(struct sockaddr_in6),
1654 .bind_conflict = inet6_csk_bind_conflict,
1655#ifdef CONFIG_COMPAT 1668#ifdef CONFIG_COMPAT
1656 .compat_setsockopt = compat_ipv6_setsockopt, 1669 .compat_setsockopt = compat_ipv6_setsockopt,
1657 .compat_getsockopt = compat_ipv6_getsockopt, 1670 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -1744,7 +1757,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
1744 srcp = ntohs(inet->inet_sport); 1757 srcp = ntohs(inet->inet_sport);
1745 1758
1746 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 1759 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
1747 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 1760 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
1748 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 1761 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1749 timer_active = 1; 1762 timer_active = 1;
1750 timer_expires = icsk->icsk_timeout; 1763 timer_expires = icsk->icsk_timeout;
@@ -1888,6 +1901,7 @@ struct proto tcpv6_prot = {
1888 .shutdown = tcp_shutdown, 1901 .shutdown = tcp_shutdown,
1889 .setsockopt = tcp_setsockopt, 1902 .setsockopt = tcp_setsockopt,
1890 .getsockopt = tcp_getsockopt, 1903 .getsockopt = tcp_getsockopt,
1904 .keepalive = tcp_set_keepalive,
1891 .recvmsg = tcp_recvmsg, 1905 .recvmsg = tcp_recvmsg,
1892 .sendmsg = tcp_sendmsg, 1906 .sendmsg = tcp_sendmsg,
1893 .sendpage = tcp_sendpage, 1907 .sendpage = tcp_sendpage,
@@ -1948,7 +1962,7 @@ static void __net_exit tcpv6_net_exit(struct net *net)
1948 1962
1949static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) 1963static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
1950{ 1964{
1951 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6); 1965 inet_twsk_purge(&tcp_hashinfo, AF_INET6);
1952} 1966}
1953 1967
1954static struct pernet_operations tcpv6_net_ops = { 1968static struct pernet_operations tcpv6_net_ops = {
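
The new .keepalive operation in tcpv6_prot gives the socket layer a hook to call when SO_KEEPALIVE is toggled, so TCP can start or stop its keepalive timer immediately. The dispatching side is not part of this diff; a hedged sketch of how it is presumably wired up in sock_setsockopt():

static void sock_set_keepalive_sketch(struct sock *sk, int enable)
{
	/* ASSUMPTION: shape of the SO_KEEPALIVE case in sock_setsockopt() */
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, enable);	/* tcp_set_keepalive() for TCP */
	sock_valbool_flag(sk, SOCK_KEEPOPEN, enable);
}
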
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4d5c4eee4b3f..e28082f0a307 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -55,6 +55,16 @@
55#include <trace/events/skb.h> 55#include <trace/events/skb.h>
56#include "udp_impl.h" 56#include "udp_impl.h"
57 57
58static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
59{
60#if defined(CONFIG_NET_L3_MASTER_DEV)
61 if (!net->ipv4.sysctl_udp_l3mdev_accept &&
62 skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
63 return true;
64#endif
65 return false;
66}
67
58static u32 udp6_ehashfn(const struct net *net, 68static u32 udp6_ehashfn(const struct net *net,
59 const struct in6_addr *laddr, 69 const struct in6_addr *laddr,
60 const u16 lport, 70 const u16 lport,
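
udp6_lib_exact_dif_match() mirrors the IPv4 helper: when the packet arrived through an L3 master device (VRF) and the udp_l3mdev_accept sysctl is off, socket lookup must require an exact device match even for sockets that are not bound to a device. In the hunks below, compute_score() therefore treats exact_dif like sk_bound_dev_if; the resulting device-match rule, distilled into a standalone sketch (not a verbatim kernel function):

static bool udp6_dev_match_sketch(const struct sock *sk, int dif, bool exact_dif)
{
	/* an unbound socket normally matches any incoming device ... */
	if (!sk->sk_bound_dev_if && !exact_dif)
		return true;
	/* ... but a bound socket, or any socket once exact_dif is set,
	 * must match the receiving device exactly */
	return sk->sk_bound_dev_if == dif;
}
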
@@ -103,7 +113,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
103 113
104 /* precompute partial secondary hash */ 114 /* precompute partial secondary hash */
105 udp_sk(sk)->udp_portaddr_hash = hash2_partial; 115 udp_sk(sk)->udp_portaddr_hash = hash2_partial;
106 return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr); 116 return udp_lib_get_port(sk, snum, hash2_nulladdr);
107} 117}
108 118
109static void udp_v6_rehash(struct sock *sk) 119static void udp_v6_rehash(struct sock *sk)
@@ -118,7 +128,7 @@ static void udp_v6_rehash(struct sock *sk)
118static int compute_score(struct sock *sk, struct net *net, 128static int compute_score(struct sock *sk, struct net *net,
119 const struct in6_addr *saddr, __be16 sport, 129 const struct in6_addr *saddr, __be16 sport,
120 const struct in6_addr *daddr, unsigned short hnum, 130 const struct in6_addr *daddr, unsigned short hnum,
121 int dif) 131 int dif, bool exact_dif)
122{ 132{
123 int score; 133 int score;
124 struct inet_sock *inet; 134 struct inet_sock *inet;
@@ -149,7 +159,7 @@ static int compute_score(struct sock *sk, struct net *net,
149 score++; 159 score++;
150 } 160 }
151 161
152 if (sk->sk_bound_dev_if) { 162 if (sk->sk_bound_dev_if || exact_dif) {
153 if (sk->sk_bound_dev_if != dif) 163 if (sk->sk_bound_dev_if != dif)
154 return -1; 164 return -1;
155 score++; 165 score++;
@@ -165,7 +175,7 @@ static int compute_score(struct sock *sk, struct net *net,
165static struct sock *udp6_lib_lookup2(struct net *net, 175static struct sock *udp6_lib_lookup2(struct net *net,
166 const struct in6_addr *saddr, __be16 sport, 176 const struct in6_addr *saddr, __be16 sport,
167 const struct in6_addr *daddr, unsigned int hnum, int dif, 177 const struct in6_addr *daddr, unsigned int hnum, int dif,
168 struct udp_hslot *hslot2, 178 bool exact_dif, struct udp_hslot *hslot2,
169 struct sk_buff *skb) 179 struct sk_buff *skb)
170{ 180{
171 struct sock *sk, *result; 181 struct sock *sk, *result;
@@ -176,7 +186,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
176 badness = -1; 186 badness = -1;
177 udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { 187 udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
178 score = compute_score(sk, net, saddr, sport, 188 score = compute_score(sk, net, saddr, sport,
179 daddr, hnum, dif); 189 daddr, hnum, dif, exact_dif);
180 if (score > badness) { 190 if (score > badness) {
181 reuseport = sk->sk_reuseport; 191 reuseport = sk->sk_reuseport;
182 if (reuseport) { 192 if (reuseport) {
@@ -212,6 +222,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
212 unsigned short hnum = ntohs(dport); 222 unsigned short hnum = ntohs(dport);
213 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); 223 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
214 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; 224 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
225 bool exact_dif = udp6_lib_exact_dif_match(net, skb);
215 int score, badness, matches = 0, reuseport = 0; 226 int score, badness, matches = 0, reuseport = 0;
216 u32 hash = 0; 227 u32 hash = 0;
217 228
@@ -223,7 +234,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
223 goto begin; 234 goto begin;
224 235
225 result = udp6_lib_lookup2(net, saddr, sport, 236 result = udp6_lib_lookup2(net, saddr, sport,
226 daddr, hnum, dif, 237 daddr, hnum, dif, exact_dif,
227 hslot2, skb); 238 hslot2, skb);
228 if (!result) { 239 if (!result) {
229 unsigned int old_slot2 = slot2; 240 unsigned int old_slot2 = slot2;
@@ -239,7 +250,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
239 250
240 result = udp6_lib_lookup2(net, saddr, sport, 251 result = udp6_lib_lookup2(net, saddr, sport,
241 daddr, hnum, dif, 252 daddr, hnum, dif,
242 hslot2, skb); 253 exact_dif, hslot2,
254 skb);
243 } 255 }
244 return result; 256 return result;
245 } 257 }
@@ -247,7 +259,8 @@ begin:
247 result = NULL; 259 result = NULL;
248 badness = -1; 260 badness = -1;
249 sk_for_each_rcu(sk, &hslot->head) { 261 sk_for_each_rcu(sk, &hslot->head) {
250 score = compute_score(sk, net, saddr, sport, daddr, hnum, dif); 262 score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
263 exact_dif);
251 if (score > badness) { 264 if (score > badness) {
252 reuseport = sk->sk_reuseport; 265 reuseport = sk->sk_reuseport;
253 if (reuseport) { 266 if (reuseport) {
@@ -441,7 +454,7 @@ try_again:
441 return err; 454 return err;
442 455
443csum_copy_err: 456csum_copy_err:
444 if (!__sk_queue_drop_skb(sk, skb, flags)) { 457 if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
445 if (is_udp4) { 458 if (is_udp4) {
446 UDP_INC_STATS(sock_net(sk), 459 UDP_INC_STATS(sock_net(sk),
447 UDP_MIB_CSUMERRORS, is_udplite); 460 UDP_MIB_CSUMERRORS, is_udplite);
@@ -1022,6 +1035,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1022 ipc6.hlimit = -1; 1035 ipc6.hlimit = -1;
1023 ipc6.tclass = -1; 1036 ipc6.tclass = -1;
1024 ipc6.dontfrag = -1; 1037 ipc6.dontfrag = -1;
1038 sockc.tsflags = sk->sk_tsflags;
1025 1039
1026 /* destination address check */ 1040 /* destination address check */
1027 if (sin6) { 1041 if (sin6) {
@@ -1033,6 +1047,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1033 if (addr_len < SIN6_LEN_RFC2133) 1047 if (addr_len < SIN6_LEN_RFC2133)
1034 return -EINVAL; 1048 return -EINVAL;
1035 daddr = &sin6->sin6_addr; 1049 daddr = &sin6->sin6_addr;
1050 if (ipv6_addr_any(daddr) &&
1051 ipv6_addr_v4mapped(&np->saddr))
1052 ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
1053 daddr);
1036 break; 1054 break;
1037 case AF_INET: 1055 case AF_INET:
1038 goto do_udp_sendmsg; 1056 goto do_udp_sendmsg;
@@ -1142,7 +1160,6 @@ do_udp_sendmsg:
1142 1160
1143 fl6.flowi6_mark = sk->sk_mark; 1161 fl6.flowi6_mark = sk->sk_mark;
1144 fl6.flowi6_uid = sk->sk_uid; 1162 fl6.flowi6_uid = sk->sk_uid;
1145 sockc.tsflags = sk->sk_tsflags;
1146 1163
1147 if (msg->msg_controllen) { 1164 if (msg->msg_controllen) {
1148 opt = &opt_space; 1165 opt = &opt_space;
@@ -1295,7 +1312,8 @@ out:
1295 return err; 1312 return err;
1296 1313
1297do_confirm: 1314do_confirm:
1298 dst_confirm(dst); 1315 if (msg->msg_flags & MSG_PROBE)
1316 dst_confirm_neigh(dst, &fl6.daddr);
1299 if (!(msg->msg_flags&MSG_PROBE) || len) 1317 if (!(msg->msg_flags&MSG_PROBE) || len)
1300 goto back_from_confirm; 1318 goto back_from_confirm;
1301 err = 0; 1319 err = 0;
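
The do_confirm change in udpv6_sendmsg() is part of a wider switch from dst_confirm(), which tagged the whole (possibly shared) dst entry, to dst_confirm_neigh(), which confirms only the neighbour entry for the destination actually being sent to, and the hunk makes it conditional on MSG_PROBE. A small sketch of the resulting behaviour; the helper name is illustrative:

static void udpv6_confirm_path_sketch(struct dst_entry *dst,
				      const struct msghdr *msg,
				      const struct in6_addr *daddr)
{
	/* confirm reachability of this specific neighbour; a shared dst can
	 * serve many destinations, so per-dst confirmation was unreliable */
	if (msg->msg_flags & MSG_PROBE)
		dst_confirm_neigh(dst, daddr);
}
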
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index b5789562aded..08a807b29298 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -33,6 +33,8 @@ EXPORT_SYMBOL(xfrm6_rcv_spi);
33 33
34int xfrm6_transport_finish(struct sk_buff *skb, int async) 34int xfrm6_transport_finish(struct sk_buff *skb, int async)
35{ 35{
36 struct xfrm_offload *xo = xfrm_offload(skb);
37
36 skb_network_header(skb)[IP6CB(skb)->nhoff] = 38 skb_network_header(skb)[IP6CB(skb)->nhoff] =
37 XFRM_MODE_SKB_CB(skb)->protocol; 39 XFRM_MODE_SKB_CB(skb)->protocol;
38 40
@@ -44,6 +46,11 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
44 ipv6_hdr(skb)->payload_len = htons(skb->len); 46 ipv6_hdr(skb)->payload_len = htons(skb->len);
45 __skb_push(skb, skb->data - skb_network_header(skb)); 47 __skb_push(skb, skb->data - skb_network_header(skb));
46 48
49 if (xo && (xo->flags & XFRM_GRO)) {
50 skb_mac_header_rebuild(skb);
51 return -1;
52 }
53
47 NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, 54 NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
48 dev_net(skb->dev), NULL, skb, skb->dev, NULL, 55 dev_net(skb->dev), NULL, skb, skb->dev, NULL,
49 ip6_rcv_finish); 56 ip6_rcv_finish);
@@ -69,18 +76,9 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
69 struct xfrm_state *x = NULL; 76 struct xfrm_state *x = NULL;
70 int i = 0; 77 int i = 0;
71 78
72 /* Allocate new secpath or COW existing one. */ 79 if (secpath_set(skb)) {
73 if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { 80 XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
74 struct sec_path *sp; 81 goto drop;
75
76 sp = secpath_dup(skb->sp);
77 if (!sp) {
78 XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
79 goto drop;
80 }
81 if (skb->sp)
82 secpath_put(skb->sp);
83 skb->sp = sp;
84 } 82 }
85 83
86 if (1 + skb->sp->len == XFRM_MAX_DEPTH) { 84 if (1 + skb->sp->len == XFRM_MAX_DEPTH) {
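
secpath_set() replaces the open-coded "allocate or COW the secpath" sequence that several input paths duplicated; the XFRM_GRO branch added earlier in the same file rebuilds the MAC header and returns early, apparently so GRO-resegmented packets are handed back to the offload path rather than re-injected through NF_INET_PRE_ROUTING. A sketch of what the helper consolidates; the body is the code removed above, while the name and return convention are taken from the call site:

static int secpath_set_sketch(struct sk_buff *skb)
{
	struct sec_path *sp;

	/* allocate a new secpath, or COW the existing one, so xfrm_state
	 * references can be appended without racing other users */
	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
		sp = secpath_dup(skb->sp);
		if (!sp)
			return -ENOMEM;
		if (skb->sp)
			secpath_put(skb->sp);
		skb->sp = sp;
	}
	return 0;
}
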
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
index 4e344105b3fd..4439ee44c8b0 100644
--- a/net/ipv6/xfrm6_mode_transport.c
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -47,6 +47,7 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
47static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) 47static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
48{ 48{
49 int ihl = skb->data - skb_transport_header(skb); 49 int ihl = skb->data - skb_transport_header(skb);
50 struct xfrm_offload *xo = xfrm_offload(skb);
50 51
51 if (skb->transport_header != skb->network_header) { 52 if (skb->transport_header != skb->network_header) {
52 memmove(skb_transport_header(skb), 53 memmove(skb_transport_header(skb),
@@ -55,7 +56,8 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
55 } 56 }
56 ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - 57 ipv6_hdr(skb)->payload_len = htons(skb->len + ihl -
57 sizeof(struct ipv6hdr)); 58 sizeof(struct ipv6hdr));
58 skb_reset_transport_header(skb); 59 if (!xo || !(xo->flags & XFRM_GRO))
60 skb_reset_transport_header(skb);
59 return 0; 61 return 0;
60} 62}
61 63
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index e0f71c01d728..79651bc71bf0 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -25,8 +25,6 @@
25#include <net/mip6.h> 25#include <net/mip6.h>
26#endif 26#endif
27 27
28static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
29
30static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, 28static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
31 const xfrm_address_t *saddr, 29 const xfrm_address_t *saddr,
32 const xfrm_address_t *daddr) 30 const xfrm_address_t *daddr)
@@ -220,7 +218,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
220{ 218{
221 struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); 219 struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
222 220
223 xfrm6_policy_afinfo.garbage_collect(net); 221 xfrm_garbage_collect_deferred(net);
224 return dst_entries_get_fast(ops) > ops->gc_thresh * 2; 222 return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
225} 223}
226 224
@@ -291,8 +289,7 @@ static struct dst_ops xfrm6_dst_ops_template = {
291 .gc_thresh = INT_MAX, 289 .gc_thresh = INT_MAX,
292}; 290};
293 291
294static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { 292static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
295 .family = AF_INET6,
296 .dst_ops = &xfrm6_dst_ops_template, 293 .dst_ops = &xfrm6_dst_ops_template,
297 .dst_lookup = xfrm6_dst_lookup, 294 .dst_lookup = xfrm6_dst_lookup,
298 .get_saddr = xfrm6_get_saddr, 295 .get_saddr = xfrm6_get_saddr,
@@ -305,7 +302,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
305 302
306static int __init xfrm6_policy_init(void) 303static int __init xfrm6_policy_init(void)
307{ 304{
308 return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo); 305 return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo, AF_INET6);
309} 306}
310 307
311static void xfrm6_policy_fini(void) 308static void xfrm6_policy_fini(void)
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index 54d13f8dbbae..b2dc8ce49378 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -162,9 +162,8 @@ static const struct inet6_protocol ipcomp6_protocol = {
162 .flags = INET6_PROTO_NOPOLICY, 162 .flags = INET6_PROTO_NOPOLICY,
163}; 163};
164 164
165static struct xfrm_input_afinfo xfrm6_input_afinfo = { 165static const struct xfrm_input_afinfo xfrm6_input_afinfo = {
166 .family = AF_INET6, 166 .family = AF_INET6,
167 .owner = THIS_MODULE,
168 .callback = xfrm6_rcv_cb, 167 .callback = xfrm6_rcv_cb,
169}; 168};
170 169
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index ab254041dab7..8d77ad5cadaf 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -46,6 +46,7 @@
46#include <linux/socket.h> 46#include <linux/socket.h>
47#include <linux/sockios.h> 47#include <linux/sockios.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/sched/signal.h>
49#include <linux/init.h> 50#include <linux/init.h>
50#include <linux/net.h> 51#include <linux/net.h>
51#include <linux/irda.h> 52#include <linux/irda.h>
@@ -827,7 +828,8 @@ out:
827 * Wait for incoming connection 828 * Wait for incoming connection
828 * 829 *
829 */ 830 */
830static int irda_accept(struct socket *sock, struct socket *newsock, int flags) 831static int irda_accept(struct socket *sock, struct socket *newsock, int flags,
832 bool kern)
831{ 833{
832 struct sock *sk = sock->sk; 834 struct sock *sk = sock->sk;
833 struct irda_sock *new, *self = irda_sk(sk); 835 struct irda_sock *new, *self = irda_sk(sk);
@@ -835,7 +837,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
835 struct sk_buff *skb = NULL; 837 struct sk_buff *skb = NULL;
836 int err; 838 int err;
837 839
838 err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); 840 err = irda_create(sock_net(sk), newsock, sk->sk_protocol, kern);
839 if (err) 841 if (err)
840 return err; 842 return err;
841 843
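
The extra bool kern argument to the accept() handler tells the protocol whether the accept originates from a kernel-internal socket; handlers forward it into their socket-creation path instead of hardcoding 0, as irda_accept() does above. The generic shape of the change, where some_proto_create() is a placeholder for the per-protocol constructor:

static int proto_accept_sketch(struct socket *sock, struct socket *newsock,
			       int flags, bool kern)
{
	/* some_proto_create() stands in for irda_create(), iucv creation, etc. */
	return some_proto_create(sock_net(sock->sk), newsock,
				 sock->sk->sk_protocol, kern);
}
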
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index 817b1b186aff..f6061c4bb0a8 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -32,7 +32,7 @@
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/sched.h> 35#include <linux/sched/signal.h>
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/termios.h> 37#include <linux/termios.h>
38#include <linux/tty.h> 38#include <linux/tty.h>
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index 1215693fdd22..7025dcb853d0 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -13,8 +13,9 @@
13 * 2) as a control channel (write commands, read events) 13 * 2) as a control channel (write commands, read events)
14 */ 14 */
15 15
16#include <linux/sched.h> 16#include <linux/sched/signal.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18
18#include "irnet_ppp.h" /* Private header */ 19#include "irnet_ppp.h" /* Private header */
19/* Please put other headers in irnet.h - Thanks */ 20/* Please put other headers in irnet.h - Thanks */
20 21
@@ -51,7 +52,7 @@ irnet_ctrl_write(irnet_socket * ap,
51 char * next; /* Next command to process */ 52 char * next; /* Next command to process */
52 int length; /* Length of current command */ 53 int length; /* Length of current command */
53 54
54 DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); 55 DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count);
55 56
56 /* Check for overflow... */ 57 /* Check for overflow... */
57 DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM, 58 DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM,
@@ -66,7 +67,7 @@ irnet_ctrl_write(irnet_socket * ap,
66 67
67 /* Safe terminate the string */ 68 /* Safe terminate the string */
68 command[count] = '\0'; 69 command[count] = '\0';
69 DEBUG(CTRL_INFO, "Command line received is ``%s'' (%Zd).\n", 70 DEBUG(CTRL_INFO, "Command line received is ``%s'' (%zd).\n",
70 command, count); 71 command, count);
71 72
72 /* Check every commands in the command line */ 73 /* Check every commands in the command line */
@@ -285,7 +286,7 @@ irnet_ctrl_read(irnet_socket * ap,
285 char event[75]; 286 char event[75];
286 ssize_t ret = 0; 287 ssize_t ret = 0;
287 288
288 DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); 289 DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count);
289 290
290#ifdef INITIAL_DISCOVERY 291#ifdef INITIAL_DISCOVERY
291 /* Check if we have read the log */ 292 /* Check if we have read the log */
@@ -328,7 +329,7 @@ irnet_ctrl_read(irnet_socket * ap,
328 if(ret != 0) 329 if(ret != 0)
329 { 330 {
330 /* No, return the error code */ 331 /* No, return the error code */
331 DEXIT(CTRL_TRACE, " - ret %Zd\n", ret); 332 DEXIT(CTRL_TRACE, " - ret %zd\n", ret);
332 return ret; 333 return ret;
333 } 334 }
334 335
@@ -568,7 +569,7 @@ dev_irnet_write(struct file * file,
568{ 569{
569 irnet_socket * ap = file->private_data; 570 irnet_socket * ap = file->private_data;
570 571
571 DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", 572 DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n",
572 file, ap, count); 573 file, ap, count);
573 DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); 574 DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
574 575
@@ -592,7 +593,7 @@ dev_irnet_read(struct file * file,
592{ 593{
593 irnet_socket * ap = file->private_data; 594 irnet_socket * ap = file->private_data;
594 595
595 DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", 596 DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n",
596 file, ap, count); 597 file, ap, count);
597 DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); 598 DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
598 599
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
index acbe61c7e683..160dc89335e2 100644
--- a/net/irda/irqueue.c
+++ b/net/irda/irqueue.c
@@ -383,9 +383,6 @@ EXPORT_SYMBOL(hashbin_new);
383 * for deallocating this structure if it's complex. If not the user can 383 * for deallocating this structure if it's complex. If not the user can
384 * just supply kfree, which should take care of the job. 384 * just supply kfree, which should take care of the job.
385 */ 385 */
386#ifdef CONFIG_LOCKDEP
387static int hashbin_lock_depth = 0;
388#endif
389int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func) 386int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
390{ 387{
391 irda_queue_t* queue; 388 irda_queue_t* queue;
@@ -396,22 +393,27 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
396 IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;); 393 IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;);
397 394
398 /* Synchronize */ 395 /* Synchronize */
399 if ( hashbin->hb_type & HB_LOCK ) { 396 if (hashbin->hb_type & HB_LOCK)
400 spin_lock_irqsave_nested(&hashbin->hb_spinlock, flags, 397 spin_lock_irqsave(&hashbin->hb_spinlock, flags);
401 hashbin_lock_depth++);
402 }
403 398
404 /* 399 /*
405 * Free the entries in the hashbin, TODO: use hashbin_clear when 400 * Free the entries in the hashbin, TODO: use hashbin_clear when
406 * it has been shown to work 401 * it has been shown to work
407 */ 402 */
408 for (i = 0; i < HASHBIN_SIZE; i ++ ) { 403 for (i = 0; i < HASHBIN_SIZE; i ++ ) {
409 queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]); 404 while (1) {
410 while (queue ) { 405 queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]);
411 if (free_func) 406
412 (*free_func)(queue); 407 if (!queue)
413 queue = dequeue_first( 408 break;
414 (irda_queue_t**) &hashbin->hb_queue[i]); 409
410 if (free_func) {
411 if (hashbin->hb_type & HB_LOCK)
412 spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
413 free_func(queue);
414 if (hashbin->hb_type & HB_LOCK)
415 spin_lock_irqsave(&hashbin->hb_spinlock, flags);
416 }
415 } 417 }
416 } 418 }
417 419
@@ -420,12 +422,8 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
420 hashbin->magic = ~HB_MAGIC; 422 hashbin->magic = ~HB_MAGIC;
421 423
422 /* Release lock */ 424 /* Release lock */
423 if ( hashbin->hb_type & HB_LOCK) { 425 if (hashbin->hb_type & HB_LOCK)
424 spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); 426 spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
425#ifdef CONFIG_LOCKDEP
426 hashbin_lock_depth--;
427#endif
428 }
429 427
430 /* 428 /*
431 * Free the hashbin structure 429 * Free the hashbin structure
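
The old hashbin_delete() silenced lockdep recursion warnings by feeding an ever-growing depth from a global counter into spin_lock_irqsave_nested(), which was itself racy. The rewrite drops the hashbin lock around the free_func() callback instead, since the callback may legitimately take hashbin locks of its own (nested hashbins do). The same pattern in generic form, with illustrative names that are not from the IrDA code:

static void drain_locked_list_sketch(spinlock_t *lock, struct list_head *head,
				     void (*free_cb)(struct list_head *))
{
	struct list_head *item;
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	while (!list_empty(head)) {
		item = head->next;
		list_del(item);
		/* run the callback unlocked: it may take locks of the same
		 * class or otherwise upset lockdep */
		spin_unlock_irqrestore(lock, flags);
		free_cb(item);
		spin_lock_irqsave(lock, flags);
	}
	spin_unlock_irqrestore(lock, flags);
}
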
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 13190b38f22e..84de7b6326dc 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -17,7 +17,7 @@
17#include <linux/list.h> 17#include <linux/list.h>
18#include <linux/errno.h> 18#include <linux/errno.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/sched.h> 20#include <linux/sched/signal.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/skbuff.h> 22#include <linux/skbuff.h>
23#include <linux/init.h> 23#include <linux/init.h>
@@ -938,7 +938,7 @@ done:
938 938
939/* Accept a pending connection */ 939/* Accept a pending connection */
940static int iucv_sock_accept(struct socket *sock, struct socket *newsock, 940static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
941 int flags) 941 int flags, bool kern)
942{ 942{
943 DECLARE_WAITQUEUE(wait, current); 943 DECLARE_WAITQUEUE(wait, current);
944 struct sock *sk = sock->sk, *nsk; 944 struct sock *sk = sock->sk, *nsk;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 7e08a4d3d77d..31762f76cdb5 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -24,6 +24,8 @@
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include <linux/syscalls.h> 26#include <linux/syscalls.h>
27#include <linux/sched/signal.h>
28
27#include <net/kcm.h> 29#include <net/kcm.h>
28#include <net/netns/generic.h> 30#include <net/netns/generic.h>
29#include <net/sock.h> 31#include <net/sock.h>
@@ -929,23 +931,25 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
929 goto out_error; 931 goto out_error;
930 } 932 }
931 933
932 /* New message, alloc head skb */ 934 if (msg_data_left(msg)) {
933 head = alloc_skb(0, sk->sk_allocation); 935 /* New message, alloc head skb */
934 while (!head) {
935 kcm_push(kcm);
936 err = sk_stream_wait_memory(sk, &timeo);
937 if (err)
938 goto out_error;
939
940 head = alloc_skb(0, sk->sk_allocation); 936 head = alloc_skb(0, sk->sk_allocation);
941 } 937 while (!head) {
938 kcm_push(kcm);
939 err = sk_stream_wait_memory(sk, &timeo);
940 if (err)
941 goto out_error;
942
943 head = alloc_skb(0, sk->sk_allocation);
944 }
942 945
943 skb = head; 946 skb = head;
944 947
945 /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling 948 /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
946 * csum_and_copy_from_iter from skb_do_copy_data_nocache. 949 * csum_and_copy_from_iter from skb_do_copy_data_nocache.
947 */ 950 */
948 skb->ip_summed = CHECKSUM_UNNECESSARY; 951 skb->ip_summed = CHECKSUM_UNNECESSARY;
952 }
949 953
950start: 954start:
951 while (msg_data_left(msg)) { 955 while (msg_data_left(msg)) {
@@ -1018,10 +1022,12 @@ wait_for_memory:
1018 if (eor) { 1022 if (eor) {
1019 bool not_busy = skb_queue_empty(&sk->sk_write_queue); 1023 bool not_busy = skb_queue_empty(&sk->sk_write_queue);
1020 1024
1021 /* Message complete, queue it on send buffer */ 1025 if (head) {
1022 __skb_queue_tail(&sk->sk_write_queue, head); 1026 /* Message complete, queue it on send buffer */
1023 kcm->seq_skb = NULL; 1027 __skb_queue_tail(&sk->sk_write_queue, head);
1024 KCM_STATS_INCR(kcm->stats.tx_msgs); 1028 kcm->seq_skb = NULL;
1029 KCM_STATS_INCR(kcm->stats.tx_msgs);
1030 }
1025 1031
1026 if (msg->msg_flags & MSG_BATCH) { 1032 if (msg->msg_flags & MSG_BATCH) {
1027 kcm->tx_wait_more = true; 1033 kcm->tx_wait_more = true;
@@ -1040,8 +1046,10 @@ wait_for_memory:
1040 } else { 1046 } else {
1041 /* Message not complete, save state */ 1047 /* Message not complete, save state */
1042partial_message: 1048partial_message:
1043 kcm->seq_skb = head; 1049 if (head) {
1044 kcm_tx_msg(head)->last_skb = skb; 1050 kcm->seq_skb = head;
1051 kcm_tx_msg(head)->last_skb = skb;
1052 }
1045 } 1053 }
1046 1054
1047 KCM_STATS_ADD(kcm->stats.tx_bytes, copied); 1055 KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
@@ -1679,7 +1687,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1679 struct kcm_attach info; 1687 struct kcm_attach info;
1680 1688
1681 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) 1689 if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
1682 err = -EFAULT; 1690 return -EFAULT;
1683 1691
1684 err = kcm_attach_ioctl(sock, &info); 1692 err = kcm_attach_ioctl(sock, &info);
1685 1693
@@ -1689,7 +1697,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1689 struct kcm_unattach info; 1697 struct kcm_unattach info;
1690 1698
1691 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) 1699 if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
1692 err = -EFAULT; 1700 return -EFAULT;
1693 1701
1694 err = kcm_unattach_ioctl(sock, &info); 1702 err = kcm_unattach_ioctl(sock, &info);
1695 1703
@@ -1700,7 +1708,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1700 struct socket *newsock = NULL; 1708 struct socket *newsock = NULL;
1701 1709
1702 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) 1710 if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
1703 err = -EFAULT; 1711 return -EFAULT;
1704 1712
1705 err = kcm_clone(sock, &info, &newsock); 1713 err = kcm_clone(sock, &info, &newsock);
1706 1714
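
Previously a faulting copy_from_user() in kcm_ioctl() only set err to -EFAULT and fell through, so the handler ran on a partially copied structure and its return value overwrote the -EFAULT. Returning immediately is the usual pattern; the SIOCKCMATTACH case, as it reads after the fix:

	case SIOCKCMATTACH: {
		struct kcm_attach info;

		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
			return -EFAULT;

		err = kcm_attach_ioctl(sock, &info);
		break;
	}
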
diff --git a/net/key/af_key.c b/net/key/af_key.c
index c6252ed42c1d..be8cecc65002 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -63,8 +63,13 @@ struct pfkey_sock {
63 } u; 63 } u;
64 struct sk_buff *skb; 64 struct sk_buff *skb;
65 } dump; 65 } dump;
66 struct mutex dump_lock;
66}; 67};
67 68
69static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
70 xfrm_address_t *saddr, xfrm_address_t *daddr,
71 u16 *family);
72
68static inline struct pfkey_sock *pfkey_sk(struct sock *sk) 73static inline struct pfkey_sock *pfkey_sk(struct sock *sk)
69{ 74{
70 return (struct pfkey_sock *)sk; 75 return (struct pfkey_sock *)sk;
@@ -139,6 +144,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
139{ 144{
140 struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); 145 struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
141 struct sock *sk; 146 struct sock *sk;
147 struct pfkey_sock *pfk;
142 int err; 148 int err;
143 149
144 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 150 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -153,6 +159,9 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
153 if (sk == NULL) 159 if (sk == NULL)
154 goto out; 160 goto out;
155 161
162 pfk = pfkey_sk(sk);
163 mutex_init(&pfk->dump_lock);
164
156 sock->ops = &pfkey_ops; 165 sock->ops = &pfkey_ops;
157 sock_init_data(sock, sk); 166 sock_init_data(sock, sk);
158 167
@@ -281,13 +290,23 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
281 struct sadb_msg *hdr; 290 struct sadb_msg *hdr;
282 int rc; 291 int rc;
283 292
293 mutex_lock(&pfk->dump_lock);
294 if (!pfk->dump.dump) {
295 rc = 0;
296 goto out;
297 }
298
284 rc = pfk->dump.dump(pfk); 299 rc = pfk->dump.dump(pfk);
285 if (rc == -ENOBUFS) 300 if (rc == -ENOBUFS) {
286 return 0; 301 rc = 0;
302 goto out;
303 }
287 304
288 if (pfk->dump.skb) { 305 if (pfk->dump.skb) {
289 if (!pfkey_can_dump(&pfk->sk)) 306 if (!pfkey_can_dump(&pfk->sk)) {
290 return 0; 307 rc = 0;
308 goto out;
309 }
291 310
292 hdr = (struct sadb_msg *) pfk->dump.skb->data; 311 hdr = (struct sadb_msg *) pfk->dump.skb->data;
293 hdr->sadb_msg_seq = 0; 312 hdr->sadb_msg_seq = 0;
@@ -298,6 +317,9 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
298 } 317 }
299 318
300 pfkey_terminate_dump(pfk); 319 pfkey_terminate_dump(pfk);
320
321out:
322 mutex_unlock(&pfk->dump_lock);
301 return rc; 323 return rc;
302} 324}
303 325
@@ -1793,19 +1815,26 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms
1793 struct xfrm_address_filter *filter = NULL; 1815 struct xfrm_address_filter *filter = NULL;
1794 struct pfkey_sock *pfk = pfkey_sk(sk); 1816 struct pfkey_sock *pfk = pfkey_sk(sk);
1795 1817
1796 if (pfk->dump.dump != NULL) 1818 mutex_lock(&pfk->dump_lock);
1819 if (pfk->dump.dump != NULL) {
1820 mutex_unlock(&pfk->dump_lock);
1797 return -EBUSY; 1821 return -EBUSY;
1822 }
1798 1823
1799 proto = pfkey_satype2proto(hdr->sadb_msg_satype); 1824 proto = pfkey_satype2proto(hdr->sadb_msg_satype);
1800 if (proto == 0) 1825 if (proto == 0) {
1826 mutex_unlock(&pfk->dump_lock);
1801 return -EINVAL; 1827 return -EINVAL;
1828 }
1802 1829
1803 if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { 1830 if (ext_hdrs[SADB_X_EXT_FILTER - 1]) {
1804 struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; 1831 struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1];
1805 1832
1806 filter = kmalloc(sizeof(*filter), GFP_KERNEL); 1833 filter = kmalloc(sizeof(*filter), GFP_KERNEL);
1807 if (filter == NULL) 1834 if (filter == NULL) {
1835 mutex_unlock(&pfk->dump_lock);
1808 return -ENOMEM; 1836 return -ENOMEM;
1837 }
1809 1838
1810 memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr, 1839 memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr,
1811 sizeof(xfrm_address_t)); 1840 sizeof(xfrm_address_t));
@@ -1821,6 +1850,7 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms
1821 pfk->dump.dump = pfkey_dump_sa; 1850 pfk->dump.dump = pfkey_dump_sa;
1822 pfk->dump.done = pfkey_dump_sa_done; 1851 pfk->dump.done = pfkey_dump_sa_done;
1823 xfrm_state_walk_init(&pfk->dump.u.state, proto, filter); 1852 xfrm_state_walk_init(&pfk->dump.u.state, proto, filter);
1853 mutex_unlock(&pfk->dump_lock);
1824 1854
1825 return pfkey_do_dump(pfk); 1855 return pfkey_do_dump(pfk);
1826} 1856}
@@ -1913,19 +1943,14 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq)
1913 1943
1914 /* addresses present only in tunnel mode */ 1944 /* addresses present only in tunnel mode */
1915 if (t->mode == XFRM_MODE_TUNNEL) { 1945 if (t->mode == XFRM_MODE_TUNNEL) {
1916 u8 *sa = (u8 *) (rq + 1); 1946 int err;
1917 int family, socklen;
1918 1947
1919 family = pfkey_sockaddr_extract((struct sockaddr *)sa, 1948 err = parse_sockaddr_pair(
1920 &t->saddr); 1949 (struct sockaddr *)(rq + 1),
1921 if (!family) 1950 rq->sadb_x_ipsecrequest_len - sizeof(*rq),
1922 return -EINVAL; 1951 &t->saddr, &t->id.daddr, &t->encap_family);
1923 1952 if (err)
1924 socklen = pfkey_sockaddr_len(family); 1953 return err;
1925 if (pfkey_sockaddr_extract((struct sockaddr *)(sa + socklen),
1926 &t->id.daddr) != family)
1927 return -EINVAL;
1928 t->encap_family = family;
1929 } else 1954 } else
1930 t->encap_family = xp->family; 1955 t->encap_family = xp->family;
1931 1956
@@ -1945,7 +1970,11 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
1945 if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy)) 1970 if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy))
1946 return -EINVAL; 1971 return -EINVAL;
1947 1972
1948 while (len >= sizeof(struct sadb_x_ipsecrequest)) { 1973 while (len >= sizeof(*rq)) {
1974 if (len < rq->sadb_x_ipsecrequest_len ||
1975 rq->sadb_x_ipsecrequest_len < sizeof(*rq))
1976 return -EINVAL;
1977
1949 if ((err = parse_ipsecrequest(xp, rq)) < 0) 1978 if ((err = parse_ipsecrequest(xp, rq)) < 0)
1950 return err; 1979 return err;
1951 len -= rq->sadb_x_ipsecrequest_len; 1980 len -= rq->sadb_x_ipsecrequest_len;
@@ -2408,7 +2437,6 @@ out:
2408 return err; 2437 return err;
2409} 2438}
2410 2439
2411#ifdef CONFIG_NET_KEY_MIGRATE
2412static int pfkey_sockaddr_pair_size(sa_family_t family) 2440static int pfkey_sockaddr_pair_size(sa_family_t family)
2413{ 2441{
2414 return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2); 2442 return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2);
@@ -2420,7 +2448,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
2420{ 2448{
2421 int af, socklen; 2449 int af, socklen;
2422 2450
2423 if (ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) 2451 if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family))
2424 return -EINVAL; 2452 return -EINVAL;
2425 2453
2426 af = pfkey_sockaddr_extract(sa, saddr); 2454 af = pfkey_sockaddr_extract(sa, saddr);
@@ -2436,6 +2464,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
2436 return 0; 2464 return 0;
2437} 2465}
2438 2466
2467#ifdef CONFIG_NET_KEY_MIGRATE
2439static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, 2468static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
2440 struct xfrm_migrate *m) 2469 struct xfrm_migrate *m)
2441{ 2470{
@@ -2443,13 +2472,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
2443 struct sadb_x_ipsecrequest *rq2; 2472 struct sadb_x_ipsecrequest *rq2;
2444 int mode; 2473 int mode;
2445 2474
2446 if (len <= sizeof(struct sadb_x_ipsecrequest) || 2475 if (len < sizeof(*rq1) ||
2447 len < rq1->sadb_x_ipsecrequest_len) 2476 len < rq1->sadb_x_ipsecrequest_len ||
2477 rq1->sadb_x_ipsecrequest_len < sizeof(*rq1))
2448 return -EINVAL; 2478 return -EINVAL;
2449 2479
2450 /* old endoints */ 2480 /* old endoints */
2451 err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1), 2481 err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1),
2452 rq1->sadb_x_ipsecrequest_len, 2482 rq1->sadb_x_ipsecrequest_len - sizeof(*rq1),
2453 &m->old_saddr, &m->old_daddr, 2483 &m->old_saddr, &m->old_daddr,
2454 &m->old_family); 2484 &m->old_family);
2455 if (err) 2485 if (err)
@@ -2458,13 +2488,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
2458 rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len); 2488 rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len);
2459 len -= rq1->sadb_x_ipsecrequest_len; 2489 len -= rq1->sadb_x_ipsecrequest_len;
2460 2490
2461 if (len <= sizeof(struct sadb_x_ipsecrequest) || 2491 if (len <= sizeof(*rq2) ||
2462 len < rq2->sadb_x_ipsecrequest_len) 2492 len < rq2->sadb_x_ipsecrequest_len ||
2493 rq2->sadb_x_ipsecrequest_len < sizeof(*rq2))
2463 return -EINVAL; 2494 return -EINVAL;
2464 2495
2465 /* new endpoints */ 2496 /* new endpoints */
2466 err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1), 2497 err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1),
2467 rq2->sadb_x_ipsecrequest_len, 2498 rq2->sadb_x_ipsecrequest_len - sizeof(*rq2),
2468 &m->new_saddr, &m->new_daddr, 2499 &m->new_saddr, &m->new_daddr,
2469 &m->new_family); 2500 &m->new_family);
2470 if (err) 2501 if (err)
@@ -2679,14 +2710,18 @@ static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb
2679{ 2710{
2680 struct pfkey_sock *pfk = pfkey_sk(sk); 2711 struct pfkey_sock *pfk = pfkey_sk(sk);
2681 2712
2682 if (pfk->dump.dump != NULL) 2713 mutex_lock(&pfk->dump_lock);
2714 if (pfk->dump.dump != NULL) {
2715 mutex_unlock(&pfk->dump_lock);
2683 return -EBUSY; 2716 return -EBUSY;
2717 }
2684 2718
2685 pfk->dump.msg_version = hdr->sadb_msg_version; 2719 pfk->dump.msg_version = hdr->sadb_msg_version;
2686 pfk->dump.msg_portid = hdr->sadb_msg_pid; 2720 pfk->dump.msg_portid = hdr->sadb_msg_pid;
2687 pfk->dump.dump = pfkey_dump_sp; 2721 pfk->dump.dump = pfkey_dump_sp;
2688 pfk->dump.done = pfkey_dump_sp_done; 2722 pfk->dump.done = pfkey_dump_sp_done;
2689 xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN); 2723 xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN);
2724 mutex_unlock(&pfk->dump_lock);
2690 2725
2691 return pfkey_do_dump(pfk); 2726 return pfkey_do_dump(pfk);
2692} 2727}
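
Two independent hardenings land in af_key: a per-socket dump_lock mutex serializes SADB_DUMP/SADB_X_SPDDUMP setup against pfkey_do_dump(), and sadb_x_ipsecrequest parsing now validates the self-declared length before trusting it, passing only the payload size (excluding the header) to parse_sockaddr_pair(). The resulting TLV-style walk in parse_ipsecrequests(), reconstructed from the hunk above with the usual pointer advance included:

	while (len >= sizeof(*rq)) {
		if (len < rq->sadb_x_ipsecrequest_len ||
		    rq->sadb_x_ipsecrequest_len < sizeof(*rq))
			return -EINVAL;

		err = parse_ipsecrequest(xp, rq);
		if (err < 0)
			return err;

		len -= rq->sadb_x_ipsecrequest_len;
		rq = (struct sadb_x_ipsecrequest *)((u8 *)rq +
						    rq->sadb_x_ipsecrequest_len);
	}
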
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 85948c69b236..e37d9554da7b 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -278,7 +278,57 @@ struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunn
278} 278}
279EXPORT_SYMBOL_GPL(l2tp_session_find); 279EXPORT_SYMBOL_GPL(l2tp_session_find);
280 280
281struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth) 281/* Like l2tp_session_find() but takes a reference on the returned session.
282 * Optionally calls session->ref() too if do_ref is true.
283 */
284struct l2tp_session *l2tp_session_get(struct net *net,
285 struct l2tp_tunnel *tunnel,
286 u32 session_id, bool do_ref)
287{
288 struct hlist_head *session_list;
289 struct l2tp_session *session;
290
291 if (!tunnel) {
292 struct l2tp_net *pn = l2tp_pernet(net);
293
294 session_list = l2tp_session_id_hash_2(pn, session_id);
295
296 rcu_read_lock_bh();
297 hlist_for_each_entry_rcu(session, session_list, global_hlist) {
298 if (session->session_id == session_id) {
299 l2tp_session_inc_refcount(session);
300 if (do_ref && session->ref)
301 session->ref(session);
302 rcu_read_unlock_bh();
303
304 return session;
305 }
306 }
307 rcu_read_unlock_bh();
308
309 return NULL;
310 }
311
312 session_list = l2tp_session_id_hash(tunnel, session_id);
313 read_lock_bh(&tunnel->hlist_lock);
314 hlist_for_each_entry(session, session_list, hlist) {
315 if (session->session_id == session_id) {
316 l2tp_session_inc_refcount(session);
317 if (do_ref && session->ref)
318 session->ref(session);
319 read_unlock_bh(&tunnel->hlist_lock);
320
321 return session;
322 }
323 }
324 read_unlock_bh(&tunnel->hlist_lock);
325
326 return NULL;
327}
328EXPORT_SYMBOL_GPL(l2tp_session_get);
329
330struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth,
331 bool do_ref)
282{ 332{
283 int hash; 333 int hash;
284 struct l2tp_session *session; 334 struct l2tp_session *session;
@@ -288,6 +338,9 @@ struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
288 for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { 338 for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
289 hlist_for_each_entry(session, &tunnel->session_hlist[hash], hlist) { 339 hlist_for_each_entry(session, &tunnel->session_hlist[hash], hlist) {
290 if (++count > nth) { 340 if (++count > nth) {
341 l2tp_session_inc_refcount(session);
342 if (do_ref && session->ref)
343 session->ref(session);
291 read_unlock_bh(&tunnel->hlist_lock); 344 read_unlock_bh(&tunnel->hlist_lock);
292 return session; 345 return session;
293 } 346 }
@@ -298,12 +351,13 @@ struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
298 351
299 return NULL; 352 return NULL;
300} 353}
301EXPORT_SYMBOL_GPL(l2tp_session_find_nth); 354EXPORT_SYMBOL_GPL(l2tp_session_get_nth);
302 355
303/* Lookup a session by interface name. 356/* Lookup a session by interface name.
304 * This is very inefficient but is only used by management interfaces. 357 * This is very inefficient but is only used by management interfaces.
305 */ 358 */
306struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname) 359struct l2tp_session *l2tp_session_get_by_ifname(struct net *net, char *ifname,
360 bool do_ref)
307{ 361{
308 struct l2tp_net *pn = l2tp_pernet(net); 362 struct l2tp_net *pn = l2tp_pernet(net);
309 int hash; 363 int hash;
@@ -313,7 +367,11 @@ struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
313 for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) { 367 for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) {
314 hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) { 368 hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) {
315 if (!strcmp(session->ifname, ifname)) { 369 if (!strcmp(session->ifname, ifname)) {
370 l2tp_session_inc_refcount(session);
371 if (do_ref && session->ref)
372 session->ref(session);
316 rcu_read_unlock_bh(); 373 rcu_read_unlock_bh();
374
317 return session; 375 return session;
318 } 376 }
319 } 377 }
@@ -323,7 +381,49 @@ struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
323 381
324 return NULL; 382 return NULL;
325} 383}
326EXPORT_SYMBOL_GPL(l2tp_session_find_by_ifname); 384EXPORT_SYMBOL_GPL(l2tp_session_get_by_ifname);
385
386static int l2tp_session_add_to_tunnel(struct l2tp_tunnel *tunnel,
387 struct l2tp_session *session)
388{
389 struct l2tp_session *session_walk;
390 struct hlist_head *g_head;
391 struct hlist_head *head;
392 struct l2tp_net *pn;
393
394 head = l2tp_session_id_hash(tunnel, session->session_id);
395
396 write_lock_bh(&tunnel->hlist_lock);
397 hlist_for_each_entry(session_walk, head, hlist)
398 if (session_walk->session_id == session->session_id)
399 goto exist;
400
401 if (tunnel->version == L2TP_HDR_VER_3) {
402 pn = l2tp_pernet(tunnel->l2tp_net);
403 g_head = l2tp_session_id_hash_2(l2tp_pernet(tunnel->l2tp_net),
404 session->session_id);
405
406 spin_lock_bh(&pn->l2tp_session_hlist_lock);
407 hlist_for_each_entry(session_walk, g_head, global_hlist)
408 if (session_walk->session_id == session->session_id)
409 goto exist_glob;
410
411 hlist_add_head_rcu(&session->global_hlist, g_head);
412 spin_unlock_bh(&pn->l2tp_session_hlist_lock);
413 }
414
415 hlist_add_head(&session->hlist, head);
416 write_unlock_bh(&tunnel->hlist_lock);
417
418 return 0;
419
420exist_glob:
421 spin_unlock_bh(&pn->l2tp_session_hlist_lock);
422exist:
423 write_unlock_bh(&tunnel->hlist_lock);
424
425 return -EEXIST;
426}
327 427
328/* Lookup a tunnel by id 428/* Lookup a tunnel by id
329 */ 429 */
@@ -633,6 +733,9 @@ discard:
633 * a data (not control) frame before coming here. Fields up to the 733 * a data (not control) frame before coming here. Fields up to the
634 * session-id have already been parsed and ptr points to the data 734 * session-id have already been parsed and ptr points to the data
635 * after the session-id. 735 * after the session-id.
736 *
737 * session->ref() must have been called prior to l2tp_recv_common().
738 * session->deref() will be called automatically after skb is processed.
636 */ 739 */
637void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, 740void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
638 unsigned char *ptr, unsigned char *optr, u16 hdrflags, 741 unsigned char *ptr, unsigned char *optr, u16 hdrflags,
@@ -642,14 +745,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
642 int offset; 745 int offset;
643 u32 ns, nr; 746 u32 ns, nr;
644 747
645 /* The ref count is increased since we now hold a pointer to
646 * the session. Take care to decrement the refcnt when exiting
647 * this function from now on...
648 */
649 l2tp_session_inc_refcount(session);
650 if (session->ref)
651 (*session->ref)(session);
652
653 /* Parse and check optional cookie */ 748 /* Parse and check optional cookie */
654 if (session->peer_cookie_len > 0) { 749 if (session->peer_cookie_len > 0) {
655 if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) { 750 if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) {
@@ -802,8 +897,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
802 /* Try to dequeue as many skbs from reorder_q as we can. */ 897 /* Try to dequeue as many skbs from reorder_q as we can. */
803 l2tp_recv_dequeue(session); 898 l2tp_recv_dequeue(session);
804 899
805 l2tp_session_dec_refcount(session);
806
807 return; 900 return;
808 901
809discard: 902discard:
@@ -812,8 +905,6 @@ discard:
812 905
813 if (session->deref) 906 if (session->deref)
814 (*session->deref)(session); 907 (*session->deref)(session);
815
816 l2tp_session_dec_refcount(session);
817} 908}
818EXPORT_SYMBOL(l2tp_recv_common); 909EXPORT_SYMBOL(l2tp_recv_common);
819 910
@@ -920,8 +1011,14 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
920 } 1011 }
921 1012
922 /* Find the session context */ 1013 /* Find the session context */
923 session = l2tp_session_find(tunnel->l2tp_net, tunnel, session_id); 1014 session = l2tp_session_get(tunnel->l2tp_net, tunnel, session_id, true);
924 if (!session || !session->recv_skb) { 1015 if (!session || !session->recv_skb) {
1016 if (session) {
1017 if (session->deref)
1018 session->deref(session);
1019 l2tp_session_dec_refcount(session);
1020 }
1021
925 /* Not found? Pass to userspace to deal with */ 1022 /* Not found? Pass to userspace to deal with */
926 l2tp_info(tunnel, L2TP_MSG_DATA, 1023 l2tp_info(tunnel, L2TP_MSG_DATA,
927 "%s: no session found (%u/%u). Passing up.\n", 1024 "%s: no session found (%u/%u). Passing up.\n",
@@ -930,6 +1027,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
930 } 1027 }
931 1028
932 l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook); 1029 l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook);
1030 l2tp_session_dec_refcount(session);
933 1031
934 return 0; 1032 return 0;
935 1033
@@ -1058,10 +1156,10 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
1058 1156
1059 /* Debug */ 1157 /* Debug */
1060 if (session->send_seq) 1158 if (session->send_seq)
1061 l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes, ns=%u\n", 1159 l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes, ns=%u\n",
1062 session->name, data_len, session->ns - 1); 1160 session->name, data_len, session->ns - 1);
1063 else 1161 else
1064 l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes\n", 1162 l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes\n",
1065 session->name, data_len); 1163 session->name, data_len);
1066 1164
1067 if (session->debug & L2TP_MSG_DATA) { 1165 if (session->debug & L2TP_MSG_DATA) {
@@ -1317,6 +1415,9 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
1317 struct sock *sk = NULL; 1415 struct sock *sk = NULL;
1318 1416
1319 tunnel = container_of(work, struct l2tp_tunnel, del_work); 1417 tunnel = container_of(work, struct l2tp_tunnel, del_work);
1418
1419 l2tp_tunnel_closeall(tunnel);
1420
1320 sk = l2tp_tunnel_sock_lookup(tunnel); 1421 sk = l2tp_tunnel_sock_lookup(tunnel);
1321 if (!sk) 1422 if (!sk)
1322 goto out; 1423 goto out;
@@ -1639,7 +1740,6 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
1639int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) 1740int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
1640{ 1741{
1641 l2tp_tunnel_inc_refcount(tunnel); 1742 l2tp_tunnel_inc_refcount(tunnel);
1642 l2tp_tunnel_closeall(tunnel);
1643 if (false == queue_work(l2tp_wq, &tunnel->del_work)) { 1743 if (false == queue_work(l2tp_wq, &tunnel->del_work)) {
1644 l2tp_tunnel_dec_refcount(tunnel); 1744 l2tp_tunnel_dec_refcount(tunnel);
1645 return 1; 1745 return 1;
@@ -1736,6 +1836,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_set_header_len);
1736struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) 1836struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
1737{ 1837{
1738 struct l2tp_session *session; 1838 struct l2tp_session *session;
1839 int err;
1739 1840
1740 session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL); 1841 session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL);
1741 if (session != NULL) { 1842 if (session != NULL) {
@@ -1791,6 +1892,13 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
1791 1892
1792 l2tp_session_set_header_len(session, tunnel->version); 1893 l2tp_session_set_header_len(session, tunnel->version);
1793 1894
1895 err = l2tp_session_add_to_tunnel(tunnel, session);
1896 if (err) {
1897 kfree(session);
1898
1899 return ERR_PTR(err);
1900 }
1901
1794 /* Bump the reference count. The session context is deleted 1902 /* Bump the reference count. The session context is deleted
1795 * only when this drops to zero. 1903 * only when this drops to zero.
1796 */ 1904 */
@@ -1800,28 +1908,14 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
1800 /* Ensure tunnel socket isn't deleted */ 1908 /* Ensure tunnel socket isn't deleted */
1801 sock_hold(tunnel->sock); 1909 sock_hold(tunnel->sock);
1802 1910
1803 /* Add session to the tunnel's hash list */
1804 write_lock_bh(&tunnel->hlist_lock);
1805 hlist_add_head(&session->hlist,
1806 l2tp_session_id_hash(tunnel, session_id));
1807 write_unlock_bh(&tunnel->hlist_lock);
1808
1809 /* And to the global session list if L2TPv3 */
1810 if (tunnel->version != L2TP_HDR_VER_2) {
1811 struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
1812
1813 spin_lock_bh(&pn->l2tp_session_hlist_lock);
1814 hlist_add_head_rcu(&session->global_hlist,
1815 l2tp_session_id_hash_2(pn, session_id));
1816 spin_unlock_bh(&pn->l2tp_session_hlist_lock);
1817 }
1818
1819 /* Ignore management session in session count value */ 1911 /* Ignore management session in session count value */
1820 if (session->session_id != 0) 1912 if (session->session_id != 0)
1821 atomic_inc(&l2tp_session_count); 1913 atomic_inc(&l2tp_session_count);
1914
1915 return session;
1822 } 1916 }
1823 1917
1824 return session; 1918 return ERR_PTR(-ENOMEM);
1825} 1919}
1826EXPORT_SYMBOL_GPL(l2tp_session_create); 1920EXPORT_SYMBOL_GPL(l2tp_session_create);
1827 1921
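
The l2tp rework replaces the reference-free l2tp_session_find*() lookups with getters that take a refcount (and optionally the pseudowire's session->ref()) while still holding the lookup lock, closing a use-after-free window between lookup and use; l2tp_session_add_to_tunnel() additionally makes the duplicate-ID check atomic with the hash insertion, so two sessions with the same ID can no longer be created concurrently. The receive-path caller contract, as it appears in the l2tp_ip hunk further down:

	session = l2tp_session_get(net, NULL, session_id, true);
	if (!session)
		goto discard;

	/* l2tp_recv_common() calls session->deref() itself once the skb has
	 * been handled; the caller only drops the refcount taken above */
	l2tp_recv_common(session, skb, ptr, optr, 0, skb->len,
			 tunnel->recv_payload_hook);
	l2tp_session_dec_refcount(session);
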
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 8f560f7140a0..8ce7818c7a9d 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -230,11 +230,16 @@ out:
230 return tunnel; 230 return tunnel;
231} 231}
232 232
233struct l2tp_session *l2tp_session_get(struct net *net,
234 struct l2tp_tunnel *tunnel,
235 u32 session_id, bool do_ref);
233struct l2tp_session *l2tp_session_find(struct net *net, 236struct l2tp_session *l2tp_session_find(struct net *net,
234 struct l2tp_tunnel *tunnel, 237 struct l2tp_tunnel *tunnel,
235 u32 session_id); 238 u32 session_id);
236struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth); 239struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth,
237struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname); 240 bool do_ref);
241struct l2tp_session *l2tp_session_get_by_ifname(struct net *net, char *ifname,
242 bool do_ref);
238struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id); 243struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id);
239struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth); 244struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth);
240 245
@@ -263,6 +268,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb,
263int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, 268int l2tp_nl_register_ops(enum l2tp_pwtype pw_type,
264 const struct l2tp_nl_cmd_ops *ops); 269 const struct l2tp_nl_cmd_ops *ops);
265void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); 270void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
271int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg);
266 272
267/* Session reference counts. Incremented when code obtains a reference 273/* Session reference counts. Incremented when code obtains a reference
268 * to a session. 274 * to a session.
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 2d6760a2ae34..d100aed3d06f 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -53,7 +53,7 @@ static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
53 53
54static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd) 54static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd)
55{ 55{
56 pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx); 56 pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx, true);
57 pd->session_idx++; 57 pd->session_idx++;
58 58
59 if (pd->session == NULL) { 59 if (pd->session == NULL) {
@@ -238,10 +238,14 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
238 } 238 }
239 239
240 /* Show the tunnel or session context */ 240 /* Show the tunnel or session context */
241 if (pd->session == NULL) 241 if (!pd->session) {
242 l2tp_dfs_seq_tunnel_show(m, pd->tunnel); 242 l2tp_dfs_seq_tunnel_show(m, pd->tunnel);
243 else 243 } else {
244 l2tp_dfs_seq_session_show(m, pd->session); 244 l2tp_dfs_seq_session_show(m, pd->session);
245 if (pd->session->deref)
246 pd->session->deref(pd->session);
247 l2tp_session_dec_refcount(pd->session);
248 }
245 249
246out: 250out:
247 return 0; 251 return 0;
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index e2c6ae024565..6fd41d7afe1e 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -106,8 +106,8 @@ static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev)
106 return NETDEV_TX_OK; 106 return NETDEV_TX_OK;
107} 107}
108 108
109static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev, 109static void l2tp_eth_get_stats64(struct net_device *dev,
110 struct rtnl_link_stats64 *stats) 110 struct rtnl_link_stats64 *stats)
111{ 111{
112 struct l2tp_eth *priv = netdev_priv(dev); 112 struct l2tp_eth *priv = netdev_priv(dev);
113 113
@@ -117,10 +117,8 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
117 stats->rx_bytes = atomic_long_read(&priv->rx_bytes); 117 stats->rx_bytes = atomic_long_read(&priv->rx_bytes);
118 stats->rx_packets = atomic_long_read(&priv->rx_packets); 118 stats->rx_packets = atomic_long_read(&priv->rx_packets);
119 stats->rx_errors = atomic_long_read(&priv->rx_errors); 119 stats->rx_errors = atomic_long_read(&priv->rx_errors);
120 return stats;
121} 120}
122 121
123
124static const struct net_device_ops l2tp_eth_netdev_ops = { 122static const struct net_device_ops l2tp_eth_netdev_ops = {
125 .ndo_init = l2tp_eth_dev_init, 123 .ndo_init = l2tp_eth_dev_init,
126 .ndo_uninit = l2tp_eth_dev_uninit, 124 .ndo_uninit = l2tp_eth_dev_uninit,
@@ -223,12 +221,6 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
223 goto out; 221 goto out;
224 } 222 }
225 223
226 session = l2tp_session_find(net, tunnel, session_id);
227 if (session) {
228 rc = -EEXIST;
229 goto out;
230 }
231
232 if (cfg->ifname) { 224 if (cfg->ifname) {
233 dev = dev_get_by_name(net, cfg->ifname); 225 dev = dev_get_by_name(net, cfg->ifname);
234 if (dev) { 226 if (dev) {
@@ -242,8 +234,8 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
242 234
243 session = l2tp_session_create(sizeof(*spriv), tunnel, session_id, 235 session = l2tp_session_create(sizeof(*spriv), tunnel, session_id,
244 peer_session_id, cfg); 236 peer_session_id, cfg);
245 if (!session) { 237 if (IS_ERR(session)) {
246 rc = -ENOMEM; 238 rc = PTR_ERR(session);
247 goto out; 239 goto out;
248 } 240 }
249 241
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 3d73278b86ca..4d322c1b7233 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -11,6 +11,7 @@
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 13
14#include <asm/ioctls.h>
14#include <linux/icmp.h> 15#include <linux/icmp.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/skbuff.h> 17#include <linux/skbuff.h>
@@ -53,19 +54,26 @@ static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr,
53 struct sock *sk; 54 struct sock *sk;
54 55
55 sk_for_each_bound(sk, &l2tp_ip_bind_table) { 56 sk_for_each_bound(sk, &l2tp_ip_bind_table) {
56 struct inet_sock *inet = inet_sk(sk); 57 const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
57 struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); 58 const struct inet_sock *inet = inet_sk(sk);
58 59
59 if (l2tp == NULL) 60 if (!net_eq(sock_net(sk), net))
60 continue; 61 continue;
61 62
62 if ((l2tp->conn_id == tunnel_id) && 63 if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
63 net_eq(sock_net(sk), net) && 64 continue;
64 !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && 65
65 (!inet->inet_daddr || !raddr || inet->inet_daddr == raddr) && 66 if (inet->inet_rcv_saddr && laddr &&
66 (!sk->sk_bound_dev_if || !dif || 67 inet->inet_rcv_saddr != laddr)
67 sk->sk_bound_dev_if == dif)) 68 continue;
68 goto found; 69
70 if (inet->inet_daddr && raddr && inet->inet_daddr != raddr)
71 continue;
72
73 if (l2tp->conn_id != tunnel_id)
74 continue;
75
76 goto found;
69 } 77 }
70 78
71 sk = NULL; 79 sk = NULL;
@@ -135,19 +143,19 @@ static int l2tp_ip_recv(struct sk_buff *skb)
135 } 143 }
136 144
137 /* Ok, this is a data packet. Lookup the session. */ 145 /* Ok, this is a data packet. Lookup the session. */
138 session = l2tp_session_find(net, NULL, session_id); 146 session = l2tp_session_get(net, NULL, session_id, true);
139 if (session == NULL) 147 if (!session)
140 goto discard; 148 goto discard;
141 149
142 tunnel = session->tunnel; 150 tunnel = session->tunnel;
143 if (tunnel == NULL) 151 if (!tunnel)
144 goto discard; 152 goto discard_sess;
145 153
146 /* Trace packet contents, if enabled */ 154 /* Trace packet contents, if enabled */
147 if (tunnel->debug & L2TP_MSG_DATA) { 155 if (tunnel->debug & L2TP_MSG_DATA) {
148 length = min(32u, skb->len); 156 length = min(32u, skb->len);
149 if (!pskb_may_pull(skb, length)) 157 if (!pskb_may_pull(skb, length))
150 goto discard; 158 goto discard_sess;
151 159
152 /* Point to L2TP header */ 160 /* Point to L2TP header */
153 optr = ptr = skb->data; 161 optr = ptr = skb->data;
@@ -157,6 +165,7 @@ static int l2tp_ip_recv(struct sk_buff *skb)
157 } 165 }
158 166
159 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook); 167 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook);
168 l2tp_session_dec_refcount(session);
160 169
161 return 0; 170 return 0;
162 171
@@ -170,9 +179,10 @@ pass_up:
170 179
171 tunnel_id = ntohl(*(__be32 *) &skb->data[4]); 180 tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
172 tunnel = l2tp_tunnel_find(net, tunnel_id); 181 tunnel = l2tp_tunnel_find(net, tunnel_id);
173 if (tunnel != NULL) 182 if (tunnel) {
174 sk = tunnel->sock; 183 sk = tunnel->sock;
175 else { 184 sock_hold(sk);
185 } else {
176 struct iphdr *iph = (struct iphdr *) skb_network_header(skb); 186 struct iphdr *iph = (struct iphdr *) skb_network_header(skb);
177 187
178 read_lock_bh(&l2tp_ip_lock); 188 read_lock_bh(&l2tp_ip_lock);
@@ -194,6 +204,12 @@ pass_up:
194 204
195 return sk_receive_skb(sk, skb, 1); 205 return sk_receive_skb(sk, skb, 1);
196 206
207discard_sess:
208 if (session->deref)
209 session->deref(session);
210 l2tp_session_dec_refcount(session);
211 goto discard;
212
197discard_put: 213discard_put:
198 sock_put(sk); 214 sock_put(sk);
199 215
@@ -258,7 +274,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
258 if (!sock_flag(sk, SOCK_ZAPPED)) 274 if (!sock_flag(sk, SOCK_ZAPPED))
259 goto out; 275 goto out;
260 276
261 if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip)) 277 if (sk->sk_state != TCP_CLOSE)
262 goto out; 278 goto out;
263 279
264 chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr); 280 chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr);
@@ -380,7 +396,7 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb)
380drop: 396drop:
381 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); 397 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
382 kfree_skb(skb); 398 kfree_skb(skb);
383 return -1; 399 return 0;
384} 400}
385 401
386/* Userspace will call sendmsg() on the tunnel socket to send L2TP 402/* Userspace will call sendmsg() on the tunnel socket to send L2TP
@@ -553,6 +569,30 @@ out:
553 return err ? err : copied; 569 return err ? err : copied;
554} 570}
555 571
572int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg)
573{
574 struct sk_buff *skb;
575 int amount;
576
577 switch (cmd) {
578 case SIOCOUTQ:
579 amount = sk_wmem_alloc_get(sk);
580 break;
581 case SIOCINQ:
582 spin_lock_bh(&sk->sk_receive_queue.lock);
583 skb = skb_peek(&sk->sk_receive_queue);
584 amount = skb ? skb->len : 0;
585 spin_unlock_bh(&sk->sk_receive_queue.lock);
586 break;
587
588 default:
589 return -ENOIOCTLCMD;
590 }
591
592 return put_user(amount, (int __user *)arg);
593}
594EXPORT_SYMBOL(l2tp_ioctl);
595
556static struct proto l2tp_ip_prot = { 596static struct proto l2tp_ip_prot = {
557 .name = "L2TP/IP", 597 .name = "L2TP/IP",
558 .owner = THIS_MODULE, 598 .owner = THIS_MODULE,
@@ -561,7 +601,7 @@ static struct proto l2tp_ip_prot = {
561 .bind = l2tp_ip_bind, 601 .bind = l2tp_ip_bind,
562 .connect = l2tp_ip_connect, 602 .connect = l2tp_ip_connect,
563 .disconnect = l2tp_ip_disconnect, 603 .disconnect = l2tp_ip_disconnect,
564 .ioctl = udp_ioctl, 604 .ioctl = l2tp_ioctl,
565 .destroy = l2tp_ip_destroy_sock, 605 .destroy = l2tp_ip_destroy_sock,
566 .setsockopt = ip_setsockopt, 606 .setsockopt = ip_setsockopt,
567 .getsockopt = ip_getsockopt, 607 .getsockopt = ip_getsockopt,
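With .ioctl switched from udp_ioctl to the new l2tp_ioctl, SIOCINQ and SIOCOUTQ report the receive/send queue byte counts of the L2TP/IP socket itself. A small userspace sketch of what this enables (not part of the patch; assumes fd is an already-set-up L2TP/IP socket):

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/sockios.h>

	static void print_queue_depths(int fd)
	{
		int inq = 0, outq = 0;

		if (ioctl(fd, SIOCINQ, &inq) == 0 &&
		    ioctl(fd, SIOCOUTQ, &outq) == 0)
			printf("rx queued: %d bytes, tx pending: %d bytes\n",
			       inq, outq);
	}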
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 331ccf5a7bad..88b397c30d86 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -57,8 +57,8 @@ static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk)
57 return (struct l2tp_ip6_sock *)sk; 57 return (struct l2tp_ip6_sock *)sk;
58} 58}
59 59
60static struct sock *__l2tp_ip6_bind_lookup(struct net *net, 60static struct sock *__l2tp_ip6_bind_lookup(const struct net *net,
61 struct in6_addr *laddr, 61 const struct in6_addr *laddr,
62 const struct in6_addr *raddr, 62 const struct in6_addr *raddr,
63 int dif, u32 tunnel_id) 63 int dif, u32 tunnel_id)
64{ 64{
@@ -67,18 +67,26 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
67 sk_for_each_bound(sk, &l2tp_ip6_bind_table) { 67 sk_for_each_bound(sk, &l2tp_ip6_bind_table) {
68 const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk); 68 const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk);
69 const struct in6_addr *sk_raddr = &sk->sk_v6_daddr; 69 const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
70 struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); 70 const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
71 71
72 if (l2tp == NULL) 72 if (!net_eq(sock_net(sk), net))
73 continue; 73 continue;
74 74
75 if ((l2tp->conn_id == tunnel_id) && 75 if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
76 net_eq(sock_net(sk), net) && 76 continue;
77 (!sk_laddr || ipv6_addr_any(sk_laddr) || ipv6_addr_equal(sk_laddr, laddr)) && 77
78 (!raddr || ipv6_addr_any(sk_raddr) || ipv6_addr_equal(sk_raddr, raddr)) && 78 if (sk_laddr && !ipv6_addr_any(sk_laddr) &&
79 (!sk->sk_bound_dev_if || !dif || 79 !ipv6_addr_any(laddr) && !ipv6_addr_equal(sk_laddr, laddr))
80 sk->sk_bound_dev_if == dif)) 80 continue;
81 goto found; 81
82 if (!ipv6_addr_any(sk_raddr) && raddr &&
83 !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr))
84 continue;
85
86 if (l2tp->conn_id != tunnel_id)
87 continue;
88
89 goto found;
82 } 90 }
83 91
84 sk = NULL; 92 sk = NULL;
@@ -148,19 +156,19 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
148 } 156 }
149 157
150 /* Ok, this is a data packet. Lookup the session. */ 158 /* Ok, this is a data packet. Lookup the session. */
151 session = l2tp_session_find(net, NULL, session_id); 159 session = l2tp_session_get(net, NULL, session_id, true);
152 if (session == NULL) 160 if (!session)
153 goto discard; 161 goto discard;
154 162
155 tunnel = session->tunnel; 163 tunnel = session->tunnel;
156 if (tunnel == NULL) 164 if (!tunnel)
157 goto discard; 165 goto discard_sess;
158 166
159 /* Trace packet contents, if enabled */ 167 /* Trace packet contents, if enabled */
160 if (tunnel->debug & L2TP_MSG_DATA) { 168 if (tunnel->debug & L2TP_MSG_DATA) {
161 length = min(32u, skb->len); 169 length = min(32u, skb->len);
162 if (!pskb_may_pull(skb, length)) 170 if (!pskb_may_pull(skb, length))
163 goto discard; 171 goto discard_sess;
164 172
165 /* Point to L2TP header */ 173 /* Point to L2TP header */
166 optr = ptr = skb->data; 174 optr = ptr = skb->data;
@@ -171,6 +179,8 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
171 179
172 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, 180 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len,
173 tunnel->recv_payload_hook); 181 tunnel->recv_payload_hook);
182 l2tp_session_dec_refcount(session);
183
174 return 0; 184 return 0;
175 185
176pass_up: 186pass_up:
@@ -183,9 +193,10 @@ pass_up:
183 193
184 tunnel_id = ntohl(*(__be32 *) &skb->data[4]); 194 tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
185 tunnel = l2tp_tunnel_find(net, tunnel_id); 195 tunnel = l2tp_tunnel_find(net, tunnel_id);
186 if (tunnel != NULL) 196 if (tunnel) {
187 sk = tunnel->sock; 197 sk = tunnel->sock;
188 else { 198 sock_hold(sk);
199 } else {
189 struct ipv6hdr *iph = ipv6_hdr(skb); 200 struct ipv6hdr *iph = ipv6_hdr(skb);
190 201
191 read_lock_bh(&l2tp_ip6_lock); 202 read_lock_bh(&l2tp_ip6_lock);
@@ -207,6 +218,12 @@ pass_up:
207 218
208 return sk_receive_skb(sk, skb, 1); 219 return sk_receive_skb(sk, skb, 1);
209 220
221discard_sess:
222 if (session->deref)
223 session->deref(session);
224 l2tp_session_dec_refcount(session);
225 goto discard;
226
210discard_put: 227discard_put:
211 sock_put(sk); 228 sock_put(sk);
212 229
@@ -650,7 +667,8 @@ out:
650 return err < 0 ? err : len; 667 return err < 0 ? err : len;
651 668
652do_confirm: 669do_confirm:
653 dst_confirm(dst); 670 if (msg->msg_flags & MSG_PROBE)
671 dst_confirm_neigh(dst, &fl6.daddr);
654 if (!(msg->msg_flags & MSG_PROBE) || len) 672 if (!(msg->msg_flags & MSG_PROBE) || len)
655 goto back_from_confirm; 673 goto back_from_confirm;
656 err = 0; 674 err = 0;
@@ -722,7 +740,7 @@ static struct proto l2tp_ip6_prot = {
722 .bind = l2tp_ip6_bind, 740 .bind = l2tp_ip6_bind,
723 .connect = l2tp_ip6_connect, 741 .connect = l2tp_ip6_connect,
724 .disconnect = l2tp_ip6_disconnect, 742 .disconnect = l2tp_ip6_disconnect,
725 .ioctl = udp_ioctl, 743 .ioctl = l2tp_ioctl,
726 .destroy = l2tp_ip6_destroy_sock, 744 .destroy = l2tp_ip6_destroy_sock,
727 .setsockopt = ipv6_setsockopt, 745 .setsockopt = ipv6_setsockopt,
728 .getsockopt = ipv6_getsockopt, 746 .getsockopt = ipv6_getsockopt,
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 3620fba31786..7e3e669baac4 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -48,7 +48,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq,
48/* Accessed under genl lock */ 48/* Accessed under genl lock */
49static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX]; 49static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX];
50 50
51static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info) 51static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info,
52 bool do_ref)
52{ 53{
53 u32 tunnel_id; 54 u32 tunnel_id;
54 u32 session_id; 55 u32 session_id;
@@ -59,14 +60,15 @@ static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info)
59 60
60 if (info->attrs[L2TP_ATTR_IFNAME]) { 61 if (info->attrs[L2TP_ATTR_IFNAME]) {
61 ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); 62 ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
62 session = l2tp_session_find_by_ifname(net, ifname); 63 session = l2tp_session_get_by_ifname(net, ifname, do_ref);
63 } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) && 64 } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) &&
64 (info->attrs[L2TP_ATTR_CONN_ID])) { 65 (info->attrs[L2TP_ATTR_CONN_ID])) {
65 tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); 66 tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
66 session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); 67 session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]);
67 tunnel = l2tp_tunnel_find(net, tunnel_id); 68 tunnel = l2tp_tunnel_find(net, tunnel_id);
68 if (tunnel) 69 if (tunnel)
69 session = l2tp_session_find(net, tunnel, session_id); 70 session = l2tp_session_get(net, tunnel, session_id,
71 do_ref);
70 } 72 }
71 73
72 return session; 74 return session;
@@ -642,10 +644,12 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
642 session_id, peer_session_id, &cfg); 644 session_id, peer_session_id, &cfg);
643 645
644 if (ret >= 0) { 646 if (ret >= 0) {
645 session = l2tp_session_find(net, tunnel, session_id); 647 session = l2tp_session_get(net, tunnel, session_id, false);
646 if (session) 648 if (session) {
647 ret = l2tp_session_notify(&l2tp_nl_family, info, session, 649 ret = l2tp_session_notify(&l2tp_nl_family, info, session,
648 L2TP_CMD_SESSION_CREATE); 650 L2TP_CMD_SESSION_CREATE);
651 l2tp_session_dec_refcount(session);
652 }
649 } 653 }
650 654
651out: 655out:
@@ -658,7 +662,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf
658 struct l2tp_session *session; 662 struct l2tp_session *session;
659 u16 pw_type; 663 u16 pw_type;
660 664
661 session = l2tp_nl_session_find(info); 665 session = l2tp_nl_session_get(info, true);
662 if (session == NULL) { 666 if (session == NULL) {
663 ret = -ENODEV; 667 ret = -ENODEV;
664 goto out; 668 goto out;
@@ -672,6 +676,10 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf
672 if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete) 676 if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete)
673 ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session); 677 ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session);
674 678
679 if (session->deref)
680 session->deref(session);
681 l2tp_session_dec_refcount(session);
682
675out: 683out:
676 return ret; 684 return ret;
677} 685}
@@ -681,7 +689,7 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
681 int ret = 0; 689 int ret = 0;
682 struct l2tp_session *session; 690 struct l2tp_session *session;
683 691
684 session = l2tp_nl_session_find(info); 692 session = l2tp_nl_session_get(info, false);
685 if (session == NULL) { 693 if (session == NULL) {
686 ret = -ENODEV; 694 ret = -ENODEV;
687 goto out; 695 goto out;
@@ -716,6 +724,8 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
716 ret = l2tp_session_notify(&l2tp_nl_family, info, 724 ret = l2tp_session_notify(&l2tp_nl_family, info,
717 session, L2TP_CMD_SESSION_MODIFY); 725 session, L2TP_CMD_SESSION_MODIFY);
718 726
727 l2tp_session_dec_refcount(session);
728
719out: 729out:
720 return ret; 730 return ret;
721} 731}
@@ -811,29 +821,34 @@ static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info)
811 struct sk_buff *msg; 821 struct sk_buff *msg;
812 int ret; 822 int ret;
813 823
814 session = l2tp_nl_session_find(info); 824 session = l2tp_nl_session_get(info, false);
815 if (session == NULL) { 825 if (session == NULL) {
816 ret = -ENODEV; 826 ret = -ENODEV;
817 goto out; 827 goto err;
818 } 828 }
819 829
820 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 830 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
821 if (!msg) { 831 if (!msg) {
822 ret = -ENOMEM; 832 ret = -ENOMEM;
823 goto out; 833 goto err_ref;
824 } 834 }
825 835
826 ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, 836 ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq,
827 0, session, L2TP_CMD_SESSION_GET); 837 0, session, L2TP_CMD_SESSION_GET);
828 if (ret < 0) 838 if (ret < 0)
829 goto err_out; 839 goto err_ref_msg;
830 840
831 return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); 841 ret = genlmsg_unicast(genl_info_net(info), msg, info->snd_portid);
832 842
833err_out: 843 l2tp_session_dec_refcount(session);
834 nlmsg_free(msg);
835 844
836out: 845 return ret;
846
847err_ref_msg:
848 nlmsg_free(msg);
849err_ref:
850 l2tp_session_dec_refcount(session);
851err:
837 return ret; 852 return ret;
838} 853}
839 854
@@ -852,7 +867,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
852 goto out; 867 goto out;
853 } 868 }
854 869
855 session = l2tp_session_find_nth(tunnel, si); 870 session = l2tp_session_get_nth(tunnel, si, false);
856 if (session == NULL) { 871 if (session == NULL) {
857 ti++; 872 ti++;
858 tunnel = NULL; 873 tunnel = NULL;
@@ -862,8 +877,11 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
862 877
863 if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid, 878 if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid,
864 cb->nlh->nlmsg_seq, NLM_F_MULTI, 879 cb->nlh->nlmsg_seq, NLM_F_MULTI,
865 session, L2TP_CMD_SESSION_GET) < 0) 880 session, L2TP_CMD_SESSION_GET) < 0) {
881 l2tp_session_dec_refcount(session);
866 break; 882 break;
883 }
884 l2tp_session_dec_refcount(session);
867 885
868 si++; 886 si++;
869 } 887 }
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 36cc56fd0418..32ea0f3d868c 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -450,6 +450,10 @@ static void pppol2tp_session_close(struct l2tp_session *session)
450static void pppol2tp_session_destruct(struct sock *sk) 450static void pppol2tp_session_destruct(struct sock *sk)
451{ 451{
452 struct l2tp_session *session = sk->sk_user_data; 452 struct l2tp_session *session = sk->sk_user_data;
453
454 skb_queue_purge(&sk->sk_receive_queue);
455 skb_queue_purge(&sk->sk_write_queue);
456
453 if (session) { 457 if (session) {
454 sk->sk_user_data = NULL; 458 sk->sk_user_data = NULL;
455 BUG_ON(session->magic != L2TP_SESSION_MAGIC); 459 BUG_ON(session->magic != L2TP_SESSION_MAGIC);
@@ -488,9 +492,6 @@ static int pppol2tp_release(struct socket *sock)
488 l2tp_session_queue_purge(session); 492 l2tp_session_queue_purge(session);
489 sock_put(sk); 493 sock_put(sk);
490 } 494 }
491 skb_queue_purge(&sk->sk_receive_queue);
492 skb_queue_purge(&sk->sk_write_queue);
493
494 release_sock(sk); 495 release_sock(sk);
495 496
496 /* This will delete the session context via 497 /* This will delete the session context via
@@ -582,6 +583,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
582 int error = 0; 583 int error = 0;
583 u32 tunnel_id, peer_tunnel_id; 584 u32 tunnel_id, peer_tunnel_id;
584 u32 session_id, peer_session_id; 585 u32 session_id, peer_session_id;
586 bool drop_refcnt = false;
585 int ver = 2; 587 int ver = 2;
586 int fd; 588 int fd;
587 589
@@ -683,36 +685,36 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
683 if (tunnel->peer_tunnel_id == 0) 685 if (tunnel->peer_tunnel_id == 0)
684 tunnel->peer_tunnel_id = peer_tunnel_id; 686 tunnel->peer_tunnel_id = peer_tunnel_id;
685 687
686 /* Create session if it doesn't already exist. We handle the 688 session = l2tp_session_get(sock_net(sk), tunnel, session_id, false);
687 * case where a session was previously created by the netlink 689 if (session) {
688 * interface by checking that the session doesn't already have 690 drop_refcnt = true;
689 * a socket and its tunnel socket are what we expect. If any 691 ps = l2tp_session_priv(session);
690 * of those checks fail, return EEXIST to the caller. 692
691 */ 693 /* Using a pre-existing session is fine as long as it hasn't
692 session = l2tp_session_find(sock_net(sk), tunnel, session_id); 694 * been connected yet.
693 if (session == NULL) {
694 /* Default MTU must allow space for UDP/L2TP/PPP
695 * headers.
696 */ 695 */
697 cfg.mtu = cfg.mru = 1500 - PPPOL2TP_HEADER_OVERHEAD; 696 if (ps->sock) {
697 error = -EEXIST;
698 goto end;
699 }
698 700
699 /* Allocate and initialize a new session context. */ 701 /* consistency checks */
700 session = l2tp_session_create(sizeof(struct pppol2tp_session), 702 if (ps->tunnel_sock != tunnel->sock) {
701 tunnel, session_id, 703 error = -EEXIST;
702 peer_session_id, &cfg);
703 if (session == NULL) {
704 error = -ENOMEM;
705 goto end; 704 goto end;
706 } 705 }
707 } else { 706 } else {
708 ps = l2tp_session_priv(session); 707 /* Default MTU must allow space for UDP/L2TP/PPP headers */
709 error = -EEXIST; 708 cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
710 if (ps->sock != NULL) 709 cfg.mru = cfg.mtu;
711 goto end;
712 710
713 /* consistency checks */ 711 session = l2tp_session_create(sizeof(struct pppol2tp_session),
714 if (ps->tunnel_sock != tunnel->sock) 712 tunnel, session_id,
713 peer_session_id, &cfg);
714 if (IS_ERR(session)) {
715 error = PTR_ERR(session);
715 goto end; 716 goto end;
717 }
716 } 718 }
717 719
718 /* Associate session with its PPPoL2TP socket */ 720 /* Associate session with its PPPoL2TP socket */
@@ -777,6 +779,8 @@ out_no_ppp:
777 session->name); 779 session->name);
778 780
779end: 781end:
782 if (drop_refcnt)
783 l2tp_session_dec_refcount(session);
780 release_sock(sk); 784 release_sock(sk);
781 785
782 return error; 786 return error;
@@ -804,12 +808,6 @@ static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_i
804 if (tunnel->sock == NULL) 808 if (tunnel->sock == NULL)
805 goto out; 809 goto out;
806 810
807 /* Check that this session doesn't already exist */
808 error = -EEXIST;
809 session = l2tp_session_find(net, tunnel, session_id);
810 if (session != NULL)
811 goto out;
812
813 /* Default MTU values. */ 811 /* Default MTU values. */
814 if (cfg->mtu == 0) 812 if (cfg->mtu == 0)
815 cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; 813 cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
@@ -817,12 +815,13 @@ static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_i
817 cfg->mru = cfg->mtu; 815 cfg->mru = cfg->mtu;
818 816
819 /* Allocate and initialize a new session context. */ 817 /* Allocate and initialize a new session context. */
820 error = -ENOMEM;
821 session = l2tp_session_create(sizeof(struct pppol2tp_session), 818 session = l2tp_session_create(sizeof(struct pppol2tp_session),
822 tunnel, session_id, 819 tunnel, session_id,
823 peer_session_id, cfg); 820 peer_session_id, cfg);
824 if (session == NULL) 821 if (IS_ERR(session)) {
822 error = PTR_ERR(session);
825 goto out; 823 goto out;
824 }
826 825
827 ps = l2tp_session_priv(session); 826 ps = l2tp_session_priv(session);
828 ps->tunnel_sock = tunnel->sock; 827 ps->tunnel_sock = tunnel->sock;
@@ -1140,11 +1139,18 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
1140 if (stats.session_id != 0) { 1139 if (stats.session_id != 0) {
1141 /* resend to session ioctl handler */ 1140 /* resend to session ioctl handler */
1142 struct l2tp_session *session = 1141 struct l2tp_session *session =
1143 l2tp_session_find(sock_net(sk), tunnel, stats.session_id); 1142 l2tp_session_get(sock_net(sk), tunnel,
1144 if (session != NULL) 1143 stats.session_id, true);
1145 err = pppol2tp_session_ioctl(session, cmd, arg); 1144
1146 else 1145 if (session) {
1146 err = pppol2tp_session_ioctl(session, cmd,
1147 arg);
1148 if (session->deref)
1149 session->deref(session);
1150 l2tp_session_dec_refcount(session);
1151 } else {
1147 err = -EBADR; 1152 err = -EBADR;
1153 }
1148 break; 1154 break;
1149 } 1155 }
1150#ifdef CONFIG_XFRM 1156#ifdef CONFIG_XFRM
@@ -1377,8 +1383,6 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname,
1377 } else 1383 } else
1378 err = pppol2tp_session_setsockopt(sk, session, optname, val); 1384 err = pppol2tp_session_setsockopt(sk, session, optname, val);
1379 1385
1380 err = 0;
1381
1382end_put_sess: 1386end_put_sess:
1383 sock_put(sk); 1387 sock_put(sk);
1384end: 1388end:
@@ -1501,8 +1505,13 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname,
1501 1505
1502 err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val); 1506 err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val);
1503 sock_put(ps->tunnel_sock); 1507 sock_put(ps->tunnel_sock);
1504 } else 1508 if (err)
1509 goto end_put_sess;
1510 } else {
1505 err = pppol2tp_session_getsockopt(sk, session, optname, &val); 1511 err = pppol2tp_session_getsockopt(sk, session, optname, &val);
1512 if (err)
1513 goto end_put_sess;
1514 }
1506 1515
1507 err = -EFAULT; 1516 err = -EFAULT;
1508 if (put_user(len, optlen)) 1517 if (put_user(len, optlen))
@@ -1554,7 +1563,7 @@ static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
1554 1563
1555static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) 1564static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd)
1556{ 1565{
1557 pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx); 1566 pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx, true);
1558 pd->session_idx++; 1567 pd->session_idx++;
1559 1568
1560 if (pd->session == NULL) { 1569 if (pd->session == NULL) {
@@ -1681,10 +1690,14 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v)
1681 1690
1682 /* Show the tunnel or session context. 1691 /* Show the tunnel or session context.
1683 */ 1692 */
1684 if (pd->session == NULL) 1693 if (!pd->session) {
1685 pppol2tp_seq_tunnel_show(m, pd->tunnel); 1694 pppol2tp_seq_tunnel_show(m, pd->tunnel);
1686 else 1695 } else {
1687 pppol2tp_seq_session_show(m, pd->session); 1696 pppol2tp_seq_session_show(m, pd->session);
1697 if (pd->session->deref)
1698 pd->session->deref(pd->session);
1699 l2tp_session_dec_refcount(pd->session);
1700 }
1688 1701
1689out: 1702out:
1690 return 0; 1703 return 0;
@@ -1843,4 +1856,4 @@ MODULE_DESCRIPTION("PPP over L2TP over UDP");
1843MODULE_LICENSE("GPL"); 1856MODULE_LICENSE("GPL");
1844MODULE_VERSION(PPPOL2TP_DRV_VERSION); 1857MODULE_VERSION(PPPOL2TP_DRV_VERSION);
1845MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OL2TP); 1858MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OL2TP);
1846MODULE_ALIAS_L2TP_PWTYPE(11); 1859MODULE_ALIAS_L2TP_PWTYPE(7);
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5e9296382420..cb4fff785cbf 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -26,6 +26,8 @@
26#include <linux/rtnetlink.h> 26#include <linux/rtnetlink.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sched/signal.h>
30
29#include <net/llc.h> 31#include <net/llc.h>
30#include <net/llc_sap.h> 32#include <net/llc_sap.h>
31#include <net/llc_pdu.h> 33#include <net/llc_pdu.h>
@@ -639,11 +641,13 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb)
639 * @sock: Socket which connections arrive on. 641 * @sock: Socket which connections arrive on.
640 * @newsock: Socket to move incoming connection to. 642 * @newsock: Socket to move incoming connection to.
641 * @flags: User specified operational flags. 643 * @flags: User specified operational flags.
644 * @kern: If the socket is kernel internal
642 * 645 *
643 * Accept a new incoming connection. 646 * Accept a new incoming connection.
644 * Returns 0 upon success, negative otherwise. 647 * Returns 0 upon success, negative otherwise.
645 */ 648 */
646static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags) 649static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags,
650 bool kern)
647{ 651{
648 struct sock *sk = sock->sk, *newsk; 652 struct sock *sk = sock->sk, *newsk;
649 struct llc_sock *llc, *newllc; 653 struct llc_sock *llc, *newllc;
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 3e821daf9dd4..8bc5a1bd2d45 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -821,7 +821,10 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
821 * another trick required to cope with how the PROCOM state 821 * another trick required to cope with how the PROCOM state
822 * machine works. -acme 822 * machine works. -acme
823 */ 823 */
824 skb_orphan(skb);
825 sock_hold(sk);
824 skb->sk = sk; 826 skb->sk = sk;
827 skb->destructor = sock_efree;
825 } 828 }
826 if (!sock_owned_by_user(sk)) 829 if (!sock_owned_by_user(sk))
827 llc_conn_rcv(sk, skb); 830 llc_conn_rcv(sk, skb);
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index d0e1e804ebd7..5404d0d195cc 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -290,7 +290,10 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb,
290 290
291 ev->type = LLC_SAP_EV_TYPE_PDU; 291 ev->type = LLC_SAP_EV_TYPE_PDU;
292 ev->reason = 0; 292 ev->reason = 0;
293 skb_orphan(skb);
294 sock_hold(sk);
293 skb->sk = sk; 295 skb->sk = sk;
296 skb->destructor = sock_efree;
294 llc_sap_state_process(sap, skb); 297 llc_sap_state_process(sap, skb);
295} 298}
296 299
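In both llc_conn_handler() and llc_sap_rcv() the skb now owns a reference on the socket it is attached to, so the socket cannot be freed while the skb sits on a queue. The pattern as applied in the hunks above (sock_efree() drops the reference taken by sock_hold() when the skb is freed):

	skb_orphan(skb);		/* detach from any previous owner */
	sock_hold(sk);			/* reference released by sock_efree() */
	skb->sk = sk;
	skb->destructor = sock_efree;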
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 3891cbd2adea..76e30f4797fb 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -6,6 +6,7 @@ config MAC80211
6 select CRYPTO_AES 6 select CRYPTO_AES
7 select CRYPTO_CCM 7 select CRYPTO_CCM
8 select CRYPTO_GCM 8 select CRYPTO_GCM
9 select CRYPTO_CMAC
9 select CRC32 10 select CRC32
10 ---help--- 11 ---help---
11 This option enables the hardware independent IEEE 802.11 12 This option enables the hardware independent IEEE 802.11
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index d0bd5fff5f0a..2fb65588490c 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -22,126 +22,50 @@
22#define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */ 22#define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */
23#define AAD_LEN 20 23#define AAD_LEN 20
24 24
25static const u8 zero[CMAC_TLEN_256];
25 26
26void gf_mulx(u8 *pad) 27void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
27{
28 int i, carry;
29
30 carry = pad[0] & 0x80;
31 for (i = 0; i < AES_BLOCK_SIZE - 1; i++)
32 pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7);
33 pad[AES_BLOCK_SIZE - 1] <<= 1;
34 if (carry)
35 pad[AES_BLOCK_SIZE - 1] ^= 0x87;
36}
37
38void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
39 const u8 *addr[], const size_t *len, u8 *mac,
40 size_t mac_len)
41{
42 u8 cbc[AES_BLOCK_SIZE], pad[AES_BLOCK_SIZE];
43 const u8 *pos, *end;
44 size_t i, e, left, total_len;
45
46 memset(cbc, 0, AES_BLOCK_SIZE);
47
48 total_len = 0;
49 for (e = 0; e < num_elem; e++)
50 total_len += len[e];
51 left = total_len;
52
53 e = 0;
54 pos = addr[0];
55 end = pos + len[0];
56
57 while (left >= AES_BLOCK_SIZE) {
58 for (i = 0; i < AES_BLOCK_SIZE; i++) {
59 cbc[i] ^= *pos++;
60 if (pos >= end) {
61 e++;
62 pos = addr[e];
63 end = pos + len[e];
64 }
65 }
66 if (left > AES_BLOCK_SIZE)
67 crypto_cipher_encrypt_one(tfm, cbc, cbc);
68 left -= AES_BLOCK_SIZE;
69 }
70
71 memset(pad, 0, AES_BLOCK_SIZE);
72 crypto_cipher_encrypt_one(tfm, pad, pad);
73 gf_mulx(pad);
74
75 if (left || total_len == 0) {
76 for (i = 0; i < left; i++) {
77 cbc[i] ^= *pos++;
78 if (pos >= end) {
79 e++;
80 pos = addr[e];
81 end = pos + len[e];
82 }
83 }
84 cbc[left] ^= 0x80;
85 gf_mulx(pad);
86 }
87
88 for (i = 0; i < AES_BLOCK_SIZE; i++)
89 pad[i] ^= cbc[i];
90 crypto_cipher_encrypt_one(tfm, pad, pad);
91 memcpy(mac, pad, mac_len);
92}
93
94
95void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
96 const u8 *data, size_t data_len, u8 *mic) 28 const u8 *data, size_t data_len, u8 *mic)
97{ 29{
98 const u8 *addr[3]; 30 SHASH_DESC_ON_STACK(desc, tfm);
99 size_t len[3]; 31 u8 out[AES_BLOCK_SIZE];
100 u8 zero[CMAC_TLEN];
101 32
102 memset(zero, 0, CMAC_TLEN); 33 desc->tfm = tfm;
103 addr[0] = aad;
104 len[0] = AAD_LEN;
105 addr[1] = data;
106 len[1] = data_len - CMAC_TLEN;
107 addr[2] = zero;
108 len[2] = CMAC_TLEN;
109 34
110 aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN); 35 crypto_shash_init(desc);
36 crypto_shash_update(desc, aad, AAD_LEN);
37 crypto_shash_update(desc, data, data_len - CMAC_TLEN);
38 crypto_shash_finup(desc, zero, CMAC_TLEN, out);
39
40 memcpy(mic, out, CMAC_TLEN);
111} 41}
112 42
113void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad, 43void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
114 const u8 *data, size_t data_len, u8 *mic) 44 const u8 *data, size_t data_len, u8 *mic)
115{ 45{
116 const u8 *addr[3]; 46 SHASH_DESC_ON_STACK(desc, tfm);
117 size_t len[3];
118 u8 zero[CMAC_TLEN_256];
119 47
120 memset(zero, 0, CMAC_TLEN_256); 48 desc->tfm = tfm;
121 addr[0] = aad;
122 len[0] = AAD_LEN;
123 addr[1] = data;
124 len[1] = data_len - CMAC_TLEN_256;
125 addr[2] = zero;
126 len[2] = CMAC_TLEN_256;
127 49
128 aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN_256); 50 crypto_shash_init(desc);
51 crypto_shash_update(desc, aad, AAD_LEN);
52 crypto_shash_update(desc, data, data_len - CMAC_TLEN_256);
53 crypto_shash_finup(desc, zero, CMAC_TLEN_256, mic);
129} 54}
130 55
131struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[], 56struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
132 size_t key_len) 57 size_t key_len)
133{ 58{
134 struct crypto_cipher *tfm; 59 struct crypto_shash *tfm;
135 60
136 tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); 61 tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
137 if (!IS_ERR(tfm)) 62 if (!IS_ERR(tfm))
138 crypto_cipher_setkey(tfm, key, key_len); 63 crypto_shash_setkey(tfm, key, key_len);
139 64
140 return tfm; 65 return tfm;
141} 66}
142 67
143 68void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm)
144void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm)
145{ 69{
146 crypto_free_cipher(tfm); 70 crypto_free_shash(tfm);
147} 71}
diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h
index c827e1d5de8b..fef531f42003 100644
--- a/net/mac80211/aes_cmac.h
+++ b/net/mac80211/aes_cmac.h
@@ -10,17 +10,14 @@
10#define AES_CMAC_H 10#define AES_CMAC_H
11 11
12#include <linux/crypto.h> 12#include <linux/crypto.h>
13#include <crypto/hash.h>
13 14
14void gf_mulx(u8 *pad); 15struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
15void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem, 16 size_t key_len);
16 const u8 *addr[], const size_t *len, u8 *mac, 17void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
17 size_t mac_len);
18struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[],
19 size_t key_len);
20void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
21 const u8 *data, size_t data_len, u8 *mic); 18 const u8 *data, size_t data_len, u8 *mic);
22void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad, 19void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
23 const u8 *data, size_t data_len, u8 *mic); 20 const u8 *data, size_t data_len, u8 *mic);
24void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm); 21void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm);
25 22
26#endif /* AES_CMAC_H */ 23#endif /* AES_CMAC_H */
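The CMAC code now drives the crypto API's "cmac(aes)" shash instead of open-coding CMAC on top of a bare AES cipher. A kernel-side sketch of that usage, built only from the calls visible in the hunks above (error handling trimmed; cmac_aes_digest() is an illustrative helper, not part of the patch):

	#include <crypto/hash.h>

	static int cmac_aes_digest(const u8 *key, size_t key_len,
				   const u8 *data, size_t data_len, u8 *mac)
	{
		struct crypto_shash *tfm;
		int ret;

		tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		ret = crypto_shash_setkey(tfm, key, key_len);
		if (!ret) {
			SHASH_DESC_ON_STACK(desc, tfm);

			desc->tfm = tfm;
			ret = crypto_shash_digest(desc, data, data_len, mac);
		}

		crypto_free_shash(tfm);
		return ret;
	}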
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 3b5fd4188f2a..4456559cb056 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -85,7 +85,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
85 ht_dbg(sta->sdata, 85 ht_dbg(sta->sdata,
86 "Rx BA session stop requested for %pM tid %u %s reason: %d\n", 86 "Rx BA session stop requested for %pM tid %u %s reason: %d\n",
87 sta->sta.addr, tid, 87 sta->sta.addr, tid,
88 initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator", 88 initiator == WLAN_BACK_RECIPIENT ? "recipient" : "initiator",
89 (int)reason); 89 (int)reason);
90 90
91 if (drv_ampdu_action(local, sta->sdata, &params)) 91 if (drv_ampdu_action(local, sta->sdata, &params))
@@ -398,6 +398,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
398 tid_agg_rx->timeout = timeout; 398 tid_agg_rx->timeout = timeout;
399 tid_agg_rx->stored_mpdu_num = 0; 399 tid_agg_rx->stored_mpdu_num = 0;
400 tid_agg_rx->auto_seq = auto_seq; 400 tid_agg_rx->auto_seq = auto_seq;
401 tid_agg_rx->started = false;
401 tid_agg_rx->reorder_buf_filtered = 0; 402 tid_agg_rx->reorder_buf_filtered = 0;
402 status = WLAN_STATUS_SUCCESS; 403 status = WLAN_STATUS_SUCCESS;
403 404
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index e91e503bf992..ac879bb17870 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -208,8 +208,8 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy,
208 if (changes & CFG80211_NAN_CONF_CHANGED_PREF) 208 if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
209 new_conf.master_pref = conf->master_pref; 209 new_conf.master_pref = conf->master_pref;
210 210
211 if (changes & CFG80211_NAN_CONF_CHANGED_DUAL) 211 if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
212 new_conf.dual = conf->dual; 212 new_conf.bands = conf->bands;
213 213
214 ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); 214 ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes);
215 if (!ret) 215 if (!ret)
@@ -3563,6 +3563,17 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif,
3563} 3563}
3564EXPORT_SYMBOL(ieee80211_nan_func_match); 3564EXPORT_SYMBOL(ieee80211_nan_func_match);
3565 3565
3566static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy,
3567 struct net_device *dev,
3568 const bool enabled)
3569{
3570 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
3571
3572 sdata->u.ap.multicast_to_unicast = enabled;
3573
3574 return 0;
3575}
3576
3566const struct cfg80211_ops mac80211_config_ops = { 3577const struct cfg80211_ops mac80211_config_ops = {
3567 .add_virtual_intf = ieee80211_add_iface, 3578 .add_virtual_intf = ieee80211_add_iface,
3568 .del_virtual_intf = ieee80211_del_iface, 3579 .del_virtual_intf = ieee80211_del_iface,
@@ -3653,4 +3664,5 @@ const struct cfg80211_ops mac80211_config_ops = {
3653 .nan_change_conf = ieee80211_nan_change_conf, 3664 .nan_change_conf = ieee80211_nan_change_conf,
3654 .add_nan_func = ieee80211_add_nan_func, 3665 .add_nan_func = ieee80211_add_nan_func,
3655 .del_nan_func = ieee80211_del_nan_func, 3666 .del_nan_func = ieee80211_del_nan_func,
3667 .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
3656}; 3668};
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index a0d901d8992e..89178b46b32f 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -1267,7 +1267,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
1267 struct ieee80211_sub_if_data *sdata, *sdata_tmp; 1267 struct ieee80211_sub_if_data *sdata, *sdata_tmp;
1268 struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx; 1268 struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx;
1269 struct ieee80211_chanctx *new_ctx = NULL; 1269 struct ieee80211_chanctx *new_ctx = NULL;
1270 int i, err, n_assigned, n_reserved, n_ready; 1270 int err, n_assigned, n_reserved, n_ready;
1271 int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0; 1271 int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0;
1272 1272
1273 lockdep_assert_held(&local->mtx); 1273 lockdep_assert_held(&local->mtx);
@@ -1388,8 +1388,6 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
1388 * Update all structures, values and pointers to point to new channel 1388 * Update all structures, values and pointers to point to new channel
1389 * context(s). 1389 * context(s).
1390 */ 1390 */
1391
1392 i = 0;
1393 list_for_each_entry(ctx, &local->chanctx_list, list) { 1391 list_for_each_entry(ctx, &local->chanctx_list, list) {
1394 if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) 1392 if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
1395 continue; 1393 continue;
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index e02ba42ca827..5fae001f286c 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -243,6 +243,38 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf,
243 return rv; 243 return rv;
244} 244}
245 245
246static ssize_t misc_read(struct file *file, char __user *user_buf,
247 size_t count, loff_t *ppos)
248{
249 struct ieee80211_local *local = file->private_data;
250 /* Max len of each line is 16 characters, plus 9 for 'pending:\n' */
251 size_t bufsz = IEEE80211_MAX_QUEUES * 16 + 9;
252 char *buf;
253 char *pos, *end;
254 ssize_t rv;
255 int i;
256 int ln;
257
258 buf = kzalloc(bufsz, GFP_KERNEL);
259 if (!buf)
260 return -ENOMEM;
261
262 pos = buf;
263 end = buf + bufsz - 1;
264
265 pos += scnprintf(pos, end - pos, "pending:\n");
266
267 for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
268 ln = skb_queue_len(&local->pending[i]);
269 pos += scnprintf(pos, end - pos, "[%i] %d\n",
270 i, ln);
271 }
272
273 rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
274 kfree(buf);
275 return rv;
276}
277
246static ssize_t queues_read(struct file *file, char __user *user_buf, 278static ssize_t queues_read(struct file *file, char __user *user_buf,
247 size_t count, loff_t *ppos) 279 size_t count, loff_t *ppos)
248{ 280{
@@ -263,6 +295,7 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
263 295
264DEBUGFS_READONLY_FILE_OPS(hwflags); 296DEBUGFS_READONLY_FILE_OPS(hwflags);
265DEBUGFS_READONLY_FILE_OPS(queues); 297DEBUGFS_READONLY_FILE_OPS(queues);
298DEBUGFS_READONLY_FILE_OPS(misc);
266 299
267/* statistics stuff */ 300/* statistics stuff */
268 301
@@ -330,7 +363,9 @@ void debugfs_hw_add(struct ieee80211_local *local)
330 363
331 DEBUGFS_ADD(total_ps_buffered); 364 DEBUGFS_ADD(total_ps_buffered);
332 DEBUGFS_ADD(wep_iv); 365 DEBUGFS_ADD(wep_iv);
366 DEBUGFS_ADD(rate_ctrl_alg);
333 DEBUGFS_ADD(queues); 367 DEBUGFS_ADD(queues);
368 DEBUGFS_ADD(misc);
334#ifdef CONFIG_PM 369#ifdef CONFIG_PM
335 DEBUGFS_ADD_MODE(reset, 0200); 370 DEBUGFS_ADD_MODE(reset, 0200);
336#endif 371#endif
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 1a05f85cb1f0..8f5fff8b2040 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -519,6 +519,8 @@ static ssize_t ieee80211_if_fmt_aqm(
519} 519}
520IEEE80211_IF_FILE_R(aqm); 520IEEE80211_IF_FILE_R(aqm);
521 521
522IEEE80211_IF_FILE(multicast_to_unicast, u.ap.multicast_to_unicast, HEX);
523
522/* IBSS attributes */ 524/* IBSS attributes */
523static ssize_t ieee80211_if_fmt_tsf( 525static ssize_t ieee80211_if_fmt_tsf(
524 const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) 526 const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -683,6 +685,7 @@ static void add_ap_files(struct ieee80211_sub_if_data *sdata)
683 DEBUGFS_ADD(dtim_count); 685 DEBUGFS_ADD(dtim_count);
684 DEBUGFS_ADD(num_buffered_multicast); 686 DEBUGFS_ADD(num_buffered_multicast);
685 DEBUGFS_ADD_MODE(tkip_mic_test, 0200); 687 DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
688 DEBUGFS_ADD_MODE(multicast_to_unicast, 0600);
686} 689}
687 690
688static void add_vlan_files(struct ieee80211_sub_if_data *sdata) 691static void add_vlan_files(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index f6003b8c2c33..42601820db20 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -522,6 +522,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
522 return; 522 return;
523 523
524 DEBUGFS_ADD(flags); 524 DEBUGFS_ADD(flags);
525 DEBUGFS_ADD(aid);
525 DEBUGFS_ADD(num_ps_buf_frames); 526 DEBUGFS_ADD(num_ps_buf_frames);
526 DEBUGFS_ADD(last_seq_ctrl); 527 DEBUGFS_ADD(last_seq_ctrl);
527 DEBUGFS_ADD(agg_status); 528 DEBUGFS_ADD(agg_status);
diff --git a/net/mac80211/fils_aead.c b/net/mac80211/fils_aead.c
index ecfdd97758a3..3cfb1e2ab7ac 100644
--- a/net/mac80211/fils_aead.c
+++ b/net/mac80211/fils_aead.c
@@ -9,66 +9,58 @@
9 9
10#include <crypto/aes.h> 10#include <crypto/aes.h>
11#include <crypto/algapi.h> 11#include <crypto/algapi.h>
12#include <crypto/hash.h>
12#include <crypto/skcipher.h> 13#include <crypto/skcipher.h>
13 14
14#include "ieee80211_i.h" 15#include "ieee80211_i.h"
15#include "aes_cmac.h" 16#include "aes_cmac.h"
16#include "fils_aead.h" 17#include "fils_aead.h"
17 18
18static int aes_s2v(struct crypto_cipher *tfm, 19static void gf_mulx(u8 *pad)
20{
21 u64 a = get_unaligned_be64(pad);
22 u64 b = get_unaligned_be64(pad + 8);
23
24 put_unaligned_be64((a << 1) | (b >> 63), pad);
25 put_unaligned_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0), pad + 8);
26}
27
28static int aes_s2v(struct crypto_shash *tfm,
19 size_t num_elem, const u8 *addr[], size_t len[], u8 *v) 29 size_t num_elem, const u8 *addr[], size_t len[], u8 *v)
20{ 30{
21 u8 d[AES_BLOCK_SIZE], tmp[AES_BLOCK_SIZE]; 31 u8 d[AES_BLOCK_SIZE], tmp[AES_BLOCK_SIZE] = {};
32 SHASH_DESC_ON_STACK(desc, tfm);
22 size_t i; 33 size_t i;
23 const u8 *data[2]; 34
24 size_t data_len[2], data_elems; 35 desc->tfm = tfm;
25 36
26 /* D = AES-CMAC(K, <zero>) */ 37 /* D = AES-CMAC(K, <zero>) */
27 memset(tmp, 0, AES_BLOCK_SIZE); 38 crypto_shash_digest(desc, tmp, AES_BLOCK_SIZE, d);
28 data[0] = tmp;
29 data_len[0] = AES_BLOCK_SIZE;
30 aes_cmac_vector(tfm, 1, data, data_len, d, AES_BLOCK_SIZE);
31 39
32 for (i = 0; i < num_elem - 1; i++) { 40 for (i = 0; i < num_elem - 1; i++) {
33 /* D = dbl(D) xor AES_CMAC(K, Si) */ 41 /* D = dbl(D) xor AES_CMAC(K, Si) */
34 gf_mulx(d); /* dbl */ 42 gf_mulx(d); /* dbl */
35 aes_cmac_vector(tfm, 1, &addr[i], &len[i], tmp, 43 crypto_shash_digest(desc, addr[i], len[i], tmp);
36 AES_BLOCK_SIZE);
37 crypto_xor(d, tmp, AES_BLOCK_SIZE); 44 crypto_xor(d, tmp, AES_BLOCK_SIZE);
38 } 45 }
39 46
47 crypto_shash_init(desc);
48
40 if (len[i] >= AES_BLOCK_SIZE) { 49 if (len[i] >= AES_BLOCK_SIZE) {
41 /* len(Sn) >= 128 */ 50 /* len(Sn) >= 128 */
42 size_t j;
43 const u8 *pos;
44
45 /* T = Sn xorend D */ 51 /* T = Sn xorend D */
46 52 crypto_shash_update(desc, addr[i], len[i] - AES_BLOCK_SIZE);
47 /* Use a temporary buffer to perform xorend on Sn (addr[i]) to 53 crypto_xor(d, addr[i] + len[i] - AES_BLOCK_SIZE,
48 * avoid modifying the const input argument. 54 AES_BLOCK_SIZE);
49 */
50 data[0] = addr[i];
51 data_len[0] = len[i] - AES_BLOCK_SIZE;
52 pos = addr[i] + data_len[0];
53 for (j = 0; j < AES_BLOCK_SIZE; j++)
54 tmp[j] = pos[j] ^ d[j];
55 data[1] = tmp;
56 data_len[1] = AES_BLOCK_SIZE;
57 data_elems = 2;
58 } else { 55 } else {
59 /* len(Sn) < 128 */ 56 /* len(Sn) < 128 */
60 /* T = dbl(D) xor pad(Sn) */ 57 /* T = dbl(D) xor pad(Sn) */
61 gf_mulx(d); /* dbl */ 58 gf_mulx(d); /* dbl */
62 memset(tmp, 0, AES_BLOCK_SIZE); 59 crypto_xor(d, addr[i], len[i]);
63 memcpy(tmp, addr[i], len[i]); 60 d[len[i]] ^= 0x80;
64 tmp[len[i]] = 0x80;
65 crypto_xor(d, tmp, AES_BLOCK_SIZE);
66 data[0] = d;
67 data_len[0] = sizeof(d);
68 data_elems = 1;
69 } 61 }
70 /* V = AES-CMAC(K, T) */ 62 /* V = AES-CMAC(K, T) */
71 aes_cmac_vector(tfm, data_elems, data, data_len, v, AES_BLOCK_SIZE); 63 crypto_shash_finup(desc, d, AES_BLOCK_SIZE, v);
72 64
73 return 0; 65 return 0;
74} 66}
@@ -80,7 +72,7 @@ static int aes_siv_encrypt(const u8 *key, size_t key_len,
80 size_t len[], u8 *out) 72 size_t len[], u8 *out)
81{ 73{
82 u8 v[AES_BLOCK_SIZE]; 74 u8 v[AES_BLOCK_SIZE];
83 struct crypto_cipher *tfm; 75 struct crypto_shash *tfm;
84 struct crypto_skcipher *tfm2; 76 struct crypto_skcipher *tfm2;
85 struct skcipher_request *req; 77 struct skcipher_request *req;
86 int res; 78 int res;
@@ -95,14 +87,14 @@ static int aes_siv_encrypt(const u8 *key, size_t key_len,
95 87
96 /* S2V */ 88 /* S2V */
97 89
98 tfm = crypto_alloc_cipher("aes", 0, 0); 90 tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
99 if (IS_ERR(tfm)) 91 if (IS_ERR(tfm))
100 return PTR_ERR(tfm); 92 return PTR_ERR(tfm);
101 /* K1 for S2V */ 93 /* K1 for S2V */
102 res = crypto_cipher_setkey(tfm, key, key_len); 94 res = crypto_shash_setkey(tfm, key, key_len);
103 if (!res) 95 if (!res)
104 res = aes_s2v(tfm, num_elem, addr, len, v); 96 res = aes_s2v(tfm, num_elem, addr, len, v);
105 crypto_free_cipher(tfm); 97 crypto_free_shash(tfm);
106 if (res) 98 if (res)
107 return res; 99 return res;
108 100
@@ -124,7 +116,7 @@ static int aes_siv_encrypt(const u8 *key, size_t key_len,
124 116
125 /* CTR */ 117 /* CTR */
126 118
127 tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, 0); 119 tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
128 if (IS_ERR(tfm2)) { 120 if (IS_ERR(tfm2)) {
129 kfree(tmp); 121 kfree(tmp);
130 return PTR_ERR(tfm2); 122 return PTR_ERR(tfm2);
@@ -157,7 +149,7 @@ static int aes_siv_decrypt(const u8 *key, size_t key_len,
157 size_t num_elem, const u8 *addr[], size_t len[], 149 size_t num_elem, const u8 *addr[], size_t len[],
158 u8 *out) 150 u8 *out)
159{ 151{
160 struct crypto_cipher *tfm; 152 struct crypto_shash *tfm;
161 struct crypto_skcipher *tfm2; 153 struct crypto_skcipher *tfm2;
162 struct skcipher_request *req; 154 struct skcipher_request *req;
163 struct scatterlist src[1], dst[1]; 155 struct scatterlist src[1], dst[1];
@@ -183,7 +175,7 @@ static int aes_siv_decrypt(const u8 *key, size_t key_len,
183 175
184 /* CTR */ 176 /* CTR */
185 177
186 tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, 0); 178 tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
187 if (IS_ERR(tfm2)) 179 if (IS_ERR(tfm2))
188 return PTR_ERR(tfm2); 180 return PTR_ERR(tfm2);
189 /* K2 for CTR */ 181 /* K2 for CTR */
@@ -210,14 +202,14 @@ static int aes_siv_decrypt(const u8 *key, size_t key_len,
210 202
211 /* S2V */ 203 /* S2V */
212 204
213 tfm = crypto_alloc_cipher("aes", 0, 0); 205 tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
214 if (IS_ERR(tfm)) 206 if (IS_ERR(tfm))
215 return PTR_ERR(tfm); 207 return PTR_ERR(tfm);
216 /* K1 for S2V */ 208 /* K1 for S2V */
217 res = crypto_cipher_setkey(tfm, key, key_len); 209 res = crypto_shash_setkey(tfm, key, key_len);
218 if (!res) 210 if (!res)
219 res = aes_s2v(tfm, num_elem, addr, len, check); 211 res = aes_s2v(tfm, num_elem, addr, len, check);
220 crypto_free_cipher(tfm); 212 crypto_free_shash(tfm);
221 if (res) 213 if (res)
222 return res; 214 return res;
223 if (memcmp(check, frame_iv, AES_BLOCK_SIZE) != 0) 215 if (memcmp(check, frame_iv, AES_BLOCK_SIZE) != 0)
@@ -272,7 +264,7 @@ int fils_encrypt_assoc_req(struct sk_buff *skb,
272 crypt_len = skb->data + skb->len - encr; 264 crypt_len = skb->data + skb->len - encr;
273 skb_put(skb, AES_BLOCK_SIZE); 265 skb_put(skb, AES_BLOCK_SIZE);
274 return aes_siv_encrypt(assoc_data->fils_kek, assoc_data->fils_kek_len, 266 return aes_siv_encrypt(assoc_data->fils_kek, assoc_data->fils_kek_len,
275 encr, crypt_len, 1, addr, len, encr); 267 encr, crypt_len, 5, addr, len, encr);
276} 268}
277 269
278int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata, 270int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index a31d30713d08..98999d3d5262 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -487,14 +487,14 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata,
487 struct beacon_data *presp, *old_presp; 487 struct beacon_data *presp, *old_presp;
488 struct cfg80211_bss *cbss; 488 struct cfg80211_bss *cbss;
489 const struct cfg80211_bss_ies *ies; 489 const struct cfg80211_bss_ies *ies;
490 u16 capability = 0; 490 u16 capability = WLAN_CAPABILITY_IBSS;
491 u64 tsf; 491 u64 tsf;
492 int ret = 0; 492 int ret = 0;
493 493
494 sdata_assert_lock(sdata); 494 sdata_assert_lock(sdata);
495 495
496 if (ifibss->privacy) 496 if (ifibss->privacy)
497 capability = WLAN_CAPABILITY_PRIVACY; 497 capability |= WLAN_CAPABILITY_PRIVACY;
498 498
499 cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan, 499 cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan,
500 ifibss->bssid, ifibss->ssid, 500 ifibss->bssid, ifibss->ssid,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index b2069fbd60f9..0e718437d080 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -297,6 +297,7 @@ struct ieee80211_if_ap {
297 driver_smps_mode; /* smps mode request */ 297 driver_smps_mode; /* smps mode request */
298 298
299 struct work_struct request_smps_work; 299 struct work_struct request_smps_work;
300 bool multicast_to_unicast;
300}; 301};
301 302
302struct ieee80211_if_wds { 303struct ieee80211_if_wds {
@@ -427,7 +428,7 @@ struct ieee80211_sta_tx_tspec {
427 bool downgraded; 428 bool downgraded;
428}; 429};
429 430
430DECLARE_EWMA(beacon_signal, 16, 4) 431DECLARE_EWMA(beacon_signal, 4, 4)
431 432
432struct ieee80211_if_managed { 433struct ieee80211_if_managed {
433 struct timer_list timer; 434 struct timer_list timer;
@@ -624,8 +625,8 @@ struct ieee80211_mesh_sync_ops {
624 struct ieee80211_rx_status *rx_status); 625 struct ieee80211_rx_status *rx_status);
625 626
626 /* should be called with beacon_data under RCU read lock */ 627 /* should be called with beacon_data under RCU read lock */
627 void (*adjust_tbtt)(struct ieee80211_sub_if_data *sdata, 628 void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata,
628 struct beacon_data *beacon); 629 struct beacon_data *beacon);
629 /* add other framework functions here */ 630 /* add other framework functions here */
630}; 631};
631 632
@@ -688,7 +689,6 @@ struct ieee80211_if_mesh {
688 const struct ieee80211_mesh_sync_ops *sync_ops; 689 const struct ieee80211_mesh_sync_ops *sync_ops;
689 s64 sync_offset_clockdrift_max; 690 s64 sync_offset_clockdrift_max;
690 spinlock_t sync_offset_lock; 691 spinlock_t sync_offset_lock;
691 bool adjusting_tbtt;
692 /* mesh power save */ 692 /* mesh power save */
693 enum nl80211_mesh_power_mode nonpeer_pm; 693 enum nl80211_mesh_power_mode nonpeer_pm;
694 int ps_peers_light_sleep; 694 int ps_peers_light_sleep;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index d37ae7dc114b..5bb0c5012819 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -718,7 +718,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
718 ieee80211_recalc_ps(local); 718 ieee80211_recalc_ps(local);
719 719
720 if (sdata->vif.type == NL80211_IFTYPE_MONITOR || 720 if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
721 sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { 721 sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
722 local->ops->wake_tx_queue) {
722 /* XXX: for AP_VLAN, actually track AP queues */ 723 /* XXX: for AP_VLAN, actually track AP queues */
723 netif_tx_start_all_queues(dev); 724 netif_tx_start_all_queues(dev);
724 } else if (dev) { 725 } else if (dev) {
@@ -1123,7 +1124,7 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev,
1123 return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); 1124 return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
1124} 1125}
1125 1126
1126static struct rtnl_link_stats64 * 1127static void
1127ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) 1128ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
1128{ 1129{
1129 int i; 1130 int i;
@@ -1148,8 +1149,6 @@ ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
1148 stats->rx_bytes += rx_bytes; 1149 stats->rx_bytes += rx_bytes;
1149 stats->tx_bytes += tx_bytes; 1150 stats->tx_bytes += tx_bytes;
1150 } 1151 }
1151
1152 return stats;
1153} 1152}
1154 1153
1155static const struct net_device_ops ieee80211_dataif_ops = { 1154static const struct net_device_ops ieee80211_dataif_ops = {
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index 4aa20cef0859..ebdb80b85dc3 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -93,7 +93,7 @@ struct ieee80211_key {
93 } ccmp; 93 } ccmp;
94 struct { 94 struct {
95 u8 rx_pn[IEEE80211_CMAC_PN_LEN]; 95 u8 rx_pn[IEEE80211_CMAC_PN_LEN];
96 struct crypto_cipher *tfm; 96 struct crypto_shash *tfm;
97 u32 replays; /* dot11RSNAStatsCMACReplays */ 97 u32 replays; /* dot11RSNAStatsCMACReplays */
98 u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ 98 u32 icverrors; /* dot11RSNAStatsCMACICVErrors */
99 } aes_cmac; 99 } aes_cmac;
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 42120d965263..6e7b6a07b7d5 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -279,10 +279,6 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
279 /* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */ 279 /* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */
280 *pos |= ifmsh->ps_peers_deep_sleep ? 280 *pos |= ifmsh->ps_peers_deep_sleep ?
281 IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00; 281 IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00;
282 *pos++ |= ifmsh->adjusting_tbtt ?
283 IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING : 0x00;
284 *pos++ = 0x00;
285
286 return 0; 282 return 0;
287} 283}
288 284
@@ -339,7 +335,7 @@ int mesh_add_vendor_ies(struct ieee80211_sub_if_data *sdata,
339 /* fast-forward to vendor IEs */ 335 /* fast-forward to vendor IEs */
340 offset = ieee80211_ie_split_vendor(ifmsh->ie, ifmsh->ie_len, 0); 336 offset = ieee80211_ie_split_vendor(ifmsh->ie, ifmsh->ie_len, 0);
341 337
342 if (offset) { 338 if (offset < ifmsh->ie_len) {
343 len = ifmsh->ie_len - offset; 339 len = ifmsh->ie_len - offset;
344 data = ifmsh->ie + offset; 340 data = ifmsh->ie + offset;
345 if (skb_tailroom(skb) < len) 341 if (skb_tailroom(skb) < len)
@@ -685,7 +681,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
685 2 + /* NULL SSID */ 681 2 + /* NULL SSID */
686 /* Channel Switch Announcement */ 682 /* Channel Switch Announcement */
687 2 + sizeof(struct ieee80211_channel_sw_ie) + 683 2 + sizeof(struct ieee80211_channel_sw_ie) +
688 /* Mesh Channel Swith Parameters */ 684 /* Mesh Channel Switch Parameters */
689 2 + sizeof(struct ieee80211_mesh_chansw_params_ie) + 685 2 + sizeof(struct ieee80211_mesh_chansw_params_ie) +
690 2 + 8 + /* supported rates */ 686 2 + 8 + /* supported rates */
691 2 + 3; /* DS params */ 687 2 + 3; /* DS params */
@@ -850,7 +846,6 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
850 ifmsh->mesh_cc_id = 0; /* Disabled */ 846 ifmsh->mesh_cc_id = 0; /* Disabled */
851 /* register sync ops from extensible synchronization framework */ 847 /* register sync ops from extensible synchronization framework */
852 ifmsh->sync_ops = ieee80211_mesh_sync_ops_get(ifmsh->mesh_sp_id); 848 ifmsh->sync_ops = ieee80211_mesh_sync_ops_get(ifmsh->mesh_sp_id);
853 ifmsh->adjusting_tbtt = false;
854 ifmsh->sync_offset_clockdrift_max = 0; 849 ifmsh->sync_offset_clockdrift_max = 0;
855 set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags); 850 set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
856 ieee80211_mesh_root_setup(ifmsh); 851 ieee80211_mesh_root_setup(ifmsh);
@@ -1349,7 +1344,7 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
1349 ieee80211_mesh_rootpath(sdata); 1344 ieee80211_mesh_rootpath(sdata);
1350 1345
1351 if (test_and_clear_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags)) 1346 if (test_and_clear_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags))
1352 mesh_sync_adjust_tbtt(sdata); 1347 mesh_sync_adjust_tsf(sdata);
1353 1348
1354 if (test_and_clear_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags)) 1349 if (test_and_clear_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags))
1355 mesh_bss_info_changed(sdata); 1350 mesh_bss_info_changed(sdata);
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 26b9ccbe1fce..7e5f271e3c30 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -341,7 +341,7 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
341} 341}
342 342
343void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata); 343void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata);
344void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata); 344void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata);
345void ieee80211s_stop(void); 345void ieee80211s_stop(void);
346#else 346#else
347static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) 347static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 7fcdcf622655..953d71e784a9 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -9,6 +9,8 @@
9#include <linux/gfp.h> 9#include <linux/gfp.h>
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/rculist.h>
13
12#include "ieee80211_i.h" 14#include "ieee80211_i.h"
13#include "rate.h" 15#include "rate.h"
14#include "mesh.h" 16#include "mesh.h"
@@ -505,12 +507,14 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr,
505 507
506 /* Userspace handles station allocation */ 508 /* Userspace handles station allocation */
507 if (sdata->u.mesh.user_mpm || 509 if (sdata->u.mesh.user_mpm ||
508 sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) 510 sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) {
509 cfg80211_notify_new_peer_candidate(sdata->dev, addr, 511 if (mesh_peer_accepts_plinks(elems) &&
510 elems->ie_start, 512 mesh_plink_availables(sdata))
511 elems->total_len, 513 cfg80211_notify_new_peer_candidate(sdata->dev, addr,
512 GFP_KERNEL); 514 elems->ie_start,
513 else 515 elems->total_len,
516 GFP_KERNEL);
517 } else
514 sta = __mesh_sta_info_alloc(sdata, addr); 518 sta = __mesh_sta_info_alloc(sdata, addr);
515 519
516 return sta; 520 return sta;
diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c
index faca22cd02b5..a435f094a82e 100644
--- a/net/mac80211/mesh_sync.c
+++ b/net/mac80211/mesh_sync.c
@@ -12,7 +12,7 @@
12#include "mesh.h" 12#include "mesh.h"
13#include "driver-ops.h" 13#include "driver-ops.h"
14 14
15/* This is not in the standard. It represents a tolerable tbtt drift below 15/* This is not in the standard. It represents a tolerable tsf drift below
16 * which we do no TSF adjustment. 16 * which we do no TSF adjustment.
17 */ 17 */
18#define TOFFSET_MINIMUM_ADJUSTMENT 10 18#define TOFFSET_MINIMUM_ADJUSTMENT 10
@@ -46,7 +46,7 @@ static bool mesh_peer_tbtt_adjusting(struct ieee802_11_elems *ie)
46 IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0; 46 IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0;
47} 47}
48 48
49void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) 49void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata)
50{ 50{
51 struct ieee80211_local *local = sdata->local; 51 struct ieee80211_local *local = sdata->local;
52 struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; 52 struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
@@ -57,12 +57,12 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
57 57
58 spin_lock_bh(&ifmsh->sync_offset_lock); 58 spin_lock_bh(&ifmsh->sync_offset_lock);
59 if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) { 59 if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) {
60 msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting\n", 60 msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting\n",
61 (long long) ifmsh->sync_offset_clockdrift_max); 61 (long long) ifmsh->sync_offset_clockdrift_max);
62 tsfdelta = -ifmsh->sync_offset_clockdrift_max; 62 tsfdelta = -ifmsh->sync_offset_clockdrift_max;
63 ifmsh->sync_offset_clockdrift_max = 0; 63 ifmsh->sync_offset_clockdrift_max = 0;
64 } else { 64 } else {
65 msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting by %llu\n", 65 msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting by %llu\n",
66 (long long) ifmsh->sync_offset_clockdrift_max, 66 (long long) ifmsh->sync_offset_clockdrift_max,
67 (unsigned long long) beacon_int_fraction); 67 (unsigned long long) beacon_int_fraction);
68 tsfdelta = -beacon_int_fraction; 68 tsfdelta = -beacon_int_fraction;
@@ -123,7 +123,6 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
123 */ 123 */
124 124
125 if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) { 125 if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) {
126 clear_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN);
127 msync_dbg(sdata, "STA %pM : is adjusting TBTT\n", 126 msync_dbg(sdata, "STA %pM : is adjusting TBTT\n",
128 sta->sta.addr); 127 sta->sta.addr);
129 goto no_sync; 128 goto no_sync;
@@ -168,15 +167,13 @@ no_sync:
168 rcu_read_unlock(); 167 rcu_read_unlock();
169} 168}
170 169
171static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata, 170static void mesh_sync_offset_adjust_tsf(struct ieee80211_sub_if_data *sdata,
172 struct beacon_data *beacon) 171 struct beacon_data *beacon)
173{ 172{
174 struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; 173 struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
175 u8 cap;
176 174
177 WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET); 175 WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET);
178 WARN_ON(!rcu_read_lock_held()); 176 WARN_ON(!rcu_read_lock_held());
179 cap = beacon->meshconf->meshconf_cap;
180 177
181 spin_lock_bh(&ifmsh->sync_offset_lock); 178 spin_lock_bh(&ifmsh->sync_offset_lock);
182 179
@@ -187,24 +184,16 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata,
187 * the tsf adjustment to the mesh tasklet 184 * the tsf adjustment to the mesh tasklet
188 */ 185 */
189 msync_dbg(sdata, 186 msync_dbg(sdata,
190 "TBTT : kicking off TBTT adjustment with clockdrift_max=%lld\n", 187 "TSF : kicking off TSF adjustment with clockdrift_max=%lld\n",
191 ifmsh->sync_offset_clockdrift_max); 188 ifmsh->sync_offset_clockdrift_max);
192 set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags); 189 set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags);
193
194 ifmsh->adjusting_tbtt = true;
195 } else { 190 } else {
196 msync_dbg(sdata, 191 msync_dbg(sdata,
197 "TBTT : max clockdrift=%lld; too small to adjust\n", 192 "TSF : max clockdrift=%lld; too small to adjust\n",
198 (long long)ifmsh->sync_offset_clockdrift_max); 193 (long long)ifmsh->sync_offset_clockdrift_max);
199 ifmsh->sync_offset_clockdrift_max = 0; 194 ifmsh->sync_offset_clockdrift_max = 0;
200
201 ifmsh->adjusting_tbtt = false;
202 } 195 }
203 spin_unlock_bh(&ifmsh->sync_offset_lock); 196 spin_unlock_bh(&ifmsh->sync_offset_lock);
204
205 beacon->meshconf->meshconf_cap = ifmsh->adjusting_tbtt ?
206 IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING | cap :
207 ~IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING & cap;
208} 197}
209 198
210static const struct sync_method sync_methods[] = { 199static const struct sync_method sync_methods[] = {
@@ -212,7 +201,7 @@ static const struct sync_method sync_methods[] = {
212 .method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET, 201 .method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET,
213 .ops = { 202 .ops = {
214 .rx_bcn_presp = &mesh_sync_offset_rx_bcn_presp, 203 .rx_bcn_presp = &mesh_sync_offset_rx_bcn_presp,
215 .adjust_tbtt = &mesh_sync_offset_adjust_tbtt, 204 .adjust_tsf = &mesh_sync_offset_adjust_tsf,
216 } 205 }
217 }, 206 },
218}; 207};
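The neighbor-offset hunks above rename the hook from TBTT to TSF adjustment and keep the existing clamp on how much correction is applied per pass: a small measured clock drift is removed entirely, a larger one only by a bounded fraction of the beacon interval. A standalone illustration of that clamp, assuming a simplified bound (the real one is derived from the beacon interval in mesh_sync.c):

#include <stdio.h>
#include <stdint.h>

/* never adjust by more than 'bound' microseconds of TSF in one step */
static int64_t clamp_tsf_adjust(int64_t clockdrift_max, int64_t bound)
{
	if (clockdrift_max < bound)
		return -clockdrift_max;	/* small drift: remove it entirely */
	return -bound;			/* large drift: bounded correction */
}

int main(void)
{
	int64_t bound = 102400 / 8;	/* illustrative: 1/8 of a 100 TU beacon interval */

	printf("%lld\n", (long long)clamp_tsf_adjust(500, bound));	/* -500 */
	printf("%lld\n", (long long)clamp_tsf_adjust(50000, bound));	/* -12800 */
	return 0;
}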
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 098ce9b179ee..6e90301154d5 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1486,10 +1486,6 @@ void ieee80211_recalc_ps(struct ieee80211_local *local)
1486 1486
1487 if (count == 1 && ieee80211_powersave_allowed(found)) { 1487 if (count == 1 && ieee80211_powersave_allowed(found)) {
1488 u8 dtimper = found->u.mgd.dtim_period; 1488 u8 dtimper = found->u.mgd.dtim_period;
1489 s32 beaconint_us;
1490
1491 beaconint_us = ieee80211_tu_to_usec(
1492 found->vif.bss_conf.beacon_int);
1493 1489
1494 timeout = local->dynamic_ps_forced_timeout; 1490 timeout = local->dynamic_ps_forced_timeout;
1495 if (timeout < 0) 1491 if (timeout < 0)
@@ -3423,14 +3419,14 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
3423 ieee80211_cqm_rssi_notify( 3419 ieee80211_cqm_rssi_notify(
3424 &sdata->vif, 3420 &sdata->vif,
3425 NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, 3421 NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
3426 GFP_KERNEL); 3422 sig, GFP_KERNEL);
3427 } else if (sig > thold && 3423 } else if (sig > thold &&
3428 (last_event == 0 || sig > last_event + hyst)) { 3424 (last_event == 0 || sig > last_event + hyst)) {
3429 ifmgd->last_cqm_event_signal = sig; 3425 ifmgd->last_cqm_event_signal = sig;
3430 ieee80211_cqm_rssi_notify( 3426 ieee80211_cqm_rssi_notify(
3431 &sdata->vif, 3427 &sdata->vif,
3432 NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, 3428 NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH,
3433 GFP_KERNEL); 3429 sig, GFP_KERNEL);
3434 } 3430 }
3435 } 3431 }
3436 3432
@@ -5045,13 +5041,14 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
5045 5041
5046void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, 5042void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
5047 enum nl80211_cqm_rssi_threshold_event rssi_event, 5043 enum nl80211_cqm_rssi_threshold_event rssi_event,
5044 s32 rssi_level,
5048 gfp_t gfp) 5045 gfp_t gfp)
5049{ 5046{
5050 struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); 5047 struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
5051 5048
5052 trace_api_cqm_rssi_notify(sdata, rssi_event); 5049 trace_api_cqm_rssi_notify(sdata, rssi_event, rssi_level);
5053 5050
5054 cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp); 5051 cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, rssi_level, gfp);
5055} 5052}
5056EXPORT_SYMBOL(ieee80211_cqm_rssi_notify); 5053EXPORT_SYMBOL(ieee80211_cqm_rssi_notify);
5057 5054
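ieee80211_cqm_rssi_notify() and the cfg80211 call beneath it gain an rssi_level argument, so the current signal level travels with the threshold event instead of being queried separately. A sketch of a driver-side caller using the extended signature (the function name and value are illustrative):

static void example_report_low_rssi(struct ieee80211_vif *vif, s32 rssi_dbm)
{
	/* rssi_dbm is whatever the driver last measured, e.g. -82 */
	ieee80211_cqm_rssi_notify(vif,
				  NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
				  rssi_dbm, GFP_KERNEL);
}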
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index 28a3a0957c9e..76a8bcd8ef11 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -168,6 +168,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
168 break; 168 break;
169 } 169 }
170 170
171 flush_delayed_work(&sdata->dec_tailroom_needed_wk);
171 drv_remove_interface(local, sdata); 172 drv_remove_interface(local, sdata);
172 } 173 }
173 174
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 14c5ba3a1b1c..3ebe4405a2d4 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -159,21 +159,23 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
159void 159void
160minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs) 160minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs)
161{ 161{
162 unsigned int cur_prob;
163
162 if (unlikely(mrs->attempts > 0)) { 164 if (unlikely(mrs->attempts > 0)) {
163 mrs->sample_skipped = 0; 165 mrs->sample_skipped = 0;
164 mrs->cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); 166 cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts);
165 if (unlikely(!mrs->att_hist)) { 167 if (unlikely(!mrs->att_hist)) {
166 mrs->prob_ewma = mrs->cur_prob; 168 mrs->prob_ewma = cur_prob;
167 } else { 169 } else {
168 /* update exponential weighted moving variance */ 170 /* update exponential weighted moving variance */
169 mrs->prob_ewmsd = minstrel_ewmsd(mrs->prob_ewmsd, 171 mrs->prob_ewmv = minstrel_ewmv(mrs->prob_ewmv,
170 mrs->cur_prob, 172 cur_prob,
171 mrs->prob_ewma, 173 mrs->prob_ewma,
172 EWMA_LEVEL); 174 EWMA_LEVEL);
173 175
174 /*update exponential weighted moving avarage */ 176 /*update exponential weighted moving avarage */
175 mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma, 177 mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma,
176 mrs->cur_prob, 178 cur_prob,
177 EWMA_LEVEL); 179 EWMA_LEVEL);
178 } 180 }
179 mrs->att_hist += mrs->attempts; 181 mrs->att_hist += mrs->attempts;
@@ -365,6 +367,11 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
365 return; 367 return;
366#endif 368#endif
367 369
370 /* Don't use EAPOL frames for sampling on non-mrr hw */
371 if (mp->hw->max_rates == 1 &&
372 (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO))
373 return;
374
368 delta = (mi->total_packets * sampling_ratio / 100) - 375 delta = (mi->total_packets * sampling_ratio / 100) -
369 (mi->sample_packets + mi->sample_deferred / 2); 376 (mi->sample_packets + mi->sample_deferred / 2);
370 377
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index c230bbe93262..be6c3f35f48b 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -14,7 +14,7 @@
14#define SAMPLE_COLUMNS 10 /* number of columns in sample table */ 14#define SAMPLE_COLUMNS 10 /* number of columns in sample table */
15 15
16/* scaled fraction values */ 16/* scaled fraction values */
17#define MINSTREL_SCALE 16 17#define MINSTREL_SCALE 12
18#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div) 18#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div)
19#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE) 19#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE)
20 20
@@ -36,21 +36,16 @@ minstrel_ewma(int old, int new, int weight)
36} 36}
37 37
38/* 38/*
39 * Perform EWMSD (Exponentially Weighted Moving Standard Deviation) calculation 39 * Perform EWMV (Exponentially Weighted Moving Variance) calculation
40 */ 40 */
41static inline int 41static inline int
42minstrel_ewmsd(int old_ewmsd, int cur_prob, int prob_ewma, int weight) 42minstrel_ewmv(int old_ewmv, int cur_prob, int prob_ewma, int weight)
43{ 43{
44 int diff, incr, tmp_var; 44 int diff, incr;
45 45
46 /* calculate exponential weighted moving variance */ 46 diff = cur_prob - prob_ewma;
47 diff = MINSTREL_TRUNC((cur_prob - prob_ewma) * 1000000);
48 incr = (EWMA_DIV - weight) * diff / EWMA_DIV; 47 incr = (EWMA_DIV - weight) * diff / EWMA_DIV;
49 tmp_var = old_ewmsd * old_ewmsd; 48 return weight * (old_ewmv + MINSTREL_TRUNC(diff * incr)) / EWMA_DIV;
50 tmp_var = weight * (tmp_var + diff * incr / 1000000) / EWMA_DIV;
51
52 /* return standard deviation */
53 return (u16) int_sqrt(tmp_var);
54} 49}
55 50
56struct minstrel_rate_stats { 51struct minstrel_rate_stats {
@@ -59,15 +54,13 @@ struct minstrel_rate_stats {
59 u16 success, last_success; 54 u16 success, last_success;
60 55
61 /* total attempts/success counters */ 56 /* total attempts/success counters */
62 u64 att_hist, succ_hist; 57 u32 att_hist, succ_hist;
63 58
64 /* statistis of packet delivery probability 59 /* statistis of packet delivery probability
65 * cur_prob - current prob within last update intervall
66 * prob_ewma - exponential weighted moving average of prob 60 * prob_ewma - exponential weighted moving average of prob
67 * prob_ewmsd - exp. weighted moving standard deviation of prob */ 61 * prob_ewmsd - exp. weighted moving standard deviation of prob */
68 unsigned int cur_prob; 62 u16 prob_ewma;
69 unsigned int prob_ewma; 63 u16 prob_ewmv;
70 u16 prob_ewmsd;
71 64
72 /* maximum retry counts */ 65 /* maximum retry counts */
73 u8 retry_count; 66 u8 retry_count;
@@ -153,6 +146,14 @@ struct minstrel_debugfs_info {
153 char buf[]; 146 char buf[];
154}; 147};
155 148
149/* Get EWMSD (Exponentially Weighted Moving Standard Deviation) * 10 */
150static inline int
151minstrel_get_ewmsd10(struct minstrel_rate_stats *mrs)
152{
153 unsigned int ewmv = mrs->prob_ewmv;
154 return int_sqrt(MINSTREL_TRUNC(ewmv * 1000 * 1000));
155}
156
156extern const struct rate_control_ops mac80211_minstrel; 157extern const struct rate_control_ops mac80211_minstrel;
157void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); 158void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
158void minstrel_remove_sta_debugfs(void *priv, void *priv_sta); 159void minstrel_remove_sta_debugfs(void *priv, void *priv_sta);
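The minstrel rework above stops storing a per-rate standard deviation and instead keeps the exponentially weighted moving variance, deriving the displayed deviation only when the debugfs files are read. A self-contained sketch of the arithmetic, assuming the header's scaled-fraction constants (MINSTREL_SCALE 12, EWMA_LEVEL 96, EWMA_DIV 128) and a simplified starting average:

#include <stdio.h>
#include <math.h>

#define MINSTREL_SCALE		12
#define MINSTREL_FRAC(val, div)	(((val) << MINSTREL_SCALE) / (div))
#define MINSTREL_TRUNC(val)	((val) >> MINSTREL_SCALE)
#define EWMA_LEVEL		96	/* assumed to match rc80211_minstrel.h */
#define EWMA_DIV		128

static int minstrel_ewma(int old, int cur, int weight)
{
	return old + (EWMA_DIV - weight) * (cur - old) / EWMA_DIV;
}

static int minstrel_ewmv(int old_ewmv, int cur_prob, int prob_ewma, int weight)
{
	int diff = cur_prob - prob_ewma;
	int incr = (EWMA_DIV - weight) * diff / EWMA_DIV;

	return weight * (old_ewmv + MINSTREL_TRUNC(diff * incr)) / EWMA_DIV;
}

int main(void)
{
	int prob_ewma = MINSTREL_FRAC(50, 100);	/* start at 50 % for simplicity */
	int prob_ewmv = 0;
	int samples[] = { 90, 30, 70, 100, 60 };/* per-interval success rate in % */
	int i;

	for (i = 0; i < (int)(sizeof(samples) / sizeof(samples[0])); i++) {
		int cur_prob = MINSTREL_FRAC(samples[i], 100);

		/* variance first (uses the old average), then the average */
		prob_ewmv = minstrel_ewmv(prob_ewmv, cur_prob, prob_ewma, EWMA_LEVEL);
		prob_ewma = minstrel_ewma(prob_ewma, cur_prob, EWMA_LEVEL);
	}

	/* same derivation as minstrel_get_ewmsd10(): EWMSD in percent, times 10 */
	printf("avg(prob) = %d.%d %%\n",
	       MINSTREL_TRUNC(prob_ewma * 1000) / 10,
	       MINSTREL_TRUNC(prob_ewma * 1000) % 10);
	printf("sd(prob) * 10 = %d\n",
	       (int)sqrt(MINSTREL_TRUNC(prob_ewmv * 1000 * 1000)));
	return 0;
}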
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 820b0abc9c0d..36fc971deb86 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -75,7 +75,7 @@ minstrel_stats_open(struct inode *inode, struct file *file)
75{ 75{
76 struct minstrel_sta_info *mi = inode->i_private; 76 struct minstrel_sta_info *mi = inode->i_private;
77 struct minstrel_debugfs_info *ms; 77 struct minstrel_debugfs_info *ms;
78 unsigned int i, tp_max, tp_avg, prob, eprob; 78 unsigned int i, tp_max, tp_avg, eprob;
79 char *p; 79 char *p;
80 80
81 ms = kmalloc(2048, GFP_KERNEL); 81 ms = kmalloc(2048, GFP_KERNEL);
@@ -86,13 +86,14 @@ minstrel_stats_open(struct inode *inode, struct file *file)
86 p = ms->buf; 86 p = ms->buf;
87 p += sprintf(p, "\n"); 87 p += sprintf(p, "\n");
88 p += sprintf(p, 88 p += sprintf(p,
89 "best __________rate_________ ________statistics________ ________last_______ ______sum-of________\n"); 89 "best __________rate_________ ________statistics________ ____last_____ ______sum-of________\n");
90 p += sprintf(p, 90 p += sprintf(p,
91 "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); 91 "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n");
92 92
93 for (i = 0; i < mi->n_rates; i++) { 93 for (i = 0; i < mi->n_rates; i++) {
94 struct minstrel_rate *mr = &mi->r[i]; 94 struct minstrel_rate *mr = &mi->r[i];
95 struct minstrel_rate_stats *mrs = &mi->r[i].stats; 95 struct minstrel_rate_stats *mrs = &mi->r[i].stats;
96 unsigned int prob_ewmsd;
96 97
97 *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' '; 98 *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' ';
98 *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' '; 99 *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' ';
@@ -107,17 +108,16 @@ minstrel_stats_open(struct inode *inode, struct file *file)
107 108
108 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); 109 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
109 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); 110 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
110 prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
111 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 111 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
112 prob_ewmsd = minstrel_get_ewmsd10(mrs);
112 113
113 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" 114 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
114 " %3u.%1u %3u %3u %-3u " 115 " %3u %3u %-3u "
115 "%9llu %-9llu\n", 116 "%9llu %-9llu\n",
116 tp_max / 10, tp_max % 10, 117 tp_max / 10, tp_max % 10,
117 tp_avg / 10, tp_avg % 10, 118 tp_avg / 10, tp_avg % 10,
118 eprob / 10, eprob % 10, 119 eprob / 10, eprob % 10,
119 mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, 120 prob_ewmsd / 10, prob_ewmsd % 10,
120 prob / 10, prob % 10,
121 mrs->retry_count, 121 mrs->retry_count,
122 mrs->last_success, 122 mrs->last_success,
123 mrs->last_attempts, 123 mrs->last_attempts,
@@ -148,7 +148,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
148{ 148{
149 struct minstrel_sta_info *mi = inode->i_private; 149 struct minstrel_sta_info *mi = inode->i_private;
150 struct minstrel_debugfs_info *ms; 150 struct minstrel_debugfs_info *ms;
151 unsigned int i, tp_max, tp_avg, prob, eprob; 151 unsigned int i, tp_max, tp_avg, eprob;
152 char *p; 152 char *p;
153 153
154 ms = kmalloc(2048, GFP_KERNEL); 154 ms = kmalloc(2048, GFP_KERNEL);
@@ -161,6 +161,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
161 for (i = 0; i < mi->n_rates; i++) { 161 for (i = 0; i < mi->n_rates; i++) {
162 struct minstrel_rate *mr = &mi->r[i]; 162 struct minstrel_rate *mr = &mi->r[i];
163 struct minstrel_rate_stats *mrs = &mi->r[i].stats; 163 struct minstrel_rate_stats *mrs = &mi->r[i].stats;
164 unsigned int prob_ewmsd;
164 165
165 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : "")); 166 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : ""));
166 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : "")); 167 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : ""));
@@ -175,16 +176,15 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
175 176
176 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); 177 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
177 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); 178 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
178 prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
179 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 179 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
180 prob_ewmsd = minstrel_get_ewmsd10(mrs);
180 181
181 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u," 182 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u,"
182 "%llu,%llu,%d,%d\n", 183 "%llu,%llu,%d,%d\n",
183 tp_max / 10, tp_max % 10, 184 tp_max / 10, tp_max % 10,
184 tp_avg / 10, tp_avg % 10, 185 tp_avg / 10, tp_avg % 10,
185 eprob / 10, eprob % 10, 186 eprob / 10, eprob % 10,
186 mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, 187 prob_ewmsd / 10, prob_ewmsd % 10,
187 prob / 10, prob % 10,
188 mrs->retry_count, 188 mrs->retry_count,
189 mrs->last_success, 189 mrs->last_success,
190 mrs->last_attempts, 190 mrs->last_attempts,
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 30fbabf4bcbc..8e783e197e93 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -14,6 +14,7 @@
14#include <linux/ieee80211.h> 14#include <linux/ieee80211.h>
15#include <net/mac80211.h> 15#include <net/mac80211.h>
16#include "rate.h" 16#include "rate.h"
17#include "sta_info.h"
17#include "rc80211_minstrel.h" 18#include "rc80211_minstrel.h"
18#include "rc80211_minstrel_ht.h" 19#include "rc80211_minstrel_ht.h"
19 20
@@ -154,67 +155,47 @@ MODULE_PARM_DESC(minstrel_vht_only,
154const struct mcs_group minstrel_mcs_groups[] = { 155const struct mcs_group minstrel_mcs_groups[] = {
155 MCS_GROUP(1, 0, BW_20), 156 MCS_GROUP(1, 0, BW_20),
156 MCS_GROUP(2, 0, BW_20), 157 MCS_GROUP(2, 0, BW_20),
157#if MINSTREL_MAX_STREAMS >= 3
158 MCS_GROUP(3, 0, BW_20), 158 MCS_GROUP(3, 0, BW_20),
159#endif
160 159
161 MCS_GROUP(1, 1, BW_20), 160 MCS_GROUP(1, 1, BW_20),
162 MCS_GROUP(2, 1, BW_20), 161 MCS_GROUP(2, 1, BW_20),
163#if MINSTREL_MAX_STREAMS >= 3
164 MCS_GROUP(3, 1, BW_20), 162 MCS_GROUP(3, 1, BW_20),
165#endif
166 163
167 MCS_GROUP(1, 0, BW_40), 164 MCS_GROUP(1, 0, BW_40),
168 MCS_GROUP(2, 0, BW_40), 165 MCS_GROUP(2, 0, BW_40),
169#if MINSTREL_MAX_STREAMS >= 3
170 MCS_GROUP(3, 0, BW_40), 166 MCS_GROUP(3, 0, BW_40),
171#endif
172 167
173 MCS_GROUP(1, 1, BW_40), 168 MCS_GROUP(1, 1, BW_40),
174 MCS_GROUP(2, 1, BW_40), 169 MCS_GROUP(2, 1, BW_40),
175#if MINSTREL_MAX_STREAMS >= 3
176 MCS_GROUP(3, 1, BW_40), 170 MCS_GROUP(3, 1, BW_40),
177#endif
178 171
179 CCK_GROUP, 172 CCK_GROUP,
180 173
181#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT 174#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
182 VHT_GROUP(1, 0, BW_20), 175 VHT_GROUP(1, 0, BW_20),
183 VHT_GROUP(2, 0, BW_20), 176 VHT_GROUP(2, 0, BW_20),
184#if MINSTREL_MAX_STREAMS >= 3
185 VHT_GROUP(3, 0, BW_20), 177 VHT_GROUP(3, 0, BW_20),
186#endif
187 178
188 VHT_GROUP(1, 1, BW_20), 179 VHT_GROUP(1, 1, BW_20),
189 VHT_GROUP(2, 1, BW_20), 180 VHT_GROUP(2, 1, BW_20),
190#if MINSTREL_MAX_STREAMS >= 3
191 VHT_GROUP(3, 1, BW_20), 181 VHT_GROUP(3, 1, BW_20),
192#endif
193 182
194 VHT_GROUP(1, 0, BW_40), 183 VHT_GROUP(1, 0, BW_40),
195 VHT_GROUP(2, 0, BW_40), 184 VHT_GROUP(2, 0, BW_40),
196#if MINSTREL_MAX_STREAMS >= 3
197 VHT_GROUP(3, 0, BW_40), 185 VHT_GROUP(3, 0, BW_40),
198#endif
199 186
200 VHT_GROUP(1, 1, BW_40), 187 VHT_GROUP(1, 1, BW_40),
201 VHT_GROUP(2, 1, BW_40), 188 VHT_GROUP(2, 1, BW_40),
202#if MINSTREL_MAX_STREAMS >= 3
203 VHT_GROUP(3, 1, BW_40), 189 VHT_GROUP(3, 1, BW_40),
204#endif
205 190
206 VHT_GROUP(1, 0, BW_80), 191 VHT_GROUP(1, 0, BW_80),
207 VHT_GROUP(2, 0, BW_80), 192 VHT_GROUP(2, 0, BW_80),
208#if MINSTREL_MAX_STREAMS >= 3
209 VHT_GROUP(3, 0, BW_80), 193 VHT_GROUP(3, 0, BW_80),
210#endif
211 194
212 VHT_GROUP(1, 1, BW_80), 195 VHT_GROUP(1, 1, BW_80),
213 VHT_GROUP(2, 1, BW_80), 196 VHT_GROUP(2, 1, BW_80),
214#if MINSTREL_MAX_STREAMS >= 3
215 VHT_GROUP(3, 1, BW_80), 197 VHT_GROUP(3, 1, BW_80),
216#endif 198#endif
217#endif
218}; 199};
219 200
220static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly; 201static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly;
@@ -301,7 +282,7 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
301 break; 282 break;
302 283
303 /* short preamble */ 284 /* short preamble */
304 if (!(mi->groups[group].supported & BIT(idx))) 285 if (!(mi->supported[group] & BIT(idx)))
305 idx += 4; 286 idx += 4;
306 } 287 }
307 return &mi->groups[group].rates[idx]; 288 return &mi->groups[group].rates[idx];
@@ -486,7 +467,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi)
486 MCS_GROUP_RATES].streams; 467 MCS_GROUP_RATES].streams;
487 for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { 468 for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
488 mg = &mi->groups[group]; 469 mg = &mi->groups[group];
489 if (!mg->supported || group == MINSTREL_CCK_GROUP) 470 if (!mi->supported[group] || group == MINSTREL_CCK_GROUP)
490 continue; 471 continue;
491 472
492 tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; 473 tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
@@ -540,7 +521,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
540 for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { 521 for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
541 522
542 mg = &mi->groups[group]; 523 mg = &mi->groups[group];
543 if (!mg->supported) 524 if (!mi->supported[group])
544 continue; 525 continue;
545 526
546 mi->sample_count++; 527 mi->sample_count++;
@@ -550,7 +531,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
550 tmp_group_tp_rate[j] = group; 531 tmp_group_tp_rate[j] = group;
551 532
552 for (i = 0; i < MCS_GROUP_RATES; i++) { 533 for (i = 0; i < MCS_GROUP_RATES; i++) {
553 if (!(mg->supported & BIT(i))) 534 if (!(mi->supported[group] & BIT(i)))
554 continue; 535 continue;
555 536
556 index = MCS_GROUP_RATES * group + i; 537 index = MCS_GROUP_RATES * group + i;
@@ -636,7 +617,7 @@ minstrel_set_next_sample_idx(struct minstrel_ht_sta *mi)
636 mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups); 617 mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups);
637 mg = &mi->groups[mi->sample_group]; 618 mg = &mi->groups[mi->sample_group];
638 619
639 if (!mg->supported) 620 if (!mi->supported[mi->sample_group])
640 continue; 621 continue;
641 622
642 if (++mg->index >= MCS_GROUP_RATES) { 623 if (++mg->index >= MCS_GROUP_RATES) {
@@ -657,7 +638,7 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary)
657 while (group > 0) { 638 while (group > 0) {
658 group--; 639 group--;
659 640
660 if (!mi->groups[group].supported) 641 if (!mi->supported[group])
661 continue; 642 continue;
662 643
663 if (minstrel_mcs_groups[group].streams > 644 if (minstrel_mcs_groups[group].streams >
@@ -994,7 +975,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
994 sample_idx = sample_table[mg->column][mg->index]; 975 sample_idx = sample_table[mg->column][mg->index];
995 minstrel_set_next_sample_idx(mi); 976 minstrel_set_next_sample_idx(mi);
996 977
997 if (!(mg->supported & BIT(sample_idx))) 978 if (!(mi->supported[sample_group] & BIT(sample_idx)))
998 return -1; 979 return -1;
999 980
1000 mrs = &mg->rates[sample_idx]; 981 mrs = &mg->rates[sample_idx];
@@ -1049,22 +1030,6 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
1049} 1030}
1050 1031
1051static void 1032static void
1052minstrel_ht_check_cck_shortpreamble(struct minstrel_priv *mp,
1053 struct minstrel_ht_sta *mi, bool val)
1054{
1055 u8 supported = mi->groups[MINSTREL_CCK_GROUP].supported;
1056
1057 if (!supported || !mi->cck_supported_short)
1058 return;
1059
1060 if (supported & (mi->cck_supported_short << (val * 4)))
1061 return;
1062
1063 supported ^= mi->cck_supported_short | (mi->cck_supported_short << 4);
1064 mi->groups[MINSTREL_CCK_GROUP].supported = supported;
1065}
1066
1067static void
1068minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, 1033minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
1069 struct ieee80211_tx_rate_control *txrc) 1034 struct ieee80211_tx_rate_control *txrc)
1070{ 1035{
@@ -1087,7 +1052,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
1087 minstrel_aggr_check(sta, txrc->skb); 1052 minstrel_aggr_check(sta, txrc->skb);
1088 1053
1089 info->flags |= mi->tx_flags; 1054 info->flags |= mi->tx_flags;
1090 minstrel_ht_check_cck_shortpreamble(mp, mi, txrc->short_preamble);
1091 1055
1092#ifdef CONFIG_MAC80211_DEBUGFS 1056#ifdef CONFIG_MAC80211_DEBUGFS
1093 if (mp->fixed_rate_idx != -1) 1057 if (mp->fixed_rate_idx != -1)
@@ -1154,7 +1118,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
1154 mi->cck_supported_short |= BIT(i); 1118 mi->cck_supported_short |= BIT(i);
1155 } 1119 }
1156 1120
1157 mi->groups[MINSTREL_CCK_GROUP].supported = mi->cck_supported; 1121 mi->supported[MINSTREL_CCK_GROUP] = mi->cck_supported;
1158} 1122}
1159 1123
1160static void 1124static void
@@ -1168,6 +1132,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1168 struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; 1132 struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs;
1169 u16 sta_cap = sta->ht_cap.cap; 1133 u16 sta_cap = sta->ht_cap.cap;
1170 struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; 1134 struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
1135 struct sta_info *sinfo = container_of(sta, struct sta_info, sta);
1171 int use_vht; 1136 int use_vht;
1172 int n_supported = 0; 1137 int n_supported = 0;
1173 int ack_dur; 1138 int ack_dur;
@@ -1224,7 +1189,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1224 u32 gflags = minstrel_mcs_groups[i].flags; 1189 u32 gflags = minstrel_mcs_groups[i].flags;
1225 int bw, nss; 1190 int bw, nss;
1226 1191
1227 mi->groups[i].supported = 0; 1192 mi->supported[i] = 0;
1228 if (i == MINSTREL_CCK_GROUP) { 1193 if (i == MINSTREL_CCK_GROUP) {
1229 minstrel_ht_update_cck(mp, mi, sband, sta); 1194 minstrel_ht_update_cck(mp, mi, sband, sta);
1230 continue; 1195 continue;
@@ -1256,8 +1221,8 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1256 if (use_vht && minstrel_vht_only) 1221 if (use_vht && minstrel_vht_only)
1257 continue; 1222 continue;
1258#endif 1223#endif
1259 mi->groups[i].supported = mcs->rx_mask[nss - 1]; 1224 mi->supported[i] = mcs->rx_mask[nss - 1];
1260 if (mi->groups[i].supported) 1225 if (mi->supported[i])
1261 n_supported++; 1226 n_supported++;
1262 continue; 1227 continue;
1263 } 1228 }
@@ -1283,16 +1248,19 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1283 else 1248 else
1284 bw = BW_20; 1249 bw = BW_20;
1285 1250
1286 mi->groups[i].supported = minstrel_get_valid_vht_rates(bw, nss, 1251 mi->supported[i] = minstrel_get_valid_vht_rates(bw, nss,
1287 vht_cap->vht_mcs.tx_mcs_map); 1252 vht_cap->vht_mcs.tx_mcs_map);
1288 1253
1289 if (mi->groups[i].supported) 1254 if (mi->supported[i])
1290 n_supported++; 1255 n_supported++;
1291 } 1256 }
1292 1257
1293 if (!n_supported) 1258 if (!n_supported)
1294 goto use_legacy; 1259 goto use_legacy;
1295 1260
1261 if (test_sta_flag(sinfo, WLAN_STA_SHORT_PREAMBLE))
1262 mi->cck_supported_short |= mi->cck_supported_short << 4;
1263
1296 /* create an initial rate table with the lowest supported rates */ 1264 /* create an initial rate table with the lowest supported rates */
1297 minstrel_ht_update_stats(mp, mi); 1265 minstrel_ht_update_stats(mp, mi);
1298 minstrel_ht_update_rates(mp, mi); 1266 minstrel_ht_update_rates(mp, mi);
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
index e8b52a94d24b..de1646c42e82 100644
--- a/net/mac80211/rc80211_minstrel_ht.h
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -52,9 +52,6 @@ struct minstrel_mcs_group_data {
52 u8 index; 52 u8 index;
53 u8 column; 53 u8 column;
54 54
55 /* bitfield of supported MCS rates of this group */
56 u16 supported;
57
58 /* sorted rate set within a MCS group*/ 55 /* sorted rate set within a MCS group*/
59 u16 max_group_tp_rate[MAX_THR_RATES]; 56 u16 max_group_tp_rate[MAX_THR_RATES];
60 u16 max_group_prob_rate; 57 u16 max_group_prob_rate;
@@ -101,6 +98,9 @@ struct minstrel_ht_sta {
101 u8 cck_supported; 98 u8 cck_supported;
102 u8 cck_supported_short; 99 u8 cck_supported_short;
103 100
101 /* Bitfield of supported MCS rates of all groups */
102 u16 supported[MINSTREL_GROUPS_NB];
103
104 /* MCS rate group info and statistics */ 104 /* MCS rate group info and statistics */
105 struct minstrel_mcs_group_data groups[MINSTREL_GROUPS_NB]; 105 struct minstrel_mcs_group_data groups[MINSTREL_GROUPS_NB];
106}; 106};
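With the supported bitmap moved out of struct minstrel_mcs_group_data into a single array on the station, the rate-selection paths test availability without touching the per-group statistics at all. A minimal sketch of how the bitmaps are consulted after the change (the loop body is illustrative):

	int group, i;

	for (group = 0; group < MINSTREL_GROUPS_NB; group++) {
		if (!mi->supported[group])		/* whole group unusable */
			continue;
		for (i = 0; i < MCS_GROUP_RATES; i++) {
			if (!(mi->supported[group] & BIT(i)))
				continue;
			/* rate (group, i) may be sampled or selected here */
		}
	}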
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 5320e35ed3d0..7d969e300fb3 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -19,12 +19,12 @@ static char *
19minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) 19minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
20{ 20{
21 const struct mcs_group *mg; 21 const struct mcs_group *mg;
22 unsigned int j, tp_max, tp_avg, prob, eprob, tx_time; 22 unsigned int j, tp_max, tp_avg, eprob, tx_time;
23 char htmode = '2'; 23 char htmode = '2';
24 char gimode = 'L'; 24 char gimode = 'L';
25 u32 gflags; 25 u32 gflags;
26 26
27 if (!mi->groups[i].supported) 27 if (!mi->supported[i])
28 return p; 28 return p;
29 29
30 mg = &minstrel_mcs_groups[i]; 30 mg = &minstrel_mcs_groups[i];
@@ -41,8 +41,9 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
41 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; 41 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
42 static const int bitrates[4] = { 10, 20, 55, 110 }; 42 static const int bitrates[4] = { 10, 20, 55, 110 };
43 int idx = i * MCS_GROUP_RATES + j; 43 int idx = i * MCS_GROUP_RATES + j;
44 unsigned int prob_ewmsd;
44 45
45 if (!(mi->groups[i].supported & BIT(j))) 46 if (!(mi->supported[i] & BIT(j)))
46 continue; 47 continue;
47 48
48 if (gflags & IEEE80211_TX_RC_MCS) { 49 if (gflags & IEEE80211_TX_RC_MCS) {
@@ -83,17 +84,16 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
83 84
84 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); 85 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
85 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); 86 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
86 prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
87 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 87 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
88 prob_ewmsd = minstrel_get_ewmsd10(mrs);
88 89
89 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" 90 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
90 " %3u.%1u %3u %3u %-3u " 91 " %3u %3u %-3u "
91 "%9llu %-9llu\n", 92 "%9llu %-9llu\n",
92 tp_max / 10, tp_max % 10, 93 tp_max / 10, tp_max % 10,
93 tp_avg / 10, tp_avg % 10, 94 tp_avg / 10, tp_avg % 10,
94 eprob / 10, eprob % 10, 95 eprob / 10, eprob % 10,
95 mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, 96 prob_ewmsd / 10, prob_ewmsd % 10,
96 prob / 10, prob % 10,
97 mrs->retry_count, 97 mrs->retry_count,
98 mrs->last_success, 98 mrs->last_success,
99 mrs->last_attempts, 99 mrs->last_attempts,
@@ -130,9 +130,9 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file)
130 130
131 p += sprintf(p, "\n"); 131 p += sprintf(p, "\n");
132 p += sprintf(p, 132 p += sprintf(p,
133 " best ____________rate__________ ________statistics________ ________last_______ ______sum-of________\n"); 133 " best ____________rate__________ ________statistics________ _____last____ ______sum-of________\n");
134 p += sprintf(p, 134 p += sprintf(p,
135 "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); 135 "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n");
136 136
137 p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p); 137 p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p);
138 for (i = 0; i < MINSTREL_CCK_GROUP; i++) 138 for (i = 0; i < MINSTREL_CCK_GROUP; i++)
@@ -165,12 +165,12 @@ static char *
165minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) 165minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
166{ 166{
167 const struct mcs_group *mg; 167 const struct mcs_group *mg;
168 unsigned int j, tp_max, tp_avg, prob, eprob, tx_time; 168 unsigned int j, tp_max, tp_avg, eprob, tx_time;
169 char htmode = '2'; 169 char htmode = '2';
170 char gimode = 'L'; 170 char gimode = 'L';
171 u32 gflags; 171 u32 gflags;
172 172
173 if (!mi->groups[i].supported) 173 if (!mi->supported[i])
174 return p; 174 return p;
175 175
176 mg = &minstrel_mcs_groups[i]; 176 mg = &minstrel_mcs_groups[i];
@@ -187,8 +187,9 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
187 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; 187 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
188 static const int bitrates[4] = { 10, 20, 55, 110 }; 188 static const int bitrates[4] = { 10, 20, 55, 110 };
189 int idx = i * MCS_GROUP_RATES + j; 189 int idx = i * MCS_GROUP_RATES + j;
190 unsigned int prob_ewmsd;
190 191
191 if (!(mi->groups[i].supported & BIT(j))) 192 if (!(mi->supported[i] & BIT(j)))
192 continue; 193 continue;
193 194
194 if (gflags & IEEE80211_TX_RC_MCS) { 195 if (gflags & IEEE80211_TX_RC_MCS) {
@@ -226,16 +227,15 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
226 227
227 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); 228 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
228 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); 229 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
229 prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
230 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 230 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
231 prob_ewmsd = minstrel_get_ewmsd10(mrs);
231 232
232 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u," 233 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,"
233 "%u,%llu,%llu,", 234 "%u,%llu,%llu,",
234 tp_max / 10, tp_max % 10, 235 tp_max / 10, tp_max % 10,
235 tp_avg / 10, tp_avg % 10, 236 tp_avg / 10, tp_avg % 10,
236 eprob / 10, eprob % 10, 237 eprob / 10, eprob % 10,
237 mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, 238 prob_ewmsd / 10, prob_ewmsd % 10,
238 prob / 10, prob % 10,
239 mrs->retry_count, 239 mrs->retry_count,
240 mrs->last_success, 240 mrs->last_success,
241 mrs->last_attempts, 241 mrs->last_attempts,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3090dd4342f6..4d7543d1a62c 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4,7 +4,7 @@
4 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> 4 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
5 * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> 5 * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
6 * Copyright 2013-2014 Intel Mobile Communications GmbH 6 * Copyright 2013-2014 Intel Mobile Communications GmbH
7 * Copyright(c) 2015 - 2016 Intel Deutschland GmbH 7 * Copyright(c) 2015 - 2017 Intel Deutschland GmbH
8 * 8 *
9 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as 10 * it under the terms of the GNU General Public License version 2 as
@@ -208,6 +208,51 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
208 return len; 208 return len;
209} 209}
210 210
211static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata,
212 struct sk_buff *skb,
213 int rtap_vendor_space)
214{
215 struct {
216 struct ieee80211_hdr_3addr hdr;
217 u8 category;
218 u8 action_code;
219 } __packed action;
220
221 if (!sdata)
222 return;
223
224 BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1);
225
226 if (skb->len < rtap_vendor_space + sizeof(action) +
227 VHT_MUMIMO_GROUPS_DATA_LEN)
228 return;
229
230 if (!is_valid_ether_addr(sdata->u.mntr.mu_follow_addr))
231 return;
232
233 skb_copy_bits(skb, rtap_vendor_space, &action, sizeof(action));
234
235 if (!ieee80211_is_action(action.hdr.frame_control))
236 return;
237
238 if (action.category != WLAN_CATEGORY_VHT)
239 return;
240
241 if (action.action_code != WLAN_VHT_ACTION_GROUPID_MGMT)
242 return;
243
244 if (!ether_addr_equal(action.hdr.addr1, sdata->u.mntr.mu_follow_addr))
245 return;
246
247 skb = skb_copy(skb, GFP_ATOMIC);
248 if (!skb)
249 return;
250
251 skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
252 skb_queue_tail(&sdata->skb_queue, skb);
253 ieee80211_queue_work(&sdata->local->hw, &sdata->work);
254}
255
211/* 256/*
212 * ieee80211_add_rx_radiotap_header - add radiotap header 257 * ieee80211_add_rx_radiotap_header - add radiotap header
213 * 258 *
@@ -515,7 +560,6 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
515 struct net_device *prev_dev = NULL; 560 struct net_device *prev_dev = NULL;
516 int present_fcs_len = 0; 561 int present_fcs_len = 0;
517 unsigned int rtap_vendor_space = 0; 562 unsigned int rtap_vendor_space = 0;
518 struct ieee80211_mgmt *mgmt;
519 struct ieee80211_sub_if_data *monitor_sdata = 563 struct ieee80211_sub_if_data *monitor_sdata =
520 rcu_dereference(local->monitor_sdata); 564 rcu_dereference(local->monitor_sdata);
521 565
@@ -553,6 +597,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
553 return remove_monitor_info(local, origskb, rtap_vendor_space); 597 return remove_monitor_info(local, origskb, rtap_vendor_space);
554 } 598 }
555 599
600 ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_vendor_space);
601
556 /* room for the radiotap header based on driver features */ 602 /* room for the radiotap header based on driver features */
557 rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, origskb); 603 rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, origskb);
558 needed_headroom = rt_hdrlen - rtap_vendor_space; 604 needed_headroom = rt_hdrlen - rtap_vendor_space;
@@ -618,23 +664,6 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
618 ieee80211_rx_stats(sdata->dev, skb->len); 664 ieee80211_rx_stats(sdata->dev, skb->len);
619 } 665 }
620 666
621 mgmt = (void *)skb->data;
622 if (monitor_sdata &&
623 skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + VHT_MUMIMO_GROUPS_DATA_LEN &&
624 ieee80211_is_action(mgmt->frame_control) &&
625 mgmt->u.action.category == WLAN_CATEGORY_VHT &&
626 mgmt->u.action.u.vht_group_notif.action_code == WLAN_VHT_ACTION_GROUPID_MGMT &&
627 is_valid_ether_addr(monitor_sdata->u.mntr.mu_follow_addr) &&
628 ether_addr_equal(mgmt->da, monitor_sdata->u.mntr.mu_follow_addr)) {
629 struct sk_buff *mu_skb = skb_copy(skb, GFP_ATOMIC);
630
631 if (mu_skb) {
632 mu_skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
633 skb_queue_tail(&monitor_sdata->skb_queue, mu_skb);
634 ieee80211_queue_work(&local->hw, &monitor_sdata->work);
635 }
636 }
637
638 if (prev_dev) { 667 if (prev_dev) {
639 skb->dev = prev_dev; 668 skb->dev = prev_dev;
640 netif_receive_skb(skb); 669 netif_receive_skb(skb);
@@ -1034,6 +1063,18 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata
1034 buf_size = tid_agg_rx->buf_size; 1063 buf_size = tid_agg_rx->buf_size;
1035 head_seq_num = tid_agg_rx->head_seq_num; 1064 head_seq_num = tid_agg_rx->head_seq_num;
1036 1065
1066 /*
1067 * If the current MPDU's SN is smaller than the SSN, it shouldn't
1068 * be reordered.
1069 */
1070 if (unlikely(!tid_agg_rx->started)) {
1071 if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) {
1072 ret = false;
1073 goto out;
1074 }
1075 tid_agg_rx->started = true;
1076 }
1077
1037 /* frame with out of date sequence number */ 1078 /* frame with out of date sequence number */
1038 if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) { 1079 if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) {
1039 dev_kfree_skb(skb); 1080 dev_kfree_skb(skb);
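The new started flag changes how leftover MPDUs are treated: until the first frame at or above the negotiated SSN arrives, frames with an earlier sequence number bypass the reorder buffer instead of being dropped as out of date. A standalone sketch of the rule, using the same modulo-4096 comparison as ieee80211_sn_less():

#include <stdbool.h>
#include <stdio.h>

#define SN_MODULO	4096

static bool sn_less(unsigned int a, unsigned int b)
{
	return ((a - b) & (SN_MODULO - 1)) > (SN_MODULO >> 1);
}

struct reorder_state {
	unsigned int head_seq_num;	/* starts at the negotiated SSN */
	bool started;
};

/* returns true if the frame should go through the reorder buffer */
static bool should_reorder(struct reorder_state *st, unsigned int sn)
{
	if (!st->started) {
		if (sn_less(sn, st->head_seq_num))
			return false;	/* pre-SSN leftover: deliver directly */
		st->started = true;
	}
	return true;
}

int main(void)
{
	struct reorder_state st = { .head_seq_num = 100, .started = false };

	printf("%d\n", should_reorder(&st, 98));	/* 0: before SSN, pass through */
	printf("%d\n", should_reorder(&st, 101));	/* 1: session has started */
	printf("%d\n", should_reorder(&st, 99));	/* 1: now handled by the window */
	return 0;
}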
@@ -1391,7 +1432,7 @@ EXPORT_SYMBOL(ieee80211_sta_pspoll);
1391void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid) 1432void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid)
1392{ 1433{
1393 struct sta_info *sta = container_of(pubsta, struct sta_info, sta); 1434 struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
1394 u8 ac = ieee802_1d_to_ac[tid & 7]; 1435 int ac = ieee80211_ac_from_tid(tid);
1395 1436
1396 /* 1437 /*
1397 * If this AC is not trigger-enabled do nothing unless the 1438 * If this AC is not trigger-enabled do nothing unless the
@@ -1908,7 +1949,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
1908 unsigned int frag, seq; 1949 unsigned int frag, seq;
1909 struct ieee80211_fragment_entry *entry; 1950 struct ieee80211_fragment_entry *entry;
1910 struct sk_buff *skb; 1951 struct sk_buff *skb;
1911 struct ieee80211_rx_status *status;
1912 1952
1913 hdr = (struct ieee80211_hdr *)rx->skb->data; 1953 hdr = (struct ieee80211_hdr *)rx->skb->data;
1914 fc = hdr->frame_control; 1954 fc = hdr->frame_control;
@@ -2034,9 +2074,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
2034 dev_kfree_skb(skb); 2074 dev_kfree_skb(skb);
2035 } 2075 }
2036 2076
2037 /* Complete frame has been reassembled - process it now */
2038 status = IEEE80211_SKB_RXCB(rx->skb);
2039
2040 out: 2077 out:
2041 ieee80211_led_rx(rx->local); 2078 ieee80211_led_rx(rx->local);
2042 out_no_led: 2079 out_no_led:
@@ -3602,6 +3639,27 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
3602 !ether_addr_equal(bssid, hdr->addr1)) 3639 !ether_addr_equal(bssid, hdr->addr1))
3603 return false; 3640 return false;
3604 } 3641 }
3642
3643 /*
3644 * 802.11-2016 Table 9-26 says that for data frames, A1 must be
3645 * the BSSID - we've checked that already but may have accepted
3646 * the wildcard (ff:ff:ff:ff:ff:ff).
3647 *
3648 * It also says:
3649 * The BSSID of the Data frame is determined as follows:
3650 * a) If the STA is contained within an AP or is associated
3651 * with an AP, the BSSID is the address currently in use
3652 * by the STA contained in the AP.
3653 *
3654 * So we should not accept data frames with an address that's
3655 * multicast.
3656 *
3657 * Accepting it also opens a security problem because stations
3658 * could encrypt it with the GTK and inject traffic that way.
3659 */
3660 if (ieee80211_is_data(hdr->frame_control) && multicast)
3661 return false;
3662
3605 return true; 3663 return true;
3606 case NL80211_IFTYPE_WDS: 3664 case NL80211_IFTYPE_WDS:
3607 if (bssid || !ieee80211_is_data(hdr->frame_control)) 3665 if (bssid || !ieee80211_is_data(hdr->frame_control))
@@ -3884,6 +3942,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
3884 stats->last_rate = sta_stats_encode_rate(status); 3942 stats->last_rate = sta_stats_encode_rate(status);
3885 3943
3886 stats->fragments++; 3944 stats->fragments++;
3945 stats->packets++;
3887 3946
3888 if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { 3947 if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
3889 stats->last_signal = status->signal; 3948 stats->last_signal = status->signal;
@@ -4077,15 +4136,17 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
4077 ieee80211_is_beacon(hdr->frame_control))) 4136 ieee80211_is_beacon(hdr->frame_control)))
4078 ieee80211_scan_rx(local, skb); 4137 ieee80211_scan_rx(local, skb);
4079 4138
4080 if (pubsta) { 4139 if (ieee80211_is_data(fc)) {
4081 rx.sta = container_of(pubsta, struct sta_info, sta);
4082 rx.sdata = rx.sta->sdata;
4083 if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
4084 return;
4085 goto out;
4086 } else if (ieee80211_is_data(fc)) {
4087 struct sta_info *sta, *prev_sta; 4140 struct sta_info *sta, *prev_sta;
4088 4141
4142 if (pubsta) {
4143 rx.sta = container_of(pubsta, struct sta_info, sta);
4144 rx.sdata = rx.sta->sdata;
4145 if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
4146 return;
4147 goto out;
4148 }
4149
4089 prev_sta = NULL; 4150 prev_sta = NULL;
4090 4151
4091 for_each_sta_info(local, hdr->addr2, sta, tmp) { 4152 for_each_sta_info(local, hdr->addr2, sta, tmp) {
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 23d8ac829279..faab3c490d2b 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -1120,7 +1120,6 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
1120 u32 rate_masks[NUM_NL80211_BANDS] = {}; 1120 u32 rate_masks[NUM_NL80211_BANDS] = {};
1121 u8 bands_used = 0; 1121 u8 bands_used = 0;
1122 u8 *ie; 1122 u8 *ie;
1123 size_t len;
1124 1123
1125 iebufsz = local->scan_ies_len + req->ie_len; 1124 iebufsz = local->scan_ies_len + req->ie_len;
1126 1125
@@ -1145,10 +1144,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
1145 1144
1146 ieee80211_prepare_scan_chandef(&chandef, req->scan_width); 1145 ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
1147 1146
1148 len = ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, 1147 ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
1149 &sched_scan_ies, req->ie, 1148 &sched_scan_ies, req->ie,
1150 req->ie_len, bands_used, 1149 req->ie_len, bands_used, rate_masks, &chandef);
1151 rate_masks, &chandef);
1152 1150
1153 ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); 1151 ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies);
1154 if (ret == 0) { 1152 if (ret == 0) {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 50c309094c37..3323a2fb289b 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -513,23 +513,23 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
513{ 513{
514 struct ieee80211_local *local = sta->local; 514 struct ieee80211_local *local = sta->local;
515 struct ieee80211_sub_if_data *sdata = sta->sdata; 515 struct ieee80211_sub_if_data *sdata = sta->sdata;
516 struct station_info *sinfo; 516 struct station_info *sinfo = NULL;
517 int err = 0; 517 int err = 0;
518 518
519 lockdep_assert_held(&local->sta_mtx); 519 lockdep_assert_held(&local->sta_mtx);
520 520
521 sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
522 if (!sinfo) {
523 err = -ENOMEM;
524 goto out_err;
525 }
526
527 /* check if STA exists already */ 521 /* check if STA exists already */
528 if (sta_info_get_bss(sdata, sta->sta.addr)) { 522 if (sta_info_get_bss(sdata, sta->sta.addr)) {
529 err = -EEXIST; 523 err = -EEXIST;
530 goto out_err; 524 goto out_err;
531 } 525 }
532 526
527 sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
528 if (!sinfo) {
529 err = -ENOMEM;
530 goto out_err;
531 }
532
533 local->num_sta++; 533 local->num_sta++;
534 local->sta_generation++; 534 local->sta_generation++;
535 smp_mb(); 535 smp_mb();
@@ -688,7 +688,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
688 } 688 }
689 689
690 /* No need to do anything if the driver does all */ 690 /* No need to do anything if the driver does all */
691 if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) 691 if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim)
692 return; 692 return;
693 693
694 if (sta->dead) 694 if (sta->dead)
@@ -1264,7 +1264,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
1264 sta_info_recalc_tim(sta); 1264 sta_info_recalc_tim(sta);
1265 1265
1266 ps_dbg(sdata, 1266 ps_dbg(sdata,
1267 "STA %pM aid %d sending %d filtered/%d PS frames since STA not sleeping anymore\n", 1267 "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n",
1268 sta->sta.addr, sta->sta.aid, filtered, buffered); 1268 sta->sta.addr, sta->sta.aid, filtered, buffered);
1269 1269
1270 ieee80211_check_fast_xmit(sta); 1270 ieee80211_check_fast_xmit(sta);
@@ -2051,16 +2051,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
2051{ 2051{
2052 struct ieee80211_sub_if_data *sdata = sta->sdata; 2052 struct ieee80211_sub_if_data *sdata = sta->sdata;
2053 struct ieee80211_local *local = sdata->local; 2053 struct ieee80211_local *local = sdata->local;
2054 struct rate_control_ref *ref = NULL;
2055 u32 thr = 0; 2054 u32 thr = 0;
2056 int i, ac, cpu; 2055 int i, ac, cpu;
2057 struct ieee80211_sta_rx_stats *last_rxstats; 2056 struct ieee80211_sta_rx_stats *last_rxstats;
2058 2057
2059 last_rxstats = sta_get_last_rx_stats(sta); 2058 last_rxstats = sta_get_last_rx_stats(sta);
2060 2059
2061 if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
2062 ref = local->rate_ctrl;
2063
2064 sinfo->generation = sdata->local->sta_generation; 2060 sinfo->generation = sdata->local->sta_generation;
2065 2061
2066 /* do before driver, so beacon filtering drivers have a 2062 /* do before driver, so beacon filtering drivers have a
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index dd06ef0b8861..e65cda34d2bc 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -189,6 +189,7 @@ struct tid_ampdu_tx {
189 * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and 189 * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and
190 * and ssn. 190 * and ssn.
191 * @removed: this session is removed (but might have been found due to RCU) 191 * @removed: this session is removed (but might have been found due to RCU)
192 * @started: this session has started (head ssn or higher was received)
192 * 193 *
193 * This structure's lifetime is managed by RCU, assignments to 194 * This structure's lifetime is managed by RCU, assignments to
194 * the array holding it must hold the aggregation mutex. 195 * the array holding it must hold the aggregation mutex.
@@ -212,8 +213,9 @@ struct tid_ampdu_rx {
212 u16 ssn; 213 u16 ssn;
213 u16 buf_size; 214 u16 buf_size;
214 u16 timeout; 215 u16 timeout;
215 bool auto_seq; 216 u8 auto_seq:1,
216 bool removed; 217 removed:1,
218 started:1;
217}; 219};
218 220
219/** 221/**
@@ -370,7 +372,7 @@ struct mesh_sta {
370 unsigned int fail_avg; 372 unsigned int fail_avg;
371}; 373};
372 374
373DECLARE_EWMA(signal, 1024, 8) 375DECLARE_EWMA(signal, 10, 8)
374 376
375struct ieee80211_sta_rx_stats { 377struct ieee80211_sta_rx_stats {
376 unsigned long packets; 378 unsigned long packets;
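Editor's note: the tid_ampdu_rx hunk packs three bool members into single-bit fields of one u8, so adding the new "started" flag does not grow the struct. A stand-alone C sketch of the size difference; the structs below are illustrative, not the kernel's.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct flags_bools {		/* three separate bool members */
	bool auto_seq;
	bool removed;
	bool started;
};

struct flags_bitfield {		/* three one-bit fields sharing a single byte */
	uint8_t auto_seq:1,
		removed:1,
		started:1;
};

int main(void)
{
	struct flags_bitfield f = { .removed = 1 };

	printf("bools:    %zu bytes\n", sizeof(struct flags_bools));	/* typically 3 */
	printf("bitfield: %zu bytes\n", sizeof(struct flags_bitfield));	/* typically 1 */
	printf("removed=%u started=%u\n", f.removed, f.started);
	return 0;
}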
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index ddf71c648cab..83b8b11f24ea 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -51,7 +51,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
51 struct ieee80211_hdr *hdr = (void *)skb->data; 51 struct ieee80211_hdr *hdr = (void *)skb->data;
52 int ac; 52 int ac;
53 53
54 if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) { 54 if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER |
55 IEEE80211_TX_CTL_AMPDU)) {
55 ieee80211_free_txskb(&local->hw, skb); 56 ieee80211_free_txskb(&local->hw, skb);
56 return; 57 return;
57 } 58 }
@@ -95,7 +96,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
95 */ 96 */
96 if (*p & IEEE80211_QOS_CTL_EOSP) 97 if (*p & IEEE80211_QOS_CTL_EOSP)
97 *p &= ~IEEE80211_QOS_CTL_EOSP; 98 *p &= ~IEEE80211_QOS_CTL_EOSP;
98 ac = ieee802_1d_to_ac[tid & 7]; 99 ac = ieee80211_ac_from_tid(tid);
99 } else { 100 } else {
100 ac = IEEE80211_AC_BE; 101 ac = IEEE80211_AC_BE;
101 } 102 }
@@ -462,9 +463,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
462 unsigned long flags; 463 unsigned long flags;
463 464
464 spin_lock_irqsave(&local->ack_status_lock, flags); 465 spin_lock_irqsave(&local->ack_status_lock, flags);
465 skb = idr_find(&local->ack_status_frames, info->ack_frame_id); 466 skb = idr_remove(&local->ack_status_frames, info->ack_frame_id);
466 if (skb)
467 idr_remove(&local->ack_status_frames, info->ack_frame_id);
468 spin_unlock_irqrestore(&local->ack_status_lock, flags); 467 spin_unlock_irqrestore(&local->ack_status_lock, flags);
469 468
470 if (!skb) 469 if (!skb)
@@ -541,6 +540,11 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
541 } else if (info->ack_frame_id) { 540 } else if (info->ack_frame_id) {
542 ieee80211_report_ack_skb(local, info, acked, dropped); 541 ieee80211_report_ack_skb(local, info, acked, dropped);
543 } 542 }
543
544 if (!dropped && skb->destructor) {
545 skb->wifi_acked_valid = 1;
546 skb->wifi_acked = acked;
547 }
544} 548}
545 549
546/* 550/*
@@ -633,10 +637,9 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw,
633 struct ieee80211_local *local = hw_to_local(hw); 637 struct ieee80211_local *local = hw_to_local(hw);
634 struct ieee80211_supported_band *sband; 638 struct ieee80211_supported_band *sband;
635 int retry_count; 639 int retry_count;
636 int rates_idx;
637 bool acked, noack_success; 640 bool acked, noack_success;
638 641
639 rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); 642 ieee80211_tx_get_rates(hw, info, &retry_count);
640 643
641 sband = hw->wiphy->bands[info->band]; 644 sband = hw->wiphy->bands[info->band];
642 645
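Editor's note: several hunks replace the open-coded ieee802_1d_to_ac[tid & 7] lookup with the ieee80211_ac_from_tid() helper; the underlying 802.1D user-priority to WMM access-category mapping is unchanged. A user-space sketch of that table follows; the AC numbering matches mac80211's convention (VO=0, VI=1, BE=2, BK=3), but the helper and table names here are stand-ins, not the kernel's.

#include <stdio.h>

enum { AC_VO = 0, AC_VI = 1, AC_BE = 2, AC_BK = 3 };

/* 802.1D user priority (TID 0..7) -> WMM access category */
static const int tid_to_ac[8] = {
	AC_BE,	/* 0: best effort */
	AC_BK,	/* 1: background */
	AC_BK,	/* 2: background */
	AC_BE,	/* 3: best effort */
	AC_VI,	/* 4: video */
	AC_VI,	/* 5: video */
	AC_VO,	/* 6: voice */
	AC_VO,	/* 7: voice */
};

static int ac_from_tid(int tid)
{
	return tid_to_ac[tid & 7];	/* mask keeps out-of-range TIDs in bounds */
}

int main(void)
{
	for (int tid = 0; tid < 8; tid++)
		printf("TID %d -> AC %d\n", tid, ac_from_tid(tid));
	return 0;
}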
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 92a47afaa989..0d645bc148d0 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1736,21 +1736,21 @@ TRACE_EVENT(drv_start_nan,
1736 LOCAL_ENTRY 1736 LOCAL_ENTRY
1737 VIF_ENTRY 1737 VIF_ENTRY
1738 __field(u8, master_pref) 1738 __field(u8, master_pref)
1739 __field(u8, dual) 1739 __field(u8, bands)
1740 ), 1740 ),
1741 1741
1742 TP_fast_assign( 1742 TP_fast_assign(
1743 LOCAL_ASSIGN; 1743 LOCAL_ASSIGN;
1744 VIF_ASSIGN; 1744 VIF_ASSIGN;
1745 __entry->master_pref = conf->master_pref; 1745 __entry->master_pref = conf->master_pref;
1746 __entry->dual = conf->dual; 1746 __entry->bands = conf->bands;
1747 ), 1747 ),
1748 1748
1749 TP_printk( 1749 TP_printk(
1750 LOCAL_PR_FMT VIF_PR_FMT 1750 LOCAL_PR_FMT VIF_PR_FMT
1751 ", master preference: %u, dual: %d", 1751 ", master preference: %u, bands: 0x%0x",
1752 LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, 1752 LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
1753 __entry->dual 1753 __entry->bands
1754 ) 1754 )
1755); 1755);
1756 1756
@@ -1787,7 +1787,7 @@ TRACE_EVENT(drv_nan_change_conf,
1787 LOCAL_ENTRY 1787 LOCAL_ENTRY
1788 VIF_ENTRY 1788 VIF_ENTRY
1789 __field(u8, master_pref) 1789 __field(u8, master_pref)
1790 __field(u8, dual) 1790 __field(u8, bands)
1791 __field(u32, changes) 1791 __field(u32, changes)
1792 ), 1792 ),
1793 1793
@@ -1795,15 +1795,15 @@ TRACE_EVENT(drv_nan_change_conf,
1795 LOCAL_ASSIGN; 1795 LOCAL_ASSIGN;
1796 VIF_ASSIGN; 1796 VIF_ASSIGN;
1797 __entry->master_pref = conf->master_pref; 1797 __entry->master_pref = conf->master_pref;
1798 __entry->dual = conf->dual; 1798 __entry->bands = conf->bands;
1799 __entry->changes = changes; 1799 __entry->changes = changes;
1800 ), 1800 ),
1801 1801
1802 TP_printk( 1802 TP_printk(
1803 LOCAL_PR_FMT VIF_PR_FMT 1803 LOCAL_PR_FMT VIF_PR_FMT
1804 ", master preference: %u, dual: %d, changes: 0x%x", 1804 ", master preference: %u, bands: 0x%0x, changes: 0x%x",
1805 LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, 1805 LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
1806 __entry->dual, __entry->changes 1806 __entry->bands, __entry->changes
1807 ) 1807 )
1808); 1808);
1809 1809
@@ -1996,23 +1996,26 @@ TRACE_EVENT(api_connection_loss,
1996 1996
1997TRACE_EVENT(api_cqm_rssi_notify, 1997TRACE_EVENT(api_cqm_rssi_notify,
1998 TP_PROTO(struct ieee80211_sub_if_data *sdata, 1998 TP_PROTO(struct ieee80211_sub_if_data *sdata,
1999 enum nl80211_cqm_rssi_threshold_event rssi_event), 1999 enum nl80211_cqm_rssi_threshold_event rssi_event,
2000 s32 rssi_level),
2000 2001
2001 TP_ARGS(sdata, rssi_event), 2002 TP_ARGS(sdata, rssi_event, rssi_level),
2002 2003
2003 TP_STRUCT__entry( 2004 TP_STRUCT__entry(
2004 VIF_ENTRY 2005 VIF_ENTRY
2005 __field(u32, rssi_event) 2006 __field(u32, rssi_event)
2007 __field(s32, rssi_level)
2006 ), 2008 ),
2007 2009
2008 TP_fast_assign( 2010 TP_fast_assign(
2009 VIF_ASSIGN; 2011 VIF_ASSIGN;
2010 __entry->rssi_event = rssi_event; 2012 __entry->rssi_event = rssi_event;
2013 __entry->rssi_level = rssi_level;
2011 ), 2014 ),
2012 2015
2013 TP_printk( 2016 TP_printk(
2014 VIF_PR_FMT " event:%d", 2017 VIF_PR_FMT " event:%d rssi:%d",
2015 VIF_PR_ARG, __entry->rssi_event 2018 VIF_PR_ARG, __entry->rssi_event, __entry->rssi_level
2016 ) 2019 )
2017); 2020);
2018 2021
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 797e847cbc49..ba8d7db0a071 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -16,6 +16,7 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/skbuff.h> 18#include <linux/skbuff.h>
19#include <linux/if_vlan.h>
19#include <linux/etherdevice.h> 20#include <linux/etherdevice.h>
20#include <linux/bitmap.h> 21#include <linux/bitmap.h>
21#include <linux/rcupdate.h> 22#include <linux/rcupdate.h>
@@ -63,6 +64,10 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
63 struct ieee80211_chanctx_conf *chanctx_conf; 64 struct ieee80211_chanctx_conf *chanctx_conf;
64 u32 rate_flags = 0; 65 u32 rate_flags = 0;
65 66
67 /* assume HW handles this */
68 if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
69 return 0;
70
66 rcu_read_lock(); 71 rcu_read_lock();
67 chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf); 72 chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf);
68 if (chanctx_conf) { 73 if (chanctx_conf) {
@@ -71,10 +76,6 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
71 } 76 }
72 rcu_read_unlock(); 77 rcu_read_unlock();
73 78
74 /* assume HW handles this */
75 if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
76 return 0;
77
78 /* uh huh? */ 79 /* uh huh? */
79 if (WARN_ON_ONCE(tx->rate.idx < 0)) 80 if (WARN_ON_ONCE(tx->rate.idx < 0))
80 return 0; 81 return 0;
@@ -1413,7 +1414,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
1413 txqi->txq.sta = &sta->sta; 1414 txqi->txq.sta = &sta->sta;
1414 sta->sta.txq[tid] = &txqi->txq; 1415 sta->sta.txq[tid] = &txqi->txq;
1415 txqi->txq.tid = tid; 1416 txqi->txq.tid = tid;
1416 txqi->txq.ac = ieee802_1d_to_ac[tid & 7]; 1417 txqi->txq.ac = ieee80211_ac_from_tid(tid);
1417 } else { 1418 } else {
1418 sdata->vif.txq = &txqi->txq; 1419 sdata->vif.txq = &txqi->txq;
1419 txqi->txq.tid = 0; 1420 txqi->txq.tid = 0;
@@ -3571,6 +3572,115 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
3571 rcu_read_unlock(); 3572 rcu_read_unlock();
3572} 3573}
3573 3574
3575static int ieee80211_change_da(struct sk_buff *skb, struct sta_info *sta)
3576{
3577 struct ethhdr *eth;
3578 int err;
3579
3580 err = skb_ensure_writable(skb, ETH_HLEN);
3581 if (unlikely(err))
3582 return err;
3583
3584 eth = (void *)skb->data;
3585 ether_addr_copy(eth->h_dest, sta->sta.addr);
3586
3587 return 0;
3588}
3589
3590static bool ieee80211_multicast_to_unicast(struct sk_buff *skb,
3591 struct net_device *dev)
3592{
3593 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
3594 const struct ethhdr *eth = (void *)skb->data;
3595 const struct vlan_ethhdr *ethvlan = (void *)skb->data;
3596 __be16 ethertype;
3597
3598 if (likely(!is_multicast_ether_addr(eth->h_dest)))
3599 return false;
3600
3601 switch (sdata->vif.type) {
3602 case NL80211_IFTYPE_AP_VLAN:
3603 if (sdata->u.vlan.sta)
3604 return false;
3605 if (sdata->wdev.use_4addr)
3606 return false;
3607 /* fall through */
3608 case NL80211_IFTYPE_AP:
3609 /* check runtime toggle for this bss */
3610 if (!sdata->bss->multicast_to_unicast)
3611 return false;
3612 break;
3613 default:
3614 return false;
3615 }
3616
3617 /* multicast to unicast conversion only for some payload */
3618 ethertype = eth->h_proto;
3619 if (ethertype == htons(ETH_P_8021Q) && skb->len >= VLAN_ETH_HLEN)
3620 ethertype = ethvlan->h_vlan_encapsulated_proto;
3621 switch (ethertype) {
3622 case htons(ETH_P_ARP):
3623 case htons(ETH_P_IP):
3624 case htons(ETH_P_IPV6):
3625 break;
3626 default:
3627 return false;
3628 }
3629
3630 return true;
3631}
3632
3633static void
3634ieee80211_convert_to_unicast(struct sk_buff *skb, struct net_device *dev,
3635 struct sk_buff_head *queue)
3636{
3637 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
3638 struct ieee80211_local *local = sdata->local;
3639 const struct ethhdr *eth = (struct ethhdr *)skb->data;
3640 struct sta_info *sta, *first = NULL;
3641 struct sk_buff *cloned_skb;
3642
3643 rcu_read_lock();
3644
3645 list_for_each_entry_rcu(sta, &local->sta_list, list) {
3646 if (sdata != sta->sdata)
3647 /* AP-VLAN mismatch */
3648 continue;
3649 if (unlikely(ether_addr_equal(eth->h_source, sta->sta.addr)))
3650 /* do not send back to source */
3651 continue;
3652 if (!first) {
3653 first = sta;
3654 continue;
3655 }
3656 cloned_skb = skb_clone(skb, GFP_ATOMIC);
3657 if (!cloned_skb)
3658 goto multicast;
3659 if (unlikely(ieee80211_change_da(cloned_skb, sta))) {
3660 dev_kfree_skb(cloned_skb);
3661 goto multicast;
3662 }
3663 __skb_queue_tail(queue, cloned_skb);
3664 }
3665
3666 if (likely(first)) {
3667 if (unlikely(ieee80211_change_da(skb, first)))
3668 goto multicast;
3669 __skb_queue_tail(queue, skb);
3670 } else {
3671 /* no STA connected, drop */
3672 kfree_skb(skb);
3673 skb = NULL;
3674 }
3675
3676 goto out;
3677multicast:
3678 __skb_queue_purge(queue);
3679 __skb_queue_tail(queue, skb);
3680out:
3681 rcu_read_unlock();
3682}
3683
3574/** 3684/**
3575 * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs 3685 * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs
3576 * @skb: packet to be sent 3686 * @skb: packet to be sent
@@ -3581,7 +3691,17 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
3581netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, 3691netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
3582 struct net_device *dev) 3692 struct net_device *dev)
3583{ 3693{
3584 __ieee80211_subif_start_xmit(skb, dev, 0); 3694 if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) {
3695 struct sk_buff_head queue;
3696
3697 __skb_queue_head_init(&queue);
3698 ieee80211_convert_to_unicast(skb, dev, &queue);
3699 while ((skb = __skb_dequeue(&queue)))
3700 __ieee80211_subif_start_xmit(skb, dev, 0);
3701 } else {
3702 __ieee80211_subif_start_xmit(skb, dev, 0);
3703 }
3704
3585 return NETDEV_TX_OK; 3705 return NETDEV_TX_OK;
3586} 3706}
3587 3707
@@ -4074,7 +4194,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
4074 } 4194 }
4075 4195
4076 if (ifmsh->sync_ops) 4196 if (ifmsh->sync_ops)
4077 ifmsh->sync_ops->adjust_tbtt(sdata, beacon); 4197 ifmsh->sync_ops->adjust_tsf(sdata, beacon);
4078 4198
4079 skb = dev_alloc_skb(local->tx_headroom + 4199 skb = dev_alloc_skb(local->tx_headroom +
4080 beacon->head_len + 4200 beacon->head_len +
@@ -4539,7 +4659,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
4539 struct sk_buff *skb, int tid, 4659 struct sk_buff *skb, int tid,
4540 enum nl80211_band band) 4660 enum nl80211_band band)
4541{ 4661{
4542 int ac = ieee802_1d_to_ac[tid & 7]; 4662 int ac = ieee80211_ac_from_tid(tid);
4543 4663
4544 skb_reset_mac_header(skb); 4664 skb_reset_mac_header(skb);
4545 skb_set_queue_mapping(skb, ac); 4665 skb_set_queue_mapping(skb, ac);
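Editor's note: the new multicast-to-unicast path in tx.c clones the frame once per associated station and rewrites the Ethernet destination address to that station's MAC before transmitting. A minimal user-space sketch of the destination rewrite on a raw Ethernet header; this shows only the frame-layout manipulation, with illustrative names and no skb or driver plumbing.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define ETH_ALEN 6

struct eth_hdr {
	uint8_t  dst[ETH_ALEN];
	uint8_t  src[ETH_ALEN];
	uint16_t proto;		/* network byte order */
} __attribute__((packed));

static int is_multicast(const uint8_t *addr)
{
	return addr[0] & 0x01;	/* I/G bit of the first octet */
}

static void rewrite_da(uint8_t *frame, const uint8_t *sta_addr)
{
	struct eth_hdr *eth = (struct eth_hdr *)frame;

	memcpy(eth->dst, sta_addr, ETH_ALEN);
}

int main(void)
{
	uint8_t frame[64] = {
		0x01, 0x00, 0x5e, 0x00, 0x00, 0x01,	/* multicast DA */
		0x02, 0x00, 0x00, 0x00, 0x00, 0x01,	/* SA */
		0x08, 0x00,				/* IPv4 ethertype */
	};
	const uint8_t sta[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x42 };

	printf("multicast before rewrite: %d\n", is_multicast(frame));
	rewrite_da(frame, sta);			/* one such copy per station */
	printf("multicast after rewrite:  %d\n", is_multicast(frame));
	return 0;
}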
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 43e45bb660bc..19ec2189d3ac 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -436,14 +436,10 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
436 struct sta_info *sta, u8 opmode, 436 struct sta_info *sta, u8 opmode,
437 enum nl80211_band band) 437 enum nl80211_band band)
438{ 438{
439 struct ieee80211_local *local = sdata->local;
440 struct ieee80211_supported_band *sband;
441 enum ieee80211_sta_rx_bandwidth new_bw; 439 enum ieee80211_sta_rx_bandwidth new_bw;
442 u32 changed = 0; 440 u32 changed = 0;
443 u8 nss; 441 u8 nss;
444 442
445 sband = local->hw.wiphy->bands[band];
446
447 /* ignore - no support for BF yet */ 443 /* ignore - no support for BF yet */
448 if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF) 444 if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)
449 return 0; 445 return 0;
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index efa3f48f1ec5..73e8f347802e 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -293,7 +293,8 @@ ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx)
293 return RX_DROP_UNUSABLE; 293 return RX_DROP_UNUSABLE;
294 ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key); 294 ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key);
295 /* remove ICV */ 295 /* remove ICV */
296 if (pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN)) 296 if (!(status->flag & RX_FLAG_ICV_STRIPPED) &&
297 pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN))
297 return RX_DROP_UNUSABLE; 298 return RX_DROP_UNUSABLE;
298 } 299 }
299 300
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 8af6dd388d11..c1ef22df865f 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -294,7 +294,8 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
294 return RX_DROP_UNUSABLE; 294 return RX_DROP_UNUSABLE;
295 295
296 /* Trim ICV */ 296 /* Trim ICV */
297 skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN); 297 if (!(status->flag & RX_FLAG_ICV_STRIPPED))
298 skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN);
298 299
299 /* Remove IV */ 300 /* Remove IV */
300 memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen); 301 memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen);
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index 6a3e1c2181d3..1e1c9b20bab7 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -18,6 +18,8 @@
18#include <linux/bug.h> 18#include <linux/bug.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/ieee802154.h> 20#include <linux/ieee802154.h>
21#include <linux/rculist.h>
22
21#include <crypto/aead.h> 23#include <crypto/aead.h>
22#include <crypto/skcipher.h> 24#include <crypto/skcipher.h>
23 25
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 5b77377e5a15..6414079aa729 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -7,7 +7,9 @@
7#include <linux/if_arp.h> 7#include <linux/if_arp.h>
8#include <linux/ipv6.h> 8#include <linux/ipv6.h>
9#include <linux/mpls.h> 9#include <linux/mpls.h>
10#include <linux/netconf.h>
10#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
12#include <linux/percpu.h>
11#include <net/ip.h> 13#include <net/ip.h>
12#include <net/dst.h> 14#include <net/dst.h>
13#include <net/sock.h> 15#include <net/sock.h>
@@ -17,8 +19,8 @@
17#include <net/netns/generic.h> 19#include <net/netns/generic.h>
18#if IS_ENABLED(CONFIG_IPV6) 20#if IS_ENABLED(CONFIG_IPV6)
19#include <net/ipv6.h> 21#include <net/ipv6.h>
20#include <net/addrconf.h>
21#endif 22#endif
23#include <net/addrconf.h>
22#include <net/nexthop.h> 24#include <net/nexthop.h>
23#include "internal.h" 25#include "internal.h"
24 26
@@ -48,11 +50,6 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
48 return rt; 50 return rt;
49} 51}
50 52
51static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
52{
53 return rcu_dereference_rtnl(dev->mpls_ptr);
54}
55
56bool mpls_output_possible(const struct net_device *dev) 53bool mpls_output_possible(const struct net_device *dev)
57{ 54{
58 return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); 55 return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
@@ -98,6 +95,31 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
98} 95}
99EXPORT_SYMBOL_GPL(mpls_pkt_too_big); 96EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
100 97
98void mpls_stats_inc_outucastpkts(struct net_device *dev,
99 const struct sk_buff *skb)
100{
101 struct mpls_dev *mdev;
102
103 if (skb->protocol == htons(ETH_P_MPLS_UC)) {
104 mdev = mpls_dev_get(dev);
105 if (mdev)
106 MPLS_INC_STATS_LEN(mdev, skb->len,
107 tx_packets,
108 tx_bytes);
109 } else if (skb->protocol == htons(ETH_P_IP)) {
110 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
111#if IS_ENABLED(CONFIG_IPV6)
112 } else if (skb->protocol == htons(ETH_P_IPV6)) {
113 struct inet6_dev *in6dev = __in6_dev_get(dev);
114
115 if (in6dev)
116 IP6_UPD_PO_STATS(dev_net(dev), in6dev,
117 IPSTATS_MIB_OUT, skb->len);
118#endif
119 }
120}
121EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts);
122
101static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb) 123static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb)
102{ 124{
103 struct mpls_entry_decoded dec; 125 struct mpls_entry_decoded dec;
@@ -255,6 +277,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
255 struct mpls_nh *nh; 277 struct mpls_nh *nh;
256 struct mpls_entry_decoded dec; 278 struct mpls_entry_decoded dec;
257 struct net_device *out_dev; 279 struct net_device *out_dev;
280 struct mpls_dev *out_mdev;
258 struct mpls_dev *mdev; 281 struct mpls_dev *mdev;
259 unsigned int hh_len; 282 unsigned int hh_len;
260 unsigned int new_header_size; 283 unsigned int new_header_size;
@@ -264,34 +287,39 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
264 /* Careful this entire function runs inside of an rcu critical section */ 287 /* Careful this entire function runs inside of an rcu critical section */
265 288
266 mdev = mpls_dev_get(dev); 289 mdev = mpls_dev_get(dev);
267 if (!mdev || !mdev->input_enabled) 290 if (!mdev)
268 goto drop; 291 goto drop;
269 292
270 if (skb->pkt_type != PACKET_HOST) 293 MPLS_INC_STATS_LEN(mdev, skb->len, rx_packets,
294 rx_bytes);
295
296 if (!mdev->input_enabled) {
297 MPLS_INC_STATS(mdev, rx_dropped);
271 goto drop; 298 goto drop;
299 }
300
301 if (skb->pkt_type != PACKET_HOST)
302 goto err;
272 303
273 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) 304 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
274 goto drop; 305 goto err;
275 306
276 if (!pskb_may_pull(skb, sizeof(*hdr))) 307 if (!pskb_may_pull(skb, sizeof(*hdr)))
277 goto drop; 308 goto err;
278 309
279 /* Read and decode the label */ 310 /* Read and decode the label */
280 hdr = mpls_hdr(skb); 311 hdr = mpls_hdr(skb);
281 dec = mpls_entry_decode(hdr); 312 dec = mpls_entry_decode(hdr);
282 313
283 rt = mpls_route_input_rcu(net, dec.label); 314 rt = mpls_route_input_rcu(net, dec.label);
284 if (!rt) 315 if (!rt) {
316 MPLS_INC_STATS(mdev, rx_noroute);
285 goto drop; 317 goto drop;
318 }
286 319
287 nh = mpls_select_multipath(rt, skb); 320 nh = mpls_select_multipath(rt, skb);
288 if (!nh) 321 if (!nh)
289 goto drop; 322 goto err;
290
291 /* Find the output device */
292 out_dev = rcu_dereference(nh->nh_dev);
293 if (!mpls_output_possible(out_dev))
294 goto drop;
295 323
296 /* Pop the label */ 324 /* Pop the label */
297 skb_pull(skb, sizeof(*hdr)); 325 skb_pull(skb, sizeof(*hdr));
@@ -300,20 +328,25 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
300 skb_orphan(skb); 328 skb_orphan(skb);
301 329
302 if (skb_warn_if_lro(skb)) 330 if (skb_warn_if_lro(skb))
303 goto drop; 331 goto err;
304 332
305 skb_forward_csum(skb); 333 skb_forward_csum(skb);
306 334
307 /* Verify ttl is valid */ 335 /* Verify ttl is valid */
308 if (dec.ttl <= 1) 336 if (dec.ttl <= 1)
309 goto drop; 337 goto err;
310 dec.ttl -= 1; 338 dec.ttl -= 1;
311 339
340 /* Find the output device */
341 out_dev = rcu_dereference(nh->nh_dev);
342 if (!mpls_output_possible(out_dev))
343 goto tx_err;
344
312 /* Verify the destination can hold the packet */ 345 /* Verify the destination can hold the packet */
313 new_header_size = mpls_nh_header_size(nh); 346 new_header_size = mpls_nh_header_size(nh);
314 mtu = mpls_dev_mtu(out_dev); 347 mtu = mpls_dev_mtu(out_dev);
315 if (mpls_pkt_too_big(skb, mtu - new_header_size)) 348 if (mpls_pkt_too_big(skb, mtu - new_header_size))
316 goto drop; 349 goto tx_err;
317 350
318 hh_len = LL_RESERVED_SPACE(out_dev); 351 hh_len = LL_RESERVED_SPACE(out_dev);
319 if (!out_dev->header_ops) 352 if (!out_dev->header_ops)
@@ -321,7 +354,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
321 354
322 /* Ensure there is enough space for the headers in the skb */ 355 /* Ensure there is enough space for the headers in the skb */
323 if (skb_cow(skb, hh_len + new_header_size)) 356 if (skb_cow(skb, hh_len + new_header_size))
324 goto drop; 357 goto tx_err;
325 358
326 skb->dev = out_dev; 359 skb->dev = out_dev;
327 skb->protocol = htons(ETH_P_MPLS_UC); 360 skb->protocol = htons(ETH_P_MPLS_UC);
@@ -329,7 +362,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
329 if (unlikely(!new_header_size && dec.bos)) { 362 if (unlikely(!new_header_size && dec.bos)) {
330 /* Penultimate hop popping */ 363 /* Penultimate hop popping */
331 if (!mpls_egress(rt, skb, dec)) 364 if (!mpls_egress(rt, skb, dec))
332 goto drop; 365 goto err;
333 } else { 366 } else {
334 bool bos; 367 bool bos;
335 int i; 368 int i;
@@ -345,6 +378,8 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
345 } 378 }
346 } 379 }
347 380
381 mpls_stats_inc_outucastpkts(out_dev, skb);
382
348 /* If via wasn't specified then send out using device address */ 383 /* If via wasn't specified then send out using device address */
349 if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) 384 if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC)
350 err = neigh_xmit(NEIGH_LINK_TABLE, out_dev, 385 err = neigh_xmit(NEIGH_LINK_TABLE, out_dev,
@@ -357,6 +392,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
357 __func__, err); 392 __func__, err);
358 return 0; 393 return 0;
359 394
395tx_err:
396 out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL;
397 if (out_mdev)
398 MPLS_INC_STATS(out_mdev, tx_errors);
399 goto drop;
400err:
401 MPLS_INC_STATS(mdev, rx_errors);
360drop: 402drop:
361 kfree_skb(skb); 403 kfree_skb(skb);
362 return NET_RX_DROP; 404 return NET_RX_DROP;
@@ -855,15 +897,279 @@ errout:
855 return err; 897 return err;
856} 898}
857 899
900static void mpls_get_stats(struct mpls_dev *mdev,
901 struct mpls_link_stats *stats)
902{
903 struct mpls_pcpu_stats *p;
904 int i;
905
906 memset(stats, 0, sizeof(*stats));
907
908 for_each_possible_cpu(i) {
909 struct mpls_link_stats local;
910 unsigned int start;
911
912 p = per_cpu_ptr(mdev->stats, i);
913 do {
914 start = u64_stats_fetch_begin(&p->syncp);
915 local = p->stats;
916 } while (u64_stats_fetch_retry(&p->syncp, start));
917
918 stats->rx_packets += local.rx_packets;
919 stats->rx_bytes += local.rx_bytes;
920 stats->tx_packets += local.tx_packets;
921 stats->tx_bytes += local.tx_bytes;
922 stats->rx_errors += local.rx_errors;
923 stats->tx_errors += local.tx_errors;
924 stats->rx_dropped += local.rx_dropped;
925 stats->tx_dropped += local.tx_dropped;
926 stats->rx_noroute += local.rx_noroute;
927 }
928}
929
930static int mpls_fill_stats_af(struct sk_buff *skb,
931 const struct net_device *dev)
932{
933 struct mpls_link_stats *stats;
934 struct mpls_dev *mdev;
935 struct nlattr *nla;
936
937 mdev = mpls_dev_get(dev);
938 if (!mdev)
939 return -ENODATA;
940
941 nla = nla_reserve_64bit(skb, MPLS_STATS_LINK,
942 sizeof(struct mpls_link_stats),
943 MPLS_STATS_UNSPEC);
944 if (!nla)
945 return -EMSGSIZE;
946
947 stats = nla_data(nla);
948 mpls_get_stats(mdev, stats);
949
950 return 0;
951}
952
953static size_t mpls_get_stats_af_size(const struct net_device *dev)
954{
955 struct mpls_dev *mdev;
956
957 mdev = mpls_dev_get(dev);
958 if (!mdev)
959 return 0;
960
961 return nla_total_size_64bit(sizeof(struct mpls_link_stats));
962}
963
964static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev,
965 u32 portid, u32 seq, int event,
966 unsigned int flags, int type)
967{
968 struct nlmsghdr *nlh;
969 struct netconfmsg *ncm;
970 bool all = false;
971
972 nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
973 flags);
974 if (!nlh)
975 return -EMSGSIZE;
976
977 if (type == NETCONFA_ALL)
978 all = true;
979
980 ncm = nlmsg_data(nlh);
981 ncm->ncm_family = AF_MPLS;
982
983 if (nla_put_s32(skb, NETCONFA_IFINDEX, mdev->dev->ifindex) < 0)
984 goto nla_put_failure;
985
986 if ((all || type == NETCONFA_INPUT) &&
987 nla_put_s32(skb, NETCONFA_INPUT,
988 mdev->input_enabled) < 0)
989 goto nla_put_failure;
990
991 nlmsg_end(skb, nlh);
992 return 0;
993
994nla_put_failure:
995 nlmsg_cancel(skb, nlh);
996 return -EMSGSIZE;
997}
998
999static int mpls_netconf_msgsize_devconf(int type)
1000{
1001 int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
1002 + nla_total_size(4); /* NETCONFA_IFINDEX */
1003 bool all = false;
1004
1005 if (type == NETCONFA_ALL)
1006 all = true;
1007
1008 if (all || type == NETCONFA_INPUT)
1009 size += nla_total_size(4);
1010
1011 return size;
1012}
1013
1014static void mpls_netconf_notify_devconf(struct net *net, int type,
1015 struct mpls_dev *mdev)
1016{
1017 struct sk_buff *skb;
1018 int err = -ENOBUFS;
1019
1020 skb = nlmsg_new(mpls_netconf_msgsize_devconf(type), GFP_KERNEL);
1021 if (!skb)
1022 goto errout;
1023
1024 err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, RTM_NEWNETCONF,
1025 0, type);
1026 if (err < 0) {
1027 /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */
1028 WARN_ON(err == -EMSGSIZE);
1029 kfree_skb(skb);
1030 goto errout;
1031 }
1032
1033 rtnl_notify(skb, net, 0, RTNLGRP_MPLS_NETCONF, NULL, GFP_KERNEL);
1034 return;
1035errout:
1036 if (err < 0)
1037 rtnl_set_sk_err(net, RTNLGRP_MPLS_NETCONF, err);
1038}
1039
1040static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = {
1041 [NETCONFA_IFINDEX] = { .len = sizeof(int) },
1042};
1043
1044static int mpls_netconf_get_devconf(struct sk_buff *in_skb,
1045 struct nlmsghdr *nlh)
1046{
1047 struct net *net = sock_net(in_skb->sk);
1048 struct nlattr *tb[NETCONFA_MAX + 1];
1049 struct netconfmsg *ncm;
1050 struct net_device *dev;
1051 struct mpls_dev *mdev;
1052 struct sk_buff *skb;
1053 int ifindex;
1054 int err;
1055
1056 err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
1057 devconf_mpls_policy);
1058 if (err < 0)
1059 goto errout;
1060
1061 err = -EINVAL;
1062 if (!tb[NETCONFA_IFINDEX])
1063 goto errout;
1064
1065 ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
1066 dev = __dev_get_by_index(net, ifindex);
1067 if (!dev)
1068 goto errout;
1069
1070 mdev = mpls_dev_get(dev);
1071 if (!mdev)
1072 goto errout;
1073
1074 err = -ENOBUFS;
1075 skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL);
1076 if (!skb)
1077 goto errout;
1078
1079 err = mpls_netconf_fill_devconf(skb, mdev,
1080 NETLINK_CB(in_skb).portid,
1081 nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
1082 NETCONFA_ALL);
1083 if (err < 0) {
1084 /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */
1085 WARN_ON(err == -EMSGSIZE);
1086 kfree_skb(skb);
1087 goto errout;
1088 }
1089 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
1090errout:
1091 return err;
1092}
1093
1094static int mpls_netconf_dump_devconf(struct sk_buff *skb,
1095 struct netlink_callback *cb)
1096{
1097 struct net *net = sock_net(skb->sk);
1098 struct hlist_head *head;
1099 struct net_device *dev;
1100 struct mpls_dev *mdev;
1101 int idx, s_idx;
1102 int h, s_h;
1103
1104 s_h = cb->args[0];
1105 s_idx = idx = cb->args[1];
1106
1107 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
1108 idx = 0;
1109 head = &net->dev_index_head[h];
1110 rcu_read_lock();
1111 cb->seq = net->dev_base_seq;
1112 hlist_for_each_entry_rcu(dev, head, index_hlist) {
1113 if (idx < s_idx)
1114 goto cont;
1115 mdev = mpls_dev_get(dev);
1116 if (!mdev)
1117 goto cont;
1118 if (mpls_netconf_fill_devconf(skb, mdev,
1119 NETLINK_CB(cb->skb).portid,
1120 cb->nlh->nlmsg_seq,
1121 RTM_NEWNETCONF,
1122 NLM_F_MULTI,
1123 NETCONFA_ALL) < 0) {
1124 rcu_read_unlock();
1125 goto done;
1126 }
1127 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
1128cont:
1129 idx++;
1130 }
1131 rcu_read_unlock();
1132 }
1133done:
1134 cb->args[0] = h;
1135 cb->args[1] = idx;
1136
1137 return skb->len;
1138}
1139
858#define MPLS_PERDEV_SYSCTL_OFFSET(field) \ 1140#define MPLS_PERDEV_SYSCTL_OFFSET(field) \
859 (&((struct mpls_dev *)0)->field) 1141 (&((struct mpls_dev *)0)->field)
860 1142
1143static int mpls_conf_proc(struct ctl_table *ctl, int write,
1144 void __user *buffer,
1145 size_t *lenp, loff_t *ppos)
1146{
1147 int oval = *(int *)ctl->data;
1148 int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
1149
1150 if (write) {
1151 struct mpls_dev *mdev = ctl->extra1;
1152 int i = (int *)ctl->data - (int *)mdev;
1153 struct net *net = ctl->extra2;
1154 int val = *(int *)ctl->data;
1155
1156 if (i == offsetof(struct mpls_dev, input_enabled) &&
1157 val != oval) {
1158 mpls_netconf_notify_devconf(net,
1159 NETCONFA_INPUT,
1160 mdev);
1161 }
1162 }
1163
1164 return ret;
1165}
1166
861static const struct ctl_table mpls_dev_table[] = { 1167static const struct ctl_table mpls_dev_table[] = {
862 { 1168 {
863 .procname = "input", 1169 .procname = "input",
864 .maxlen = sizeof(int), 1170 .maxlen = sizeof(int),
865 .mode = 0644, 1171 .mode = 0644,
866 .proc_handler = proc_dointvec, 1172 .proc_handler = mpls_conf_proc,
867 .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), 1173 .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled),
868 }, 1174 },
869 { } 1175 { }
@@ -873,6 +1179,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
873 struct mpls_dev *mdev) 1179 struct mpls_dev *mdev)
874{ 1180{
875 char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; 1181 char path[sizeof("net/mpls/conf/") + IFNAMSIZ];
1182 struct net *net = dev_net(dev);
876 struct ctl_table *table; 1183 struct ctl_table *table;
877 int i; 1184 int i;
878 1185
@@ -883,8 +1190,11 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
883 /* Table data contains only offsets relative to the base of 1190 /* Table data contains only offsets relative to the base of
884 * the mdev at this point, so make them absolute. 1191 * the mdev at this point, so make them absolute.
885 */ 1192 */
886 for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) 1193 for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) {
887 table[i].data = (char *)mdev + (uintptr_t)table[i].data; 1194 table[i].data = (char *)mdev + (uintptr_t)table[i].data;
1195 table[i].extra1 = mdev;
1196 table[i].extra2 = net;
1197 }
888 1198
889 snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); 1199 snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name);
890 1200
@@ -913,6 +1223,7 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
913{ 1223{
914 struct mpls_dev *mdev; 1224 struct mpls_dev *mdev;
915 int err = -ENOMEM; 1225 int err = -ENOMEM;
1226 int i;
916 1227
917 ASSERT_RTNL(); 1228 ASSERT_RTNL();
918 1229
@@ -920,23 +1231,46 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
920 if (!mdev) 1231 if (!mdev)
921 return ERR_PTR(err); 1232 return ERR_PTR(err);
922 1233
1234 mdev->stats = alloc_percpu(struct mpls_pcpu_stats);
1235 if (!mdev->stats)
1236 goto free;
1237
1238 for_each_possible_cpu(i) {
1239 struct mpls_pcpu_stats *mpls_stats;
1240
1241 mpls_stats = per_cpu_ptr(mdev->stats, i);
1242 u64_stats_init(&mpls_stats->syncp);
1243 }
1244
923 err = mpls_dev_sysctl_register(dev, mdev); 1245 err = mpls_dev_sysctl_register(dev, mdev);
924 if (err) 1246 if (err)
925 goto free; 1247 goto free;
926 1248
1249 mdev->dev = dev;
927 rcu_assign_pointer(dev->mpls_ptr, mdev); 1250 rcu_assign_pointer(dev->mpls_ptr, mdev);
928 1251
929 return mdev; 1252 return mdev;
930 1253
931free: 1254free:
1255 free_percpu(mdev->stats);
932 kfree(mdev); 1256 kfree(mdev);
933 return ERR_PTR(err); 1257 return ERR_PTR(err);
934} 1258}
935 1259
1260static void mpls_dev_destroy_rcu(struct rcu_head *head)
1261{
1262 struct mpls_dev *mdev = container_of(head, struct mpls_dev, rcu);
1263
1264 free_percpu(mdev->stats);
1265 kfree(mdev);
1266}
1267
936static void mpls_ifdown(struct net_device *dev, int event) 1268static void mpls_ifdown(struct net_device *dev, int event)
937{ 1269{
938 struct mpls_route __rcu **platform_label; 1270 struct mpls_route __rcu **platform_label;
939 struct net *net = dev_net(dev); 1271 struct net *net = dev_net(dev);
1272 unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN;
1273 unsigned int alive;
940 unsigned index; 1274 unsigned index;
941 1275
942 platform_label = rtnl_dereference(net->mpls.platform_label); 1276 platform_label = rtnl_dereference(net->mpls.platform_label);
@@ -946,9 +1280,11 @@ static void mpls_ifdown(struct net_device *dev, int event)
946 if (!rt) 1280 if (!rt)
947 continue; 1281 continue;
948 1282
1283 alive = 0;
949 change_nexthops(rt) { 1284 change_nexthops(rt) {
950 if (rtnl_dereference(nh->nh_dev) != dev) 1285 if (rtnl_dereference(nh->nh_dev) != dev)
951 continue; 1286 goto next;
1287
952 switch (event) { 1288 switch (event) {
953 case NETDEV_DOWN: 1289 case NETDEV_DOWN:
954 case NETDEV_UNREGISTER: 1290 case NETDEV_UNREGISTER:
@@ -956,12 +1292,16 @@ static void mpls_ifdown(struct net_device *dev, int event)
956 /* fall through */ 1292 /* fall through */
957 case NETDEV_CHANGE: 1293 case NETDEV_CHANGE:
958 nh->nh_flags |= RTNH_F_LINKDOWN; 1294 nh->nh_flags |= RTNH_F_LINKDOWN;
959 ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1;
960 break; 1295 break;
961 } 1296 }
962 if (event == NETDEV_UNREGISTER) 1297 if (event == NETDEV_UNREGISTER)
963 RCU_INIT_POINTER(nh->nh_dev, NULL); 1298 RCU_INIT_POINTER(nh->nh_dev, NULL);
1299next:
1300 if (!(nh->nh_flags & nh_flags))
1301 alive++;
964 } endfor_nexthops(rt); 1302 } endfor_nexthops(rt);
1303
1304 WRITE_ONCE(rt->rt_nhn_alive, alive);
965 } 1305 }
966} 1306}
967 1307
@@ -1047,7 +1387,7 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
1047 if (mdev) { 1387 if (mdev) {
1048 mpls_dev_sysctl_unregister(mdev); 1388 mpls_dev_sysctl_unregister(mdev);
1049 RCU_INIT_POINTER(dev->mpls_ptr, NULL); 1389 RCU_INIT_POINTER(dev->mpls_ptr, NULL);
1050 kfree_rcu(mdev, rcu); 1390 call_rcu(&mdev->rcu, mpls_dev_destroy_rcu);
1051 } 1391 }
1052 break; 1392 break;
1053 case NETDEV_CHANGENAME: 1393 case NETDEV_CHANGENAME:
@@ -1696,6 +2036,7 @@ static void mpls_net_exit(struct net *net)
1696 for (index = 0; index < platform_labels; index++) { 2036 for (index = 0; index < platform_labels; index++) {
1697 struct mpls_route *rt = rtnl_dereference(platform_label[index]); 2037 struct mpls_route *rt = rtnl_dereference(platform_label[index]);
1698 RCU_INIT_POINTER(platform_label[index], NULL); 2038 RCU_INIT_POINTER(platform_label[index], NULL);
2039 mpls_notify_route(net, index, rt, NULL, NULL);
1699 mpls_rt_free(rt); 2040 mpls_rt_free(rt);
1700 } 2041 }
1701 rtnl_unlock(); 2042 rtnl_unlock();
@@ -1708,6 +2049,12 @@ static struct pernet_operations mpls_net_ops = {
1708 .exit = mpls_net_exit, 2049 .exit = mpls_net_exit,
1709}; 2050};
1710 2051
2052static struct rtnl_af_ops mpls_af_ops __read_mostly = {
2053 .family = AF_MPLS,
2054 .fill_stats_af = mpls_fill_stats_af,
2055 .get_stats_af_size = mpls_get_stats_af_size,
2056};
2057
1711static int __init mpls_init(void) 2058static int __init mpls_init(void)
1712{ 2059{
1713 int err; 2060 int err;
@@ -1724,9 +2071,13 @@ static int __init mpls_init(void)
1724 2071
1725 dev_add_pack(&mpls_packet_type); 2072 dev_add_pack(&mpls_packet_type);
1726 2073
2074 rtnl_af_register(&mpls_af_ops);
2075
1727 rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL); 2076 rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL);
1728 rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL); 2077 rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL);
1729 rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL); 2078 rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL);
2079 rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
2080 mpls_netconf_dump_devconf, NULL);
1730 err = 0; 2081 err = 0;
1731out: 2082out:
1732 return err; 2083 return err;
@@ -1740,6 +2091,7 @@ module_init(mpls_init);
1740static void __exit mpls_exit(void) 2091static void __exit mpls_exit(void)
1741{ 2092{
1742 rtnl_unregister_all(PF_MPLS); 2093 rtnl_unregister_all(PF_MPLS);
2094 rtnl_af_unregister(&mpls_af_ops);
1743 dev_remove_pack(&mpls_packet_type); 2095 dev_remove_pack(&mpls_packet_type);
1744 unregister_netdevice_notifier(&mpls_dev_notifier); 2096 unregister_netdevice_notifier(&mpls_dev_notifier);
1745 unregister_pernet_subsys(&mpls_net_ops); 2097 unregister_pernet_subsys(&mpls_net_ops);
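Editor's note: mpls_forward() decodes the 32-bit MPLS shim header into label, traffic class, bottom-of-stack and TTL fields before looking up the route and decrementing the TTL. A stand-alone sketch of that decoding using the standard RFC 3032 layout; the struct and function names here are illustrative, not the kernel's mpls_entry_decode().

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>	/* htonl/ntohl */

struct mpls_fields {
	uint32_t label;	/* 20 bits */
	uint8_t  tc;	/* 3 bits, traffic class */
	uint8_t  bos;	/* 1 bit, bottom of stack */
	uint8_t  ttl;	/* 8 bits */
};

/* Decode one RFC 3032 label stack entry given as a network-order word. */
static struct mpls_fields mpls_decode(uint32_t shim_be)
{
	uint32_t v = ntohl(shim_be);
	struct mpls_fields f = {
		.label = (v >> 12) & 0xfffff,
		.tc    = (v >> 9) & 0x7,
		.bos   = (v >> 8) & 0x1,
		.ttl   = v & 0xff,
	};
	return f;
}

int main(void)
{
	/* label 100, tc 0, bottom of stack, ttl 64 */
	uint32_t shim = htonl((100u << 12) | (1u << 8) | 64u);
	struct mpls_fields f = mpls_decode(shim);

	printf("label=%u tc=%u bos=%u ttl=%u\n", f.label, f.tc, f.bos, f.ttl);
	return 0;
}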
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index bdfef6c3271a..76360d8b9579 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -9,13 +9,58 @@ struct mpls_entry_decoded {
9 u8 bos; 9 u8 bos;
10}; 10};
11 11
12struct mpls_pcpu_stats {
13 struct mpls_link_stats stats;
14 struct u64_stats_sync syncp;
15};
16
12struct mpls_dev { 17struct mpls_dev {
13 int input_enabled; 18 int input_enabled;
19 struct net_device *dev;
20 struct mpls_pcpu_stats __percpu *stats;
14 21
15 struct ctl_table_header *sysctl; 22 struct ctl_table_header *sysctl;
16 struct rcu_head rcu; 23 struct rcu_head rcu;
17}; 24};
18 25
26#if BITS_PER_LONG == 32
27
28#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \
29 do { \
30 __typeof__(*(mdev)->stats) *ptr = \
31 raw_cpu_ptr((mdev)->stats); \
32 local_bh_disable(); \
33 u64_stats_update_begin(&ptr->syncp); \
34 ptr->stats.pkts_field++; \
35 ptr->stats.bytes_field += (len); \
36 u64_stats_update_end(&ptr->syncp); \
37 local_bh_enable(); \
38 } while (0)
39
40#define MPLS_INC_STATS(mdev, field) \
41 do { \
42 __typeof__(*(mdev)->stats) *ptr = \
43 raw_cpu_ptr((mdev)->stats); \
44 local_bh_disable(); \
45 u64_stats_update_begin(&ptr->syncp); \
46 ptr->stats.field++; \
47 u64_stats_update_end(&ptr->syncp); \
48 local_bh_enable(); \
49 } while (0)
50
51#else
52
53#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \
54 do { \
55 this_cpu_inc((mdev)->stats->stats.pkts_field); \
56 this_cpu_add((mdev)->stats->stats.bytes_field, (len)); \
57 } while (0)
58
59#define MPLS_INC_STATS(mdev, field) \
60 this_cpu_inc((mdev)->stats->stats.field)
61
62#endif
63
19struct sk_buff; 64struct sk_buff;
20 65
21#define LABEL_NOT_SPECIFIED (1 << 20) 66#define LABEL_NOT_SPECIFIED (1 << 20)
@@ -114,6 +159,11 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
114 return result; 159 return result;
115} 160}
116 161
162static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
163{
164 return rcu_dereference_rtnl(dev->mpls_ptr);
165}
166
117int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, 167int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels,
118 const u32 label[]); 168 const u32 label[]);
119int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels, 169int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels,
@@ -123,5 +173,7 @@ int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table,
123bool mpls_output_possible(const struct net_device *dev); 173bool mpls_output_possible(const struct net_device *dev);
124unsigned int mpls_dev_mtu(const struct net_device *dev); 174unsigned int mpls_dev_mtu(const struct net_device *dev);
125bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); 175bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);
176void mpls_stats_inc_outucastpkts(struct net_device *dev,
177 const struct sk_buff *skb);
126 178
127#endif /* MPLS_INTERNAL_H */ 179#endif /* MPLS_INTERNAL_H */
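Editor's note: the new MPLS_INC_STATS* helpers are classic do { ... } while (0) multi-statement macros, so each invocation expands to a single C statement and stays safe inside unbraced if/else bodies. A tiny user-space sketch of the idiom (GNU C for __typeof__); the struct and macro names are illustrative, not the kernel's.

#include <stdio.h>

struct demo_stats { unsigned long pkts; unsigned long bytes; };

#define DEMO_INC_STATS_LEN(s, len)			\
	do {						\
		__typeof__(s) _p = (s);			\
		_p->pkts++;				\
		_p->bytes += (len);			\
	} while (0)

int main(void)
{
	struct demo_stats st = { 0, 0 };

	/* Behaves like one statement, so the unbraced if/else stays well formed. */
	if (1)
		DEMO_INC_STATS_LEN(&st, 128);
	else
		printf("never reached\n");

	printf("pkts=%lu bytes=%lu\n", st.pkts, st.bytes);
	return 0;
}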
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 1d281c1ff7c1..e4e4424f9eb1 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -48,11 +48,15 @@ static int mpls_xmit(struct sk_buff *skb)
48 struct dst_entry *dst = skb_dst(skb); 48 struct dst_entry *dst = skb_dst(skb);
49 struct rtable *rt = NULL; 49 struct rtable *rt = NULL;
50 struct rt6_info *rt6 = NULL; 50 struct rt6_info *rt6 = NULL;
51 struct mpls_dev *out_mdev;
51 int err = 0; 52 int err = 0;
52 bool bos; 53 bool bos;
53 int i; 54 int i;
54 unsigned int ttl; 55 unsigned int ttl;
55 56
57 /* Find the output device */
58 out_dev = dst->dev;
59
56 /* Obtain the ttl */ 60 /* Obtain the ttl */
57 if (dst->ops->family == AF_INET) { 61 if (dst->ops->family == AF_INET) {
58 ttl = ip_hdr(skb)->ttl; 62 ttl = ip_hdr(skb)->ttl;
@@ -66,8 +70,6 @@ static int mpls_xmit(struct sk_buff *skb)
66 70
67 skb_orphan(skb); 71 skb_orphan(skb);
68 72
69 /* Find the output device */
70 out_dev = dst->dev;
71 if (!mpls_output_possible(out_dev) || 73 if (!mpls_output_possible(out_dev) ||
72 !dst->lwtstate || skb_warn_if_lro(skb)) 74 !dst->lwtstate || skb_warn_if_lro(skb))
73 goto drop; 75 goto drop;
@@ -109,6 +111,8 @@ static int mpls_xmit(struct sk_buff *skb)
109 bos = false; 111 bos = false;
110 } 112 }
111 113
114 mpls_stats_inc_outucastpkts(out_dev, skb);
115
112 if (rt) 116 if (rt)
113 err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, 117 err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
114 skb); 118 skb);
@@ -122,11 +126,14 @@ static int mpls_xmit(struct sk_buff *skb)
122 return LWTUNNEL_XMIT_DONE; 126 return LWTUNNEL_XMIT_DONE;
123 127
124drop: 128drop:
129 out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL;
130 if (out_mdev)
131 MPLS_INC_STATS(out_mdev, tx_errors);
125 kfree_skb(skb); 132 kfree_skb(skb);
126 return -EINVAL; 133 return -EINVAL;
127} 134}
128 135
129static int mpls_build_state(struct net_device *dev, struct nlattr *nla, 136static int mpls_build_state(struct nlattr *nla,
130 unsigned int family, const void *cfg, 137 unsigned int family, const void *cfg,
131 struct lwtunnel_state **ts) 138 struct lwtunnel_state **ts)
132{ 139{
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index bbc45f8a7b2d..9b28864cc36a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -162,6 +162,7 @@ config NF_CT_PROTO_SCTP
162 bool 'SCTP protocol connection tracking support' 162 bool 'SCTP protocol connection tracking support'
163 depends on NETFILTER_ADVANCED 163 depends on NETFILTER_ADVANCED
164 default y 164 default y
165 select LIBCRC32C
165 help 166 help
166 With this option enabled, the layer 3 independent connection 167 With this option enabled, the layer 3 independent connection
167 tracking code will be able to do state tracking on SCTP connections. 168 tracking code will be able to do state tracking on SCTP connections.
@@ -397,7 +398,6 @@ config NF_NAT_PROTO_SCTP
397 bool 398 bool
398 default NF_NAT && NF_CT_PROTO_SCTP 399 default NF_NAT && NF_CT_PROTO_SCTP
399 depends on NF_NAT && NF_CT_PROTO_SCTP 400 depends on NF_NAT && NF_CT_PROTO_SCTP
400 select LIBCRC32C
401 401
402config NF_NAT_AMANDA 402config NF_NAT_AMANDA
403 tristate 403 tristate
@@ -467,10 +467,10 @@ config NF_TABLES_NETDEV
467 This option enables support for the "netdev" table. 467 This option enables support for the "netdev" table.
468 468
469config NFT_EXTHDR 469config NFT_EXTHDR
470 tristate "Netfilter nf_tables IPv6 exthdr module" 470 tristate "Netfilter nf_tables exthdr module"
471 help 471 help
472 This option adds the "exthdr" expression that you can use to match 472 This option adds the "exthdr" expression that you can use to match
473 IPv6 extension headers. 473 IPv6 extension headers and tcp options.
474 474
475config NFT_META 475config NFT_META
476 tristate "Netfilter nf_tables meta module" 476 tristate "Netfilter nf_tables meta module"
@@ -509,6 +509,12 @@ config NFT_SET_HASH
509 This option adds the "hash" set type that is used to build one-way 509 This option adds the "hash" set type that is used to build one-way
510 mappings between matchings and actions. 510 mappings between matchings and actions.
511 511
512config NFT_SET_BITMAP
513 tristate "Netfilter nf_tables bitmap set module"
514 help
515 This option adds the "bitmap" set type that is used to build sets
516 whose keys are smaller or equal to 16 bits.
517
512config NFT_COUNTER 518config NFT_COUNTER
513 tristate "Netfilter nf_tables counter module" 519 tristate "Netfilter nf_tables counter module"
514 help 520 help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index ca30d1960f1d..c9b78e7b342f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -7,7 +7,6 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
7nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o 7nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
8nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o 8nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
9nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o 9nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
10nf_conntrack-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
11 10
12obj-$(CONFIG_NETFILTER) = netfilter.o 11obj-$(CONFIG_NETFILTER) = netfilter.o
13 12
@@ -47,7 +46,6 @@ nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
47# NAT protocols (nf_nat) 46# NAT protocols (nf_nat)
48nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o 47nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
49nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o 48nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
50nf_nat-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
51 49
52# generic transport layer logging 50# generic transport layer logging
53obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o 51obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
@@ -95,6 +93,7 @@ obj-$(CONFIG_NFT_REJECT) += nft_reject.o
95obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o 93obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
96obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o 94obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o
97obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o 95obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o
96obj-$(CONFIG_NFT_SET_BITMAP) += nft_set_bitmap.o
98obj-$(CONFIG_NFT_COUNTER) += nft_counter.o 97obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
99obj-$(CONFIG_NFT_LOG) += nft_log.o 98obj-$(CONFIG_NFT_LOG) += nft_log.o
100obj-$(CONFIG_NFT_MASQ) += nft_masq.o 99obj-$(CONFIG_NFT_MASQ) += nft_masq.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index ce6adfae521a..a87a6f8a74d8 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -375,7 +375,7 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
375{ 375{
376 void (*attach)(struct sk_buff *, const struct sk_buff *); 376 void (*attach)(struct sk_buff *, const struct sk_buff *);
377 377
378 if (skb->nfct) { 378 if (skb->_nfct) {
379 rcu_read_lock(); 379 rcu_read_lock();
380 attach = rcu_dereference(ip_ct_attach); 380 attach = rcu_dereference(ip_ct_attach);
381 if (attach) 381 if (attach)
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 1b05d4a7d5a1..f236c0bc7b3f 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -897,7 +897,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
897 continue; 897 continue;
898 data = ahash_data(n, j, dsize); 898 data = ahash_data(n, j, dsize);
899 memcpy(tmp->value + k * dsize, data, dsize); 899 memcpy(tmp->value + k * dsize, data, dsize);
900 set_bit(j, tmp->used); 900 set_bit(k, tmp->used);
901 k++; 901 k++;
902 } 902 }
903 tmp->pos = k; 903 tmp->pos = k;
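Editor's note: the ip_set_hash_gen.h change is an index fix in a compaction loop; surviving entries are copied from source slot j to destination slot k, so the "used" bit must be set for the destination index k rather than j. A small stand-alone sketch of the same compaction pattern; the array and bitmap names are illustrative.

#include <stdio.h>
#include <stdint.h>

#define NSLOTS 8

static void set_bit8(int nr, uint8_t *map) { *map |= (uint8_t)(1u << nr); }

int main(void)
{
	int src[NSLOTS] = { 10, 0, 20, 0, 30, 0, 0, 40 };	/* 0 = empty slot */
	int dst[NSLOTS] = { 0 };
	uint8_t used = 0;
	int k = 0;

	for (int j = 0; j < NSLOTS; j++) {
		if (!src[j])
			continue;
		dst[k] = src[j];
		set_bit8(k, &used);	/* mark the destination slot, not j */
		k++;
	}

	printf("copied %d entries, used bitmap = 0x%02x\n", k, used);
	for (int i = 0; i < k; i++)
		printf("dst[%d] = %d\n", i, dst[i]);
	return 0;
}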
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 51077c53d76b..178d4eba013b 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -260,11 +260,14 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
260 else 260 else
261 prev = e; 261 prev = e;
262 } 262 }
263
264 /* If before/after is used on an empty set */
265 if ((d->before > 0 && !next) ||
266 (d->before < 0 && !prev))
267 return -IPSET_ERR_REF_EXIST;
268
263 /* Re-add already existing element */ 269 /* Re-add already existing element */
264 if (n) { 270 if (n) {
265 if ((d->before > 0 && !next) ||
266 (d->before < 0 && !prev))
267 return -IPSET_ERR_REF_EXIST;
268 if (!flag_exist) 271 if (!flag_exist)
269 return -IPSET_ERR_EXIST; 272 return -IPSET_ERR_EXIST;
270 /* Update extensions */ 273 /* Update extensions */
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 096a45103f14..e6a2753dff9e 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1429,7 +1429,7 @@ int __init ip_vs_conn_init(void)
1429 "(size=%d, memory=%ldKbytes)\n", 1429 "(size=%d, memory=%ldKbytes)\n",
1430 ip_vs_conn_tab_size, 1430 ip_vs_conn_tab_size,
1431 (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); 1431 (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
1432 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 1432 IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
1433 sizeof(struct ip_vs_conn)); 1433 sizeof(struct ip_vs_conn));
1434 1434
1435 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) 1435 for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 55e0169caa4c..5aeb0dde6ccc 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -426,10 +426,9 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol
426 */ 426 */
427 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); 427 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
428 428
429 if (svc == NULL 429 if (!svc && protocol == IPPROTO_TCP &&
430 && protocol == IPPROTO_TCP 430 atomic_read(&ipvs->ftpsvc_counter) &&
431 && atomic_read(&ipvs->ftpsvc_counter) 431 (vport == FTPDATA || ntohs(vport) >= inet_prot_sock(ipvs->net))) {
432 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
433 /* 432 /*
434 * Check if ftp service entry exists, the packet 433 * Check if ftp service entry exists, the packet
435 * might belong to FTP data connections. 434 * might belong to FTP data connections.
@@ -711,7 +710,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
711 dest->vport == svc->port))) { 710 dest->vport == svc->port))) {
712 /* HIT */ 711 /* HIT */
713 list_del(&dest->t_list); 712 list_del(&dest->t_list);
714 ip_vs_dest_hold(dest);
715 goto out; 713 goto out;
716 } 714 }
717 } 715 }
@@ -741,7 +739,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
741 * When the ip_vs_control_clearup is activated by ipvs module exit, 739 * When the ip_vs_control_clearup is activated by ipvs module exit,
742 * the service tables must have been flushed and all the connections 740 * the service tables must have been flushed and all the connections
743 * are expired, and the refcnt of each destination in the trash must 741 * are expired, and the refcnt of each destination in the trash must
744 * be 0, so we simply release them here. 742 * be 1, so we simply release them here.
745 */ 743 */
746static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) 744static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
747{ 745{
@@ -1080,11 +1078,10 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
1080 if (list_empty(&ipvs->dest_trash) && !cleanup) 1078 if (list_empty(&ipvs->dest_trash) && !cleanup)
1081 mod_timer(&ipvs->dest_trash_timer, 1079 mod_timer(&ipvs->dest_trash_timer,
1082 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); 1080 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1083 /* dest lives in trash without reference */ 1081 /* dest lives in trash with reference */
1084 list_add(&dest->t_list, &ipvs->dest_trash); 1082 list_add(&dest->t_list, &ipvs->dest_trash);
1085 dest->idle_start = 0; 1083 dest->idle_start = 0;
1086 spin_unlock_bh(&ipvs->dest_trash_lock); 1084 spin_unlock_bh(&ipvs->dest_trash_lock);
1087 ip_vs_dest_put(dest);
1088} 1085}
1089 1086
1090 1087
@@ -1160,7 +1157,7 @@ static void ip_vs_dest_trash_expire(unsigned long data)
1160 1157
1161 spin_lock(&ipvs->dest_trash_lock); 1158 spin_lock(&ipvs->dest_trash_lock);
1162 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { 1159 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
1163 if (atomic_read(&dest->refcnt) > 0) 1160 if (atomic_read(&dest->refcnt) > 1)
1164 continue; 1161 continue;
1165 if (dest->idle_start) { 1162 if (dest->idle_start) {
1166 if (time_before(now, dest->idle_start + 1163 if (time_before(now, dest->idle_start +
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 6be5c538b71e..75f798f8e83b 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -163,7 +163,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
163 return -ENOMEM; 163 return -ENOMEM;
164 164
165 svc->sched_data = s; 165 svc->sched_data = s;
166 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " 166 IP_VS_DBG(6, "DH hash table (memory=%zdbytes) allocated for "
167 "current service\n", 167 "current service\n",
168 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); 168 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
169 169
@@ -183,7 +183,7 @@ static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
183 183
184 /* release the table itself */ 184 /* release the table itself */
185 kfree_rcu(s, rcu_head); 185 kfree_rcu(s, rcu_head);
186 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", 186 IP_VS_DBG(6, "DH hash table (memory=%zdbytes) released\n",
187 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); 187 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
188} 188}
189 189
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index cccf4d637412..5824927cf8e0 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -356,7 +356,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
356 return -ENOMEM; 356 return -ENOMEM;
357 357
358 svc->sched_data = tbl; 358 svc->sched_data = tbl;
359 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " 359 IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for "
360 "current service\n", sizeof(*tbl)); 360 "current service\n", sizeof(*tbl));
361 361
362 /* 362 /*
@@ -393,7 +393,7 @@ static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
393 393
394 /* release the table itself */ 394 /* release the table itself */
395 kfree_rcu(tbl, rcu_head); 395 kfree_rcu(tbl, rcu_head);
396 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", 396 IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n",
397 sizeof(*tbl)); 397 sizeof(*tbl));
398} 398}
399 399
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 796d70e47ddd..703f11877bee 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -519,7 +519,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
519 return -ENOMEM; 519 return -ENOMEM;
520 520
521 svc->sched_data = tbl; 521 svc->sched_data = tbl;
522 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " 522 IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) allocated for "
523 "current service\n", sizeof(*tbl)); 523 "current service\n", sizeof(*tbl));
524 524
525 /* 525 /*
@@ -556,7 +556,7 @@ static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
556 556
557 /* release the table itself */ 557 /* release the table itself */
558 kfree_rcu(tbl, rcu_head); 558 kfree_rcu(tbl, rcu_head);
559 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", 559 IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) released\n",
560 sizeof(*tbl)); 560 sizeof(*tbl));
561} 561}
562 562
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 1e373a5e44e3..16aaac6eedc9 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -239,7 +239,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
239 return -ENOMEM; 239 return -ENOMEM;
240 240
241 svc->sched_data = s; 241 svc->sched_data = s;
242 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " 242 IP_VS_DBG(6, "SH hash table (memory=%zdbytes) allocated for "
243 "current service\n", 243 "current service\n",
244 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); 244 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
245 245
@@ -259,7 +259,7 @@ static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
259 259
260 /* release the table itself */ 260 /* release the table itself */
261 kfree_rcu(s, rcu_head); 261 kfree_rcu(s, rcu_head);
262 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", 262 IP_VS_DBG(6, "SH hash table (memory=%zdbytes) released\n",
263 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); 263 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
264} 264}
265 265
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 9350530c16c1..b03c28084f81 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1791,7 +1791,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
1791 u16 mtu, min_mtu; 1791 u16 mtu, min_mtu;
1792 1792
1793 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1793 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
1794 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", 1794 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
1795 sizeof(struct ip_vs_sync_conn_v0)); 1795 sizeof(struct ip_vs_sync_conn_v0));
1796 1796
1797 if (!ipvs->sync_state) { 1797 if (!ipvs->sync_state) {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 4e8083c5e01d..ffb78e5f7b70 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -181,7 +181,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
181unsigned int nf_conntrack_max __read_mostly; 181unsigned int nf_conntrack_max __read_mostly;
182seqcount_t nf_conntrack_generation __read_mostly; 182seqcount_t nf_conntrack_generation __read_mostly;
183 183
184DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); 184/* nf_conn must be 8 bytes aligned, as the 3 LSB bits are used
185 * for the nfctinfo. We cheat by (ab)using the PER CPU cache line
186 * alignment to enforce this.
187 */
188DEFINE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked);
185EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); 189EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
186 190
187static unsigned int nf_conntrack_hash_rnd __read_mostly; 191static unsigned int nf_conntrack_hash_rnd __read_mostly;
@@ -350,16 +354,31 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
350 spin_unlock(&pcpu->lock); 354 spin_unlock(&pcpu->lock);
351} 355}
352 356
357#define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
358
353/* Released via destroy_conntrack() */ 359/* Released via destroy_conntrack() */
354struct nf_conn *nf_ct_tmpl_alloc(struct net *net, 360struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
355 const struct nf_conntrack_zone *zone, 361 const struct nf_conntrack_zone *zone,
356 gfp_t flags) 362 gfp_t flags)
357{ 363{
358 struct nf_conn *tmpl; 364 struct nf_conn *tmpl, *p;
359 365
360 tmpl = kzalloc(sizeof(*tmpl), flags); 366 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
361 if (tmpl == NULL) 367 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
362 return NULL; 368 if (!tmpl)
369 return NULL;
370
371 p = tmpl;
372 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
373 if (tmpl != p) {
374 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
375 tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
376 }
377 } else {
378 tmpl = kzalloc(sizeof(*tmpl), flags);
379 if (!tmpl)
380 return NULL;
381 }
363 382
364 tmpl->status = IPS_TEMPLATE; 383 tmpl->status = IPS_TEMPLATE;
365 write_pnet(&tmpl->ct_net, net); 384 write_pnet(&tmpl->ct_net, net);
@@ -374,7 +393,11 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl)
374{ 393{
375 nf_ct_ext_destroy(tmpl); 394 nf_ct_ext_destroy(tmpl);
376 nf_ct_ext_free(tmpl); 395 nf_ct_ext_free(tmpl);
377 kfree(tmpl); 396
397 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
398 kfree((char *)tmpl - tmpl->proto.tmpl_padto);
399 else
400 kfree(tmpl);
378} 401}
379EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); 402EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
380 403
@@ -686,12 +709,12 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
686 !nfct_nat(ct) && 709 !nfct_nat(ct) &&
687 !nf_ct_is_dying(ct) && 710 !nf_ct_is_dying(ct) &&
688 atomic_inc_not_zero(&ct->ct_general.use)) { 711 atomic_inc_not_zero(&ct->ct_general.use)) {
689 nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct); 712 enum ip_conntrack_info oldinfo;
690 nf_conntrack_put(skb->nfct); 713 struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
691 /* Assign conntrack already in hashes to this skbuff. Don't 714
692 * modify skb->nfctinfo to ensure consistent stateful filtering. 715 nf_ct_acct_merge(ct, ctinfo, loser_ct);
693 */ 716 nf_conntrack_put(&loser_ct->ct_general);
694 skb->nfct = &ct->ct_general; 717 nf_ct_set(skb, ct, oldinfo);
695 return NF_ACCEPT; 718 return NF_ACCEPT;
696 } 719 }
697 NF_CT_STAT_INC(net, drop); 720 NF_CT_STAT_INC(net, drop);
@@ -1218,7 +1241,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
1218 return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; 1241 return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1219} 1242}
1220 1243
1221/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ 1244/* On success, returns conntrack ptr, sets skb->_nfct | ctinfo */
1222static inline struct nf_conn * 1245static inline struct nf_conn *
1223resolve_normal_ct(struct net *net, struct nf_conn *tmpl, 1246resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1224 struct sk_buff *skb, 1247 struct sk_buff *skb,
@@ -1277,8 +1300,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1277 } 1300 }
1278 *set_reply = 0; 1301 *set_reply = 0;
1279 } 1302 }
1280 skb->nfct = &ct->ct_general; 1303 nf_ct_set(skb, ct, *ctinfo);
1281 skb->nfctinfo = *ctinfo;
1282 return ct; 1304 return ct;
1283} 1305}
1284 1306
@@ -1286,7 +1308,7 @@ unsigned int
1286nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, 1308nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1287 struct sk_buff *skb) 1309 struct sk_buff *skb)
1288{ 1310{
1289 struct nf_conn *ct, *tmpl = NULL; 1311 struct nf_conn *ct, *tmpl;
1290 enum ip_conntrack_info ctinfo; 1312 enum ip_conntrack_info ctinfo;
1291 struct nf_conntrack_l3proto *l3proto; 1313 struct nf_conntrack_l3proto *l3proto;
1292 struct nf_conntrack_l4proto *l4proto; 1314 struct nf_conntrack_l4proto *l4proto;
@@ -1296,14 +1318,14 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1296 int set_reply = 0; 1318 int set_reply = 0;
1297 int ret; 1319 int ret;
1298 1320
1299 if (skb->nfct) { 1321 tmpl = nf_ct_get(skb, &ctinfo);
1322 if (tmpl) {
1300 /* Previously seen (loopback or untracked)? Ignore. */ 1323 /* Previously seen (loopback or untracked)? Ignore. */
1301 tmpl = (struct nf_conn *)skb->nfct;
1302 if (!nf_ct_is_template(tmpl)) { 1324 if (!nf_ct_is_template(tmpl)) {
1303 NF_CT_STAT_INC_ATOMIC(net, ignore); 1325 NF_CT_STAT_INC_ATOMIC(net, ignore);
1304 return NF_ACCEPT; 1326 return NF_ACCEPT;
1305 } 1327 }
1306 skb->nfct = NULL; 1328 skb->_nfct = 0;
1307 } 1329 }
1308 1330
1309 /* rcu_read_lock()ed by nf_hook_thresh */ 1331 /* rcu_read_lock()ed by nf_hook_thresh */
@@ -1324,8 +1346,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1324 * inverse of the return code tells to the netfilter 1346 * inverse of the return code tells to the netfilter
1325 * core what to do with the packet. */ 1347 * core what to do with the packet. */
1326 if (l4proto->error != NULL) { 1348 if (l4proto->error != NULL) {
1327 ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo, 1349 ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum);
1328 pf, hooknum);
1329 if (ret <= 0) { 1350 if (ret <= 0) {
1330 NF_CT_STAT_INC_ATOMIC(net, error); 1351 NF_CT_STAT_INC_ATOMIC(net, error);
1331 NF_CT_STAT_INC_ATOMIC(net, invalid); 1352 NF_CT_STAT_INC_ATOMIC(net, invalid);
@@ -1333,7 +1354,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1333 goto out; 1354 goto out;
1334 } 1355 }
1335 /* ICMP[v6] protocol trackers may assign one conntrack. */ 1356 /* ICMP[v6] protocol trackers may assign one conntrack. */
1336 if (skb->nfct) 1357 if (skb->_nfct)
1337 goto out; 1358 goto out;
1338 } 1359 }
1339repeat: 1360repeat:
@@ -1353,7 +1374,7 @@ repeat:
1353 goto out; 1374 goto out;
1354 } 1375 }
1355 1376
1356 NF_CT_ASSERT(skb->nfct); 1377 NF_CT_ASSERT(skb_nfct(skb));
1357 1378
1358 /* Decide what timeout policy we want to apply to this flow. */ 1379 /* Decide what timeout policy we want to apply to this flow. */
1359 timeouts = nf_ct_timeout_lookup(net, ct, l4proto); 1380 timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
@@ -1363,8 +1384,8 @@ repeat:
1363 /* Invalid: inverse of the return code tells 1384 /* Invalid: inverse of the return code tells
1364 * the netfilter core what to do */ 1385 * the netfilter core what to do */
1365 pr_debug("nf_conntrack_in: Can't track with proto module\n"); 1386 pr_debug("nf_conntrack_in: Can't track with proto module\n");
1366 nf_conntrack_put(skb->nfct); 1387 nf_conntrack_put(&ct->ct_general);
1367 skb->nfct = NULL; 1388 skb->_nfct = 0;
1368 NF_CT_STAT_INC_ATOMIC(net, invalid); 1389 NF_CT_STAT_INC_ATOMIC(net, invalid);
1369 if (ret == -NF_DROP) 1390 if (ret == -NF_DROP)
1370 NF_CT_STAT_INC_ATOMIC(net, drop); 1391 NF_CT_STAT_INC_ATOMIC(net, drop);
@@ -1522,9 +1543,8 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
1522 ctinfo = IP_CT_RELATED; 1543 ctinfo = IP_CT_RELATED;
1523 1544
1524 /* Attach to new skbuff, and increment count */ 1545 /* Attach to new skbuff, and increment count */
1525 nskb->nfct = &ct->ct_general; 1546 nf_ct_set(nskb, ct, ctinfo);
1526 nskb->nfctinfo = ctinfo; 1547 nf_conntrack_get(skb_nfct(nskb));
1527 nf_conntrack_get(nskb->nfct);
1528} 1548}
1529 1549
1530/* Bring out ya dead! */ 1550/* Bring out ya dead! */
@@ -1860,7 +1880,8 @@ int nf_conntrack_init_start(void)
1860 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 1880 nf_conntrack_max = max_factor * nf_conntrack_htable_size;
1861 1881
1862 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 1882 nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
1863 sizeof(struct nf_conn), 0, 1883 sizeof(struct nf_conn),
1884 NFCT_INFOMASK + 1,
1864 SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 1885 SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
1865 if (!nf_conntrack_cachep) 1886 if (!nf_conntrack_cachep)
1866 goto err_cachep; 1887 goto err_cachep;
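The nf_conntrack_core.c hunks above replace the separate skb->nfct and skb->nfctinfo fields with a single word, skb->_nfct, in which the three low bits of an 8-byte-aligned struct nf_conn pointer carry the ctinfo; that is why the slab cache is now created with an explicit alignment of NFCT_INFOMASK + 1 and why template allocations may need manual NFCT_ALIGN padding. The following is a minimal user-space sketch of the pointer-plus-tag packing that nf_ct_set()/nf_ct_get() rely on; pack_ct(), unpack_ct() and struct conn are invented stand-ins, not the kernel helpers.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define INFOMASK 7UL			/* low 3 bits carry the state tag */

struct conn { long pad[2]; };		/* stand-in for struct nf_conn */

/* Pack an 8-byte-aligned pointer and a 3-bit state into one word,
 * the way skb->_nfct combines the nf_conn pointer with the ctinfo.
 */
static uintptr_t pack_ct(const struct conn *ct, unsigned int info)
{
	assert(((uintptr_t)ct & INFOMASK) == 0);	/* alignment is the invariant */
	assert(info <= INFOMASK);
	return (uintptr_t)ct | info;
}

/* Recover both fields from the packed word. */
static struct conn *unpack_ct(uintptr_t word, unsigned int *info)
{
	*info = (unsigned int)(word & INFOMASK);
	return (struct conn *)(word & ~INFOMASK);
}

int main(void)
{
	struct conn *ct = aligned_alloc(8, sizeof(struct conn));
	unsigned int info;
	uintptr_t word = pack_ct(ct, 2);	/* any state that fits in 3 bits */
	struct conn *back = unpack_ct(word, &info);

	printf("ct=%p back=%p info=%u\n", (void *)ct, (void *)back, info);
	free(ct);
	return 0;
}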
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index da9df2d56e66..22fc32143e9c 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -290,6 +290,7 @@ void nf_conntrack_unregister_notifier(struct net *net,
290 BUG_ON(notify != new); 290 BUG_ON(notify != new);
291 RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); 291 RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
292 mutex_unlock(&nf_ct_ecache_mutex); 292 mutex_unlock(&nf_ct_ecache_mutex);
293 /* synchronize_rcu() is called from ctnetlink_exit. */
293} 294}
294EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); 295EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
295 296
@@ -326,6 +327,7 @@ void nf_ct_expect_unregister_notifier(struct net *net,
326 BUG_ON(notify != new); 327 BUG_ON(notify != new);
327 RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL); 328 RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
328 mutex_unlock(&nf_ct_ecache_mutex); 329 mutex_unlock(&nf_ct_ecache_mutex);
330 /* synchronize_rcu() is called from ctnetlink_exit. */
329} 331}
330EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); 332EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
331 333
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index f8dbacf66795..d80073037856 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -57,7 +57,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
57 hlist_del_rcu(&exp->hnode); 57 hlist_del_rcu(&exp->hnode);
58 net->ct.expect_count--; 58 net->ct.expect_count--;
59 59
60 hlist_del(&exp->lnode); 60 hlist_del_rcu(&exp->lnode);
61 master_help->expecting[exp->class]--; 61 master_help->expecting[exp->class]--;
62 62
63 nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); 63 nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
@@ -353,7 +353,7 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp)
353} 353}
354EXPORT_SYMBOL_GPL(nf_ct_expect_put); 354EXPORT_SYMBOL_GPL(nf_ct_expect_put);
355 355
356static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) 356static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
357{ 357{
358 struct nf_conn_help *master_help = nfct_help(exp->master); 358 struct nf_conn_help *master_help = nfct_help(exp->master);
359 struct nf_conntrack_helper *helper; 359 struct nf_conntrack_helper *helper;
@@ -363,7 +363,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
363 /* two references : one for hash insert, one for the timer */ 363 /* two references : one for hash insert, one for the timer */
364 atomic_add(2, &exp->use); 364 atomic_add(2, &exp->use);
365 365
366 hlist_add_head(&exp->lnode, &master_help->expectations); 366 hlist_add_head_rcu(&exp->lnode, &master_help->expectations);
367 master_help->expecting[exp->class]++; 367 master_help->expecting[exp->class]++;
368 368
369 hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); 369 hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
@@ -380,7 +380,6 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
380 add_timer(&exp->timeout); 380 add_timer(&exp->timeout);
381 381
382 NF_CT_STAT_INC(net, expect_create); 382 NF_CT_STAT_INC(net, expect_create);
383 return 0;
384} 383}
385 384
386/* Race with expectations being used means we could have none to find; OK. */ 385/* Race with expectations being used means we could have none to find; OK. */
@@ -411,7 +410,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
411 struct net *net = nf_ct_exp_net(expect); 410 struct net *net = nf_ct_exp_net(expect);
412 struct hlist_node *next; 411 struct hlist_node *next;
413 unsigned int h; 412 unsigned int h;
414 int ret = 1; 413 int ret = 0;
415 414
416 if (!master_help) { 415 if (!master_help) {
417 ret = -ESHUTDOWN; 416 ret = -ESHUTDOWN;
@@ -461,15 +460,14 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
461 460
462 spin_lock_bh(&nf_conntrack_expect_lock); 461 spin_lock_bh(&nf_conntrack_expect_lock);
463 ret = __nf_ct_expect_check(expect); 462 ret = __nf_ct_expect_check(expect);
464 if (ret <= 0)
465 goto out;
466
467 ret = nf_ct_expect_insert(expect);
468 if (ret < 0) 463 if (ret < 0)
469 goto out; 464 goto out;
465
466 nf_ct_expect_insert(expect);
467
470 spin_unlock_bh(&nf_conntrack_expect_lock); 468 spin_unlock_bh(&nf_conntrack_expect_lock);
471 nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); 469 nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
472 return ret; 470 return 0;
473out: 471out:
474 spin_unlock_bh(&nf_conntrack_expect_lock); 472 spin_unlock_bh(&nf_conntrack_expect_lock);
475 return ret; 473 return ret;
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 02bcf00c2492..008299b7f78f 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -53,7 +53,11 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id,
53 53
54 rcu_read_lock(); 54 rcu_read_lock();
55 t = rcu_dereference(nf_ct_ext_types[id]); 55 t = rcu_dereference(nf_ct_ext_types[id]);
56 BUG_ON(t == NULL); 56 if (!t) {
57 rcu_read_unlock();
58 return NULL;
59 }
60
57 off = ALIGN(sizeof(struct nf_ct_ext), t->align); 61 off = ALIGN(sizeof(struct nf_ct_ext), t->align);
58 len = off + t->len + var_alloc_len; 62 len = off + t->len + var_alloc_len;
59 alloc_size = t->alloc_size + var_alloc_len; 63 alloc_size = t->alloc_size + var_alloc_len;
@@ -88,7 +92,10 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
88 92
89 rcu_read_lock(); 93 rcu_read_lock();
90 t = rcu_dereference(nf_ct_ext_types[id]); 94 t = rcu_dereference(nf_ct_ext_types[id]);
91 BUG_ON(t == NULL); 95 if (!t) {
96 rcu_read_unlock();
97 return NULL;
98 }
92 99
93 newoff = ALIGN(old->len, t->align); 100 newoff = ALIGN(old->len, t->align);
94 newlen = newoff + t->len + var_alloc_len; 101 newlen = newoff + t->len + var_alloc_len;
@@ -175,6 +182,6 @@ void nf_ct_extend_unregister(struct nf_ct_ext_type *type)
175 RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL); 182 RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
176 update_alloc_size(type); 183 update_alloc_size(type);
177 mutex_unlock(&nf_ct_ext_type_mutex); 184 mutex_unlock(&nf_ct_ext_type_mutex);
178 rcu_barrier(); /* Wait for completion of call_rcu()'s */ 185 synchronize_rcu();
179} 186}
180EXPORT_SYMBOL_GPL(nf_ct_extend_unregister); 187EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index e3ed20060878..4aecef4a89fb 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -300,7 +300,7 @@ static int find_pattern(const char *data, size_t dlen,
300{ 300{
301 size_t i = plen; 301 size_t i = plen;
302 302
303 pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); 303 pr_debug("find_pattern `%s': dlen = %zu\n", pattern, dlen);
304 304
305 if (dlen <= plen) { 305 if (dlen <= plen) {
306 /* Short packet: try for partial? */ 306 /* Short packet: try for partial? */
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 7341adf7059d..4eeb3418366a 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -158,16 +158,25 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
158{ 158{
159 struct nf_conntrack_helper *h; 159 struct nf_conntrack_helper *h;
160 160
161 rcu_read_lock();
162
161 h = __nf_conntrack_helper_find(name, l3num, protonum); 163 h = __nf_conntrack_helper_find(name, l3num, protonum);
162#ifdef CONFIG_MODULES 164#ifdef CONFIG_MODULES
163 if (h == NULL) { 165 if (h == NULL) {
164 if (request_module("nfct-helper-%s", name) == 0) 166 rcu_read_unlock();
167 if (request_module("nfct-helper-%s", name) == 0) {
168 rcu_read_lock();
165 h = __nf_conntrack_helper_find(name, l3num, protonum); 169 h = __nf_conntrack_helper_find(name, l3num, protonum);
170 } else {
171 return h;
172 }
166 } 173 }
167#endif 174#endif
168 if (h != NULL && !try_module_get(h->me)) 175 if (h != NULL && !try_module_get(h->me))
169 h = NULL; 176 h = NULL;
170 177
178 rcu_read_unlock();
179
171 return h; 180 return h;
172} 181}
173EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); 182EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get);
@@ -188,6 +197,26 @@ nf_ct_helper_ext_add(struct nf_conn *ct,
188} 197}
189EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); 198EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
190 199
200static struct nf_conntrack_helper *
201nf_ct_lookup_helper(struct nf_conn *ct, struct net *net)
202{
203 if (!net->ct.sysctl_auto_assign_helper) {
204 if (net->ct.auto_assign_helper_warned)
205 return NULL;
206 if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple))
207 return NULL;
208 pr_info("nf_conntrack: default automatic helper assignment "
209 "has been turned off for security reasons and CT-based "
210 " firewall rule not found. Use the iptables CT target "
211 "to attach helpers instead.\n");
212 net->ct.auto_assign_helper_warned = 1;
213 return NULL;
214 }
215
216 return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
217}
218
219
191int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, 220int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
192 gfp_t flags) 221 gfp_t flags)
193{ 222{
@@ -213,21 +242,14 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
213 } 242 }
214 243
215 help = nfct_help(ct); 244 help = nfct_help(ct);
216 if (net->ct.sysctl_auto_assign_helper && helper == NULL) {
217 helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
218 if (unlikely(!net->ct.auto_assign_helper_warned && helper)) {
219 pr_info("nf_conntrack: automatic helper "
220 "assignment is deprecated and it will "
221 "be removed soon. Use the iptables CT target "
222 "to attach helpers instead.\n");
223 net->ct.auto_assign_helper_warned = true;
224 }
225 }
226 245
227 if (helper == NULL) { 246 if (helper == NULL) {
228 if (help) 247 helper = nf_ct_lookup_helper(ct, net);
229 RCU_INIT_POINTER(help->helper, NULL); 248 if (helper == NULL) {
230 return 0; 249 if (help)
250 RCU_INIT_POINTER(help->helper, NULL);
251 return 0;
252 }
231 } 253 }
232 254
233 if (help == NULL) { 255 if (help == NULL) {
@@ -298,38 +320,36 @@ void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n)
298} 320}
299EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); 321EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister);
300 322
323/* Caller should hold the rcu lock */
301struct nf_ct_helper_expectfn * 324struct nf_ct_helper_expectfn *
302nf_ct_helper_expectfn_find_by_name(const char *name) 325nf_ct_helper_expectfn_find_by_name(const char *name)
303{ 326{
304 struct nf_ct_helper_expectfn *cur; 327 struct nf_ct_helper_expectfn *cur;
305 bool found = false; 328 bool found = false;
306 329
307 rcu_read_lock();
308 list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { 330 list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
309 if (!strcmp(cur->name, name)) { 331 if (!strcmp(cur->name, name)) {
310 found = true; 332 found = true;
311 break; 333 break;
312 } 334 }
313 } 335 }
314 rcu_read_unlock();
315 return found ? cur : NULL; 336 return found ? cur : NULL;
316} 337}
317EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name); 338EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name);
318 339
340/* Caller should hold the rcu lock */
319struct nf_ct_helper_expectfn * 341struct nf_ct_helper_expectfn *
320nf_ct_helper_expectfn_find_by_symbol(const void *symbol) 342nf_ct_helper_expectfn_find_by_symbol(const void *symbol)
321{ 343{
322 struct nf_ct_helper_expectfn *cur; 344 struct nf_ct_helper_expectfn *cur;
323 bool found = false; 345 bool found = false;
324 346
325 rcu_read_lock();
326 list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { 347 list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
327 if (cur->expectfn == symbol) { 348 if (cur->expectfn == symbol) {
328 found = true; 349 found = true;
329 break; 350 break;
330 } 351 }
331 } 352 }
332 rcu_read_unlock();
333 return found ? cur : NULL; 353 return found ? cur : NULL;
334} 354}
335EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol); 355EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol);
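The nf_conntrack_helper.c hunks above push RCU locking out of the expectfn lookup helpers and into their callers, and make nf_conntrack_helper_try_module_get() drop the read lock around request_module(), which may sleep, before re-acquiring it and repeating the lookup. A small sketch of that drop-and-retry pattern follows; read_lock()/read_unlock(), find_helper() and load_module() are invented user-space stand-ins for rcu_read_lock()/rcu_read_unlock(), __nf_conntrack_helper_find() and request_module().

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static void read_lock(void)   { /* rcu_read_lock() in the kernel   */ }
static void read_unlock(void) { /* rcu_read_unlock() in the kernel */ }

struct helper { const char *name; };
static struct helper builtin = { "ftp" };

static struct helper *find_helper(const char *name)
{
	/* would walk an RCU-protected hash chain in the kernel */
	return strcmp(name, builtin.name) == 0 ? &builtin : NULL;
}

static bool load_module(const char *name)
{
	/* request_module() may sleep, so it must not run under the read lock */
	printf("modprobe nfct-helper-%s\n", name);
	return false;	/* pretend no module was found */
}

/* Look up under the read lock, drop it around the sleeping module load,
 * then re-acquire and look up again - the shape of the patched
 * nf_conntrack_helper_try_module_get().
 */
static struct helper *lookup_or_load(const char *name)
{
	struct helper *h;

	read_lock();
	h = find_helper(name);
	if (!h) {
		read_unlock();
		if (!load_module(name))
			return NULL;
		read_lock();
		h = find_helper(name);
	}
	read_unlock();
	return h;
}

int main(void)
{
	printf("ftp -> %p\n", (void *)lookup_or_load("ftp"));
	printf("sip -> %p\n", (void *)lookup_or_load("sip"));
	return 0;
}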
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 27540455dc62..dc7dfd68fafe 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1478,14 +1478,28 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
1478 struct nlattr *helpinfo = NULL; 1478 struct nlattr *helpinfo = NULL;
1479 int err; 1479 int err;
1480 1480
1481 /* don't change helper of sibling connections */
1482 if (ct->master)
1483 return -EBUSY;
1484
1485 err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo); 1481 err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
1486 if (err < 0) 1482 if (err < 0)
1487 return err; 1483 return err;
1488 1484
1485 /* don't change helper of sibling connections */
1486 if (ct->master) {
1487 /* If we try to change the helper to the same thing twice,
1488 * treat the second attempt as a no-op instead of returning
1489 * an error.
1490 */
1491 err = -EBUSY;
1492 if (help) {
1493 rcu_read_lock();
1494 helper = rcu_dereference(help->helper);
1495 if (helper && !strcmp(helper->name, helpname))
1496 err = 0;
1497 rcu_read_unlock();
1498 }
1499
1500 return err;
1501 }
1502
1489 if (!strcmp(helpname, "")) { 1503 if (!strcmp(helpname, "")) {
1490 if (help && help->helper) { 1504 if (help && help->helper) {
1491 /* we had a helper before ... */ 1505 /* we had a helper before ... */
@@ -1920,9 +1934,9 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
1920 1934
1921 err = 0; 1935 err = 0;
1922 if (test_bit(IPS_EXPECTED_BIT, &ct->status)) 1936 if (test_bit(IPS_EXPECTED_BIT, &ct->status))
1923 events = IPCT_RELATED; 1937 events = 1 << IPCT_RELATED;
1924 else 1938 else
1925 events = IPCT_NEW; 1939 events = 1 << IPCT_NEW;
1926 1940
1927 if (cda[CTA_LABELS] && 1941 if (cda[CTA_LABELS] &&
1928 ctnetlink_attach_labels(ct, cda) == 0) 1942 ctnetlink_attach_labels(ct, cda) == 0)
@@ -2270,6 +2284,30 @@ nla_put_failure:
2270} 2284}
2271 2285
2272static int 2286static int
2287ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
2288{
2289 unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
2290 unsigned long d = ct->status ^ status;
2291
2292 if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
2293 /* SEEN_REPLY bit can only be set */
2294 return -EBUSY;
2295
2296 if (d & IPS_ASSURED && !(status & IPS_ASSURED))
2297 /* ASSURED bit can only be set */
2298 return -EBUSY;
2299
2300 /* This check is less strict than ctnetlink_change_status()
2301 * because callers often flip IPS_EXPECTED bits when sending
2302 * an NFQA_CT attribute to the kernel. So ignore the
2303 * unchangeable bits but do not error out.
2304 */
2305 ct->status = (status & ~IPS_UNCHANGEABLE_MASK) |
2306 (ct->status & IPS_UNCHANGEABLE_MASK);
2307 return 0;
2308}
2309
2310static int
2273ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) 2311ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
2274{ 2312{
2275 int err; 2313 int err;
@@ -2280,7 +2318,7 @@ ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
2280 return err; 2318 return err;
2281 } 2319 }
2282 if (cda[CTA_STATUS]) { 2320 if (cda[CTA_STATUS]) {
2283 err = ctnetlink_change_status(ct, cda); 2321 err = ctnetlink_update_status(ct, cda);
2284 if (err < 0) 2322 if (err < 0)
2285 return err; 2323 return err;
2286 } 2324 }
@@ -2642,8 +2680,8 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
2642 last = (struct nf_conntrack_expect *)cb->args[1]; 2680 last = (struct nf_conntrack_expect *)cb->args[1];
2643 for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { 2681 for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
2644restart: 2682restart:
2645 hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]], 2683 hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
2646 hnode) { 2684 hnode) {
2647 if (l3proto && exp->tuple.src.l3num != l3proto) 2685 if (l3proto && exp->tuple.src.l3num != l3proto)
2648 continue; 2686 continue;
2649 2687
@@ -2694,7 +2732,7 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
2694 rcu_read_lock(); 2732 rcu_read_lock();
2695 last = (struct nf_conntrack_expect *)cb->args[1]; 2733 last = (struct nf_conntrack_expect *)cb->args[1];
2696restart: 2734restart:
2697 hlist_for_each_entry(exp, &help->expectations, lnode) { 2735 hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
2698 if (l3proto && exp->tuple.src.l3num != l3proto) 2736 if (l3proto && exp->tuple.src.l3num != l3proto)
2699 continue; 2737 continue;
2700 if (cb->args[1]) { 2738 if (cb->args[1]) {
@@ -2756,6 +2794,12 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
2756 return -ENOENT; 2794 return -ENOENT;
2757 2795
2758 ct = nf_ct_tuplehash_to_ctrack(h); 2796 ct = nf_ct_tuplehash_to_ctrack(h);
2797 /* No expectation linked to this connection tracking. */
2798 if (!nfct_help(ct)) {
2799 nf_ct_put(ct);
2800 return 0;
2801 }
2802
2759 c.data = ct; 2803 c.data = ct;
2760 2804
2761 err = netlink_dump_start(ctnl, skb, nlh, &c); 2805 err = netlink_dump_start(ctnl, skb, nlh, &c);
@@ -3100,23 +3144,27 @@ ctnetlink_create_expect(struct net *net,
3100 return -ENOENT; 3144 return -ENOENT;
3101 ct = nf_ct_tuplehash_to_ctrack(h); 3145 ct = nf_ct_tuplehash_to_ctrack(h);
3102 3146
3147 rcu_read_lock();
3103 if (cda[CTA_EXPECT_HELP_NAME]) { 3148 if (cda[CTA_EXPECT_HELP_NAME]) {
3104 const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); 3149 const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
3105 3150
3106 helper = __nf_conntrack_helper_find(helpname, u3, 3151 helper = __nf_conntrack_helper_find(helpname, u3,
3107 nf_ct_protonum(ct)); 3152 nf_ct_protonum(ct));
3108 if (helper == NULL) { 3153 if (helper == NULL) {
3154 rcu_read_unlock();
3109#ifdef CONFIG_MODULES 3155#ifdef CONFIG_MODULES
3110 if (request_module("nfct-helper-%s", helpname) < 0) { 3156 if (request_module("nfct-helper-%s", helpname) < 0) {
3111 err = -EOPNOTSUPP; 3157 err = -EOPNOTSUPP;
3112 goto err_ct; 3158 goto err_ct;
3113 } 3159 }
3160 rcu_read_lock();
3114 helper = __nf_conntrack_helper_find(helpname, u3, 3161 helper = __nf_conntrack_helper_find(helpname, u3,
3115 nf_ct_protonum(ct)); 3162 nf_ct_protonum(ct));
3116 if (helper) { 3163 if (helper) {
3117 err = -EAGAIN; 3164 err = -EAGAIN;
3118 goto err_ct; 3165 goto err_rcu;
3119 } 3166 }
3167 rcu_read_unlock();
3120#endif 3168#endif
3121 err = -EOPNOTSUPP; 3169 err = -EOPNOTSUPP;
3122 goto err_ct; 3170 goto err_ct;
@@ -3126,11 +3174,13 @@ ctnetlink_create_expect(struct net *net,
3126 exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask); 3174 exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask);
3127 if (IS_ERR(exp)) { 3175 if (IS_ERR(exp)) {
3128 err = PTR_ERR(exp); 3176 err = PTR_ERR(exp);
3129 goto err_ct; 3177 goto err_rcu;
3130 } 3178 }
3131 3179
3132 err = nf_ct_expect_related_report(exp, portid, report); 3180 err = nf_ct_expect_related_report(exp, portid, report);
3133 nf_ct_expect_put(exp); 3181 nf_ct_expect_put(exp);
3182err_rcu:
3183 rcu_read_unlock();
3134err_ct: 3184err_ct:
3135 nf_ct_put(ct); 3185 nf_ct_put(ct);
3136 return err; 3186 return err;
@@ -3409,6 +3459,7 @@ static void __exit ctnetlink_exit(void)
3409#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT 3459#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
3410 RCU_INIT_POINTER(nfnl_ct_hook, NULL); 3460 RCU_INIT_POINTER(nfnl_ct_hook, NULL);
3411#endif 3461#endif
3462 synchronize_rcu();
3412} 3463}
3413 3464
3414module_init(ctnetlink_init); 3465module_init(ctnetlink_init);
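The new ctnetlink_update_status() above XORs the requested status word with the current one so it can reject any request that would clear a set-only bit, while the bits in IPS_UNCHANGEABLE_MASK are silently preserved rather than treated as an error. A compact sketch of the same bit-delta logic, using made-up flag values (F_SEEN_REPLY, F_ASSURED and F_UNCHANGEABLE stand in for the kernel's IPS_* bits):

#include <stdio.h>

#define F_SEEN_REPLY	(1u << 0)	/* may only ever be set */
#define F_ASSURED	(1u << 1)	/* may only ever be set */
#define F_UNCHANGEABLE	(1u << 2)	/* silently preserved   */

/* Reject requests that try to clear a set-only bit, otherwise merge the
 * request while keeping the unchangeable bits exactly as they were.
 */
static int update_status(unsigned int *cur, unsigned int req)
{
	unsigned int d = *cur ^ req;		/* bits that would change */

	if ((d & F_SEEN_REPLY) && !(req & F_SEEN_REPLY))
		return -1;			/* would clear SEEN_REPLY */
	if ((d & F_ASSURED) && !(req & F_ASSURED))
		return -1;			/* would clear ASSURED */

	*cur = (req & ~F_UNCHANGEABLE) | (*cur & F_UNCHANGEABLE);
	return 0;
}

int main(void)
{
	unsigned int st = F_SEEN_REPLY | F_UNCHANGEABLE;
	int rc;

	rc = update_status(&st, st | F_ASSURED);
	printf("set ASSURED:      rc=%d status=0x%x\n", rc, st);

	rc = update_status(&st, st & ~F_SEEN_REPLY);
	printf("clear SEEN_REPLY: rc=%d status=0x%x\n", rc, st);
	return 0;
}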
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index b68ce6ac13b3..93dd1c5b7bff 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -561,7 +561,6 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
561 561
562static int dccp_error(struct net *net, struct nf_conn *tmpl, 562static int dccp_error(struct net *net, struct nf_conn *tmpl,
563 struct sk_buff *skb, unsigned int dataoff, 563 struct sk_buff *skb, unsigned int dataoff,
564 enum ip_conntrack_info *ctinfo,
565 u_int8_t pf, unsigned int hooknum) 564 u_int8_t pf, unsigned int hooknum)
566{ 565{
567 struct dccp_hdr _dh, *dh; 566 struct dccp_hdr _dh, *dh;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index a0efde38da44..33279aab583d 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -22,7 +22,9 @@
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/spinlock.h> 23#include <linux/spinlock.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <net/sctp/checksum.h>
25 26
27#include <net/netfilter/nf_log.h>
26#include <net/netfilter/nf_conntrack.h> 28#include <net/netfilter/nf_conntrack.h>
27#include <net/netfilter/nf_conntrack_l4proto.h> 29#include <net/netfilter/nf_conntrack_l4proto.h>
28#include <net/netfilter/nf_conntrack_ecache.h> 30#include <net/netfilter/nf_conntrack_ecache.h>
@@ -505,6 +507,34 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
505 return true; 507 return true;
506} 508}
507 509
510static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
511 unsigned int dataoff,
512 u8 pf, unsigned int hooknum)
513{
514 const struct sctphdr *sh;
515 struct sctphdr _sctph;
516 const char *logmsg;
517
518 sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
519 if (!sh) {
520 logmsg = "nf_ct_sctp: short packet ";
521 goto out_invalid;
522 }
523 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
524 skb->ip_summed == CHECKSUM_NONE) {
525 if (sh->checksum != sctp_compute_cksum(skb, dataoff)) {
526 logmsg = "nf_ct_sctp: bad CRC ";
527 goto out_invalid;
528 }
529 skb->ip_summed = CHECKSUM_UNNECESSARY;
530 }
531 return NF_ACCEPT;
532out_invalid:
533 if (LOG_INVALID(net, IPPROTO_SCTP))
534 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", logmsg);
535 return -NF_ACCEPT;
536}
537
508#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 538#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
509 539
510#include <linux/netfilter/nfnetlink.h> 540#include <linux/netfilter/nfnetlink.h>
@@ -752,6 +782,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
752 .packet = sctp_packet, 782 .packet = sctp_packet,
753 .get_timeouts = sctp_get_timeouts, 783 .get_timeouts = sctp_get_timeouts,
754 .new = sctp_new, 784 .new = sctp_new,
785 .error = sctp_error,
755 .me = THIS_MODULE, 786 .me = THIS_MODULE,
756#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 787#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
757 .to_nlattr = sctp_to_nlattr, 788 .to_nlattr = sctp_to_nlattr,
@@ -786,6 +817,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
786 .packet = sctp_packet, 817 .packet = sctp_packet,
787 .get_timeouts = sctp_get_timeouts, 818 .get_timeouts = sctp_get_timeouts,
788 .new = sctp_new, 819 .new = sctp_new,
820 .error = sctp_error,
789 .me = THIS_MODULE, 821 .me = THIS_MODULE,
790#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 822#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
791 .to_nlattr = sctp_to_nlattr, 823 .to_nlattr = sctp_to_nlattr,
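The new sctp_error() handler above verifies the SCTP checksum once at PRE_ROUTING and then marks the skb with CHECKSUM_UNNECESSARY so later passes skip the work. Below is a toy sketch of that verify-once-and-cache pattern; struct pkt, toy_csum() and check_once() are invented stand-ins, and toy_csum() is only a placeholder for the CRC32c computed by sctp_compute_cksum().

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum csum_state { CSUM_NONE, CSUM_UNNECESSARY };

struct pkt {
	const uint8_t *data;
	size_t len;
	uint32_t csum;		/* checksum carried in the header */
	enum csum_state state;	/* what has already been verified */
};

static uint32_t toy_csum(const uint8_t *p, size_t len)
{
	uint32_t s = 0;

	while (len--)
		s = s * 31 + *p++;	/* stand-in for sctp_compute_cksum() */
	return s;
}

/* Return false (drop) on a bad checksum; verify at most once per packet. */
static bool check_once(struct pkt *pkt)
{
	if (pkt->state == CSUM_UNNECESSARY)
		return true;			/* already verified earlier */
	if (toy_csum(pkt->data, pkt->len) != pkt->csum)
		return false;
	pkt->state = CSUM_UNNECESSARY;		/* later hooks skip the check */
	return true;
}

int main(void)
{
	const uint8_t payload[] = "INIT chunk";
	struct pkt pkt = {
		.data = payload, .len = sizeof(payload),
		.csum = 0, .state = CSUM_NONE,
	};

	pkt.csum = toy_csum(payload, sizeof(payload));
	printf("first hook:  %s\n", check_once(&pkt) ? "accept" : "drop");
	printf("second hook: %s (cached)\n", check_once(&pkt) ? "accept" : "drop");
	return 0;
}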
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 69f687740c76..b122e9dacfed 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -750,7 +750,6 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
750static int tcp_error(struct net *net, struct nf_conn *tmpl, 750static int tcp_error(struct net *net, struct nf_conn *tmpl,
751 struct sk_buff *skb, 751 struct sk_buff *skb,
752 unsigned int dataoff, 752 unsigned int dataoff,
753 enum ip_conntrack_info *ctinfo,
754 u_int8_t pf, 753 u_int8_t pf,
755 unsigned int hooknum) 754 unsigned int hooknum)
756{ 755{
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 20f35ed68030..f6ebce6178ca 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -108,8 +108,60 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
108 return true; 108 return true;
109} 109}
110 110
111#ifdef CONFIG_NF_CT_PROTO_UDPLITE
112static int udplite_error(struct net *net, struct nf_conn *tmpl,
113 struct sk_buff *skb,
114 unsigned int dataoff,
115 u8 pf, unsigned int hooknum)
116{
117 unsigned int udplen = skb->len - dataoff;
118 const struct udphdr *hdr;
119 struct udphdr _hdr;
120 unsigned int cscov;
121
122 /* Header is too small? */
123 hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
124 if (!hdr) {
125 if (LOG_INVALID(net, IPPROTO_UDPLITE))
126 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
127 "nf_ct_udplite: short packet ");
128 return -NF_ACCEPT;
129 }
130
131 cscov = ntohs(hdr->len);
132 if (cscov == 0) {
133 cscov = udplen;
134 } else if (cscov < sizeof(*hdr) || cscov > udplen) {
135 if (LOG_INVALID(net, IPPROTO_UDPLITE))
136 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
137 "nf_ct_udplite: invalid checksum coverage ");
138 return -NF_ACCEPT;
139 }
140
141 /* UDPLITE mandates checksums */
142 if (!hdr->check) {
143 if (LOG_INVALID(net, IPPROTO_UDPLITE))
144 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
145 "nf_ct_udplite: checksum missing ");
146 return -NF_ACCEPT;
147 }
148
149 /* Checksum invalid? Ignore. */
150 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
151 nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
152 pf)) {
153 if (LOG_INVALID(net, IPPROTO_UDPLITE))
154 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
155 "nf_ct_udplite: bad UDPLite checksum ");
156 return -NF_ACCEPT;
157 }
158
159 return NF_ACCEPT;
160}
161#endif
162
111static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, 163static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
112 unsigned int dataoff, enum ip_conntrack_info *ctinfo, 164 unsigned int dataoff,
113 u_int8_t pf, 165 u_int8_t pf,
114 unsigned int hooknum) 166 unsigned int hooknum)
115{ 167{
@@ -290,6 +342,41 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
290}; 342};
291EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4); 343EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
292 344
345#ifdef CONFIG_NF_CT_PROTO_UDPLITE
346struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
347{
348 .l3proto = PF_INET,
349 .l4proto = IPPROTO_UDPLITE,
350 .name = "udplite",
351 .allow_clash = true,
352 .pkt_to_tuple = udp_pkt_to_tuple,
353 .invert_tuple = udp_invert_tuple,
354 .print_tuple = udp_print_tuple,
355 .packet = udp_packet,
356 .get_timeouts = udp_get_timeouts,
357 .new = udp_new,
358 .error = udplite_error,
359#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
360 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
361 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
362 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
363 .nla_policy = nf_ct_port_nla_policy,
364#endif
365#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
366 .ctnl_timeout = {
367 .nlattr_to_obj = udp_timeout_nlattr_to_obj,
368 .obj_to_nlattr = udp_timeout_obj_to_nlattr,
369 .nlattr_max = CTA_TIMEOUT_UDP_MAX,
370 .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
371 .nla_policy = udp_timeout_nla_policy,
372 },
373#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
374 .init_net = udp_init_net,
375 .get_net_proto = udp_get_net_proto,
376};
377EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
378#endif
379
293struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = 380struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
294{ 381{
295 .l3proto = PF_INET6, 382 .l3proto = PF_INET6,
@@ -322,3 +409,38 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
322 .get_net_proto = udp_get_net_proto, 409 .get_net_proto = udp_get_net_proto,
323}; 410};
324EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); 411EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
412
413#ifdef CONFIG_NF_CT_PROTO_UDPLITE
414struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
415{
416 .l3proto = PF_INET6,
417 .l4proto = IPPROTO_UDPLITE,
418 .name = "udplite",
419 .allow_clash = true,
420 .pkt_to_tuple = udp_pkt_to_tuple,
421 .invert_tuple = udp_invert_tuple,
422 .print_tuple = udp_print_tuple,
423 .packet = udp_packet,
424 .get_timeouts = udp_get_timeouts,
425 .new = udp_new,
426 .error = udplite_error,
427#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
428 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
429 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
430 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
431 .nla_policy = nf_ct_port_nla_policy,
432#endif
433#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
434 .ctnl_timeout = {
435 .nlattr_to_obj = udp_timeout_nlattr_to_obj,
436 .obj_to_nlattr = udp_timeout_obj_to_nlattr,
437 .nlattr_max = CTA_TIMEOUT_UDP_MAX,
438 .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
439 .nla_policy = udp_timeout_nla_policy,
440 },
441#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
442 .init_net = udp_init_net,
443 .get_net_proto = udp_get_net_proto,
444};
445EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
446#endif
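With the UDPLITE tracker now built into nf_conntrack_proto_udp.c (the next diff removes the old standalone file), udplite_error() keeps the UDP-Lite rule that the length field carries the checksum coverage: zero means the whole datagram is covered, and any other value must span at least the 8-byte header and no more than the datagram itself. A small sketch of that coverage check; coverage_ok() is an invented name for illustration.

#include <stdbool.h>
#include <stdio.h>

#define UDPLITE_HDR_LEN 8u	/* a UDP-Lite header is 8 bytes, like UDP */

/* Validate the UDP-Lite checksum coverage field and report how many
 * bytes the checksum actually covers.
 */
static bool coverage_ok(unsigned int cscov, unsigned int datagram_len,
			unsigned int *effective)
{
	if (cscov == 0) {
		*effective = datagram_len;	/* 0 means full coverage */
		return true;
	}
	if (cscov < UDPLITE_HDR_LEN || cscov > datagram_len)
		return false;			/* invalid coverage value */
	*effective = cscov;
	return true;
}

int main(void)
{
	unsigned int tests[] = { 0, 8, 100, 4, 200 };
	unsigned int len = 100;

	for (unsigned int i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
		unsigned int eff;

		if (coverage_ok(tests[i], len, &eff))
			printf("cscov=%u -> ok, checksum covers %u bytes\n",
			       tests[i], eff);
		else
			printf("cscov=%u -> invalid\n", tests[i]);
	}
	return 0;
}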
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
deleted file mode 100644
index c35f7bf05d8c..000000000000
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ /dev/null
@@ -1,324 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2007 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/timer.h>
12#include <linux/udp.h>
13#include <linux/seq_file.h>
14#include <linux/skbuff.h>
15#include <linux/ipv6.h>
16#include <net/ip6_checksum.h>
17#include <net/checksum.h>
18
19#include <linux/netfilter.h>
20#include <linux/netfilter_ipv4.h>
21#include <linux/netfilter_ipv6.h>
22#include <net/netfilter/nf_conntrack_l4proto.h>
23#include <net/netfilter/nf_conntrack_ecache.h>
24#include <net/netfilter/nf_log.h>
25
26static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = {
27 [UDPLITE_CT_UNREPLIED] = 30*HZ,
28 [UDPLITE_CT_REPLIED] = 180*HZ,
29};
30
31static inline struct nf_udplite_net *udplite_pernet(struct net *net)
32{
33 return &net->ct.nf_ct_proto.udplite;
34}
35
36static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
37 unsigned int dataoff,
38 struct net *net,
39 struct nf_conntrack_tuple *tuple)
40{
41 const struct udphdr *hp;
42 struct udphdr _hdr;
43
44 /* Actually only need first 4 bytes to get ports. */
45 hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
46 if (hp == NULL)
47 return false;
48
49 tuple->src.u.udp.port = hp->source;
50 tuple->dst.u.udp.port = hp->dest;
51 return true;
52}
53
54static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple,
55 const struct nf_conntrack_tuple *orig)
56{
57 tuple->src.u.udp.port = orig->dst.u.udp.port;
58 tuple->dst.u.udp.port = orig->src.u.udp.port;
59 return true;
60}
61
62/* Print out the per-protocol part of the tuple. */
63static void udplite_print_tuple(struct seq_file *s,
64 const struct nf_conntrack_tuple *tuple)
65{
66 seq_printf(s, "sport=%hu dport=%hu ",
67 ntohs(tuple->src.u.udp.port),
68 ntohs(tuple->dst.u.udp.port));
69}
70
71static unsigned int *udplite_get_timeouts(struct net *net)
72{
73 return udplite_pernet(net)->timeouts;
74}
75
76/* Returns verdict for packet, and may modify conntracktype */
77static int udplite_packet(struct nf_conn *ct,
78 const struct sk_buff *skb,
79 unsigned int dataoff,
80 enum ip_conntrack_info ctinfo,
81 u_int8_t pf,
82 unsigned int hooknum,
83 unsigned int *timeouts)
84{
85 /* If we've seen traffic both ways, this is some kind of UDP
86 stream. Extend timeout. */
87 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
88 nf_ct_refresh_acct(ct, ctinfo, skb,
89 timeouts[UDPLITE_CT_REPLIED]);
90 /* Also, more likely to be important, and not a probe */
91 if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
92 nf_conntrack_event_cache(IPCT_ASSURED, ct);
93 } else {
94 nf_ct_refresh_acct(ct, ctinfo, skb,
95 timeouts[UDPLITE_CT_UNREPLIED]);
96 }
97 return NF_ACCEPT;
98}
99
100/* Called when a new connection for this protocol found. */
101static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb,
102 unsigned int dataoff, unsigned int *timeouts)
103{
104 return true;
105}
106
107static int udplite_error(struct net *net, struct nf_conn *tmpl,
108 struct sk_buff *skb,
109 unsigned int dataoff,
110 enum ip_conntrack_info *ctinfo,
111 u_int8_t pf,
112 unsigned int hooknum)
113{
114 unsigned int udplen = skb->len - dataoff;
115 const struct udphdr *hdr;
116 struct udphdr _hdr;
117 unsigned int cscov;
118
119 /* Header is too small? */
120 hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
121 if (hdr == NULL) {
122 if (LOG_INVALID(net, IPPROTO_UDPLITE))
123 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
124 "nf_ct_udplite: short packet ");
125 return -NF_ACCEPT;
126 }
127
128 cscov = ntohs(hdr->len);
129 if (cscov == 0)
130 cscov = udplen;
131 else if (cscov < sizeof(*hdr) || cscov > udplen) {
132 if (LOG_INVALID(net, IPPROTO_UDPLITE))
133 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
134 "nf_ct_udplite: invalid checksum coverage ");
135 return -NF_ACCEPT;
136 }
137
138 /* UDPLITE mandates checksums */
139 if (!hdr->check) {
140 if (LOG_INVALID(net, IPPROTO_UDPLITE))
141 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
142 "nf_ct_udplite: checksum missing ");
143 return -NF_ACCEPT;
144 }
145
146 /* Checksum invalid? Ignore. */
147 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
148 nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
149 pf)) {
150 if (LOG_INVALID(net, IPPROTO_UDPLITE))
151 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
152 "nf_ct_udplite: bad UDPLite checksum ");
153 return -NF_ACCEPT;
154 }
155
156 return NF_ACCEPT;
157}
158
159#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
160
161#include <linux/netfilter/nfnetlink.h>
162#include <linux/netfilter/nfnetlink_cttimeout.h>
163
164static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[],
165 struct net *net, void *data)
166{
167 unsigned int *timeouts = data;
168 struct nf_udplite_net *un = udplite_pernet(net);
169
170 /* set default timeouts for UDPlite. */
171 timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED];
172 timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED];
173
174 if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) {
175 timeouts[UDPLITE_CT_UNREPLIED] =
176 ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ;
177 }
178 if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) {
179 timeouts[UDPLITE_CT_REPLIED] =
180 ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ;
181 }
182 return 0;
183}
184
185static int
186udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
187{
188 const unsigned int *timeouts = data;
189
190 if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED,
191 htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)) ||
192 nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED,
193 htonl(timeouts[UDPLITE_CT_REPLIED] / HZ)))
194 goto nla_put_failure;
195 return 0;
196
197nla_put_failure:
198 return -ENOSPC;
199}
200
201static const struct nla_policy
202udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = {
203 [CTA_TIMEOUT_UDPLITE_UNREPLIED] = { .type = NLA_U32 },
204 [CTA_TIMEOUT_UDPLITE_REPLIED] = { .type = NLA_U32 },
205};
206#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
207
208#ifdef CONFIG_SYSCTL
209static struct ctl_table udplite_sysctl_table[] = {
210 {
211 .procname = "nf_conntrack_udplite_timeout",
212 .maxlen = sizeof(unsigned int),
213 .mode = 0644,
214 .proc_handler = proc_dointvec_jiffies,
215 },
216 {
217 .procname = "nf_conntrack_udplite_timeout_stream",
218 .maxlen = sizeof(unsigned int),
219 .mode = 0644,
220 .proc_handler = proc_dointvec_jiffies,
221 },
222 { }
223};
224#endif /* CONFIG_SYSCTL */
225
226static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
227 struct nf_udplite_net *un)
228{
229#ifdef CONFIG_SYSCTL
230 if (pn->ctl_table)
231 return 0;
232
233 pn->ctl_table = kmemdup(udplite_sysctl_table,
234 sizeof(udplite_sysctl_table),
235 GFP_KERNEL);
236 if (!pn->ctl_table)
237 return -ENOMEM;
238
239 pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED];
240 pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED];
241#endif
242 return 0;
243}
244
245static int udplite_init_net(struct net *net, u_int16_t proto)
246{
247 struct nf_udplite_net *un = udplite_pernet(net);
248 struct nf_proto_net *pn = &un->pn;
249
250 if (!pn->users) {
251 int i;
252
253 for (i = 0 ; i < UDPLITE_CT_MAX; i++)
254 un->timeouts[i] = udplite_timeouts[i];
255 }
256
257 return udplite_kmemdup_sysctl_table(pn, un);
258}
259
260struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
261{
262 .l3proto = PF_INET,
263 .l4proto = IPPROTO_UDPLITE,
264 .name = "udplite",
265 .allow_clash = true,
266 .pkt_to_tuple = udplite_pkt_to_tuple,
267 .invert_tuple = udplite_invert_tuple,
268 .print_tuple = udplite_print_tuple,
269 .packet = udplite_packet,
270 .get_timeouts = udplite_get_timeouts,
271 .new = udplite_new,
272 .error = udplite_error,
273#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
274 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
275 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
276 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
277 .nla_policy = nf_ct_port_nla_policy,
278#endif
279#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
280 .ctnl_timeout = {
281 .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
282 .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
283 .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
284 .obj_size = sizeof(unsigned int) *
285 CTA_TIMEOUT_UDPLITE_MAX,
286 .nla_policy = udplite_timeout_nla_policy,
287 },
288#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
289 .init_net = udplite_init_net,
290};
291EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
292
293struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
294{
295 .l3proto = PF_INET6,
296 .l4proto = IPPROTO_UDPLITE,
297 .name = "udplite",
298 .allow_clash = true,
299 .pkt_to_tuple = udplite_pkt_to_tuple,
300 .invert_tuple = udplite_invert_tuple,
301 .print_tuple = udplite_print_tuple,
302 .packet = udplite_packet,
303 .get_timeouts = udplite_get_timeouts,
304 .new = udplite_new,
305 .error = udplite_error,
306#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
307 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
308 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
309 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
310 .nla_policy = nf_ct_port_nla_policy,
311#endif
312#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
313 .ctnl_timeout = {
314 .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
315 .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
316 .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
317 .obj_size = sizeof(unsigned int) *
318 CTA_TIMEOUT_UDPLITE_MAX,
319 .nla_policy = udplite_timeout_nla_policy,
320 },
321#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
322 .init_net = udplite_init_net,
323};
324EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index c3fc14e021ec..0d17894798b5 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -809,13 +809,11 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
809 exp->tuple.dst.protonum != proto || 809 exp->tuple.dst.protonum != proto ||
810 exp->tuple.dst.u.udp.port != port) 810 exp->tuple.dst.u.udp.port != port)
811 continue; 811 continue;
812 if (!del_timer(&exp->timeout)) 812 if (mod_timer_pending(&exp->timeout, jiffies + expires * HZ)) {
813 continue; 813 exp->flags &= ~NF_CT_EXPECT_INACTIVE;
814 exp->flags &= ~NF_CT_EXPECT_INACTIVE; 814 found = 1;
815 exp->timeout.expires = jiffies + expires * HZ; 815 break;
816 add_timer(&exp->timeout); 816 }
817 found = 1;
818 break;
819 } 817 }
820 spin_unlock_bh(&nf_conntrack_expect_lock); 818 spin_unlock_bh(&nf_conntrack_expect_lock);
821 return found; 819 return found;
@@ -1630,8 +1628,6 @@ static int __init nf_conntrack_sip_init(void)
1630 ports[ports_c++] = SIP_PORT; 1628 ports[ports_c++] = SIP_PORT;
1631 1629
1632 for (i = 0; i < ports_c; i++) { 1630 for (i = 0; i < ports_c; i++) {
1633 memset(&sip[i], 0, sizeof(sip[i]));
1634
1635 nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip", 1631 nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip",
1636 SIP_PORT, ports[i], i, sip_exp_policy, 1632 SIP_PORT, ports[i], i, sip_exp_policy,
1637 SIP_EXPECT_MAX, 1633 SIP_EXPECT_MAX,
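The nf_conntrack_sip.c hunk above replaces the del_timer()/add_timer() pair with mod_timer_pending(), so an expectation's timeout is only extended while its timer is still pending; an expectation whose timer has already fired is left alone instead of being re-armed. A toy illustration of that semantic follows; struct toy_timer and mod_timer_pending_toy() are user-space stand-ins, not the kernel timer API.

#include <stdbool.h>
#include <stdio.h>

struct toy_timer {
	bool pending;		/* still queued, i.e. has not fired yet */
	unsigned long expires;
};

/* Only push the expiry out if the timer is still pending; a timer that
 * already fired stays dead, which is the point of mod_timer_pending().
 */
static bool mod_timer_pending_toy(struct toy_timer *t, unsigned long expires)
{
	if (!t->pending)
		return false;
	t->expires = expires;
	return true;
}

int main(void)
{
	struct toy_timer live = { .pending = true,  .expires = 100 };
	struct toy_timer dead = { .pending = false, .expires = 100 };
	bool ok;

	ok = mod_timer_pending_toy(&live, 200);
	printf("live timer refreshed: %d (expires=%lu)\n", ok, live.expires);

	ok = mod_timer_pending_toy(&dead, 200);
	printf("expired timer refreshed: %d (expires=%lu)\n", ok, dead.expires);
	return 0;
}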
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index d009ae663453..2256147dcaad 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -642,6 +642,9 @@ static int __init nf_conntrack_standalone_init(void)
642 if (ret < 0) 642 if (ret < 0)
643 goto out_start; 643 goto out_start;
644 644
645 BUILD_BUG_ON(SKB_NFCT_PTRMASK != NFCT_PTRMASK);
646 BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER);
647
645#ifdef CONFIG_SYSCTL 648#ifdef CONFIG_SYSCTL
646 nf_ct_netfilter_header = 649 nf_ct_netfilter_header =
647 register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); 650 register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index ffb9e8ada899..8d85a0598b60 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -15,6 +15,9 @@
15 15
16#define NFLOGGER_NAME_LEN 64 16#define NFLOGGER_NAME_LEN 64
17 17
18int sysctl_nf_log_all_netns __read_mostly;
19EXPORT_SYMBOL(sysctl_nf_log_all_netns);
20
18static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; 21static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
19static DEFINE_MUTEX(nf_log_mutex); 22static DEFINE_MUTEX(nf_log_mutex);
20 23
@@ -413,6 +416,18 @@ static const struct file_operations nflog_file_ops = {
413#ifdef CONFIG_SYSCTL 416#ifdef CONFIG_SYSCTL
414static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; 417static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
415static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; 418static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
419static struct ctl_table_header *nf_log_sysctl_fhdr;
420
421static struct ctl_table nf_log_sysctl_ftable[] = {
422 {
423 .procname = "nf_log_all_netns",
424 .data = &sysctl_nf_log_all_netns,
425 .maxlen = sizeof(sysctl_nf_log_all_netns),
426 .mode = 0644,
427 .proc_handler = proc_dointvec,
428 },
429 { }
430};
416 431
417static int nf_log_proc_dostring(struct ctl_table *table, int write, 432static int nf_log_proc_dostring(struct ctl_table *table, int write,
418 void __user *buffer, size_t *lenp, loff_t *ppos) 433 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -482,6 +497,10 @@ static int netfilter_log_sysctl_init(struct net *net)
482 nf_log_sysctl_table[i].extra1 = 497 nf_log_sysctl_table[i].extra1 =
483 (void *)(unsigned long) i; 498 (void *)(unsigned long) i;
484 } 499 }
500 nf_log_sysctl_fhdr = register_net_sysctl(net, "net/netfilter",
501 nf_log_sysctl_ftable);
502 if (!nf_log_sysctl_fhdr)
503 goto err_freg;
485 } 504 }
486 505
487 for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) 506 for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
@@ -498,6 +517,9 @@ static int netfilter_log_sysctl_init(struct net *net)
498err_reg: 517err_reg:
499 if (!net_eq(net, &init_net)) 518 if (!net_eq(net, &init_net))
500 kfree(table); 519 kfree(table);
520 else
521 unregister_net_sysctl_table(nf_log_sysctl_fhdr);
522err_freg:
501err_alloc: 523err_alloc:
502 return -ENOMEM; 524 return -ENOMEM;
503} 525}
@@ -510,6 +532,8 @@ static void netfilter_log_sysctl_exit(struct net *net)
510 unregister_net_sysctl_table(net->nf.nf_log_dir_header); 532 unregister_net_sysctl_table(net->nf.nf_log_dir_header);
511 if (!net_eq(net, &init_net)) 533 if (!net_eq(net, &init_net))
512 kfree(table); 534 kfree(table);
535 else
536 unregister_net_sysctl_table(nf_log_sysctl_fhdr);
513} 537}
514#else 538#else
515static int netfilter_log_sysctl_init(struct net *net) 539static int netfilter_log_sysctl_init(struct net *net)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 94b14c5a8b17..82802e4a6640 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -903,6 +903,8 @@ static void __exit nf_nat_cleanup(void)
903#ifdef CONFIG_XFRM 903#ifdef CONFIG_XFRM
904 RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL); 904 RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
905#endif 905#endif
906 synchronize_rcu();
907
906 for (i = 0; i < NFPROTO_NUMPROTO; i++) 908 for (i = 0; i < NFPROTO_NUMPROTO; i++)
907 kfree(nf_nat_l4protos[i]); 909 kfree(nf_nat_l4protos[i]);
908 910
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 2840abb5bb99..211661cb2c90 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -60,7 +60,7 @@ static void mangle_contents(struct sk_buff *skb,
60 __skb_trim(skb, skb->len + rep_len - match_len); 60 __skb_trim(skb, skb->len + rep_len - match_len);
61 } 61 }
62 62
63 if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) { 63 if (nf_ct_l3num((struct nf_conn *)skb_nfct(skb)) == NFPROTO_IPV4) {
64 /* fix IP hdr checksum information */ 64 /* fix IP hdr checksum information */
65 ip_hdr(skb)->tot_len = htons(skb->len); 65 ip_hdr(skb)->tot_len = htons(skb->len);
66 ip_send_check(ip_hdr(skb)); 66 ip_send_check(ip_hdr(skb));
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index 31d358691af0..804e8a0ab36e 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -33,8 +33,16 @@ sctp_manip_pkt(struct sk_buff *skb,
33 enum nf_nat_manip_type maniptype) 33 enum nf_nat_manip_type maniptype)
34{ 34{
35 sctp_sctphdr_t *hdr; 35 sctp_sctphdr_t *hdr;
36 int hdrsize = 8;
36 37
37 if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) 38 /* This could be an inner header returned in icmp packet; in such
39 * cases we cannot update the checksum field since it is outside
40 * of the 8 bytes of transport layer headers we are guaranteed.
41 */
42 if (skb->len >= hdroff + sizeof(*hdr))
43 hdrsize = sizeof(*hdr);
44
45 if (!skb_make_writable(skb, hdroff + hdrsize))
38 return false; 46 return false;
39 47
40 hdr = (struct sctphdr *)(skb->data + hdroff); 48 hdr = (struct sctphdr *)(skb->data + hdroff);
@@ -47,6 +55,9 @@ sctp_manip_pkt(struct sk_buff *skb,
47 hdr->dest = tuple->dst.u.sctp.port; 55 hdr->dest = tuple->dst.u.sctp.port;
48 } 56 }
49 57
58 if (hdrsize < sizeof(*hdr))
59 return true;
60
50 if (skb->ip_summed != CHECKSUM_PARTIAL) { 61 if (skb->ip_summed != CHECKSUM_PARTIAL) {
51 hdr->checksum = sctp_compute_cksum(skb, hdroff); 62 hdr->checksum = sctp_compute_cksum(skb, hdroff);
52 skb->ip_summed = CHECKSUM_NONE; 63 skb->ip_summed = CHECKSUM_NONE;
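The SCTP hunk above handles NAT of an SCTP header quoted inside an ICMP error: only the first 8 bytes of the embedded transport header are guaranteed to be present, which covers the ports and verification tag but not the 4-byte checksum that follows them. The code therefore only asks skb_make_writable() for the bytes it actually has, rewrites the ports, and returns early before touching the checksum when the full header is not there. A self-contained user-space sketch of that decision; the struct layout mirrors sctphdr, everything else is illustrative:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Same layout as sctphdr: ports and vtag in the first 8 bytes,
     * checksum beyond them.
     */
    struct sctp_hdr_sketch {
            uint16_t source;
            uint16_t dest;
            uint32_t vtag;
            uint32_t checksum;
    };

    static bool nat_sctp_ports(uint8_t *pkt, size_t pkt_len, size_t hdroff,
                               uint16_t new_src, uint16_t new_dst)
    {
            size_t hdrsize = 8;             /* ICMP quotes at least this much */
            struct sctp_hdr_sketch hdr;

            if (pkt_len >= hdroff + sizeof(hdr))
                    hdrsize = sizeof(hdr);  /* full header is present */

            if (pkt_len < hdroff + hdrsize)
                    return false;           /* like skb_make_writable() failing */

            memcpy(&hdr, pkt + hdroff, hdrsize);
            hdr.source = new_src;
            hdr.dest = new_dst;
            memcpy(pkt + hdroff, &hdr, hdrsize);

            if (hdrsize < sizeof(hdr))
                    return true;            /* truncated inner header: leave checksum alone */

            /* full header: a real implementation would recompute hdr.checksum here */
            return true;
    }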
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index b1e627227b6e..edd4a77dc09a 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -30,20 +30,15 @@ udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
30 &udp_port_rover); 30 &udp_port_rover);
31} 31}
32 32
33static bool 33static void
34udp_manip_pkt(struct sk_buff *skb, 34__udp_manip_pkt(struct sk_buff *skb,
35 const struct nf_nat_l3proto *l3proto, 35 const struct nf_nat_l3proto *l3proto,
36 unsigned int iphdroff, unsigned int hdroff, 36 unsigned int iphdroff, struct udphdr *hdr,
37 const struct nf_conntrack_tuple *tuple, 37 const struct nf_conntrack_tuple *tuple,
38 enum nf_nat_manip_type maniptype) 38 enum nf_nat_manip_type maniptype, bool do_csum)
39{ 39{
40 struct udphdr *hdr;
41 __be16 *portptr, newport; 40 __be16 *portptr, newport;
42 41
43 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
44 return false;
45 hdr = (struct udphdr *)(skb->data + hdroff);
46
47 if (maniptype == NF_NAT_MANIP_SRC) { 42 if (maniptype == NF_NAT_MANIP_SRC) {
48 /* Get rid of src port */ 43 /* Get rid of src port */
49 newport = tuple->src.u.udp.port; 44 newport = tuple->src.u.udp.port;
@@ -53,7 +48,7 @@ udp_manip_pkt(struct sk_buff *skb,
53 newport = tuple->dst.u.udp.port; 48 newport = tuple->dst.u.udp.port;
54 portptr = &hdr->dest; 49 portptr = &hdr->dest;
55 } 50 }
56 if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { 51 if (do_csum) {
57 l3proto->csum_update(skb, iphdroff, &hdr->check, 52 l3proto->csum_update(skb, iphdroff, &hdr->check,
58 tuple, maniptype); 53 tuple, maniptype);
59 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 54 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
@@ -62,9 +57,68 @@ udp_manip_pkt(struct sk_buff *skb,
62 hdr->check = CSUM_MANGLED_0; 57 hdr->check = CSUM_MANGLED_0;
63 } 58 }
64 *portptr = newport; 59 *portptr = newport;
60}
61
62static bool udp_manip_pkt(struct sk_buff *skb,
63 const struct nf_nat_l3proto *l3proto,
64 unsigned int iphdroff, unsigned int hdroff,
65 const struct nf_conntrack_tuple *tuple,
66 enum nf_nat_manip_type maniptype)
67{
68 struct udphdr *hdr;
69 bool do_csum;
70
71 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
72 return false;
73
74 hdr = (struct udphdr *)(skb->data + hdroff);
75 do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL;
76
77 __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, do_csum);
78 return true;
79}
80
81#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
82static u16 udplite_port_rover;
83
84static bool udplite_manip_pkt(struct sk_buff *skb,
85 const struct nf_nat_l3proto *l3proto,
86 unsigned int iphdroff, unsigned int hdroff,
87 const struct nf_conntrack_tuple *tuple,
88 enum nf_nat_manip_type maniptype)
89{
90 struct udphdr *hdr;
91
92 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
93 return false;
94
95 hdr = (struct udphdr *)(skb->data + hdroff);
96 __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true);
65 return true; 97 return true;
66} 98}
67 99
100static void
101udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
102 struct nf_conntrack_tuple *tuple,
103 const struct nf_nat_range *range,
104 enum nf_nat_manip_type maniptype,
105 const struct nf_conn *ct)
106{
107 nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
108 &udplite_port_rover);
109}
110
111const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
112 .l4proto = IPPROTO_UDPLITE,
113 .manip_pkt = udplite_manip_pkt,
114 .in_range = nf_nat_l4proto_in_range,
115 .unique_tuple = udplite_unique_tuple,
116#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
117 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
118#endif
119};
120#endif /* CONFIG_NF_NAT_PROTO_UDPLITE */
121
68const struct nf_nat_l4proto nf_nat_l4proto_udp = { 122const struct nf_nat_l4proto nf_nat_l4proto_udp = {
69 .l4proto = IPPROTO_UDP, 123 .l4proto = IPPROTO_UDP,
70 .manip_pkt = udp_manip_pkt, 124 .manip_pkt = udp_manip_pkt,
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
deleted file mode 100644
index 366bfbfd82a1..000000000000
--- a/net/netfilter/nf_nat_proto_udplite.c
+++ /dev/null
@@ -1,73 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2008 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/udp.h>
12
13#include <linux/netfilter.h>
14#include <net/netfilter/nf_nat.h>
15#include <net/netfilter/nf_nat_l3proto.h>
16#include <net/netfilter/nf_nat_l4proto.h>
17
18static u16 udplite_port_rover;
19
20static void
21udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
22 struct nf_conntrack_tuple *tuple,
23 const struct nf_nat_range *range,
24 enum nf_nat_manip_type maniptype,
25 const struct nf_conn *ct)
26{
27 nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
28 &udplite_port_rover);
29}
30
31static bool
32udplite_manip_pkt(struct sk_buff *skb,
33 const struct nf_nat_l3proto *l3proto,
34 unsigned int iphdroff, unsigned int hdroff,
35 const struct nf_conntrack_tuple *tuple,
36 enum nf_nat_manip_type maniptype)
37{
38 struct udphdr *hdr;
39 __be16 *portptr, newport;
40
41 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
42 return false;
43
44 hdr = (struct udphdr *)(skb->data + hdroff);
45
46 if (maniptype == NF_NAT_MANIP_SRC) {
47 /* Get rid of source port */
48 newport = tuple->src.u.udp.port;
49 portptr = &hdr->source;
50 } else {
51 /* Get rid of dst port */
52 newport = tuple->dst.u.udp.port;
53 portptr = &hdr->dest;
54 }
55
56 l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
57 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false);
58 if (!hdr->check)
59 hdr->check = CSUM_MANGLED_0;
60
61 *portptr = newport;
62 return true;
63}
64
65const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
66 .l4proto = IPPROTO_UDPLITE,
67 .manip_pkt = udplite_manip_pkt,
68 .in_range = nf_nat_l4proto_in_range,
69 .unique_tuple = udplite_unique_tuple,
70#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
71 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
72#endif
73};
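Taken together, the nf_nat_proto_udp.c rewrite above and the removal of nf_nat_proto_udplite.c fold the two NAT port-mangling routines into one: __udp_manip_pkt() carries the shared body, and the only behavioural difference is the do_csum flag. Plain UDP may carry an all-zero "no checksum" field and then skips the update unless the skb is CHECKSUM_PARTIAL; UDP-Lite always updates it. The UDP-Lite l4proto definition now lives behind CONFIG_NF_NAT_PROTO_UDPLITE in the UDP file, so the separate source file can go. A small user-space sketch of the refactoring shape, with illustrative names:

    #include <stdbool.h>
    #include <stdint.h>

    struct phdr { uint16_t port; uint16_t csum; };

    static uint16_t recompute_csum(const struct phdr *h)
    {
            return (uint16_t)~h->port;      /* stand-in for the real checksum update */
    }

    static void __rewrite_port(struct phdr *h, uint16_t newport, bool do_csum)
    {
            h->port = newport;
            if (do_csum)
                    h->csum = recompute_csum(h);
    }

    static void rewrite_udp(struct phdr *h, uint16_t newport)
    {
            /* zero checksum means "none" for UDP, so leave it untouched */
            __rewrite_port(h, newport, h->csum != 0);
    }

    static void rewrite_udplite(struct phdr *h, uint16_t newport)
    {
            /* the UDP-Lite checksum is mandatory, always update */
            __rewrite_port(h, newport, true);
    }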
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index d43869879fcf..86067560a318 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -101,11 +101,13 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
101 rcu_read_lock(); 101 rcu_read_lock();
102 idev = __in6_dev_get(skb->dev); 102 idev = __in6_dev_get(skb->dev);
103 if (idev != NULL) { 103 if (idev != NULL) {
104 read_lock_bh(&idev->lock);
104 list_for_each_entry(ifa, &idev->addr_list, if_list) { 105 list_for_each_entry(ifa, &idev->addr_list, if_list) {
105 newdst = ifa->addr; 106 newdst = ifa->addr;
106 addr = true; 107 addr = true;
107 break; 108 break;
108 } 109 }
110 read_unlock_bh(&idev->lock);
109 } 111 }
110 rcu_read_unlock(); 112 rcu_read_unlock();
111 113
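The nf_nat_redirect_ipv6() change is purely a locking fix: RCU pins the inet6_dev returned by __in6_dev_get(), but idev->addr_list is modified under idev->lock, so the walk that picks the first address now takes read_lock_bh()/read_unlock_bh() around it. Reconstructed from the hunk, the lookup now reads:

    rcu_read_lock();
    idev = __in6_dev_get(skb->dev);
    if (idev != NULL) {
            read_lock_bh(&idev->lock);
            list_for_each_entry(ifa, &idev->addr_list, if_list) {
                    newdst = ifa->addr;
                    addr = true;
                    break;
            }
            read_unlock_bh(&idev->lock);
    }
    rcu_read_unlock();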
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 1b913760f205..434c739dfeca 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -240,6 +240,10 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
240 if (trans == NULL) 240 if (trans == NULL)
241 return NULL; 241 return NULL;
242 242
243 if (msg_type == NFT_MSG_NEWRULE && ctx->nla[NFTA_RULE_ID] != NULL) {
244 nft_trans_rule_id(trans) =
245 ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID]));
246 }
243 nft_trans_rule(trans) = rule; 247 nft_trans_rule(trans) = rule;
244 list_add_tail(&trans->list, &ctx->net->nft.commit_list); 248 list_add_tail(&trans->list, &ctx->net->nft.commit_list);
245 249
@@ -457,16 +461,15 @@ nla_put_failure:
457 return -1; 461 return -1;
458} 462}
459 463
460static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) 464static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
461{ 465{
462 struct sk_buff *skb; 466 struct sk_buff *skb;
463 int err; 467 int err;
464 468
465 if (!ctx->report && 469 if (!ctx->report &&
466 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) 470 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
467 return 0; 471 return;
468 472
469 err = -ENOBUFS;
470 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 473 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
471 if (skb == NULL) 474 if (skb == NULL)
472 goto err; 475 goto err;
@@ -478,14 +481,11 @@ static int nf_tables_table_notify(const struct nft_ctx *ctx, int event)
478 goto err; 481 goto err;
479 } 482 }
480 483
481 err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, 484 nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
482 ctx->report, GFP_KERNEL); 485 ctx->report, GFP_KERNEL);
486 return;
483err: 487err:
484 if (err < 0) { 488 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
485 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
486 err);
487 }
488 return err;
489} 489}
490 490
491static int nf_tables_dump_tables(struct sk_buff *skb, 491static int nf_tables_dump_tables(struct sk_buff *skb,
@@ -576,6 +576,28 @@ err:
576 return err; 576 return err;
577} 577}
578 578
579static void _nf_tables_table_disable(struct net *net,
580 const struct nft_af_info *afi,
581 struct nft_table *table,
582 u32 cnt)
583{
584 struct nft_chain *chain;
585 u32 i = 0;
586
587 list_for_each_entry(chain, &table->chains, list) {
588 if (!nft_is_active_next(net, chain))
589 continue;
590 if (!(chain->flags & NFT_BASE_CHAIN))
591 continue;
592
593 if (cnt && i++ == cnt)
594 break;
595
596 nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
597 afi->nops);
598 }
599}
600
579static int nf_tables_table_enable(struct net *net, 601static int nf_tables_table_enable(struct net *net,
580 const struct nft_af_info *afi, 602 const struct nft_af_info *afi,
581 struct nft_table *table) 603 struct nft_table *table)
@@ -598,18 +620,8 @@ static int nf_tables_table_enable(struct net *net,
598 } 620 }
599 return 0; 621 return 0;
600err: 622err:
601 list_for_each_entry(chain, &table->chains, list) { 623 if (i)
602 if (!nft_is_active_next(net, chain)) 624 _nf_tables_table_disable(net, afi, table, i);
603 continue;
604 if (!(chain->flags & NFT_BASE_CHAIN))
605 continue;
606
607 if (i-- <= 0)
608 break;
609
610 nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
611 afi->nops);
612 }
613 return err; 625 return err;
614} 626}
615 627
@@ -617,17 +629,7 @@ static void nf_tables_table_disable(struct net *net,
617 const struct nft_af_info *afi, 629 const struct nft_af_info *afi,
618 struct nft_table *table) 630 struct nft_table *table)
619{ 631{
620 struct nft_chain *chain; 632 _nf_tables_table_disable(net, afi, table, 0);
621
622 list_for_each_entry(chain, &table->chains, list) {
623 if (!nft_is_active_next(net, chain))
624 continue;
625 if (!(chain->flags & NFT_BASE_CHAIN))
626 continue;
627
628 nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
629 afi->nops);
630 }
631} 633}
632 634
633static int nf_tables_updtable(struct nft_ctx *ctx) 635static int nf_tables_updtable(struct nft_ctx *ctx)
@@ -696,10 +698,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
696 if (IS_ERR(table)) { 698 if (IS_ERR(table)) {
697 if (PTR_ERR(table) != -ENOENT) 699 if (PTR_ERR(table) != -ENOENT)
698 return PTR_ERR(table); 700 return PTR_ERR(table);
699 table = NULL; 701 } else {
700 }
701
702 if (table != NULL) {
703 if (nlh->nlmsg_flags & NLM_F_EXCL) 702 if (nlh->nlmsg_flags & NLM_F_EXCL)
704 return -EEXIST; 703 return -EEXIST;
705 if (nlh->nlmsg_flags & NLM_F_REPLACE) 704 if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -1047,16 +1046,15 @@ nla_put_failure:
1047 return -1; 1046 return -1;
1048} 1047}
1049 1048
1050static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) 1049static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
1051{ 1050{
1052 struct sk_buff *skb; 1051 struct sk_buff *skb;
1053 int err; 1052 int err;
1054 1053
1055 if (!ctx->report && 1054 if (!ctx->report &&
1056 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) 1055 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
1057 return 0; 1056 return;
1058 1057
1059 err = -ENOBUFS;
1060 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 1058 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
1061 if (skb == NULL) 1059 if (skb == NULL)
1062 goto err; 1060 goto err;
@@ -1069,14 +1067,11 @@ static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
1069 goto err; 1067 goto err;
1070 } 1068 }
1071 1069
1072 err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, 1070 nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
1073 ctx->report, GFP_KERNEL); 1071 ctx->report, GFP_KERNEL);
1072 return;
1074err: 1073err:
1075 if (err < 0) { 1074 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
1076 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
1077 err);
1078 }
1079 return err;
1080} 1075}
1081 1076
1082static int nf_tables_dump_chains(struct sk_buff *skb, 1077static int nf_tables_dump_chains(struct sk_buff *skb,
@@ -1931,18 +1926,16 @@ nla_put_failure:
1931 return -1; 1926 return -1;
1932} 1927}
1933 1928
1934static int nf_tables_rule_notify(const struct nft_ctx *ctx, 1929static void nf_tables_rule_notify(const struct nft_ctx *ctx,
1935 const struct nft_rule *rule, 1930 const struct nft_rule *rule, int event)
1936 int event)
1937{ 1931{
1938 struct sk_buff *skb; 1932 struct sk_buff *skb;
1939 int err; 1933 int err;
1940 1934
1941 if (!ctx->report && 1935 if (!ctx->report &&
1942 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) 1936 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
1943 return 0; 1937 return;
1944 1938
1945 err = -ENOBUFS;
1946 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 1939 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
1947 if (skb == NULL) 1940 if (skb == NULL)
1948 goto err; 1941 goto err;
@@ -1955,14 +1948,11 @@ static int nf_tables_rule_notify(const struct nft_ctx *ctx,
1955 goto err; 1948 goto err;
1956 } 1949 }
1957 1950
1958 err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, 1951 nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
1959 ctx->report, GFP_KERNEL); 1952 ctx->report, GFP_KERNEL);
1953 return;
1960err: 1954err:
1961 if (err < 0) { 1955 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
1962 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
1963 err);
1964 }
1965 return err;
1966} 1956}
1967 1957
1968struct nft_rule_dump_ctx { 1958struct nft_rule_dump_ctx {
@@ -2294,6 +2284,22 @@ err1:
2294 return err; 2284 return err;
2295} 2285}
2296 2286
2287static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
2288 const struct nlattr *nla)
2289{
2290 u32 id = ntohl(nla_get_be32(nla));
2291 struct nft_trans *trans;
2292
2293 list_for_each_entry(trans, &net->nft.commit_list, list) {
2294 struct nft_rule *rule = nft_trans_rule(trans);
2295
2296 if (trans->msg_type == NFT_MSG_NEWRULE &&
2297 id == nft_trans_rule_id(trans))
2298 return rule;
2299 }
2300 return ERR_PTR(-ENOENT);
2301}
2302
2297static int nf_tables_delrule(struct net *net, struct sock *nlsk, 2303static int nf_tables_delrule(struct net *net, struct sock *nlsk,
2298 struct sk_buff *skb, const struct nlmsghdr *nlh, 2304 struct sk_buff *skb, const struct nlmsghdr *nlh,
2299 const struct nlattr * const nla[]) 2305 const struct nlattr * const nla[])
@@ -2332,6 +2338,12 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
2332 return PTR_ERR(rule); 2338 return PTR_ERR(rule);
2333 2339
2334 err = nft_delrule(&ctx, rule); 2340 err = nft_delrule(&ctx, rule);
2341 } else if (nla[NFTA_RULE_ID]) {
2342 rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
2343 if (IS_ERR(rule))
2344 return PTR_ERR(rule);
2345
2346 err = nft_delrule(&ctx, rule);
2335 } else { 2347 } else {
2336 err = nft_delrule_by_chain(&ctx); 2348 err = nft_delrule_by_chain(&ctx);
2337 } 2349 }
@@ -2399,12 +2411,14 @@ nft_select_set_ops(const struct nlattr * const nla[],
2399 features = 0; 2411 features = 0;
2400 if (nla[NFTA_SET_FLAGS] != NULL) { 2412 if (nla[NFTA_SET_FLAGS] != NULL) {
2401 features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); 2413 features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
2402 features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT; 2414 features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT |
2415 NFT_SET_OBJECT;
2403 } 2416 }
2404 2417
2405 bops = NULL; 2418 bops = NULL;
2406 best.size = ~0; 2419 best.size = ~0;
2407 best.class = ~0; 2420 best.lookup = ~0;
2421 best.space = ~0;
2408 2422
2409 list_for_each_entry(ops, &nf_tables_set_ops, list) { 2423 list_for_each_entry(ops, &nf_tables_set_ops, list) {
2410 if ((ops->features & features) != features) 2424 if ((ops->features & features) != features)
@@ -2414,16 +2428,27 @@ nft_select_set_ops(const struct nlattr * const nla[],
2414 2428
2415 switch (policy) { 2429 switch (policy) {
2416 case NFT_SET_POL_PERFORMANCE: 2430 case NFT_SET_POL_PERFORMANCE:
2417 if (est.class < best.class) 2431 if (est.lookup < best.lookup)
2418 break;
2419 if (est.class == best.class && est.size < best.size)
2420 break; 2432 break;
2433 if (est.lookup == best.lookup) {
2434 if (!desc->size) {
2435 if (est.space < best.space)
2436 break;
2437 } else if (est.size < best.size) {
2438 break;
2439 }
2440 }
2421 continue; 2441 continue;
2422 case NFT_SET_POL_MEMORY: 2442 case NFT_SET_POL_MEMORY:
2423 if (est.size < best.size) 2443 if (!desc->size) {
2424 break; 2444 if (est.space < best.space)
2425 if (est.size == best.size && est.class < best.class) 2445 break;
2446 if (est.space == best.space &&
2447 est.lookup < best.lookup)
2448 break;
2449 } else if (est.size < best.size) {
2426 break; 2450 break;
2451 }
2427 continue; 2452 continue;
2428 default: 2453 default:
2429 break; 2454 break;
@@ -2658,9 +2683,9 @@ nla_put_failure:
2658 return -1; 2683 return -1;
2659} 2684}
2660 2685
2661static int nf_tables_set_notify(const struct nft_ctx *ctx, 2686static void nf_tables_set_notify(const struct nft_ctx *ctx,
2662 const struct nft_set *set, 2687 const struct nft_set *set, int event,
2663 int event, gfp_t gfp_flags) 2688 gfp_t gfp_flags)
2664{ 2689{
2665 struct sk_buff *skb; 2690 struct sk_buff *skb;
2666 u32 portid = ctx->portid; 2691 u32 portid = ctx->portid;
@@ -2668,9 +2693,8 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx,
2668 2693
2669 if (!ctx->report && 2694 if (!ctx->report &&
2670 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) 2695 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
2671 return 0; 2696 return;
2672 2697
2673 err = -ENOBUFS;
2674 skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); 2698 skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags);
2675 if (skb == NULL) 2699 if (skb == NULL)
2676 goto err; 2700 goto err;
@@ -2681,12 +2705,11 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx,
2681 goto err; 2705 goto err;
2682 } 2706 }
2683 2707
2684 err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, 2708 nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report,
2685 ctx->report, gfp_flags); 2709 gfp_flags);
2710 return;
2686err: 2711err:
2687 if (err < 0) 2712 nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
2688 nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err);
2689 return err;
2690} 2713}
2691 2714
2692static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) 2715static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
@@ -2966,10 +2989,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
2966 if (IS_ERR(set)) { 2989 if (IS_ERR(set)) {
2967 if (PTR_ERR(set) != -ENOENT) 2990 if (PTR_ERR(set) != -ENOENT)
2968 return PTR_ERR(set); 2991 return PTR_ERR(set);
2969 set = NULL; 2992 } else {
2970 }
2971
2972 if (set != NULL) {
2973 if (nlh->nlmsg_flags & NLM_F_EXCL) 2993 if (nlh->nlmsg_flags & NLM_F_EXCL)
2974 return -EEXIST; 2994 return -EEXIST;
2975 if (nlh->nlmsg_flags & NLM_F_REPLACE) 2995 if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -3467,10 +3487,10 @@ nla_put_failure:
3467 return -1; 3487 return -1;
3468} 3488}
3469 3489
3470static int nf_tables_setelem_notify(const struct nft_ctx *ctx, 3490static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
3471 const struct nft_set *set, 3491 const struct nft_set *set,
3472 const struct nft_set_elem *elem, 3492 const struct nft_set_elem *elem,
3473 int event, u16 flags) 3493 int event, u16 flags)
3474{ 3494{
3475 struct net *net = ctx->net; 3495 struct net *net = ctx->net;
3476 u32 portid = ctx->portid; 3496 u32 portid = ctx->portid;
@@ -3478,9 +3498,8 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx,
3478 int err; 3498 int err;
3479 3499
3480 if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) 3500 if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
3481 return 0; 3501 return;
3482 3502
3483 err = -ENOBUFS;
3484 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 3503 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
3485 if (skb == NULL) 3504 if (skb == NULL)
3486 goto err; 3505 goto err;
@@ -3492,12 +3511,11 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx,
3492 goto err; 3511 goto err;
3493 } 3512 }
3494 3513
3495 err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, 3514 nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
3496 GFP_KERNEL); 3515 GFP_KERNEL);
3516 return;
3497err: 3517err:
3498 if (err < 0) 3518 nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
3499 nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
3500 return err;
3501} 3519}
3502 3520
3503static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, 3521static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
@@ -3756,7 +3774,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3756 return 0; 3774 return 0;
3757 3775
3758err6: 3776err6:
3759 set->ops->remove(set, &elem); 3777 set->ops->remove(ctx->net, set, &elem);
3760err5: 3778err5:
3761 kfree(trans); 3779 kfree(trans);
3762err4: 3780err4:
@@ -3902,7 +3920,7 @@ static int nft_flush_set(const struct nft_ctx *ctx,
3902 if (!trans) 3920 if (!trans)
3903 return -ENOMEM; 3921 return -ENOMEM;
3904 3922
3905 if (!set->ops->deactivate_one(ctx->net, set, elem->priv)) { 3923 if (!set->ops->flush(ctx->net, set, elem->priv)) {
3906 err = -ENOENT; 3924 err = -ENOENT;
3907 goto err1; 3925 goto err1;
3908 } 3926 }
@@ -3940,15 +3958,13 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
3940 return -EBUSY; 3958 return -EBUSY;
3941 3959
3942 if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) { 3960 if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) {
3943 struct nft_set_dump_args args = { 3961 struct nft_set_iter iter = {
3944 .iter = { 3962 .genmask = genmask,
3945 .genmask = genmask, 3963 .fn = nft_flush_set,
3946 .fn = nft_flush_set,
3947 },
3948 }; 3964 };
3949 set->ops->walk(&ctx, set, &args.iter); 3965 set->ops->walk(&ctx, set, &iter);
3950 3966
3951 return args.iter.err; 3967 return iter.err;
3952 } 3968 }
3953 3969
3954 nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { 3970 nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
@@ -4163,10 +4179,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
4163 if (err != -ENOENT) 4179 if (err != -ENOENT)
4164 return err; 4180 return err;
4165 4181
4166 obj = NULL; 4182 } else {
4167 }
4168
4169 if (obj != NULL) {
4170 if (nlh->nlmsg_flags & NLM_F_EXCL) 4183 if (nlh->nlmsg_flags & NLM_F_EXCL)
4171 return -EEXIST; 4184 return -EEXIST;
4172 4185
@@ -4443,18 +4456,17 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
4443 return nft_delobj(&ctx, obj); 4456 return nft_delobj(&ctx, obj);
4444} 4457}
4445 4458
4446int nft_obj_notify(struct net *net, struct nft_table *table, 4459void nft_obj_notify(struct net *net, struct nft_table *table,
4447 struct nft_object *obj, u32 portid, u32 seq, int event, 4460 struct nft_object *obj, u32 portid, u32 seq, int event,
4448 int family, int report, gfp_t gfp) 4461 int family, int report, gfp_t gfp)
4449{ 4462{
4450 struct sk_buff *skb; 4463 struct sk_buff *skb;
4451 int err; 4464 int err;
4452 4465
4453 if (!report && 4466 if (!report &&
4454 !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) 4467 !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
4455 return 0; 4468 return;
4456 4469
4457 err = -ENOBUFS;
4458 skb = nlmsg_new(NLMSG_GOODSIZE, gfp); 4470 skb = nlmsg_new(NLMSG_GOODSIZE, gfp);
4459 if (skb == NULL) 4471 if (skb == NULL)
4460 goto err; 4472 goto err;
@@ -4466,21 +4478,18 @@ int nft_obj_notify(struct net *net, struct nft_table *table,
4466 goto err; 4478 goto err;
4467 } 4479 }
4468 4480
4469 err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp); 4481 nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp);
4482 return;
4470err: 4483err:
4471 if (err < 0) { 4484 nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
4472 nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
4473 }
4474 return err;
4475} 4485}
4476EXPORT_SYMBOL_GPL(nft_obj_notify); 4486EXPORT_SYMBOL_GPL(nft_obj_notify);
4477 4487
4478static int nf_tables_obj_notify(const struct nft_ctx *ctx, 4488static void nf_tables_obj_notify(const struct nft_ctx *ctx,
4479 struct nft_object *obj, int event) 4489 struct nft_object *obj, int event)
4480{ 4490{
4481 return nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, 4491 nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event,
4482 ctx->seq, event, ctx->afi->family, ctx->report, 4492 ctx->afi->family, ctx->report, GFP_KERNEL);
4483 GFP_KERNEL);
4484} 4493}
4485 4494
4486static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, 4495static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
@@ -4510,7 +4519,8 @@ nla_put_failure:
4510 return -EMSGSIZE; 4519 return -EMSGSIZE;
4511} 4520}
4512 4521
4513static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) 4522static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
4523 int event)
4514{ 4524{
4515 struct nlmsghdr *nlh = nlmsg_hdr(skb); 4525 struct nlmsghdr *nlh = nlmsg_hdr(skb);
4516 struct sk_buff *skb2; 4526 struct sk_buff *skb2;
@@ -4518,9 +4528,8 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event)
4518 4528
4519 if (nlmsg_report(nlh) && 4529 if (nlmsg_report(nlh) &&
4520 !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) 4530 !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
4521 return 0; 4531 return;
4522 4532
4523 err = -ENOBUFS;
4524 skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 4533 skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
4525 if (skb2 == NULL) 4534 if (skb2 == NULL)
4526 goto err; 4535 goto err;
@@ -4532,14 +4541,12 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event)
4532 goto err; 4541 goto err;
4533 } 4542 }
4534 4543
4535 err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, 4544 nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES,
4536 NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); 4545 nlmsg_report(nlh), GFP_KERNEL);
4546 return;
4537err: 4547err:
4538 if (err < 0) { 4548 nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES,
4539 nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, 4549 -ENOBUFS);
4540 err);
4541 }
4542 return err;
4543} 4550}
4544 4551
4545static int nf_tables_getgen(struct net *net, struct sock *nlsk, 4552static int nf_tables_getgen(struct net *net, struct sock *nlsk,
@@ -4811,7 +4818,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
4811 nf_tables_setelem_notify(&trans->ctx, te->set, 4818 nf_tables_setelem_notify(&trans->ctx, te->set,
4812 &te->elem, 4819 &te->elem,
4813 NFT_MSG_DELSETELEM, 0); 4820 NFT_MSG_DELSETELEM, 0);
4814 te->set->ops->remove(te->set, &te->elem); 4821 te->set->ops->remove(net, te->set, &te->elem);
4815 atomic_dec(&te->set->nelems); 4822 atomic_dec(&te->set->nelems);
4816 te->set->ndeact--; 4823 te->set->ndeact--;
4817 break; 4824 break;
@@ -4932,7 +4939,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
4932 case NFT_MSG_NEWSETELEM: 4939 case NFT_MSG_NEWSETELEM:
4933 te = (struct nft_trans_elem *)trans->data; 4940 te = (struct nft_trans_elem *)trans->data;
4934 4941
4935 te->set->ops->remove(te->set, &te->elem); 4942 te->set->ops->remove(net, te->set, &te->elem);
4936 atomic_dec(&te->set->nelems); 4943 atomic_dec(&te->set->nelems);
4937 break; 4944 break;
4938 case NFT_MSG_DELSETELEM: 4945 case NFT_MSG_DELSETELEM:
@@ -4966,6 +4973,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
4966 return 0; 4973 return 0;
4967} 4974}
4968 4975
4976static bool nf_tables_valid_genid(struct net *net, u32 genid)
4977{
4978 return net->nft.base_seq == genid;
4979}
4980
4969static const struct nfnetlink_subsystem nf_tables_subsys = { 4981static const struct nfnetlink_subsystem nf_tables_subsys = {
4970 .name = "nf_tables", 4982 .name = "nf_tables",
4971 .subsys_id = NFNL_SUBSYS_NFTABLES, 4983 .subsys_id = NFNL_SUBSYS_NFTABLES,
@@ -4973,6 +4985,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
4973 .cb = nf_tables_cb, 4985 .cb = nf_tables_cb,
4974 .commit = nf_tables_commit, 4986 .commit = nf_tables_commit,
4975 .abort = nf_tables_abort, 4987 .abort = nf_tables_abort,
4988 .valid_genid = nf_tables_valid_genid,
4976}; 4989};
4977 4990
4978int nft_chain_validate_dependency(const struct nft_chain *chain, 4991int nft_chain_validate_dependency(const struct nft_chain *chain,
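Several independent changes land in nf_tables_api.c here: the *_notify() helpers become void and report allocation or build failures to listeners via nfnetlink_set_err(..., -ENOBUFS); set backend selection now weighs separate lookup and space estimates instead of a single class; and rules created earlier in a still-uncommitted batch can be referenced by NFTA_RULE_ID, which nft_rule_lookup_byid() resolves by scanning the pending transaction list for a NEWRULE entry carrying the same id. A user-space sketch of that last lookup pattern; all names are illustrative, and the kernel returns ERR_PTR(-ENOENT) where this returns NULL:

    #include <stdint.h>
    #include <stdio.h>

    struct trans {
            struct trans *next;
            int is_new;         /* stands in for trans->msg_type == NFT_MSG_NEWRULE */
            uint32_t id;        /* stands in for nft_trans_rule_id(trans)           */
            const char *obj;
    };

    static const char *lookup_byid(const struct trans *pending, uint32_t id)
    {
            for (const struct trans *t = pending; t; t = t->next)
                    if (t->is_new && t->id == id)
                            return t->obj;
            return NULL;
    }

    int main(void)
    {
            struct trans newer = { NULL,   1, 7, "rule created later in the batch" };
            struct trans older = { &newer, 1, 3, "rule created first" };

            puts(lookup_byid(&older, 7));   /* finds the second pending rule */
            return 0;
    }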
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index a09fa9fd8f3d..68eda920160e 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * (C) 2001 by Jay Schulist <jschlst@samba.org>, 4 * (C) 2001 by Jay Schulist <jschlst@samba.org>,
5 * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> 5 * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
6 * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org> 6 * (C) 2005-2017 by Pablo Neira Ayuso <pablo@netfilter.org>
7 * 7 *
8 * Initial netfilter messages via netlink development funded and 8 * Initial netfilter messages via netlink development funded and
9 * generally made possible by Network Robots, Inc. (www.networkrobots.com) 9 * generally made possible by Network Robots, Inc. (www.networkrobots.com)
@@ -100,9 +100,9 @@ int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
100} 100}
101EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); 101EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
102 102
103static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) 103static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u16 type)
104{ 104{
105 u_int8_t subsys_id = NFNL_SUBSYS_ID(type); 105 u8 subsys_id = NFNL_SUBSYS_ID(type);
106 106
107 if (subsys_id >= NFNL_SUBSYS_COUNT) 107 if (subsys_id >= NFNL_SUBSYS_COUNT)
108 return NULL; 108 return NULL;
@@ -111,9 +111,9 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t
111} 111}
112 112
113static inline const struct nfnl_callback * 113static inline const struct nfnl_callback *
114nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss) 114nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss)
115{ 115{
116 u_int8_t cb_id = NFNL_MSG_TYPE(type); 116 u8 cb_id = NFNL_MSG_TYPE(type);
117 117
118 if (cb_id >= ss->cb_count) 118 if (cb_id >= ss->cb_count)
119 return NULL; 119 return NULL;
@@ -185,7 +185,7 @@ replay:
185 185
186 { 186 {
187 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); 187 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
188 u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); 188 u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
189 struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; 189 struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
190 struct nlattr *attr = (void *)nlh + min_len; 190 struct nlattr *attr = (void *)nlh + min_len;
191 int attrlen = nlh->nlmsg_len - min_len; 191 int attrlen = nlh->nlmsg_len - min_len;
@@ -273,13 +273,13 @@ enum {
273}; 273};
274 274
275static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, 275static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
276 u_int16_t subsys_id) 276 u16 subsys_id, u32 genid)
277{ 277{
278 struct sk_buff *oskb = skb; 278 struct sk_buff *oskb = skb;
279 struct net *net = sock_net(skb->sk); 279 struct net *net = sock_net(skb->sk);
280 const struct nfnetlink_subsystem *ss; 280 const struct nfnetlink_subsystem *ss;
281 const struct nfnl_callback *nc; 281 const struct nfnl_callback *nc;
282 static LIST_HEAD(err_list); 282 LIST_HEAD(err_list);
283 u32 status; 283 u32 status;
284 int err; 284 int err;
285 285
@@ -315,6 +315,12 @@ replay:
315 return kfree_skb(skb); 315 return kfree_skb(skb);
316 } 316 }
317 317
318 if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) {
319 nfnl_unlock(subsys_id);
320 netlink_ack(oskb, nlh, -ERESTART);
321 return kfree_skb(skb);
322 }
323
318 while (skb->len >= nlmsg_total_size(0)) { 324 while (skb->len >= nlmsg_total_size(0)) {
319 int msglen, type; 325 int msglen, type;
320 326
@@ -365,7 +371,7 @@ replay:
365 371
366 { 372 {
367 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); 373 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
368 u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); 374 u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
369 struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; 375 struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
370 struct nlattr *attr = (void *)nlh + min_len; 376 struct nlattr *attr = (void *)nlh + min_len;
371 int attrlen = nlh->nlmsg_len - min_len; 377 int attrlen = nlh->nlmsg_len - min_len;
@@ -436,11 +442,51 @@ done:
436 kfree_skb(skb); 442 kfree_skb(skb);
437} 443}
438 444
445static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
446 [NFNL_BATCH_GENID] = { .type = NLA_U32 },
447};
448
449static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
450{
451 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
452 struct nlattr *attr = (void *)nlh + min_len;
453 struct nlattr *cda[NFNL_BATCH_MAX + 1];
454 int attrlen = nlh->nlmsg_len - min_len;
455 struct nfgenmsg *nfgenmsg;
456 int msglen, err;
457 u32 gen_id = 0;
458 u16 res_id;
459
460 msglen = NLMSG_ALIGN(nlh->nlmsg_len);
461 if (msglen > skb->len)
462 msglen = skb->len;
463
464 if (nlh->nlmsg_len < NLMSG_HDRLEN ||
465 skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
466 return;
467
468 err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy);
469 if (err < 0) {
470 netlink_ack(skb, nlh, err);
471 return;
472 }
473 if (cda[NFNL_BATCH_GENID])
474 gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID]));
475
476 nfgenmsg = nlmsg_data(nlh);
477 skb_pull(skb, msglen);
478 /* Work around old nft using host byte order */
479 if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
480 res_id = NFNL_SUBSYS_NFTABLES;
481 else
482 res_id = ntohs(nfgenmsg->res_id);
483
484 nfnetlink_rcv_batch(skb, nlh, res_id, gen_id);
485}
486
439static void nfnetlink_rcv(struct sk_buff *skb) 487static void nfnetlink_rcv(struct sk_buff *skb)
440{ 488{
441 struct nlmsghdr *nlh = nlmsg_hdr(skb); 489 struct nlmsghdr *nlh = nlmsg_hdr(skb);
442 u_int16_t res_id;
443 int msglen;
444 490
445 if (nlh->nlmsg_len < NLMSG_HDRLEN || 491 if (nlh->nlmsg_len < NLMSG_HDRLEN ||
446 skb->len < nlh->nlmsg_len) 492 skb->len < nlh->nlmsg_len)
@@ -451,28 +497,10 @@ static void nfnetlink_rcv(struct sk_buff *skb)
451 return; 497 return;
452 } 498 }
453 499
454 if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) { 500 if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN)
455 struct nfgenmsg *nfgenmsg; 501 nfnetlink_rcv_skb_batch(skb, nlh);
456 502 else
457 msglen = NLMSG_ALIGN(nlh->nlmsg_len);
458 if (msglen > skb->len)
459 msglen = skb->len;
460
461 if (nlh->nlmsg_len < NLMSG_HDRLEN ||
462 skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
463 return;
464
465 nfgenmsg = nlmsg_data(nlh);
466 skb_pull(skb, msglen);
467 /* Work around old nft using host byte order */
468 if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
469 res_id = NFNL_SUBSYS_NFTABLES;
470 else
471 res_id = ntohs(nfgenmsg->res_id);
472 nfnetlink_rcv_batch(skb, nlh, res_id);
473 } else {
474 netlink_rcv_skb(skb, &nfnetlink_rcv_msg); 503 netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
475 }
476} 504}
477 505
478#ifdef CONFIG_MODULES 506#ifdef CONFIG_MODULES
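The nfnetlink.c changes split batch parsing into nfnetlink_rcv_skb_batch(), which also extracts an optional NFNL_BATCH_GENID attribute from the batch-begin message. nfnetlink_rcv_batch() then asks the subsystem whether that generation is still current, using the new valid_genid callback that nf_tables implements by comparing against net->nft.base_seq, and rejects the whole batch with -ERESTART if another ruleset change slipped in first, so userspace can refetch and retry. A user-space sketch of the same optimistic, generation-checked commit, with illustrative names:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    #ifndef ERESTART
    #define ERESTART 85         /* value from asm-generic/errno.h on Linux */
    #endif

    struct store { uint32_t genid; int value; };

    /* expected_gen == 0 means "no check requested", like gen_id = 0 above */
    static int apply_batch(struct store *s, uint32_t expected_gen, int new_value)
    {
            if (expected_gen && s->genid != expected_gen)
                    return -ERESTART;       /* caller must refetch and retry */
            s->value = new_value;
            s->genid++;                     /* every committed batch bumps the generation */
            return 0;
    }

    int main(void)
    {
            struct store s = { .genid = 10, .value = 0 };
            uint32_t seen = s.genid;

            apply_batch(&s, 0, 1);          /* someone else commits first */

            if (apply_batch(&s, seen, 2) == -ERESTART)
                    printf("generation moved on, restart the batch\n");
            return 0;
    }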
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 3b79f34b5095..d45558178da5 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -32,6 +32,13 @@ MODULE_LICENSE("GPL");
32MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); 32MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
33MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers"); 33MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers");
34 34
35struct nfnl_cthelper {
36 struct list_head list;
37 struct nf_conntrack_helper helper;
38};
39
40static LIST_HEAD(nfnl_cthelper_list);
41
35static int 42static int
36nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, 43nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
37 struct nf_conn *ct, enum ip_conntrack_info ctinfo) 44 struct nf_conn *ct, enum ip_conntrack_info ctinfo)
@@ -48,7 +55,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
48 if (helper == NULL) 55 if (helper == NULL)
49 return NF_DROP; 56 return NF_DROP;
50 57
51 /* This is an user-space helper not yet configured, skip. */ 58 /* This is a user-space helper not yet configured, skip. */
52 if ((helper->flags & 59 if ((helper->flags &
53 (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) == 60 (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) ==
54 NF_CT_HELPER_F_USERSPACE) 61 NF_CT_HELPER_F_USERSPACE)
@@ -161,6 +168,7 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
161 int i, ret; 168 int i, ret;
162 struct nf_conntrack_expect_policy *expect_policy; 169 struct nf_conntrack_expect_policy *expect_policy;
163 struct nlattr *tb[NFCTH_POLICY_SET_MAX+1]; 170 struct nlattr *tb[NFCTH_POLICY_SET_MAX+1];
171 unsigned int class_max;
164 172
165 ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr, 173 ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
166 nfnl_cthelper_expect_policy_set); 174 nfnl_cthelper_expect_policy_set);
@@ -170,19 +178,18 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
170 if (!tb[NFCTH_POLICY_SET_NUM]) 178 if (!tb[NFCTH_POLICY_SET_NUM])
171 return -EINVAL; 179 return -EINVAL;
172 180
173 helper->expect_class_max = 181 class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
174 ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); 182 if (class_max == 0)
175 183 return -EINVAL;
176 if (helper->expect_class_max != 0 && 184 if (class_max > NF_CT_MAX_EXPECT_CLASSES)
177 helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES)
178 return -EOVERFLOW; 185 return -EOVERFLOW;
179 186
180 expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) * 187 expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) *
181 helper->expect_class_max, GFP_KERNEL); 188 class_max, GFP_KERNEL);
182 if (expect_policy == NULL) 189 if (expect_policy == NULL)
183 return -ENOMEM; 190 return -ENOMEM;
184 191
185 for (i=0; i<helper->expect_class_max; i++) { 192 for (i = 0; i < class_max; i++) {
186 if (!tb[NFCTH_POLICY_SET+i]) 193 if (!tb[NFCTH_POLICY_SET+i])
187 goto err; 194 goto err;
188 195
@@ -191,6 +198,8 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
191 if (ret < 0) 198 if (ret < 0)
192 goto err; 199 goto err;
193 } 200 }
201
202 helper->expect_class_max = class_max - 1;
194 helper->expect_policy = expect_policy; 203 helper->expect_policy = expect_policy;
195 return 0; 204 return 0;
196err: 205err:
@@ -203,18 +212,20 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
203 struct nf_conntrack_tuple *tuple) 212 struct nf_conntrack_tuple *tuple)
204{ 213{
205 struct nf_conntrack_helper *helper; 214 struct nf_conntrack_helper *helper;
215 struct nfnl_cthelper *nfcth;
206 int ret; 216 int ret;
207 217
208 if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN]) 218 if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN])
209 return -EINVAL; 219 return -EINVAL;
210 220
211 helper = kzalloc(sizeof(struct nf_conntrack_helper), GFP_KERNEL); 221 nfcth = kzalloc(sizeof(*nfcth), GFP_KERNEL);
212 if (helper == NULL) 222 if (nfcth == NULL)
213 return -ENOMEM; 223 return -ENOMEM;
224 helper = &nfcth->helper;
214 225
215 ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]); 226 ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]);
216 if (ret < 0) 227 if (ret < 0)
217 goto err; 228 goto err1;
218 229
219 strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN); 230 strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
220 helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); 231 helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
@@ -245,15 +256,101 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
245 256
246 ret = nf_conntrack_helper_register(helper); 257 ret = nf_conntrack_helper_register(helper);
247 if (ret < 0) 258 if (ret < 0)
248 goto err; 259 goto err2;
249 260
261 list_add_tail(&nfcth->list, &nfnl_cthelper_list);
250 return 0; 262 return 0;
251err: 263err2:
252 kfree(helper); 264 kfree(helper->expect_policy);
265err1:
266 kfree(nfcth);
253 return ret; 267 return ret;
254} 268}
255 269
256static int 270static int
271nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy,
272 struct nf_conntrack_expect_policy *new_policy,
273 const struct nlattr *attr)
274{
275 struct nlattr *tb[NFCTH_POLICY_MAX + 1];
276 int err;
277
278 err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr,
279 nfnl_cthelper_expect_pol);
280 if (err < 0)
281 return err;
282
283 if (!tb[NFCTH_POLICY_NAME] ||
284 !tb[NFCTH_POLICY_EXPECT_MAX] ||
285 !tb[NFCTH_POLICY_EXPECT_TIMEOUT])
286 return -EINVAL;
287
288 if (nla_strcmp(tb[NFCTH_POLICY_NAME], policy->name))
289 return -EBUSY;
290
291 new_policy->max_expected =
292 ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
293 new_policy->timeout =
294 ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT]));
295
296 return 0;
297}
298
299static int nfnl_cthelper_update_policy_all(struct nlattr *tb[],
300 struct nf_conntrack_helper *helper)
301{
302 struct nf_conntrack_expect_policy new_policy[helper->expect_class_max + 1];
303 struct nf_conntrack_expect_policy *policy;
304 int i, err;
305
306 /* Check first that all policy attributes are well-formed, so we don't
307 * leave things in inconsistent state on errors.
308 */
309 for (i = 0; i < helper->expect_class_max + 1; i++) {
310
311 if (!tb[NFCTH_POLICY_SET + i])
312 return -EINVAL;
313
314 err = nfnl_cthelper_update_policy_one(&helper->expect_policy[i],
315 &new_policy[i],
316 tb[NFCTH_POLICY_SET + i]);
317 if (err < 0)
318 return err;
319 }
320 /* Now we can safely update them. */
321 for (i = 0; i < helper->expect_class_max + 1; i++) {
322 policy = (struct nf_conntrack_expect_policy *)
323 &helper->expect_policy[i];
324 policy->max_expected = new_policy->max_expected;
325 policy->timeout = new_policy->timeout;
326 }
327
328 return 0;
329}
330
331static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper,
332 const struct nlattr *attr)
333{
334 struct nlattr *tb[NFCTH_POLICY_SET_MAX + 1];
335 unsigned int class_max;
336 int err;
337
338 err = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
339 nfnl_cthelper_expect_policy_set);
340 if (err < 0)
341 return err;
342
343 if (!tb[NFCTH_POLICY_SET_NUM])
344 return -EINVAL;
345
346 class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
347 if (helper->expect_class_max + 1 != class_max)
348 return -EBUSY;
349
350 return nfnl_cthelper_update_policy_all(tb, helper);
351}
352
353static int
257nfnl_cthelper_update(const struct nlattr * const tb[], 354nfnl_cthelper_update(const struct nlattr * const tb[],
258 struct nf_conntrack_helper *helper) 355 struct nf_conntrack_helper *helper)
259{ 356{
@@ -263,8 +360,7 @@ nfnl_cthelper_update(const struct nlattr * const tb[],
263 return -EBUSY; 360 return -EBUSY;
264 361
265 if (tb[NFCTH_POLICY]) { 362 if (tb[NFCTH_POLICY]) {
266 ret = nfnl_cthelper_parse_expect_policy(helper, 363 ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]);
267 tb[NFCTH_POLICY]);
268 if (ret < 0) 364 if (ret < 0)
269 return ret; 365 return ret;
270 } 366 }
@@ -293,7 +389,8 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
293 const char *helper_name; 389 const char *helper_name;
294 struct nf_conntrack_helper *cur, *helper = NULL; 390 struct nf_conntrack_helper *cur, *helper = NULL;
295 struct nf_conntrack_tuple tuple; 391 struct nf_conntrack_tuple tuple;
296 int ret = 0, i; 392 struct nfnl_cthelper *nlcth;
393 int ret = 0;
297 394
298 if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) 395 if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE])
299 return -EINVAL; 396 return -EINVAL;
@@ -304,31 +401,22 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
304 if (ret < 0) 401 if (ret < 0)
305 return ret; 402 return ret;
306 403
307 rcu_read_lock(); 404 list_for_each_entry(nlcth, &nfnl_cthelper_list, list) {
308 for (i = 0; i < nf_ct_helper_hsize && !helper; i++) { 405 cur = &nlcth->helper;
309 hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
310 406
311 /* skip non-userspace conntrack helpers. */ 407 if (strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
312 if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) 408 continue;
313 continue;
314 409
315 if (strncmp(cur->name, helper_name, 410 if ((tuple.src.l3num != cur->tuple.src.l3num ||
316 NF_CT_HELPER_NAME_LEN) != 0) 411 tuple.dst.protonum != cur->tuple.dst.protonum))
317 continue; 412 continue;
318 413
319 if ((tuple.src.l3num != cur->tuple.src.l3num || 414 if (nlh->nlmsg_flags & NLM_F_EXCL)
320 tuple.dst.protonum != cur->tuple.dst.protonum)) 415 return -EEXIST;
321 continue;
322 416
323 if (nlh->nlmsg_flags & NLM_F_EXCL) { 417 helper = cur;
324 ret = -EEXIST; 418 break;
325 goto err;
326 }
327 helper = cur;
328 break;
329 }
330 } 419 }
331 rcu_read_unlock();
332 420
333 if (helper == NULL) 421 if (helper == NULL)
334 ret = nfnl_cthelper_create(tb, &tuple); 422 ret = nfnl_cthelper_create(tb, &tuple);
@@ -336,9 +424,6 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
336 ret = nfnl_cthelper_update(tb, helper); 424 ret = nfnl_cthelper_update(tb, helper);
337 425
338 return ret; 426 return ret;
339err:
340 rcu_read_unlock();
341 return ret;
342} 427}
343 428
344static int 429static int
@@ -377,10 +462,10 @@ nfnl_cthelper_dump_policy(struct sk_buff *skb,
377 goto nla_put_failure; 462 goto nla_put_failure;
378 463
379 if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM, 464 if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM,
380 htonl(helper->expect_class_max))) 465 htonl(helper->expect_class_max + 1)))
381 goto nla_put_failure; 466 goto nla_put_failure;
382 467
383 for (i=0; i<helper->expect_class_max; i++) { 468 for (i = 0; i < helper->expect_class_max + 1; i++) {
384 nest_parms2 = nla_nest_start(skb, 469 nest_parms2 = nla_nest_start(skb,
385 (NFCTH_POLICY_SET+i) | NLA_F_NESTED); 470 (NFCTH_POLICY_SET+i) | NLA_F_NESTED);
386 if (nest_parms2 == NULL) 471 if (nest_parms2 == NULL)
@@ -502,11 +587,12 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
502 struct sk_buff *skb, const struct nlmsghdr *nlh, 587 struct sk_buff *skb, const struct nlmsghdr *nlh,
503 const struct nlattr * const tb[]) 588 const struct nlattr * const tb[])
504{ 589{
505 int ret = -ENOENT, i; 590 int ret = -ENOENT;
506 struct nf_conntrack_helper *cur; 591 struct nf_conntrack_helper *cur;
507 struct sk_buff *skb2; 592 struct sk_buff *skb2;
508 char *helper_name = NULL; 593 char *helper_name = NULL;
509 struct nf_conntrack_tuple tuple; 594 struct nf_conntrack_tuple tuple;
595 struct nfnl_cthelper *nlcth;
510 bool tuple_set = false; 596 bool tuple_set = false;
511 597
512 if (nlh->nlmsg_flags & NLM_F_DUMP) { 598 if (nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -527,45 +613,39 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
527 tuple_set = true; 613 tuple_set = true;
528 } 614 }
529 615
530 for (i = 0; i < nf_ct_helper_hsize; i++) { 616 list_for_each_entry(nlcth, &nfnl_cthelper_list, list) {
531 hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) { 617 cur = &nlcth->helper;
618 if (helper_name &&
619 strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
620 continue;
532 621
533 /* skip non-userspace conntrack helpers. */ 622 if (tuple_set &&
534 if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) 623 (tuple.src.l3num != cur->tuple.src.l3num ||
535 continue; 624 tuple.dst.protonum != cur->tuple.dst.protonum))
625 continue;
536 626
537 if (helper_name && strncmp(cur->name, helper_name, 627 skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
538 NF_CT_HELPER_NAME_LEN) != 0) { 628 if (skb2 == NULL) {
539 continue; 629 ret = -ENOMEM;
540 } 630 break;
541 if (tuple_set && 631 }
542 (tuple.src.l3num != cur->tuple.src.l3num ||
543 tuple.dst.protonum != cur->tuple.dst.protonum))
544 continue;
545
546 skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
547 if (skb2 == NULL) {
548 ret = -ENOMEM;
549 break;
550 }
551 632
552 ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid, 633 ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
553 nlh->nlmsg_seq, 634 nlh->nlmsg_seq,
554 NFNL_MSG_TYPE(nlh->nlmsg_type), 635 NFNL_MSG_TYPE(nlh->nlmsg_type),
555 NFNL_MSG_CTHELPER_NEW, cur); 636 NFNL_MSG_CTHELPER_NEW, cur);
556 if (ret <= 0) { 637 if (ret <= 0) {
557 kfree_skb(skb2); 638 kfree_skb(skb2);
558 break; 639 break;
559 } 640 }
560 641
561 ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, 642 ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
562 MSG_DONTWAIT); 643 MSG_DONTWAIT);
563 if (ret > 0) 644 if (ret > 0)
564 ret = 0; 645 ret = 0;
565 646
566 /* this avoids a loop in nfnetlink. */ 647 /* this avoids a loop in nfnetlink. */
567 return ret == -EAGAIN ? -ENOBUFS : ret; 648 return ret == -EAGAIN ? -ENOBUFS : ret;
568 }
569 } 649 }
570 return ret; 650 return ret;
571} 651}
@@ -576,10 +656,10 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
576{ 656{
577 char *helper_name = NULL; 657 char *helper_name = NULL;
578 struct nf_conntrack_helper *cur; 658 struct nf_conntrack_helper *cur;
579 struct hlist_node *tmp;
580 struct nf_conntrack_tuple tuple; 659 struct nf_conntrack_tuple tuple;
581 bool tuple_set = false, found = false; 660 bool tuple_set = false, found = false;
582 int i, j = 0, ret; 661 struct nfnl_cthelper *nlcth, *n;
662 int j = 0, ret;
583 663
584 if (tb[NFCTH_NAME]) 664 if (tb[NFCTH_NAME])
585 helper_name = nla_data(tb[NFCTH_NAME]); 665 helper_name = nla_data(tb[NFCTH_NAME]);
@@ -592,28 +672,27 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
592 tuple_set = true; 672 tuple_set = true;
593 } 673 }
594 674
595 for (i = 0; i < nf_ct_helper_hsize; i++) { 675 list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) {
596 hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], 676 cur = &nlcth->helper;
597 hnode) { 677 j++;
598 /* skip non-userspace conntrack helpers. */
599 if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
600 continue;
601 678
602 j++; 679 if (helper_name &&
680 strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
681 continue;
603 682
604 if (helper_name && strncmp(cur->name, helper_name, 683 if (tuple_set &&
605 NF_CT_HELPER_NAME_LEN) != 0) { 684 (tuple.src.l3num != cur->tuple.src.l3num ||
606 continue; 685 tuple.dst.protonum != cur->tuple.dst.protonum))
607 } 686 continue;
608 if (tuple_set &&
609 (tuple.src.l3num != cur->tuple.src.l3num ||
610 tuple.dst.protonum != cur->tuple.dst.protonum))
611 continue;
612 687
613 found = true; 688 found = true;
614 nf_conntrack_helper_unregister(cur); 689 nf_conntrack_helper_unregister(cur);
615 } 690 kfree(cur->expect_policy);
691
692 list_del(&nlcth->list);
693 kfree(nlcth);
616 } 694 }
695
617 /* Make sure we return success if we flush and there is no helpers */ 696 /* Make sure we return success if we flush and there is no helpers */
618 return (found || j == 0) ? 0 : -ENOENT; 697 return (found || j == 0) ? 0 : -ENOENT;
619} 698}
@@ -662,20 +741,16 @@ err_out:
662static void __exit nfnl_cthelper_exit(void) 741static void __exit nfnl_cthelper_exit(void)
663{ 742{
664 struct nf_conntrack_helper *cur; 743 struct nf_conntrack_helper *cur;
665 struct hlist_node *tmp; 744 struct nfnl_cthelper *nlcth, *n;
666 int i;
667 745
668 nfnetlink_subsys_unregister(&nfnl_cthelper_subsys); 746 nfnetlink_subsys_unregister(&nfnl_cthelper_subsys);
669 747
670 for (i=0; i<nf_ct_helper_hsize; i++) { 748 list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) {
671 hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], 749 cur = &nlcth->helper;
672 hnode) {
673 /* skip non-userspace conntrack helpers. */
674 if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
675 continue;
676 750
677 nf_conntrack_helper_unregister(cur); 751 nf_conntrack_helper_unregister(cur);
678 } 752 kfree(cur->expect_policy);
753 kfree(nlcth);
679 } 754 }
680} 755}
681 756
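The nfnl_cthelper changes stop walking the global nf_ct_helper_hash for userspace-defined helpers. Each helper created over nfnetlink is now wrapped in a struct nfnl_cthelper and kept on the module's own nfnl_cthelper_list, so NEW, GET, DEL and module exit iterate only objects this module owns, and teardown frees both the expect_policy array and the wrapper allocated at creation time. A user-space sketch of that ownership pattern, with illustrative names:

    #include <stdlib.h>
    #include <string.h>

    struct helper  { char name[16]; int *expect_policy; };
    struct wrapper { struct wrapper *next; struct helper helper; };

    static struct wrapper *owned;           /* stands in for nfnl_cthelper_list */

    static struct helper *helper_create(const char *name, size_t nclasses)
    {
            struct wrapper *w = calloc(1, sizeof(*w));

            if (!w)
                    return NULL;
            w->helper.expect_policy = calloc(nclasses, sizeof(int));
            if (!w->helper.expect_policy) {
                    free(w);
                    return NULL;
            }
            strncpy(w->helper.name, name, sizeof(w->helper.name) - 1);
            w->next = owned;
            owned = w;
            return &w->helper;
    }

    static void helpers_destroy(void)
    {
            while (owned) {
                    struct wrapper *w = owned;

                    owned = w->next;
                    free(w->helper.expect_policy);  /* like kfree(cur->expect_policy) */
                    free(w);                        /* like kfree(nlcth) */
            }
    }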
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 139e0867e56e..47d6656c9119 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -646,8 +646,8 @@ static void __exit cttimeout_exit(void)
646#ifdef CONFIG_NF_CONNTRACK_TIMEOUT 646#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
647 RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); 647 RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
648 RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); 648 RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
649 synchronize_rcu();
649#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ 650#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
650 rcu_barrier();
651} 651}
652 652
653module_init(cttimeout_init); 653module_init(cttimeout_init);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 3ee0b8a000a4..933509ebf3d3 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -443,7 +443,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
443 skb = alloc_skb(size, GFP_ATOMIC); 443 skb = alloc_skb(size, GFP_ATOMIC);
444 if (!skb) { 444 if (!skb) {
445 skb_tx_error(entskb); 445 skb_tx_error(entskb);
446 return NULL; 446 goto nlmsg_failure;
447 } 447 }
448 448
449 nlh = nlmsg_put(skb, 0, 0, 449 nlh = nlmsg_put(skb, 0, 0,
@@ -452,7 +452,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
452 if (!nlh) { 452 if (!nlh) {
453 skb_tx_error(entskb); 453 skb_tx_error(entskb);
454 kfree_skb(skb); 454 kfree_skb(skb);
455 return NULL; 455 goto nlmsg_failure;
456 } 456 }
457 nfmsg = nlmsg_data(nlh); 457 nfmsg = nlmsg_data(nlh);
458 nfmsg->nfgen_family = entry->state.pf; 458 nfmsg->nfgen_family = entry->state.pf;
@@ -598,12 +598,17 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
598 } 598 }
599 599
600 nlh->nlmsg_len = skb->len; 600 nlh->nlmsg_len = skb->len;
601 if (seclen)
602 security_release_secctx(secdata, seclen);
601 return skb; 603 return skb;
602 604
603nla_put_failure: 605nla_put_failure:
604 skb_tx_error(entskb); 606 skb_tx_error(entskb);
605 kfree_skb(skb); 607 kfree_skb(skb);
606 net_err_ratelimited("nf_queue: error creating packet message\n"); 608 net_err_ratelimited("nf_queue: error creating packet message\n");
609nlmsg_failure:
610 if (seclen)
611 security_release_secctx(secdata, seclen);
607 return NULL; 612 return NULL;
608} 613}
609 614
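The nfnetlink_queue hunks route the allocation-failure paths through a new nlmsg_failure label and also release the security context on the success path, so security_release_secctx() runs exactly once on every way out of nfqnl_build_packet_message(). A minimal sketch of that release-on-all-paths shape, in ordinary user-space C with made-up names rather than the kernel function:

#include <stdio.h>
#include <stdlib.h>

/* Once the "security context" has been obtained, it is released exactly
 * once no matter which way the function exits -- the role of the
 * nlmsg_failure label added above. All names here are illustrative. */
static char *build_message(int fail_alloc)
{
        char *secctx = malloc(16);      /* stands in for the secctx */
        char *msg = NULL;

        if (!secctx)
                return NULL;            /* nothing acquired yet */

        if (fail_alloc)
                goto out;               /* failure path still releases it */

        msg = malloc(64);
        if (!msg)
                goto out;
        snprintf(msg, 64, "packet message");
out:
        free(secctx);                   /* runs on success and on failure */
        return msg;
}

int main(void)
{
        char *m = build_message(0);

        printf("%s\n", m ? m : "(failed)");
        free(m);
        return 0;
}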
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index e6baeaebe653..0264258c46fe 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -32,6 +32,11 @@ struct nft_ct {
32 }; 32 };
33}; 33};
34 34
35#ifdef CONFIG_NF_CONNTRACK_ZONES
36static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template);
37static unsigned int nft_ct_pcpu_template_refcnt __read_mostly;
38#endif
39
35static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, 40static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c,
36 enum nft_ct_keys k, 41 enum nft_ct_keys k,
37 enum ip_conntrack_dir d) 42 enum ip_conntrack_dir d)
@@ -78,7 +83,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
78 83
79 switch (priv->key) { 84 switch (priv->key) {
80 case NFT_CT_DIRECTION: 85 case NFT_CT_DIRECTION:
81 *dest = CTINFO2DIR(ctinfo); 86 nft_reg_store8(dest, CTINFO2DIR(ctinfo));
82 return; 87 return;
83 case NFT_CT_STATUS: 88 case NFT_CT_STATUS:
84 *dest = ct->status; 89 *dest = ct->status;
@@ -129,12 +134,42 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
129 memcpy(dest, &count, sizeof(count)); 134 memcpy(dest, &count, sizeof(count));
130 return; 135 return;
131 } 136 }
137 case NFT_CT_AVGPKT: {
138 const struct nf_conn_acct *acct = nf_conn_acct_find(ct);
139 u64 avgcnt = 0, bcnt = 0, pcnt = 0;
140
141 if (acct) {
142 pcnt = nft_ct_get_eval_counter(acct->counter,
143 NFT_CT_PKTS, priv->dir);
144 bcnt = nft_ct_get_eval_counter(acct->counter,
145 NFT_CT_BYTES, priv->dir);
146 if (pcnt != 0)
147 avgcnt = div64_u64(bcnt, pcnt);
148 }
149
150 memcpy(dest, &avgcnt, sizeof(avgcnt));
151 return;
152 }
132 case NFT_CT_L3PROTOCOL: 153 case NFT_CT_L3PROTOCOL:
133 *dest = nf_ct_l3num(ct); 154 nft_reg_store8(dest, nf_ct_l3num(ct));
134 return; 155 return;
135 case NFT_CT_PROTOCOL: 156 case NFT_CT_PROTOCOL:
136 *dest = nf_ct_protonum(ct); 157 nft_reg_store8(dest, nf_ct_protonum(ct));
137 return; 158 return;
159#ifdef CONFIG_NF_CONNTRACK_ZONES
160 case NFT_CT_ZONE: {
161 const struct nf_conntrack_zone *zone = nf_ct_zone(ct);
162 u16 zoneid;
163
164 if (priv->dir < IP_CT_DIR_MAX)
165 zoneid = nf_ct_zone_id(zone, priv->dir);
166 else
167 zoneid = zone->id;
168
169 nft_reg_store16(dest, zoneid);
170 return;
171 }
172#endif
138 default: 173 default:
139 break; 174 break;
140 } 175 }
@@ -150,10 +185,10 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
150 nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); 185 nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
151 return; 186 return;
152 case NFT_CT_PROTO_SRC: 187 case NFT_CT_PROTO_SRC:
153 *dest = (__force __u16)tuple->src.u.all; 188 nft_reg_store16(dest, (__force u16)tuple->src.u.all);
154 return; 189 return;
155 case NFT_CT_PROTO_DST: 190 case NFT_CT_PROTO_DST:
156 *dest = (__force __u16)tuple->dst.u.all; 191 nft_reg_store16(dest, (__force u16)tuple->dst.u.all);
157 return; 192 return;
158 default: 193 default:
159 break; 194 break;
@@ -163,6 +198,53 @@ err:
163 regs->verdict.code = NFT_BREAK; 198 regs->verdict.code = NFT_BREAK;
164} 199}
165 200
201#ifdef CONFIG_NF_CONNTRACK_ZONES
202static void nft_ct_set_zone_eval(const struct nft_expr *expr,
203 struct nft_regs *regs,
204 const struct nft_pktinfo *pkt)
205{
206 struct nf_conntrack_zone zone = { .dir = NF_CT_DEFAULT_ZONE_DIR };
207 const struct nft_ct *priv = nft_expr_priv(expr);
208 struct sk_buff *skb = pkt->skb;
209 enum ip_conntrack_info ctinfo;
210 u16 value = nft_reg_load16(&regs->data[priv->sreg]);
211 struct nf_conn *ct;
212
213 ct = nf_ct_get(skb, &ctinfo);
214 if (ct) /* already tracked */
215 return;
216
217 zone.id = value;
218
219 switch (priv->dir) {
220 case IP_CT_DIR_ORIGINAL:
221 zone.dir = NF_CT_ZONE_DIR_ORIG;
222 break;
223 case IP_CT_DIR_REPLY:
224 zone.dir = NF_CT_ZONE_DIR_REPL;
225 break;
226 default:
227 break;
228 }
229
230 ct = this_cpu_read(nft_ct_pcpu_template);
231
232 if (likely(atomic_read(&ct->ct_general.use) == 1)) {
233 nf_ct_zone_add(ct, &zone);
234 } else {
235 /* previous skb got queued to userspace */
236 ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC);
237 if (!ct) {
238 regs->verdict.code = NF_DROP;
239 return;
240 }
241 }
242
243 atomic_inc(&ct->ct_general.use);
244 nf_ct_set(skb, ct, IP_CT_NEW);
245}
246#endif
247
166static void nft_ct_set_eval(const struct nft_expr *expr, 248static void nft_ct_set_eval(const struct nft_expr *expr,
167 struct nft_regs *regs, 249 struct nft_regs *regs,
168 const struct nft_pktinfo *pkt) 250 const struct nft_pktinfo *pkt)
@@ -241,6 +323,45 @@ static void nft_ct_netns_put(struct net *net, uint8_t family)
241 nf_ct_netns_put(net, family); 323 nf_ct_netns_put(net, family);
242} 324}
243 325
326#ifdef CONFIG_NF_CONNTRACK_ZONES
327static void nft_ct_tmpl_put_pcpu(void)
328{
329 struct nf_conn *ct;
330 int cpu;
331
332 for_each_possible_cpu(cpu) {
333 ct = per_cpu(nft_ct_pcpu_template, cpu);
334 if (!ct)
335 break;
336 nf_ct_put(ct);
337 per_cpu(nft_ct_pcpu_template, cpu) = NULL;
338 }
339}
340
341static bool nft_ct_tmpl_alloc_pcpu(void)
342{
343 struct nf_conntrack_zone zone = { .id = 0 };
344 struct nf_conn *tmp;
345 int cpu;
346
347 if (nft_ct_pcpu_template_refcnt)
348 return true;
349
350 for_each_possible_cpu(cpu) {
351 tmp = nf_ct_tmpl_alloc(&init_net, &zone, GFP_KERNEL);
352 if (!tmp) {
353 nft_ct_tmpl_put_pcpu();
354 return false;
355 }
356
357 atomic_set(&tmp->ct_general.use, 1);
358 per_cpu(nft_ct_pcpu_template, cpu) = tmp;
359 }
360
361 return true;
362}
363#endif
364
244static int nft_ct_get_init(const struct nft_ctx *ctx, 365static int nft_ct_get_init(const struct nft_ctx *ctx,
245 const struct nft_expr *expr, 366 const struct nft_expr *expr,
246 const struct nlattr * const tb[]) 367 const struct nlattr * const tb[])
@@ -250,6 +371,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
250 int err; 371 int err;
251 372
252 priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); 373 priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
374 priv->dir = IP_CT_DIR_MAX;
253 switch (priv->key) { 375 switch (priv->key) {
254 case NFT_CT_DIRECTION: 376 case NFT_CT_DIRECTION:
255 if (tb[NFTA_CT_DIRECTION] != NULL) 377 if (tb[NFTA_CT_DIRECTION] != NULL)
@@ -316,11 +438,14 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
316 break; 438 break;
317 case NFT_CT_BYTES: 439 case NFT_CT_BYTES:
318 case NFT_CT_PKTS: 440 case NFT_CT_PKTS:
319 /* no direction? return sum of original + reply */ 441 case NFT_CT_AVGPKT:
320 if (tb[NFTA_CT_DIRECTION] == NULL)
321 priv->dir = IP_CT_DIR_MAX;
322 len = sizeof(u64); 442 len = sizeof(u64);
323 break; 443 break;
444#ifdef CONFIG_NF_CONNTRACK_ZONES
445 case NFT_CT_ZONE:
446 len = sizeof(u16);
447 break;
448#endif
324 default: 449 default:
325 return -EOPNOTSUPP; 450 return -EOPNOTSUPP;
326 } 451 }
@@ -346,21 +471,41 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
346 if (err < 0) 471 if (err < 0)
347 return err; 472 return err;
348 473
349 if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS) 474 if (priv->key == NFT_CT_BYTES ||
475 priv->key == NFT_CT_PKTS ||
476 priv->key == NFT_CT_AVGPKT)
350 nf_ct_set_acct(ctx->net, true); 477 nf_ct_set_acct(ctx->net, true);
351 478
352 return 0; 479 return 0;
353} 480}
354 481
482static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
483{
484 switch (priv->key) {
485#ifdef CONFIG_NF_CONNTRACK_LABELS
486 case NFT_CT_LABELS:
487 nf_connlabels_put(ctx->net);
488 break;
489#endif
490#ifdef CONFIG_NF_CONNTRACK_ZONES
491 case NFT_CT_ZONE:
492 if (--nft_ct_pcpu_template_refcnt == 0)
493 nft_ct_tmpl_put_pcpu();
494#endif
495 default:
496 break;
497 }
498}
499
355static int nft_ct_set_init(const struct nft_ctx *ctx, 500static int nft_ct_set_init(const struct nft_ctx *ctx,
356 const struct nft_expr *expr, 501 const struct nft_expr *expr,
357 const struct nlattr * const tb[]) 502 const struct nlattr * const tb[])
358{ 503{
359 struct nft_ct *priv = nft_expr_priv(expr); 504 struct nft_ct *priv = nft_expr_priv(expr);
360 bool label_got = false;
361 unsigned int len; 505 unsigned int len;
362 int err; 506 int err;
363 507
508 priv->dir = IP_CT_DIR_MAX;
364 priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); 509 priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
365 switch (priv->key) { 510 switch (priv->key) {
366#ifdef CONFIG_NF_CONNTRACK_MARK 511#ifdef CONFIG_NF_CONNTRACK_MARK
@@ -378,13 +523,32 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
378 err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); 523 err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
379 if (err) 524 if (err)
380 return err; 525 return err;
381 label_got = true; 526 break;
527#endif
528#ifdef CONFIG_NF_CONNTRACK_ZONES
529 case NFT_CT_ZONE:
530 if (!nft_ct_tmpl_alloc_pcpu())
531 return -ENOMEM;
532 nft_ct_pcpu_template_refcnt++;
533 len = sizeof(u16);
382 break; 534 break;
383#endif 535#endif
384 default: 536 default:
385 return -EOPNOTSUPP; 537 return -EOPNOTSUPP;
386 } 538 }
387 539
540 if (tb[NFTA_CT_DIRECTION]) {
541 priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]);
542 switch (priv->dir) {
543 case IP_CT_DIR_ORIGINAL:
544 case IP_CT_DIR_REPLY:
545 break;
546 default:
547 err = -EINVAL;
548 goto err1;
549 }
550 }
551
388 priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]); 552 priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]);
389 err = nft_validate_register_load(priv->sreg, len); 553 err = nft_validate_register_load(priv->sreg, len);
390 if (err < 0) 554 if (err < 0)
@@ -397,8 +561,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
397 return 0; 561 return 0;
398 562
399err1: 563err1:
400 if (label_got) 564 __nft_ct_set_destroy(ctx, priv);
401 nf_connlabels_put(ctx->net);
402 return err; 565 return err;
403} 566}
404 567
@@ -413,16 +576,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
413{ 576{
414 struct nft_ct *priv = nft_expr_priv(expr); 577 struct nft_ct *priv = nft_expr_priv(expr);
415 578
416 switch (priv->key) { 579 __nft_ct_set_destroy(ctx, priv);
417#ifdef CONFIG_NF_CONNTRACK_LABELS
418 case NFT_CT_LABELS:
419 nf_connlabels_put(ctx->net);
420 break;
421#endif
422 default:
423 break;
424 }
425
426 nft_ct_netns_put(ctx->net, ctx->afi->family); 580 nft_ct_netns_put(ctx->net, ctx->afi->family);
427} 581}
428 582
@@ -445,6 +599,8 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
445 break; 599 break;
446 case NFT_CT_BYTES: 600 case NFT_CT_BYTES:
447 case NFT_CT_PKTS: 601 case NFT_CT_PKTS:
602 case NFT_CT_AVGPKT:
603 case NFT_CT_ZONE:
448 if (priv->dir < IP_CT_DIR_MAX && 604 if (priv->dir < IP_CT_DIR_MAX &&
449 nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) 605 nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
450 goto nla_put_failure; 606 goto nla_put_failure;
@@ -467,6 +623,17 @@ static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
467 goto nla_put_failure; 623 goto nla_put_failure;
468 if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) 624 if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
469 goto nla_put_failure; 625 goto nla_put_failure;
626
627 switch (priv->key) {
628 case NFT_CT_ZONE:
629 if (priv->dir < IP_CT_DIR_MAX &&
630 nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
631 goto nla_put_failure;
632 break;
633 default:
634 break;
635 }
636
470 return 0; 637 return 0;
471 638
472nla_put_failure: 639nla_put_failure:
@@ -492,6 +659,17 @@ static const struct nft_expr_ops nft_ct_set_ops = {
492 .dump = nft_ct_set_dump, 659 .dump = nft_ct_set_dump,
493}; 660};
494 661
662#ifdef CONFIG_NF_CONNTRACK_ZONES
663static const struct nft_expr_ops nft_ct_set_zone_ops = {
664 .type = &nft_ct_type,
665 .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
666 .eval = nft_ct_set_zone_eval,
667 .init = nft_ct_set_init,
668 .destroy = nft_ct_set_destroy,
669 .dump = nft_ct_set_dump,
670};
671#endif
672
495static const struct nft_expr_ops * 673static const struct nft_expr_ops *
496nft_ct_select_ops(const struct nft_ctx *ctx, 674nft_ct_select_ops(const struct nft_ctx *ctx,
497 const struct nlattr * const tb[]) 675 const struct nlattr * const tb[])
@@ -505,8 +683,13 @@ nft_ct_select_ops(const struct nft_ctx *ctx,
505 if (tb[NFTA_CT_DREG]) 683 if (tb[NFTA_CT_DREG])
506 return &nft_ct_get_ops; 684 return &nft_ct_get_ops;
507 685
508 if (tb[NFTA_CT_SREG]) 686 if (tb[NFTA_CT_SREG]) {
687#ifdef CONFIG_NF_CONNTRACK_ZONES
688 if (nla_get_be32(tb[NFTA_CT_KEY]) == htonl(NFT_CT_ZONE))
689 return &nft_ct_set_zone_ops;
690#endif
509 return &nft_ct_set_ops; 691 return &nft_ct_set_ops;
692 }
510 693
511 return ERR_PTR(-EINVAL); 694 return ERR_PTR(-EINVAL);
512} 695}
@@ -534,8 +717,7 @@ static void nft_notrack_eval(const struct nft_expr *expr,
534 717
535 ct = nf_ct_untracked_get(); 718 ct = nf_ct_untracked_get();
536 atomic_inc(&ct->ct_general.use); 719 atomic_inc(&ct->ct_general.use);
537 skb->nfct = &ct->ct_general; 720 nf_ct_set(skb, ct, IP_CT_NEW);
538 skb->nfctinfo = IP_CT_NEW;
539} 721}
540 722
541static struct nft_expr_type nft_notrack_type; 723static struct nft_expr_type nft_notrack_type;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 47beb3abcc9d..c308920b194c 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -15,19 +15,29 @@
15#include <linux/netfilter.h> 15#include <linux/netfilter.h>
16#include <linux/netfilter/nf_tables.h> 16#include <linux/netfilter/nf_tables.h>
17#include <net/netfilter/nf_tables.h> 17#include <net/netfilter/nf_tables.h>
18// FIXME: 18#include <net/tcp.h>
19#include <net/ipv6.h>
20 19
21struct nft_exthdr { 20struct nft_exthdr {
22 u8 type; 21 u8 type;
23 u8 offset; 22 u8 offset;
24 u8 len; 23 u8 len;
24 u8 op;
25 enum nft_registers dreg:8; 25 enum nft_registers dreg:8;
26 u8 flags;
26}; 27};
27 28
28static void nft_exthdr_eval(const struct nft_expr *expr, 29static unsigned int optlen(const u8 *opt, unsigned int offset)
29 struct nft_regs *regs, 30{
30 const struct nft_pktinfo *pkt) 31 /* Beware zero-length options: make finite progress */
32 if (opt[offset] <= TCPOPT_NOP || opt[offset + 1] == 0)
33 return 1;
34 else
35 return opt[offset + 1];
36}
37
38static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
39 struct nft_regs *regs,
40 const struct nft_pktinfo *pkt)
31{ 41{
32 struct nft_exthdr *priv = nft_expr_priv(expr); 42 struct nft_exthdr *priv = nft_expr_priv(expr);
33 u32 *dest = &regs->data[priv->dreg]; 43 u32 *dest = &regs->data[priv->dreg];
@@ -35,8 +45,12 @@ static void nft_exthdr_eval(const struct nft_expr *expr,
35 int err; 45 int err;
36 46
37 err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); 47 err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
38 if (err < 0) 48 if (priv->flags & NFT_EXTHDR_F_PRESENT) {
49 *dest = (err >= 0);
50 return;
51 } else if (err < 0) {
39 goto err; 52 goto err;
53 }
40 offset += priv->offset; 54 offset += priv->offset;
41 55
42 dest[priv->len / NFT_REG32_SIZE] = 0; 56 dest[priv->len / NFT_REG32_SIZE] = 0;
@@ -47,11 +61,59 @@ err:
47 regs->verdict.code = NFT_BREAK; 61 regs->verdict.code = NFT_BREAK;
48} 62}
49 63
64static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
65 struct nft_regs *regs,
66 const struct nft_pktinfo *pkt)
67{
68 u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
69 struct nft_exthdr *priv = nft_expr_priv(expr);
70 unsigned int i, optl, tcphdr_len, offset;
71 u32 *dest = &regs->data[priv->dreg];
72 struct tcphdr *tcph;
73 u8 *opt;
74
75 if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP)
76 goto err;
77
78 tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buff);
79 if (!tcph)
80 goto err;
81
82 tcphdr_len = __tcp_hdrlen(tcph);
83 if (tcphdr_len < sizeof(*tcph))
84 goto err;
85
86 tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, tcphdr_len, buff);
87 if (!tcph)
88 goto err;
89
90 opt = (u8 *)tcph;
91 for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
92 optl = optlen(opt, i);
93
94 if (priv->type != opt[i])
95 continue;
96
97 if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
98 goto err;
99
100 offset = i + priv->offset;
101 dest[priv->len / NFT_REG32_SIZE] = 0;
102 memcpy(dest, opt + offset, priv->len);
103
104 return;
105 }
106
107err:
108 regs->verdict.code = NFT_BREAK;
109}
110
50static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { 111static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
51 [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, 112 [NFTA_EXTHDR_DREG] = { .type = NLA_U32 },
52 [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, 113 [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 },
53 [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, 114 [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 },
54 [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, 115 [NFTA_EXTHDR_LEN] = { .type = NLA_U32 },
116 [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 },
55}; 117};
56 118
57static int nft_exthdr_init(const struct nft_ctx *ctx, 119static int nft_exthdr_init(const struct nft_ctx *ctx,
@@ -59,13 +121,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
59 const struct nlattr * const tb[]) 121 const struct nlattr * const tb[])
60{ 122{
61 struct nft_exthdr *priv = nft_expr_priv(expr); 123 struct nft_exthdr *priv = nft_expr_priv(expr);
62 u32 offset, len; 124 u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6;
63 int err; 125 int err;
64 126
65 if (tb[NFTA_EXTHDR_DREG] == NULL || 127 if (!tb[NFTA_EXTHDR_DREG] ||
66 tb[NFTA_EXTHDR_TYPE] == NULL || 128 !tb[NFTA_EXTHDR_TYPE] ||
67 tb[NFTA_EXTHDR_OFFSET] == NULL || 129 !tb[NFTA_EXTHDR_OFFSET] ||
68 tb[NFTA_EXTHDR_LEN] == NULL) 130 !tb[NFTA_EXTHDR_LEN])
69 return -EINVAL; 131 return -EINVAL;
70 132
71 err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); 133 err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset);
@@ -76,10 +138,27 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
76 if (err < 0) 138 if (err < 0)
77 return err; 139 return err;
78 140
141 if (tb[NFTA_EXTHDR_FLAGS]) {
142 err = nft_parse_u32_check(tb[NFTA_EXTHDR_FLAGS], U8_MAX, &flags);
143 if (err < 0)
144 return err;
145
146 if (flags & ~NFT_EXTHDR_F_PRESENT)
147 return -EINVAL;
148 }
149
150 if (tb[NFTA_EXTHDR_OP]) {
151 err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op);
152 if (err < 0)
153 return err;
154 }
155
79 priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); 156 priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
80 priv->offset = offset; 157 priv->offset = offset;
81 priv->len = len; 158 priv->len = len;
82 priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); 159 priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
160 priv->flags = flags;
161 priv->op = op;
83 162
84 return nft_validate_register_store(ctx, priv->dreg, NULL, 163 return nft_validate_register_store(ctx, priv->dreg, NULL,
85 NFT_DATA_VALUE, priv->len); 164 NFT_DATA_VALUE, priv->len);
@@ -97,6 +176,10 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
97 goto nla_put_failure; 176 goto nla_put_failure;
98 if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len))) 177 if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len)))
99 goto nla_put_failure; 178 goto nla_put_failure;
179 if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags)))
180 goto nla_put_failure;
181 if (nla_put_be32(skb, NFTA_EXTHDR_OP, htonl(priv->op)))
182 goto nla_put_failure;
100 return 0; 183 return 0;
101 184
102nla_put_failure: 185nla_put_failure:
@@ -104,17 +187,45 @@ nla_put_failure:
104} 187}
105 188
106static struct nft_expr_type nft_exthdr_type; 189static struct nft_expr_type nft_exthdr_type;
107static const struct nft_expr_ops nft_exthdr_ops = { 190static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
108 .type = &nft_exthdr_type, 191 .type = &nft_exthdr_type,
109 .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), 192 .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
110 .eval = nft_exthdr_eval, 193 .eval = nft_exthdr_ipv6_eval,
111 .init = nft_exthdr_init, 194 .init = nft_exthdr_init,
112 .dump = nft_exthdr_dump, 195 .dump = nft_exthdr_dump,
113}; 196};
114 197
198static const struct nft_expr_ops nft_exthdr_tcp_ops = {
199 .type = &nft_exthdr_type,
200 .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
201 .eval = nft_exthdr_tcp_eval,
202 .init = nft_exthdr_init,
203 .dump = nft_exthdr_dump,
204};
205
206static const struct nft_expr_ops *
207nft_exthdr_select_ops(const struct nft_ctx *ctx,
208 const struct nlattr * const tb[])
209{
210 u32 op;
211
212 if (!tb[NFTA_EXTHDR_OP])
213 return &nft_exthdr_ipv6_ops;
214
215 op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP]));
216 switch (op) {
217 case NFT_EXTHDR_OP_TCPOPT:
218 return &nft_exthdr_tcp_ops;
219 case NFT_EXTHDR_OP_IPV6:
220 return &nft_exthdr_ipv6_ops;
221 }
222
223 return ERR_PTR(-EOPNOTSUPP);
224}
225
115static struct nft_expr_type nft_exthdr_type __read_mostly = { 226static struct nft_expr_type nft_exthdr_type __read_mostly = {
116 .name = "exthdr", 227 .name = "exthdr",
117 .ops = &nft_exthdr_ops, 228 .select_ops = &nft_exthdr_select_ops,
118 .policy = nft_exthdr_policy, 229 .policy = nft_exthdr_policy,
119 .maxattr = NFTA_EXTHDR_MAX, 230 .maxattr = NFTA_EXTHDR_MAX,
120 .owner = THIS_MODULE, 231 .owner = THIS_MODULE,
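nft_exthdr_tcp_eval() above walks the TCP option space one option at a time; the optlen() helper returns 1 for EOL/NOP and zero-length options so the loop always makes progress, and the bounds checks reject options that would run past the header. A small user-space sketch of the same walk over a raw option buffer (illustrative names and values, not the kernel code):

#include <stdio.h>
#include <stdint.h>

#define OPT_NOP 1       /* kinds 0 (EOL) and 1 (NOP) are single bytes */
#define OPT_MSS 2

/* Same guard as the kernel's optlen(): EOL/NOP and zero-length options
 * still advance the cursor by one byte, so the loop terminates. */
static unsigned int opt_len(const uint8_t *opt, unsigned int off)
{
        if (opt[off] <= OPT_NOP || opt[off + 1] == 0)
                return 1;
        return opt[off + 1];
}

static int find_option(const uint8_t *opt, unsigned int len, uint8_t kind)
{
        unsigned int i, optl;

        for (i = 0; i + 1 < len; i += optl) {
                optl = opt_len(opt, i);
                if (i + optl > len)
                        break;          /* truncated option */
                if (opt[i] == kind)
                        return (int)i;  /* offset of the option */
        }
        return -1;
}

int main(void)
{
        /* NOP, NOP, MSS(kind=2, len=4, value=1460) */
        const uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4 };
        int off = find_option(opts, sizeof(opts), OPT_MSS);

        if (off >= 0)
                printf("MSS option at offset %d, value %u\n", off,
                       (unsigned int)((opts[off + 2] << 8) | opts[off + 3]));
        return 0;
}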
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index eb2721af898d..c4dad1254ead 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -21,6 +21,7 @@ struct nft_hash {
21 enum nft_registers sreg:8; 21 enum nft_registers sreg:8;
22 enum nft_registers dreg:8; 22 enum nft_registers dreg:8;
23 u8 len; 23 u8 len;
24 bool autogen_seed:1;
24 u32 modulus; 25 u32 modulus;
25 u32 seed; 26 u32 seed;
26 u32 offset; 27 u32 offset;
@@ -82,10 +83,12 @@ static int nft_hash_init(const struct nft_ctx *ctx,
82 if (priv->offset + priv->modulus - 1 < priv->offset) 83 if (priv->offset + priv->modulus - 1 < priv->offset)
83 return -EOVERFLOW; 84 return -EOVERFLOW;
84 85
85 if (tb[NFTA_HASH_SEED]) 86 if (tb[NFTA_HASH_SEED]) {
86 priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED])); 87 priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
87 else 88 } else {
89 priv->autogen_seed = true;
88 get_random_bytes(&priv->seed, sizeof(priv->seed)); 90 get_random_bytes(&priv->seed, sizeof(priv->seed));
91 }
89 92
90 return nft_validate_register_load(priv->sreg, len) && 93 return nft_validate_register_load(priv->sreg, len) &&
91 nft_validate_register_store(ctx, priv->dreg, NULL, 94 nft_validate_register_store(ctx, priv->dreg, NULL,
@@ -105,7 +108,8 @@ static int nft_hash_dump(struct sk_buff *skb,
105 goto nla_put_failure; 108 goto nla_put_failure;
106 if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus))) 109 if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus)))
107 goto nla_put_failure; 110 goto nla_put_failure;
108 if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed))) 111 if (!priv->autogen_seed &&
112 nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
109 goto nla_put_failure; 113 goto nla_put_failure;
110 if (priv->offset != 0) 114 if (priv->offset != 0)
111 if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset))) 115 if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset)))
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 66c7f4b4c49b..7b60e01f38ff 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -45,16 +45,15 @@ void nft_meta_get_eval(const struct nft_expr *expr,
45 *dest = skb->len; 45 *dest = skb->len;
46 break; 46 break;
47 case NFT_META_PROTOCOL: 47 case NFT_META_PROTOCOL:
48 *dest = 0; 48 nft_reg_store16(dest, (__force u16)skb->protocol);
49 *(__be16 *)dest = skb->protocol;
50 break; 49 break;
51 case NFT_META_NFPROTO: 50 case NFT_META_NFPROTO:
52 *dest = nft_pf(pkt); 51 nft_reg_store8(dest, nft_pf(pkt));
53 break; 52 break;
54 case NFT_META_L4PROTO: 53 case NFT_META_L4PROTO:
55 if (!pkt->tprot_set) 54 if (!pkt->tprot_set)
56 goto err; 55 goto err;
57 *dest = pkt->tprot; 56 nft_reg_store8(dest, pkt->tprot);
58 break; 57 break;
59 case NFT_META_PRIORITY: 58 case NFT_META_PRIORITY:
60 *dest = skb->priority; 59 *dest = skb->priority;
@@ -85,14 +84,12 @@ void nft_meta_get_eval(const struct nft_expr *expr,
85 case NFT_META_IIFTYPE: 84 case NFT_META_IIFTYPE:
86 if (in == NULL) 85 if (in == NULL)
87 goto err; 86 goto err;
88 *dest = 0; 87 nft_reg_store16(dest, in->type);
89 *(u16 *)dest = in->type;
90 break; 88 break;
91 case NFT_META_OIFTYPE: 89 case NFT_META_OIFTYPE:
92 if (out == NULL) 90 if (out == NULL)
93 goto err; 91 goto err;
94 *dest = 0; 92 nft_reg_store16(dest, out->type);
95 *(u16 *)dest = out->type;
96 break; 93 break;
97 case NFT_META_SKUID: 94 case NFT_META_SKUID:
98 sk = skb_to_full_sk(skb); 95 sk = skb_to_full_sk(skb);
@@ -142,25 +139,48 @@ void nft_meta_get_eval(const struct nft_expr *expr,
142#endif 139#endif
143 case NFT_META_PKTTYPE: 140 case NFT_META_PKTTYPE:
144 if (skb->pkt_type != PACKET_LOOPBACK) { 141 if (skb->pkt_type != PACKET_LOOPBACK) {
145 *dest = skb->pkt_type; 142 nft_reg_store8(dest, skb->pkt_type);
146 break; 143 break;
147 } 144 }
148 145
149 switch (nft_pf(pkt)) { 146 switch (nft_pf(pkt)) {
150 case NFPROTO_IPV4: 147 case NFPROTO_IPV4:
151 if (ipv4_is_multicast(ip_hdr(skb)->daddr)) 148 if (ipv4_is_multicast(ip_hdr(skb)->daddr))
152 *dest = PACKET_MULTICAST; 149 nft_reg_store8(dest, PACKET_MULTICAST);
153 else 150 else
154 *dest = PACKET_BROADCAST; 151 nft_reg_store8(dest, PACKET_BROADCAST);
155 break; 152 break;
156 case NFPROTO_IPV6: 153 case NFPROTO_IPV6:
157 if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) 154 nft_reg_store8(dest, PACKET_MULTICAST);
158 *dest = PACKET_MULTICAST; 155 break;
159 else 156 case NFPROTO_NETDEV:
160 *dest = PACKET_BROADCAST; 157 switch (skb->protocol) {
158 case htons(ETH_P_IP): {
159 int noff = skb_network_offset(skb);
160 struct iphdr *iph, _iph;
161
162 iph = skb_header_pointer(skb, noff,
163 sizeof(_iph), &_iph);
164 if (!iph)
165 goto err;
166
167 if (ipv4_is_multicast(iph->daddr))
168 nft_reg_store8(dest, PACKET_MULTICAST);
169 else
170 nft_reg_store8(dest, PACKET_BROADCAST);
171
172 break;
173 }
174 case htons(ETH_P_IPV6):
175 nft_reg_store8(dest, PACKET_MULTICAST);
176 break;
177 default:
178 WARN_ON_ONCE(1);
179 goto err;
180 }
161 break; 181 break;
162 default: 182 default:
163 WARN_ON(1); 183 WARN_ON_ONCE(1);
164 goto err; 184 goto err;
165 } 185 }
166 break; 186 break;
@@ -207,7 +227,9 @@ void nft_meta_set_eval(const struct nft_expr *expr,
207{ 227{
208 const struct nft_meta *meta = nft_expr_priv(expr); 228 const struct nft_meta *meta = nft_expr_priv(expr);
209 struct sk_buff *skb = pkt->skb; 229 struct sk_buff *skb = pkt->skb;
210 u32 value = regs->data[meta->sreg]; 230 u32 *sreg = &regs->data[meta->sreg];
231 u32 value = *sreg;
232 u8 pkt_type;
211 233
212 switch (meta->key) { 234 switch (meta->key) {
213 case NFT_META_MARK: 235 case NFT_META_MARK:
@@ -217,9 +239,12 @@ void nft_meta_set_eval(const struct nft_expr *expr,
217 skb->priority = value; 239 skb->priority = value;
218 break; 240 break;
219 case NFT_META_PKTTYPE: 241 case NFT_META_PKTTYPE:
220 if (skb->pkt_type != value && 242 pkt_type = nft_reg_load8(sreg);
221 skb_pkt_type_ok(value) && skb_pkt_type_ok(skb->pkt_type)) 243
222 skb->pkt_type = value; 244 if (skb->pkt_type != pkt_type &&
245 skb_pkt_type_ok(pkt_type) &&
246 skb_pkt_type_ok(skb->pkt_type))
247 skb->pkt_type = pkt_type;
223 break; 248 break;
224 case NFT_META_NFTRACE: 249 case NFT_META_NFTRACE:
225 skb->nf_trace = !!value; 250 skb->nf_trace = !!value;
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index 19a7bf3236f9..439e0bd152a0 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -65,10 +65,10 @@ static void nft_nat_eval(const struct nft_expr *expr,
65 } 65 }
66 66
67 if (priv->sreg_proto_min) { 67 if (priv->sreg_proto_min) {
68 range.min_proto.all = 68 range.min_proto.all = (__force __be16)nft_reg_load16(
69 *(__be16 *)&regs->data[priv->sreg_proto_min]; 69 &regs->data[priv->sreg_proto_min]);
70 range.max_proto.all = 70 range.max_proto.all = (__force __be16)nft_reg_load16(
71 *(__be16 *)&regs->data[priv->sreg_proto_max]; 71 &regs->data[priv->sreg_proto_max]);
72 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 72 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
73 } 73 }
74 74
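The nft_ct, nft_meta and nft_nat hunks above replace raw *dest assignments with nft_reg_store8()/nft_reg_store16() and the matching loads. nft registers are 32-bit slots, so a narrower value has to be written with the remaining bytes zeroed and read back from the same low bytes; otherwise the result would depend on whatever the register held before. A user-space analogue of what these helpers are assumed to do:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Write a narrow value into a 32-bit register slot, zeroing the rest,
 * and read it back from the same low bytes (assumed semantics of
 * nft_reg_store8/16 and nft_reg_load16). */
static void reg_store8(uint32_t *reg, uint8_t val)
{
        *reg = 0;
        memcpy(reg, &val, sizeof(val));
}

static void reg_store16(uint32_t *reg, uint16_t val)
{
        *reg = 0;
        memcpy(reg, &val, sizeof(val));
}

static uint16_t reg_load16(const uint32_t *reg)
{
        uint16_t val;

        memcpy(&val, reg, sizeof(val));
        return val;
}

int main(void)
{
        uint32_t reg = 0xdeadbeef;      /* stale contents from a previous rule */

        reg_store8(&reg, 6);            /* e.g. an l4proto value */
        printf("reg after store8:  0x%08x\n", reg);

        reg_store16(&reg, 0x01bb);      /* e.g. a proto-dst port */
        printf("reg after store16: 0x%08x, load16 = 0x%04x\n",
               reg, reg_load16(&reg));
        return 0;
}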
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
new file mode 100644
index 000000000000..8ebbc2940f4c
--- /dev/null
+++ b/net/netfilter/nft_set_bitmap.c
@@ -0,0 +1,307 @@
1/*
2 * Copyright (c) 2017 Pablo Neira Ayuso <pablo@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/list.h>
13#include <linux/netlink.h>
14#include <linux/netfilter.h>
15#include <linux/netfilter/nf_tables.h>
16#include <net/netfilter/nf_tables.h>
17
18struct nft_bitmap_elem {
19 struct list_head head;
20 struct nft_set_ext ext;
21};
22
23/* This bitmap uses two bits to represent one element. These two bits determine
24 * the element state in the current and the future generation.
25 *
26 * An element can be in three states. The generation cursor is represented using
27 * the ^ character; note that this cursor shifts on every successful transaction.
28 * If no transaction is going on, we observe all elements are in the following
29 * state:
30 *
31 * 11 = this element is active in the current generation. In case of no updates,
32 * ^ it stays active in the next generation.
33 * 00 = this element is inactive in the current generation. In case of no
34 * ^ updates, it stays inactive in the next generation.
35 *
36 * On transaction handling, we observe these two temporary states:
37 *
38 * 01 = this element is inactive in the current generation and it becomes active
39 * ^ in the next one. This happens when the element is inserted but commit
40 * path has not been executed yet, so activation is still pending. On
41 * transaction abortion, the element is removed.
42 * 10 = this element is active in the current generation and it becomes inactive
43 * ^ in the next one. This happens when the element is deactivated but commit
44 * path has not been executed yet, so removal is still pending. On
45 * transaction abortion, the next generation bit is reset to restore
46 * its previous state.
47 */
48struct nft_bitmap {
49 struct list_head list;
50 u16 bitmap_size;
51 u8 bitmap[];
52};
53
54static inline void nft_bitmap_location(const struct nft_set *set,
55 const void *key,
56 u32 *idx, u32 *off)
57{
58 u32 k;
59
60 if (set->klen == 2)
61 k = *(u16 *)key;
62 else
63 k = *(u8 *)key;
64 k <<= 1;
65
66 *idx = k / BITS_PER_BYTE;
67 *off = k % BITS_PER_BYTE;
68}
69
70/* Fetch the two bits that represent the element and check if it is active based
71 * on the generation mask.
72 */
73static inline bool
74nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
75{
76 return (bitmap[idx] & (0x3 << off)) & (genmask << off);
77}
78
79static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
80 const u32 *key, const struct nft_set_ext **ext)
81{
82 const struct nft_bitmap *priv = nft_set_priv(set);
83 u8 genmask = nft_genmask_cur(net);
84 u32 idx, off;
85
86 nft_bitmap_location(set, key, &idx, &off);
87
88 return nft_bitmap_active(priv->bitmap, idx, off, genmask);
89}
90
91static struct nft_bitmap_elem *
92nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
93 u8 genmask)
94{
95 const struct nft_bitmap *priv = nft_set_priv(set);
96 struct nft_bitmap_elem *be;
97
98 list_for_each_entry_rcu(be, &priv->list, head) {
99 if (memcmp(nft_set_ext_key(&be->ext),
100 nft_set_ext_key(&this->ext), set->klen) ||
101 !nft_set_elem_active(&be->ext, genmask))
102 continue;
103
104 return be;
105 }
106 return NULL;
107}
108
109static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
110 const struct nft_set_elem *elem,
111 struct nft_set_ext **ext)
112{
113 struct nft_bitmap *priv = nft_set_priv(set);
114 struct nft_bitmap_elem *new = elem->priv, *be;
115 u8 genmask = nft_genmask_next(net);
116 u32 idx, off;
117
118 be = nft_bitmap_elem_find(set, new, genmask);
119 if (be) {
120 *ext = &be->ext;
121 return -EEXIST;
122 }
123
124 nft_bitmap_location(set, nft_set_ext_key(&new->ext), &idx, &off);
125 /* Enter 01 state. */
126 priv->bitmap[idx] |= (genmask << off);
127 list_add_tail_rcu(&new->head, &priv->list);
128
129 return 0;
130}
131
132static void nft_bitmap_remove(const struct net *net,
133 const struct nft_set *set,
134 const struct nft_set_elem *elem)
135{
136 struct nft_bitmap *priv = nft_set_priv(set);
137 struct nft_bitmap_elem *be = elem->priv;
138 u8 genmask = nft_genmask_next(net);
139 u32 idx, off;
140
141 nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
142 /* Enter 00 state. */
143 priv->bitmap[idx] &= ~(genmask << off);
144 list_del_rcu(&be->head);
145}
146
147static void nft_bitmap_activate(const struct net *net,
148 const struct nft_set *set,
149 const struct nft_set_elem *elem)
150{
151 struct nft_bitmap *priv = nft_set_priv(set);
152 struct nft_bitmap_elem *be = elem->priv;
153 u8 genmask = nft_genmask_next(net);
154 u32 idx, off;
155
156 nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
157 /* Enter 11 state. */
158 priv->bitmap[idx] |= (genmask << off);
159 nft_set_elem_change_active(net, set, &be->ext);
160}
161
162static bool nft_bitmap_flush(const struct net *net,
163 const struct nft_set *set, void *_be)
164{
165 struct nft_bitmap *priv = nft_set_priv(set);
166 u8 genmask = nft_genmask_next(net);
167 struct nft_bitmap_elem *be = _be;
168 u32 idx, off;
169
170 nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
171 /* Enter 10 state, similar to deactivation. */
172 priv->bitmap[idx] &= ~(genmask << off);
173 nft_set_elem_change_active(net, set, &be->ext);
174
175 return true;
176}
177
178static void *nft_bitmap_deactivate(const struct net *net,
179 const struct nft_set *set,
180 const struct nft_set_elem *elem)
181{
182 struct nft_bitmap *priv = nft_set_priv(set);
183 struct nft_bitmap_elem *this = elem->priv, *be;
184 u8 genmask = nft_genmask_next(net);
185 u32 idx, off;
186
187 nft_bitmap_location(set, elem->key.val.data, &idx, &off);
188
189 be = nft_bitmap_elem_find(set, this, genmask);
190 if (!be)
191 return NULL;
192
193 /* Enter 10 state. */
194 priv->bitmap[idx] &= ~(genmask << off);
195 nft_set_elem_change_active(net, set, &be->ext);
196
197 return be;
198}
199
200static void nft_bitmap_walk(const struct nft_ctx *ctx,
201 struct nft_set *set,
202 struct nft_set_iter *iter)
203{
204 const struct nft_bitmap *priv = nft_set_priv(set);
205 struct nft_bitmap_elem *be;
206 struct nft_set_elem elem;
207
208 list_for_each_entry_rcu(be, &priv->list, head) {
209 if (iter->count < iter->skip)
210 goto cont;
211 if (!nft_set_elem_active(&be->ext, iter->genmask))
212 goto cont;
213
214 elem.priv = be;
215
216 iter->err = iter->fn(ctx, set, iter, &elem);
217
218 if (iter->err < 0)
219 return;
220cont:
221 iter->count++;
222 }
223}
224
225/* The bitmap size is pow(2, key length in bits) / bits per byte. This is
226 * multiplied by two since each element takes two bits. For 8 bit keys, the
227 * bitmap consumes 66 bytes. For 16 bit keys, 16388 bytes.
228 */
229static inline u32 nft_bitmap_size(u32 klen)
230{
231 return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
232}
233
234static inline u32 nft_bitmap_total_size(u32 klen)
235{
236 return sizeof(struct nft_bitmap) + nft_bitmap_size(klen);
237}
238
239static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[])
240{
241 u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
242
243 return nft_bitmap_total_size(klen);
244}
245
246static int nft_bitmap_init(const struct nft_set *set,
247 const struct nft_set_desc *desc,
248 const struct nlattr * const nla[])
249{
250 struct nft_bitmap *priv = nft_set_priv(set);
251
252 INIT_LIST_HEAD(&priv->list);
253 priv->bitmap_size = nft_bitmap_size(set->klen);
254
255 return 0;
256}
257
258static void nft_bitmap_destroy(const struct nft_set *set)
259{
260}
261
262static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
263 struct nft_set_estimate *est)
264{
265 /* Make sure we don't get bitmaps larger than 16 Kbytes. */
266 if (desc->klen > 2)
267 return false;
268
269 est->size = nft_bitmap_total_size(desc->klen);
270 est->lookup = NFT_SET_CLASS_O_1;
271 est->space = NFT_SET_CLASS_O_1;
272
273 return true;
274}
275
276static struct nft_set_ops nft_bitmap_ops __read_mostly = {
277 .privsize = nft_bitmap_privsize,
278 .elemsize = offsetof(struct nft_bitmap_elem, ext),
279 .estimate = nft_bitmap_estimate,
280 .init = nft_bitmap_init,
281 .destroy = nft_bitmap_destroy,
282 .insert = nft_bitmap_insert,
283 .remove = nft_bitmap_remove,
284 .deactivate = nft_bitmap_deactivate,
285 .flush = nft_bitmap_flush,
286 .activate = nft_bitmap_activate,
287 .lookup = nft_bitmap_lookup,
288 .walk = nft_bitmap_walk,
289 .owner = THIS_MODULE,
290};
291
292static int __init nft_bitmap_module_init(void)
293{
294 return nft_register_set(&nft_bitmap_ops);
295}
296
297static void __exit nft_bitmap_module_exit(void)
298{
299 nft_unregister_set(&nft_bitmap_ops);
300}
301
302module_init(nft_bitmap_module_init);
303module_exit(nft_bitmap_module_exit);
304
305MODULE_LICENSE("GPL");
306MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
307MODULE_ALIAS_NFT_SET();
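The comment block in the new nft_set_bitmap.c is the heart of the design: every possible key value owns two adjacent bits, one per generation, and a lookup ANDs that pair against the current generation mask. The sketch below redoes the nft_bitmap_location()/nft_bitmap_active() arithmetic in ordinary user-space C, with the generation cursor fixed at bit 0 purely for illustration:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define BITS_PER_BYTE 8

/* Two bits per possible key value, one per generation; which bit counts
 * as "current" depends on the generation cursor (see the comment in
 * nft_set_bitmap.c above). */
static void bitmap_location(uint32_t key, uint32_t *idx, uint32_t *off)
{
        uint32_t k = key << 1;

        *idx = k / BITS_PER_BYTE;
        *off = k % BITS_PER_BYTE;
}

static int bitmap_active(const uint8_t *bitmap, uint32_t idx, uint32_t off,
                         uint8_t genmask)
{
        return (bitmap[idx] & (0x3 << off)) & (genmask << off);
}

int main(void)
{
        /* 8-bit keys: 256 elements * 2 bits = 64 bytes of bitmap. */
        uint8_t bitmap[64];
        uint8_t genmask = 0x1;  /* assume the cursor currently selects bit 0 */
        uint32_t idx, off;

        memset(bitmap, 0, sizeof(bitmap));

        bitmap_location(0x2a, &idx, &off);
        bitmap[idx] |= (uint8_t)(0x3 << off);   /* insert key 0x2a as "11" */
        printf("key 0x2a active: %d\n",
               !!bitmap_active(bitmap, idx, off, genmask));

        bitmap_location(0x2b, &idx, &off);
        printf("key 0x2b active: %d\n",
               !!bitmap_active(bitmap, idx, off, genmask));
        return 0;
}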
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index e36069fb76ae..5f652720fc78 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -167,8 +167,8 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set,
167 nft_set_elem_clear_busy(&he->ext); 167 nft_set_elem_clear_busy(&he->ext);
168} 168}
169 169
170static bool nft_hash_deactivate_one(const struct net *net, 170static bool nft_hash_flush(const struct net *net,
171 const struct nft_set *set, void *priv) 171 const struct nft_set *set, void *priv)
172{ 172{
173 struct nft_hash_elem *he = priv; 173 struct nft_hash_elem *he = priv;
174 174
@@ -195,7 +195,7 @@ static void *nft_hash_deactivate(const struct net *net,
195 rcu_read_lock(); 195 rcu_read_lock();
196 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); 196 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
197 if (he != NULL && 197 if (he != NULL &&
198 !nft_hash_deactivate_one(net, set, he)) 198 !nft_hash_flush(net, set, he))
199 he = NULL; 199 he = NULL;
200 200
201 rcu_read_unlock(); 201 rcu_read_unlock();
@@ -203,7 +203,8 @@ static void *nft_hash_deactivate(const struct net *net,
203 return he; 203 return he;
204} 204}
205 205
206static void nft_hash_remove(const struct nft_set *set, 206static void nft_hash_remove(const struct net *net,
207 const struct nft_set *set,
207 const struct nft_set_elem *elem) 208 const struct nft_set_elem *elem)
208{ 209{
209 struct nft_hash *priv = nft_set_priv(set); 210 struct nft_hash *priv = nft_set_priv(set);
@@ -383,7 +384,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
383 est->size = esize + 2 * sizeof(struct nft_hash_elem *); 384 est->size = esize + 2 * sizeof(struct nft_hash_elem *);
384 } 385 }
385 386
386 est->class = NFT_SET_CLASS_O_1; 387 est->lookup = NFT_SET_CLASS_O_1;
388 est->space = NFT_SET_CLASS_O_N;
387 389
388 return true; 390 return true;
389} 391}
@@ -397,12 +399,12 @@ static struct nft_set_ops nft_hash_ops __read_mostly = {
397 .insert = nft_hash_insert, 399 .insert = nft_hash_insert,
398 .activate = nft_hash_activate, 400 .activate = nft_hash_activate,
399 .deactivate = nft_hash_deactivate, 401 .deactivate = nft_hash_deactivate,
400 .deactivate_one = nft_hash_deactivate_one, 402 .flush = nft_hash_flush,
401 .remove = nft_hash_remove, 403 .remove = nft_hash_remove,
402 .lookup = nft_hash_lookup, 404 .lookup = nft_hash_lookup,
403 .update = nft_hash_update, 405 .update = nft_hash_update,
404 .walk = nft_hash_walk, 406 .walk = nft_hash_walk,
405 .features = NFT_SET_MAP | NFT_SET_TIMEOUT, 407 .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
406 .owner = THIS_MODULE, 408 .owner = THIS_MODULE,
407}; 409};
408 410
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index f06f55ee516d..78dfbf9588b3 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -60,11 +60,10 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
60 d = memcmp(this, key, set->klen); 60 d = memcmp(this, key, set->klen);
61 if (d < 0) { 61 if (d < 0) {
62 parent = parent->rb_left; 62 parent = parent->rb_left;
63 /* In case of adjacent ranges, we always see the high 63 if (interval &&
64 * part of the range in first place, before the low one. 64 nft_rbtree_equal(set, this, interval) &&
65 * So don't update interval if the keys are equal. 65 nft_rbtree_interval_end(this) &&
66 */ 66 !nft_rbtree_interval_end(interval))
67 if (interval && nft_rbtree_equal(set, this, interval))
68 continue; 67 continue;
69 interval = rbe; 68 interval = rbe;
70 } else if (d > 0) 69 } else if (d > 0)
@@ -151,7 +150,8 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
151 return err; 150 return err;
152} 151}
153 152
154static void nft_rbtree_remove(const struct nft_set *set, 153static void nft_rbtree_remove(const struct net *net,
154 const struct nft_set *set,
155 const struct nft_set_elem *elem) 155 const struct nft_set_elem *elem)
156{ 156{
157 struct nft_rbtree *priv = nft_set_priv(set); 157 struct nft_rbtree *priv = nft_set_priv(set);
@@ -171,8 +171,8 @@ static void nft_rbtree_activate(const struct net *net,
171 nft_set_elem_change_active(net, set, &rbe->ext); 171 nft_set_elem_change_active(net, set, &rbe->ext);
172} 172}
173 173
174static bool nft_rbtree_deactivate_one(const struct net *net, 174static bool nft_rbtree_flush(const struct net *net,
175 const struct nft_set *set, void *priv) 175 const struct nft_set *set, void *priv)
176{ 176{
177 struct nft_rbtree_elem *rbe = priv; 177 struct nft_rbtree_elem *rbe = priv;
178 178
@@ -213,7 +213,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
213 parent = parent->rb_right; 213 parent = parent->rb_right;
214 continue; 214 continue;
215 } 215 }
216 nft_rbtree_deactivate_one(net, set, rbe); 216 nft_rbtree_flush(net, set, rbe);
217 return rbe; 217 return rbe;
218 } 218 }
219 } 219 }
@@ -290,7 +290,8 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
290 else 290 else
291 est->size = nsize; 291 est->size = nsize;
292 292
293 est->class = NFT_SET_CLASS_O_LOG_N; 293 est->lookup = NFT_SET_CLASS_O_LOG_N;
294 est->space = NFT_SET_CLASS_O_N;
294 295
295 return true; 296 return true;
296} 297}
@@ -304,11 +305,11 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = {
304 .insert = nft_rbtree_insert, 305 .insert = nft_rbtree_insert,
305 .remove = nft_rbtree_remove, 306 .remove = nft_rbtree_remove,
306 .deactivate = nft_rbtree_deactivate, 307 .deactivate = nft_rbtree_deactivate,
307 .deactivate_one = nft_rbtree_deactivate_one, 308 .flush = nft_rbtree_flush,
308 .activate = nft_rbtree_activate, 309 .activate = nft_rbtree_activate,
309 .lookup = nft_rbtree_lookup, 310 .lookup = nft_rbtree_lookup,
310 .walk = nft_rbtree_walk, 311 .walk = nft_rbtree_walk,
311 .features = NFT_SET_INTERVAL | NFT_SET_MAP, 312 .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT,
312 .owner = THIS_MODULE, 313 .owner = THIS_MODULE,
313}; 314};
314 315
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 2ff499680cc6..14857afc9937 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -262,6 +262,60 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
262} 262}
263EXPORT_SYMBOL_GPL(xt_request_find_target); 263EXPORT_SYMBOL_GPL(xt_request_find_target);
264 264
265
266static int xt_obj_to_user(u16 __user *psize, u16 size,
267 void __user *pname, const char *name,
268 u8 __user *prev, u8 rev)
269{
270 if (put_user(size, psize))
271 return -EFAULT;
272 if (copy_to_user(pname, name, strlen(name) + 1))
273 return -EFAULT;
274 if (put_user(rev, prev))
275 return -EFAULT;
276
277 return 0;
278}
279
280#define XT_OBJ_TO_USER(U, K, TYPE, C_SIZE) \
281 xt_obj_to_user(&U->u.TYPE##_size, C_SIZE ? : K->u.TYPE##_size, \
282 U->u.user.name, K->u.kernel.TYPE->name, \
283 &U->u.user.revision, K->u.kernel.TYPE->revision)
284
285int xt_data_to_user(void __user *dst, const void *src,
286 int usersize, int size)
287{
288 usersize = usersize ? : size;
289 if (copy_to_user(dst, src, usersize))
290 return -EFAULT;
291 if (usersize != size && clear_user(dst + usersize, size - usersize))
292 return -EFAULT;
293
294 return 0;
295}
296EXPORT_SYMBOL_GPL(xt_data_to_user);
297
298#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \
299 xt_data_to_user(U->data, K->data, \
300 K->u.kernel.TYPE->usersize, \
301 C_SIZE ? : K->u.kernel.TYPE->TYPE##size)
302
303int xt_match_to_user(const struct xt_entry_match *m,
304 struct xt_entry_match __user *u)
305{
306 return XT_OBJ_TO_USER(u, m, match, 0) ||
307 XT_DATA_TO_USER(u, m, match, 0);
308}
309EXPORT_SYMBOL_GPL(xt_match_to_user);
310
311int xt_target_to_user(const struct xt_entry_target *t,
312 struct xt_entry_target __user *u)
313{
314 return XT_OBJ_TO_USER(u, t, target, 0) ||
315 XT_DATA_TO_USER(u, t, target, 0);
316}
317EXPORT_SYMBOL_GPL(xt_target_to_user);
318
265static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) 319static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
266{ 320{
267 const struct xt_match *m; 321 const struct xt_match *m;
@@ -565,17 +619,14 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
565 int off = xt_compat_match_offset(match); 619 int off = xt_compat_match_offset(match);
566 u_int16_t msize = m->u.user.match_size - off; 620 u_int16_t msize = m->u.user.match_size - off;
567 621
568 if (copy_to_user(cm, m, sizeof(*cm)) || 622 if (XT_OBJ_TO_USER(cm, m, match, msize))
569 put_user(msize, &cm->u.user.match_size) ||
570 copy_to_user(cm->u.user.name, m->u.kernel.match->name,
571 strlen(m->u.kernel.match->name) + 1))
572 return -EFAULT; 623 return -EFAULT;
573 624
574 if (match->compat_to_user) { 625 if (match->compat_to_user) {
575 if (match->compat_to_user((void __user *)cm->data, m->data)) 626 if (match->compat_to_user((void __user *)cm->data, m->data))
576 return -EFAULT; 627 return -EFAULT;
577 } else { 628 } else {
578 if (copy_to_user(cm->data, m->data, msize - sizeof(*cm))) 629 if (XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm)))
579 return -EFAULT; 630 return -EFAULT;
580 } 631 }
581 632
@@ -616,7 +667,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
616 COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset) 667 COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset)
617 return -EINVAL; 668 return -EINVAL;
618 669
619 /* compat_xt_entry match has less strict aligment requirements, 670 /* compat_xt_entry match has less strict alignment requirements,
620 * otherwise they are identical. In case of padding differences 671 * otherwise they are identical. In case of padding differences
621 * we need to add compat version of xt_check_entry_match. 672 * we need to add compat version of xt_check_entry_match.
622 */ 673 */
@@ -923,17 +974,14 @@ int xt_compat_target_to_user(const struct xt_entry_target *t,
923 int off = xt_compat_target_offset(target); 974 int off = xt_compat_target_offset(target);
924 u_int16_t tsize = t->u.user.target_size - off; 975 u_int16_t tsize = t->u.user.target_size - off;
925 976
926 if (copy_to_user(ct, t, sizeof(*ct)) || 977 if (XT_OBJ_TO_USER(ct, t, target, tsize))
927 put_user(tsize, &ct->u.user.target_size) ||
928 copy_to_user(ct->u.user.name, t->u.kernel.target->name,
929 strlen(t->u.kernel.target->name) + 1))
930 return -EFAULT; 978 return -EFAULT;
931 979
932 if (target->compat_to_user) { 980 if (target->compat_to_user) {
933 if (target->compat_to_user((void __user *)ct->data, t->data)) 981 if (target->compat_to_user((void __user *)ct->data, t->data))
934 return -EFAULT; 982 return -EFAULT;
935 } else { 983 } else {
936 if (copy_to_user(ct->data, t->data, tsize - sizeof(*ct))) 984 if (XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct)))
937 return -EFAULT; 985 return -EFAULT;
938 } 986 }
939 987
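xt_data_to_user() and the XT_DATA_TO_USER() wrapper implement the .usersize convention used by the xt_CT, xt_RATEEST, xt_TEE, xt_bpf, xt_cgroup and xt_connlimit hunks that follow: only the first usersize bytes of a match/target's data are copied back to user space, and the tail holding kernel-only state (parsed BPF programs, conntrack templates, private pointers) is cleared instead of leaked. A user-space analogue of that copy-then-clear step, with memcpy/memset standing in for copy_to_user/clear_user:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Kernel-side blob: the head mirrors what user space configured, the
 * tail holds kernel-only state that must never be copied back out. */
struct tginfo {
        uint32_t mark;          /* user-visible configuration */
        void *priv;             /* kernel-internal, like xt_tee_tginfo.priv */
};

/* Analogue of xt_data_to_user(): copy the first usersize bytes, clear
 * the rest. */
static void data_to_user(void *dst, const void *src,
                         size_t usersize, size_t size)
{
        if (!usersize)
                usersize = size;
        memcpy(dst, src, usersize);
        memset((char *)dst + usersize, 0, size - usersize);
}

int main(void)
{
        struct tginfo kernel = { .mark = 7 };
        struct tginfo user;

        kernel.priv = &kernel;                  /* pretend kernel state */
        data_to_user(&user, &kernel, offsetof(struct tginfo, priv),
                     sizeof(kernel));

        printf("mark=%u priv=%p\n", user.mark, user.priv);
        return 0;
}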
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 95c750358747..b008db0184b8 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -23,15 +23,14 @@
23static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) 23static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
24{ 24{
25 /* Previously seen (loopback)? Ignore. */ 25 /* Previously seen (loopback)? Ignore. */
26 if (skb->nfct != NULL) 26 if (skb->_nfct != 0)
27 return XT_CONTINUE; 27 return XT_CONTINUE;
28 28
29 /* special case the untracked ct : we want the percpu object */ 29 /* special case the untracked ct : we want the percpu object */
30 if (!ct) 30 if (!ct)
31 ct = nf_ct_untracked_get(); 31 ct = nf_ct_untracked_get();
32 atomic_inc(&ct->ct_general.use); 32 atomic_inc(&ct->ct_general.use);
33 skb->nfct = &ct->ct_general; 33 nf_ct_set(skb, ct, IP_CT_NEW);
34 skb->nfctinfo = IP_CT_NEW;
35 34
36 return XT_CONTINUE; 35 return XT_CONTINUE;
37} 36}
@@ -373,6 +372,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
373 .name = "CT", 372 .name = "CT",
374 .family = NFPROTO_UNSPEC, 373 .family = NFPROTO_UNSPEC,
375 .targetsize = sizeof(struct xt_ct_target_info), 374 .targetsize = sizeof(struct xt_ct_target_info),
375 .usersize = offsetof(struct xt_ct_target_info, ct),
376 .checkentry = xt_ct_tg_check_v0, 376 .checkentry = xt_ct_tg_check_v0,
377 .destroy = xt_ct_tg_destroy_v0, 377 .destroy = xt_ct_tg_destroy_v0,
378 .target = xt_ct_target_v0, 378 .target = xt_ct_target_v0,
@@ -384,6 +384,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
384 .family = NFPROTO_UNSPEC, 384 .family = NFPROTO_UNSPEC,
385 .revision = 1, 385 .revision = 1,
386 .targetsize = sizeof(struct xt_ct_target_info_v1), 386 .targetsize = sizeof(struct xt_ct_target_info_v1),
387 .usersize = offsetof(struct xt_ct_target_info, ct),
387 .checkentry = xt_ct_tg_check_v1, 388 .checkentry = xt_ct_tg_check_v1,
388 .destroy = xt_ct_tg_destroy_v1, 389 .destroy = xt_ct_tg_destroy_v1,
389 .target = xt_ct_target_v1, 390 .target = xt_ct_target_v1,
@@ -395,6 +396,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
395 .family = NFPROTO_UNSPEC, 396 .family = NFPROTO_UNSPEC,
396 .revision = 2, 397 .revision = 2,
397 .targetsize = sizeof(struct xt_ct_target_info_v1), 398 .targetsize = sizeof(struct xt_ct_target_info_v1),
399 .usersize = offsetof(struct xt_ct_target_info, ct),
398 .checkentry = xt_ct_tg_check_v2, 400 .checkentry = xt_ct_tg_check_v2,
399 .destroy = xt_ct_tg_destroy_v1, 401 .destroy = xt_ct_tg_destroy_v1,
400 .target = xt_ct_target_v1, 402 .target = xt_ct_target_v1,
@@ -407,12 +409,11 @@ static unsigned int
407notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) 409notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
408{ 410{
409 /* Previously seen (loopback)? Ignore. */ 411 /* Previously seen (loopback)? Ignore. */
410 if (skb->nfct != NULL) 412 if (skb->_nfct != 0)
411 return XT_CONTINUE; 413 return XT_CONTINUE;
412 414
413 skb->nfct = &nf_ct_untracked_get()->ct_general; 415 nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
414 skb->nfctinfo = IP_CT_NEW; 416 nf_conntrack_get(skb_nfct(skb));
415 nf_conntrack_get(skb->nfct);
416 417
417 return XT_CONTINUE; 418 return XT_CONTINUE;
418} 419}
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 91a373a3f534..498b54fd04d7 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -162,6 +162,7 @@ static struct xt_target xt_rateest_tg_reg __read_mostly = {
162 .checkentry = xt_rateest_tg_checkentry, 162 .checkentry = xt_rateest_tg_checkentry,
163 .destroy = xt_rateest_tg_destroy, 163 .destroy = xt_rateest_tg_destroy,
164 .targetsize = sizeof(struct xt_rateest_target_info), 164 .targetsize = sizeof(struct xt_rateest_target_info),
165 .usersize = offsetof(struct xt_rateest_target_info, est),
165 .me = THIS_MODULE, 166 .me = THIS_MODULE,
166}; 167};
167 168
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 27241a767f17..c64aca611ac5 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -104,7 +104,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
104 tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); 104 tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
105 tcp_hdrlen = tcph->doff * 4; 105 tcp_hdrlen = tcph->doff * 4;
106 106
107 if (len < tcp_hdrlen) 107 if (len < tcp_hdrlen || tcp_hdrlen < sizeof(struct tcphdr))
108 return -1; 108 return -1;
109 109
110 if (info->mss == XT_TCPMSS_CLAMP_PMTU) { 110 if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
@@ -152,6 +152,10 @@ tcpmss_mangle_packet(struct sk_buff *skb,
152 if (len > tcp_hdrlen) 152 if (len > tcp_hdrlen)
153 return 0; 153 return 0;
154 154
155 /* tcph->doff has 4 bits, do not wrap it to 0 */
156 if (tcp_hdrlen >= 15 * 4)
157 return 0;
158
155 /* 159 /*
156 * MSS Option not found ?! add it.. 160 * MSS Option not found ?! add it..
157 */ 161 */
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 1c57ace75ae6..86b0580b2216 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -133,6 +133,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
133 .family = NFPROTO_IPV4, 133 .family = NFPROTO_IPV4,
134 .target = tee_tg4, 134 .target = tee_tg4,
135 .targetsize = sizeof(struct xt_tee_tginfo), 135 .targetsize = sizeof(struct xt_tee_tginfo),
136 .usersize = offsetof(struct xt_tee_tginfo, priv),
136 .checkentry = tee_tg_check, 137 .checkentry = tee_tg_check,
137 .destroy = tee_tg_destroy, 138 .destroy = tee_tg_destroy,
138 .me = THIS_MODULE, 139 .me = THIS_MODULE,
@@ -144,6 +145,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
144 .family = NFPROTO_IPV6, 145 .family = NFPROTO_IPV6,
145 .target = tee_tg6, 146 .target = tee_tg6,
146 .targetsize = sizeof(struct xt_tee_tginfo), 147 .targetsize = sizeof(struct xt_tee_tginfo),
148 .usersize = offsetof(struct xt_tee_tginfo, priv),
147 .checkentry = tee_tg_check, 149 .checkentry = tee_tg_check,
148 .destroy = tee_tg_destroy, 150 .destroy = tee_tg_destroy,
149 .me = THIS_MODULE, 151 .me = THIS_MODULE,
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 80cb7babeb64..df7f1df00330 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -393,7 +393,8 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
 
 	rcu_read_lock();
 	indev = __in6_dev_get(skb->dev);
-	if (indev)
+	if (indev) {
+		read_lock_bh(&indev->lock);
 		list_for_each_entry(ifa, &indev->addr_list, if_list) {
 			if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
 				continue;
@@ -401,6 +402,8 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
 			laddr = &ifa->addr;
 			break;
 		}
+		read_unlock_bh(&indev->lock);
+	}
 	rcu_read_unlock();
 
 	return laddr ? laddr : daddr;
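The hunk wraps the walk over indev->addr_list in read_lock_bh()/read_unlock_bh() so the address list cannot change underneath the iteration. A rough userspace analogue, with a pthread rwlock standing in for the kernel's BH-disabling reader lock and illustrative names throughout:

#include <pthread.h>
#include <stdio.h>

struct addr_node {
	unsigned int addr;
	struct addr_node *next;
};

static struct addr_node a2 = { 0x0a000002, NULL };
static struct addr_node a1 = { 0x0a000001, &a2 };
static struct addr_node *addr_list = &a1;
static pthread_rwlock_t addr_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Readers take the lock shared; a writer adding or removing nodes would take
 * it exclusive, so the list cannot change mid-walk. */
static unsigned int pick_first_addr(void)
{
	struct addr_node *n;
	unsigned int found = 0;

	pthread_rwlock_rdlock(&addr_lock);
	for (n = addr_list; n; n = n->next) {
		found = n->addr;
		break;
	}
	pthread_rwlock_unlock(&addr_lock);
	return found;
}

int main(void)
{
	printf("first addr: 0x%08x\n", pick_first_addr());
	return 0;
}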
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 2dedaa23ab0a..38986a95216c 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -110,6 +110,7 @@ static struct xt_match bpf_mt_reg[] __read_mostly = {
110 .match = bpf_mt, 110 .match = bpf_mt,
111 .destroy = bpf_mt_destroy, 111 .destroy = bpf_mt_destroy,
112 .matchsize = sizeof(struct xt_bpf_info), 112 .matchsize = sizeof(struct xt_bpf_info),
113 .usersize = offsetof(struct xt_bpf_info, filter),
113 .me = THIS_MODULE, 114 .me = THIS_MODULE,
114 }, 115 },
115 { 116 {
@@ -120,6 +121,7 @@ static struct xt_match bpf_mt_reg[] __read_mostly = {
120 .match = bpf_mt_v1, 121 .match = bpf_mt_v1,
121 .destroy = bpf_mt_destroy_v1, 122 .destroy = bpf_mt_destroy_v1,
122 .matchsize = sizeof(struct xt_bpf_info_v1), 123 .matchsize = sizeof(struct xt_bpf_info_v1),
124 .usersize = offsetof(struct xt_bpf_info_v1, filter),
123 .me = THIS_MODULE, 125 .me = THIS_MODULE,
124 }, 126 },
125}; 127};
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index a086a914865f..1db1ce59079f 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -122,6 +122,7 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = {
122 .checkentry = cgroup_mt_check_v1, 122 .checkentry = cgroup_mt_check_v1,
123 .match = cgroup_mt_v1, 123 .match = cgroup_mt_v1,
124 .matchsize = sizeof(struct xt_cgroup_info_v1), 124 .matchsize = sizeof(struct xt_cgroup_info_v1),
125 .usersize = offsetof(struct xt_cgroup_info_v1, priv),
125 .destroy = cgroup_mt_destroy_v1, 126 .destroy = cgroup_mt_destroy_v1,
126 .me = THIS_MODULE, 127 .me = THIS_MODULE,
127 .hooks = (1 << NF_INET_LOCAL_OUT) | 128 .hooks = (1 << NF_INET_LOCAL_OUT) |
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 2aff2b7c4689..b8fd4ab762ed 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -218,7 +218,7 @@ count_tree(struct net *net, struct rb_root *root,
 		int diff;
 		bool addit;
 
-		rbconn = container_of(*rbnode, struct xt_connlimit_rb, node);
+		rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node);
 
 		parent = *rbnode;
 		diff = same_source_net(addr, mask, &rbconn->addr, family);
@@ -398,7 +398,7 @@ static void destroy_tree(struct rb_root *r)
 	struct rb_node *node;
 
 	while ((node = rb_first(r)) != NULL) {
-		rbconn = container_of(node, struct xt_connlimit_rb, node);
+		rbconn = rb_entry(node, struct xt_connlimit_rb, node);
 
 		rb_erase(node, r);
 
@@ -431,6 +431,7 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
431 .checkentry = connlimit_mt_check, 431 .checkentry = connlimit_mt_check,
432 .match = connlimit_mt, 432 .match = connlimit_mt,
433 .matchsize = sizeof(struct xt_connlimit_info), 433 .matchsize = sizeof(struct xt_connlimit_info),
434 .usersize = offsetof(struct xt_connlimit_info, data),
434 .destroy = connlimit_mt_destroy, 435 .destroy = connlimit_mt_destroy,
435 .me = THIS_MODULE, 436 .me = THIS_MODULE,
436}; 437};
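rb_entry() used above is just container_of() specialised for the embedded rb_node member: it turns a pointer to the node back into a pointer to the enclosing structure. A standalone sketch of the underlying macro (simplified; the kernel version adds type checking):

#include <stddef.h>
#include <stdio.h>

/* Simplified container_of: subtract the member offset from the member pointer. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rb_node_demo { int color; };	/* stand-in for struct rb_node */

struct conn_entry {
	unsigned int addr;
	struct rb_node_demo node;	/* embedded tree linkage */
};

int main(void)
{
	struct conn_entry e = { .addr = 0x0a000001 };
	struct rb_node_demo *n = &e.node;	/* what a tree walk hands back */
	struct conn_entry *back = container_of(n, struct conn_entry, node);

	printf("recovered addr: 0x%08x\n", back->addr);
	return 0;
}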
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 10063408141d..2a6dfe8b74d3 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -463,23 +463,16 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
 /* Precision saver. */
 static u64 user2credits(u64 user, int revision)
 {
-	if (revision == 1) {
-		/* If multiplying would overflow... */
-		if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
-			/* Divide first. */
-			return div64_u64(user, XT_HASHLIMIT_SCALE)
-				* HZ * CREDITS_PER_JIFFY_v1;
-
-		return div64_u64(user * HZ * CREDITS_PER_JIFFY_v1,
-				 XT_HASHLIMIT_SCALE);
-	} else {
-		if (user > 0xFFFFFFFFFFFFFFFFULL / (HZ*CREDITS_PER_JIFFY))
-			return div64_u64(user, XT_HASHLIMIT_SCALE_v2)
-				* HZ * CREDITS_PER_JIFFY;
-
-		return div64_u64(user * HZ * CREDITS_PER_JIFFY,
-				 XT_HASHLIMIT_SCALE_v2);
-	}
+	u64 scale = (revision == 1) ?
+		XT_HASHLIMIT_SCALE : XT_HASHLIMIT_SCALE_v2;
+	u64 cpj = (revision == 1) ?
+		CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY;
+
+	/* Avoid overflow: divide the constant operands first */
+	if (scale >= HZ * cpj)
+		return div64_u64(user, div64_u64(scale, HZ * cpj));
+
+	return user * div64_u64(HZ * cpj, scale);
 }
 
 static u32 user2credits_byte(u32 user)
@@ -838,6 +831,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
838 .family = NFPROTO_IPV4, 831 .family = NFPROTO_IPV4,
839 .match = hashlimit_mt_v1, 832 .match = hashlimit_mt_v1,
840 .matchsize = sizeof(struct xt_hashlimit_mtinfo1), 833 .matchsize = sizeof(struct xt_hashlimit_mtinfo1),
834 .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo),
841 .checkentry = hashlimit_mt_check_v1, 835 .checkentry = hashlimit_mt_check_v1,
842 .destroy = hashlimit_mt_destroy_v1, 836 .destroy = hashlimit_mt_destroy_v1,
843 .me = THIS_MODULE, 837 .me = THIS_MODULE,
@@ -848,6 +842,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
848 .family = NFPROTO_IPV4, 842 .family = NFPROTO_IPV4,
849 .match = hashlimit_mt, 843 .match = hashlimit_mt,
850 .matchsize = sizeof(struct xt_hashlimit_mtinfo2), 844 .matchsize = sizeof(struct xt_hashlimit_mtinfo2),
845 .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo),
851 .checkentry = hashlimit_mt_check, 846 .checkentry = hashlimit_mt_check,
852 .destroy = hashlimit_mt_destroy, 847 .destroy = hashlimit_mt_destroy,
853 .me = THIS_MODULE, 848 .me = THIS_MODULE,
@@ -859,6 +854,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
859 .family = NFPROTO_IPV6, 854 .family = NFPROTO_IPV6,
860 .match = hashlimit_mt_v1, 855 .match = hashlimit_mt_v1,
861 .matchsize = sizeof(struct xt_hashlimit_mtinfo1), 856 .matchsize = sizeof(struct xt_hashlimit_mtinfo1),
857 .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo),
862 .checkentry = hashlimit_mt_check_v1, 858 .checkentry = hashlimit_mt_check_v1,
863 .destroy = hashlimit_mt_destroy_v1, 859 .destroy = hashlimit_mt_destroy_v1,
864 .me = THIS_MODULE, 860 .me = THIS_MODULE,
@@ -869,6 +865,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
869 .family = NFPROTO_IPV6, 865 .family = NFPROTO_IPV6,
870 .match = hashlimit_mt, 866 .match = hashlimit_mt,
871 .matchsize = sizeof(struct xt_hashlimit_mtinfo2), 867 .matchsize = sizeof(struct xt_hashlimit_mtinfo2),
868 .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo),
872 .checkentry = hashlimit_mt_check, 869 .checkentry = hashlimit_mt_check,
873 .destroy = hashlimit_mt_destroy, 870 .destroy = hashlimit_mt_destroy,
874 .me = THIS_MODULE, 871 .me = THIS_MODULE,
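The reworked user2credits() earlier in this file never forms user * HZ * cpj directly: when the scale constant dominates it divides the constants first, otherwise it pre-divides HZ * cpj by the scale, trading a little precision for overflow safety. A standalone check of that identity with illustrative constants (not the kernel's XT_HASHLIMIT_SCALE values):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Compute user * mult / scale without overflowing uint64_t, by folding the
 * constant division first; mirrors the reworked user2credits() logic. */
static uint64_t scale_credits(uint64_t user, uint64_t mult, uint64_t scale)
{
	if (scale >= mult)		/* large scale: shrink user first */
		return user / (scale / mult);
	return user * (mult / scale);	/* large mult: pre-divide the constant */
}

int main(void)
{
	/* First branch: scale (1000000) >= mult (32000). */
	printf("%" PRIu64 "\n", scale_credits(5000000000ULL, 32000, 1000000));
	/* Second branch: mult (1000000) > scale (250). */
	printf("%" PRIu64 "\n", scale_credits(123456, 1000000, 250));
	return 0;
}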
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index bef850596558..dab962df1787 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -192,6 +192,8 @@ static struct xt_match limit_mt_reg __read_mostly = {
192 .compatsize = sizeof(struct compat_xt_rateinfo), 192 .compatsize = sizeof(struct compat_xt_rateinfo),
193 .compat_from_user = limit_mt_compat_from_user, 193 .compat_from_user = limit_mt_compat_from_user,
194 .compat_to_user = limit_mt_compat_to_user, 194 .compat_to_user = limit_mt_compat_to_user,
195#else
196 .usersize = offsetof(struct xt_rateinfo, prev),
195#endif 197#endif
196 .me = THIS_MODULE, 198 .me = THIS_MODULE,
197}; 199};
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 16477df45b3b..3d705c688a27 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -13,6 +13,8 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/cred.h>
17
16#include <net/sock.h> 18#include <net/sock.h>
17#include <net/inet_sock.h> 19#include <net/inet_sock.h>
18#include <linux/netfilter/x_tables.h> 20#include <linux/netfilter/x_tables.h>
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 57efb703ff18..1ef99151b3ba 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -33,8 +33,7 @@ pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	else if (xt_family(par) == NFPROTO_IPV4 &&
 		 ipv4_is_multicast(ip_hdr(skb)->daddr))
 		type = PACKET_MULTICAST;
-	else if (xt_family(par) == NFPROTO_IPV6 &&
-		 ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
+	else if (xt_family(par) == NFPROTO_IPV6)
 		type = PACKET_MULTICAST;
 	else
 		type = PACKET_BROADCAST;
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 44c8eb4c9d66..10d61a6eed71 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -73,6 +73,7 @@ static struct xt_match quota_mt_reg __read_mostly = {
73 .checkentry = quota_mt_check, 73 .checkentry = quota_mt_check,
74 .destroy = quota_mt_destroy, 74 .destroy = quota_mt_destroy,
75 .matchsize = sizeof(struct xt_quota_info), 75 .matchsize = sizeof(struct xt_quota_info),
76 .usersize = offsetof(struct xt_quota_info, master),
76 .me = THIS_MODULE, 77 .me = THIS_MODULE,
77}; 78};
78 79
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 1db02f6fca54..755d2f6693a2 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -133,6 +133,7 @@ static struct xt_match xt_rateest_mt_reg __read_mostly = {
133 .checkentry = xt_rateest_mt_checkentry, 133 .checkentry = xt_rateest_mt_checkentry,
134 .destroy = xt_rateest_mt_destroy, 134 .destroy = xt_rateest_mt_destroy,
135 .matchsize = sizeof(struct xt_rateest_match_info), 135 .matchsize = sizeof(struct xt_rateest_match_info),
136 .usersize = offsetof(struct xt_rateest_match_info, est1),
136 .me = THIS_MODULE, 137 .me = THIS_MODULE,
137}; 138};
138 139
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 0bc3460319c8..423293ee57c2 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -77,6 +77,7 @@ static struct xt_match xt_string_mt_reg __read_mostly = {
77 .match = string_mt, 77 .match = string_mt,
78 .destroy = string_mt_destroy, 78 .destroy = string_mt_destroy,
79 .matchsize = sizeof(struct xt_string_info), 79 .matchsize = sizeof(struct xt_string_info),
80 .usersize = offsetof(struct xt_string_info, config),
80 .me = THIS_MODULE, 81 .me = THIS_MODULE,
81}; 82};
82 83
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 161b628ab2b0..596eaff66649 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -96,6 +96,44 @@ EXPORT_SYMBOL_GPL(nl_table);
96 96
97static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); 97static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
98 98
99static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS];
100
101static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = {
102 "nlk_cb_mutex-ROUTE",
103 "nlk_cb_mutex-1",
104 "nlk_cb_mutex-USERSOCK",
105 "nlk_cb_mutex-FIREWALL",
106 "nlk_cb_mutex-SOCK_DIAG",
107 "nlk_cb_mutex-NFLOG",
108 "nlk_cb_mutex-XFRM",
109 "nlk_cb_mutex-SELINUX",
110 "nlk_cb_mutex-ISCSI",
111 "nlk_cb_mutex-AUDIT",
112 "nlk_cb_mutex-FIB_LOOKUP",
113 "nlk_cb_mutex-CONNECTOR",
114 "nlk_cb_mutex-NETFILTER",
115 "nlk_cb_mutex-IP6_FW",
116 "nlk_cb_mutex-DNRTMSG",
117 "nlk_cb_mutex-KOBJECT_UEVENT",
118 "nlk_cb_mutex-GENERIC",
119 "nlk_cb_mutex-17",
120 "nlk_cb_mutex-SCSITRANSPORT",
121 "nlk_cb_mutex-ECRYPTFS",
122 "nlk_cb_mutex-RDMA",
123 "nlk_cb_mutex-CRYPTO",
124 "nlk_cb_mutex-SMC",
125 "nlk_cb_mutex-23",
126 "nlk_cb_mutex-24",
127 "nlk_cb_mutex-25",
128 "nlk_cb_mutex-26",
129 "nlk_cb_mutex-27",
130 "nlk_cb_mutex-28",
131 "nlk_cb_mutex-29",
132 "nlk_cb_mutex-30",
133 "nlk_cb_mutex-31",
134 "nlk_cb_mutex-MAX_LINKS"
135};
136
99static int netlink_dump(struct sock *sk); 137static int netlink_dump(struct sock *sk);
100static void netlink_skb_destructor(struct sk_buff *skb); 138static void netlink_skb_destructor(struct sk_buff *skb);
101 139
@@ -585,6 +623,9 @@ static int __netlink_create(struct net *net, struct socket *sock,
585 } else { 623 } else {
586 nlk->cb_mutex = &nlk->cb_def_mutex; 624 nlk->cb_mutex = &nlk->cb_def_mutex;
587 mutex_init(nlk->cb_mutex); 625 mutex_init(nlk->cb_mutex);
626 lockdep_set_class_and_name(nlk->cb_mutex,
627 nlk_cb_mutex_keys + protocol,
628 nlk_cb_mutex_key_strings[protocol]);
588 } 629 }
589 init_waitqueue_head(&nlk->wait); 630 init_waitqueue_head(&nlk->wait);
590 631
@@ -1210,9 +1251,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 		skb = nskb;
 	}
 
-	if (!pskb_expand_head(skb, 0, -delta, allocation))
-		skb->truesize -= delta;
-
+	pskb_expand_head(skb, 0, -delta,
+			 (allocation & ~__GFP_DIRECT_RECLAIM) |
+			 __GFP_NOWARN | __GFP_NORETRY);
 	return skb;
 }
 
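The per-protocol lockdep keys and names added above let lockdep treat each netlink family's cb_mutex as its own class, so a dependency recorded against, say, the GENERIC socket does not produce false positives against ROUTE. A loose userspace analogue of "one named lock per protocol number"; lockdep itself has no userspace equivalent, this only mirrors the indexing scheme:

#include <pthread.h>
#include <stdio.h>

#define MAX_LINKS 32

static pthread_mutex_t cb_mutex[MAX_LINKS];
static const char *cb_mutex_name[MAX_LINKS] = {
	[0]  = "cb_mutex-ROUTE",
	[16] = "cb_mutex-GENERIC",
	/* remaining slots fall back to a generic name below */
};

/* Initialise one lock per protocol, each with its own identity. */
static void init_cb_mutexes(void)
{
	for (int proto = 0; proto < MAX_LINKS; proto++) {
		pthread_mutex_init(&cb_mutex[proto], NULL);
		if (!cb_mutex_name[proto])
			cb_mutex_name[proto] = "cb_mutex-other";
	}
}

int main(void)
{
	init_cb_mutexes();
	pthread_mutex_lock(&cb_mutex[16]);
	printf("holding %s\n", cb_mutex_name[16]);
	pthread_mutex_unlock(&cb_mutex[16]);
	return 0;
}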
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index fb6e10fdb217..92e0981f7404 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -783,8 +783,10 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
 
 		if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid,
 				   cb->nlh->nlmsg_seq, NLM_F_MULTI,
-				   skb, CTRL_CMD_NEWFAMILY) < 0)
+				   skb, CTRL_CMD_NEWFAMILY) < 0) {
+			n--;
 			break;
+		}
 	}
 
 	cb->args[0] = n;
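The genetlink fix above decrements n when ctrl_fill_info() fails for lack of skb space, so the next dump pass re-emits the entry that did not fit instead of silently skipping it. A small sketch of that resume-cursor pattern over a plain array, with a hypothetical fill callback standing in for ctrl_fill_info():

#include <stdbool.h>
#include <stdio.h>

#define NENTRIES 5

/* Pretend the output buffer only has room for two entries per pass. */
static bool fill_entry(int idx, int room_left)
{
	if (room_left <= 0)
		return false;		/* -EMSGSIZE in the real code */
	printf("  emitted entry %d\n", idx);
	return true;
}

/* One dump pass: skips the first n_done entries, emits while there is room,
 * and returns the new resume index.  A failed fill steps the counter back so
 * the same entry is retried on the next pass. */
static int dump_pass(int n_done)
{
	int room = 2;
	int n = 0;

	for (int idx = 0; idx < NENTRIES; idx++) {
		if (++n <= n_done)
			continue;	/* already emitted in an earlier pass */
		if (!fill_entry(idx, room--)) {
			n--;		/* did not fit: resume here next time */
			break;
		}
	}
	return n;
}

int main(void)
{
	int cursor = 0;

	while (cursor < NENTRIES) {
		printf("pass (cursor=%d):\n", cursor);
		cursor = dump_pass(cursor);
	}
	return 0;
}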
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index ed212ffc1d9d..ebf16f7f9089 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -17,7 +17,7 @@
 #include <linux/in.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/sockios.h>
@@ -765,7 +765,8 @@ out_release:
 	return err;
 }
 
-static int nr_accept(struct socket *sock, struct socket *newsock, int flags)
+static int nr_accept(struct socket *sock, struct socket *newsock, int flags,
+		     bool kern)
 {
 	struct sk_buff *skb;
 	struct sock *newsk;
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index b9edf5fae6ae..2ffb18e73df6 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -21,6 +21,7 @@
21#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/nfc.h> 23#include <linux/nfc.h>
24#include <linux/sched/signal.h>
24 25
25#include "nfc.h" 26#include "nfc.h"
26#include "llcp.h" 27#include "llcp.h"
@@ -440,7 +441,7 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *parent,
440} 441}
441 442
442static int llcp_sock_accept(struct socket *sock, struct socket *newsock, 443static int llcp_sock_accept(struct socket *sock, struct socket *newsock,
443 int flags) 444 int flags, bool kern)
444{ 445{
445 DECLARE_WAITQUEUE(wait, current); 446 DECLARE_WAITQUEUE(wait, current);
446 struct sock *sk = sock->sk, *new_sk; 447 struct sock *sk = sock->sk, *new_sk;
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 514f7bcf7c63..c82301ce3fff 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -796,9 +796,8 @@ static void ovs_fragment(struct net *net, struct vport *vport,
 		unsigned long orig_dst;
 		struct rt6_info ovs_rt;
 
-		if (!v6ops) {
+		if (!v6ops)
 			goto err;
-		}
 
 		prepare_frag(vport, skb, orig_network_offset,
 			     ovs_key_mac_proto(key));
@@ -1074,6 +1073,8 @@ static int execute_masked_set_action(struct sk_buff *skb,
 	case OVS_KEY_ATTR_CT_ZONE:
 	case OVS_KEY_ATTR_CT_MARK:
 	case OVS_KEY_ATTR_CT_LABELS:
+	case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4:
+	case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6:
 		err = -EINVAL;
 		break;
 	}
@@ -1141,12 +1142,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 			      struct sw_flow_key *key,
 			      const struct nlattr *attr, int len)
 {
-	/* Every output action needs a separate clone of 'skb', but the common
-	 * case is just a single output action, so that doing a clone and
-	 * then freeing the original skbuff is wasteful.  So the following code
-	 * is slightly obscure just to avoid that.
-	 */
-	int prev_port = -1;
 	const struct nlattr *a;
 	int rem;
 
@@ -1154,20 +1149,28 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 	     a = nla_next(a, &rem)) {
 		int err = 0;
 
-		if (unlikely(prev_port != -1)) {
-			struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
-
-			if (out_skb)
-				do_output(dp, out_skb, prev_port, key);
-
+		switch (nla_type(a)) {
+		case OVS_ACTION_ATTR_OUTPUT: {
+			int port = nla_get_u32(a);
+			struct sk_buff *clone;
+
+			/* Every output action needs a separate clone
+			 * of 'skb', In case the output action is the
+			 * last action, cloning can be avoided.
+			 */
+			if (nla_is_last(a, rem)) {
+				do_output(dp, skb, port, key);
+				/* 'skb' has been used for output.
+				 */
+				return 0;
+			}
+
+			clone = skb_clone(skb, GFP_ATOMIC);
+			if (clone)
+				do_output(dp, clone, port, key);
 			OVS_CB(skb)->cutlen = 0;
-			prev_port = -1;
-		}
-
-		switch (nla_type(a)) {
-		case OVS_ACTION_ATTR_OUTPUT:
-			prev_port = nla_get_u32(a);
 			break;
+		}
 
 		case OVS_ACTION_ATTR_TRUNC: {
 			struct ovs_action_trunc *trunc = nla_data(a);
@@ -1257,11 +1260,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		}
 	}
 
-	if (prev_port != -1)
-		do_output(dp, skb, prev_port, key);
-	else
-		consume_skb(skb);
-
+	consume_skb(skb);
 	return 0;
 }
 
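The rewritten output handling above clones the packet only when the OVS_ACTION_ATTR_OUTPUT is not the last action; the final output consumes the original skb, so a single-output action list never copies at all. A simplified sketch of that pattern over an array of port actions, with plain malloc'd buffers standing in for skbs:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pkt {
	char data[32];
};

static struct pkt *pkt_clone(const struct pkt *p)
{
	struct pkt *c = malloc(sizeof(*c));

	if (c)
		memcpy(c, p, sizeof(*c));
	return c;
}

static void do_output(struct pkt *p, int port)
{
	printf("port %d <- %s\n", port, p->data);
	free(p);			/* output consumes the packet */
}

/* Send to every port: clone for all but the last one, which takes ownership
 * of the original, so a single-port send never copies. */
static void execute_outputs(struct pkt *p, const int *ports, int n)
{
	for (int i = 0; i < n; i++) {
		if (i == n - 1) {	/* last action: hand over the original */
			do_output(p, ports[i]);
			return;
		}
		struct pkt *clone = pkt_clone(p);
		if (clone)
			do_output(clone, ports[i]);
	}
	free(p);			/* only reached when n == 0 */
}

int main(void)
{
	struct pkt *p = pkt_clone(&(struct pkt){ .data = "hello" });
	int ports[] = { 1, 2, 3 };

	if (p)
		execute_outputs(p, ports, 3);
	return 0;
}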
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 54253ea5976e..7b2c2fce408a 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -65,6 +65,7 @@ struct ovs_conntrack_info {
65 struct nf_conn *ct; 65 struct nf_conn *ct;
66 u8 commit : 1; 66 u8 commit : 1;
67 u8 nat : 3; /* enum ovs_ct_nat */ 67 u8 nat : 3; /* enum ovs_ct_nat */
68 u8 force : 1;
68 u16 family; 69 u16 family;
69 struct md_mark mark; 70 struct md_mark mark;
70 struct md_labels labels; 71 struct md_labels labels;
@@ -73,6 +74,8 @@ struct ovs_conntrack_info {
73#endif 74#endif
74}; 75};
75 76
77static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
78
76static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); 79static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
77 80
78static u16 key_to_nfproto(const struct sw_flow_key *key) 81static u16 key_to_nfproto(const struct sw_flow_key *key)
@@ -129,21 +132,33 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct)
129#endif 132#endif
130} 133}
131 134
135/* Guard against conntrack labels max size shrinking below 128 bits. */
136#if NF_CT_LABELS_MAX_SIZE < 16
137#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes
138#endif
139
132static void ovs_ct_get_labels(const struct nf_conn *ct, 140static void ovs_ct_get_labels(const struct nf_conn *ct,
133 struct ovs_key_ct_labels *labels) 141 struct ovs_key_ct_labels *labels)
134{ 142{
135 struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; 143 struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
136 144
137 if (cl) { 145 if (cl)
138 size_t len = sizeof(cl->bits); 146 memcpy(labels, cl->bits, OVS_CT_LABELS_LEN);
147 else
148 memset(labels, 0, OVS_CT_LABELS_LEN);
149}
139 150
140 if (len > OVS_CT_LABELS_LEN) 151static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key,
141 len = OVS_CT_LABELS_LEN; 152 const struct nf_conntrack_tuple *orig,
142 else if (len < OVS_CT_LABELS_LEN) 153 u8 icmp_proto)
143 memset(labels, 0, OVS_CT_LABELS_LEN); 154{
144 memcpy(labels, cl->bits, len); 155 key->ct_orig_proto = orig->dst.protonum;
156 if (orig->dst.protonum == icmp_proto) {
157 key->ct.orig_tp.src = htons(orig->dst.u.icmp.type);
158 key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code);
145 } else { 159 } else {
146 memset(labels, 0, OVS_CT_LABELS_LEN); 160 key->ct.orig_tp.src = orig->src.u.all;
161 key->ct.orig_tp.dst = orig->dst.u.all;
147 } 162 }
148} 163}
149 164
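The #error guard above makes the fixed-length memcpy of OVS_CT_LABELS_LEN safe by failing the build if the conntrack label storage ever shrinks below 128 bits, so the runtime clamping it replaces is no longer needed. A minimal sketch of the same build-time assertion using C11 _Static_assert, with illustrative constants rather than the kernel's:

#include <stdio.h>
#include <string.h>

#define CT_LABELS_MAX_SIZE	16	/* storage provided by the tracker */
#define MY_LABELS_LEN		16	/* what this consumer copies out */

/* Fails at compile time if the source buffer can no longer hold a full copy,
 * which is what lets the copy below skip any runtime length clamping. */
_Static_assert(CT_LABELS_MAX_SIZE >= MY_LABELS_LEN,
	       "label storage shrank below 128 bits");

int main(void)
{
	unsigned char src[CT_LABELS_MAX_SIZE] = { 0xde, 0xad };
	unsigned char dst[MY_LABELS_LEN];

	memcpy(dst, src, MY_LABELS_LEN);	/* unconditionally safe */
	printf("%02x %02x\n", dst[0], dst[1]);
	return 0;
}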
@@ -151,13 +166,42 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
151 const struct nf_conntrack_zone *zone, 166 const struct nf_conntrack_zone *zone,
152 const struct nf_conn *ct) 167 const struct nf_conn *ct)
153{ 168{
154 key->ct.state = state; 169 key->ct_state = state;
155 key->ct.zone = zone->id; 170 key->ct_zone = zone->id;
156 key->ct.mark = ovs_ct_get_mark(ct); 171 key->ct.mark = ovs_ct_get_mark(ct);
157 ovs_ct_get_labels(ct, &key->ct.labels); 172 ovs_ct_get_labels(ct, &key->ct.labels);
173
174 if (ct) {
175 const struct nf_conntrack_tuple *orig;
176
177 /* Use the master if we have one. */
178 if (ct->master)
179 ct = ct->master;
180 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
181
182 /* IP version must match with the master connection. */
183 if (key->eth.type == htons(ETH_P_IP) &&
184 nf_ct_l3num(ct) == NFPROTO_IPV4) {
185 key->ipv4.ct_orig.src = orig->src.u3.ip;
186 key->ipv4.ct_orig.dst = orig->dst.u3.ip;
187 __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP);
188 return;
189 } else if (key->eth.type == htons(ETH_P_IPV6) &&
190 !sw_flow_key_is_nd(key) &&
191 nf_ct_l3num(ct) == NFPROTO_IPV6) {
192 key->ipv6.ct_orig.src = orig->src.u3.in6;
193 key->ipv6.ct_orig.dst = orig->dst.u3.in6;
194 __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP);
195 return;
196 }
197 }
198 /* Clear 'ct_orig_proto' to mark the non-existence of conntrack
199 * original direction key fields.
200 */
201 key->ct_orig_proto = 0;
158} 202}
159 203
160/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has 204/* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has
161 * previously sent the packet to conntrack via the ct action. If 205 * previously sent the packet to conntrack via the ct action. If
162 * 'keep_nat_flags' is true, the existing NAT flags retained, else they are 206 * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
163 * initialized from the connection status. 207 * initialized from the connection status.
@@ -184,7 +228,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
184 if (ct->master) 228 if (ct->master)
185 state |= OVS_CS_F_RELATED; 229 state |= OVS_CS_F_RELATED;
186 if (keep_nat_flags) { 230 if (keep_nat_flags) {
187 state |= key->ct.state & OVS_CS_F_NAT_MASK; 231 state |= key->ct_state & OVS_CS_F_NAT_MASK;
188 } else { 232 } else {
189 if (ct->status & IPS_SRC_NAT) 233 if (ct->status & IPS_SRC_NAT)
190 state |= OVS_CS_F_SRC_NAT; 234 state |= OVS_CS_F_SRC_NAT;
@@ -208,44 +252,69 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
208 ovs_ct_update_key(skb, NULL, key, false, false); 252 ovs_ct_update_key(skb, NULL, key, false, false);
209} 253}
210 254
211int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) 255#define IN6_ADDR_INITIALIZER(ADDR) \
256 { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \
257 (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] }
258
259int ovs_ct_put_key(const struct sw_flow_key *swkey,
260 const struct sw_flow_key *output, struct sk_buff *skb)
212{ 261{
213 if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) 262 if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state))
214 return -EMSGSIZE; 263 return -EMSGSIZE;
215 264
216 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && 265 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
217 nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) 266 nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone))
218 return -EMSGSIZE; 267 return -EMSGSIZE;
219 268
220 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && 269 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
221 nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) 270 nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark))
222 return -EMSGSIZE; 271 return -EMSGSIZE;
223 272
224 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && 273 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
225 nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels), 274 nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels),
226 &key->ct.labels)) 275 &output->ct.labels))
227 return -EMSGSIZE; 276 return -EMSGSIZE;
228 277
278 if (swkey->ct_orig_proto) {
279 if (swkey->eth.type == htons(ETH_P_IP)) {
280 struct ovs_key_ct_tuple_ipv4 orig = {
281 output->ipv4.ct_orig.src,
282 output->ipv4.ct_orig.dst,
283 output->ct.orig_tp.src,
284 output->ct.orig_tp.dst,
285 output->ct_orig_proto,
286 };
287 if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
288 sizeof(orig), &orig))
289 return -EMSGSIZE;
290 } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
291 struct ovs_key_ct_tuple_ipv6 orig = {
292 IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src),
293 IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
294 output->ct.orig_tp.src,
295 output->ct.orig_tp.dst,
296 output->ct_orig_proto,
297 };
298 if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
299 sizeof(orig), &orig))
300 return -EMSGSIZE;
301 }
302 }
303
229 return 0; 304 return 0;
230} 305}
231 306
232static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, 307static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
233 u32 ct_mark, u32 mask) 308 u32 ct_mark, u32 mask)
234{ 309{
235#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) 310#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
236 enum ip_conntrack_info ctinfo;
237 struct nf_conn *ct;
238 u32 new_mark; 311 u32 new_mark;
239 312
240 /* The connection could be invalid, in which case set_mark is no-op. */
241 ct = nf_ct_get(skb, &ctinfo);
242 if (!ct)
243 return 0;
244
245 new_mark = ct_mark | (ct->mark & ~(mask)); 313 new_mark = ct_mark | (ct->mark & ~(mask));
246 if (ct->mark != new_mark) { 314 if (ct->mark != new_mark) {
247 ct->mark = new_mark; 315 ct->mark = new_mark;
248 nf_conntrack_event_cache(IPCT_MARK, ct); 316 if (nf_ct_is_confirmed(ct))
317 nf_conntrack_event_cache(IPCT_MARK, ct);
249 key->ct.mark = new_mark; 318 key->ct.mark = new_mark;
250 } 319 }
251 320
@@ -255,34 +324,83 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
255#endif 324#endif
256} 325}
257 326
258static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, 327static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct)
259 const struct ovs_key_ct_labels *labels,
260 const struct ovs_key_ct_labels *mask)
261{ 328{
262 enum ip_conntrack_info ctinfo;
263 struct nf_conn_labels *cl; 329 struct nf_conn_labels *cl;
264 struct nf_conn *ct;
265 int err;
266
267 /* The connection could be invalid, in which case set_label is no-op.*/
268 ct = nf_ct_get(skb, &ctinfo);
269 if (!ct)
270 return 0;
271 330
272 cl = nf_ct_labels_find(ct); 331 cl = nf_ct_labels_find(ct);
273 if (!cl) { 332 if (!cl) {
274 nf_ct_labels_ext_add(ct); 333 nf_ct_labels_ext_add(ct);
275 cl = nf_ct_labels_find(ct); 334 cl = nf_ct_labels_find(ct);
276 } 335 }
277 if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN) 336
337 return cl;
338}
339
340/* Initialize labels for a new, yet to be committed conntrack entry. Note that
341 * since the new connection is not yet confirmed, and thus no-one else has
342 * access to it's labels, we simply write them over.
343 */
344static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
345 const struct ovs_key_ct_labels *labels,
346 const struct ovs_key_ct_labels *mask)
347{
348 struct nf_conn_labels *cl, *master_cl;
349 bool have_mask = labels_nonzero(mask);
350
351 /* Inherit master's labels to the related connection? */
352 master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL;
353
354 if (!master_cl && !have_mask)
355 return 0; /* Nothing to do. */
356
357 cl = ovs_ct_get_conn_labels(ct);
358 if (!cl)
359 return -ENOSPC;
360
361 /* Inherit the master's labels, if any. */
362 if (master_cl)
363 *cl = *master_cl;
364
365 if (have_mask) {
366 u32 *dst = (u32 *)cl->bits;
367 int i;
368
369 for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
370 dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
371 (labels->ct_labels_32[i]
372 & mask->ct_labels_32[i]);
373 }
374
375 /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the
376 * IPCT_LABEL bit it set in the event cache.
377 */
378 nf_conntrack_event_cache(IPCT_LABEL, ct);
379
380 memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
381
382 return 0;
383}
384
385static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
386 const struct ovs_key_ct_labels *labels,
387 const struct ovs_key_ct_labels *mask)
388{
389 struct nf_conn_labels *cl;
390 int err;
391
392 cl = ovs_ct_get_conn_labels(ct);
393 if (!cl)
278 return -ENOSPC; 394 return -ENOSPC;
279 395
280 err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, 396 err = nf_connlabels_replace(ct, labels->ct_labels_32,
281 OVS_CT_LABELS_LEN / sizeof(u32)); 397 mask->ct_labels_32,
398 OVS_CT_LABELS_LEN_32);
282 if (err) 399 if (err)
283 return err; 400 return err;
284 401
285 ovs_ct_get_labels(ct, &key->ct.labels); 402 memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
403
286 return 0; 404 return 0;
287} 405}
288 406
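ovs_ct_init_labels() above writes the new labels with the classic masked-update idiom, word by word: new = (old & ~mask) | (value & mask), so only bits selected by the mask change and labels inherited from the master connection survive everywhere else. A self-contained sketch of that loop over four 32-bit words, matching the 128-bit conntrack label width:

#include <stdint.h>
#include <stdio.h>

#define LABEL_WORDS 4	/* 4 x 32 bits = 128-bit label field */

/* Set only the bits selected by mask; every other bit keeps its old value. */
static void masked_update(uint32_t *dst, const uint32_t *value,
			  const uint32_t *mask)
{
	for (int i = 0; i < LABEL_WORDS; i++)
		dst[i] = (dst[i] & ~mask[i]) | (value[i] & mask[i]);
}

int main(void)
{
	uint32_t labels[LABEL_WORDS] = { 0xffffffff, 0x12345678, 0, 0 };
	uint32_t value[LABEL_WORDS]  = { 0x000000aa, 0, 0, 0xdead0000 };
	uint32_t mask[LABEL_WORDS]   = { 0x000000ff, 0, 0, 0xffff0000 };

	masked_update(labels, value, mask);
	for (int i = 0; i < LABEL_WORDS; i++)
		printf("word %d: 0x%08x\n", i, labels[i]);
	return 0;
}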
@@ -367,7 +485,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
 		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
 
-		skb_orphan(skb);
 		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
 		err = nf_ct_frag6_gather(net, skb, user);
 		if (err) {
@@ -421,16 +538,16 @@ ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
421 538
422/* Find an existing connection which this packet belongs to without 539/* Find an existing connection which this packet belongs to without
423 * re-attributing statistics or modifying the connection state. This allows an 540 * re-attributing statistics or modifying the connection state. This allows an
424 * skb->nfct lost due to an upcall to be recovered during actions execution. 541 * skb->_nfct lost due to an upcall to be recovered during actions execution.
425 * 542 *
426 * Must be called with rcu_read_lock. 543 * Must be called with rcu_read_lock.
427 * 544 *
428 * On success, populates skb->nfct and skb->nfctinfo, and returns the 545 * On success, populates skb->_nfct and returns the connection. Returns NULL
429 * connection. Returns NULL if there is no existing entry. 546 * if there is no existing entry.
430 */ 547 */
431static struct nf_conn * 548static struct nf_conn *
432ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, 549ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
433 u8 l3num, struct sk_buff *skb) 550 u8 l3num, struct sk_buff *skb, bool natted)
434{ 551{
435 struct nf_conntrack_l3proto *l3proto; 552 struct nf_conntrack_l3proto *l3proto;
436 struct nf_conntrack_l4proto *l4proto; 553 struct nf_conntrack_l4proto *l4proto;
@@ -453,6 +570,17 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
453 return NULL; 570 return NULL;
454 } 571 }
455 572
573 /* Must invert the tuple if skb has been transformed by NAT. */
574 if (natted) {
575 struct nf_conntrack_tuple inverse;
576
577 if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
578 pr_debug("ovs_ct_find_existing: Inversion failed!\n");
579 return NULL;
580 }
581 tuple = inverse;
582 }
583
456 /* look for tuple match */ 584 /* look for tuple match */
457 h = nf_conntrack_find_get(net, zone, &tuple); 585 h = nf_conntrack_find_get(net, zone, &tuple);
458 if (!h) 586 if (!h)
@@ -460,12 +588,18 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
460 588
461 ct = nf_ct_tuplehash_to_ctrack(h); 589 ct = nf_ct_tuplehash_to_ctrack(h);
462 590
463 skb->nfct = &ct->ct_general; 591 /* Inverted packet tuple matches the reverse direction conntrack tuple,
464 skb->nfctinfo = ovs_ct_get_info(h); 592 * select the other tuplehash to get the right 'ctinfo' bits for this
593 * packet.
594 */
595 if (natted)
596 h = &ct->tuplehash[!h->tuple.dst.dir];
597
598 nf_ct_set(skb, ct, ovs_ct_get_info(h));
465 return ct; 599 return ct;
466} 600}
467 601
468/* Determine whether skb->nfct is equal to the result of conntrack lookup. */ 602/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
469static bool skb_nfct_cached(struct net *net, 603static bool skb_nfct_cached(struct net *net,
470 const struct sw_flow_key *key, 604 const struct sw_flow_key *key,
471 const struct ovs_conntrack_info *info, 605 const struct ovs_conntrack_info *info,
@@ -476,14 +610,19 @@ static bool skb_nfct_cached(struct net *net,
476 610
477 ct = nf_ct_get(skb, &ctinfo); 611 ct = nf_ct_get(skb, &ctinfo);
478 /* If no ct, check if we have evidence that an existing conntrack entry 612 /* If no ct, check if we have evidence that an existing conntrack entry
479 * might be found for this skb. This happens when we lose a skb->nfct 613 * might be found for this skb. This happens when we lose a skb->_nfct
480 * due to an upcall. If the connection was not confirmed, it is not 614 * due to an upcall. If the connection was not confirmed, it is not
481 * cached and needs to be run through conntrack again. 615 * cached and needs to be run through conntrack again.
482 */ 616 */
483 if (!ct && key->ct.state & OVS_CS_F_TRACKED && 617 if (!ct && key->ct_state & OVS_CS_F_TRACKED &&
484 !(key->ct.state & OVS_CS_F_INVALID) && 618 !(key->ct_state & OVS_CS_F_INVALID) &&
485 key->ct.zone == info->zone.id) 619 key->ct_zone == info->zone.id) {
486 ct = ovs_ct_find_existing(net, &info->zone, info->family, skb); 620 ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
621 !!(key->ct_state
622 & OVS_CS_F_NAT_MASK));
623 if (ct)
624 nf_ct_get(skb, &ctinfo);
625 }
487 if (!ct) 626 if (!ct)
488 return false; 627 return false;
489 if (!net_eq(net, read_pnet(&ct->ct_net))) 628 if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -497,6 +636,18 @@ static bool skb_nfct_cached(struct net *net,
497 if (help && rcu_access_pointer(help->helper) != info->helper) 636 if (help && rcu_access_pointer(help->helper) != info->helper)
498 return false; 637 return false;
499 } 638 }
639 /* Force conntrack entry direction to the current packet? */
640 if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
641 /* Delete the conntrack entry if confirmed, else just release
642 * the reference.
643 */
644 if (nf_ct_is_confirmed(ct))
645 nf_ct_delete(ct, 0, 0);
646
647 nf_conntrack_put(&ct->ct_general);
648 nf_ct_set(skb, NULL, 0);
649 return false;
650 }
500 651
501 return true; 652 return true;
502} 653}
@@ -591,7 +742,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
591 if (maniptype == NF_NAT_MANIP_SRC) { 742 if (maniptype == NF_NAT_MANIP_SRC) {
592 __be16 src; 743 __be16 src;
593 744
594 key->ct.state |= OVS_CS_F_SRC_NAT; 745 key->ct_state |= OVS_CS_F_SRC_NAT;
595 if (key->eth.type == htons(ETH_P_IP)) 746 if (key->eth.type == htons(ETH_P_IP))
596 key->ipv4.addr.src = ip_hdr(skb)->saddr; 747 key->ipv4.addr.src = ip_hdr(skb)->saddr;
597 else if (key->eth.type == htons(ETH_P_IPV6)) 748 else if (key->eth.type == htons(ETH_P_IPV6))
@@ -613,7 +764,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
613 } else { 764 } else {
614 __be16 dst; 765 __be16 dst;
615 766
616 key->ct.state |= OVS_CS_F_DST_NAT; 767 key->ct_state |= OVS_CS_F_DST_NAT;
617 if (key->eth.type == htons(ETH_P_IP)) 768 if (key->eth.type == htons(ETH_P_IP))
618 key->ipv4.addr.dst = ip_hdr(skb)->daddr; 769 key->ipv4.addr.dst = ip_hdr(skb)->daddr;
619 else if (key->eth.type == htons(ETH_P_IPV6)) 770 else if (key->eth.type == htons(ETH_P_IPV6))
@@ -700,7 +851,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
700/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if 851/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
701 * not done already. Update key with new CT state after passing the packet 852 * not done already. Update key with new CT state after passing the packet
702 * through conntrack. 853 * through conntrack.
703 * Note that if the packet is deemed invalid by conntrack, skb->nfct will be 854 * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be
704 * set to NULL and 0 will be returned. 855 * set to NULL and 0 will be returned.
705 */ 856 */
706static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, 857static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
@@ -722,11 +873,10 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 
 	/* Associate skb with specified zone. */
 	if (tmpl) {
-		if (skb->nfct)
-			nf_conntrack_put(skb->nfct);
+		if (skb_nfct(skb))
+			nf_conntrack_put(skb_nfct(skb));
 		nf_conntrack_get(&tmpl->ct_general);
-		skb->nfct = &tmpl->ct_general;
-		skb->nfctinfo = IP_CT_NEW;
+		nf_ct_set(skb, tmpl, IP_CT_NEW);
 	}
 
 	err = nf_conntrack_in(net, info->family,
@@ -738,7 +888,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
738 * NAT after the nf_conntrack_in() call. We can actually clear 888 * NAT after the nf_conntrack_in() call. We can actually clear
739 * the whole state, as it will be re-initialized below. 889 * the whole state, as it will be re-initialized below.
740 */ 890 */
741 key->ct.state = 0; 891 key->ct_state = 0;
742 892
743 /* Update the key, but keep the NAT flags. */ 893 /* Update the key, but keep the NAT flags. */
744 ovs_ct_update_key(skb, info, key, true, true); 894 ovs_ct_update_key(skb, info, key, true, true);
@@ -754,9 +904,9 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
754 * 904 *
755 * NAT will be done only if the CT action has NAT, and only 905 * NAT will be done only if the CT action has NAT, and only
756 * once per packet (per zone), as guarded by the NAT bits in 906 * once per packet (per zone), as guarded by the NAT bits in
757 * the key->ct.state. 907 * the key->ct_state.
758 */ 908 */
759 if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) && 909 if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) &&
760 (nf_ct_is_confirmed(ct) || info->commit) && 910 (nf_ct_is_confirmed(ct) || info->commit) &&
761 ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) { 911 ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
762 return -EINVAL; 912 return -EINVAL;
@@ -820,7 +970,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
820 if (err) 970 if (err)
821 return err; 971 return err;
822 972
823 ct = (struct nf_conn *)skb->nfct; 973 ct = (struct nf_conn *)skb_nfct(skb);
824 if (ct) 974 if (ct)
825 nf_ct_deliver_cached_events(ct); 975 nf_ct_deliver_cached_events(ct);
826 } 976 }
@@ -832,8 +982,8 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
832{ 982{
833 size_t i; 983 size_t i;
834 984
835 for (i = 0; i < sizeof(*labels); i++) 985 for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
836 if (labels->ct_labels[i]) 986 if (labels->ct_labels_32[i])
837 return true; 987 return true;
838 988
839 return false; 989 return false;
@@ -844,24 +994,36 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
844 const struct ovs_conntrack_info *info, 994 const struct ovs_conntrack_info *info,
845 struct sk_buff *skb) 995 struct sk_buff *skb)
846{ 996{
997 enum ip_conntrack_info ctinfo;
998 struct nf_conn *ct;
847 int err; 999 int err;
848 1000
849 err = __ovs_ct_lookup(net, key, info, skb); 1001 err = __ovs_ct_lookup(net, key, info, skb);
850 if (err) 1002 if (err)
851 return err; 1003 return err;
852 1004
1005 /* The connection could be invalid, in which case this is a no-op.*/
1006 ct = nf_ct_get(skb, &ctinfo);
1007 if (!ct)
1008 return 0;
1009
853 /* Apply changes before confirming the connection so that the initial 1010 /* Apply changes before confirming the connection so that the initial
854 * conntrack NEW netlink event carries the values given in the CT 1011 * conntrack NEW netlink event carries the values given in the CT
855 * action. 1012 * action.
856 */ 1013 */
857 if (info->mark.mask) { 1014 if (info->mark.mask) {
858 err = ovs_ct_set_mark(skb, key, info->mark.value, 1015 err = ovs_ct_set_mark(ct, key, info->mark.value,
859 info->mark.mask); 1016 info->mark.mask);
860 if (err) 1017 if (err)
861 return err; 1018 return err;
862 } 1019 }
863 if (labels_nonzero(&info->labels.mask)) { 1020 if (!nf_ct_is_confirmed(ct)) {
864 err = ovs_ct_set_labels(skb, key, &info->labels.value, 1021 err = ovs_ct_init_labels(ct, key, &info->labels.value,
1022 &info->labels.mask);
1023 if (err)
1024 return err;
1025 } else if (labels_nonzero(&info->labels.mask)) {
1026 err = ovs_ct_set_labels(ct, key, &info->labels.value,
865 &info->labels.mask); 1027 &info->labels.mask);
866 if (err) 1028 if (err)
867 return err; 1029 return err;
@@ -1063,6 +1225,7 @@ static int parse_nat(const struct nlattr *attr,
1063 1225
1064static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { 1226static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
1065 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, 1227 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
1228 [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 },
1066 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), 1229 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
1067 .maxlen = sizeof(u16) }, 1230 .maxlen = sizeof(u16) },
1068 [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), 1231 [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark),
@@ -1102,6 +1265,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
1102 } 1265 }
1103 1266
1104 switch (type) { 1267 switch (type) {
1268 case OVS_CT_ATTR_FORCE_COMMIT:
1269 info->force = true;
1270 /* fall through. */
1105 case OVS_CT_ATTR_COMMIT: 1271 case OVS_CT_ATTR_COMMIT:
1106 info->commit = true; 1272 info->commit = true;
1107 break; 1273 break;
@@ -1328,7 +1494,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
1328 if (!start) 1494 if (!start)
1329 return -EMSGSIZE; 1495 return -EMSGSIZE;
1330 1496
1331 if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT)) 1497 if (ct_info->commit && nla_put_flag(skb, ct_info->force
1498 ? OVS_CT_ATTR_FORCE_COMMIT
1499 : OVS_CT_ATTR_COMMIT))
1332 return -EMSGSIZE; 1500 return -EMSGSIZE;
1333 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && 1501 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
1334 nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) 1502 nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 8f6230bd6183..bc7efd1867ab 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -32,7 +32,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
32 const struct ovs_conntrack_info *); 32 const struct ovs_conntrack_info *);
33 33
34void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); 34void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
35int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb); 35int ovs_ct_put_key(const struct sw_flow_key *swkey,
36 const struct sw_flow_key *output, struct sk_buff *skb);
36void ovs_ct_free_action(const struct nlattr *a); 37void ovs_ct_free_action(const struct nlattr *a);
37 38
38#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ 39#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
@@ -75,13 +76,18 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
75static inline void ovs_ct_fill_key(const struct sk_buff *skb, 76static inline void ovs_ct_fill_key(const struct sk_buff *skb,
76 struct sw_flow_key *key) 77 struct sw_flow_key *key)
77{ 78{
78 key->ct.state = 0; 79 key->ct_state = 0;
79 key->ct.zone = 0; 80 key->ct_zone = 0;
80 key->ct.mark = 0; 81 key->ct.mark = 0;
81 memset(&key->ct.labels, 0, sizeof(key->ct.labels)); 82 memset(&key->ct.labels, 0, sizeof(key->ct.labels));
83 /* Clear 'ct_orig_proto' to mark the non-existence of original
84 * direction key fields.
85 */
86 key->ct_orig_proto = 0;
82} 87}
83 88
84static inline int ovs_ct_put_key(const struct sw_flow_key *key, 89static inline int ovs_ct_put_key(const struct sw_flow_key *swkey,
90 const struct sw_flow_key *output,
85 struct sk_buff *skb) 91 struct sk_buff *skb)
86{ 92{
87 return 0; 93 return 0;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 2c0a00f7f1b7..3f76cb765e5b 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -527,7 +527,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
527 527
528 /* Link layer. */ 528 /* Link layer. */
529 clear_vlan(key); 529 clear_vlan(key);
530 if (key->mac_proto == MAC_PROTO_NONE) { 530 if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
531 if (unlikely(eth_type_vlan(skb->protocol))) 531 if (unlikely(eth_type_vlan(skb->protocol)))
532 return -EINVAL; 532 return -EINVAL;
533 533
@@ -745,7 +745,13 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
745 745
746int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) 746int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
747{ 747{
748 return key_extract(skb, key); 748 int res;
749
750 res = key_extract(skb, key);
751 if (!res)
752 key->mac_proto &= ~SW_FLOW_KEY_INVALID;
753
754 return res;
749} 755}
750 756
751static int key_extract_mac_proto(struct sk_buff *skb) 757static int key_extract_mac_proto(struct sk_buff *skb)
@@ -765,7 +771,7 @@ static int key_extract_mac_proto(struct sk_buff *skb)
765int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, 771int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
766 struct sk_buff *skb, struct sw_flow_key *key) 772 struct sk_buff *skb, struct sw_flow_key *key)
767{ 773{
768 int res; 774 int res, err;
769 775
770 /* Extract metadata from packet. */ 776 /* Extract metadata from packet. */
771 if (tun_info) { 777 if (tun_info) {
@@ -792,7 +798,6 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
792 key->phy.priority = skb->priority; 798 key->phy.priority = skb->priority;
793 key->phy.in_port = OVS_CB(skb)->input_vport->port_no; 799 key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
794 key->phy.skb_mark = skb->mark; 800 key->phy.skb_mark = skb->mark;
795 ovs_ct_fill_key(skb, key);
796 key->ovs_flow_hash = 0; 801 key->ovs_flow_hash = 0;
797 res = key_extract_mac_proto(skb); 802 res = key_extract_mac_proto(skb);
798 if (res < 0) 803 if (res < 0)
@@ -800,17 +805,26 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
800 key->mac_proto = res; 805 key->mac_proto = res;
801 key->recirc_id = 0; 806 key->recirc_id = 0;
802 807
803 return key_extract(skb, key); 808 err = key_extract(skb, key);
809 if (!err)
810 ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */
811 return err;
804} 812}
805 813
806int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, 814int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
807 struct sk_buff *skb, 815 struct sk_buff *skb,
808 struct sw_flow_key *key, bool log) 816 struct sw_flow_key *key, bool log)
809{ 817{
818 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
819 u64 attrs = 0;
810 int err; 820 int err;
811 821
822 err = parse_flow_nlattrs(attr, a, &attrs, log);
823 if (err)
824 return -EINVAL;
825
812 /* Extract metadata from netlink attributes. */ 826 /* Extract metadata from netlink attributes. */
813 err = ovs_nla_get_flow_metadata(net, attr, key, log); 827 err = ovs_nla_get_flow_metadata(net, a, attrs, key, log);
814 if (err) 828 if (err)
815 return err; 829 return err;
816 830
@@ -824,5 +838,21 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
824 */ 838 */
825 839
826 skb->protocol = key->eth.type; 840 skb->protocol = key->eth.type;
827 return key_extract(skb, key); 841 err = key_extract(skb, key);
842 if (err)
843 return err;
844
845 /* Check that we have conntrack original direction tuple metadata only
846 * for packets for which it makes sense. Otherwise the key may be
847 * corrupted due to overlapping key fields.
848 */
849 if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) &&
850 key->eth.type != htons(ETH_P_IP))
851 return -EINVAL;
852 if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) &&
853 (key->eth.type != htons(ETH_P_IPV6) ||
854 sw_flow_key_is_nd(key)))
855 return -EINVAL;
856
857 return 0;
828} 858}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index f61cae7f9030..a9bc1c875965 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2007-2014 Nicira, Inc. 2 * Copyright (c) 2007-2017 Nicira, Inc.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public 5 * modify it under the terms of version 2 of the GNU General Public
@@ -85,6 +85,11 @@ struct sw_flow_key {
85 struct vlan_head cvlan; 85 struct vlan_head cvlan;
86 __be16 type; /* Ethernet frame type. */ 86 __be16 type; /* Ethernet frame type. */
87 } eth; 87 } eth;
88 /* Filling a hole of two bytes. */
89 u8 ct_state;
90 u8 ct_orig_proto; /* CT original direction tuple IP
91 * protocol.
92 */
88 union { 93 union {
89 struct { 94 struct {
90 __be32 top_lse; /* top label stack entry */ 95 __be32 top_lse; /* top label stack entry */
@@ -96,6 +101,7 @@ struct sw_flow_key {
96 u8 frag; /* One of OVS_FRAG_TYPE_*. */ 101 u8 frag; /* One of OVS_FRAG_TYPE_*. */
97 } ip; 102 } ip;
98 }; 103 };
104 u16 ct_zone; /* Conntrack zone. */
99 struct { 105 struct {
100 __be16 src; /* TCP/UDP/SCTP source port. */ 106 __be16 src; /* TCP/UDP/SCTP source port. */
101 __be16 dst; /* TCP/UDP/SCTP destination port. */ 107 __be16 dst; /* TCP/UDP/SCTP destination port. */
@@ -107,10 +113,16 @@ struct sw_flow_key {
107 __be32 src; /* IP source address. */ 113 __be32 src; /* IP source address. */
108 __be32 dst; /* IP destination address. */ 114 __be32 dst; /* IP destination address. */
109 } addr; 115 } addr;
110 struct { 116 union {
111 u8 sha[ETH_ALEN]; /* ARP source hardware address. */ 117 struct {
112 u8 tha[ETH_ALEN]; /* ARP target hardware address. */ 118 __be32 src;
113 } arp; 119 __be32 dst;
120 } ct_orig; /* Conntrack original direction fields. */
121 struct {
122 u8 sha[ETH_ALEN]; /* ARP source hardware address. */
123 u8 tha[ETH_ALEN]; /* ARP target hardware address. */
124 } arp;
125 };
114 } ipv4; 126 } ipv4;
115 struct { 127 struct {
116 struct { 128 struct {
@@ -118,23 +130,40 @@ struct sw_flow_key {
118 struct in6_addr dst; /* IPv6 destination address. */ 130 struct in6_addr dst; /* IPv6 destination address. */
119 } addr; 131 } addr;
120 __be32 label; /* IPv6 flow label. */ 132 __be32 label; /* IPv6 flow label. */
121 struct { 133 union {
122 struct in6_addr target; /* ND target address. */ 134 struct {
123 u8 sll[ETH_ALEN]; /* ND source link layer address. */ 135 struct in6_addr src;
124 u8 tll[ETH_ALEN]; /* ND target link layer address. */ 136 struct in6_addr dst;
125 } nd; 137 } ct_orig; /* Conntrack original direction fields. */
138 struct {
139 struct in6_addr target; /* ND target address. */
140 u8 sll[ETH_ALEN]; /* ND source link layer address. */
141 u8 tll[ETH_ALEN]; /* ND target link layer address. */
142 } nd;
143 };
126 } ipv6; 144 } ipv6;
127 }; 145 };
128 struct { 146 struct {
129 /* Connection tracking fields. */ 147 /* Connection tracking fields not packed above. */
130 u16 zone; 148 struct {
149 __be16 src; /* CT orig tuple tp src port. */
150 __be16 dst; /* CT orig tuple tp dst port. */
151 } orig_tp;
131 u32 mark; 152 u32 mark;
132 u8 state;
133 struct ovs_key_ct_labels labels; 153 struct ovs_key_ct_labels labels;
134 } ct; 154 } ct;
135 155
136} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ 156} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
137 157
158static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key)
159{
160 return key->eth.type == htons(ETH_P_IPV6) &&
161 key->ip.proto == NEXTHDR_ICMP &&
162 key->tp.dst == 0 &&
163 (key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
164 key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT));
165}
166
138struct sw_flow_key_range { 167struct sw_flow_key_range {
139 unsigned short int start; 168 unsigned short int start;
140 unsigned short int end; 169 unsigned short int end;
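The flow key changes above overlay the conntrack original-direction addresses with the ARP/ND fields in a union: a flow is either an ARP/ND flow or one carrying an original-direction tuple, never both (match_validate() enforces that), so the overlap costs no extra key space. A tiny sketch of the space saving, with simplified stand-in fields rather than the real sw_flow_key:

#include <netinet/in.h>
#include <stdio.h>

/* Mutually exclusive per-flow data sharing one slot, as in sw_flow_key. */
struct demo_ipv6_key {
	struct in6_addr src;
	struct in6_addr dst;
	union {
		struct {
			struct in6_addr src;
			struct in6_addr dst;
		} ct_orig;			/* conntrack original tuple */
		struct {
			struct in6_addr target;	/* ND target */
			unsigned char sll[6];
			unsigned char tll[6];
		} nd;				/* neighbour discovery */
	};
};

int main(void)
{
	/* The union costs max(ct_orig, nd), not their sum. */
	printf("ct_orig=%zu nd=%zu union'd key=%zu\n",
	       sizeof(((struct demo_ipv6_key *)0)->ct_orig),
	       sizeof(((struct demo_ipv6_key *)0)->nd),
	       sizeof(struct demo_ipv6_key));
	return 0;
}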
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index c87d359b9b37..1105a838bab8 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -129,7 +129,9 @@ static bool match_validate(const struct sw_flow_match *match,
129 /* The following mask attributes allowed only if they 129 /* The following mask attributes allowed only if they
130 * pass the validation tests. */ 130 * pass the validation tests. */
131 mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) 131 mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
132 | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)
132 | (1 << OVS_KEY_ATTR_IPV6) 133 | (1 << OVS_KEY_ATTR_IPV6)
134 | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)
133 | (1 << OVS_KEY_ATTR_TCP) 135 | (1 << OVS_KEY_ATTR_TCP)
134 | (1 << OVS_KEY_ATTR_TCP_FLAGS) 136 | (1 << OVS_KEY_ATTR_TCP_FLAGS)
135 | (1 << OVS_KEY_ATTR_UDP) 137 | (1 << OVS_KEY_ATTR_UDP)
@@ -161,8 +163,10 @@ static bool match_validate(const struct sw_flow_match *match,
161 163
162 if (match->key->eth.type == htons(ETH_P_IP)) { 164 if (match->key->eth.type == htons(ETH_P_IP)) {
163 key_expected |= 1 << OVS_KEY_ATTR_IPV4; 165 key_expected |= 1 << OVS_KEY_ATTR_IPV4;
164 if (match->mask && (match->mask->key.eth.type == htons(0xffff))) 166 if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
165 mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; 167 mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
168 mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4;
169 }
166 170
167 if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { 171 if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
168 if (match->key->ip.proto == IPPROTO_UDP) { 172 if (match->key->ip.proto == IPPROTO_UDP) {
@@ -196,8 +200,10 @@ static bool match_validate(const struct sw_flow_match *match,
196 200
197 if (match->key->eth.type == htons(ETH_P_IPV6)) { 201 if (match->key->eth.type == htons(ETH_P_IPV6)) {
198 key_expected |= 1 << OVS_KEY_ATTR_IPV6; 202 key_expected |= 1 << OVS_KEY_ATTR_IPV6;
199 if (match->mask && (match->mask->key.eth.type == htons(0xffff))) 203 if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
200 mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; 204 mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
205 mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6;
206 }
201 207
202 if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { 208 if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
203 if (match->key->ip.proto == IPPROTO_UDP) { 209 if (match->key->ip.proto == IPPROTO_UDP) {
@@ -230,6 +236,12 @@ static bool match_validate(const struct sw_flow_match *match,
230 htons(NDISC_NEIGHBOUR_SOLICITATION) || 236 htons(NDISC_NEIGHBOUR_SOLICITATION) ||
231 match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { 237 match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
232 key_expected |= 1 << OVS_KEY_ATTR_ND; 238 key_expected |= 1 << OVS_KEY_ATTR_ND;
239 /* Original direction conntrack tuple
240 * uses the same space as the ND fields
241 * in the key, so both are not allowed
242 * at the same time.
243 */
244 mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
233 if (match->mask && (match->mask->key.tp.src == htons(0xff))) 245 if (match->mask && (match->mask->key.tp.src == htons(0xff)))
234 mask_allowed |= 1 << OVS_KEY_ATTR_ND; 246 mask_allowed |= 1 << OVS_KEY_ATTR_ND;
235 } 247 }
@@ -282,7 +294,7 @@ size_t ovs_key_attr_size(void)
282 /* Whenever adding new OVS_KEY_ FIELDS, we should consider 294 /* Whenever adding new OVS_KEY_ FIELDS, we should consider
283 * updating this function. 295 * updating this function.
284 */ 296 */
285 BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26); 297 BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
286 298
287 return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ 299 return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
288 + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ 300 + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */
@@ -295,6 +307,7 @@ size_t ovs_key_attr_size(void)
295 + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ 307 + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */
296 + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ 308 + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */
297 + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ 309 + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */
310 + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
298 + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ 311 + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
299 + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ 312 + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
300 + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ 313 + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */
@@ -355,6 +368,10 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
355 [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, 368 [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) },
356 [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, 369 [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) },
357 [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) }, 370 [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) },
371 [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = {
372 .len = sizeof(struct ovs_key_ct_tuple_ipv4) },
373 [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
374 .len = sizeof(struct ovs_key_ct_tuple_ipv6) },
358}; 375};
359 376
360static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) 377static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -430,9 +447,8 @@ static int parse_flow_mask_nlattrs(const struct nlattr *attr,
430 return __parse_flow_nlattrs(attr, a, attrsp, log, true); 447 return __parse_flow_nlattrs(attr, a, attrsp, log, true);
431} 448}
432 449
433static int parse_flow_nlattrs(const struct nlattr *attr, 450int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
434 const struct nlattr *a[], u64 *attrsp, 451 u64 *attrsp, bool log)
435 bool log)
436{ 452{
437 return __parse_flow_nlattrs(attr, a, attrsp, log, false); 453 return __parse_flow_nlattrs(attr, a, attrsp, log, false);
438} 454}
@@ -588,7 +604,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
588 ipv4 = true; 604 ipv4 = true;
589 break; 605 break;
590 case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: 606 case OVS_TUNNEL_KEY_ATTR_IPV6_SRC:
591 SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, 607 SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src,
592 nla_get_in6_addr(a), is_mask); 608 nla_get_in6_addr(a), is_mask);
593 ipv6 = true; 609 ipv6 = true;
594 break; 610 break;
@@ -649,6 +665,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
649 tun_flags |= TUNNEL_VXLAN_OPT; 665 tun_flags |= TUNNEL_VXLAN_OPT;
650 opts_type = type; 666 opts_type = type;
651 break; 667 break;
668 case OVS_TUNNEL_KEY_ATTR_PAD:
669 break;
652 default: 670 default:
653 OVS_NLERR(log, "Unknown IP tunnel attribute %d", 671 OVS_NLERR(log, "Unknown IP tunnel attribute %d",
654 type); 672 type);
@@ -1056,14 +1074,14 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
1056 return -EINVAL; 1074 return -EINVAL;
1057 } 1075 }
1058 1076
1059 SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); 1077 SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask);
1060 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); 1078 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE);
1061 } 1079 }
1062 if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && 1080 if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) &&
1063 ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { 1081 ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) {
1064 u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); 1082 u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]);
1065 1083
1066 SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); 1084 SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask);
1067 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); 1085 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE);
1068 } 1086 }
1069 if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && 1087 if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) &&
@@ -1082,6 +1100,34 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
1082 sizeof(*cl), is_mask); 1100 sizeof(*cl), is_mask);
1083 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS); 1101 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS);
1084 } 1102 }
1103 if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) {
1104 const struct ovs_key_ct_tuple_ipv4 *ct;
1105
1106 ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]);
1107
1108 SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask);
1109 SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask);
1110 SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
1111 SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
1112 SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask);
1113 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4);
1114 }
1115 if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) {
1116 const struct ovs_key_ct_tuple_ipv6 *ct;
1117
1118 ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]);
1119
1120 SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src,
1121 sizeof(match->key->ipv6.ct_orig.src),
1122 is_mask);
1123 SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst,
1124 sizeof(match->key->ipv6.ct_orig.dst),
1125 is_mask);
1126 SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
1127 SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
1128 SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask);
1129 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
1130 }
1085 1131
1086 /* For layer 3 packets the Ethernet type is provided 1132 /* For layer 3 packets the Ethernet type is provided
1087 * and treated as metadata but no MAC addresses are provided. 1133 * and treated as metadata but no MAC addresses are provided.
@@ -1493,9 +1539,12 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
1493 1539
1494/** 1540/**
1495 * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. 1541 * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
1496 * @key: Receives extracted in_port, priority, tun_key and skb_mark. 1542 * @net: Network namespace.
1497 * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute 1543 * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack
1498 * sequence. 1544 * metadata.
1545 * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink
1546 * attributes.
1547 * @attrs: Bit mask for the netlink attributes included in @a.
1499 * @log: Boolean to allow kernel error logging. Normally true, but when 1548 * @log: Boolean to allow kernel error logging. Normally true, but when
1500 * probing for feature compatibility this should be passed in as false to 1549 * probing for feature compatibility this should be passed in as false to
1501 * suppress unnecessary error logging. 1550 * suppress unnecessary error logging.
@@ -1504,25 +1553,26 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
1504 * take the same form accepted by flow_from_nlattrs(), but only enough of it to 1553 * take the same form accepted by flow_from_nlattrs(), but only enough of it to
1505 * get the metadata, that is, the parts of the flow key that cannot be 1554 * get the metadata, that is, the parts of the flow key that cannot be
1506 * extracted from the packet itself. 1555 * extracted from the packet itself.
1556 *
1557 * This must be called before the packet key fields are filled in 'key'.
1507 */ 1558 */
1508 1559
1509int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, 1560int ovs_nla_get_flow_metadata(struct net *net,
1510 struct sw_flow_key *key, 1561 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
1511 bool log) 1562 u64 attrs, struct sw_flow_key *key, bool log)
1512{ 1563{
1513 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
1514 struct sw_flow_match match; 1564 struct sw_flow_match match;
1515 u64 attrs = 0;
1516 int err;
1517
1518 err = parse_flow_nlattrs(attr, a, &attrs, log);
1519 if (err)
1520 return -EINVAL;
1521 1565
1522 memset(&match, 0, sizeof(match)); 1566 memset(&match, 0, sizeof(match));
1523 match.key = key; 1567 match.key = key;
1524 1568
1569 key->ct_state = 0;
1570 key->ct_zone = 0;
1571 key->ct_orig_proto = 0;
1525 memset(&key->ct, 0, sizeof(key->ct)); 1572 memset(&key->ct, 0, sizeof(key->ct));
1573 memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig));
1574 memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig));
1575
1526 key->phy.in_port = DP_MAX_PORTS; 1576 key->phy.in_port = DP_MAX_PORTS;
1527 1577
1528 return metadata_from_nlattrs(net, &match, &attrs, a, false, log); 1578 return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
@@ -1584,7 +1634,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
1584 if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) 1634 if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
1585 goto nla_put_failure; 1635 goto nla_put_failure;
1586 1636
1587 if (ovs_ct_put_key(output, skb)) 1637 if (ovs_ct_put_key(swkey, output, skb))
1588 goto nla_put_failure; 1638 goto nla_put_failure;
1589 1639
1590 if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) { 1640 if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) {
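
With the refactor in this file, ovs_nla_get_flow_metadata() no longer parses the key attributes itself; the caller now runs parse_flow_nlattrs() once and passes the resulting attribute array and bitmask in. A minimal sketch of the new call order (net, key_attr, log and the flow with an embedded key are assumed context; only the two calls and their signatures come from this diff):

	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
	u64 attrs = 0;
	int err;

	/* Parse once, then reuse the array for metadata extraction. */
	err = parse_flow_nlattrs(key_attr, a, &attrs, log);
	if (!err)
		err = ovs_nla_get_flow_metadata(net, a, attrs, &flow->key, log);
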
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 45f9769e5aac..929c665ac3aa 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -46,8 +46,11 @@ void ovs_match_init(struct sw_flow_match *match,
46 46
47int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, 47int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
48 int attr, bool is_mask, struct sk_buff *); 48 int attr, bool is_mask, struct sk_buff *);
49int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *, 49int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
50 struct sw_flow_key *, bool log); 50 u64 *attrsp, bool log);
51int ovs_nla_get_flow_metadata(struct net *net,
52 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
53 u64 attrs, struct sw_flow_key *key, bool log);
51 54
52int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); 55int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb);
53int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); 56int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb);
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index d5d6caecd072..89193a634da4 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -97,7 +97,7 @@ static void internal_dev_destructor(struct net_device *dev)
97 free_netdev(dev); 97 free_netdev(dev);
98} 98}
99 99
100static struct rtnl_link_stats64 * 100static void
101internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) 101internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
102{ 102{
103 int i; 103 int i;
@@ -125,8 +125,6 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
125 stats->tx_bytes += local_stats.tx_bytes; 125 stats->tx_bytes += local_stats.tx_bytes;
126 stats->tx_packets += local_stats.tx_packets; 126 stats->tx_packets += local_stats.tx_packets;
127 } 127 }
128
129 return stats;
130} 128}
131 129
132static void internal_set_rx_headroom(struct net_device *dev, int new_hr) 130static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
@@ -151,6 +149,8 @@ static void do_setup(struct net_device *netdev)
151{ 149{
152 ether_setup(netdev); 150 ether_setup(netdev);
153 151
152 netdev->max_mtu = ETH_MAX_MTU;
153
154 netdev->netdev_ops = &internal_dev_netdev_ops; 154 netdev->netdev_ops = &internal_dev_netdev_ops;
155 155
156 netdev->priv_flags &= ~IFF_TX_SKB_SHARING; 156 netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3d555c79a7b5..8489beff5c25 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -409,6 +409,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
409 flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 409 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
410 break; 410 break;
411 case TPACKET_V3: 411 case TPACKET_V3:
412 h.h3->tp_status = status;
413 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
414 break;
412 default: 415 default:
413 WARN(1, "TPACKET version not supported.\n"); 416 WARN(1, "TPACKET version not supported.\n");
414 BUG(); 417 BUG();
@@ -432,6 +435,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
432 flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 435 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
433 return h.h2->tp_status; 436 return h.h2->tp_status;
434 case TPACKET_V3: 437 case TPACKET_V3:
438 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
439 return h.h3->tp_status;
435 default: 440 default:
436 WARN(1, "TPACKET version not supported.\n"); 441 WARN(1, "TPACKET version not supported.\n");
437 BUG(); 442 BUG();
@@ -476,6 +481,9 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
476 h.h2->tp_nsec = ts.tv_nsec; 481 h.h2->tp_nsec = ts.tv_nsec;
477 break; 482 break;
478 case TPACKET_V3: 483 case TPACKET_V3:
484 h.h3->tp_sec = ts.tv_sec;
485 h.h3->tp_nsec = ts.tv_nsec;
486 break;
479 default: 487 default:
480 WARN(1, "TPACKET version not supported.\n"); 488 WARN(1, "TPACKET version not supported.\n");
481 BUG(); 489 BUG();
@@ -1497,6 +1505,8 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po)
1497 f->arr[f->num_members] = sk; 1505 f->arr[f->num_members] = sk;
1498 smp_wmb(); 1506 smp_wmb();
1499 f->num_members++; 1507 f->num_members++;
1508 if (f->num_members == 1)
1509 dev_add_pack(&f->prot_hook);
1500 spin_unlock(&f->lock); 1510 spin_unlock(&f->lock);
1501} 1511}
1502 1512
@@ -1513,6 +1523,8 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1513 BUG_ON(i >= f->num_members); 1523 BUG_ON(i >= f->num_members);
1514 f->arr[i] = f->arr[f->num_members - 1]; 1524 f->arr[i] = f->arr[f->num_members - 1];
1515 f->num_members--; 1525 f->num_members--;
1526 if (f->num_members == 0)
1527 __dev_remove_pack(&f->prot_hook);
1516 spin_unlock(&f->lock); 1528 spin_unlock(&f->lock);
1517} 1529}
1518 1530
@@ -1619,6 +1631,7 @@ static void fanout_release_data(struct packet_fanout *f)
1619 1631
1620static int fanout_add(struct sock *sk, u16 id, u16 type_flags) 1632static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1621{ 1633{
1634 struct packet_rollover *rollover = NULL;
1622 struct packet_sock *po = pkt_sk(sk); 1635 struct packet_sock *po = pkt_sk(sk);
1623 struct packet_fanout *f, *match; 1636 struct packet_fanout *f, *match;
1624 u8 type = type_flags & 0xff; 1637 u8 type = type_flags & 0xff;
@@ -1641,23 +1654,28 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1641 return -EINVAL; 1654 return -EINVAL;
1642 } 1655 }
1643 1656
1657 mutex_lock(&fanout_mutex);
1658
1659 err = -EINVAL;
1644 if (!po->running) 1660 if (!po->running)
1645 return -EINVAL; 1661 goto out;
1646 1662
1663 err = -EALREADY;
1647 if (po->fanout) 1664 if (po->fanout)
1648 return -EALREADY; 1665 goto out;
1649 1666
1650 if (type == PACKET_FANOUT_ROLLOVER || 1667 if (type == PACKET_FANOUT_ROLLOVER ||
1651 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { 1668 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1652 po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); 1669 err = -ENOMEM;
1653 if (!po->rollover) 1670 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1654 return -ENOMEM; 1671 if (!rollover)
1655 atomic_long_set(&po->rollover->num, 0); 1672 goto out;
1656 atomic_long_set(&po->rollover->num_huge, 0); 1673 atomic_long_set(&rollover->num, 0);
1657 atomic_long_set(&po->rollover->num_failed, 0); 1674 atomic_long_set(&rollover->num_huge, 0);
1675 atomic_long_set(&rollover->num_failed, 0);
1676 po->rollover = rollover;
1658 } 1677 }
1659 1678
1660 mutex_lock(&fanout_mutex);
1661 match = NULL; 1679 match = NULL;
1662 list_for_each_entry(f, &fanout_list, list) { 1680 list_for_each_entry(f, &fanout_list, list) {
1663 if (f->id == id && 1681 if (f->id == id &&
@@ -1687,7 +1705,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1687 match->prot_hook.func = packet_rcv_fanout; 1705 match->prot_hook.func = packet_rcv_fanout;
1688 match->prot_hook.af_packet_priv = match; 1706 match->prot_hook.af_packet_priv = match;
1689 match->prot_hook.id_match = match_fanout_group; 1707 match->prot_hook.id_match = match_fanout_group;
1690 dev_add_pack(&match->prot_hook);
1691 list_add(&match->list, &fanout_list); 1708 list_add(&match->list, &fanout_list);
1692 } 1709 }
1693 err = -EINVAL; 1710 err = -EINVAL;
@@ -1704,36 +1721,40 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1704 } 1721 }
1705 } 1722 }
1706out: 1723out:
1707 mutex_unlock(&fanout_mutex); 1724 if (err && rollover) {
1708 if (err) { 1725 kfree(rollover);
1709 kfree(po->rollover);
1710 po->rollover = NULL; 1726 po->rollover = NULL;
1711 } 1727 }
1728 mutex_unlock(&fanout_mutex);
1712 return err; 1729 return err;
1713} 1730}
1714 1731
1715static void fanout_release(struct sock *sk) 1732/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1733 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1734 * It is the responsibility of the caller to call fanout_release_data() and
1735 * free the returned packet_fanout (after synchronize_net())
1736 */
1737static struct packet_fanout *fanout_release(struct sock *sk)
1716{ 1738{
1717 struct packet_sock *po = pkt_sk(sk); 1739 struct packet_sock *po = pkt_sk(sk);
1718 struct packet_fanout *f; 1740 struct packet_fanout *f;
1719 1741
1742 mutex_lock(&fanout_mutex);
1720 f = po->fanout; 1743 f = po->fanout;
1721 if (!f) 1744 if (f) {
1722 return; 1745 po->fanout = NULL;
1723 1746
1724 mutex_lock(&fanout_mutex); 1747 if (atomic_dec_and_test(&f->sk_ref))
1725 po->fanout = NULL; 1748 list_del(&f->list);
1749 else
1750 f = NULL;
1726 1751
1727 if (atomic_dec_and_test(&f->sk_ref)) { 1752 if (po->rollover)
1728 list_del(&f->list); 1753 kfree_rcu(po->rollover, rcu);
1729 dev_remove_pack(&f->prot_hook);
1730 fanout_release_data(f);
1731 kfree(f);
1732 } 1754 }
1733 mutex_unlock(&fanout_mutex); 1755 mutex_unlock(&fanout_mutex);
1734 1756
1735 if (po->rollover) 1757 return f;
1736 kfree_rcu(po->rollover, rcu);
1737} 1758}
1738 1759
1739static bool packet_extra_vlan_len_allowed(const struct net_device *dev, 1760static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
@@ -2497,6 +2518,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
2497 ph.raw = frame; 2518 ph.raw = frame;
2498 2519
2499 switch (po->tp_version) { 2520 switch (po->tp_version) {
2521 case TPACKET_V3:
2522 if (ph.h3->tp_next_offset != 0) {
2523 pr_warn_once("variable sized slot not supported");
2524 return -EINVAL;
2525 }
2526 tp_len = ph.h3->tp_len;
2527 break;
2500 case TPACKET_V2: 2528 case TPACKET_V2:
2501 tp_len = ph.h2->tp_len; 2529 tp_len = ph.h2->tp_len;
2502 break; 2530 break;
@@ -2516,6 +2544,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
2516 off_max = po->tx_ring.frame_size - tp_len; 2544 off_max = po->tx_ring.frame_size - tp_len;
2517 if (po->sk.sk_type == SOCK_DGRAM) { 2545 if (po->sk.sk_type == SOCK_DGRAM) {
2518 switch (po->tp_version) { 2546 switch (po->tp_version) {
2547 case TPACKET_V3:
2548 off = ph.h3->tp_net;
2549 break;
2519 case TPACKET_V2: 2550 case TPACKET_V2:
2520 off = ph.h2->tp_net; 2551 off = ph.h2->tp_net;
2521 break; 2552 break;
@@ -2525,6 +2556,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
2525 } 2556 }
2526 } else { 2557 } else {
2527 switch (po->tp_version) { 2558 switch (po->tp_version) {
2559 case TPACKET_V3:
2560 off = ph.h3->tp_mac;
2561 break;
2528 case TPACKET_V2: 2562 case TPACKET_V2:
2529 off = ph.h2->tp_mac; 2563 off = ph.h2->tp_mac;
2530 break; 2564 break;
@@ -2755,7 +2789,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2755 struct virtio_net_hdr vnet_hdr = { 0 }; 2789 struct virtio_net_hdr vnet_hdr = { 0 };
2756 int offset = 0; 2790 int offset = 0;
2757 struct packet_sock *po = pkt_sk(sk); 2791 struct packet_sock *po = pkt_sk(sk);
2758 int hlen, tlen; 2792 int hlen, tlen, linear;
2759 int extra_len = 0; 2793 int extra_len = 0;
2760 2794
2761 /* 2795 /*
@@ -2816,8 +2850,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2816 err = -ENOBUFS; 2850 err = -ENOBUFS;
2817 hlen = LL_RESERVED_SPACE(dev); 2851 hlen = LL_RESERVED_SPACE(dev);
2818 tlen = dev->needed_tailroom; 2852 tlen = dev->needed_tailroom;
2819 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, 2853 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2820 __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len), 2854 linear = max(linear, min_t(int, len, dev->hard_header_len));
2855 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2821 msg->msg_flags & MSG_DONTWAIT, &err); 2856 msg->msg_flags & MSG_DONTWAIT, &err);
2822 if (skb == NULL) 2857 if (skb == NULL)
2823 goto out_unlock; 2858 goto out_unlock;
@@ -2906,6 +2941,7 @@ static int packet_release(struct socket *sock)
2906{ 2941{
2907 struct sock *sk = sock->sk; 2942 struct sock *sk = sock->sk;
2908 struct packet_sock *po; 2943 struct packet_sock *po;
2944 struct packet_fanout *f;
2909 struct net *net; 2945 struct net *net;
2910 union tpacket_req_u req_u; 2946 union tpacket_req_u req_u;
2911 2947
@@ -2945,9 +2981,14 @@ static int packet_release(struct socket *sock)
2945 packet_set_ring(sk, &req_u, 1, 1); 2981 packet_set_ring(sk, &req_u, 1, 1);
2946 } 2982 }
2947 2983
2948 fanout_release(sk); 2984 f = fanout_release(sk);
2949 2985
2950 synchronize_net(); 2986 synchronize_net();
2987
2988 if (f) {
2989 fanout_release_data(f);
2990 kfree(f);
2991 }
2951 /* 2992 /*
2952 * Now the socket is dead. No more input will appear. 2993 * Now the socket is dead. No more input will appear.
2953 */ 2994 */
@@ -3062,7 +3103,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3062 int addr_len) 3103 int addr_len)
3063{ 3104{
3064 struct sock *sk = sock->sk; 3105 struct sock *sk = sock->sk;
3065 char name[15]; 3106 char name[sizeof(uaddr->sa_data) + 1];
3066 3107
3067 /* 3108 /*
3068 * Check legality 3109 * Check legality
@@ -3070,7 +3111,11 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3070 3111
3071 if (addr_len != sizeof(struct sockaddr)) 3112 if (addr_len != sizeof(struct sockaddr))
3072 return -EINVAL; 3113 return -EINVAL;
3073 strlcpy(name, uaddr->sa_data, sizeof(name)); 3114 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3115 * zero-terminated.
3116 */
3117 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3118 name[sizeof(uaddr->sa_data)] = 0;
3074 3119
3075 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); 3120 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3076} 3121}
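
The bind fix above copes with sockaddr.sa_data arriving from userspace without a guaranteed NUL terminator: the device name is copied by size and terminated explicitly instead of relying on strlcpy(). A standalone illustration of the same pattern (the helper is hypothetical; the destination buffer must hold sizeof(sa_data) + 1 bytes):

	#include <string.h>
	#include <sys/socket.h>

	/* Copy the fixed-size, possibly unterminated sa_data field and
	 * terminate it ourselves, mirroring packet_bind_spkt() above.
	 */
	static void copy_spkt_name(const struct sockaddr *uaddr, char *name)
	{
		memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
		name[sizeof(uaddr->sa_data)] = 0;
	}
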
@@ -3620,6 +3665,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
3620 return -EBUSY; 3665 return -EBUSY;
3621 if (copy_from_user(&val, optval, sizeof(val))) 3666 if (copy_from_user(&val, optval, sizeof(val)))
3622 return -EFAULT; 3667 return -EFAULT;
3668 if (val > INT_MAX)
3669 return -EINVAL;
3623 po->tp_reserve = val; 3670 po->tp_reserve = val;
3624 return 0; 3671 return 0;
3625 } 3672 }
@@ -3899,7 +3946,6 @@ static int packet_notifier(struct notifier_block *this,
3899 } 3946 }
3900 if (msg == NETDEV_UNREGISTER) { 3947 if (msg == NETDEV_UNREGISTER) {
3901 packet_cached_dev_reset(po); 3948 packet_cached_dev_reset(po);
3902 fanout_release(sk);
3903 po->ifindex = -1; 3949 po->ifindex = -1;
3904 if (po->prot_hook.dev) 3950 if (po->prot_hook.dev)
3905 dev_put(po->prot_hook.dev); 3951 dev_put(po->prot_hook.dev);
@@ -4113,11 +4159,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4113 struct tpacket_req *req = &req_u->req; 4159 struct tpacket_req *req = &req_u->req;
4114 4160
4115 lock_sock(sk); 4161 lock_sock(sk);
4116 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
4117 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
4118 net_warn_ratelimited("Tx-ring is not supported.\n");
4119 goto out;
4120 }
4121 4162
4122 rb = tx_ring ? &po->tx_ring : &po->rx_ring; 4163 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4123 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; 4164 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -4154,8 +4195,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4154 if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) 4195 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4155 goto out; 4196 goto out;
4156 if (po->tp_version >= TPACKET_V3 && 4197 if (po->tp_version >= TPACKET_V3 &&
4157 (int)(req->tp_block_size - 4198 req->tp_block_size <=
4158 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0) 4199 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
4159 goto out; 4200 goto out;
4160 if (unlikely(req->tp_frame_size < po->tp_hdrlen + 4201 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
4161 po->tp_reserve)) 4202 po->tp_reserve))
@@ -4166,6 +4207,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4166 rb->frames_per_block = req->tp_block_size / req->tp_frame_size; 4207 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4167 if (unlikely(rb->frames_per_block == 0)) 4208 if (unlikely(rb->frames_per_block == 0))
4168 goto out; 4209 goto out;
4210 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4211 goto out;
4169 if (unlikely((rb->frames_per_block * req->tp_block_nr) != 4212 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4170 req->tp_frame_nr)) 4213 req->tp_frame_nr))
4171 goto out; 4214 goto out;
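
The new UINT_MAX check keeps the frames_per_block * tp_block_nr product computed just below from wrapping: if tp_block_size * tp_block_nr cannot fit in an unsigned int, the ring request is rejected up front. A self-contained sketch of the same guard (names are illustrative; a zero divisor is handled explicitly here for the standalone case):

	#include <limits.h>
	#include <stdbool.h>

	/* True if block_size * block_nr would overflow an unsigned int. */
	static bool ring_size_overflows(unsigned int block_size,
					unsigned int block_nr)
	{
		return block_nr && block_size > UINT_MAX / block_nr;
	}
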
@@ -4177,11 +4220,19 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4177 goto out; 4220 goto out;
4178 switch (po->tp_version) { 4221 switch (po->tp_version) {
4179 case TPACKET_V3: 4222 case TPACKET_V3:
4180 /* Transmit path is not supported. We checked 4223 /* Block transmit is not supported yet */
4181 * it above but just being paranoid 4224 if (!tx_ring) {
4182 */
4183 if (!tx_ring)
4184 init_prb_bdqc(po, rb, pg_vec, req_u); 4225 init_prb_bdqc(po, rb, pg_vec, req_u);
4226 } else {
4227 struct tpacket_req3 *req3 = &req_u->req3;
4228
4229 if (req3->tp_retire_blk_tov ||
4230 req3->tp_sizeof_priv ||
4231 req3->tp_feature_req_word) {
4232 err = -EINVAL;
4233 goto out;
4234 }
4235 }
4185 break; 4236 break;
4186 default: 4237 default:
4187 break; 4238 break;
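
Taken together, the af_packet.c changes let a TPACKET_V3 socket open a transmit ring: the status, timestamp and header parsing paths gain V3 cases, the old "Tx-ring is not supported" bail-out is removed, and the V3-only block parameters must stay zero on the TX side. A hedged userspace sketch of requesting such a ring (socket creation, mmap() of the ring and the actual transmission are omitted; all sizes are illustrative):

	#include <string.h>
	#include <sys/socket.h>
	#include <linux/if_packet.h>

	static int setup_v3_tx_ring(int fd)
	{
		int ver = TPACKET_V3;
		struct tpacket_req3 req;

		if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)))
			return -1;

		memset(&req, 0, sizeof(req));	/* tp_retire_blk_tov etc. stay 0 */
		req.tp_block_size = 1 << 12;	/* assumes 4 KiB pages */
		req.tp_block_nr = 16;
		req.tp_frame_size = 1 << 11;
		req.tp_frame_nr = req.tp_block_nr *
				  (req.tp_block_size / req.tp_frame_size);

		return setsockopt(fd, SOL_PACKET, PACKET_TX_RING,
				  &req, sizeof(req));
	}
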
diff --git a/net/packet/diag.c b/net/packet/diag.c
index 0ed68f0238bf..7ef1c881ae74 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -73,8 +73,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
73{ 73{
74 struct packet_diag_ring pdr; 74 struct packet_diag_ring pdr;
75 75
76 if (!ring->pg_vec || ((ver > TPACKET_V2) && 76 if (!ring->pg_vec)
77 (nl_type == PACKET_DIAG_TX_RING)))
78 return 0; 77 return 0;
79 78
80 pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT; 79 pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 8bad5624a27a..e81537991ddf 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/sched/signal.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/socket.h> 28#include <linux/socket.h>
28#include <net/sock.h> 29#include <net/sock.h>
@@ -771,7 +772,8 @@ static void pep_sock_close(struct sock *sk, long timeout)
771 sock_put(sk); 772 sock_put(sk);
772} 773}
773 774
774static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) 775static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp,
776 bool kern)
775{ 777{
776 struct pep_sock *pn = pep_sk(sk), *newpn; 778 struct pep_sock *pn = pep_sk(sk), *newpn;
777 struct sock *newsk = NULL; 779 struct sock *newsk = NULL;
@@ -845,7 +847,8 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
845 } 847 }
846 848
847 /* Create a new to-be-accepted sock */ 849 /* Create a new to-be-accepted sock */
848 newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0); 850 newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot,
851 kern);
849 if (!newsk) { 852 if (!newsk) {
850 pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); 853 pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
851 err = -ENOBUFS; 854 err = -ENOBUFS;
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index ffd5f2297584..64634e3ec2fc 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -27,6 +27,8 @@
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/net.h> 28#include <linux/net.h>
29#include <linux/poll.h> 29#include <linux/poll.h>
30#include <linux/sched/signal.h>
31
30#include <net/sock.h> 32#include <net/sock.h>
31#include <net/tcp_states.h> 33#include <net/tcp_states.h>
32 34
@@ -303,7 +305,7 @@ out:
303} 305}
304 306
305static int pn_socket_accept(struct socket *sock, struct socket *newsock, 307static int pn_socket_accept(struct socket *sock, struct socket *newsock,
306 int flags) 308 int flags, bool kern)
307{ 309{
308 struct sock *sk = sock->sk; 310 struct sock *sk = sock->sk;
309 struct sock *newsk; 311 struct sock *newsk;
@@ -312,7 +314,7 @@ static int pn_socket_accept(struct socket *sock, struct socket *newsock,
312 if (unlikely(sk->sk_state != TCP_LISTEN)) 314 if (unlikely(sk->sk_state != TCP_LISTEN))
313 return -EINVAL; 315 return -EINVAL;
314 316
315 newsk = sk->sk_prot->accept(sk, flags, &err); 317 newsk = sk->sk_prot->accept(sk, flags, &err, kern);
316 if (!newsk) 318 if (!newsk)
317 return err; 319 return err;
318 320
diff --git a/net/psample/Kconfig b/net/psample/Kconfig
new file mode 100644
index 000000000000..d850246a6059
--- /dev/null
+++ b/net/psample/Kconfig
@@ -0,0 +1,15 @@
1#
2# psample packet sampling configuration
3#
4
5menuconfig PSAMPLE
6 depends on NET
7 tristate "Packet-sampling netlink channel"
8 default n
9 help
10	  Say Y here to add support for the packet-sampling netlink channel.
11 This netlink channel allows transferring packets alongside some
12 metadata to userspace.
13
14 To compile this support as a module, choose M here: the module will
15 be called psample.
diff --git a/net/psample/Makefile b/net/psample/Makefile
new file mode 100644
index 000000000000..609b0a79c9f3
--- /dev/null
+++ b/net/psample/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the psample netlink channel
3#
4
5obj-$(CONFIG_PSAMPLE) += psample.o
diff --git a/net/psample/psample.c b/net/psample/psample.c
new file mode 100644
index 000000000000..8aa58a918783
--- /dev/null
+++ b/net/psample/psample.c
@@ -0,0 +1,301 @@
1/*
2 * net/psample/psample.c - Netlink channel for packet sampling
3 * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/kernel.h>
12#include <linux/skbuff.h>
13#include <linux/module.h>
14#include <net/net_namespace.h>
15#include <net/sock.h>
16#include <net/netlink.h>
17#include <net/genetlink.h>
18#include <net/psample.h>
19#include <linux/spinlock.h>
20
21#define PSAMPLE_MAX_PACKET_SIZE 0xffff
22
23static LIST_HEAD(psample_groups_list);
24static DEFINE_SPINLOCK(psample_groups_lock);
25
26/* multicast groups */
27enum psample_nl_multicast_groups {
28 PSAMPLE_NL_MCGRP_CONFIG,
29 PSAMPLE_NL_MCGRP_SAMPLE,
30};
31
32static const struct genl_multicast_group psample_nl_mcgrps[] = {
33 [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME },
34 [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME },
35};
36
37static struct genl_family psample_nl_family __ro_after_init;
38
39static int psample_group_nl_fill(struct sk_buff *msg,
40 struct psample_group *group,
41 enum psample_command cmd, u32 portid, u32 seq,
42 int flags)
43{
44 void *hdr;
45 int ret;
46
47 hdr = genlmsg_put(msg, portid, seq, &psample_nl_family, flags, cmd);
48 if (!hdr)
49 return -EMSGSIZE;
50
51 ret = nla_put_u32(msg, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num);
52 if (ret < 0)
53 goto error;
54
55 ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_REFCOUNT, group->refcount);
56 if (ret < 0)
57 goto error;
58
59 ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_SEQ, group->seq);
60 if (ret < 0)
61 goto error;
62
63 genlmsg_end(msg, hdr);
64 return 0;
65
66error:
67 genlmsg_cancel(msg, hdr);
68 return -EMSGSIZE;
69}
70
71static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg,
72 struct netlink_callback *cb)
73{
74 struct psample_group *group;
75 int start = cb->args[0];
76 int idx = 0;
77 int err;
78
79 spin_lock(&psample_groups_lock);
80 list_for_each_entry(group, &psample_groups_list, list) {
81 if (!net_eq(group->net, sock_net(msg->sk)))
82 continue;
83 if (idx < start) {
84 idx++;
85 continue;
86 }
87 err = psample_group_nl_fill(msg, group, PSAMPLE_CMD_NEW_GROUP,
88 NETLINK_CB(cb->skb).portid,
89 cb->nlh->nlmsg_seq, NLM_F_MULTI);
90 if (err)
91 break;
92 idx++;
93 }
94
95 spin_unlock(&psample_groups_lock);
96 cb->args[0] = idx;
97 return msg->len;
98}
99
100static const struct genl_ops psample_nl_ops[] = {
101 {
102 .cmd = PSAMPLE_CMD_GET_GROUP,
103 .dumpit = psample_nl_cmd_get_group_dumpit,
104 /* can be retrieved by unprivileged users */
105 }
106};
107
108static struct genl_family psample_nl_family __ro_after_init = {
109 .name = PSAMPLE_GENL_NAME,
110 .version = PSAMPLE_GENL_VERSION,
111 .maxattr = PSAMPLE_ATTR_MAX,
112 .netnsok = true,
113 .module = THIS_MODULE,
114 .mcgrps = psample_nl_mcgrps,
115 .ops = psample_nl_ops,
116 .n_ops = ARRAY_SIZE(psample_nl_ops),
117 .n_mcgrps = ARRAY_SIZE(psample_nl_mcgrps),
118};
119
120static void psample_group_notify(struct psample_group *group,
121 enum psample_command cmd)
122{
123 struct sk_buff *msg;
124 int err;
125
126 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
127 if (!msg)
128 return;
129
130 err = psample_group_nl_fill(msg, group, cmd, 0, 0, NLM_F_MULTI);
131 if (!err)
132 genlmsg_multicast_netns(&psample_nl_family, group->net, msg, 0,
133 PSAMPLE_NL_MCGRP_CONFIG, GFP_ATOMIC);
134 else
135 nlmsg_free(msg);
136}
137
138static struct psample_group *psample_group_create(struct net *net,
139 u32 group_num)
140{
141 struct psample_group *group;
142
143 group = kzalloc(sizeof(*group), GFP_ATOMIC);
144 if (!group)
145 return NULL;
146
147 group->net = net;
148 group->group_num = group_num;
149 list_add_tail(&group->list, &psample_groups_list);
150
151 psample_group_notify(group, PSAMPLE_CMD_NEW_GROUP);
152 return group;
153}
154
155static void psample_group_destroy(struct psample_group *group)
156{
157 psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP);
158 list_del(&group->list);
159 kfree(group);
160}
161
162static struct psample_group *
163psample_group_lookup(struct net *net, u32 group_num)
164{
165 struct psample_group *group;
166
167 list_for_each_entry(group, &psample_groups_list, list)
168 if ((group->group_num == group_num) && (group->net == net))
169 return group;
170 return NULL;
171}
172
173struct psample_group *psample_group_get(struct net *net, u32 group_num)
174{
175 struct psample_group *group;
176
177 spin_lock(&psample_groups_lock);
178
179 group = psample_group_lookup(net, group_num);
180 if (!group) {
181 group = psample_group_create(net, group_num);
182 if (!group)
183 goto out;
184 }
185 group->refcount++;
186
187out:
188 spin_unlock(&psample_groups_lock);
189 return group;
190}
191EXPORT_SYMBOL_GPL(psample_group_get);
192
193void psample_group_put(struct psample_group *group)
194{
195 spin_lock(&psample_groups_lock);
196
197 if (--group->refcount == 0)
198 psample_group_destroy(group);
199
200 spin_unlock(&psample_groups_lock);
201}
202EXPORT_SYMBOL_GPL(psample_group_put);
203
204void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
205 u32 trunc_size, int in_ifindex, int out_ifindex,
206 u32 sample_rate)
207{
208 struct sk_buff *nl_skb;
209 int data_len;
210 int meta_len;
211 void *data;
212 int ret;
213
214 meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) +
215 (out_ifindex ? nla_total_size(sizeof(u16)) : 0) +
216 nla_total_size(sizeof(u32)) + /* sample_rate */
217 nla_total_size(sizeof(u32)) + /* orig_size */
218 nla_total_size(sizeof(u32)) + /* group_num */
219 nla_total_size(sizeof(u32)); /* seq */
220
221 data_len = min(skb->len, trunc_size);
222 if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE)
223 data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN
224 - NLA_ALIGNTO;
225
226 nl_skb = genlmsg_new(meta_len + data_len, GFP_ATOMIC);
227 if (unlikely(!nl_skb))
228 return;
229
230 data = genlmsg_put(nl_skb, 0, 0, &psample_nl_family, 0,
231 PSAMPLE_CMD_SAMPLE);
232 if (unlikely(!data))
233 goto error;
234
235 if (in_ifindex) {
236 ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_IIFINDEX, in_ifindex);
237 if (unlikely(ret < 0))
238 goto error;
239 }
240
241 if (out_ifindex) {
242 ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OIFINDEX, out_ifindex);
243 if (unlikely(ret < 0))
244 goto error;
245 }
246
247 ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_RATE, sample_rate);
248 if (unlikely(ret < 0))
249 goto error;
250
251 ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_ORIGSIZE, skb->len);
252 if (unlikely(ret < 0))
253 goto error;
254
255 ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num);
256 if (unlikely(ret < 0))
257 goto error;
258
259 ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_GROUP_SEQ, group->seq++);
260 if (unlikely(ret < 0))
261 goto error;
262
263 if (data_len) {
264 int nla_len = nla_total_size(data_len);
265 struct nlattr *nla;
266
267 nla = (struct nlattr *)skb_put(nl_skb, nla_len);
268 nla->nla_type = PSAMPLE_ATTR_DATA;
269 nla->nla_len = nla_attr_size(data_len);
270
271 if (skb_copy_bits(skb, 0, nla_data(nla), data_len))
272 goto error;
273 }
274
275 genlmsg_end(nl_skb, data);
276 genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0,
277 PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC);
278
279 return;
280error:
281 pr_err_ratelimited("Could not create psample log message\n");
282 nlmsg_free(nl_skb);
283}
284EXPORT_SYMBOL_GPL(psample_sample_packet);
285
286static int __init psample_module_init(void)
287{
288 return genl_register_family(&psample_nl_family);
289}
290
291static void __exit psample_module_exit(void)
292{
293 genl_unregister_family(&psample_nl_family);
294}
295
296module_init(psample_module_init);
297module_exit(psample_module_exit);
298
299MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
300MODULE_DESCRIPTION("netlink channel for packet sampling");
301MODULE_LICENSE("GPL v2");
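
The exported entry points in this new file form the whole consumer API: psample_group_get() looks up or creates a group and takes a reference, psample_sample_packet() builds and multicasts one PSAMPLE_CMD_SAMPLE message, and psample_group_put() drops the reference. A hedged sketch of how a sampling driver might use it (the calling context and the 128-byte truncation are assumptions; the three calls and their signatures come from this file). In practice a consumer would normally hold the group reference for the lifetime of its sampler rather than taking and dropping it per packet:

	#include <net/psample.h>

	static void sample_one_skb(struct net *net, struct sk_buff *skb,
				   int in_ifindex, u32 group_num, u32 rate)
	{
		struct psample_group *group;

		group = psample_group_get(net, group_num);
		if (!group)
			return;

		/* Report at most 128 bytes of payload; no egress ifindex. */
		psample_sample_packet(group, skb, 128, in_ifindex, 0, rate);
		psample_group_put(group);
	}
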
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index ae5ac175b2be..9da7368b0140 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -658,7 +658,9 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
658 } 658 }
659 659
660 if (plen != len) { 660 if (plen != len) {
661 skb_pad(skb, plen - len); 661 rc = skb_pad(skb, plen - len);
662 if (rc)
663 goto out_node;
662 skb_put(skb, plen - len); 664 skb_put(skb, plen - len);
663 } 665 }
664 666
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 2ac1e6194be3..b405f77d664c 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -298,6 +298,33 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
298 return 0; 298 return 0;
299} 299}
300 300
301static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
302 int optlen)
303{
304 struct rds_rx_trace_so trace;
305 int i;
306
307 if (optlen != sizeof(struct rds_rx_trace_so))
308 return -EFAULT;
309
310 if (copy_from_user(&trace, optval, sizeof(trace)))
311 return -EFAULT;
312
313 if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
314 return -EFAULT;
315
316 rs->rs_rx_traces = trace.rx_traces;
317 for (i = 0; i < rs->rs_rx_traces; i++) {
318 if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
319 rs->rs_rx_traces = 0;
320 return -EFAULT;
321 }
322 rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
323 }
324
325 return 0;
326}
327
301static int rds_setsockopt(struct socket *sock, int level, int optname, 328static int rds_setsockopt(struct socket *sock, int level, int optname,
302 char __user *optval, unsigned int optlen) 329 char __user *optval, unsigned int optlen)
303{ 330{
@@ -338,6 +365,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
338 ret = rds_enable_recvtstamp(sock->sk, optval, optlen); 365 ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
339 release_sock(sock->sk); 366 release_sock(sock->sk);
340 break; 367 break;
368 case SO_RDS_MSG_RXPATH_LATENCY:
369 ret = rds_recv_track_latency(rs, optval, optlen);
370 break;
341 default: 371 default:
342 ret = -ENOPROTOOPT; 372 ret = -ENOPROTOOPT;
343 } 373 }
@@ -484,6 +514,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
484 INIT_LIST_HEAD(&rs->rs_cong_list); 514 INIT_LIST_HEAD(&rs->rs_cong_list);
485 spin_lock_init(&rs->rs_rdma_lock); 515 spin_lock_init(&rs->rs_rdma_lock);
486 rs->rs_rdma_keys = RB_ROOT; 516 rs->rs_rdma_keys = RB_ROOT;
517 rs->rs_rx_traces = 0;
487 518
488 spin_lock_bh(&rds_sock_lock); 519 spin_lock_bh(&rds_sock_lock);
489 list_add_tail(&rs->rs_item, &rds_sock_list); 520 list_add_tail(&rs->rs_item, &rds_sock_list);
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 095f6ce583fe..3a915bedb76c 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -176,8 +176,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
176 if (!trans) { 176 if (!trans) {
177 ret = -EADDRNOTAVAIL; 177 ret = -EADDRNOTAVAIL;
178 rds_remove_bound(rs); 178 rds_remove_bound(rs);
179 printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, " 179 pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n",
180 "load rds_tcp or rds_rdma?\n"); 180 __func__, &sin->sin_addr.s_addr);
181 goto out; 181 goto out;
182 } 182 }
183 183
diff --git a/net/rds/connection.c b/net/rds/connection.c
index fe9d31c0b22d..1fa75ab7b733 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -429,6 +429,7 @@ void rds_conn_destroy(struct rds_connection *conn)
429 */ 429 */
430 rds_cong_remove_conn(conn); 430 rds_cong_remove_conn(conn);
431 431
432 put_net(conn->c_net);
432 kmem_cache_free(rds_conn_slab, conn); 433 kmem_cache_free(rds_conn_slab, conn);
433 434
434 spin_lock_irqsave(&rds_conn_lock, flags); 435 spin_lock_irqsave(&rds_conn_lock, flags);
@@ -545,11 +546,11 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
545} 546}
546EXPORT_SYMBOL_GPL(rds_for_each_conn_info); 547EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
547 548
548void rds_walk_conn_path_info(struct socket *sock, unsigned int len, 549static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
549 struct rds_info_iterator *iter, 550 struct rds_info_iterator *iter,
550 struct rds_info_lengths *lens, 551 struct rds_info_lengths *lens,
551 int (*visitor)(struct rds_conn_path *, void *), 552 int (*visitor)(struct rds_conn_path *, void *),
552 size_t item_len) 553 size_t item_len)
553{ 554{
554 u64 buffer[(item_len + 7) / 8]; 555 u64 buffer[(item_len + 7) / 8];
555 struct hlist_head *head; 556 struct hlist_head *head;
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 5680d90b0b77..7a64c8db81ab 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -45,8 +45,8 @@
45#include "ib.h" 45#include "ib.h"
46#include "ib_mr.h" 46#include "ib_mr.h"
47 47
48unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; 48static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
49unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; 49static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
50unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; 50unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
51 51
52module_param(rds_ib_mr_1m_pool_size, int, 0444); 52module_param(rds_ib_mr_1m_pool_size, int, 0444);
@@ -111,6 +111,8 @@ static void rds_ib_dev_free(struct work_struct *work)
111 kfree(i_ipaddr); 111 kfree(i_ipaddr);
112 } 112 }
113 113
114 kfree(rds_ibdev->vector_load);
115
114 kfree(rds_ibdev); 116 kfree(rds_ibdev);
115} 117}
116 118
@@ -159,6 +161,14 @@ static void rds_ib_add_one(struct ib_device *device)
159 rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; 161 rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
160 rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; 162 rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
161 163
164 rds_ibdev->vector_load = kzalloc(sizeof(int) * device->num_comp_vectors,
165 GFP_KERNEL);
166 if (!rds_ibdev->vector_load) {
167 pr_err("RDS/IB: %s failed to allocate vector memory\n",
168 __func__);
169 goto put_dev;
170 }
171
162 rds_ibdev->dev = device; 172 rds_ibdev->dev = device;
163 rds_ibdev->pd = ib_alloc_pd(device, 0); 173 rds_ibdev->pd = ib_alloc_pd(device, 0);
164 if (IS_ERR(rds_ibdev->pd)) { 174 if (IS_ERR(rds_ibdev->pd)) {
@@ -428,16 +438,12 @@ int rds_ib_init(void)
428 if (ret) 438 if (ret)
429 goto out_sysctl; 439 goto out_sysctl;
430 440
431 ret = rds_trans_register(&rds_ib_transport); 441 rds_trans_register(&rds_ib_transport);
432 if (ret)
433 goto out_recv;
434 442
435 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 443 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
436 444
437 goto out; 445 goto out;
438 446
439out_recv:
440 rds_ib_recv_exit();
441out_sysctl: 447out_sysctl:
442 rds_ib_sysctl_exit(); 448 rds_ib_sysctl_exit();
443out_ibreg: 449out_ibreg:
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 45ac8e8e58f4..ec550626e221 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -14,9 +14,10 @@
14 14
15#define RDS_IB_DEFAULT_RECV_WR 1024 15#define RDS_IB_DEFAULT_RECV_WR 1024
16#define RDS_IB_DEFAULT_SEND_WR 256 16#define RDS_IB_DEFAULT_SEND_WR 256
17#define RDS_IB_DEFAULT_FR_WR 512 17#define RDS_IB_DEFAULT_FR_WR 256
18#define RDS_IB_DEFAULT_FR_INV_WR 256
18 19
19#define RDS_IB_DEFAULT_RETRY_COUNT 2 20#define RDS_IB_DEFAULT_RETRY_COUNT 1
20 21
21#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ 22#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
22 23
@@ -125,6 +126,7 @@ struct rds_ib_connection {
125 126
126 /* To control the number of wrs from fastreg */ 127 /* To control the number of wrs from fastreg */
127 atomic_t i_fastreg_wrs; 128 atomic_t i_fastreg_wrs;
129 atomic_t i_fastunreg_wrs;
128 130
129 /* interrupt handling */ 131 /* interrupt handling */
130 struct tasklet_struct i_send_tasklet; 132 struct tasklet_struct i_send_tasklet;
@@ -134,7 +136,7 @@ struct rds_ib_connection {
134 struct rds_ib_work_ring i_send_ring; 136 struct rds_ib_work_ring i_send_ring;
135 struct rm_data_op *i_data_op; 137 struct rm_data_op *i_data_op;
136 struct rds_header *i_send_hdrs; 138 struct rds_header *i_send_hdrs;
137 u64 i_send_hdrs_dma; 139 dma_addr_t i_send_hdrs_dma;
138 struct rds_ib_send_work *i_sends; 140 struct rds_ib_send_work *i_sends;
139 atomic_t i_signaled_sends; 141 atomic_t i_signaled_sends;
140 142
@@ -144,11 +146,12 @@ struct rds_ib_connection {
144 struct rds_ib_incoming *i_ibinc; 146 struct rds_ib_incoming *i_ibinc;
145 u32 i_recv_data_rem; 147 u32 i_recv_data_rem;
146 struct rds_header *i_recv_hdrs; 148 struct rds_header *i_recv_hdrs;
147 u64 i_recv_hdrs_dma; 149 dma_addr_t i_recv_hdrs_dma;
148 struct rds_ib_recv_work *i_recvs; 150 struct rds_ib_recv_work *i_recvs;
149 u64 i_ack_recv; /* last ACK received */ 151 u64 i_ack_recv; /* last ACK received */
150 struct rds_ib_refill_cache i_cache_incs; 152 struct rds_ib_refill_cache i_cache_incs;
151 struct rds_ib_refill_cache i_cache_frags; 153 struct rds_ib_refill_cache i_cache_frags;
154 atomic_t i_cache_allocs;
152 155
153 /* sending acks */ 156 /* sending acks */
154 unsigned long i_ack_flags; 157 unsigned long i_ack_flags;
@@ -161,7 +164,7 @@ struct rds_ib_connection {
161 struct rds_header *i_ack; 164 struct rds_header *i_ack;
162 struct ib_send_wr i_ack_wr; 165 struct ib_send_wr i_ack_wr;
163 struct ib_sge i_ack_sge; 166 struct ib_sge i_ack_sge;
164 u64 i_ack_dma; 167 dma_addr_t i_ack_dma;
165 unsigned long i_ack_queued; 168 unsigned long i_ack_queued;
166 169
167 /* Flow control related information 170 /* Flow control related information
@@ -179,6 +182,14 @@ struct rds_ib_connection {
179 182
180 /* Batched completions */ 183 /* Batched completions */
181 unsigned int i_unsignaled_wrs; 184 unsigned int i_unsignaled_wrs;
185
186 /* Endpoint role in connection */
187 bool i_active_side;
188 atomic_t i_cq_quiesce;
189
190 /* Send/Recv vectors */
191 int i_scq_vector;
192 int i_rcq_vector;
182}; 193};
183 194
184/* This assumes that atomic_t is at least 32 bits */ 195/* This assumes that atomic_t is at least 32 bits */
@@ -221,9 +232,10 @@ struct rds_ib_device {
221 spinlock_t spinlock; /* protect the above */ 232 spinlock_t spinlock; /* protect the above */
222 atomic_t refcount; 233 atomic_t refcount;
223 struct work_struct free_work; 234 struct work_struct free_work;
235 int *vector_load;
224}; 236};
225 237
226#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device) 238#define ibdev_to_node(ibdev) dev_to_node((ibdev)->dev.parent)
227#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) 239#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
228 240
229/* bits for i_ack_flags */ 241/* bits for i_ack_flags */
@@ -249,6 +261,8 @@ struct rds_ib_statistics {
249 uint64_t s_ib_rx_refill_from_cq; 261 uint64_t s_ib_rx_refill_from_cq;
250 uint64_t s_ib_rx_refill_from_thread; 262 uint64_t s_ib_rx_refill_from_thread;
251 uint64_t s_ib_rx_alloc_limit; 263 uint64_t s_ib_rx_alloc_limit;
264 uint64_t s_ib_rx_total_frags;
265 uint64_t s_ib_rx_total_incs;
252 uint64_t s_ib_rx_credit_updates; 266 uint64_t s_ib_rx_credit_updates;
253 uint64_t s_ib_ack_sent; 267 uint64_t s_ib_ack_sent;
254 uint64_t s_ib_ack_send_failure; 268 uint64_t s_ib_ack_send_failure;
@@ -271,6 +285,8 @@ struct rds_ib_statistics {
271 uint64_t s_ib_rdma_mr_1m_reused; 285 uint64_t s_ib_rdma_mr_1m_reused;
272 uint64_t s_ib_atomic_cswp; 286 uint64_t s_ib_atomic_cswp;
273 uint64_t s_ib_atomic_fadd; 287 uint64_t s_ib_atomic_fadd;
288 uint64_t s_ib_recv_added_to_cache;
289 uint64_t s_ib_recv_removed_from_cache;
274}; 290};
275 291
276extern struct workqueue_struct *rds_ib_wq; 292extern struct workqueue_struct *rds_ib_wq;
@@ -401,6 +417,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
401/* ib_stats.c */ 417/* ib_stats.c */
402DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); 418DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
403#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) 419#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
420#define rds_ib_stats_add(member, count) \
421 rds_stats_add_which(rds_ib_stats, member, count)
404unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, 422unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
405 unsigned int avail); 423 unsigned int avail);
406 424
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 5b2ab95afa07..1c38d2c7caa8 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -113,24 +113,26 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
113 } 113 }
114 114
115 if (conn->c_version < RDS_PROTOCOL(3, 1)) { 115 if (conn->c_version < RDS_PROTOCOL(3, 1)) {
116 printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," 116 pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n",
117 " no longer supported\n", 117 &conn->c_laddr, &conn->c_faddr,
118 &conn->c_faddr, 118 RDS_PROTOCOL_MAJOR(conn->c_version),
119 RDS_PROTOCOL_MAJOR(conn->c_version), 119 RDS_PROTOCOL_MINOR(conn->c_version));
120 RDS_PROTOCOL_MINOR(conn->c_version));
121 rds_conn_destroy(conn); 120 rds_conn_destroy(conn);
122 return; 121 return;
123 } else { 122 } else {
124 printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", 123 pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
125 &conn->c_faddr, 124 ic->i_active_side ? "Active" : "Passive",
126 RDS_PROTOCOL_MAJOR(conn->c_version), 125 &conn->c_laddr, &conn->c_faddr,
127 RDS_PROTOCOL_MINOR(conn->c_version), 126 RDS_PROTOCOL_MAJOR(conn->c_version),
128 ic->i_flowctl ? ", flow control" : ""); 127 RDS_PROTOCOL_MINOR(conn->c_version),
128 ic->i_flowctl ? ", flow control" : "");
129 } 129 }
130 130
131 /* 131 atomic_set(&ic->i_cq_quiesce, 0);
132 * Init rings and fill recv. this needs to wait until protocol negotiation 132
133 * is complete, since ring layout is different from 3.0 to 3.1. 133 /* Init rings and fill recv. this needs to wait until protocol
134 * negotiation is complete, since ring layout is different
135 * from 3.1 to 4.1.
134 */ 136 */
135 rds_ib_send_init_ring(ic); 137 rds_ib_send_init_ring(ic);
136 rds_ib_recv_init_ring(ic); 138 rds_ib_recv_init_ring(ic);
@@ -267,6 +269,10 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
267 269
268 rds_ib_stats_inc(s_ib_tasklet_call); 270 rds_ib_stats_inc(s_ib_tasklet_call);
269 271
272 /* if cq has been already reaped, ignore incoming cq event */
273 if (atomic_read(&ic->i_cq_quiesce))
274 return;
275
270 poll_scq(ic, ic->i_send_cq, ic->i_send_wc); 276 poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
271 ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); 277 ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
272 poll_scq(ic, ic->i_send_cq, ic->i_send_wc); 278 poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
@@ -308,6 +314,10 @@ static void rds_ib_tasklet_fn_recv(unsigned long data)
308 314
309 rds_ib_stats_inc(s_ib_tasklet_call); 315 rds_ib_stats_inc(s_ib_tasklet_call);
310 316
317 /* if cq has been already reaped, ignore incoming cq event */
318 if (atomic_read(&ic->i_cq_quiesce))
319 return;
320
311 memset(&state, 0, sizeof(state)); 321 memset(&state, 0, sizeof(state));
312 poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); 322 poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
313 ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); 323 ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
@@ -358,6 +368,28 @@ static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
358 tasklet_schedule(&ic->i_send_tasklet); 368 tasklet_schedule(&ic->i_send_tasklet);
359} 369}
360 370
371static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
372{
373 int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1];
374 int index = rds_ibdev->dev->num_comp_vectors - 1;
375 int i;
376
377 for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) {
378 if (rds_ibdev->vector_load[i] < min) {
379 index = i;
380 min = rds_ibdev->vector_load[i];
381 }
382 }
383
384 rds_ibdev->vector_load[index]++;
385 return index;
386}
387
388static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
389{
390 rds_ibdev->vector_load[index]--;
391}
392
361/* 393/*
362 * This needs to be very careful to not leave IS_ERR pointers around for 394 * This needs to be very careful to not leave IS_ERR pointers around for
363 * cleanup to trip over. 395 * cleanup to trip over.
@@ -383,7 +415,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
383 * completion queue and send queue. This extra space is used for FRMR 415 * completion queue and send queue. This extra space is used for FRMR
384 * registration and invalidation work requests 416 * registration and invalidation work requests
385 */ 417 */
-	fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+	fr_queue_space = rds_ibdev->use_fastreg ?
+			 (RDS_IB_DEFAULT_FR_WR + 1) +
+			 (RDS_IB_DEFAULT_FR_INV_WR + 1)
+			 : 0;
387 422
388 /* add the conn now so that connection establishment has the dev */ 423 /* add the conn now so that connection establishment has the dev */
389 rds_ib_add_conn(rds_ibdev, conn); 424 rds_ib_add_conn(rds_ibdev, conn);
@@ -396,39 +431,44 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
396 /* Protection domain and memory range */ 431 /* Protection domain and memory range */
397 ic->i_pd = rds_ibdev->pd; 432 ic->i_pd = rds_ibdev->pd;
398 433
+	ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
 	cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
-
+	cq_attr.comp_vector = ic->i_scq_vector;
401 ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, 437 ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
402 rds_ib_cq_event_handler, conn, 438 rds_ib_cq_event_handler, conn,
403 &cq_attr); 439 &cq_attr);
404 if (IS_ERR(ic->i_send_cq)) { 440 if (IS_ERR(ic->i_send_cq)) {
405 ret = PTR_ERR(ic->i_send_cq); 441 ret = PTR_ERR(ic->i_send_cq);
406 ic->i_send_cq = NULL; 442 ic->i_send_cq = NULL;
443 ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
407 rdsdebug("ib_create_cq send failed: %d\n", ret); 444 rdsdebug("ib_create_cq send failed: %d\n", ret);
408 goto out; 445 goto rds_ibdev_out;
409 } 446 }
410 447
+	ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
 	cq_attr.cqe = ic->i_recv_ring.w_nr;
+	cq_attr.comp_vector = ic->i_rcq_vector;
412 ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv, 451 ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
413 rds_ib_cq_event_handler, conn, 452 rds_ib_cq_event_handler, conn,
414 &cq_attr); 453 &cq_attr);
415 if (IS_ERR(ic->i_recv_cq)) { 454 if (IS_ERR(ic->i_recv_cq)) {
416 ret = PTR_ERR(ic->i_recv_cq); 455 ret = PTR_ERR(ic->i_recv_cq);
417 ic->i_recv_cq = NULL; 456 ic->i_recv_cq = NULL;
457 ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
418 rdsdebug("ib_create_cq recv failed: %d\n", ret); 458 rdsdebug("ib_create_cq recv failed: %d\n", ret);
419 goto out; 459 goto send_cq_out;
420 } 460 }
421 461
422 ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); 462 ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
423 if (ret) { 463 if (ret) {
424 rdsdebug("ib_req_notify_cq send failed: %d\n", ret); 464 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
425 goto out; 465 goto recv_cq_out;
426 } 466 }
427 467
428 ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); 468 ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
429 if (ret) { 469 if (ret) {
430 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); 470 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
431 goto out; 471 goto recv_cq_out;
432 } 472 }
433 473
434 /* XXX negotiate max send/recv with remote? */ 474 /* XXX negotiate max send/recv with remote? */
@@ -445,6 +485,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
445 attr.send_cq = ic->i_send_cq; 485 attr.send_cq = ic->i_send_cq;
446 attr.recv_cq = ic->i_recv_cq; 486 attr.recv_cq = ic->i_recv_cq;
447 atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); 487 atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
488 atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR);
448 489
449 /* 490 /*
450 * XXX this can fail if max_*_wr is too large? Are we supposed 491 * XXX this can fail if max_*_wr is too large? Are we supposed
@@ -453,7 +494,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
453 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); 494 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
454 if (ret) { 495 if (ret) {
455 rdsdebug("rdma_create_qp failed: %d\n", ret); 496 rdsdebug("rdma_create_qp failed: %d\n", ret);
456 goto out; 497 goto recv_cq_out;
457 } 498 }
458 499
459 ic->i_send_hdrs = ib_dma_alloc_coherent(dev, 500 ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
@@ -463,7 +504,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
463 if (!ic->i_send_hdrs) { 504 if (!ic->i_send_hdrs) {
464 ret = -ENOMEM; 505 ret = -ENOMEM;
465 rdsdebug("ib_dma_alloc_coherent send failed\n"); 506 rdsdebug("ib_dma_alloc_coherent send failed\n");
466 goto out; 507 goto qp_out;
467 } 508 }
468 509
469 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, 510 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
@@ -473,7 +514,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
473 if (!ic->i_recv_hdrs) { 514 if (!ic->i_recv_hdrs) {
474 ret = -ENOMEM; 515 ret = -ENOMEM;
475 rdsdebug("ib_dma_alloc_coherent recv failed\n"); 516 rdsdebug("ib_dma_alloc_coherent recv failed\n");
476 goto out; 517 goto send_hdrs_dma_out;
477 } 518 }
478 519
479 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), 520 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
@@ -481,7 +522,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
481 if (!ic->i_ack) { 522 if (!ic->i_ack) {
482 ret = -ENOMEM; 523 ret = -ENOMEM;
483 rdsdebug("ib_dma_alloc_coherent ack failed\n"); 524 rdsdebug("ib_dma_alloc_coherent ack failed\n");
484 goto out; 525 goto recv_hdrs_dma_out;
485 } 526 }
486 527
487 ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), 528 ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
@@ -489,7 +530,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
489 if (!ic->i_sends) { 530 if (!ic->i_sends) {
490 ret = -ENOMEM; 531 ret = -ENOMEM;
491 rdsdebug("send allocation failed\n"); 532 rdsdebug("send allocation failed\n");
492 goto out; 533 goto ack_dma_out;
493 } 534 }
494 535
495 ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), 536 ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
@@ -497,7 +538,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
497 if (!ic->i_recvs) { 538 if (!ic->i_recvs) {
498 ret = -ENOMEM; 539 ret = -ENOMEM;
499 rdsdebug("recv allocation failed\n"); 540 rdsdebug("recv allocation failed\n");
500 goto out; 541 goto sends_out;
501 } 542 }
502 543
503 rds_ib_recv_init_ack(ic); 544 rds_ib_recv_init_ack(ic);
@@ -505,8 +546,33 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
505 rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd, 546 rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
506 ic->i_send_cq, ic->i_recv_cq); 547 ic->i_send_cq, ic->i_recv_cq);
507 548
-out:
+	return ret;
+
551sends_out:
552 vfree(ic->i_sends);
553ack_dma_out:
554 ib_dma_free_coherent(dev, sizeof(struct rds_header),
555 ic->i_ack, ic->i_ack_dma);
556recv_hdrs_dma_out:
557 ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr *
558 sizeof(struct rds_header),
559 ic->i_recv_hdrs, ic->i_recv_hdrs_dma);
560send_hdrs_dma_out:
561 ib_dma_free_coherent(dev, ic->i_send_ring.w_nr *
562 sizeof(struct rds_header),
563 ic->i_send_hdrs, ic->i_send_hdrs_dma);
564qp_out:
565 rdma_destroy_qp(ic->i_cm_id);
566recv_cq_out:
567 if (!ib_destroy_cq(ic->i_recv_cq))
568 ic->i_recv_cq = NULL;
569send_cq_out:
570 if (!ib_destroy_cq(ic->i_send_cq))
571 ic->i_send_cq = NULL;
572rds_ibdev_out:
573 rds_ib_remove_conn(rds_ibdev, conn);
509 rds_ib_dev_put(rds_ibdev); 574 rds_ib_dev_put(rds_ibdev);
575
510 return ret; 576 return ret;
511} 577}
512 578
@@ -682,6 +748,7 @@ out:
682 if (ic->i_cm_id == cm_id) 748 if (ic->i_cm_id == cm_id)
683 ret = 0; 749 ret = 0;
684 } 750 }
751 ic->i_active_side = true;
685 return ret; 752 return ret;
686} 753}
687 754
@@ -767,17 +834,27 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
767 wait_event(rds_ib_ring_empty_wait, 834 wait_event(rds_ib_ring_empty_wait,
768 rds_ib_ring_empty(&ic->i_recv_ring) && 835 rds_ib_ring_empty(&ic->i_recv_ring) &&
769 (atomic_read(&ic->i_signaled_sends) == 0) && 836 (atomic_read(&ic->i_signaled_sends) == 0) &&
-		   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
+		   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) &&
+		   (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR));
771 tasklet_kill(&ic->i_send_tasklet); 839 tasklet_kill(&ic->i_send_tasklet);
772 tasklet_kill(&ic->i_recv_tasklet); 840 tasklet_kill(&ic->i_recv_tasklet);
773 841
842 atomic_set(&ic->i_cq_quiesce, 1);
843
774 /* first destroy the ib state that generates callbacks */ 844 /* first destroy the ib state that generates callbacks */
775 if (ic->i_cm_id->qp) 845 if (ic->i_cm_id->qp)
776 rdma_destroy_qp(ic->i_cm_id); 846 rdma_destroy_qp(ic->i_cm_id);
777 if (ic->i_send_cq) 847 if (ic->i_send_cq) {
848 if (ic->rds_ibdev)
849 ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
778 ib_destroy_cq(ic->i_send_cq); 850 ib_destroy_cq(ic->i_send_cq);
779 if (ic->i_recv_cq) 851 }
852
853 if (ic->i_recv_cq) {
854 if (ic->rds_ibdev)
855 ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
780 ib_destroy_cq(ic->i_recv_cq); 856 ib_destroy_cq(ic->i_recv_cq);
857 }
781 858
782 /* then free the resources that ib callbacks use */ 859 /* then free the resources that ib callbacks use */
783 if (ic->i_send_hdrs) 860 if (ic->i_send_hdrs)
@@ -855,6 +932,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
855 ic->i_sends = NULL; 932 ic->i_sends = NULL;
856 vfree(ic->i_recvs); 933 vfree(ic->i_recvs);
857 ic->i_recvs = NULL; 934 ic->i_recvs = NULL;
935 ic->i_active_side = false;
858} 936}
859 937
860int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) 938int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
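The vector-balancing helpers added above simply pick the least-loaded completion vector and bump its counter; each connection takes one vector for its send CQ and one for its recv CQ and returns them on teardown, which is why the shutdown path now calls ibdev_put_vector() before ib_destroy_cq(). A minimal standalone C sketch of that selection (a plain array stands in for rds_ibdev->vector_load and NUM_COMP_VECTORS for dev->num_comp_vectors; illustrative only, not the kernel code):

	#include <stdio.h>

	#define NUM_COMP_VECTORS 4

	static int vector_load[NUM_COMP_VECTORS];

	/* pick the least-loaded completion vector, mirroring
	 * ibdev_get_unused_vector() in the hunk above */
	static int get_unused_vector(void)
	{
		int index = NUM_COMP_VECTORS - 1;
		int min = vector_load[index];
		int i;

		for (i = NUM_COMP_VECTORS - 1; i >= 0; i--) {
			if (vector_load[i] < min) {
				index = i;
				min = vector_load[i];
			}
		}
		vector_load[index]++;
		return index;
	}

	static void put_vector(int index)
	{
		vector_load[index]--;
	}

	int main(void)
	{
		int scq = get_unused_vector();	/* send CQ */
		int rcq = get_unused_vector();	/* recv CQ */

		printf("send cq vector %d, recv cq vector %d\n", scq, rcq);
		put_vector(rcq);
		put_vector(scq);
		return 0;
	}

The i_cq_quiesce flag added in the same file is the companion piece: shutdown sets it after tasklet_kill(), and the tasklets bail out early when it is set, so a late CQ event cannot touch rings that are being torn down.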
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index d921adc62765..48332a6ed738 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -104,14 +104,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
104 struct rds_ib_frmr *frmr = &ibmr->u.frmr; 104 struct rds_ib_frmr *frmr = &ibmr->u.frmr;
105 struct ib_send_wr *failed_wr; 105 struct ib_send_wr *failed_wr;
106 struct ib_reg_wr reg_wr; 106 struct ib_reg_wr reg_wr;
107 int ret; 107 int ret, off = 0;
108 108
109 while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { 109 while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
110 atomic_inc(&ibmr->ic->i_fastreg_wrs); 110 atomic_inc(&ibmr->ic->i_fastreg_wrs);
111 cpu_relax(); 111 cpu_relax();
112 } 112 }
113 113
-	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE);
+	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len,
+				&off, PAGE_SIZE);
115 if (unlikely(ret != ibmr->sg_len)) 116 if (unlikely(ret != ibmr->sg_len))
116 return ret < 0 ? ret : -EINVAL; 117 return ret < 0 ? ret : -EINVAL;
117 118
@@ -240,8 +241,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
240 if (frmr->fr_state != FRMR_IS_INUSE) 241 if (frmr->fr_state != FRMR_IS_INUSE)
241 goto out; 242 goto out;
242 243
243 while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { 244 while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) {
244 atomic_inc(&ibmr->ic->i_fastreg_wrs); 245 atomic_inc(&ibmr->ic->i_fastunreg_wrs);
245 cpu_relax(); 246 cpu_relax();
246 } 247 }
247 248
@@ -260,7 +261,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
260 if (unlikely(ret)) { 261 if (unlikely(ret)) {
261 frmr->fr_state = FRMR_IS_STALE; 262 frmr->fr_state = FRMR_IS_STALE;
262 frmr->fr_inv = false; 263 frmr->fr_inv = false;
263 atomic_inc(&ibmr->ic->i_fastreg_wrs); 264 atomic_inc(&ibmr->ic->i_fastunreg_wrs);
264 pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); 265 pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
265 goto out; 266 goto out;
266 } 267 }
@@ -288,9 +289,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
288 if (frmr->fr_inv) { 289 if (frmr->fr_inv) {
289 frmr->fr_state = FRMR_IS_FREE; 290 frmr->fr_state = FRMR_IS_FREE;
290 frmr->fr_inv = false; 291 frmr->fr_inv = false;
+		atomic_inc(&ic->i_fastreg_wrs);
+	} else {
+		atomic_inc(&ic->i_fastunreg_wrs);
 	}
-
-	atomic_inc(&ic->i_fastreg_wrs);
 }
295 297
296void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, 298void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
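The frmr changes split the single i_fastreg_wrs credit pool into separate pools for registration and invalidation work requests, but the acquire pattern stays the same: atomically take a credit and, if the pool was exhausted, give it back and retry. A small C11-atomics sketch of that pattern (FR_WR_CREDITS is a made-up stand-in for RDS_IB_DEFAULT_FR_WR, and the kernel's cpu_relax() is omitted; illustrative only):

	#include <stdatomic.h>
	#include <stdio.h>

	#define FR_WR_CREDITS 256

	static atomic_int fastreg_wrs = FR_WR_CREDITS;

	/* take one credit; back off and retry if the pool went non-positive */
	static void acquire_credit(atomic_int *credits)
	{
		while (atomic_fetch_sub(credits, 1) - 1 <= 0)
			atomic_fetch_add(credits, 1);
	}

	static void release_credit(atomic_int *credits)
	{
		atomic_fetch_add(credits, 1);
	}

	int main(void)
	{
		acquire_credit(&fastreg_wrs);	/* before posting a reg WR */
		/* ... the completion handler would release it ... */
		release_credit(&fastreg_wrs);
		printf("credits: %d\n", atomic_load(&fastreg_wrs));
		return 0;
	}

With the split, rds_ib_mr_cqe_handler() has to return the credit to whichever pool the completed work request came from, which is what the else branch added above does.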
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 1c754f4acbe5..5d6e98a79a5e 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -45,7 +45,6 @@
45 45
46struct rds_ib_fmr { 46struct rds_ib_fmr {
47 struct ib_fmr *fmr; 47 struct ib_fmr *fmr;
48 u64 *dma;
49}; 48};
50 49
51enum rds_ib_fr_state { 50enum rds_ib_fr_state {
@@ -108,8 +107,6 @@ struct rds_ib_mr_pool {
108}; 107};
109 108
110extern struct workqueue_struct *rds_ib_mr_wq; 109extern struct workqueue_struct *rds_ib_mr_wq;
111extern unsigned int rds_ib_mr_1m_pool_size;
112extern unsigned int rds_ib_mr_8k_pool_size;
113extern bool prefer_frmr; 110extern bool prefer_frmr;
114 111
115struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, 112struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 606a11f681d2..e10624aa6959 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -194,6 +194,8 @@ static void rds_ib_frag_free(struct rds_ib_connection *ic,
194 rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); 194 rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
195 195
196 rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); 196 rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
197 atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
198 rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
197} 199}
198 200
199/* Recycle inc after freeing attached frags */ 201/* Recycle inc after freeing attached frags */
@@ -261,6 +263,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
261 atomic_dec(&rds_ib_allocation); 263 atomic_dec(&rds_ib_allocation);
262 return NULL; 264 return NULL;
263 } 265 }
266 rds_ib_stats_inc(s_ib_rx_total_incs);
264 } 267 }
265 INIT_LIST_HEAD(&ibinc->ii_frags); 268 INIT_LIST_HEAD(&ibinc->ii_frags);
266 rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); 269 rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
@@ -278,6 +281,8 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
278 cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); 281 cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
279 if (cache_item) { 282 if (cache_item) {
280 frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); 283 frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
284 atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
285 rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
281 } else { 286 } else {
282 frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); 287 frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
283 if (!frag) 288 if (!frag)
@@ -290,6 +295,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
290 kmem_cache_free(rds_ib_frag_slab, frag); 295 kmem_cache_free(rds_ib_frag_slab, frag);
291 return NULL; 296 return NULL;
292 } 297 }
298 rds_ib_stats_inc(s_ib_rx_total_frags);
293 } 299 }
294 300
295 INIT_LIST_HEAD(&frag->f_item); 301 INIT_LIST_HEAD(&frag->f_item);
@@ -905,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
905 ic->i_ibinc = ibinc; 911 ic->i_ibinc = ibinc;
906 912
907 hdr = &ibinc->ii_inc.i_hdr; 913 hdr = &ibinc->ii_inc.i_hdr;
914 ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
915 local_clock();
908 memcpy(hdr, ihdr, sizeof(*hdr)); 916 memcpy(hdr, ihdr, sizeof(*hdr));
909 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); 917 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
918 ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
919 local_clock();
910 920
911 rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, 921 rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
912 ic->i_recv_data_rem, hdr->h_flags); 922 ic->i_recv_data_rem, hdr->h_flags);
@@ -980,8 +990,8 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
980 } else { 990 } else {
981 /* We expect errors as the qp is drained during shutdown */ 991 /* We expect errors as the qp is drained during shutdown */
982 if (rds_conn_up(conn) || rds_conn_connecting(conn)) 992 if (rds_conn_up(conn) || rds_conn_connecting(conn))
983 rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", 993 rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
984 &conn->c_faddr, 994 &conn->c_laddr, &conn->c_faddr,
985 wc->status, 995 wc->status,
986 ib_wc_status_msg(wc->status)); 996 ib_wc_status_msg(wc->status));
987 } 997 }
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 84d90c97332f..6ab39dbcca01 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -69,16 +69,6 @@ static void rds_ib_send_complete(struct rds_message *rm,
69 complete(rm, notify_status); 69 complete(rm, notify_status);
70} 70}
71 71
72static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
73 struct rm_data_op *op,
74 int wc_status)
75{
76 if (op->op_nents)
77 ib_dma_unmap_sg(ic->i_cm_id->device,
78 op->op_sg, op->op_nents,
79 DMA_TO_DEVICE);
80}
81
82static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, 72static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
83 struct rm_rdma_op *op, 73 struct rm_rdma_op *op,
84 int wc_status) 74 int wc_status)
@@ -139,6 +129,21 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
139 rds_ib_stats_inc(s_ib_atomic_fadd); 129 rds_ib_stats_inc(s_ib_atomic_fadd);
140} 130}
141 131
132static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
133 struct rm_data_op *op,
134 int wc_status)
135{
136 struct rds_message *rm = container_of(op, struct rds_message, data);
137
138 if (op->op_nents)
139 ib_dma_unmap_sg(ic->i_cm_id->device,
140 op->op_sg, op->op_nents,
141 DMA_TO_DEVICE);
142
143 if (rm->rdma.op_active && rm->data.op_notify)
144 rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status);
145}
146
142/* 147/*
143 * Unmap the resources associated with a struct send_work. 148 * Unmap the resources associated with a struct send_work.
144 * 149 *
@@ -300,8 +305,8 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
300 305
301 /* We expect errors as the qp is drained during shutdown */ 306 /* We expect errors as the qp is drained during shutdown */
302 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { 307 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
303 rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", 308 rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
304 &conn->c_faddr, wc->status, 309 &conn->c_laddr, &conn->c_faddr, wc->status,
305 ib_wc_status_msg(wc->status)); 310 ib_wc_status_msg(wc->status));
306 } 311 }
307} 312}
@@ -765,7 +770,6 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
765 770
766 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); 771 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
767 if (work_alloc != 1) { 772 if (work_alloc != 1) {
768 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
769 rds_ib_stats_inc(s_ib_tx_ring_full); 773 rds_ib_stats_inc(s_ib_tx_ring_full);
770 ret = -ENOMEM; 774 ret = -ENOMEM;
771 goto out; 775 goto out;
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 7e78dca1f252..9252ad126335 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -55,6 +55,8 @@ static const char *const rds_ib_stat_names[] = {
55 "ib_rx_refill_from_cq", 55 "ib_rx_refill_from_cq",
56 "ib_rx_refill_from_thread", 56 "ib_rx_refill_from_thread",
57 "ib_rx_alloc_limit", 57 "ib_rx_alloc_limit",
58 "ib_rx_total_frags",
59 "ib_rx_total_incs",
58 "ib_rx_credit_updates", 60 "ib_rx_credit_updates",
59 "ib_ack_sent", 61 "ib_ack_sent",
60 "ib_ack_send_failure", 62 "ib_ack_send_failure",
diff --git a/net/rds/page.c b/net/rds/page.c
index e2b5a5832d3d..7cc57e098ddb 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -45,35 +45,6 @@ struct rds_page_remainder {
45static 45static
46DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); 46DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
47 47
48/*
49 * returns 0 on success or -errno on failure.
50 *
51 * We don't have to worry about flush_dcache_page() as this only works
52 * with private pages. If, say, we were to do directed receive to pinned
53 * user pages we'd have to worry more about cache coherence. (Though
54 * the flush_dcache_page() in get_user_pages() would probably be enough).
55 */
56int rds_page_copy_user(struct page *page, unsigned long offset,
57 void __user *ptr, unsigned long bytes,
58 int to_user)
59{
60 unsigned long ret;
61 void *addr;
62
63 addr = kmap(page);
64 if (to_user) {
65 rds_stats_add(s_copy_to_user, bytes);
66 ret = copy_to_user(ptr, addr + offset, bytes);
67 } else {
68 rds_stats_add(s_copy_from_user, bytes);
69 ret = copy_from_user(addr + offset, ptr, bytes);
70 }
71 kunmap(page);
72
73 return ret ? -EFAULT : 0;
74}
75EXPORT_SYMBOL_GPL(rds_page_copy_user);
76
77/** 48/**
78 * rds_page_remainder_alloc - build up regions of a message. 49 * rds_page_remainder_alloc - build up regions of a message.
79 * 50 *
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index ea961144084f..f06fac4886b0 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -40,7 +40,6 @@
40/* 40/*
41 * XXX 41 * XXX
42 * - build with sparse 42 * - build with sparse
43 * - should we limit the size of a mr region? let transport return failure?
44 * - should we detect duplicate keys on a socket? hmm. 43 * - should we detect duplicate keys on a socket? hmm.
45 * - an rdma is an mlock, apply rlimit? 44 * - an rdma is an mlock, apply rlimit?
46 */ 45 */
@@ -200,6 +199,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
200 goto out; 199 goto out;
201 } 200 }
202 201
202 /* Restrict the size of mr irrespective of underlying transport
203 * To account for unaligned mr regions, subtract one from nr_pages
204 */
205 if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) {
206 ret = -EMSGSIZE;
207 goto out;
208 }
209
203 rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", 210 rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
204 args->vec.addr, args->vec.bytes, nr_pages); 211 args->vec.addr, args->vec.bytes, nr_pages);
205 212
@@ -415,7 +422,8 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
415 spin_lock_irqsave(&rs->rs_rdma_lock, flags); 422 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
416 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); 423 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
417 if (!mr) { 424 if (!mr) {
-		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+		pr_debug("rds: trying to unuse MR with unknown r_key %u!\n",
+			 r_key);
419 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); 427 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
420 return; 428 return;
421 } 429 }
@@ -626,6 +634,16 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
626 } 634 }
627 op->op_notifier->n_user_token = args->user_token; 635 op->op_notifier->n_user_token = args->user_token;
628 op->op_notifier->n_status = RDS_RDMA_SUCCESS; 636 op->op_notifier->n_status = RDS_RDMA_SUCCESS;
637
638 /* Enable rmda notification on data operation for composite
639 * rds messages and make sure notification is enabled only
640 * for the data operation which follows it so that application
641 * gets notified only after full message gets delivered.
642 */
643 if (rm->data.op_sg) {
644 rm->rdma.op_notify = 0;
645 rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
646 }
629 } 647 }
630 648
631 /* The cookie contains the R_Key of the remote memory region, and 649 /* The cookie contains the R_Key of the remote memory region, and
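The new check in __rds_rdma_map() bounds an MR by RDS_MAX_MSG_SIZE (defined later in rds.h as 1 MiB) rather than by anything transport-specific. With 4 KiB pages that works out to a cap of 257 pages, the extra page covering a region that does not start on a page boundary. A tiny sketch of the arithmetic (assumes PAGE_SHIFT 12; illustrative only):

	#include <stdio.h>

	#define PAGE_SHIFT		12
	#define RDS_MAX_MSG_SIZE	(1u << 20)

	int main(void)
	{
		unsigned int max_pages = RDS_MAX_MSG_SIZE >> PAGE_SHIFT;	/* 256 */

		/* the check rejects (nr_pages - 1) > max_pages, so an
		 * unaligned 1 MiB region spanning 257 pages still passes */
		printf("largest accepted nr_pages: %u\n", max_pages + 1);
		return 0;
	}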
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index d5f311767157..fc59821f0a27 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -206,18 +206,13 @@ static int rds_rdma_init(void)
206{ 206{
207 int ret; 207 int ret;
208 208
-	ret = rds_rdma_listen_init();
+	ret = rds_ib_init();
 	if (ret)
 		goto out;
 
-	ret = rds_ib_init();
+	ret = rds_rdma_listen_init();
 	if (ret)
-		goto err_ib_init;
-
-	goto out;
-
-err_ib_init:
-	rds_rdma_listen_stop();
+		rds_ib_exit();
 out:
 	return ret;
 }
diff --git a/net/rds/rds.h b/net/rds/rds.h
index ebbf909b87ec..82d38ccf5e8b 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,6 +50,9 @@ void rdsdebug(char *fmt, ...)
50#define RDS_FRAG_SHIFT 12 50#define RDS_FRAG_SHIFT 12
51#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) 51#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
52 52
53/* Used to limit both RDMA and non-RDMA RDS message to 1MB */
54#define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20))
55
53#define RDS_CONG_MAP_BYTES (65536 / 8) 56#define RDS_CONG_MAP_BYTES (65536 / 8)
54#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) 57#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
55#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) 58#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
@@ -144,7 +147,7 @@ struct rds_connection {
144 147
145 /* Protocol version */ 148 /* Protocol version */
146 unsigned int c_version; 149 unsigned int c_version;
147 possible_net_t c_net; 150 struct net *c_net;
148 151
149 struct list_head c_map_item; 152 struct list_head c_map_item;
150 unsigned long c_map_queued; 153 unsigned long c_map_queued;
@@ -159,13 +162,13 @@ struct rds_connection {
159static inline 162static inline
160struct net *rds_conn_net(struct rds_connection *conn) 163struct net *rds_conn_net(struct rds_connection *conn)
161{ 164{
162 return read_pnet(&conn->c_net); 165 return conn->c_net;
163} 166}
164 167
165static inline 168static inline
166void rds_conn_net_set(struct rds_connection *conn, struct net *net) 169void rds_conn_net_set(struct rds_connection *conn, struct net *net)
167{ 170{
168 write_pnet(&conn->c_net, net); 171 conn->c_net = get_net(net);
169} 172}
170 173
171#define RDS_FLAG_CONG_BITMAP 0x01 174#define RDS_FLAG_CONG_BITMAP 0x01
@@ -250,6 +253,11 @@ struct rds_ext_header_rdma_dest {
250#define RDS_EXTHDR_GEN_NUM 6 253#define RDS_EXTHDR_GEN_NUM 6
251 254
252#define __RDS_EXTHDR_MAX 16 /* for now */ 255#define __RDS_EXTHDR_MAX 16 /* for now */
256#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
257#define RDS_MSG_RX_HDR 0
258#define RDS_MSG_RX_START 1
259#define RDS_MSG_RX_END 2
260#define RDS_MSG_RX_CMSG 3
253 261
254struct rds_incoming { 262struct rds_incoming {
255 atomic_t i_refcount; 263 atomic_t i_refcount;
@@ -262,6 +270,7 @@ struct rds_incoming {
262 270
263 rds_rdma_cookie_t i_rdma_cookie; 271 rds_rdma_cookie_t i_rdma_cookie;
264 struct timeval i_rx_tstamp; 272 struct timeval i_rx_tstamp;
273 u64 i_rx_lat_trace[RDS_RX_MAX_TRACES];
265}; 274};
266 275
267struct rds_mr { 276struct rds_mr {
@@ -419,6 +428,7 @@ struct rds_message {
419 } rdma; 428 } rdma;
420 struct rm_data_op { 429 struct rm_data_op {
421 unsigned int op_active:1; 430 unsigned int op_active:1;
431 unsigned int op_notify:1;
422 unsigned int op_nents; 432 unsigned int op_nents;
423 unsigned int op_count; 433 unsigned int op_count;
424 unsigned int op_dmasg; 434 unsigned int op_dmasg;
@@ -571,6 +581,10 @@ struct rds_sock {
571 unsigned char rs_recverr, 581 unsigned char rs_recverr,
572 rs_cong_monitor; 582 rs_cong_monitor;
573 u32 rs_hash_initval; 583 u32 rs_hash_initval;
584
585 /* Socket receive path trace points*/
586 u8 rs_rx_traces;
587 u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
574}; 588};
575 589
576static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) 590static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
@@ -630,6 +644,9 @@ struct rds_statistics {
630 uint64_t s_cong_update_received; 644 uint64_t s_cong_update_received;
631 uint64_t s_cong_send_error; 645 uint64_t s_cong_send_error;
632 uint64_t s_cong_send_blocked; 646 uint64_t s_cong_send_blocked;
647 uint64_t s_recv_bytes_added_to_socket;
648 uint64_t s_recv_bytes_removed_from_socket;
649
633}; 650};
634 651
635/* af_rds.c */ 652/* af_rds.c */
@@ -781,13 +798,6 @@ static inline int rds_message_verify_checksum(const struct rds_header *hdr)
781/* page.c */ 798/* page.c */
782int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, 799int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
783 gfp_t gfp); 800 gfp_t gfp);
784int rds_page_copy_user(struct page *page, unsigned long offset,
785 void __user *ptr, unsigned long bytes,
786 int to_user);
787#define rds_page_copy_to_user(page, offset, ptr, bytes) \
788 rds_page_copy_user(page, offset, ptr, bytes, 1)
789#define rds_page_copy_from_user(page, offset, ptr, bytes) \
790 rds_page_copy_user(page, offset, ptr, bytes, 0)
791void rds_page_exit(void); 801void rds_page_exit(void);
792 802
793/* recv.c */ 803/* recv.c */
@@ -893,7 +903,7 @@ void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
893void rds_connect_complete(struct rds_connection *conn); 903void rds_connect_complete(struct rds_connection *conn);
894 904
895/* transport.c */ 905/* transport.c */
896int rds_trans_register(struct rds_transport *trans); 906void rds_trans_register(struct rds_transport *trans);
897void rds_trans_unregister(struct rds_transport *trans); 907void rds_trans_unregister(struct rds_transport *trans);
898struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); 908struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr);
899void rds_trans_put(struct rds_transport *trans); 909void rds_trans_put(struct rds_transport *trans);
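The RDS_MSG_RX_* indices added to rds.h name the points where the receive path now stamps local_clock() into i_rx_lat_trace[]: header seen, payload start, queued to the socket, and copied out via cmsg. The per-stage numbers later reported through RDS_CMSG_RXPATH_LATENCY are simply differences between consecutive stamps. A standalone sketch of that delta calculation (enum names simplified, timestamps are made-up nanosecond readings; illustrative only):

	#include <stdio.h>
	#include <stdint.h>

	enum { RX_HDR, RX_START, RX_END, RX_CMSG, RX_MAX_TRACES };

	int main(void)
	{
		uint64_t trace[RX_MAX_TRACES] = { 1000, 1400, 9000, 9300 };
		int i;

		/* stage i is the time spent between trace point i and i + 1,
		 * matching t.rx_trace[i] = i_rx_lat_trace[j + 1] - i_rx_lat_trace[j]
		 * in the recv.c hunk below */
		for (i = 0; i < RX_MAX_TRACES - 1; i++)
			printf("stage %d: %llu ns\n", i,
			       (unsigned long long)(trace[i + 1] - trace[i]));
		return 0;
	}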
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 9d0666e5fe35..8b7e7b7f2c2d 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -43,6 +43,8 @@
43void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 43void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
44 __be32 saddr) 44 __be32 saddr)
45{ 45{
46 int i;
47
46 atomic_set(&inc->i_refcount, 1); 48 atomic_set(&inc->i_refcount, 1);
47 INIT_LIST_HEAD(&inc->i_item); 49 INIT_LIST_HEAD(&inc->i_item);
48 inc->i_conn = conn; 50 inc->i_conn = conn;
@@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
50 inc->i_rdma_cookie = 0; 52 inc->i_rdma_cookie = 0;
51 inc->i_rx_tstamp.tv_sec = 0; 53 inc->i_rx_tstamp.tv_sec = 0;
52 inc->i_rx_tstamp.tv_usec = 0; 54 inc->i_rx_tstamp.tv_usec = 0;
55
56 for (i = 0; i < RDS_RX_MAX_TRACES; i++)
57 inc->i_rx_lat_trace[i] = 0;
53} 58}
54EXPORT_SYMBOL_GPL(rds_inc_init); 59EXPORT_SYMBOL_GPL(rds_inc_init);
55 60
@@ -94,6 +99,10 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
94 return; 99 return;
95 100
96 rs->rs_rcv_bytes += delta; 101 rs->rs_rcv_bytes += delta;
102 if (delta > 0)
103 rds_stats_add(s_recv_bytes_added_to_socket, delta);
104 else
105 rds_stats_add(s_recv_bytes_removed_from_socket, -delta);
97 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); 106 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
98 107
99 rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " 108 rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
@@ -369,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
369 if (sock_flag(sk, SOCK_RCVTSTAMP)) 378 if (sock_flag(sk, SOCK_RCVTSTAMP))
370 do_gettimeofday(&inc->i_rx_tstamp); 379 do_gettimeofday(&inc->i_rx_tstamp);
371 rds_inc_addref(inc); 380 rds_inc_addref(inc);
381 inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
372 list_add_tail(&inc->i_item, &rs->rs_recv_queue); 382 list_add_tail(&inc->i_item, &rs->rs_recv_queue);
373 __rds_wake_sk_sleep(sk); 383 __rds_wake_sk_sleep(sk);
374 } else { 384 } else {
@@ -530,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
530 ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, 540 ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
531 sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); 541 sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
532 if (ret) 542 if (ret)
533 return ret; 543 goto out;
534 } 544 }
535 545
536 if ((inc->i_rx_tstamp.tv_sec != 0) && 546 if ((inc->i_rx_tstamp.tv_sec != 0) &&
@@ -539,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
539 sizeof(struct timeval), 549 sizeof(struct timeval),
540 &inc->i_rx_tstamp); 550 &inc->i_rx_tstamp);
541 if (ret) 551 if (ret)
-			return ret;
+			goto out;
 	}
 
-	return 0;
+	if (rs->rs_rx_traces) {
556 struct rds_cmsg_rx_trace t;
557 int i, j;
558
559 inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
560 t.rx_traces = rs->rs_rx_traces;
561 for (i = 0; i < rs->rs_rx_traces; i++) {
562 j = rs->rs_rx_trace[i];
563 t.rx_trace_pos[i] = j;
564 t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
565 inc->i_rx_lat_trace[j];
566 }
567
568 ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
569 sizeof(t), &t);
570 if (ret)
571 goto out;
572 }
573
574out:
575 return ret;
546} 576}
547 577
548int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 578int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/net/rds/send.c b/net/rds/send.c
index 77c8c6e613ad..5cc64039caf7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -476,12 +476,14 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
476 struct rm_rdma_op *ro; 476 struct rm_rdma_op *ro;
477 struct rds_notifier *notifier; 477 struct rds_notifier *notifier;
478 unsigned long flags; 478 unsigned long flags;
479 unsigned int notify = 0;
479 480
480 spin_lock_irqsave(&rm->m_rs_lock, flags); 481 spin_lock_irqsave(&rm->m_rs_lock, flags);
481 482
483 notify = rm->rdma.op_notify | rm->data.op_notify;
482 ro = &rm->rdma; 484 ro = &rm->rdma;
483 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && 485 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
484 ro->op_active && ro->op_notify && ro->op_notifier) { 486 ro->op_active && notify && ro->op_notifier) {
485 notifier = ro->op_notifier; 487 notifier = ro->op_notifier;
486 rs = rm->m_rs; 488 rs = rm->m_rs;
487 sock_hold(rds_rs_to_sk(rs)); 489 sock_hold(rds_rs_to_sk(rs));
@@ -945,6 +947,11 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
945 ret = rds_cmsg_rdma_map(rs, rm, cmsg); 947 ret = rds_cmsg_rdma_map(rs, rm, cmsg);
946 if (!ret) 948 if (!ret)
947 *allocated_mr = 1; 949 *allocated_mr = 1;
950 else if (ret == -ENODEV)
951 /* Accommodate the get_mr() case which can fail
952 * if connection isn't established yet.
953 */
954 ret = -EAGAIN;
948 break; 955 break;
949 case RDS_CMSG_ATOMIC_CSWP: 956 case RDS_CMSG_ATOMIC_CSWP:
950 case RDS_CMSG_ATOMIC_FADD: 957 case RDS_CMSG_ATOMIC_FADD:
@@ -987,6 +994,26 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
987 return hash; 994 return hash;
988} 995}
989 996
997static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
998{
999 struct rds_rdma_args *args;
1000 struct cmsghdr *cmsg;
1001
1002 for_each_cmsghdr(cmsg, msg) {
1003 if (!CMSG_OK(msg, cmsg))
1004 return -EINVAL;
1005
1006 if (cmsg->cmsg_level != SOL_RDS)
1007 continue;
1008
1009 if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
1010 args = CMSG_DATA(cmsg);
1011 *rdma_bytes += args->remote_vec.bytes;
1012 }
1013 }
1014 return 0;
1015}
1016
990int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) 1017int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
991{ 1018{
992 struct sock *sk = sock->sk; 1019 struct sock *sk = sock->sk;
@@ -1001,6 +1028,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1001 int nonblock = msg->msg_flags & MSG_DONTWAIT; 1028 int nonblock = msg->msg_flags & MSG_DONTWAIT;
1002 long timeo = sock_sndtimeo(sk, nonblock); 1029 long timeo = sock_sndtimeo(sk, nonblock);
1003 struct rds_conn_path *cpath; 1030 struct rds_conn_path *cpath;
1031 size_t total_payload_len = payload_len, rdma_payload_len = 0;
1004 1032
1005 /* Mirror Linux UDP mirror of BSD error message compatibility */ 1033 /* Mirror Linux UDP mirror of BSD error message compatibility */
1006 /* XXX: Perhaps MSG_MORE someday */ 1034 /* XXX: Perhaps MSG_MORE someday */
@@ -1033,6 +1061,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1033 } 1061 }
1034 release_sock(sk); 1062 release_sock(sk);
1035 1063
1064 ret = rds_rdma_bytes(msg, &rdma_payload_len);
1065 if (ret)
1066 goto out;
1067
1068 total_payload_len += rdma_payload_len;
1069 if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
1070 ret = -EMSGSIZE;
1071 goto out;
1072 }
1073
1036 if (payload_len > rds_sk_sndbuf(rs)) { 1074 if (payload_len > rds_sk_sndbuf(rs)) {
1037 ret = -EMSGSIZE; 1075 ret = -EMSGSIZE;
1038 goto out; 1076 goto out;
@@ -1082,8 +1120,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1082 1120
1083 /* Parse any control messages the user may have included. */ 1121 /* Parse any control messages the user may have included. */
1084 ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); 1122 ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
-	if (ret)
+	if (ret) {
+		/* Trigger connection so that its ready for the next retry */
+		if (ret == -EAGAIN)
+			rds_conn_connect_if_down(conn);
 		goto out;
+	}
1087 1129
1088 if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { 1130 if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
1089 printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", 1131 printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
@@ -1169,7 +1211,7 @@ out:
1169 * or 1211 * or
1170 * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED 1212 * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED
1171 */ 1213 */
1172int 1214static int
1173rds_send_probe(struct rds_conn_path *cp, __be16 sport, 1215rds_send_probe(struct rds_conn_path *cp, __be16 sport,
1174 __be16 dport, u8 h_flags) 1216 __be16 dport, u8 h_flags)
1175{ 1217{
@@ -1238,7 +1280,7 @@ rds_send_pong(struct rds_conn_path *cp, __be16 dport)
1238 return rds_send_probe(cp, 0, dport, 0); 1280 return rds_send_probe(cp, 0, dport, 0);
1239} 1281}
1240 1282
1241void 1283static void
1242rds_send_ping(struct rds_connection *conn) 1284rds_send_ping(struct rds_connection *conn)
1243{ 1285{
1244 unsigned long flags; 1286 unsigned long flags;
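rds_sendmsg() now totals the RDMA bytes requested via RDS_CMSG_RDMA_ARGS and rejects a call when either the immediate payload or the RDMA payload exceeds RDS_MAX_MSG_SIZE. A compact sketch of that gate with the cmsg walk replaced by two pre-computed byte counts (illustrative only; -90 stands in for -EMSGSIZE):

	#include <stdio.h>
	#include <stddef.h>

	#define RDS_MAX_MSG_SIZE (1u << 20)

	static int check_msg_size(size_t payload_len, size_t rdma_payload_len)
	{
		size_t larger = payload_len > rdma_payload_len ?
				payload_len : rdma_payload_len;

		return larger > RDS_MAX_MSG_SIZE ? -90 /* -EMSGSIZE */ : 0;
	}

	int main(void)
	{
		printf("1 MiB payload: %d\n", check_msg_size(1u << 20, 0));
		printf("2 MiB rdma:    %d\n", check_msg_size(4096, 2u << 20));
		return 0;
	}

The same hunk also maps a get_mr() failure of -ENODEV to -EAGAIN and kicks rds_conn_connect_if_down(), so a send that arrives before the connection is established can simply be retried.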
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 57bb52361e0f..225690076773 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -484,9 +484,10 @@ static void __net_exit rds_tcp_exit_net(struct net *net)
484 * we do need to clean up the listen socket here. 484 * we do need to clean up the listen socket here.
485 */ 485 */
486 if (rtn->rds_tcp_listen_sock) { 486 if (rtn->rds_tcp_listen_sock) {
-		rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
+		struct socket *lsock = rtn->rds_tcp_listen_sock;
+
 		rtn->rds_tcp_listen_sock = NULL;
-		flush_work(&rtn->rds_tcp_accept_w);
+		rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
490 } 491 }
491} 492}
492 493
@@ -523,13 +524,13 @@ static void rds_tcp_kill_sock(struct net *net)
523 struct rds_tcp_connection *tc, *_tc; 524 struct rds_tcp_connection *tc, *_tc;
524 LIST_HEAD(tmp_list); 525 LIST_HEAD(tmp_list);
525 struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); 526 struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+	struct socket *lsock = rtn->rds_tcp_listen_sock;
 
-	rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
 	rtn->rds_tcp_listen_sock = NULL;
-	flush_work(&rtn->rds_tcp_accept_w);
+	rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
530 spin_lock_irq(&rds_tcp_conn_lock); 531 spin_lock_irq(&rds_tcp_conn_lock);
531 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { 532 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
532 struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); 533 struct net *c_net = tc->t_cpath->cp_conn->c_net;
533 534
534 if (net != c_net || !tc->t_sock) 535 if (net != c_net || !tc->t_sock)
535 continue; 536 continue;
@@ -546,8 +547,12 @@ static void rds_tcp_kill_sock(struct net *net)
546void *rds_tcp_listen_sock_def_readable(struct net *net) 547void *rds_tcp_listen_sock_def_readable(struct net *net)
547{ 548{
548 struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); 549 struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
550 struct socket *lsock = rtn->rds_tcp_listen_sock;
551
552 if (!lsock)
553 return NULL;
549 554
550 return rtn->rds_tcp_listen_sock->sk->sk_user_data; 555 return lsock->sk->sk_user_data;
551} 556}
552 557
553static int rds_tcp_dev_event(struct notifier_block *this, 558static int rds_tcp_dev_event(struct notifier_block *this,
@@ -584,7 +589,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
584 589
585 spin_lock_irq(&rds_tcp_conn_lock); 590 spin_lock_irq(&rds_tcp_conn_lock);
586 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { 591 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
587 struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); 592 struct net *c_net = tc->t_cpath->cp_conn->c_net;
588 593
589 if (net != c_net || !tc->t_sock) 594 if (net != c_net || !tc->t_sock)
590 continue; 595 continue;
@@ -638,35 +643,31 @@ static int rds_tcp_init(void)
638 goto out; 643 goto out;
639 } 644 }
640 645
-	ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
-	if (ret) {
-		pr_warn("could not register rds_tcp_dev_notifier\n");
-		goto out;
-	}
-
-	ret = register_pernet_subsys(&rds_tcp_net_ops);
+	ret = rds_tcp_recv_init();
 	if (ret)
 		goto out_slab;
 
-	ret = rds_tcp_recv_init();
+	ret = register_pernet_subsys(&rds_tcp_net_ops);
 	if (ret)
+		goto out_recv;
+
+	ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
+	if (ret) {
+		pr_warn("could not register rds_tcp_dev_notifier\n");
 		goto out_pernet;
+	}
 
-	ret = rds_trans_register(&rds_tcp_transport);
-	if (ret)
-		goto out_recv;
+	rds_trans_register(&rds_tcp_transport);
 
 	rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
 
 	goto out;
 
-out_recv:
-	rds_tcp_recv_exit();
 out_pernet:
 	unregister_pernet_subsys(&rds_tcp_net_ops);
+out_recv:
+	rds_tcp_recv_exit();
 out_slab:
-	if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
-		pr_warn("could not unregister rds_tcp_dev_notifier\n");
 	kmem_cache_destroy(rds_tcp_conn_slab);
 out:
 	return ret;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 9a1cc8906576..56ea6620fcf9 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -66,7 +66,7 @@ void rds_tcp_state_change(struct sock *sk);
66 66
67/* tcp_listen.c */ 67/* tcp_listen.c */
68struct socket *rds_tcp_listen_init(struct net *); 68struct socket *rds_tcp_listen_init(struct net *);
69void rds_tcp_listen_stop(struct socket *); 69void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
70void rds_tcp_listen_data_ready(struct sock *sk); 70void rds_tcp_listen_data_ready(struct sock *sk);
71int rds_tcp_accept_one(struct socket *sock); 71int rds_tcp_accept_one(struct socket *sock);
72int rds_tcp_keepalive(struct socket *sock); 72int rds_tcp_keepalive(struct socket *sock);
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index f74bab3ecdca..507678853e6c 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -79,6 +79,7 @@ bail:
79 * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side 79 * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
80 * by moving them to CONNECTING in this function. 80 * by moving them to CONNECTING in this function.
81 */ 81 */
82static
82struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) 83struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
83{ 84{
84 int i; 85 int i;
@@ -132,7 +133,7 @@ int rds_tcp_accept_one(struct socket *sock)
132 133
133 new_sock->type = sock->type; 134 new_sock->type = sock->type;
134 new_sock->ops = sock->ops; 135 new_sock->ops = sock->ops;
135 ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); 136 ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true);
136 if (ret < 0) 137 if (ret < 0)
137 goto out; 138 goto out;
138 139
@@ -222,6 +223,9 @@ void rds_tcp_listen_data_ready(struct sock *sk)
222 * before it has been accepted and the accepter has set up their 223 * before it has been accepted and the accepter has set up their
223 * data_ready.. we only want to queue listen work for our listening 224 * data_ready.. we only want to queue listen work for our listening
224 * socket 225 * socket
226 *
227 * (*ready)() may be null if we are racing with netns delete, and
228 * the listen socket is being torn down.
225 */ 229 */
226 if (sk->sk_state == TCP_LISTEN) 230 if (sk->sk_state == TCP_LISTEN)
227 rds_tcp_accept_work(sk); 231 rds_tcp_accept_work(sk);
@@ -230,7 +234,8 @@ void rds_tcp_listen_data_ready(struct sock *sk)
230 234
231out: 235out:
232 read_unlock_bh(&sk->sk_callback_lock); 236 read_unlock_bh(&sk->sk_callback_lock);
-	ready(sk);
+	if (ready)
+		ready(sk);
234} 239}
235 240
236struct socket *rds_tcp_listen_init(struct net *net) 241struct socket *rds_tcp_listen_init(struct net *net)
@@ -270,7 +275,7 @@ out:
270 return NULL; 275 return NULL;
271} 276}
272 277
273void rds_tcp_listen_stop(struct socket *sock) 278void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor)
274{ 279{
275 struct sock *sk; 280 struct sock *sk;
276 281
@@ -291,5 +296,6 @@ void rds_tcp_listen_stop(struct socket *sock)
291 296
292 /* wait for accepts to stop and close the socket */ 297 /* wait for accepts to stop and close the socket */
293 flush_workqueue(rds_wq); 298 flush_workqueue(rds_wq);
299 flush_work(acceptor);
294 sock_release(sock); 300 sock_release(sock);
295} 301}
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index ad4892e97f91..e006ef8e6d40 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -180,6 +180,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
180 rdsdebug("alloced tinc %p\n", tinc); 180 rdsdebug("alloced tinc %p\n", tinc);
181 rds_inc_path_init(&tinc->ti_inc, cp, 181 rds_inc_path_init(&tinc->ti_inc, cp,
182 cp->cp_conn->c_faddr); 182 cp->cp_conn->c_faddr);
183 tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
184 local_clock();
185
183 /* 186 /*
184 * XXX * we might be able to use the __ variants when 187 * XXX * we might be able to use the __ variants when
185 * we've already serialized at a higher level. 188 * we've already serialized at a higher level.
@@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
204 /* could be 0 for a 0 len message */ 207 /* could be 0 for a 0 len message */
205 tc->t_tinc_data_rem = 208 tc->t_tinc_data_rem =
206 be32_to_cpu(tinc->ti_inc.i_hdr.h_len); 209 be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
210 tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
211 local_clock();
207 } 212 }
208 } 213 }
209 214
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 2ffd3e30c643..0b188dd0a344 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -40,7 +40,7 @@
40static struct rds_transport *transports[RDS_TRANS_COUNT]; 40static struct rds_transport *transports[RDS_TRANS_COUNT];
41static DECLARE_RWSEM(rds_trans_sem); 41static DECLARE_RWSEM(rds_trans_sem);
42 42
43int rds_trans_register(struct rds_transport *trans) 43void rds_trans_register(struct rds_transport *trans)
44{ 44{
45 BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); 45 BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
46 46
@@ -55,8 +55,6 @@ int rds_trans_register(struct rds_transport *trans)
55 } 55 }
56 56
57 up_write(&rds_trans_sem); 57 up_write(&rds_trans_sem);
58
59 return 0;
60} 58}
61EXPORT_SYMBOL_GPL(rds_trans_register); 59EXPORT_SYMBOL_GPL(rds_trans_register);
62 60
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 868f1ad0415a..060600b03fad 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -23,17 +23,6 @@ config RFKILL_INPUT
23 depends on INPUT = y || RFKILL = INPUT 23 depends on INPUT = y || RFKILL = INPUT
24 default y if !EXPERT 24 default y if !EXPERT
25 25
26config RFKILL_REGULATOR
27 tristate "Generic rfkill regulator driver"
28 depends on RFKILL || !RFKILL
29 depends on REGULATOR
30 help
31 This options enable controlling radio transmitters connected to
32 voltage regulator using the regulator framework.
33
34 To compile this driver as a module, choose M here: the module will
35 be called rfkill-regulator.
36
37config RFKILL_GPIO 26config RFKILL_GPIO
38 tristate "GPIO RFKILL driver" 27 tristate "GPIO RFKILL driver"
39 depends on RFKILL 28 depends on RFKILL
diff --git a/net/rfkill/Makefile b/net/rfkill/Makefile
index 311768783f4a..87a80aded0b3 100644
--- a/net/rfkill/Makefile
+++ b/net/rfkill/Makefile
@@ -5,5 +5,4 @@
5rfkill-y += core.o 5rfkill-y += core.o
6rfkill-$(CONFIG_RFKILL_INPUT) += input.o 6rfkill-$(CONFIG_RFKILL_INPUT) += input.o
7obj-$(CONFIG_RFKILL) += rfkill.o 7obj-$(CONFIG_RFKILL) += rfkill.o
8obj-$(CONFIG_RFKILL_REGULATOR) += rfkill-regulator.o
9obj-$(CONFIG_RFKILL_GPIO) += rfkill-gpio.o 8obj-$(CONFIG_RFKILL_GPIO) += rfkill-gpio.o
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 884027f62783..2064c3a35ef8 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -176,6 +176,50 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
176{ 176{
177 led_trigger_unregister(&rfkill->led_trigger); 177 led_trigger_unregister(&rfkill->led_trigger);
178} 178}
179
180static struct led_trigger rfkill_any_led_trigger;
181static struct work_struct rfkill_any_work;
182
183static void rfkill_any_led_trigger_worker(struct work_struct *work)
184{
185 enum led_brightness brightness = LED_OFF;
186 struct rfkill *rfkill;
187
188 mutex_lock(&rfkill_global_mutex);
189 list_for_each_entry(rfkill, &rfkill_list, node) {
190 if (!(rfkill->state & RFKILL_BLOCK_ANY)) {
191 brightness = LED_FULL;
192 break;
193 }
194 }
195 mutex_unlock(&rfkill_global_mutex);
196
197 led_trigger_event(&rfkill_any_led_trigger, brightness);
198}
199
200static void rfkill_any_led_trigger_event(void)
201{
202 schedule_work(&rfkill_any_work);
203}
204
205static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev)
206{
207 rfkill_any_led_trigger_event();
208}
209
210static int rfkill_any_led_trigger_register(void)
211{
212 INIT_WORK(&rfkill_any_work, rfkill_any_led_trigger_worker);
213 rfkill_any_led_trigger.name = "rfkill-any";
214 rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate;
215 return led_trigger_register(&rfkill_any_led_trigger);
216}
217
218static void rfkill_any_led_trigger_unregister(void)
219{
220 led_trigger_unregister(&rfkill_any_led_trigger);
221 cancel_work_sync(&rfkill_any_work);
222}
179#else 223#else
180static void rfkill_led_trigger_event(struct rfkill *rfkill) 224static void rfkill_led_trigger_event(struct rfkill *rfkill)
181{ 225{
@@ -189,6 +233,19 @@ static inline int rfkill_led_trigger_register(struct rfkill *rfkill)
189static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill) 233static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
190{ 234{
191} 235}
236
237static void rfkill_any_led_trigger_event(void)
238{
239}
240
241static int rfkill_any_led_trigger_register(void)
242{
243 return 0;
244}
245
246static void rfkill_any_led_trigger_unregister(void)
247{
248}
192#endif /* CONFIG_RFKILL_LEDS */ 249#endif /* CONFIG_RFKILL_LEDS */
193 250
194static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill, 251static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
@@ -297,6 +354,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
297 spin_unlock_irqrestore(&rfkill->lock, flags); 354 spin_unlock_irqrestore(&rfkill->lock, flags);
298 355
299 rfkill_led_trigger_event(rfkill); 356 rfkill_led_trigger_event(rfkill);
357 rfkill_any_led_trigger_event();
300 358
301 if (prev != curr) 359 if (prev != curr)
302 rfkill_event(rfkill); 360 rfkill_event(rfkill);
@@ -477,11 +535,9 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
477 spin_unlock_irqrestore(&rfkill->lock, flags); 535 spin_unlock_irqrestore(&rfkill->lock, flags);
478 536
479 rfkill_led_trigger_event(rfkill); 537 rfkill_led_trigger_event(rfkill);
538 rfkill_any_led_trigger_event();
480 539
-	if (!rfkill->registered)
-		return ret;
-
-	if (prev != blocked)
+	if (rfkill->registered && prev != blocked)
 		schedule_work(&rfkill->uevent_work);
486 542
487 return ret; 543 return ret;
@@ -523,6 +579,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
523 schedule_work(&rfkill->uevent_work); 579 schedule_work(&rfkill->uevent_work);
524 580
525 rfkill_led_trigger_event(rfkill); 581 rfkill_led_trigger_event(rfkill);
582 rfkill_any_led_trigger_event();
526 583
527 return blocked; 584 return blocked;
528} 585}
@@ -572,6 +629,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
572 schedule_work(&rfkill->uevent_work); 629 schedule_work(&rfkill->uevent_work);
573 630
574 rfkill_led_trigger_event(rfkill); 631 rfkill_led_trigger_event(rfkill);
632 rfkill_any_led_trigger_event();
575 } 633 }
576} 634}
577EXPORT_SYMBOL(rfkill_set_states); 635EXPORT_SYMBOL(rfkill_set_states);
@@ -988,6 +1046,7 @@ int __must_check rfkill_register(struct rfkill *rfkill)
988#endif 1046#endif
989 } 1047 }
990 1048
1049 rfkill_any_led_trigger_event();
991 rfkill_send_events(rfkill, RFKILL_OP_ADD); 1050 rfkill_send_events(rfkill, RFKILL_OP_ADD);
992 1051
993 mutex_unlock(&rfkill_global_mutex); 1052 mutex_unlock(&rfkill_global_mutex);
@@ -1020,6 +1079,7 @@ void rfkill_unregister(struct rfkill *rfkill)
1020 mutex_lock(&rfkill_global_mutex); 1079 mutex_lock(&rfkill_global_mutex);
1021 rfkill_send_events(rfkill, RFKILL_OP_DEL); 1080 rfkill_send_events(rfkill, RFKILL_OP_DEL);
1022 list_del_init(&rfkill->node); 1081 list_del_init(&rfkill->node);
1082 rfkill_any_led_trigger_event();
1023 mutex_unlock(&rfkill_global_mutex); 1083 mutex_unlock(&rfkill_global_mutex);
1024 1084
1025 rfkill_led_trigger_unregister(rfkill); 1085 rfkill_led_trigger_unregister(rfkill);
@@ -1266,24 +1326,33 @@ static int __init rfkill_init(void)
1266 1326
1267 error = class_register(&rfkill_class); 1327 error = class_register(&rfkill_class);
1268 if (error) 1328 if (error)
1269 goto out; 1329 goto error_class;
1270 1330
1271 error = misc_register(&rfkill_miscdev); 1331 error = misc_register(&rfkill_miscdev);
1272 if (error) { 1332 if (error)
1273 class_unregister(&rfkill_class); 1333 goto error_misc;
1274 goto out; 1334
1275 } 1335 error = rfkill_any_led_trigger_register();
1336 if (error)
1337 goto error_led_trigger;
1276 1338
1277#ifdef CONFIG_RFKILL_INPUT 1339#ifdef CONFIG_RFKILL_INPUT
1278 error = rfkill_handler_init(); 1340 error = rfkill_handler_init();
1279 if (error) { 1341 if (error)
1280 misc_deregister(&rfkill_miscdev); 1342 goto error_input;
1281 class_unregister(&rfkill_class);
1282 goto out;
1283 }
1284#endif 1343#endif
1285 1344
1286 out: 1345 return 0;
1346
1347#ifdef CONFIG_RFKILL_INPUT
1348error_input:
1349 rfkill_any_led_trigger_unregister();
1350#endif
1351error_led_trigger:
1352 misc_deregister(&rfkill_miscdev);
1353error_misc:
1354 class_unregister(&rfkill_class);
1355error_class:
1287 return error; 1356 return error;
1288} 1357}
1289subsys_initcall(rfkill_init); 1358subsys_initcall(rfkill_init);
@@ -1293,6 +1362,7 @@ static void __exit rfkill_exit(void)
1293#ifdef CONFIG_RFKILL_INPUT 1362#ifdef CONFIG_RFKILL_INPUT
1294 rfkill_handler_exit(); 1363 rfkill_handler_exit();
1295#endif 1364#endif
1365 rfkill_any_led_trigger_unregister();
1296 misc_deregister(&rfkill_miscdev); 1366 misc_deregister(&rfkill_miscdev);
1297 class_unregister(&rfkill_class); 1367 class_unregister(&rfkill_class);
1298} 1368}
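[Editor's note] The rfkill_init() rework above replaces per-site rollback with a single goto unwind ladder: each setup step that fails jumps to a label that tears down everything set up before it, in reverse order. A minimal, self-contained sketch of that idiom (the step names here are illustrative, not the kernel's):

#include <stdio.h>

static int step_a(void) { return 0; }   /* e.g. class registration  */
static int step_b(void) { return 0; }   /* e.g. misc device         */
static int step_c(void) { return -1; }  /* e.g. LED trigger, failing here */

static void undo_b(void) { puts("undo b"); }
static void undo_a(void) { puts("undo a"); }

static int init_all(void)
{
	int error;

	error = step_a();
	if (error)
		goto err_a;
	error = step_b();
	if (error)
		goto err_b;
	error = step_c();
	if (error)
		goto err_c;
	return 0;

err_c:				/* undo in reverse order of setup */
	undo_b();
err_b:
	undo_a();
err_a:
	return error;
}

int main(void)
{
	return init_all() ? 1 : 0;
}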
diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c
deleted file mode 100644
index 50cd26a48e87..000000000000
--- a/net/rfkill/rfkill-regulator.c
+++ /dev/null
@@ -1,154 +0,0 @@
1/*
2 * rfkill-regulator.c - Regulator consumer driver for rfkill
3 *
4 * Copyright (C) 2009 Guiming Zhuo <gmzhuo@gmail.com>
5 * Copyright (C) 2011 Antonio Ospite <ospite@studenti.unina.it>
6 *
7 * Implementation inspired by leds-regulator driver.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/err.h>
17#include <linux/slab.h>
18#include <linux/platform_device.h>
19#include <linux/regulator/consumer.h>
20#include <linux/rfkill.h>
21#include <linux/rfkill-regulator.h>
22
23struct rfkill_regulator_data {
24 struct rfkill *rf_kill;
25 bool reg_enabled;
26
27 struct regulator *vcc;
28};
29
30static int rfkill_regulator_set_block(void *data, bool blocked)
31{
32 struct rfkill_regulator_data *rfkill_data = data;
33 int ret = 0;
34
35 pr_debug("%s: blocked: %d\n", __func__, blocked);
36
37 if (blocked) {
38 if (rfkill_data->reg_enabled) {
39 regulator_disable(rfkill_data->vcc);
40 rfkill_data->reg_enabled = false;
41 }
42 } else {
43 if (!rfkill_data->reg_enabled) {
44 ret = regulator_enable(rfkill_data->vcc);
45 if (!ret)
46 rfkill_data->reg_enabled = true;
47 }
48 }
49
50 pr_debug("%s: regulator_is_enabled after set_block: %d\n", __func__,
51 regulator_is_enabled(rfkill_data->vcc));
52
53 return ret;
54}
55
56static struct rfkill_ops rfkill_regulator_ops = {
57 .set_block = rfkill_regulator_set_block,
58};
59
60static int rfkill_regulator_probe(struct platform_device *pdev)
61{
62 struct rfkill_regulator_platform_data *pdata = pdev->dev.platform_data;
63 struct rfkill_regulator_data *rfkill_data;
64 struct regulator *vcc;
65 struct rfkill *rf_kill;
66 int ret = 0;
67
68 if (pdata == NULL) {
69 dev_err(&pdev->dev, "no platform data\n");
70 return -ENODEV;
71 }
72
73 if (pdata->name == NULL || pdata->type == 0) {
74 dev_err(&pdev->dev, "invalid name or type in platform data\n");
75 return -EINVAL;
76 }
77
78 vcc = regulator_get_exclusive(&pdev->dev, "vrfkill");
79 if (IS_ERR(vcc)) {
80 dev_err(&pdev->dev, "Cannot get vcc for %s\n", pdata->name);
81 ret = PTR_ERR(vcc);
82 goto out;
83 }
84
85 rfkill_data = kzalloc(sizeof(*rfkill_data), GFP_KERNEL);
86 if (rfkill_data == NULL) {
87 ret = -ENOMEM;
88 goto err_data_alloc;
89 }
90
91 rf_kill = rfkill_alloc(pdata->name, &pdev->dev,
92 pdata->type,
93 &rfkill_regulator_ops, rfkill_data);
94 if (rf_kill == NULL) {
95 ret = -ENOMEM;
96 goto err_rfkill_alloc;
97 }
98
99 if (regulator_is_enabled(vcc)) {
100 dev_dbg(&pdev->dev, "Regulator already enabled\n");
101 rfkill_data->reg_enabled = true;
102 }
103 rfkill_data->vcc = vcc;
104 rfkill_data->rf_kill = rf_kill;
105
106 ret = rfkill_register(rf_kill);
107 if (ret) {
108 dev_err(&pdev->dev, "Cannot register rfkill device\n");
109 goto err_rfkill_register;
110 }
111
112 platform_set_drvdata(pdev, rfkill_data);
113 dev_info(&pdev->dev, "%s initialized\n", pdata->name);
114
115 return 0;
116
117err_rfkill_register:
118 rfkill_destroy(rf_kill);
119err_rfkill_alloc:
120 kfree(rfkill_data);
121err_data_alloc:
122 regulator_put(vcc);
123out:
124 return ret;
125}
126
127static int rfkill_regulator_remove(struct platform_device *pdev)
128{
129 struct rfkill_regulator_data *rfkill_data = platform_get_drvdata(pdev);
130 struct rfkill *rf_kill = rfkill_data->rf_kill;
131
132 rfkill_unregister(rf_kill);
133 rfkill_destroy(rf_kill);
134 regulator_put(rfkill_data->vcc);
135 kfree(rfkill_data);
136
137 return 0;
138}
139
140static struct platform_driver rfkill_regulator_driver = {
141 .probe = rfkill_regulator_probe,
142 .remove = rfkill_regulator_remove,
143 .driver = {
144 .name = "rfkill-regulator",
145 },
146};
147
148module_platform_driver(rfkill_regulator_driver);
149
150MODULE_AUTHOR("Guiming Zhuo <gmzhuo@gmail.com>");
151MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>");
152MODULE_DESCRIPTION("Regulator consumer driver for rfkill");
153MODULE_LICENSE("GPL");
154MODULE_ALIAS("platform:rfkill-regulator");
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 9ad301c46b88..4a9729257023 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -20,7 +20,7 @@
20#include <linux/in.h> 20#include <linux/in.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/sched.h> 23#include <linux/sched/signal.h>
24#include <linux/spinlock.h> 24#include <linux/spinlock.h>
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/string.h> 26#include <linux/string.h>
@@ -871,7 +871,8 @@ out_release:
871 return err; 871 return err;
872} 872}
873 873
874static int rose_accept(struct socket *sock, struct socket *newsock, int flags) 874static int rose_accept(struct socket *sock, struct socket *newsock, int flags,
875 bool kern)
875{ 876{
876 struct sk_buff *skb; 877 struct sk_buff *skb;
877 struct sock *newsk; 878 struct sock *newsk;
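[Editor's note] The rose hunk above is one instance of a tree-wide change that threads a "kern" flag into the protocol accept operation so the callee can tell kernel-initiated accepts from userspace ones. A stand-alone sketch of the shape of that signature change, using stub types rather than the real kernel structs:

#include <stdbool.h>

struct socket;	/* stand-in for the kernel's struct socket */

/* Old shape: the callee could not distinguish kernel-internal accepts. */
typedef int (*accept_old_t)(struct socket *sock, struct socket *newsock,
			    int flags);

/* New shape: callers pass kern = true when the accept originates inside
 * the kernel, and the callee adjusts how it sets up the new sock.
 */
typedef int (*accept_new_t)(struct socket *sock, struct socket *newsock,
			    int flags, bool kern);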
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index 8fc6ea347182..b9da4d6b914f 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -2,7 +2,9 @@
2# Makefile for Linux kernel RxRPC 2# Makefile for Linux kernel RxRPC
3# 3#
4 4
5af-rxrpc-y := \ 5obj-$(CONFIG_AF_RXRPC) += rxrpc.o
6
7rxrpc-y := \
6 af_rxrpc.o \ 8 af_rxrpc.o \
7 call_accept.o \ 9 call_accept.o \
8 call_event.o \ 10 call_event.o \
@@ -26,8 +28,6 @@ af-rxrpc-y := \
26 skbuff.o \ 28 skbuff.o \
27 utils.o 29 utils.o
28 30
29af-rxrpc-$(CONFIG_PROC_FS) += proc.o 31rxrpc-$(CONFIG_PROC_FS) += proc.o
30af-rxrpc-$(CONFIG_RXKAD) += rxkad.o 32rxrpc-$(CONFIG_RXKAD) += rxkad.o
31af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o 33rxrpc-$(CONFIG_SYSCTL) += sysctl.o
32
33obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 5f63f6dcaabb..7fb59c3f1542 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -224,6 +224,14 @@ static int rxrpc_listen(struct socket *sock, int backlog)
224 else 224 else
225 sk->sk_max_ack_backlog = old; 225 sk->sk_max_ack_backlog = old;
226 break; 226 break;
227 case RXRPC_SERVER_LISTENING:
228 if (backlog == 0) {
229 rx->sk.sk_state = RXRPC_SERVER_LISTEN_DISABLED;
230 sk->sk_max_ack_backlog = 0;
231 rxrpc_discard_prealloc(rx);
232 ret = 0;
233 break;
234 }
227 default: 235 default:
228 ret = -EBUSY; 236 ret = -EBUSY;
229 break; 237 break;
@@ -282,10 +290,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
282 cp.exclusive = false; 290 cp.exclusive = false;
283 cp.service_id = srx->srx_service; 291 cp.service_id = srx->srx_service;
284 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp); 292 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp);
293 /* The socket has been unlocked. */
285 if (!IS_ERR(call)) 294 if (!IS_ERR(call))
286 call->notify_rx = notify_rx; 295 call->notify_rx = notify_rx;
287 296
288 release_sock(&rx->sk); 297 mutex_unlock(&call->user_mutex);
289 _leave(" = %p", call); 298 _leave(" = %p", call);
290 return call; 299 return call;
291} 300}
@@ -302,7 +311,10 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call);
302void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) 311void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call)
303{ 312{
304 _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); 313 _enter("%d{%d}", call->debug_id, atomic_read(&call->usage));
314
315 mutex_lock(&call->user_mutex);
305 rxrpc_release_call(rxrpc_sk(sock->sk), call); 316 rxrpc_release_call(rxrpc_sk(sock->sk), call);
317 mutex_unlock(&call->user_mutex);
306 rxrpc_put_call(call, rxrpc_call_put_kernel); 318 rxrpc_put_call(call, rxrpc_call_put_kernel);
307} 319}
308EXPORT_SYMBOL(rxrpc_kernel_end_call); 320EXPORT_SYMBOL(rxrpc_kernel_end_call);
@@ -442,14 +454,16 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
442 case RXRPC_SERVER_BOUND: 454 case RXRPC_SERVER_BOUND:
443 case RXRPC_SERVER_LISTENING: 455 case RXRPC_SERVER_LISTENING:
444 ret = rxrpc_do_sendmsg(rx, m, len); 456 ret = rxrpc_do_sendmsg(rx, m, len);
445 break; 457 /* The socket has been unlocked */
458 goto out;
446 default: 459 default:
447 ret = -EINVAL; 460 ret = -EINVAL;
448 break; 461 goto error_unlock;
449 } 462 }
450 463
451error_unlock: 464error_unlock:
452 release_sock(&rx->sk); 465 release_sock(&rx->sk);
466out:
453 _leave(" = %d", ret); 467 _leave(" = %d", ret);
454 return ret; 468 return ret;
455} 469}
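[Editor's note] With the new RXRPC_SERVER_LISTEN_DISABLED state, a second listen() with a zero backlog stops a listening AF_RXRPC service socket from taking further calls and discards its preallocated calls, where it previously just returned -EBUSY. A hedged userspace sketch (socket creation and binding are assumed to have happened elsewhere):

#include <sys/socket.h>
#include <stdio.h>

/* Stop a bound-and-listening AF_RXRPC service socket from accepting any
 * more incoming calls; subsequent calls are rejected by the kernel.
 * A backlog of 0 is what selects the new LISTEN_DISABLED state.
 */
static int rxrpc_stop_listening(int fd)
{
	if (listen(fd, 0) == -1) {
		perror("listen(fd, 0)");
		return -1;
	}
	return 0;
}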
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index f60e35576526..26a7b1db1361 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -60,6 +60,7 @@ enum {
60 RXRPC_CLIENT_BOUND, /* client local address bound */ 60 RXRPC_CLIENT_BOUND, /* client local address bound */
61 RXRPC_SERVER_BOUND, /* server local address bound */ 61 RXRPC_SERVER_BOUND, /* server local address bound */
62 RXRPC_SERVER_LISTENING, /* server listening for connections */ 62 RXRPC_SERVER_LISTENING, /* server listening for connections */
63 RXRPC_SERVER_LISTEN_DISABLED, /* server listening disabled */
63 RXRPC_CLOSE, /* socket is being closed */ 64 RXRPC_CLOSE, /* socket is being closed */
64}; 65};
65 66
@@ -466,6 +467,7 @@ struct rxrpc_call {
466 struct rxrpc_connection *conn; /* connection carrying call */ 467 struct rxrpc_connection *conn; /* connection carrying call */
467 struct rxrpc_peer *peer; /* Peer record for remote address */ 468 struct rxrpc_peer *peer; /* Peer record for remote address */
468 struct rxrpc_sock __rcu *socket; /* socket responsible */ 469 struct rxrpc_sock __rcu *socket; /* socket responsible */
470 struct mutex user_mutex; /* User access mutex */
469 ktime_t ack_at; /* When deferred ACK needs to happen */ 471 ktime_t ack_at; /* When deferred ACK needs to happen */
470 ktime_t resend_at; /* When next resend needs to happen */ 472 ktime_t resend_at; /* When next resend needs to happen */
471 ktime_t ping_at; /* When next to send a ping */ 473 ktime_t ping_at; /* When next to send a ping */
@@ -593,200 +595,6 @@ struct rxrpc_ack_summary {
593 u8 cumulative_acks; 595 u8 cumulative_acks;
594}; 596};
595 597
596enum rxrpc_skb_trace {
597 rxrpc_skb_rx_cleaned,
598 rxrpc_skb_rx_freed,
599 rxrpc_skb_rx_got,
600 rxrpc_skb_rx_lost,
601 rxrpc_skb_rx_received,
602 rxrpc_skb_rx_rotated,
603 rxrpc_skb_rx_purged,
604 rxrpc_skb_rx_seen,
605 rxrpc_skb_tx_cleaned,
606 rxrpc_skb_tx_freed,
607 rxrpc_skb_tx_got,
608 rxrpc_skb_tx_new,
609 rxrpc_skb_tx_rotated,
610 rxrpc_skb_tx_seen,
611 rxrpc_skb__nr_trace
612};
613
614extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7];
615
616enum rxrpc_conn_trace {
617 rxrpc_conn_new_client,
618 rxrpc_conn_new_service,
619 rxrpc_conn_queued,
620 rxrpc_conn_seen,
621 rxrpc_conn_got,
622 rxrpc_conn_put_client,
623 rxrpc_conn_put_service,
624 rxrpc_conn__nr_trace
625};
626
627extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4];
628
629enum rxrpc_client_trace {
630 rxrpc_client_activate_chans,
631 rxrpc_client_alloc,
632 rxrpc_client_chan_activate,
633 rxrpc_client_chan_disconnect,
634 rxrpc_client_chan_pass,
635 rxrpc_client_chan_unstarted,
636 rxrpc_client_cleanup,
637 rxrpc_client_count,
638 rxrpc_client_discard,
639 rxrpc_client_duplicate,
640 rxrpc_client_exposed,
641 rxrpc_client_replace,
642 rxrpc_client_to_active,
643 rxrpc_client_to_culled,
644 rxrpc_client_to_idle,
645 rxrpc_client_to_inactive,
646 rxrpc_client_to_waiting,
647 rxrpc_client_uncount,
648 rxrpc_client__nr_trace
649};
650
651extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7];
652extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5];
653
654enum rxrpc_call_trace {
655 rxrpc_call_new_client,
656 rxrpc_call_new_service,
657 rxrpc_call_queued,
658 rxrpc_call_queued_ref,
659 rxrpc_call_seen,
660 rxrpc_call_connected,
661 rxrpc_call_release,
662 rxrpc_call_got,
663 rxrpc_call_got_userid,
664 rxrpc_call_got_kernel,
665 rxrpc_call_put,
666 rxrpc_call_put_userid,
667 rxrpc_call_put_kernel,
668 rxrpc_call_put_noqueue,
669 rxrpc_call_error,
670 rxrpc_call__nr_trace
671};
672
673extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4];
674
675enum rxrpc_transmit_trace {
676 rxrpc_transmit_wait,
677 rxrpc_transmit_queue,
678 rxrpc_transmit_queue_last,
679 rxrpc_transmit_rotate,
680 rxrpc_transmit_rotate_last,
681 rxrpc_transmit_await_reply,
682 rxrpc_transmit_end,
683 rxrpc_transmit__nr_trace
684};
685
686extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4];
687
688enum rxrpc_receive_trace {
689 rxrpc_receive_incoming,
690 rxrpc_receive_queue,
691 rxrpc_receive_queue_last,
692 rxrpc_receive_front,
693 rxrpc_receive_rotate,
694 rxrpc_receive_end,
695 rxrpc_receive__nr_trace
696};
697
698extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4];
699
700enum rxrpc_recvmsg_trace {
701 rxrpc_recvmsg_enter,
702 rxrpc_recvmsg_wait,
703 rxrpc_recvmsg_dequeue,
704 rxrpc_recvmsg_hole,
705 rxrpc_recvmsg_next,
706 rxrpc_recvmsg_cont,
707 rxrpc_recvmsg_full,
708 rxrpc_recvmsg_data_return,
709 rxrpc_recvmsg_terminal,
710 rxrpc_recvmsg_to_be_accepted,
711 rxrpc_recvmsg_return,
712 rxrpc_recvmsg__nr_trace
713};
714
715extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5];
716
717enum rxrpc_rtt_tx_trace {
718 rxrpc_rtt_tx_ping,
719 rxrpc_rtt_tx_data,
720 rxrpc_rtt_tx__nr_trace
721};
722
723extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5];
724
725enum rxrpc_rtt_rx_trace {
726 rxrpc_rtt_rx_ping_response,
727 rxrpc_rtt_rx_requested_ack,
728 rxrpc_rtt_rx__nr_trace
729};
730
731extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5];
732
733enum rxrpc_timer_trace {
734 rxrpc_timer_begin,
735 rxrpc_timer_init_for_reply,
736 rxrpc_timer_init_for_send_reply,
737 rxrpc_timer_expired,
738 rxrpc_timer_set_for_ack,
739 rxrpc_timer_set_for_ping,
740 rxrpc_timer_set_for_resend,
741 rxrpc_timer_set_for_send,
742 rxrpc_timer__nr_trace
743};
744
745extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8];
746
747enum rxrpc_propose_ack_trace {
748 rxrpc_propose_ack_client_tx_end,
749 rxrpc_propose_ack_input_data,
750 rxrpc_propose_ack_ping_for_lost_ack,
751 rxrpc_propose_ack_ping_for_lost_reply,
752 rxrpc_propose_ack_ping_for_params,
753 rxrpc_propose_ack_processing_op,
754 rxrpc_propose_ack_respond_to_ack,
755 rxrpc_propose_ack_respond_to_ping,
756 rxrpc_propose_ack_retry_tx,
757 rxrpc_propose_ack_rotate_rx,
758 rxrpc_propose_ack_terminal_ack,
759 rxrpc_propose_ack__nr_trace
760};
761
762enum rxrpc_propose_ack_outcome {
763 rxrpc_propose_ack_use,
764 rxrpc_propose_ack_update,
765 rxrpc_propose_ack_subsume,
766 rxrpc_propose_ack__nr_outcomes
767};
768
769extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8];
770extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes];
771
772enum rxrpc_congest_change {
773 rxrpc_cong_begin_retransmission,
774 rxrpc_cong_cleared_nacks,
775 rxrpc_cong_new_low_nack,
776 rxrpc_cong_no_change,
777 rxrpc_cong_progress,
778 rxrpc_cong_retransmit_again,
779 rxrpc_cong_rtt_window_end,
780 rxrpc_cong_saw_nack,
781 rxrpc_congest__nr_change
782};
783
784extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10];
785extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9];
786
787extern const char *const rxrpc_pkts[];
788extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4];
789
790#include <trace/events/rxrpc.h> 598#include <trace/events/rxrpc.h>
791 599
792/* 600/*
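[Editor's note] The structural change in ar-internal.h is the new per-call user_mutex: operations that act on one call (sendmsg, recvmsg, the in-kernel API) can now serialise on the call itself instead of holding the socket lock for their whole duration. A rough userspace analogue of moving from one coarse lock to a per-object mutex (illustrative stand-ins only; the real kernel structures differ):

#include <pthread.h>

struct fake_call {
	pthread_mutex_t user_mutex;	/* serialises user access to this call */
	int state;
};

struct fake_sock {
	pthread_mutex_t sk_lock;	/* still guards socket-wide state */
	struct fake_call *calls[16];	/* assumed populated elsewhere */
};

/* Before: every user operation held sk_lock from start to finish.
 * After: sk_lock is held just long enough to find the call and take the
 * call's own mutex, so long-running work on one call no longer blocks
 * the others.
 */
static void operate_on_call(struct fake_sock *sk, int idx)
{
	struct fake_call *call;

	pthread_mutex_lock(&sk->sk_lock);
	call = sk->calls[idx];
	pthread_mutex_lock(&call->user_mutex);
	pthread_mutex_unlock(&sk->sk_lock);

	call->state++;			/* the per-call work */

	pthread_mutex_unlock(&call->user_mutex);
}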
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 832d854c2d5c..0ed181f53f32 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -323,6 +323,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
323 * 323 *
324 * If we want to report an error, we mark the skb with the packet type and 324 * If we want to report an error, we mark the skb with the packet type and
325 * abort code and return NULL. 325 * abort code and return NULL.
326 *
327 * The call is returned with the user access mutex held.
326 */ 328 */
327struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, 329struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
328 struct rxrpc_connection *conn, 330 struct rxrpc_connection *conn,
@@ -349,7 +351,8 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
349 351
350found_service: 352found_service:
351 spin_lock(&rx->incoming_lock); 353 spin_lock(&rx->incoming_lock);
352 if (rx->sk.sk_state == RXRPC_CLOSE) { 354 if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED ||
355 rx->sk.sk_state == RXRPC_CLOSE) {
353 trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber, 356 trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber,
354 sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); 357 sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN);
355 skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; 358 skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
@@ -370,6 +373,18 @@ found_service:
370 trace_rxrpc_receive(call, rxrpc_receive_incoming, 373 trace_rxrpc_receive(call, rxrpc_receive_incoming,
371 sp->hdr.serial, sp->hdr.seq); 374 sp->hdr.serial, sp->hdr.seq);
372 375
376 /* Lock the call to prevent rxrpc_kernel_send/recv_data() and
377 * sendmsg()/recvmsg() inconveniently stealing the mutex once the
378 * notification is generated.
379 *
380 * The BUG should never happen because the kernel should be well
381 * behaved enough not to access the call before the first notification
382 * event and userspace is prevented from doing so until the state is
383 * appropriate.
384 */
385 if (!mutex_trylock(&call->user_mutex))
386 BUG();
387
373 /* Make the call live. */ 388 /* Make the call live. */
374 rxrpc_incoming_call(rx, call, skb); 389 rxrpc_incoming_call(rx, call, skb);
375 conn = call->conn; 390 conn = call->conn;
@@ -428,10 +443,12 @@ out:
428/* 443/*
429 * handle acceptance of a call by userspace 444 * handle acceptance of a call by userspace
430 * - assign the user call ID to the call at the front of the queue 445 * - assign the user call ID to the call at the front of the queue
446 * - called with the socket locked.
431 */ 447 */
432struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, 448struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
433 unsigned long user_call_ID, 449 unsigned long user_call_ID,
434 rxrpc_notify_rx_t notify_rx) 450 rxrpc_notify_rx_t notify_rx)
451 __releases(&rx->sk.sk_lock.slock)
435{ 452{
436 struct rxrpc_call *call; 453 struct rxrpc_call *call;
437 struct rb_node *parent, **pp; 454 struct rb_node *parent, **pp;
@@ -445,6 +462,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
445 462
446 if (list_empty(&rx->to_be_accepted)) { 463 if (list_empty(&rx->to_be_accepted)) {
447 write_unlock(&rx->call_lock); 464 write_unlock(&rx->call_lock);
465 release_sock(&rx->sk);
448 kleave(" = -ENODATA [empty]"); 466 kleave(" = -ENODATA [empty]");
449 return ERR_PTR(-ENODATA); 467 return ERR_PTR(-ENODATA);
450 } 468 }
@@ -469,10 +487,39 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
469 */ 487 */
470 call = list_entry(rx->to_be_accepted.next, 488 call = list_entry(rx->to_be_accepted.next,
471 struct rxrpc_call, accept_link); 489 struct rxrpc_call, accept_link);
490 write_unlock(&rx->call_lock);
491
492 /* We need to gain the mutex from the interrupt handler without
493 * upsetting lockdep, so we have to release it there and take it here.
494 * We are, however, still holding the socket lock, so other accepts
495 * must wait for us and no one can add the user ID behind our backs.
496 */
497 if (mutex_lock_interruptible(&call->user_mutex) < 0) {
498 release_sock(&rx->sk);
499 kleave(" = -ERESTARTSYS");
500 return ERR_PTR(-ERESTARTSYS);
501 }
502
503 write_lock(&rx->call_lock);
472 list_del_init(&call->accept_link); 504 list_del_init(&call->accept_link);
473 sk_acceptq_removed(&rx->sk); 505 sk_acceptq_removed(&rx->sk);
474 rxrpc_see_call(call); 506 rxrpc_see_call(call);
475 507
508 /* Find the user ID insertion point. */
509 pp = &rx->calls.rb_node;
510 parent = NULL;
511 while (*pp) {
512 parent = *pp;
513 call = rb_entry(parent, struct rxrpc_call, sock_node);
514
515 if (user_call_ID < call->user_call_ID)
516 pp = &(*pp)->rb_left;
517 else if (user_call_ID > call->user_call_ID)
518 pp = &(*pp)->rb_right;
519 else
520 BUG();
521 }
522
476 write_lock_bh(&call->state_lock); 523 write_lock_bh(&call->state_lock);
477 switch (call->state) { 524 switch (call->state) {
478 case RXRPC_CALL_SERVER_ACCEPTING: 525 case RXRPC_CALL_SERVER_ACCEPTING:
@@ -498,6 +545,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
498 write_unlock(&rx->call_lock); 545 write_unlock(&rx->call_lock);
499 rxrpc_notify_socket(call); 546 rxrpc_notify_socket(call);
500 rxrpc_service_prealloc(rx, GFP_KERNEL); 547 rxrpc_service_prealloc(rx, GFP_KERNEL);
548 release_sock(&rx->sk);
501 _leave(" = %p{%d}", call, call->debug_id); 549 _leave(" = %p{%d}", call, call->debug_id);
502 return call; 550 return call;
503 551
@@ -514,6 +562,7 @@ id_in_use:
514 write_unlock(&rx->call_lock); 562 write_unlock(&rx->call_lock);
515out: 563out:
516 rxrpc_service_prealloc(rx, GFP_KERNEL); 564 rxrpc_service_prealloc(rx, GFP_KERNEL);
565 release_sock(&rx->sk);
517 _leave(" = %d", ret); 566 _leave(" = %d", ret);
518 return ERR_PTR(ret); 567 return ERR_PTR(ret);
519} 568}
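[Editor's note] rxrpc_accept_call() above now locates the rb-tree slot for the new user call ID itself after re-taking the call lock, BUGging on a duplicate because the socket lock prevents one being added behind its back. The search is the usual parent/link walk; a plain binary-search-tree version of the same walk, for reference (simplified, not the kernel rbtree API):

#include <stddef.h>

struct node {
	unsigned long id;
	struct node *left, *right;
};

/* Walk down from the root keeping hold of the link that would have to
 * be rewritten, exactly as the pp/parent walk in rxrpc_accept_call()
 * does.  Returns the address of the NULL child pointer where a node
 * with this id would be linked, or NULL if the id already exists.
 */
static struct node **find_insert_slot(struct node **root, unsigned long id)
{
	struct node **pp = root;

	while (*pp) {
		struct node *parent = *pp;

		if (id < parent->id)
			pp = &parent->left;
		else if (id > parent->id)
			pp = &parent->right;
		else
			return NULL;	/* duplicate id */
	}
	return pp;
}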
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 1ed18d8c9c9f..d79cd36987a9 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -43,24 +43,6 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
43 [RXRPC_CALL_NETWORK_ERROR] = "NetError", 43 [RXRPC_CALL_NETWORK_ERROR] = "NetError",
44}; 44};
45 45
46const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = {
47 [rxrpc_call_new_client] = "NWc",
48 [rxrpc_call_new_service] = "NWs",
49 [rxrpc_call_queued] = "QUE",
50 [rxrpc_call_queued_ref] = "QUR",
51 [rxrpc_call_connected] = "CON",
52 [rxrpc_call_release] = "RLS",
53 [rxrpc_call_seen] = "SEE",
54 [rxrpc_call_got] = "GOT",
55 [rxrpc_call_got_userid] = "Gus",
56 [rxrpc_call_got_kernel] = "Gke",
57 [rxrpc_call_put] = "PUT",
58 [rxrpc_call_put_userid] = "Pus",
59 [rxrpc_call_put_kernel] = "Pke",
60 [rxrpc_call_put_noqueue] = "PNQ",
61 [rxrpc_call_error] = "*E*",
62};
63
64struct kmem_cache *rxrpc_call_jar; 46struct kmem_cache *rxrpc_call_jar;
65LIST_HEAD(rxrpc_calls); 47LIST_HEAD(rxrpc_calls);
66DEFINE_RWLOCK(rxrpc_call_lock); 48DEFINE_RWLOCK(rxrpc_call_lock);
@@ -133,6 +115,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
133 if (!call->rxtx_annotations) 115 if (!call->rxtx_annotations)
134 goto nomem_2; 116 goto nomem_2;
135 117
118 mutex_init(&call->user_mutex);
136 setup_timer(&call->timer, rxrpc_call_timer_expired, 119 setup_timer(&call->timer, rxrpc_call_timer_expired,
137 (unsigned long)call); 120 (unsigned long)call);
138 INIT_WORK(&call->processor, &rxrpc_process_call); 121 INIT_WORK(&call->processor, &rxrpc_process_call);
@@ -212,14 +195,16 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call)
212} 195}
213 196
214/* 197/*
215 * set up a call for the given data 198 * Set up a call for the given parameters.
216 * - called in process context with IRQs enabled 199 * - Called with the socket lock held, which it must release.
200 * - If it returns a call, the call's lock will need releasing by the caller.
217 */ 201 */
218struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, 202struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
219 struct rxrpc_conn_parameters *cp, 203 struct rxrpc_conn_parameters *cp,
220 struct sockaddr_rxrpc *srx, 204 struct sockaddr_rxrpc *srx,
221 unsigned long user_call_ID, 205 unsigned long user_call_ID,
222 gfp_t gfp) 206 gfp_t gfp)
207 __releases(&rx->sk.sk_lock.slock)
223{ 208{
224 struct rxrpc_call *call, *xcall; 209 struct rxrpc_call *call, *xcall;
225 struct rb_node *parent, **pp; 210 struct rb_node *parent, **pp;
@@ -230,6 +215,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
230 215
231 call = rxrpc_alloc_client_call(srx, gfp); 216 call = rxrpc_alloc_client_call(srx, gfp);
232 if (IS_ERR(call)) { 217 if (IS_ERR(call)) {
218 release_sock(&rx->sk);
233 _leave(" = %ld", PTR_ERR(call)); 219 _leave(" = %ld", PTR_ERR(call));
234 return call; 220 return call;
235 } 221 }
@@ -237,6 +223,11 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
237 trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), 223 trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage),
238 here, (const void *)user_call_ID); 224 here, (const void *)user_call_ID);
239 225
226 /* We need to protect a partially set up call against the user as we
227 * will be acting outside the socket lock.
228 */
229 mutex_lock(&call->user_mutex);
230
240 /* Publish the call, even though it is incompletely set up as yet */ 231 /* Publish the call, even though it is incompletely set up as yet */
241 write_lock(&rx->call_lock); 232 write_lock(&rx->call_lock);
242 233
@@ -268,6 +259,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
268 list_add_tail(&call->link, &rxrpc_calls); 259 list_add_tail(&call->link, &rxrpc_calls);
269 write_unlock(&rxrpc_call_lock); 260 write_unlock(&rxrpc_call_lock);
270 261
262 /* From this point on, the call is protected by its own lock. */
263 release_sock(&rx->sk);
264
271 /* Set up or get a connection record and set the protocol parameters, 265 /* Set up or get a connection record and set the protocol parameters,
272 * including channel number and call ID. 266 * including channel number and call ID.
273 */ 267 */
@@ -297,6 +291,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
297 */ 291 */
298error_dup_user_ID: 292error_dup_user_ID:
299 write_unlock(&rx->call_lock); 293 write_unlock(&rx->call_lock);
294 release_sock(&rx->sk);
300 ret = -EEXIST; 295 ret = -EEXIST;
301 296
302error: 297error:
@@ -305,6 +300,7 @@ error:
305 trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), 300 trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage),
306 here, ERR_PTR(ret)); 301 here, ERR_PTR(ret));
307 rxrpc_release_call(rx, call); 302 rxrpc_release_call(rx, call);
303 mutex_unlock(&call->user_mutex);
308 rxrpc_put_call(call, rxrpc_call_put); 304 rxrpc_put_call(call, rxrpc_call_put);
309 _leave(" = %d", ret); 305 _leave(" = %d", ret);
310 return ERR_PTR(ret); 306 return ERR_PTR(ret);
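[Editor's note] rxrpc_new_client_call() now follows a hand-over convention: it is entered with the socket lock held, publishes the call, drops the socket lock itself, and returns with the call's user_mutex held for the caller to release. A small pthreads sketch of that "constructor returns locked" convention (names are illustrative):

#include <pthread.h>
#include <stdlib.h>

struct obj {
	pthread_mutex_t lock;
	int value;
};

/* The registry lock is held on entry; the new object's own lock is
 * taken before the registry lock is dropped, so nothing can touch the
 * object until the caller has finished setting it up and unlocks it.
 */
static struct obj *new_locked_obj(pthread_mutex_t *registry_lock)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (!o) {
		pthread_mutex_unlock(registry_lock);
		return NULL;
	}
	pthread_mutex_init(&o->lock, NULL);
	pthread_mutex_lock(&o->lock);
	/* ... publish o in the registry here ... */
	pthread_mutex_unlock(registry_lock);
	return o;	/* caller must pthread_mutex_unlock(&o->lock) */
}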
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 6cbcdcc29853..c3be03e8d098 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -76,6 +76,8 @@
76#include <linux/slab.h> 76#include <linux/slab.h>
77#include <linux/idr.h> 77#include <linux/idr.h>
78#include <linux/timer.h> 78#include <linux/timer.h>
79#include <linux/sched/signal.h>
80
79#include "ar-internal.h" 81#include "ar-internal.h"
80 82
81__read_mostly unsigned int rxrpc_max_client_connections = 1000; 83__read_mostly unsigned int rxrpc_max_client_connections = 1000;
@@ -105,14 +107,6 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *);
105static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, 107static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap,
106 rxrpc_discard_expired_client_conns); 108 rxrpc_discard_expired_client_conns);
107 109
108const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = {
109 [RXRPC_CONN_CLIENT_INACTIVE] = "Inac",
110 [RXRPC_CONN_CLIENT_WAITING] = "Wait",
111 [RXRPC_CONN_CLIENT_ACTIVE] = "Actv",
112 [RXRPC_CONN_CLIENT_CULLED] = "Cull",
113 [RXRPC_CONN_CLIENT_IDLE] = "Idle",
114};
115
116/* 110/*
117 * Get a connection ID and epoch for a client connection from the global pool. 111 * Get a connection ID and epoch for a client connection from the global pool.
118 * The connection struct pointer is then recorded in the idr radix tree. The 112 * The connection struct pointer is then recorded in the idr radix tree. The
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 3f9d8d7ec632..b099b64366f3 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -275,6 +275,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
275 rxrpc_conn_retransmit_call(conn, skb); 275 rxrpc_conn_retransmit_call(conn, skb);
276 return 0; 276 return 0;
277 277
278 case RXRPC_PACKET_TYPE_BUSY:
279 /* Just ignore BUSY packets for now. */
280 return 0;
281
278 case RXRPC_PACKET_TYPE_ABORT: 282 case RXRPC_PACKET_TYPE_ABORT:
279 if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), 283 if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
280 &wtmp, sizeof(wtmp)) < 0) 284 &wtmp, sizeof(wtmp)) < 0)
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index e1e83af47866..b0ecb770fdce 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -173,6 +173,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn,
173 /* Save the result of the call so that we can repeat it if necessary 173 /* Save the result of the call so that we can repeat it if necessary
174 * through the channel, whilst disposing of the actual call record. 174 * through the channel, whilst disposing of the actual call record.
175 */ 175 */
176 trace_rxrpc_disconnect_call(call);
176 chan->last_service_id = call->service_id; 177 chan->last_service_id = call->service_id;
177 if (call->abort_code) { 178 if (call->abort_code) {
178 chan->last_abort = call->abort_code; 179 chan->last_abort = call->abort_code;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 1d87b5453ef7..18b2ad8be8e2 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -420,6 +420,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
420 u16 skew) 420 u16 skew)
421{ 421{
422 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 422 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
423 enum rxrpc_call_state state;
423 unsigned int offset = sizeof(struct rxrpc_wire_header); 424 unsigned int offset = sizeof(struct rxrpc_wire_header);
424 unsigned int ix; 425 unsigned int ix;
425 rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; 426 rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0;
@@ -434,14 +435,15 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
434 _proto("Rx DATA %%%u { #%u f=%02x }", 435 _proto("Rx DATA %%%u { #%u f=%02x }",
435 sp->hdr.serial, seq, sp->hdr.flags); 436 sp->hdr.serial, seq, sp->hdr.flags);
436 437
437 if (call->state >= RXRPC_CALL_COMPLETE) 438 state = READ_ONCE(call->state);
439 if (state >= RXRPC_CALL_COMPLETE)
438 return; 440 return;
439 441
440 /* Received data implicitly ACKs all of the request packets we sent 442 /* Received data implicitly ACKs all of the request packets we sent
441 * when we're acting as a client. 443 * when we're acting as a client.
442 */ 444 */
443 if ((call->state == RXRPC_CALL_CLIENT_SEND_REQUEST || 445 if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
444 call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && 446 state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
445 !rxrpc_receiving_reply(call)) 447 !rxrpc_receiving_reply(call))
446 return; 448 return;
447 449
@@ -481,6 +483,7 @@ next_subpacket:
481 return rxrpc_proto_abort("LSA", call, seq); 483 return rxrpc_proto_abort("LSA", call, seq);
482 } 484 }
483 485
486 trace_rxrpc_rx_data(call, seq, serial, flags, annotation);
484 if (before_eq(seq, hard_ack)) { 487 if (before_eq(seq, hard_ack)) {
485 ack = RXRPC_ACK_DUPLICATE; 488 ack = RXRPC_ACK_DUPLICATE;
486 ack_serial = serial; 489 ack_serial = serial;
@@ -649,6 +652,7 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
649 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 652 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
650 struct rxrpc_peer *peer; 653 struct rxrpc_peer *peer;
651 unsigned int mtu; 654 unsigned int mtu;
655 bool wake = false;
652 u32 rwind = ntohl(ackinfo->rwind); 656 u32 rwind = ntohl(ackinfo->rwind);
653 657
654 _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", 658 _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
@@ -656,9 +660,14 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
656 ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), 660 ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU),
657 rwind, ntohl(ackinfo->jumbo_max)); 661 rwind, ntohl(ackinfo->jumbo_max));
658 662
659 if (rwind > RXRPC_RXTX_BUFF_SIZE - 1) 663 if (call->tx_winsize != rwind) {
660 rwind = RXRPC_RXTX_BUFF_SIZE - 1; 664 if (rwind > RXRPC_RXTX_BUFF_SIZE - 1)
661 call->tx_winsize = rwind; 665 rwind = RXRPC_RXTX_BUFF_SIZE - 1;
666 if (rwind > call->tx_winsize)
667 wake = true;
668 call->tx_winsize = rwind;
669 }
670
662 if (call->cong_ssthresh > rwind) 671 if (call->cong_ssthresh > rwind)
663 call->cong_ssthresh = rwind; 672 call->cong_ssthresh = rwind;
664 673
@@ -672,6 +681,9 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
672 spin_unlock_bh(&peer->lock); 681 spin_unlock_bh(&peer->lock);
673 _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); 682 _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
674 } 683 }
684
685 if (wake)
686 wake_up(&call->waitq);
675} 687}
676 688
677/* 689/*
@@ -765,16 +777,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
765 summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? 777 summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
766 buf.ack.reason : RXRPC_ACK__INVALID); 778 buf.ack.reason : RXRPC_ACK__INVALID);
767 779
768 trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks); 780 trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial,
769 781 first_soft_ack, ntohl(buf.ack.previousPacket),
770 _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", 782 summary.ack_reason, nr_acks);
771 sp->hdr.serial,
772 ntohs(buf.ack.maxSkew),
773 first_soft_ack,
774 ntohl(buf.ack.previousPacket),
775 acked_serial,
776 rxrpc_ack_names[summary.ack_reason],
777 buf.ack.nAcks);
778 783
779 if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) 784 if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
780 rxrpc_input_ping_response(call, skb->tstamp, acked_serial, 785 rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
@@ -805,7 +810,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
805 return rxrpc_proto_abort("AK0", call, 0); 810 return rxrpc_proto_abort("AK0", call, 0);
806 811
807 /* Ignore ACKs unless we are or have just been transmitting. */ 812 /* Ignore ACKs unless we are or have just been transmitting. */
808 switch (call->state) { 813 switch (READ_ONCE(call->state)) {
809 case RXRPC_CALL_CLIENT_SEND_REQUEST: 814 case RXRPC_CALL_CLIENT_SEND_REQUEST:
810 case RXRPC_CALL_CLIENT_AWAIT_REPLY: 815 case RXRPC_CALL_CLIENT_AWAIT_REPLY:
811 case RXRPC_CALL_SERVER_SEND_REPLY: 816 case RXRPC_CALL_SERVER_SEND_REPLY:
@@ -931,7 +936,6 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
931 break; 936 break;
932 937
933 default: 938 default:
934 _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
935 break; 939 break;
936 } 940 }
937 941
@@ -947,7 +951,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
947static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, 951static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
948 struct rxrpc_call *call) 952 struct rxrpc_call *call)
949{ 953{
950 switch (call->state) { 954 switch (READ_ONCE(call->state)) {
951 case RXRPC_CALL_SERVER_AWAIT_ACK: 955 case RXRPC_CALL_SERVER_AWAIT_ACK:
952 rxrpc_call_completed(call); 956 rxrpc_call_completed(call);
953 break; 957 break;
@@ -961,6 +965,7 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
961 break; 965 break;
962 } 966 }
963 967
968 trace_rxrpc_improper_term(call);
964 __rxrpc_disconnect_call(conn, call); 969 __rxrpc_disconnect_call(conn, call);
965 rxrpc_notify_socket(call); 970 rxrpc_notify_socket(call);
966} 971}
@@ -1200,6 +1205,7 @@ void rxrpc_data_ready(struct sock *udp_sk)
1200 goto reject_packet; 1205 goto reject_packet;
1201 } 1206 }
1202 rxrpc_send_ping(call, skb, skew); 1207 rxrpc_send_ping(call, skb, skew);
1208 mutex_unlock(&call->user_mutex);
1203 } 1209 }
1204 1210
1205 rxrpc_input_call_packet(call, skb, skew); 1211 rxrpc_input_call_packet(call, skb, skew);
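[Editor's note] Because the call state can now change while only the data-path locks are held, input.c takes one READ_ONCE() snapshot of call->state and branches on that snapshot rather than re-reading the field between checks. A C11 analogue of the same discipline:

#include <stdatomic.h>
#include <stdbool.h>

enum call_state {
	CLIENT_SEND_REQUEST,
	CLIENT_AWAIT_REPLY,
	CALL_COMPLETE,
};

static _Atomic enum call_state call_state;

/* Read the state exactly once and make every decision from the local
 * copy, so the checks cannot observe two different values of a field
 * that another context may be updating concurrently.
 */
static bool data_implicitly_acks(void)
{
	enum call_state state = atomic_load_explicit(&call_state,
						     memory_order_relaxed);

	if (state >= CALL_COMPLETE)
		return false;

	return state == CLIENT_SEND_REQUEST || state == CLIENT_AWAIT_REPLY;
}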
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 18c737a61d80..0a4e28477ad9 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -1065,7 +1065,7 @@ static long rxrpc_read(const struct key *key,
1065 1065
1066 switch (token->security_index) { 1066 switch (token->security_index) {
1067 case RXRPC_SECURITY_RXKAD: 1067 case RXRPC_SECURITY_RXKAD:
1068 toksize += 8 * 4; /* viceid, kvno, key*2, begin, 1068 toksize += 9 * 4; /* viceid, kvno, key*2 + len, begin,
1069 * end, primary, tktlen */ 1069 * end, primary, tktlen */
1070 toksize += RND(token->kad->ticket_len); 1070 toksize += RND(token->kad->ticket_len);
1071 break; 1071 break;
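[Editor's note] The size bump above accounts for the extra 32-bit length word emitted ahead of the 8-byte session key; counted out per the updated comment (not a header definition), the fixed rxkad part of the token is now nine words:

/* viceid + kvno + key length word + two key words + begin + end +
 * primary flag + ticket length = 9 x 32-bit words = 36 bytes.
 */
_Static_assert((1 + 1 + 1 + 2 + 1 + 1 + 1 + 1) * 4 == 9 * 4,
	       "rxkad fixed token header is nine words");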
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 6dee55fad2d3..1a2d4b112064 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -77,12 +77,6 @@ unsigned int rxrpc_rx_jumbo_max = 4;
77 */ 77 */
78unsigned int rxrpc_resend_timeout = 4 * 1000; 78unsigned int rxrpc_resend_timeout = 4 * 1000;
79 79
80const char *const rxrpc_pkts[] = {
81 "?00",
82 "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
83 "?09", "?10", "?11", "?12", "VERSION", "?14", "?15"
84};
85
86const s8 rxrpc_ack_priority[] = { 80const s8 rxrpc_ack_priority[] = {
87 [0] = 0, 81 [0] = 0,
88 [RXRPC_ACK_DELAY] = 1, 82 [RXRPC_ACK_DELAY] = 1,
@@ -94,148 +88,3 @@ const s8 rxrpc_ack_priority[] = {
94 [RXRPC_ACK_NOSPACE] = 7, 88 [RXRPC_ACK_NOSPACE] = 7,
95 [RXRPC_ACK_PING_RESPONSE] = 8, 89 [RXRPC_ACK_PING_RESPONSE] = 8,
96}; 90};
97
98const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
99 "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
100 "IDL", "-?-"
101};
102
103const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = {
104 [rxrpc_skb_rx_cleaned] = "Rx CLN",
105 [rxrpc_skb_rx_freed] = "Rx FRE",
106 [rxrpc_skb_rx_got] = "Rx GOT",
107 [rxrpc_skb_rx_lost] = "Rx *L*",
108 [rxrpc_skb_rx_received] = "Rx RCV",
109 [rxrpc_skb_rx_purged] = "Rx PUR",
110 [rxrpc_skb_rx_rotated] = "Rx ROT",
111 [rxrpc_skb_rx_seen] = "Rx SEE",
112 [rxrpc_skb_tx_cleaned] = "Tx CLN",
113 [rxrpc_skb_tx_freed] = "Tx FRE",
114 [rxrpc_skb_tx_got] = "Tx GOT",
115 [rxrpc_skb_tx_new] = "Tx NEW",
116 [rxrpc_skb_tx_rotated] = "Tx ROT",
117 [rxrpc_skb_tx_seen] = "Tx SEE",
118};
119
120const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = {
121 [rxrpc_conn_new_client] = "NWc",
122 [rxrpc_conn_new_service] = "NWs",
123 [rxrpc_conn_queued] = "QUE",
124 [rxrpc_conn_seen] = "SEE",
125 [rxrpc_conn_got] = "GOT",
126 [rxrpc_conn_put_client] = "PTc",
127 [rxrpc_conn_put_service] = "PTs",
128};
129
130const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = {
131 [rxrpc_client_activate_chans] = "Activa",
132 [rxrpc_client_alloc] = "Alloc ",
133 [rxrpc_client_chan_activate] = "ChActv",
134 [rxrpc_client_chan_disconnect] = "ChDisc",
135 [rxrpc_client_chan_pass] = "ChPass",
136 [rxrpc_client_chan_unstarted] = "ChUnst",
137 [rxrpc_client_cleanup] = "Clean ",
138 [rxrpc_client_count] = "Count ",
139 [rxrpc_client_discard] = "Discar",
140 [rxrpc_client_duplicate] = "Duplic",
141 [rxrpc_client_exposed] = "Expose",
142 [rxrpc_client_replace] = "Replac",
143 [rxrpc_client_to_active] = "->Actv",
144 [rxrpc_client_to_culled] = "->Cull",
145 [rxrpc_client_to_idle] = "->Idle",
146 [rxrpc_client_to_inactive] = "->Inac",
147 [rxrpc_client_to_waiting] = "->Wait",
148 [rxrpc_client_uncount] = "Uncoun",
149};
150
151const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = {
152 [rxrpc_transmit_wait] = "WAI",
153 [rxrpc_transmit_queue] = "QUE",
154 [rxrpc_transmit_queue_last] = "QLS",
155 [rxrpc_transmit_rotate] = "ROT",
156 [rxrpc_transmit_rotate_last] = "RLS",
157 [rxrpc_transmit_await_reply] = "AWR",
158 [rxrpc_transmit_end] = "END",
159};
160
161const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = {
162 [rxrpc_receive_incoming] = "INC",
163 [rxrpc_receive_queue] = "QUE",
164 [rxrpc_receive_queue_last] = "QLS",
165 [rxrpc_receive_front] = "FRN",
166 [rxrpc_receive_rotate] = "ROT",
167 [rxrpc_receive_end] = "END",
168};
169
170const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = {
171 [rxrpc_recvmsg_enter] = "ENTR",
172 [rxrpc_recvmsg_wait] = "WAIT",
173 [rxrpc_recvmsg_dequeue] = "DEQU",
174 [rxrpc_recvmsg_hole] = "HOLE",
175 [rxrpc_recvmsg_next] = "NEXT",
176 [rxrpc_recvmsg_cont] = "CONT",
177 [rxrpc_recvmsg_full] = "FULL",
178 [rxrpc_recvmsg_data_return] = "DATA",
179 [rxrpc_recvmsg_terminal] = "TERM",
180 [rxrpc_recvmsg_to_be_accepted] = "TBAC",
181 [rxrpc_recvmsg_return] = "RETN",
182};
183
184const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = {
185 [rxrpc_rtt_tx_ping] = "PING",
186 [rxrpc_rtt_tx_data] = "DATA",
187};
188
189const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = {
190 [rxrpc_rtt_rx_ping_response] = "PONG",
191 [rxrpc_rtt_rx_requested_ack] = "RACK",
192};
193
194const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = {
195 [rxrpc_timer_begin] = "Begin ",
196 [rxrpc_timer_expired] = "*EXPR*",
197 [rxrpc_timer_init_for_reply] = "IniRpl",
198 [rxrpc_timer_init_for_send_reply] = "SndRpl",
199 [rxrpc_timer_set_for_ack] = "SetAck",
200 [rxrpc_timer_set_for_ping] = "SetPng",
201 [rxrpc_timer_set_for_send] = "SetTx ",
202 [rxrpc_timer_set_for_resend] = "SetRTx",
203};
204
205const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = {
206 [rxrpc_propose_ack_client_tx_end] = "ClTxEnd",
207 [rxrpc_propose_ack_input_data] = "DataIn ",
208 [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck",
209 [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl",
210 [rxrpc_propose_ack_ping_for_params] = "Params ",
211 [rxrpc_propose_ack_processing_op] = "ProcOp ",
212 [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack",
213 [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png",
214 [rxrpc_propose_ack_retry_tx] = "RetryTx",
215 [rxrpc_propose_ack_rotate_rx] = "RxAck ",
216 [rxrpc_propose_ack_terminal_ack] = "ClTerm ",
217};
218
219const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = {
220 [rxrpc_propose_ack_use] = "",
221 [rxrpc_propose_ack_update] = " Update",
222 [rxrpc_propose_ack_subsume] = " Subsume",
223};
224
225const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = {
226 [RXRPC_CALL_SLOW_START] = "SlowStart",
227 [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid",
228 [RXRPC_CALL_PACKET_LOSS] = "PktLoss ",
229 [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ",
230};
231
232const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = {
233 [rxrpc_cong_begin_retransmission] = " Retrans",
234 [rxrpc_cong_cleared_nacks] = " Cleared",
235 [rxrpc_cong_new_low_nack] = " NewLowN",
236 [rxrpc_cong_no_change] = "",
237 [rxrpc_cong_progress] = " Progres",
238 [rxrpc_cong_retransmit_again] = " ReTxAgn",
239 [rxrpc_cong_rtt_window_end] = " RttWinE",
240 [rxrpc_cong_saw_nack] = " SawNack",
241};
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 65cd980767fa..b9bcfbfb095c 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -52,6 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
52 struct rxrpc_sock *rx; 52 struct rxrpc_sock *rx;
53 struct rxrpc_peer *peer; 53 struct rxrpc_peer *peer;
54 struct rxrpc_call *call; 54 struct rxrpc_call *call;
55 rxrpc_seq_t tx_hard_ack, rx_hard_ack;
55 char lbuff[50], rbuff[50]; 56 char lbuff[50], rbuff[50];
56 57
57 if (v == &rxrpc_calls) { 58 if (v == &rxrpc_calls) {
@@ -82,9 +83,11 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
82 else 83 else
83 strcpy(rbuff, "no_connection"); 84 strcpy(rbuff, "no_connection");
84 85
86 tx_hard_ack = READ_ONCE(call->tx_hard_ack);
87 rx_hard_ack = READ_ONCE(call->rx_hard_ack);
85 seq_printf(seq, 88 seq_printf(seq,
86 "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" 89 "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
87 " %-8.8s %08x %lx\n", 90 " %-8.8s %08x %lx %08x %02x %08x %02x\n",
88 lbuff, 91 lbuff,
89 rbuff, 92 rbuff,
90 call->service_id, 93 call->service_id,
@@ -94,7 +97,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
94 atomic_read(&call->usage), 97 atomic_read(&call->usage),
95 rxrpc_call_states[call->state], 98 rxrpc_call_states[call->state],
96 call->abort_code, 99 call->abort_code,
97 call->user_call_ID); 100 call->user_call_ID,
101 tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack,
102 rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack);
98 103
99 return 0; 104 return 0;
100} 105}
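[Editor's note] The extra /proc columns report, for each direction, the hard-ACK point and how many packets sit between it and the top of the buffer. rxrpc sequence numbers are 32-bit and wrap, so that count is plain unsigned subtraction; a small sketch of the arithmetic behind the tx_top - tx_hard_ack expression above (assumed semantics):

#include <stdint.h>
#include <stdio.h>

/* Outstanding packets between the hard-ACK point and the top of the
 * ring.  Unsigned 32-bit subtraction gives the right answer even when
 * top has wrapped past hard_ack.
 */
static uint32_t window_outstanding(uint32_t hard_ack, uint32_t top)
{
	return top - hard_ack;
}

int main(void)
{
	printf("%u\n", window_outstanding(0xfffffffeu, 0x00000001u)); /* 3 */
	return 0;
}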
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index c29362d50a92..3e2f1a8e9c5b 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -14,6 +14,8 @@
14#include <linux/net.h> 14#include <linux/net.h>
15#include <linux/skbuff.h> 15#include <linux/skbuff.h>
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/sched/signal.h>
18
17#include <net/sock.h> 19#include <net/sock.h>
18#include <net/af_rxrpc.h> 20#include <net/af_rxrpc.h>
19#include "ar-internal.h" 21#include "ar-internal.h"
@@ -320,8 +322,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
320 322
321 /* Barriers against rxrpc_input_data(). */ 323 /* Barriers against rxrpc_input_data(). */
322 hard_ack = call->rx_hard_ack; 324 hard_ack = call->rx_hard_ack;
323 top = smp_load_acquire(&call->rx_top); 325 seq = hard_ack + 1;
324 for (seq = hard_ack + 1; before_eq(seq, top); seq++) { 326 while (top = smp_load_acquire(&call->rx_top),
327 before_eq(seq, top)
328 ) {
325 ix = seq & RXRPC_RXTX_BUFF_MASK; 329 ix = seq & RXRPC_RXTX_BUFF_MASK;
326 skb = call->rxtx_buffer[ix]; 330 skb = call->rxtx_buffer[ix];
327 if (!skb) { 331 if (!skb) {
@@ -394,6 +398,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
394 ret = 1; 398 ret = 1;
395 goto out; 399 goto out;
396 } 400 }
401
402 seq++;
397 } 403 }
398 404
399out: 405out:
@@ -483,6 +489,20 @@ try_again:
483 489
484 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); 490 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0);
485 491
492 /* We're going to drop the socket lock, so we need to lock the call
493 * against interference by sendmsg.
494 */
495 if (!mutex_trylock(&call->user_mutex)) {
496 ret = -EWOULDBLOCK;
497 if (flags & MSG_DONTWAIT)
498 goto error_requeue_call;
499 ret = -ERESTARTSYS;
500 if (mutex_lock_interruptible(&call->user_mutex) < 0)
501 goto error_requeue_call;
502 }
503
504 release_sock(&rx->sk);
505
486 if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) 506 if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
487 BUG(); 507 BUG();
488 508
@@ -498,7 +518,7 @@ try_again:
498 &call->user_call_ID); 518 &call->user_call_ID);
499 } 519 }
500 if (ret < 0) 520 if (ret < 0)
501 goto error; 521 goto error_unlock_call;
502 } 522 }
503 523
504 if (msg->msg_name) { 524 if (msg->msg_name) {
@@ -507,7 +527,7 @@ try_again:
507 msg->msg_namelen = len; 527 msg->msg_namelen = len;
508 } 528 }
509 529
510 switch (call->state) { 530 switch (READ_ONCE(call->state)) {
511 case RXRPC_CALL_SERVER_ACCEPTING: 531 case RXRPC_CALL_SERVER_ACCEPTING:
512 ret = rxrpc_recvmsg_new_call(rx, call, msg, flags); 532 ret = rxrpc_recvmsg_new_call(rx, call, msg, flags);
513 break; 533 break;
@@ -529,12 +549,12 @@ try_again:
529 } 549 }
530 550
531 if (ret < 0) 551 if (ret < 0)
532 goto error; 552 goto error_unlock_call;
533 553
534 if (call->state == RXRPC_CALL_COMPLETE) { 554 if (call->state == RXRPC_CALL_COMPLETE) {
535 ret = rxrpc_recvmsg_term(call, msg); 555 ret = rxrpc_recvmsg_term(call, msg);
536 if (ret < 0) 556 if (ret < 0)
537 goto error; 557 goto error_unlock_call;
538 if (!(flags & MSG_PEEK)) 558 if (!(flags & MSG_PEEK))
539 rxrpc_release_call(rx, call); 559 rxrpc_release_call(rx, call);
540 msg->msg_flags |= MSG_EOR; 560 msg->msg_flags |= MSG_EOR;
@@ -547,8 +567,21 @@ try_again:
547 msg->msg_flags &= ~MSG_MORE; 567 msg->msg_flags &= ~MSG_MORE;
548 ret = copied; 568 ret = copied;
549 569
550error: 570error_unlock_call:
571 mutex_unlock(&call->user_mutex);
551 rxrpc_put_call(call, rxrpc_call_put); 572 rxrpc_put_call(call, rxrpc_call_put);
573 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
574 return ret;
575
576error_requeue_call:
577 if (!(flags & MSG_PEEK)) {
578 write_lock_bh(&rx->recvmsg_lock);
579 list_add(&call->recvmsg_link, &rx->recvmsg_q);
580 write_unlock_bh(&rx->recvmsg_lock);
581 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0, 0, 0, 0);
582 } else {
583 rxrpc_put_call(call, rxrpc_call_put);
584 }
552error_no_call: 585error_no_call:
553 release_sock(&rx->sk); 586 release_sock(&rx->sk);
554 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); 587 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
@@ -605,9 +638,9 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
605 iov.iov_len = size - *_offset; 638 iov.iov_len = size - *_offset;
606 iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset); 639 iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset);
607 640
608 lock_sock(sock->sk); 641 mutex_lock(&call->user_mutex);
609 642
610 switch (call->state) { 643 switch (READ_ONCE(call->state)) {
611 case RXRPC_CALL_CLIENT_RECV_REPLY: 644 case RXRPC_CALL_CLIENT_RECV_REPLY:
612 case RXRPC_CALL_SERVER_RECV_REQUEST: 645 case RXRPC_CALL_SERVER_RECV_REQUEST:
613 case RXRPC_CALL_SERVER_ACK_REQUEST: 646 case RXRPC_CALL_SERVER_ACK_REQUEST:
@@ -644,7 +677,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
644read_phase_complete: 677read_phase_complete:
645 ret = 1; 678 ret = 1;
646out: 679out:
647 release_sock(sock->sk); 680 mutex_unlock(&call->user_mutex);
648 _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); 681 _leave(" = %d [%zu,%d]", ret, *_offset, *_abort);
649 return ret; 682 return ret;
650 683
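[Editor's note] recvmsg() now has to take the call's user_mutex before dropping the socket lock: try the lock first, fail fast with EWOULDBLOCK for MSG_DONTWAIT, otherwise block interruptibly, and requeue the call on the recvmsg queue if the lock was never obtained. A rough pthreads rendering of that decision (error codes and names are illustrative):

#include <pthread.h>
#include <errno.h>
#include <stdbool.h>

/* Returns 0 with *mutex held, or a negative error without it.  On
 * failure the caller is expected to put the work item back on its
 * queue, which is what recvmsg's error_requeue_call label does.
 */
static int grab_call_lock(pthread_mutex_t *mutex, bool dont_wait)
{
	if (pthread_mutex_trylock(mutex) == 0)
		return 0;

	if (dont_wait)
		return -EWOULDBLOCK;

	/* The kernel uses mutex_lock_interruptible() here so a signal can
	 * abort the wait; plain pthread mutexes have no interruptible
	 * variant, so this sketch simply blocks.
	 */
	pthread_mutex_lock(mutex);
	return 0;
}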
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index b214a4d4a641..97ab214ca411 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -15,6 +15,8 @@
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <linux/export.h> 17#include <linux/export.h>
18#include <linux/sched/signal.h>
19
18#include <net/sock.h> 20#include <net/sock.h>
19#include <net/af_rxrpc.h> 21#include <net/af_rxrpc.h>
20#include "ar-internal.h" 22#include "ar-internal.h"
@@ -59,9 +61,12 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
59 } 61 }
60 62
61 trace_rxrpc_transmit(call, rxrpc_transmit_wait); 63 trace_rxrpc_transmit(call, rxrpc_transmit_wait);
62 release_sock(&rx->sk); 64 mutex_unlock(&call->user_mutex);
63 *timeo = schedule_timeout(*timeo); 65 *timeo = schedule_timeout(*timeo);
64 lock_sock(&rx->sk); 66 if (mutex_lock_interruptible(&call->user_mutex) < 0) {
67 ret = sock_intr_errno(*timeo);
68 break;
69 }
65 } 70 }
66 71
67 remove_wait_queue(&call->waitq, &myself); 72 remove_wait_queue(&call->waitq, &myself);
@@ -171,7 +176,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
171/* 176/*
172 * send data through a socket 177 * send data through a socket
173 * - must be called in process context 178 * - must be called in process context
174 * - caller holds the socket locked 179 * - The caller holds the call user access mutex, but not the socket lock.
175 */ 180 */
176static int rxrpc_send_data(struct rxrpc_sock *rx, 181static int rxrpc_send_data(struct rxrpc_sock *rx,
177 struct rxrpc_call *call, 182 struct rxrpc_call *call,
@@ -376,7 +381,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
376 if (!CMSG_OK(msg, cmsg)) 381 if (!CMSG_OK(msg, cmsg))
377 return -EINVAL; 382 return -EINVAL;
378 383
379 len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); 384 len = cmsg->cmsg_len - sizeof(struct cmsghdr);
380 _debug("CMSG %d, %d, %d", 385 _debug("CMSG %d, %d, %d",
381 cmsg->cmsg_level, cmsg->cmsg_type, len); 386 cmsg->cmsg_level, cmsg->cmsg_type, len);
382 387
@@ -437,10 +442,13 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
437 442
438/* 443/*
439 * Create a new client call for sendmsg(). 444 * Create a new client call for sendmsg().
445 * - Called with the socket lock held, which it must release.
446 * - If it returns a call, the call's lock will need releasing by the caller.
440 */ 447 */
441static struct rxrpc_call * 448static struct rxrpc_call *
442rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, 449rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
443 unsigned long user_call_ID, bool exclusive) 450 unsigned long user_call_ID, bool exclusive)
451 __releases(&rx->sk.sk_lock.slock)
444{ 452{
445 struct rxrpc_conn_parameters cp; 453 struct rxrpc_conn_parameters cp;
446 struct rxrpc_call *call; 454 struct rxrpc_call *call;
@@ -450,8 +458,10 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
450 458
451 _enter(""); 459 _enter("");
452 460
453 if (!msg->msg_name) 461 if (!msg->msg_name) {
462 release_sock(&rx->sk);
454 return ERR_PTR(-EDESTADDRREQ); 463 return ERR_PTR(-EDESTADDRREQ);
464 }
455 465
456 key = rx->key; 466 key = rx->key;
457 if (key && !rx->key->payload.data[0]) 467 if (key && !rx->key->payload.data[0])
@@ -464,6 +474,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
464 cp.exclusive = rx->exclusive | exclusive; 474 cp.exclusive = rx->exclusive | exclusive;
465 cp.service_id = srx->srx_service; 475 cp.service_id = srx->srx_service;
466 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL); 476 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL);
477 /* The socket is now unlocked */
467 478
468 _leave(" = %p\n", call); 479 _leave(" = %p\n", call);
469 return call; 480 return call;
@@ -475,7 +486,9 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
475 * - the socket may be either a client socket or a server socket 486 * - the socket may be either a client socket or a server socket
476 */ 487 */
477int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) 488int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
489 __releases(&rx->sk.sk_lock.slock)
478{ 490{
491 enum rxrpc_call_state state;
479 enum rxrpc_command cmd; 492 enum rxrpc_command cmd;
480 struct rxrpc_call *call; 493 struct rxrpc_call *call;
481 unsigned long user_call_ID = 0; 494 unsigned long user_call_ID = 0;
@@ -488,12 +501,14 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
488 ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code, 501 ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code,
489 &exclusive); 502 &exclusive);
490 if (ret < 0) 503 if (ret < 0)
491 return ret; 504 goto error_release_sock;
492 505
493 if (cmd == RXRPC_CMD_ACCEPT) { 506 if (cmd == RXRPC_CMD_ACCEPT) {
507 ret = -EINVAL;
494 if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) 508 if (rx->sk.sk_state != RXRPC_SERVER_LISTENING)
495 return -EINVAL; 509 goto error_release_sock;
496 call = rxrpc_accept_call(rx, user_call_ID, NULL); 510 call = rxrpc_accept_call(rx, user_call_ID, NULL);
511 /* The socket is now unlocked. */
497 if (IS_ERR(call)) 512 if (IS_ERR(call))
498 return PTR_ERR(call); 513 return PTR_ERR(call);
499 rxrpc_put_call(call, rxrpc_call_put); 514 rxrpc_put_call(call, rxrpc_call_put);
@@ -502,18 +517,41 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
502 517
503 call = rxrpc_find_call_by_user_ID(rx, user_call_ID); 518 call = rxrpc_find_call_by_user_ID(rx, user_call_ID);
504 if (!call) { 519 if (!call) {
520 ret = -EBADSLT;
505 if (cmd != RXRPC_CMD_SEND_DATA) 521 if (cmd != RXRPC_CMD_SEND_DATA)
506 return -EBADSLT; 522 goto error_release_sock;
507 call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, 523 call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID,
508 exclusive); 524 exclusive);
525 /* The socket is now unlocked... */
509 if (IS_ERR(call)) 526 if (IS_ERR(call))
510 return PTR_ERR(call); 527 return PTR_ERR(call);
528 /* ... and we have the call lock. */
529 } else {
530 switch (READ_ONCE(call->state)) {
531 case RXRPC_CALL_UNINITIALISED:
532 case RXRPC_CALL_CLIENT_AWAIT_CONN:
533 case RXRPC_CALL_SERVER_PREALLOC:
534 case RXRPC_CALL_SERVER_SECURING:
535 case RXRPC_CALL_SERVER_ACCEPTING:
536 ret = -EBUSY;
537 goto error_release_sock;
538 default:
539 break;
540 }
541
542 ret = mutex_lock_interruptible(&call->user_mutex);
543 release_sock(&rx->sk);
544 if (ret < 0) {
545 ret = -ERESTARTSYS;
546 goto error_put;
547 }
511 } 548 }
512 549
550 state = READ_ONCE(call->state);
513 _debug("CALL %d USR %lx ST %d on CONN %p", 551 _debug("CALL %d USR %lx ST %d on CONN %p",
514 call->debug_id, call->user_call_ID, call->state, call->conn); 552 call->debug_id, call->user_call_ID, state, call->conn);
515 553
516 if (call->state >= RXRPC_CALL_COMPLETE) { 554 if (state >= RXRPC_CALL_COMPLETE) {
517 /* it's too late for this call */ 555 /* it's too late for this call */
518 ret = -ESHUTDOWN; 556 ret = -ESHUTDOWN;
519 } else if (cmd == RXRPC_CMD_SEND_ABORT) { 557 } else if (cmd == RXRPC_CMD_SEND_ABORT) {
@@ -523,21 +561,27 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
523 } else if (cmd != RXRPC_CMD_SEND_DATA) { 561 } else if (cmd != RXRPC_CMD_SEND_DATA) {
524 ret = -EINVAL; 562 ret = -EINVAL;
525 } else if (rxrpc_is_client_call(call) && 563 } else if (rxrpc_is_client_call(call) &&
526 call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { 564 state != RXRPC_CALL_CLIENT_SEND_REQUEST) {
527 /* request phase complete for this client call */ 565 /* request phase complete for this client call */
528 ret = -EPROTO; 566 ret = -EPROTO;
529 } else if (rxrpc_is_service_call(call) && 567 } else if (rxrpc_is_service_call(call) &&
530 call->state != RXRPC_CALL_SERVER_ACK_REQUEST && 568 state != RXRPC_CALL_SERVER_ACK_REQUEST &&
531 call->state != RXRPC_CALL_SERVER_SEND_REPLY) { 569 state != RXRPC_CALL_SERVER_SEND_REPLY) {
532 /* Reply phase not begun or not complete for service call. */ 570 /* Reply phase not begun or not complete for service call. */
533 ret = -EPROTO; 571 ret = -EPROTO;
534 } else { 572 } else {
535 ret = rxrpc_send_data(rx, call, msg, len); 573 ret = rxrpc_send_data(rx, call, msg, len);
536 } 574 }
537 575
576 mutex_unlock(&call->user_mutex);
577error_put:
538 rxrpc_put_call(call, rxrpc_call_put); 578 rxrpc_put_call(call, rxrpc_call_put);
539 _leave(" = %d", ret); 579 _leave(" = %d", ret);
540 return ret; 580 return ret;
581
582error_release_sock:
583 release_sock(&rx->sk);
584 return ret;
541} 585}
542 586
543/** 587/**
@@ -562,22 +606,29 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call,
562 ASSERTCMP(msg->msg_name, ==, NULL); 606 ASSERTCMP(msg->msg_name, ==, NULL);
563 ASSERTCMP(msg->msg_control, ==, NULL); 607 ASSERTCMP(msg->msg_control, ==, NULL);
564 608
565 lock_sock(sock->sk); 609 mutex_lock(&call->user_mutex);
566 610
567 _debug("CALL %d USR %lx ST %d on CONN %p", 611 _debug("CALL %d USR %lx ST %d on CONN %p",
568 call->debug_id, call->user_call_ID, call->state, call->conn); 612 call->debug_id, call->user_call_ID, call->state, call->conn);
569 613
570 if (call->state >= RXRPC_CALL_COMPLETE) { 614 switch (READ_ONCE(call->state)) {
571 ret = -ESHUTDOWN; /* it's too late for this call */ 615 case RXRPC_CALL_CLIENT_SEND_REQUEST:
572 } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && 616 case RXRPC_CALL_SERVER_ACK_REQUEST:
573 call->state != RXRPC_CALL_SERVER_ACK_REQUEST && 617 case RXRPC_CALL_SERVER_SEND_REPLY:
574 call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
575 ret = -EPROTO; /* request phase complete for this client call */
576 } else {
577 ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len); 618 ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len);
619 break;
620 case RXRPC_CALL_COMPLETE:
621 read_lock_bh(&call->state_lock);
622 ret = -call->error;
623 read_unlock_bh(&call->state_lock);
624 break;
625 default:
626 /* Request phase complete for this client call */
627 ret = -EPROTO;
628 break;
578 } 629 }
579 630
580 release_sock(sock->sk); 631 mutex_unlock(&call->user_mutex);
581 _leave(" = %d", ret); 632 _leave(" = %d", ret);
582 return ret; 633 return ret;
583} 634}
@@ -598,12 +649,12 @@ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call,
598{ 649{
599 _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); 650 _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why);
600 651
601 lock_sock(sock->sk); 652 mutex_lock(&call->user_mutex);
602 653
603 if (rxrpc_abort_call(why, call, 0, abort_code, error)) 654 if (rxrpc_abort_call(why, call, 0, abort_code, error))
604 rxrpc_send_abort_packet(call); 655 rxrpc_send_abort_packet(call);
605 656
606 release_sock(sock->sk); 657 mutex_unlock(&call->user_mutex);
607 _leave(""); 658 _leave("");
608} 659}
609 660
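
The rxrpc hunks above swap the coarse socket lock for a per-call user_mutex: the mutex is taken while the socket lock is still held (so the call cannot disappear underneath the caller), the socket lock is then released, and all further sendmsg work runs under the mutex alone. A minimal sketch of that ordering, using the same kernel primitives but with an illustrative function name rather than the exact kernel code:

	/* Sketch only - mirrors the lock ordering introduced above.
	 * rx and call are the rxrpc socket and call objects. */
	static int send_on_call(struct rxrpc_sock *rx, struct rxrpc_call *call,
				struct msghdr *msg, size_t len)
	{
		int ret;

		ret = mutex_lock_interruptible(&call->user_mutex);
		release_sock(&rx->sk);		/* socket lock dropped early */
		if (ret < 0)
			return -ERESTARTSYS;	/* signalled before we got the mutex */

		ret = rxrpc_send_data(rx, call, msg, len);

		mutex_unlock(&call->user_mutex);
		return ret;
	}

Because rxrpc_kernel_send_data() and rxrpc_kernel_abort_call() now take only the call mutex, in-kernel users can operate on different calls of the same socket concurrently instead of serialising on the socket lock.
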
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 87956a768d1b..403790cce7d2 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -650,6 +650,18 @@ config NET_ACT_MIRRED
650 To compile this code as a module, choose M here: the 650 To compile this code as a module, choose M here: the
651 module will be called act_mirred. 651 module will be called act_mirred.
652 652
653config NET_ACT_SAMPLE
654 tristate "Traffic Sampling"
655 depends on NET_CLS_ACT
656 select PSAMPLE
657 ---help---
658 Say Y here to allow packet sampling tc action. The packet sample
659 action consists of statistically choosing packets and sampling
660 them using the psample module.
661
662 To compile this code as a module, choose M here: the
663 module will be called act_sample.
664
653config NET_ACT_IPT 665config NET_ACT_IPT
654 tristate "IPtables targets" 666 tristate "IPtables targets"
655 depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES 667 depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
@@ -707,6 +719,7 @@ config NET_ACT_SKBEDIT
707config NET_ACT_CSUM 719config NET_ACT_CSUM
708 tristate "Checksum Updating" 720 tristate "Checksum Updating"
709 depends on NET_CLS_ACT && INET 721 depends on NET_CLS_ACT && INET
722 select LIBCRC32C
710 ---help--- 723 ---help---
711 Say Y here to update some common checksum after some direct 724 Say Y here to update some common checksum after some direct
712 packet alterations. 725 packet alterations.
@@ -763,6 +776,7 @@ config NET_ACT_SKBMOD
763config NET_ACT_IFE 776config NET_ACT_IFE
764 tristate "Inter-FE action based on IETF ForCES InterFE LFB" 777 tristate "Inter-FE action based on IETF ForCES InterFE LFB"
765 depends on NET_CLS_ACT 778 depends on NET_CLS_ACT
779 select NET_IFE
766 ---help--- 780 ---help---
767 Say Y here to allow for sourcing and terminating metadata 781 Say Y here to allow for sourcing and terminating metadata
768 For details refer to netdev01 paper: 782 For details refer to netdev01 paper:
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 4bdda3634e0b..7b915d226de7 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_NET_CLS_ACT) += act_api.o
10obj-$(CONFIG_NET_ACT_POLICE) += act_police.o 10obj-$(CONFIG_NET_ACT_POLICE) += act_police.o
11obj-$(CONFIG_NET_ACT_GACT) += act_gact.o 11obj-$(CONFIG_NET_ACT_GACT) += act_gact.o
12obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o 12obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o
13obj-$(CONFIG_NET_ACT_SAMPLE) += act_sample.o
13obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o 14obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o
14obj-$(CONFIG_NET_ACT_NAT) += act_nat.o 15obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
15obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o 16obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index e10456ef6f7a..e05b924618a0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -24,6 +24,7 @@
24#include <net/net_namespace.h> 24#include <net/net_namespace.h>
25#include <net/sock.h> 25#include <net/sock.h>
26#include <net/sch_generic.h> 26#include <net/sch_generic.h>
27#include <net/pkt_cls.h>
27#include <net/act_api.h> 28#include <net/act_api.h>
28#include <net/netlink.h> 29#include <net/netlink.h>
29 30
@@ -33,6 +34,12 @@ static void free_tcf(struct rcu_head *head)
33 34
34 free_percpu(p->cpu_bstats); 35 free_percpu(p->cpu_bstats);
35 free_percpu(p->cpu_qstats); 36 free_percpu(p->cpu_qstats);
37
38 if (p->act_cookie) {
39 kfree(p->act_cookie->data);
40 kfree(p->act_cookie);
41 }
42
36 kfree(p); 43 kfree(p);
37} 44}
38 45
@@ -426,11 +433,9 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
426{ 433{
427 int ret = -1, i; 434 int ret = -1, i;
428 435
429 if (skb->tc_verd & TC_NCLS) { 436 if (skb_skip_tc_classify(skb))
430 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 437 return TC_ACT_OK;
431 ret = TC_ACT_OK; 438
432 goto exec_done;
433 }
434 for (i = 0; i < nr_actions; i++) { 439 for (i = 0; i < nr_actions; i++) {
435 const struct tc_action *a = actions[i]; 440 const struct tc_action *a = actions[i];
436 441
@@ -439,9 +444,8 @@ repeat:
439 if (ret == TC_ACT_REPEAT) 444 if (ret == TC_ACT_REPEAT)
440 goto repeat; /* we need a ttl - JHS */ 445 goto repeat; /* we need a ttl - JHS */
441 if (ret != TC_ACT_PIPE) 446 if (ret != TC_ACT_PIPE)
442 goto exec_done; 447 break;
443 } 448 }
444exec_done:
445 return ret; 449 return ret;
446} 450}
447EXPORT_SYMBOL(tcf_action_exec); 451EXPORT_SYMBOL(tcf_action_exec);
@@ -478,6 +482,12 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
478 goto nla_put_failure; 482 goto nla_put_failure;
479 if (tcf_action_copy_stats(skb, a, 0)) 483 if (tcf_action_copy_stats(skb, a, 0))
480 goto nla_put_failure; 484 goto nla_put_failure;
485 if (a->act_cookie) {
486 if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len,
487 a->act_cookie->data))
488 goto nla_put_failure;
489 }
490
481 nest = nla_nest_start(skb, TCA_OPTIONS); 491 nest = nla_nest_start(skb, TCA_OPTIONS);
482 if (nest == NULL) 492 if (nest == NULL)
483 goto nla_put_failure; 493 goto nla_put_failure;
@@ -519,12 +529,29 @@ errout:
519 return err; 529 return err;
520} 530}
521 531
532static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
533{
534 struct tc_cookie *c = kzalloc(sizeof(*c), GFP_KERNEL);
535 if (!c)
536 return NULL;
537
538 c->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL);
539 if (!c->data) {
540 kfree(c);
541 return NULL;
542 }
543 c->len = nla_len(tb[TCA_ACT_COOKIE]);
544
545 return c;
546}
547
522struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, 548struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
523 struct nlattr *est, char *name, int ovr, 549 struct nlattr *est, char *name, int ovr,
524 int bind) 550 int bind)
525{ 551{
526 struct tc_action *a; 552 struct tc_action *a;
527 struct tc_action_ops *a_o; 553 struct tc_action_ops *a_o;
554 struct tc_cookie *cookie = NULL;
528 char act_name[IFNAMSIZ]; 555 char act_name[IFNAMSIZ];
529 struct nlattr *tb[TCA_ACT_MAX + 1]; 556 struct nlattr *tb[TCA_ACT_MAX + 1];
530 struct nlattr *kind; 557 struct nlattr *kind;
@@ -540,6 +567,18 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
540 goto err_out; 567 goto err_out;
541 if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) 568 if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
542 goto err_out; 569 goto err_out;
570 if (tb[TCA_ACT_COOKIE]) {
571 int cklen = nla_len(tb[TCA_ACT_COOKIE]);
572
573 if (cklen > TC_COOKIE_MAX_SIZE)
574 goto err_out;
575
576 cookie = nla_memdup_cookie(tb);
577 if (!cookie) {
578 err = -ENOMEM;
579 goto err_out;
580 }
581 }
543 } else { 582 } else {
544 err = -EINVAL; 583 err = -EINVAL;
545 if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) 584 if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ)
@@ -578,6 +617,14 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
578 if (err < 0) 617 if (err < 0)
579 goto err_mod; 618 goto err_mod;
580 619
620 if (name == NULL && tb[TCA_ACT_COOKIE]) {
621 if (a->act_cookie) {
622 kfree(a->act_cookie->data);
623 kfree(a->act_cookie);
624 }
625 a->act_cookie = cookie;
626 }
627
581 /* module count goes up only when brand new policy is created 628 /* module count goes up only when brand new policy is created
582 * if it exists and is only bound to in a_o->init() then 629 * if it exists and is only bound to in a_o->init() then
583 * ACT_P_CREATED is not returned (a zero is). 630 * ACT_P_CREATED is not returned (a zero is).
@@ -590,6 +637,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
590err_mod: 637err_mod:
591 module_put(a_o->owner); 638 module_put(a_o->owner);
592err_out: 639err_out:
640 if (cookie) {
641 kfree(cookie->data);
642 kfree(cookie);
643 }
593 return ERR_PTR(err); 644 return ERR_PTR(err);
594} 645}
595 646
@@ -817,10 +868,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
817 goto out_module_put; 868 goto out_module_put;
818 869
819 err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); 870 err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
820 if (err < 0) 871 if (err <= 0)
821 goto out_module_put; 872 goto out_module_put;
822 if (err == 0)
823 goto noflush_out;
824 873
825 nla_nest_end(skb, nest); 874 nla_nest_end(skb, nest);
826 875
@@ -837,7 +886,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
837out_module_put: 886out_module_put:
838 module_put(ops->owner); 887 module_put(ops->owner);
839err_out: 888err_out:
840noflush_out:
841 kfree_skb(skb); 889 kfree_skb(skb);
842 return err; 890 return err;
843} 891}
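
The act_api.c changes above duplicate an incoming TCA_ACT_COOKIE attribute into a struct tc_cookie and then free it in three places (free_tcf(), the init error path, and when replacing an existing cookie on an action). The two-step free could be captured by a small helper; the one below is hypothetical, not part of the patch, and only restates the lifetime rule that the duplicated attribute payload is owned by the wrapper:

	/* Hypothetical helper (not in the patch): release a cookie created by
	 * nla_memdup_cookie(), freeing the copied attribute payload first. */
	static void tcf_cookie_free(struct tc_cookie *cookie)
	{
		if (!cookie)
			return;
		kfree(cookie->data);
		kfree(cookie);
	}
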
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index ab8062909962..f9bb43c25697 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -113,6 +113,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
113 if (ret < 0) 113 if (ret < 0)
114 return ret; 114 return ret;
115 115
116 if (!tb[TCA_CONNMARK_PARMS])
117 return -EINVAL;
118
116 parm = nla_data(tb[TCA_CONNMARK_PARMS]); 119 parm = nla_data(tb[TCA_CONNMARK_PARMS]);
117 120
118 if (!tcf_hash_check(tn, parm->index, a, bind)) { 121 if (!tcf_hash_check(tn, parm->index, a, bind)) {
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index a0edd80a44db..e978ccd4402c 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -30,6 +30,7 @@
30#include <net/tcp.h> 30#include <net/tcp.h>
31#include <net/udp.h> 31#include <net/udp.h>
32#include <net/ip6_checksum.h> 32#include <net/ip6_checksum.h>
33#include <net/sctp/checksum.h>
33 34
34#include <net/act_api.h> 35#include <net/act_api.h>
35 36
@@ -322,6 +323,25 @@ ignore_obscure_skb:
322 return 1; 323 return 1;
323} 324}
324 325
326static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
327 unsigned int ipl)
328{
329 struct sctphdr *sctph;
330
331 if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_SCTP)
332 return 1;
333
334 sctph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*sctph));
335 if (!sctph)
336 return 0;
337
338 sctph->checksum = sctp_compute_cksum(skb,
339 skb_network_offset(skb) + ihl);
340 skb->ip_summed = CHECKSUM_NONE;
341
342 return 1;
343}
344
325static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) 345static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
326{ 346{
327 const struct iphdr *iph; 347 const struct iphdr *iph;
@@ -365,6 +385,11 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
365 ntohs(iph->tot_len), 1)) 385 ntohs(iph->tot_len), 1))
366 goto fail; 386 goto fail;
367 break; 387 break;
388 case IPPROTO_SCTP:
389 if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
390 !tcf_csum_sctp(skb, iph->ihl * 4, ntohs(iph->tot_len)))
391 goto fail;
392 break;
368 } 393 }
369 394
370 if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { 395 if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
@@ -481,6 +506,11 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
481 pl + sizeof(*ip6h), 1)) 506 pl + sizeof(*ip6h), 1))
482 goto fail; 507 goto fail;
483 goto done; 508 goto done;
509 case IPPROTO_SCTP:
510 if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
511 !tcf_csum_sctp(skb, hl, pl + sizeof(*ip6h)))
512 goto fail;
513 goto done;
484 default: 514 default:
485 goto ignore_skb; 515 goto ignore_skb;
486 } 516 }
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 80b848d3f096..71e7ff22f7c9 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -32,6 +32,7 @@
32#include <uapi/linux/tc_act/tc_ife.h> 32#include <uapi/linux/tc_act/tc_ife.h>
33#include <net/tc_act/tc_ife.h> 33#include <net/tc_act/tc_ife.h>
34#include <linux/etherdevice.h> 34#include <linux/etherdevice.h>
35#include <net/ife.h>
35 36
36#define IFE_TAB_MASK 15 37#define IFE_TAB_MASK 15
37 38
@@ -46,23 +47,6 @@ static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
46 [TCA_IFE_TYPE] = { .type = NLA_U16}, 47 [TCA_IFE_TYPE] = { .type = NLA_U16},
47}; 48};
48 49
49/* Caller takes care of presenting data in network order
50*/
51int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
52{
53 u32 *tlv = (u32 *)(skbdata);
54 u16 totlen = nla_total_size(dlen); /*alignment + hdr */
55 char *dptr = (char *)tlv + NLA_HDRLEN;
56 u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
57
58 *tlv = htonl(htlv);
59 memset(dptr, 0, totlen - NLA_HDRLEN);
60 memcpy(dptr, dval, dlen);
61
62 return totlen;
63}
64EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
65
66int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) 50int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
67{ 51{
68 u16 edata = 0; 52 u16 edata = 0;
@@ -637,69 +621,59 @@ int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
637 return 0; 621 return 0;
638} 622}
639 623
640struct ifeheadr {
641 __be16 metalen;
642 u8 tlv_data[];
643};
644
645struct meta_tlvhdr {
646 __be16 type;
647 __be16 len;
648};
649
650static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, 624static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
651 struct tcf_result *res) 625 struct tcf_result *res)
652{ 626{
653 struct tcf_ife_info *ife = to_ife(a); 627 struct tcf_ife_info *ife = to_ife(a);
654 int action = ife->tcf_action; 628 int action = ife->tcf_action;
655 struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data; 629 u8 *ifehdr_end;
656 int ifehdrln = (int)ifehdr->metalen; 630 u8 *tlv_data;
657 struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data); 631 u16 metalen;
658 632
659 spin_lock(&ife->tcf_lock); 633 spin_lock(&ife->tcf_lock);
660 bstats_update(&ife->tcf_bstats, skb); 634 bstats_update(&ife->tcf_bstats, skb);
661 tcf_lastuse_update(&ife->tcf_tm); 635 tcf_lastuse_update(&ife->tcf_tm);
662 spin_unlock(&ife->tcf_lock); 636 spin_unlock(&ife->tcf_lock);
663 637
664 ifehdrln = ntohs(ifehdrln); 638 if (skb_at_tc_ingress(skb))
665 if (unlikely(!pskb_may_pull(skb, ifehdrln))) { 639 skb_push(skb, skb->dev->hard_header_len);
640
641 tlv_data = ife_decode(skb, &metalen);
642 if (unlikely(!tlv_data)) {
666 spin_lock(&ife->tcf_lock); 643 spin_lock(&ife->tcf_lock);
667 ife->tcf_qstats.drops++; 644 ife->tcf_qstats.drops++;
668 spin_unlock(&ife->tcf_lock); 645 spin_unlock(&ife->tcf_lock);
669 return TC_ACT_SHOT; 646 return TC_ACT_SHOT;
670 } 647 }
671 648
672 skb_set_mac_header(skb, ifehdrln); 649 ifehdr_end = tlv_data + metalen;
673 __skb_pull(skb, ifehdrln); 650 for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) {
674 skb->protocol = eth_type_trans(skb, skb->dev); 651 u8 *curr_data;
675 ifehdrln -= IFE_METAHDRLEN; 652 u16 mtype;
676 653 u16 dlen;
677 while (ifehdrln > 0) {
678 u8 *tlvdata = (u8 *)tlv;
679 u16 mtype = tlv->type;
680 u16 mlen = tlv->len;
681 u16 alen;
682 654
683 mtype = ntohs(mtype); 655 curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL);
684 mlen = ntohs(mlen);
685 alen = NLA_ALIGN(mlen);
686 656
687 if (find_decode_metaid(skb, ife, mtype, (mlen - NLA_HDRLEN), 657 if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
688 (void *)(tlvdata + NLA_HDRLEN))) {
689 /* abuse overlimits to count when we receive metadata 658 /* abuse overlimits to count when we receive metadata
690 * but dont have an ops for it 659 * but dont have an ops for it
691 */ 660 */
692 pr_info_ratelimited("Unknown metaid %d alnlen %d\n", 661 pr_info_ratelimited("Unknown metaid %d dlen %d\n",
693 mtype, mlen); 662 mtype, dlen);
694 ife->tcf_qstats.overlimits++; 663 ife->tcf_qstats.overlimits++;
695 } 664 }
665 }
696 666
697 tlvdata += alen; 667 if (WARN_ON(tlv_data != ifehdr_end)) {
698 ifehdrln -= alen; 668 spin_lock(&ife->tcf_lock);
699 tlv = (struct meta_tlvhdr *)tlvdata; 669 ife->tcf_qstats.drops++;
670 spin_unlock(&ife->tcf_lock);
671 return TC_ACT_SHOT;
700 } 672 }
701 673
674 skb->protocol = eth_type_trans(skb, skb->dev);
702 skb_reset_network_header(skb); 675 skb_reset_network_header(skb);
676
703 return action; 677 return action;
704} 678}
705 679
@@ -727,7 +701,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
727 struct tcf_ife_info *ife = to_ife(a); 701 struct tcf_ife_info *ife = to_ife(a);
728 int action = ife->tcf_action; 702 int action = ife->tcf_action;
729 struct ethhdr *oethh; /* outer ether header */ 703 struct ethhdr *oethh; /* outer ether header */
730 struct ethhdr *iethh; /* inner eth header */
731 struct tcf_meta_info *e; 704 struct tcf_meta_info *e;
732 /* 705 /*
733 OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA 706 OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
@@ -735,13 +708,13 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
735 */ 708 */
736 u16 metalen = ife_get_sz(skb, ife); 709 u16 metalen = ife_get_sz(skb, ife);
737 int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; 710 int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
738 unsigned int skboff = skb->dev->hard_header_len; 711 unsigned int skboff = 0;
739 u32 at = G_TC_AT(skb->tc_verd);
740 int new_len = skb->len + hdrm; 712 int new_len = skb->len + hdrm;
741 bool exceed_mtu = false; 713 bool exceed_mtu = false;
742 int err; 714 void *ife_meta;
715 int err = 0;
743 716
744 if (at & AT_EGRESS) { 717 if (!skb_at_tc_ingress(skb)) {
745 if (new_len > skb->dev->mtu) 718 if (new_len > skb->dev->mtu)
746 exceed_mtu = true; 719 exceed_mtu = true;
747 } 720 }
@@ -766,27 +739,10 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
766 return TC_ACT_SHOT; 739 return TC_ACT_SHOT;
767 } 740 }
768 741
769 err = skb_cow_head(skb, hdrm); 742 if (skb_at_tc_ingress(skb))
770 if (unlikely(err)) {
771 ife->tcf_qstats.drops++;
772 spin_unlock(&ife->tcf_lock);
773 return TC_ACT_SHOT;
774 }
775
776 if (!(at & AT_EGRESS))
777 skb_push(skb, skb->dev->hard_header_len); 743 skb_push(skb, skb->dev->hard_header_len);
778 744
779 iethh = (struct ethhdr *)skb->data; 745 ife_meta = ife_encode(skb, metalen);
780 __skb_push(skb, hdrm);
781 memcpy(skb->data, iethh, skb->mac_len);
782 skb_reset_mac_header(skb);
783 oethh = eth_hdr(skb);
784
785 /*total metadata length */
786 metalen += IFE_METAHDRLEN;
787 metalen = htons(metalen);
788 memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
789 skboff += IFE_METAHDRLEN;
790 746
791 /* XXX: we dont have a clever way of telling encode to 747 /* XXX: we dont have a clever way of telling encode to
792 * not repeat some of the computations that are done by 748 * not repeat some of the computations that are done by
@@ -794,7 +750,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
794 */ 750 */
795 list_for_each_entry(e, &ife->metalist, metalist) { 751 list_for_each_entry(e, &ife->metalist, metalist) {
796 if (e->ops->encode) { 752 if (e->ops->encode) {
797 err = e->ops->encode(skb, (void *)(skb->data + skboff), 753 err = e->ops->encode(skb, (void *)(ife_meta + skboff),
798 e); 754 e);
799 } 755 }
800 if (err < 0) { 756 if (err < 0) {
@@ -805,18 +761,15 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
805 } 761 }
806 skboff += err; 762 skboff += err;
807 } 763 }
764 oethh = (struct ethhdr *)skb->data;
808 765
809 if (!is_zero_ether_addr(ife->eth_src)) 766 if (!is_zero_ether_addr(ife->eth_src))
810 ether_addr_copy(oethh->h_source, ife->eth_src); 767 ether_addr_copy(oethh->h_source, ife->eth_src);
811 else
812 ether_addr_copy(oethh->h_source, iethh->h_source);
813 if (!is_zero_ether_addr(ife->eth_dst)) 768 if (!is_zero_ether_addr(ife->eth_dst))
814 ether_addr_copy(oethh->h_dest, ife->eth_dst); 769 ether_addr_copy(oethh->h_dest, ife->eth_dst);
815 else
816 ether_addr_copy(oethh->h_dest, iethh->h_dest);
817 oethh->h_proto = htons(ife->eth_type); 770 oethh->h_proto = htons(ife->eth_type);
818 771
819 if (!(at & AT_EGRESS)) 772 if (skb_at_tc_ingress(skb))
820 skb_pull(skb, skb->dev->hard_header_len); 773 skb_pull(skb, skb->dev->hard_header_len);
821 774
822 spin_unlock(&ife->tcf_lock); 775 spin_unlock(&ife->tcf_lock);
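
The act_ife.c rework above moves the raw TLV walking into the shared net/ife helpers (ife_decode(), ife_tlv_meta_decode(), ife_tlv_meta_next()), but the on-wire layout is unchanged: outer header, a 16-bit total metadata length, then netlink-style TLVs. As a worked example of the sizing that layout implies, here is a small userspace program (illustrative only) that reproduces the arithmetic the removed ife_tlv_meta_encode() performed:

	#include <stdio.h>
	#include <linux/netlink.h>	/* NLA_HDRLEN, NLA_ALIGN */

	int main(void)
	{
		int dlen = 2;					/* e.g. a 16-bit metadatum */
		int totlen = NLA_ALIGN(NLA_HDRLEN + dlen);	/* what nla_total_size(dlen) returns */

		/* prints: a 2-byte metadatum occupies 8 bytes of IFE metadata */
		printf("a %d-byte metadatum occupies %d bytes of IFE metadata\n",
		       dlen, totlen);
		return 0;
	}

The 16-bit total-length field itself accounts for the extra IFE_METAHDRLEN bytes that the encode path adds on top of the summed per-TLV sizes.
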
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 2d9fa6e0a1b4..af49c7dca860 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -28,8 +28,6 @@
28#include <linux/tc_act/tc_mirred.h> 28#include <linux/tc_act/tc_mirred.h>
29#include <net/tc_act/tc_mirred.h> 29#include <net/tc_act/tc_mirred.h>
30 30
31#include <linux/if_arp.h>
32
33#define MIRRED_TAB_MASK 7 31#define MIRRED_TAB_MASK 7
34static LIST_HEAD(mirred_list); 32static LIST_HEAD(mirred_list);
35static DEFINE_SPINLOCK(mirred_list_lock); 33static DEFINE_SPINLOCK(mirred_list_lock);
@@ -39,15 +37,15 @@ static bool tcf_mirred_is_act_redirect(int action)
39 return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; 37 return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
40} 38}
41 39
42static u32 tcf_mirred_act_direction(int action) 40static bool tcf_mirred_act_wants_ingress(int action)
43{ 41{
44 switch (action) { 42 switch (action) {
45 case TCA_EGRESS_REDIR: 43 case TCA_EGRESS_REDIR:
46 case TCA_EGRESS_MIRROR: 44 case TCA_EGRESS_MIRROR:
47 return AT_EGRESS; 45 return false;
48 case TCA_INGRESS_REDIR: 46 case TCA_INGRESS_REDIR:
49 case TCA_INGRESS_MIRROR: 47 case TCA_INGRESS_MIRROR:
50 return AT_INGRESS; 48 return true;
51 default: 49 default:
52 BUG(); 50 BUG();
53 } 51 }
@@ -170,7 +168,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
170 int retval, err = 0; 168 int retval, err = 0;
171 int m_eaction; 169 int m_eaction;
172 int mac_len; 170 int mac_len;
173 u32 at;
174 171
175 tcf_lastuse_update(&m->tcf_tm); 172 tcf_lastuse_update(&m->tcf_tm);
176 bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); 173 bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
@@ -191,7 +188,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
191 goto out; 188 goto out;
192 } 189 }
193 190
194 at = G_TC_AT(skb->tc_verd);
195 skb2 = skb_clone(skb, GFP_ATOMIC); 191 skb2 = skb_clone(skb, GFP_ATOMIC);
196 if (!skb2) 192 if (!skb2)
197 goto out; 193 goto out;
@@ -200,8 +196,9 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
200 * and devices expect a mac header on xmit, then mac push/pull is 196 * and devices expect a mac header on xmit, then mac push/pull is
201 * needed. 197 * needed.
202 */ 198 */
203 if (at != tcf_mirred_act_direction(m_eaction) && m_mac_header_xmit) { 199 if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) &&
204 if (at & AT_EGRESS) { 200 m_mac_header_xmit) {
201 if (!skb_at_tc_ingress(skb)) {
205 /* caught at egress, act ingress: pull mac */ 202 /* caught at egress, act ingress: pull mac */
206 mac_len = skb_network_header(skb) - skb_mac_header(skb); 203 mac_len = skb_network_header(skb) - skb_mac_header(skb);
207 skb_pull_rcsum(skb2, mac_len); 204 skb_pull_rcsum(skb2, mac_len);
@@ -212,12 +209,14 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
212 } 209 }
213 210
214 /* mirror is always swallowed */ 211 /* mirror is always swallowed */
215 if (tcf_mirred_is_act_redirect(m_eaction)) 212 if (tcf_mirred_is_act_redirect(m_eaction)) {
216 skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at); 213 skb2->tc_redirected = 1;
214 skb2->tc_from_ingress = skb2->tc_at_ingress;
215 }
217 216
218 skb2->skb_iif = skb->dev->ifindex; 217 skb2->skb_iif = skb->dev->ifindex;
219 skb2->dev = dev; 218 skb2->dev = dev;
220 if (tcf_mirred_act_direction(m_eaction) & AT_EGRESS) 219 if (!tcf_mirred_act_wants_ingress(m_eaction))
221 err = dev_queue_xmit(skb2); 220 err = dev_queue_xmit(skb2);
222 else 221 else
223 err = netif_receive_skb(skb2); 222 err = netif_receive_skb(skb2);
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index b27c4daec88f..c1310472f620 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -22,6 +22,7 @@
22#include <net/pkt_sched.h> 22#include <net/pkt_sched.h>
23#include <linux/tc_act/tc_pedit.h> 23#include <linux/tc_act/tc_pedit.h>
24#include <net/tc_act/tc_pedit.h> 24#include <net/tc_act/tc_pedit.h>
25#include <uapi/linux/tc_act/tc_pedit.h>
25 26
26#define PEDIT_TAB_MASK 15 27#define PEDIT_TAB_MASK 15
27 28
@@ -30,18 +31,117 @@ static struct tc_action_ops act_pedit_ops;
30 31
31static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { 32static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
32 [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, 33 [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
34 [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED },
33}; 35};
34 36
37static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = {
38 [TCA_PEDIT_KEY_EX_HTYPE] = { .type = NLA_U16 },
39 [TCA_PEDIT_KEY_EX_CMD] = { .type = NLA_U16 },
40};
41
42static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
43 u8 n)
44{
45 struct tcf_pedit_key_ex *keys_ex;
46 struct tcf_pedit_key_ex *k;
47 const struct nlattr *ka;
48 int err = -EINVAL;
49 int rem;
50
51 if (!nla || !n)
52 return NULL;
53
54 keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
55 if (!keys_ex)
56 return ERR_PTR(-ENOMEM);
57
58 k = keys_ex;
59
60 nla_for_each_nested(ka, nla, rem) {
61 struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1];
62
63 if (!n) {
64 err = -EINVAL;
65 goto err_out;
66 }
67 n--;
68
69 if (nla_type(ka) != TCA_PEDIT_KEY_EX) {
70 err = -EINVAL;
71 goto err_out;
72 }
73
74 err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka,
75 pedit_key_ex_policy);
76 if (err)
77 goto err_out;
78
79 if (!tb[TCA_PEDIT_KEY_EX_HTYPE] ||
80 !tb[TCA_PEDIT_KEY_EX_CMD]) {
81 err = -EINVAL;
82 goto err_out;
83 }
84
85 k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]);
86 k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]);
87
88 if (k->htype > TCA_PEDIT_HDR_TYPE_MAX ||
89 k->cmd > TCA_PEDIT_CMD_MAX) {
90 err = -EINVAL;
91 goto err_out;
92 }
93
94 k++;
95 }
96
97 if (n)
98 goto err_out;
99
100 return keys_ex;
101
102err_out:
103 kfree(keys_ex);
104 return ERR_PTR(err);
105}
106
107static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
108 struct tcf_pedit_key_ex *keys_ex, int n)
109{
110 struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX);
111
112 for (; n > 0; n--) {
113 struct nlattr *key_start;
114
115 key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
116
117 if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) ||
118 nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) {
119 nlmsg_trim(skb, keys_start);
120 return -EINVAL;
121 }
122
123 nla_nest_end(skb, key_start);
124
125 keys_ex++;
126 }
127
128 nla_nest_end(skb, keys_start);
129
130 return 0;
131}
132
35static int tcf_pedit_init(struct net *net, struct nlattr *nla, 133static int tcf_pedit_init(struct net *net, struct nlattr *nla,
36 struct nlattr *est, struct tc_action **a, 134 struct nlattr *est, struct tc_action **a,
37 int ovr, int bind) 135 int ovr, int bind)
38{ 136{
39 struct tc_action_net *tn = net_generic(net, pedit_net_id); 137 struct tc_action_net *tn = net_generic(net, pedit_net_id);
40 struct nlattr *tb[TCA_PEDIT_MAX + 1]; 138 struct nlattr *tb[TCA_PEDIT_MAX + 1];
139 struct nlattr *pattr;
41 struct tc_pedit *parm; 140 struct tc_pedit *parm;
42 int ret = 0, err; 141 int ret = 0, err;
43 struct tcf_pedit *p; 142 struct tcf_pedit *p;
44 struct tc_pedit_key *keys = NULL; 143 struct tc_pedit_key *keys = NULL;
144 struct tcf_pedit_key_ex *keys_ex;
45 int ksize; 145 int ksize;
46 146
47 if (nla == NULL) 147 if (nla == NULL)
@@ -51,13 +151,21 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
51 if (err < 0) 151 if (err < 0)
52 return err; 152 return err;
53 153
54 if (tb[TCA_PEDIT_PARMS] == NULL) 154 pattr = tb[TCA_PEDIT_PARMS];
155 if (!pattr)
156 pattr = tb[TCA_PEDIT_PARMS_EX];
157 if (!pattr)
55 return -EINVAL; 158 return -EINVAL;
56 parm = nla_data(tb[TCA_PEDIT_PARMS]); 159
160 parm = nla_data(pattr);
57 ksize = parm->nkeys * sizeof(struct tc_pedit_key); 161 ksize = parm->nkeys * sizeof(struct tc_pedit_key);
58 if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) 162 if (nla_len(pattr) < sizeof(*parm) + ksize)
59 return -EINVAL; 163 return -EINVAL;
60 164
165 keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
166 if (IS_ERR(keys_ex))
167 return PTR_ERR(keys_ex);
168
61 if (!tcf_hash_check(tn, parm->index, a, bind)) { 169 if (!tcf_hash_check(tn, parm->index, a, bind)) {
62 if (!parm->nkeys) 170 if (!parm->nkeys)
63 return -EINVAL; 171 return -EINVAL;
@@ -69,6 +177,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
69 keys = kmalloc(ksize, GFP_KERNEL); 177 keys = kmalloc(ksize, GFP_KERNEL);
70 if (keys == NULL) { 178 if (keys == NULL) {
71 tcf_hash_cleanup(*a, est); 179 tcf_hash_cleanup(*a, est);
180 kfree(keys_ex);
72 return -ENOMEM; 181 return -ENOMEM;
73 } 182 }
74 ret = ACT_P_CREATED; 183 ret = ACT_P_CREATED;
@@ -81,8 +190,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
81 p = to_pedit(*a); 190 p = to_pedit(*a);
82 if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { 191 if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
83 keys = kmalloc(ksize, GFP_KERNEL); 192 keys = kmalloc(ksize, GFP_KERNEL);
84 if (keys == NULL) 193 if (!keys) {
194 kfree(keys_ex);
85 return -ENOMEM; 195 return -ENOMEM;
196 }
86 } 197 }
87 } 198 }
88 199
@@ -95,6 +206,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
95 p->tcfp_nkeys = parm->nkeys; 206 p->tcfp_nkeys = parm->nkeys;
96 } 207 }
97 memcpy(p->tcfp_keys, parm->keys, ksize); 208 memcpy(p->tcfp_keys, parm->keys, ksize);
209
210 kfree(p->tcfp_keys_ex);
211 p->tcfp_keys_ex = keys_ex;
212
98 spin_unlock_bh(&p->tcf_lock); 213 spin_unlock_bh(&p->tcf_lock);
99 if (ret == ACT_P_CREATED) 214 if (ret == ACT_P_CREATED)
100 tcf_hash_insert(tn, *a); 215 tcf_hash_insert(tn, *a);
@@ -106,6 +221,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind)
106 struct tcf_pedit *p = to_pedit(a); 221 struct tcf_pedit *p = to_pedit(a);
107 struct tc_pedit_key *keys = p->tcfp_keys; 222 struct tc_pedit_key *keys = p->tcfp_keys;
108 kfree(keys); 223 kfree(keys);
224 kfree(p->tcfp_keys_ex);
109} 225}
110 226
111static bool offset_valid(struct sk_buff *skb, int offset) 227static bool offset_valid(struct sk_buff *skb, int offset)
@@ -119,38 +235,88 @@ static bool offset_valid(struct sk_buff *skb, int offset)
119 return true; 235 return true;
120} 236}
121 237
238static int pedit_skb_hdr_offset(struct sk_buff *skb,
239 enum pedit_header_type htype, int *hoffset)
240{
241 int ret = -EINVAL;
242
243 switch (htype) {
244 case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH:
245 if (skb_mac_header_was_set(skb)) {
246 *hoffset = skb_mac_offset(skb);
247 ret = 0;
248 }
249 break;
250 case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK:
251 case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:
252 case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6:
253 *hoffset = skb_network_offset(skb);
254 ret = 0;
255 break;
256 case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP:
257 case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP:
258 if (skb_transport_header_was_set(skb)) {
259 *hoffset = skb_transport_offset(skb);
260 ret = 0;
261 }
262 break;
263 default:
264 ret = -EINVAL;
265 break;
266 };
267
268 return ret;
269}
270
122static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, 271static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
123 struct tcf_result *res) 272 struct tcf_result *res)
124{ 273{
125 struct tcf_pedit *p = to_pedit(a); 274 struct tcf_pedit *p = to_pedit(a);
126 int i; 275 int i;
127 unsigned int off;
128 276
129 if (skb_unclone(skb, GFP_ATOMIC)) 277 if (skb_unclone(skb, GFP_ATOMIC))
130 return p->tcf_action; 278 return p->tcf_action;
131 279
132 off = skb_network_offset(skb);
133
134 spin_lock(&p->tcf_lock); 280 spin_lock(&p->tcf_lock);
135 281
136 tcf_lastuse_update(&p->tcf_tm); 282 tcf_lastuse_update(&p->tcf_tm);
137 283
138 if (p->tcfp_nkeys > 0) { 284 if (p->tcfp_nkeys > 0) {
139 struct tc_pedit_key *tkey = p->tcfp_keys; 285 struct tc_pedit_key *tkey = p->tcfp_keys;
286 struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
287 enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
288 enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
140 289
141 for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { 290 for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
142 u32 *ptr, _data; 291 u32 *ptr, _data;
143 int offset = tkey->off; 292 int offset = tkey->off;
293 int hoffset;
294 u32 val;
295 int rc;
296
297 if (tkey_ex) {
298 htype = tkey_ex->htype;
299 cmd = tkey_ex->cmd;
300
301 tkey_ex++;
302 }
303
304 rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
305 if (rc) {
306 pr_info("tc filter pedit bad header type specified (0x%x)\n",
307 htype);
308 goto bad;
309 }
144 310
145 if (tkey->offmask) { 311 if (tkey->offmask) {
146 char *d, _d; 312 char *d, _d;
147 313
148 if (!offset_valid(skb, off + tkey->at)) { 314 if (!offset_valid(skb, hoffset + tkey->at)) {
149 pr_info("tc filter pedit 'at' offset %d out of bounds\n", 315 pr_info("tc filter pedit 'at' offset %d out of bounds\n",
150 off + tkey->at); 316 hoffset + tkey->at);
151 goto bad; 317 goto bad;
152 } 318 }
153 d = skb_header_pointer(skb, off + tkey->at, 1, 319 d = skb_header_pointer(skb, hoffset + tkey->at, 1,
154 &_d); 320 &_d);
155 if (!d) 321 if (!d)
156 goto bad; 322 goto bad;
@@ -163,19 +329,32 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
163 goto bad; 329 goto bad;
164 } 330 }
165 331
166 if (!offset_valid(skb, off + offset)) { 332 if (!offset_valid(skb, hoffset + offset)) {
167 pr_info("tc filter pedit offset %d out of bounds\n", 333 pr_info("tc filter pedit offset %d out of bounds\n",
168 offset); 334 hoffset + offset);
169 goto bad; 335 goto bad;
170 } 336 }
171 337
172 ptr = skb_header_pointer(skb, off + offset, 4, &_data); 338 ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data);
173 if (!ptr) 339 if (!ptr)
174 goto bad; 340 goto bad;
175 /* just do it, baby */ 341 /* just do it, baby */
176 *ptr = ((*ptr & tkey->mask) ^ tkey->val); 342 switch (cmd) {
343 case TCA_PEDIT_KEY_EX_CMD_SET:
344 val = tkey->val;
345 break;
346 case TCA_PEDIT_KEY_EX_CMD_ADD:
347 val = (*ptr + tkey->val) & ~tkey->mask;
348 break;
349 default:
350 pr_info("tc filter pedit bad command (%d)\n",
351 cmd);
352 goto bad;
353 }
354
355 *ptr = ((*ptr & tkey->mask) ^ val);
177 if (ptr == &_data) 356 if (ptr == &_data)
178 skb_store_bits(skb, off + offset, ptr, 4); 357 skb_store_bits(skb, hoffset + offset, ptr, 4);
179 } 358 }
180 359
181 goto done; 360 goto done;
@@ -215,8 +394,15 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
215 opt->refcnt = p->tcf_refcnt - ref; 394 opt->refcnt = p->tcf_refcnt - ref;
216 opt->bindcnt = p->tcf_bindcnt - bind; 395 opt->bindcnt = p->tcf_bindcnt - bind;
217 396
218 if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) 397 if (p->tcfp_keys_ex) {
219 goto nla_put_failure; 398 tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys);
399
400 if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt))
401 goto nla_put_failure;
402 } else {
403 if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
404 goto nla_put_failure;
405 }
220 406
221 tcf_tm_dump(&t, &p->tcf_tm); 407 tcf_tm_dump(&t, &p->tcf_tm);
222 if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) 408 if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
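
The extended pedit keys above carry an explicit header type, so offsets are taken from the mac, network or transport header instead of always the network header, plus a command, where CMD_ADD derives the new value from the packet's current contents. A small userspace model of the 32-bit word update performed in tcf_pedit() follows; it is illustrative only (byte order and bounds checks omitted), and for CMD_SET userspace is expected to supply a value already confined to the bits outside the mask:

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t pedit_apply(uint32_t word, uint32_t mask, uint32_t val, int add)
	{
		uint32_t v = add ? (word + val) & ~mask : val;

		return (word & mask) ^ v;	/* masked bits kept, the rest rewritten */
	}

	int main(void)
	{
		/* CMD_ADD: bump the third byte of the word, preserve everything else */
		printf("%08x\n", pedit_apply(0x11223344, 0xff00ffff, 0x00010000, 1));
		/* prints 11233344 */
		return 0;
	}
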
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
new file mode 100644
index 000000000000..0b8217b4763f
--- /dev/null
+++ b/net/sched/act_sample.c
@@ -0,0 +1,276 @@
1/*
2 * net/sched/act_sample.c - Packet sampling tc action
3 * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/errno.h>
14#include <linux/skbuff.h>
15#include <linux/rtnetlink.h>
16#include <linux/module.h>
17#include <linux/init.h>
18#include <linux/gfp.h>
19#include <net/net_namespace.h>
20#include <net/netlink.h>
21#include <net/pkt_sched.h>
22#include <linux/tc_act/tc_sample.h>
23#include <net/tc_act/tc_sample.h>
24#include <net/psample.h>
25
26#include <linux/if_arp.h>
27
28#define SAMPLE_TAB_MASK 7
29static unsigned int sample_net_id;
30static struct tc_action_ops act_sample_ops;
31
32static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
33 [TCA_SAMPLE_PARMS] = { .len = sizeof(struct tc_sample) },
34 [TCA_SAMPLE_RATE] = { .type = NLA_U32 },
35 [TCA_SAMPLE_TRUNC_SIZE] = { .type = NLA_U32 },
36 [TCA_SAMPLE_PSAMPLE_GROUP] = { .type = NLA_U32 },
37};
38
39static int tcf_sample_init(struct net *net, struct nlattr *nla,
40 struct nlattr *est, struct tc_action **a, int ovr,
41 int bind)
42{
43 struct tc_action_net *tn = net_generic(net, sample_net_id);
44 struct nlattr *tb[TCA_SAMPLE_MAX + 1];
45 struct psample_group *psample_group;
46 struct tc_sample *parm;
47 struct tcf_sample *s;
48 bool exists = false;
49 int ret;
50
51 if (!nla)
52 return -EINVAL;
53 ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy);
54 if (ret < 0)
55 return ret;
56 if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] ||
57 !tb[TCA_SAMPLE_PSAMPLE_GROUP])
58 return -EINVAL;
59
60 parm = nla_data(tb[TCA_SAMPLE_PARMS]);
61
62 exists = tcf_hash_check(tn, parm->index, a, bind);
63 if (exists && bind)
64 return 0;
65
66 if (!exists) {
67 ret = tcf_hash_create(tn, parm->index, est, a,
68 &act_sample_ops, bind, false);
69 if (ret)
70 return ret;
71 ret = ACT_P_CREATED;
72 } else {
73 tcf_hash_release(*a, bind);
74 if (!ovr)
75 return -EEXIST;
76 }
77 s = to_sample(*a);
78
79 s->tcf_action = parm->action;
80 s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
81 s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
82 psample_group = psample_group_get(net, s->psample_group_num);
83 if (!psample_group) {
84 if (ret == ACT_P_CREATED)
85 tcf_hash_release(*a, bind);
86 return -ENOMEM;
87 }
88 RCU_INIT_POINTER(s->psample_group, psample_group);
89
90 if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
91 s->truncate = true;
92 s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
93 }
94
95 if (ret == ACT_P_CREATED)
96 tcf_hash_insert(tn, *a);
97 return ret;
98}
99
100static void tcf_sample_cleanup_rcu(struct rcu_head *rcu)
101{
102 struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu);
103 struct psample_group *psample_group;
104
105 psample_group = rcu_dereference_protected(s->psample_group, 1);
106 RCU_INIT_POINTER(s->psample_group, NULL);
107 psample_group_put(psample_group);
108}
109
110static void tcf_sample_cleanup(struct tc_action *a, int bind)
111{
112 struct tcf_sample *s = to_sample(a);
113
114 call_rcu(&s->rcu, tcf_sample_cleanup_rcu);
115}
116
117static bool tcf_sample_dev_ok_push(struct net_device *dev)
118{
119 switch (dev->type) {
120 case ARPHRD_TUNNEL:
121 case ARPHRD_TUNNEL6:
122 case ARPHRD_SIT:
123 case ARPHRD_IPGRE:
124 case ARPHRD_VOID:
125 case ARPHRD_NONE:
126 return false;
127 default:
128 return true;
129 }
130}
131
132static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
133 struct tcf_result *res)
134{
135 struct tcf_sample *s = to_sample(a);
136 struct psample_group *psample_group;
137 int retval;
138 int size;
139 int iif;
140 int oif;
141
142 tcf_lastuse_update(&s->tcf_tm);
143 bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
144 retval = READ_ONCE(s->tcf_action);
145
146 rcu_read_lock();
147 psample_group = rcu_dereference(s->psample_group);
148
149 /* randomly sample packets according to rate */
150 if (psample_group && (prandom_u32() % s->rate == 0)) {
151 if (!skb_at_tc_ingress(skb)) {
152 iif = skb->skb_iif;
153 oif = skb->dev->ifindex;
154 } else {
155 iif = skb->dev->ifindex;
156 oif = 0;
157 }
158
159 /* on ingress, the mac header gets popped, so push it back */
160 if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
161 skb_push(skb, skb->mac_len);
162
163 size = s->truncate ? s->trunc_size : skb->len;
164 psample_sample_packet(psample_group, skb, size, iif, oif,
165 s->rate);
166
167 if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
168 skb_pull(skb, skb->mac_len);
169 }
170
171 rcu_read_unlock();
172 return retval;
173}
174
175static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
176 int bind, int ref)
177{
178 unsigned char *b = skb_tail_pointer(skb);
179 struct tcf_sample *s = to_sample(a);
180 struct tc_sample opt = {
181 .index = s->tcf_index,
182 .action = s->tcf_action,
183 .refcnt = s->tcf_refcnt - ref,
184 .bindcnt = s->tcf_bindcnt - bind,
185 };
186 struct tcf_t t;
187
188 if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
189 goto nla_put_failure;
190
191 tcf_tm_dump(&t, &s->tcf_tm);
192 if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD))
193 goto nla_put_failure;
194
195 if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate))
196 goto nla_put_failure;
197
198 if (s->truncate)
199 if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size))
200 goto nla_put_failure;
201
202 if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
203 goto nla_put_failure;
204 return skb->len;
205
206nla_put_failure:
207 nlmsg_trim(skb, b);
208 return -1;
209}
210
211static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
212 struct netlink_callback *cb, int type,
213 const struct tc_action_ops *ops)
214{
215 struct tc_action_net *tn = net_generic(net, sample_net_id);
216
217 return tcf_generic_walker(tn, skb, cb, type, ops);
218}
219
220static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
221{
222 struct tc_action_net *tn = net_generic(net, sample_net_id);
223
224 return tcf_hash_search(tn, a, index);
225}
226
227static struct tc_action_ops act_sample_ops = {
228 .kind = "sample",
229 .type = TCA_ACT_SAMPLE,
230 .owner = THIS_MODULE,
231 .act = tcf_sample_act,
232 .dump = tcf_sample_dump,
233 .init = tcf_sample_init,
234 .cleanup = tcf_sample_cleanup,
235 .walk = tcf_sample_walker,
236 .lookup = tcf_sample_search,
237 .size = sizeof(struct tcf_sample),
238};
239
240static __net_init int sample_init_net(struct net *net)
241{
242 struct tc_action_net *tn = net_generic(net, sample_net_id);
243
244 return tc_action_net_init(tn, &act_sample_ops, SAMPLE_TAB_MASK);
245}
246
247static void __net_exit sample_exit_net(struct net *net)
248{
249 struct tc_action_net *tn = net_generic(net, sample_net_id);
250
251 tc_action_net_exit(tn);
252}
253
254static struct pernet_operations sample_net_ops = {
255 .init = sample_init_net,
256 .exit = sample_exit_net,
257 .id = &sample_net_id,
258 .size = sizeof(struct tc_action_net),
259};
260
261static int __init sample_init_module(void)
262{
263 return tcf_register_action(&act_sample_ops, &sample_net_ops);
264}
265
266static void __exit sample_cleanup_module(void)
267{
268 tcf_unregister_action(&act_sample_ops, &sample_net_ops);
269}
270
271module_init(sample_init_module);
272module_exit(sample_cleanup_module);
273
274MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
275MODULE_DESCRIPTION("Packet sampling action");
276MODULE_LICENSE("GPL v2");
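
In the new act_sample above, the per-packet decision is prandom_u32() % s->rate == 0 (guarded by a psample group being configured), so each packet is chosen independently with probability 1/rate. A tiny userspace model, illustrative only and with rand() standing in for prandom_u32(), shows the expected yield for rate = 100:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	static int should_sample(uint32_t rate)
	{
		return (uint32_t)rand() % rate == 0;	/* stand-in for prandom_u32() */
	}

	int main(void)
	{
		unsigned int hits = 0, i;

		for (i = 0; i < 1000000; i++)
			hits += should_sample(100);
		printf("sampled %u of 1000000 packets (expect roughly 10000)\n", hits);
		return 0;
	}

Truncation (TCA_SAMPLE_TRUNC_SIZE) only limits how many bytes of each chosen packet are handed to psample_sample_packet(); it does not change which packets are chosen.
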
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 3b7074e23024..c736627f8f4a 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -228,7 +228,6 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
228 228
229 return skb->len; 229 return skb->len;
230nla_put_failure: 230nla_put_failure:
231 rcu_read_unlock();
232 nlmsg_trim(skb, b); 231 nlmsg_trim(skb, b);
233 return -1; 232 return -1;
234} 233}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1ecdf809b5fa..732f7cae459d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -19,6 +19,7 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/string.h> 20#include <linux/string.h>
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/err.h>
22#include <linux/skbuff.h> 23#include <linux/skbuff.h>
23#include <linux/init.h> 24#include <linux/init.h>
24#include <linux/kmod.h> 25#include <linux/kmod.h>
@@ -38,14 +39,14 @@ static DEFINE_RWLOCK(cls_mod_lock);
38 39
39/* Find classifier type by string name */ 40/* Find classifier type by string name */
40 41
41static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) 42static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
42{ 43{
43 const struct tcf_proto_ops *t, *res = NULL; 44 const struct tcf_proto_ops *t, *res = NULL;
44 45
45 if (kind) { 46 if (kind) {
46 read_lock(&cls_mod_lock); 47 read_lock(&cls_mod_lock);
47 list_for_each_entry(t, &tcf_proto_base, head) { 48 list_for_each_entry(t, &tcf_proto_base, head) {
48 if (nla_strcmp(kind, t->kind) == 0) { 49 if (strcmp(kind, t->kind) == 0) {
49 if (try_module_get(t->owner)) 50 if (try_module_get(t->owner))
50 res = t; 51 res = t;
51 break; 52 break;
@@ -127,6 +128,77 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
127 return first; 128 return first;
128} 129}
129 130
131static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
132 u32 prio, u32 parent, struct Qdisc *q)
133{
134 struct tcf_proto *tp;
135 int err;
136
137 tp = kzalloc(sizeof(*tp), GFP_KERNEL);
138 if (!tp)
139 return ERR_PTR(-ENOBUFS);
140
141 err = -ENOENT;
142 tp->ops = tcf_proto_lookup_ops(kind);
143 if (!tp->ops) {
144#ifdef CONFIG_MODULES
145 rtnl_unlock();
146 request_module("cls_%s", kind);
147 rtnl_lock();
148 tp->ops = tcf_proto_lookup_ops(kind);
149 /* We dropped the RTNL semaphore in order to perform
150 * the module load. So, even if we succeeded in loading
151 * the module we have to replay the request. We indicate
152 * this using -EAGAIN.
153 */
154 if (tp->ops) {
155 module_put(tp->ops->owner);
156 err = -EAGAIN;
157 } else {
158 err = -ENOENT;
159 }
160 goto errout;
161#endif
162 }
163 tp->classify = tp->ops->classify;
164 tp->protocol = protocol;
165 tp->prio = prio;
166 tp->classid = parent;
167 tp->q = q;
168
169 err = tp->ops->init(tp);
170 if (err) {
171 module_put(tp->ops->owner);
172 goto errout;
173 }
174 return tp;
175
176errout:
177 kfree(tp);
178 return ERR_PTR(err);
179}
180
181static bool tcf_proto_destroy(struct tcf_proto *tp, bool force)
182{
183 if (tp->ops->destroy(tp, force)) {
184 module_put(tp->ops->owner);
185 kfree_rcu(tp, rcu);
186 return true;
187 }
188 return false;
189}
190
191void tcf_destroy_chain(struct tcf_proto __rcu **fl)
192{
193 struct tcf_proto *tp;
194
195 while ((tp = rtnl_dereference(*fl)) != NULL) {
196 RCU_INIT_POINTER(*fl, tp->next);
197 tcf_proto_destroy(tp, true);
198 }
199}
200EXPORT_SYMBOL(tcf_destroy_chain);
201
130/* Add/change/delete/get a filter node */ 202/* Add/change/delete/get a filter node */
131 203
132static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) 204static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
@@ -142,8 +214,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
142 struct Qdisc *q; 214 struct Qdisc *q;
143 struct tcf_proto __rcu **back; 215 struct tcf_proto __rcu **back;
144 struct tcf_proto __rcu **chain; 216 struct tcf_proto __rcu **chain;
217 struct tcf_proto *next;
145 struct tcf_proto *tp; 218 struct tcf_proto *tp;
146 const struct tcf_proto_ops *tp_ops;
147 const struct Qdisc_class_ops *cops; 219 const struct Qdisc_class_ops *cops;
148 unsigned long cl; 220 unsigned long cl;
149 unsigned long fh; 221 unsigned long fh;
@@ -222,9 +294,10 @@ replay:
222 294
223 /* And the last stroke */ 295 /* And the last stroke */
224 chain = cops->tcf_chain(q, cl); 296 chain = cops->tcf_chain(q, cl);
225 err = -EINVAL; 297 if (chain == NULL) {
226 if (chain == NULL) 298 err = -EINVAL;
227 goto errout; 299 goto errout;
300 }
228 if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { 301 if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
229 tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); 302 tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
230 tcf_destroy_chain(chain); 303 tcf_destroy_chain(chain);
@@ -239,10 +312,13 @@ replay:
239 if (tp->prio >= prio) { 312 if (tp->prio >= prio) {
240 if (tp->prio == prio) { 313 if (tp->prio == prio) {
241 if (!nprio || 314 if (!nprio ||
242 (tp->protocol != protocol && protocol)) 315 (tp->protocol != protocol && protocol)) {
316 err = -EINVAL;
243 goto errout; 317 goto errout;
244 } else 318 }
319 } else {
245 tp = NULL; 320 tp = NULL;
321 }
246 break; 322 break;
247 } 323 }
248 } 324 }
@@ -250,109 +326,69 @@ replay:
250 if (tp == NULL) { 326 if (tp == NULL) {
251 /* Proto-tcf does not exist, create new one */ 327 /* Proto-tcf does not exist, create new one */
252 328
253 if (tca[TCA_KIND] == NULL || !protocol) 329 if (tca[TCA_KIND] == NULL || !protocol) {
330 err = -EINVAL;
254 goto errout; 331 goto errout;
332 }
255 333
256 err = -ENOENT;
257 if (n->nlmsg_type != RTM_NEWTFILTER || 334 if (n->nlmsg_type != RTM_NEWTFILTER ||
258 !(n->nlmsg_flags & NLM_F_CREATE)) 335 !(n->nlmsg_flags & NLM_F_CREATE)) {
336 err = -ENOENT;
259 goto errout; 337 goto errout;
338 }
260 339
340 if (!nprio)
341 nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
261 342
262 /* Create new proto tcf */ 343 tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
263 344 protocol, nprio, parent, q);
264 err = -ENOBUFS; 345 if (IS_ERR(tp)) {
265 tp = kzalloc(sizeof(*tp), GFP_KERNEL); 346 err = PTR_ERR(tp);
266 if (tp == NULL)
267 goto errout;
268 err = -ENOENT;
269 tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
270 if (tp_ops == NULL) {
271#ifdef CONFIG_MODULES
272 struct nlattr *kind = tca[TCA_KIND];
273 char name[IFNAMSIZ];
274
275 if (kind != NULL &&
276 nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
277 rtnl_unlock();
278 request_module("cls_%s", name);
279 rtnl_lock();
280 tp_ops = tcf_proto_lookup_ops(kind);
281 /* We dropped the RTNL semaphore in order to
282 * perform the module load. So, even if we
283 * succeeded in loading the module we have to
284 * replay the request. We indicate this using
285 * -EAGAIN.
286 */
287 if (tp_ops != NULL) {
288 module_put(tp_ops->owner);
289 err = -EAGAIN;
290 }
291 }
292#endif
293 kfree(tp);
294 goto errout;
295 }
296 tp->ops = tp_ops;
297 tp->protocol = protocol;
298 tp->prio = nprio ? :
299 TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
300 tp->q = q;
301 tp->classify = tp_ops->classify;
302 tp->classid = parent;
303
304 err = tp_ops->init(tp);
305 if (err != 0) {
306 module_put(tp_ops->owner);
307 kfree(tp);
308 goto errout; 347 goto errout;
309 } 348 }
310
311 tp_created = 1; 349 tp_created = 1;
312 350 } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
313 } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) 351 err = -EINVAL;
314 goto errout; 352 goto errout;
353 }
315 354
316 fh = tp->ops->get(tp, t->tcm_handle); 355 fh = tp->ops->get(tp, t->tcm_handle);
317 356
318 if (fh == 0) { 357 if (fh == 0) {
319 if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { 358 if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
320 struct tcf_proto *next = rtnl_dereference(tp->next); 359 next = rtnl_dereference(tp->next);
321
322 RCU_INIT_POINTER(*back, next); 360 RCU_INIT_POINTER(*back, next);
323
324 tfilter_notify(net, skb, n, tp, fh, 361 tfilter_notify(net, skb, n, tp, fh,
325 RTM_DELTFILTER, false); 362 RTM_DELTFILTER, false);
326 tcf_destroy(tp, true); 363 tcf_proto_destroy(tp, true);
327 err = 0; 364 err = 0;
328 goto errout; 365 goto errout;
329 } 366 }
330 367
331 err = -ENOENT;
332 if (n->nlmsg_type != RTM_NEWTFILTER || 368 if (n->nlmsg_type != RTM_NEWTFILTER ||
333 !(n->nlmsg_flags & NLM_F_CREATE)) 369 !(n->nlmsg_flags & NLM_F_CREATE)) {
370 err = -ENOENT;
334 goto errout; 371 goto errout;
372 }
335 } else { 373 } else {
336 switch (n->nlmsg_type) { 374 switch (n->nlmsg_type) {
337 case RTM_NEWTFILTER: 375 case RTM_NEWTFILTER:
338 err = -EEXIST;
339 if (n->nlmsg_flags & NLM_F_EXCL) { 376 if (n->nlmsg_flags & NLM_F_EXCL) {
340 if (tp_created) 377 if (tp_created)
341 tcf_destroy(tp, true); 378 tcf_proto_destroy(tp, true);
379 err = -EEXIST;
342 goto errout; 380 goto errout;
343 } 381 }
344 break; 382 break;
345 case RTM_DELTFILTER: 383 case RTM_DELTFILTER:
346 err = tp->ops->delete(tp, fh); 384 err = tp->ops->delete(tp, fh);
347 if (err == 0) { 385 if (err)
348 struct tcf_proto *next = rtnl_dereference(tp->next); 386 goto errout;
349 387 next = rtnl_dereference(tp->next);
350 tfilter_notify(net, skb, n, tp, 388 tfilter_notify(net, skb, n, tp, t->tcm_handle,
351 t->tcm_handle, 389 RTM_DELTFILTER, false);
352 RTM_DELTFILTER, false); 390 if (tcf_proto_destroy(tp, false))
353 if (tcf_destroy(tp, false)) 391 RCU_INIT_POINTER(*back, next);
354 RCU_INIT_POINTER(*back, next);
355 }
356 goto errout; 392 goto errout;
357 case RTM_GETTFILTER: 393 case RTM_GETTFILTER:
358 err = tfilter_notify(net, skb, n, tp, fh, 394 err = tfilter_notify(net, skb, n, tp, fh,
@@ -374,7 +410,7 @@ replay:
374 tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false); 410 tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
375 } else { 411 } else {
376 if (tp_created) 412 if (tp_created)
377 tcf_destroy(tp, true); 413 tcf_proto_destroy(tp, true);
378 } 414 }
379 415
380errout: 416errout:
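
The filter-change hunk above ends at the replay path: when tcf_proto_lookup_ops() finds no classifier for TCA_KIND, the handler drops the RTNL lock, loads "cls_<kind>" with request_module(), re-takes the lock and returns -EAGAIN so the whole request is replayed, since anything may have changed while the lock was released. A minimal userspace sketch of that pattern follows; the lock, lookup_ops(), load_module() and create_filter_locked() names are invented for the illustration, and a pthread mutex stands in for RTNL.

/* Illustrative userspace sketch (not kernel code): drop a lock around a slow
 * "module load", then ask the caller to replay the request with -EAGAIN,
 * because the protected state may have changed while the lock was released.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
static int cls_loaded;				/* toggled by the fake module load */

static int lookup_ops(const char *kind)
{
	(void)kind;
	return cls_loaded;
}

static void load_module(const char *kind)	/* stand-in for request_module() */
{
	printf("loading cls_%s\n", kind);
	cls_loaded = 1;
}

static int create_filter_locked(const char *kind)
{
	if (!lookup_ops(kind)) {
		pthread_mutex_unlock(&cfg_lock);	/* don't hold the lock across the slow load */
		load_module(kind);
		pthread_mutex_lock(&cfg_lock);
		return -EAGAIN;				/* state may have changed: caller must replay */
	}
	return 0;
}

int main(void)
{
	int err;

	pthread_mutex_lock(&cfg_lock);
	err = create_filter_locked("flower");
	if (err == -EAGAIN)				/* replay the whole request, as the hunk's comment describes */
		err = create_filter_locked("flower");
	pthread_mutex_unlock(&cfg_lock);
	printf("result: %d\n", err);
	return 0;
}
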
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index d9c97018317d..80f688436dd7 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -148,6 +148,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
148 struct net_device *dev = tp->q->dev_queue->dev; 148 struct net_device *dev = tp->q->dev_queue->dev;
149 struct tc_cls_bpf_offload bpf_offload = {}; 149 struct tc_cls_bpf_offload bpf_offload = {};
150 struct tc_to_netdev offload; 150 struct tc_to_netdev offload;
151 int err;
151 152
152 offload.type = TC_SETUP_CLSBPF; 153 offload.type = TC_SETUP_CLSBPF;
153 offload.cls_bpf = &bpf_offload; 154 offload.cls_bpf = &bpf_offload;
@@ -159,8 +160,13 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
159 bpf_offload.exts_integrated = prog->exts_integrated; 160 bpf_offload.exts_integrated = prog->exts_integrated;
160 bpf_offload.gen_flags = prog->gen_flags; 161 bpf_offload.gen_flags = prog->gen_flags;
161 162
162 return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, 163 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
163 tp->protocol, &offload); 164 tp->protocol, &offload);
165
166 if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE))
167 prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;
168
169 return err;
164} 170}
165 171
166static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, 172static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
@@ -511,6 +517,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
511 return ret; 517 return ret;
512 } 518 }
513 519
520 if (!tc_in_hw(prog->gen_flags))
521 prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
522
514 if (oldprog) { 523 if (oldprog) {
515 list_replace_rcu(&oldprog->link, &prog->link); 524 list_replace_rcu(&oldprog->link, &prog->link);
516 tcf_unbind_filter(tp, &oldprog->res); 525 tcf_unbind_filter(tp, &oldprog->res);
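
The cls_bpf hunk records offload state in the program's gen_flags: a successful ndo_setup_tc() for an ADD or REPLACE sets TCA_CLS_FLAGS_IN_HW, and cls_bpf_change() marks programs that tc_in_hw() does not see in hardware with TCA_CLS_FLAGS_NOT_IN_HW, so userspace can tell where a rule actually runs. A rough sketch of that bookkeeping is below; the FLAG_* values are placeholders rather than the real TCA_CLS_FLAGS_* UAPI constants, and offload_to_hw() is a made-up stand-in for the driver call.

/* Minimal sketch of the in-hardware bookkeeping; flag values are placeholders. */
#include <stdbool.h>
#include <stdio.h>

#define FLAG_SKIP_HW   (1u << 0)
#define FLAG_SKIP_SW   (1u << 1)
#define FLAG_IN_HW     (1u << 2)
#define FLAG_NOT_IN_HW (1u << 3)

static bool flags_in_hw(unsigned int flags)
{
	return flags & FLAG_IN_HW;
}

/* pretend offload: succeed unless the caller asked to skip hardware */
static int offload_to_hw(unsigned int *flags)
{
	if (*flags & FLAG_SKIP_HW)
		return -1;
	*flags |= FLAG_IN_HW;		/* driver accepted the rule */
	return 0;
}

int main(void)
{
	unsigned int flags = FLAG_SKIP_HW;

	offload_to_hw(&flags);
	if (!flags_in_hw(flags))
		flags |= FLAG_NOT_IN_HW;	/* tell userspace the rule runs in software only */

	printf("flags=%#x\n", flags);
	return 0;
}
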
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 6575aba87630..3d6b9286c203 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -129,7 +129,7 @@ static u32 flow_get_mark(const struct sk_buff *skb)
129static u32 flow_get_nfct(const struct sk_buff *skb) 129static u32 flow_get_nfct(const struct sk_buff *skb)
130{ 130{
131#if IS_ENABLED(CONFIG_NF_CONNTRACK) 131#if IS_ENABLED(CONFIG_NF_CONNTRACK)
132 return addr_fold(skb->nfct); 132 return addr_fold(skb_nfct(skb));
133#else 133#else
134 return 0; 134 return 0;
135#endif 135#endif
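
The cls_flow one-liner replaces a direct read of skb->nfct with the skb_nfct() helper, so the classifier keeps compiling regardless of how the conntrack pointer is stored in the skb. The accessor pattern itself looks roughly like this; struct pkt, pkt_conntrack() and HAVE_CONNTRACK are invented for the sketch and are not kernel names.

/* Illustrative accessor pattern: callers use a helper instead of poking a
 * struct field, so the field can be compiled out without touching them.
 */
#include <stdio.h>

#define HAVE_CONNTRACK 1

struct pkt {
	int len;
#if HAVE_CONNTRACK
	void *conntrack;
#endif
};

static inline void *pkt_conntrack(const struct pkt *p)
{
#if HAVE_CONNTRACK
	return p->conntrack;
#else
	return NULL;
#endif
}

int main(void)
{
	struct pkt p = { .len = 64 };

	printf("ct=%p\n", pkt_conntrack(&p));	/* works with either configuration */
	return 0;
}
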
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 970db7a41684..9d0c99d2e9fb 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -40,6 +40,7 @@ struct fl_flow_key {
40 }; 40 };
41 struct flow_dissector_key_ports tp; 41 struct flow_dissector_key_ports tp;
42 struct flow_dissector_key_icmp icmp; 42 struct flow_dissector_key_icmp icmp;
43 struct flow_dissector_key_arp arp;
43 struct flow_dissector_key_keyid enc_key_id; 44 struct flow_dissector_key_keyid enc_key_id;
44 union { 45 union {
45 struct flow_dissector_key_ipv4_addrs enc_ipv4; 46 struct flow_dissector_key_ipv4_addrs enc_ipv4;
@@ -133,6 +134,14 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
133 memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); 134 memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
134} 135}
135 136
137static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head,
138 struct fl_flow_key *mkey)
139{
140 return rhashtable_lookup_fast(&head->ht,
141 fl_key_get_start(mkey, &head->mask),
142 head->ht_params);
143}
144
136static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, 145static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
137 struct tcf_result *res) 146 struct tcf_result *res)
138{ 147{
@@ -180,9 +189,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
180 189
181 fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); 190 fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
182 191
183 f = rhashtable_lookup_fast(&head->ht, 192 f = fl_lookup(head, &skb_mkey);
184 fl_key_get_start(&skb_mkey, &head->mask),
185 head->ht_params);
186 if (f && !tc_skip_sw(f->flags)) { 193 if (f && !tc_skip_sw(f->flags)) {
187 *res = f->res; 194 *res = f->res;
188 return tcf_exts_exec(skb, &f->exts, res); 195 return tcf_exts_exec(skb, &f->exts, res);
@@ -222,6 +229,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
222 return; 229 return;
223 230
224 offload.command = TC_CLSFLOWER_DESTROY; 231 offload.command = TC_CLSFLOWER_DESTROY;
232 offload.prio = tp->prio;
225 offload.cookie = (unsigned long)f; 233 offload.cookie = (unsigned long)f;
226 234
227 tc->type = TC_SETUP_CLSFLOWER; 235 tc->type = TC_SETUP_CLSFLOWER;
@@ -253,6 +261,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
253 } 261 }
254 262
255 offload.command = TC_CLSFLOWER_REPLACE; 263 offload.command = TC_CLSFLOWER_REPLACE;
264 offload.prio = tp->prio;
256 offload.cookie = (unsigned long)f; 265 offload.cookie = (unsigned long)f;
257 offload.dissector = dissector; 266 offload.dissector = dissector;
258 offload.mask = mask; 267 offload.mask = mask;
@@ -264,6 +273,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
264 273
265 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, 274 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
266 tc); 275 tc);
276 if (!err)
277 f->flags |= TCA_CLS_FLAGS_IN_HW;
267 278
268 if (tc_skip_sw(f->flags)) 279 if (tc_skip_sw(f->flags))
269 return err; 280 return err;
@@ -280,6 +291,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
280 return; 291 return;
281 292
282 offload.command = TC_CLSFLOWER_STATS; 293 offload.command = TC_CLSFLOWER_STATS;
294 offload.prio = tp->prio;
283 offload.cookie = (unsigned long)f; 295 offload.cookie = (unsigned long)f;
284 offload.exts = &f->exts; 296 offload.exts = &f->exts;
285 297
@@ -401,6 +413,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
401 [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 }, 413 [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 },
402 [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 }, 414 [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 },
403 [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 }, 415 [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 },
416 [TCA_FLOWER_KEY_ARP_SIP] = { .type = NLA_U32 },
417 [TCA_FLOWER_KEY_ARP_SIP_MASK] = { .type = NLA_U32 },
418 [TCA_FLOWER_KEY_ARP_TIP] = { .type = NLA_U32 },
419 [TCA_FLOWER_KEY_ARP_TIP_MASK] = { .type = NLA_U32 },
420 [TCA_FLOWER_KEY_ARP_OP] = { .type = NLA_U8 },
421 [TCA_FLOWER_KEY_ARP_OP_MASK] = { .type = NLA_U8 },
422 [TCA_FLOWER_KEY_ARP_SHA] = { .len = ETH_ALEN },
423 [TCA_FLOWER_KEY_ARP_SHA_MASK] = { .len = ETH_ALEN },
424 [TCA_FLOWER_KEY_ARP_THA] = { .len = ETH_ALEN },
425 [TCA_FLOWER_KEY_ARP_THA_MASK] = { .len = ETH_ALEN },
404}; 426};
405 427
406static void fl_set_key_val(struct nlattr **tb, 428static void fl_set_key_val(struct nlattr **tb,
@@ -568,10 +590,27 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
568 &mask->icmp.type, 590 &mask->icmp.type,
569 TCA_FLOWER_KEY_ICMPV6_TYPE_MASK, 591 TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,
570 sizeof(key->icmp.type)); 592 sizeof(key->icmp.type));
571 fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE, 593 fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE,
572 &mask->icmp.code, 594 &mask->icmp.code,
573 TCA_FLOWER_KEY_ICMPV4_CODE_MASK, 595 TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
574 sizeof(key->icmp.code)); 596 sizeof(key->icmp.code));
597 } else if (key->basic.n_proto == htons(ETH_P_ARP) ||
598 key->basic.n_proto == htons(ETH_P_RARP)) {
599 fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP,
600 &mask->arp.sip, TCA_FLOWER_KEY_ARP_SIP_MASK,
601 sizeof(key->arp.sip));
602 fl_set_key_val(tb, &key->arp.tip, TCA_FLOWER_KEY_ARP_TIP,
603 &mask->arp.tip, TCA_FLOWER_KEY_ARP_TIP_MASK,
604 sizeof(key->arp.tip));
605 fl_set_key_val(tb, &key->arp.op, TCA_FLOWER_KEY_ARP_OP,
606 &mask->arp.op, TCA_FLOWER_KEY_ARP_OP_MASK,
607 sizeof(key->arp.op));
608 fl_set_key_val(tb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
609 mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
610 sizeof(key->arp.sha));
611 fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
612 mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
613 sizeof(key->arp.tha));
575 } 614 }
576 615
577 if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || 616 if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
@@ -689,6 +728,8 @@ static void fl_init_dissector(struct cls_fl_head *head,
689 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 728 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
690 FLOW_DISSECTOR_KEY_ICMP, icmp); 729 FLOW_DISSECTOR_KEY_ICMP, icmp);
691 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 730 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
731 FLOW_DISSECTOR_KEY_ARP, arp);
732 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
692 FLOW_DISSECTOR_KEY_VLAN, vlan); 733 FLOW_DISSECTOR_KEY_VLAN, vlan);
693 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 734 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
694 FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); 735 FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
@@ -796,23 +837,31 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
796 struct cls_fl_head *head = rtnl_dereference(tp->root); 837 struct cls_fl_head *head = rtnl_dereference(tp->root);
797 struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg; 838 struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg;
798 struct cls_fl_filter *fnew; 839 struct cls_fl_filter *fnew;
799 struct nlattr *tb[TCA_FLOWER_MAX + 1]; 840 struct nlattr **tb;
800 struct fl_flow_mask mask = {}; 841 struct fl_flow_mask mask = {};
801 int err; 842 int err;
802 843
803 if (!tca[TCA_OPTIONS]) 844 if (!tca[TCA_OPTIONS])
804 return -EINVAL; 845 return -EINVAL;
805 846
847 tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
848 if (!tb)
849 return -ENOBUFS;
850
806 err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy); 851 err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy);
807 if (err < 0) 852 if (err < 0)
808 return err; 853 goto errout_tb;
809 854
810 if (fold && handle && fold->handle != handle) 855 if (fold && handle && fold->handle != handle) {
811 return -EINVAL; 856 err = -EINVAL;
857 goto errout_tb;
858 }
812 859
813 fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); 860 fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
814 if (!fnew) 861 if (!fnew) {
815 return -ENOBUFS; 862 err = -ENOBUFS;
863 goto errout_tb;
864 }
816 865
817 err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); 866 err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
818 if (err < 0) 867 if (err < 0)
@@ -845,6 +894,11 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
845 goto errout; 894 goto errout;
846 895
847 if (!tc_skip_sw(fnew->flags)) { 896 if (!tc_skip_sw(fnew->flags)) {
897 if (!fold && fl_lookup(head, &fnew->mkey)) {
898 err = -EEXIST;
899 goto errout;
900 }
901
848 err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, 902 err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
849 head->ht_params); 903 head->ht_params);
850 if (err) 904 if (err)
@@ -860,6 +914,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
860 goto errout; 914 goto errout;
861 } 915 }
862 916
917 if (!tc_in_hw(fnew->flags))
918 fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
919
863 if (fold) { 920 if (fold) {
864 if (!tc_skip_sw(fold->flags)) 921 if (!tc_skip_sw(fold->flags))
865 rhashtable_remove_fast(&head->ht, &fold->ht_node, 922 rhashtable_remove_fast(&head->ht, &fold->ht_node,
@@ -878,11 +935,14 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
878 list_add_tail_rcu(&fnew->list, &head->filters); 935 list_add_tail_rcu(&fnew->list, &head->filters);
879 } 936 }
880 937
938 kfree(tb);
881 return 0; 939 return 0;
882 940
883errout: 941errout:
884 tcf_exts_destroy(&fnew->exts); 942 tcf_exts_destroy(&fnew->exts);
885 kfree(fnew); 943 kfree(fnew);
944errout_tb:
945 kfree(tb);
886 return err; 946 return err;
887} 947}
888 948
@@ -1112,6 +1172,27 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
1112 TCA_FLOWER_KEY_ICMPV6_CODE_MASK, 1172 TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
1113 sizeof(key->icmp.code)))) 1173 sizeof(key->icmp.code))))
1114 goto nla_put_failure; 1174 goto nla_put_failure;
1175 else if ((key->basic.n_proto == htons(ETH_P_ARP) ||
1176 key->basic.n_proto == htons(ETH_P_RARP)) &&
1177 (fl_dump_key_val(skb, &key->arp.sip,
1178 TCA_FLOWER_KEY_ARP_SIP, &mask->arp.sip,
1179 TCA_FLOWER_KEY_ARP_SIP_MASK,
1180 sizeof(key->arp.sip)) ||
1181 fl_dump_key_val(skb, &key->arp.tip,
1182 TCA_FLOWER_KEY_ARP_TIP, &mask->arp.tip,
1183 TCA_FLOWER_KEY_ARP_TIP_MASK,
1184 sizeof(key->arp.tip)) ||
1185 fl_dump_key_val(skb, &key->arp.op,
1186 TCA_FLOWER_KEY_ARP_OP, &mask->arp.op,
1187 TCA_FLOWER_KEY_ARP_OP_MASK,
1188 sizeof(key->arp.op)) ||
1189 fl_dump_key_val(skb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
1190 mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
1191 sizeof(key->arp.sha)) ||
1192 fl_dump_key_val(skb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
1193 mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
1194 sizeof(key->arp.tha))))
1195 goto nla_put_failure;
1115 1196
1116 if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && 1197 if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
1117 (fl_dump_key_val(skb, &key->enc_ipv4.src, 1198 (fl_dump_key_val(skb, &key->enc_ipv4.src,
@@ -1153,7 +1234,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
1153 if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) 1234 if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
1154 goto nla_put_failure; 1235 goto nla_put_failure;
1155 1236
1156 nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); 1237 if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
1238 goto nla_put_failure;
1157 1239
1158 if (tcf_exts_dump(skb, &f->exts)) 1240 if (tcf_exts_dump(skb, &f->exts))
1159 goto nla_put_failure; 1241 goto nla_put_failure;
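
Besides the ARP keys, the cls_flower hunk adds fl_lookup(), a thin wrapper around the masked rhashtable lookup, and uses it in fl_change() to return -EEXIST when a new filter's masked key collides with an existing one. The masked-match idea can be sketched without the hash table; the linear scan below is only for brevity and every name in it is illustrative.

/* Sketch of masked-key matching: apply the mask to the lookup key, then look
 * for an existing filter with the same masked bytes. A real implementation
 * hashes the masked key; a linear scan keeps the sketch short.
 */
#include <stdio.h>
#include <string.h>

#define KEY_LEN 8

struct filter {
	unsigned char mkey[KEY_LEN];	/* key with the mask already applied */
};

static void mask_key(unsigned char *dst, const unsigned char *key,
		     const unsigned char *mask)
{
	for (int i = 0; i < KEY_LEN; i++)
		dst[i] = key[i] & mask[i];
}

static struct filter *lookup(struct filter *tbl, int n,
			     const unsigned char *mkey)
{
	for (int i = 0; i < n; i++)
		if (!memcmp(tbl[i].mkey, mkey, KEY_LEN))
			return &tbl[i];
	return NULL;
}

int main(void)
{
	unsigned char mask[KEY_LEN] = { 0xff, 0xff, 0xff, 0x00 };	/* match first 3 bytes only */
	struct filter tbl[1] = { { { 0x0a, 0x00, 0x01, 0x00 } } };
	unsigned char key[KEY_LEN] = { 0x0a, 0x00, 0x01, 0x77 };	/* differs only in a masked-out byte */
	unsigned char mkey[KEY_LEN];

	mask_key(mkey, key, mask);
	printf("duplicate: %s\n", lookup(tbl, 1, mkey) ? "yes (-EEXIST)" : "no");
	return 0;
}
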
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index f935429bd5ef..224eb2c14346 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -16,16 +16,11 @@
16#include <net/sch_generic.h> 16#include <net/sch_generic.h>
17#include <net/pkt_cls.h> 17#include <net/pkt_cls.h>
18 18
19struct cls_mall_filter { 19struct cls_mall_head {
20 struct tcf_exts exts; 20 struct tcf_exts exts;
21 struct tcf_result res; 21 struct tcf_result res;
22 u32 handle; 22 u32 handle;
23 struct rcu_head rcu;
24 u32 flags; 23 u32 flags;
25};
26
27struct cls_mall_head {
28 struct cls_mall_filter *filter;
29 struct rcu_head rcu; 24 struct rcu_head rcu;
30}; 25};
31 26
@@ -33,56 +28,52 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
33 struct tcf_result *res) 28 struct tcf_result *res)
34{ 29{
35 struct cls_mall_head *head = rcu_dereference_bh(tp->root); 30 struct cls_mall_head *head = rcu_dereference_bh(tp->root);
36 struct cls_mall_filter *f = head->filter;
37 31
38 if (tc_skip_sw(f->flags)) 32 if (tc_skip_sw(head->flags))
39 return -1; 33 return -1;
40 34
41 return tcf_exts_exec(skb, &f->exts, res); 35 return tcf_exts_exec(skb, &head->exts, res);
42} 36}
43 37
44static int mall_init(struct tcf_proto *tp) 38static int mall_init(struct tcf_proto *tp)
45{ 39{
46 struct cls_mall_head *head;
47
48 head = kzalloc(sizeof(*head), GFP_KERNEL);
49 if (!head)
50 return -ENOBUFS;
51
52 rcu_assign_pointer(tp->root, head);
53
54 return 0; 40 return 0;
55} 41}
56 42
57static void mall_destroy_filter(struct rcu_head *head) 43static void mall_destroy_rcu(struct rcu_head *rcu)
58{ 44{
59 struct cls_mall_filter *f = container_of(head, struct cls_mall_filter, rcu); 45 struct cls_mall_head *head = container_of(rcu, struct cls_mall_head,
46 rcu);
60 47
61 tcf_exts_destroy(&f->exts); 48 tcf_exts_destroy(&head->exts);
62 49 kfree(head);
63 kfree(f);
64} 50}
65 51
66static int mall_replace_hw_filter(struct tcf_proto *tp, 52static int mall_replace_hw_filter(struct tcf_proto *tp,
67 struct cls_mall_filter *f, 53 struct cls_mall_head *head,
68 unsigned long cookie) 54 unsigned long cookie)
69{ 55{
70 struct net_device *dev = tp->q->dev_queue->dev; 56 struct net_device *dev = tp->q->dev_queue->dev;
71 struct tc_to_netdev offload; 57 struct tc_to_netdev offload;
72 struct tc_cls_matchall_offload mall_offload = {0}; 58 struct tc_cls_matchall_offload mall_offload = {0};
59 int err;
73 60
74 offload.type = TC_SETUP_MATCHALL; 61 offload.type = TC_SETUP_MATCHALL;
75 offload.cls_mall = &mall_offload; 62 offload.cls_mall = &mall_offload;
76 offload.cls_mall->command = TC_CLSMATCHALL_REPLACE; 63 offload.cls_mall->command = TC_CLSMATCHALL_REPLACE;
77 offload.cls_mall->exts = &f->exts; 64 offload.cls_mall->exts = &head->exts;
78 offload.cls_mall->cookie = cookie; 65 offload.cls_mall->cookie = cookie;
79 66
80 return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, 67 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
81 &offload); 68 &offload);
69 if (!err)
70 head->flags |= TCA_CLS_FLAGS_IN_HW;
71
72 return err;
82} 73}
83 74
84static void mall_destroy_hw_filter(struct tcf_proto *tp, 75static void mall_destroy_hw_filter(struct tcf_proto *tp,
85 struct cls_mall_filter *f, 76 struct cls_mall_head *head,
86 unsigned long cookie) 77 unsigned long cookie)
87{ 78{
88 struct net_device *dev = tp->q->dev_queue->dev; 79 struct net_device *dev = tp->q->dev_queue->dev;
@@ -103,29 +94,20 @@ static bool mall_destroy(struct tcf_proto *tp, bool force)
103{ 94{
104 struct cls_mall_head *head = rtnl_dereference(tp->root); 95 struct cls_mall_head *head = rtnl_dereference(tp->root);
105 struct net_device *dev = tp->q->dev_queue->dev; 96 struct net_device *dev = tp->q->dev_queue->dev;
106 struct cls_mall_filter *f = head->filter;
107 97
108 if (!force && f) 98 if (!head)
109 return false; 99 return true;
110 100
111 if (f) { 101 if (tc_should_offload(dev, tp, head->flags))
112 if (tc_should_offload(dev, tp, f->flags)) 102 mall_destroy_hw_filter(tp, head, (unsigned long) head);
113 mall_destroy_hw_filter(tp, f, (unsigned long) f);
114 103
115 call_rcu(&f->rcu, mall_destroy_filter); 104 call_rcu(&head->rcu, mall_destroy_rcu);
116 }
117 kfree_rcu(head, rcu);
118 return true; 105 return true;
119} 106}
120 107
121static unsigned long mall_get(struct tcf_proto *tp, u32 handle) 108static unsigned long mall_get(struct tcf_proto *tp, u32 handle)
122{ 109{
123 struct cls_mall_head *head = rtnl_dereference(tp->root); 110 return 0UL;
124 struct cls_mall_filter *f = head->filter;
125
126 if (f && f->handle == handle)
127 return (unsigned long) f;
128 return 0;
129} 111}
130 112
131static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { 113static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
@@ -134,26 +116,31 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
134}; 116};
135 117
136static int mall_set_parms(struct net *net, struct tcf_proto *tp, 118static int mall_set_parms(struct net *net, struct tcf_proto *tp,
137 struct cls_mall_filter *f, 119 struct cls_mall_head *head,
138 unsigned long base, struct nlattr **tb, 120 unsigned long base, struct nlattr **tb,
139 struct nlattr *est, bool ovr) 121 struct nlattr *est, bool ovr)
140{ 122{
141 struct tcf_exts e; 123 struct tcf_exts e;
142 int err; 124 int err;
143 125
144 tcf_exts_init(&e, TCA_MATCHALL_ACT, 0); 126 err = tcf_exts_init(&e, TCA_MATCHALL_ACT, 0);
127 if (err)
128 return err;
145 err = tcf_exts_validate(net, tp, tb, est, &e, ovr); 129 err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
146 if (err < 0) 130 if (err < 0)
147 return err; 131 goto errout;
148 132
149 if (tb[TCA_MATCHALL_CLASSID]) { 133 if (tb[TCA_MATCHALL_CLASSID]) {
150 f->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]); 134 head->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
151 tcf_bind_filter(tp, &f->res, base); 135 tcf_bind_filter(tp, &head->res, base);
152 } 136 }
153 137
154 tcf_exts_change(tp, &f->exts, &e); 138 tcf_exts_change(tp, &head->exts, &e);
155 139
156 return 0; 140 return 0;
141errout:
142 tcf_exts_destroy(&e);
143 return err;
157} 144}
158 145
159static int mall_change(struct net *net, struct sk_buff *in_skb, 146static int mall_change(struct net *net, struct sk_buff *in_skb,
@@ -162,21 +149,17 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
162 unsigned long *arg, bool ovr) 149 unsigned long *arg, bool ovr)
163{ 150{
164 struct cls_mall_head *head = rtnl_dereference(tp->root); 151 struct cls_mall_head *head = rtnl_dereference(tp->root);
165 struct cls_mall_filter *fold = (struct cls_mall_filter *) *arg;
166 struct net_device *dev = tp->q->dev_queue->dev; 152 struct net_device *dev = tp->q->dev_queue->dev;
167 struct cls_mall_filter *f;
168 struct nlattr *tb[TCA_MATCHALL_MAX + 1]; 153 struct nlattr *tb[TCA_MATCHALL_MAX + 1];
154 struct cls_mall_head *new;
169 u32 flags = 0; 155 u32 flags = 0;
170 int err; 156 int err;
171 157
172 if (!tca[TCA_OPTIONS]) 158 if (!tca[TCA_OPTIONS])
173 return -EINVAL; 159 return -EINVAL;
174 160
175 if (head->filter) 161 if (head)
176 return -EBUSY; 162 return -EEXIST;
177
178 if (fold)
179 return -EINVAL;
180 163
181 err = nla_parse_nested(tb, TCA_MATCHALL_MAX, 164 err = nla_parse_nested(tb, TCA_MATCHALL_MAX,
182 tca[TCA_OPTIONS], mall_policy); 165 tca[TCA_OPTIONS], mall_policy);
@@ -189,64 +172,62 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
189 return -EINVAL; 172 return -EINVAL;
190 } 173 }
191 174
192 f = kzalloc(sizeof(*f), GFP_KERNEL); 175 new = kzalloc(sizeof(*new), GFP_KERNEL);
193 if (!f) 176 if (!new)
194 return -ENOBUFS; 177 return -ENOBUFS;
195 178
196 tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0); 179 err = tcf_exts_init(&new->exts, TCA_MATCHALL_ACT, 0);
180 if (err)
181 goto err_exts_init;
197 182
198 if (!handle) 183 if (!handle)
199 handle = 1; 184 handle = 1;
200 f->handle = handle; 185 new->handle = handle;
201 f->flags = flags; 186 new->flags = flags;
202 187
203 err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); 188 err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr);
204 if (err) 189 if (err)
205 goto errout; 190 goto err_set_parms;
206 191
207 if (tc_should_offload(dev, tp, flags)) { 192 if (tc_should_offload(dev, tp, flags)) {
208 err = mall_replace_hw_filter(tp, f, (unsigned long) f); 193 err = mall_replace_hw_filter(tp, new, (unsigned long) new);
209 if (err) { 194 if (err) {
210 if (tc_skip_sw(flags)) 195 if (tc_skip_sw(flags))
211 goto errout; 196 goto err_replace_hw_filter;
212 else 197 else
213 err = 0; 198 err = 0;
214 } 199 }
215 } 200 }
216 201
217 *arg = (unsigned long) f; 202 if (!tc_in_hw(new->flags))
218 rcu_assign_pointer(head->filter, f); 203 new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
219 204
205 *arg = (unsigned long) head;
206 rcu_assign_pointer(tp->root, new);
207 if (head)
208 call_rcu(&head->rcu, mall_destroy_rcu);
220 return 0; 209 return 0;
221 210
222errout: 211err_replace_hw_filter:
223 kfree(f); 212err_set_parms:
213 tcf_exts_destroy(&new->exts);
214err_exts_init:
215 kfree(new);
224 return err; 216 return err;
225} 217}
226 218
227static int mall_delete(struct tcf_proto *tp, unsigned long arg) 219static int mall_delete(struct tcf_proto *tp, unsigned long arg)
228{ 220{
229 struct cls_mall_head *head = rtnl_dereference(tp->root); 221 return -EOPNOTSUPP;
230 struct cls_mall_filter *f = (struct cls_mall_filter *) arg;
231 struct net_device *dev = tp->q->dev_queue->dev;
232
233 if (tc_should_offload(dev, tp, f->flags))
234 mall_destroy_hw_filter(tp, f, (unsigned long) f);
235
236 RCU_INIT_POINTER(head->filter, NULL);
237 tcf_unbind_filter(tp, &f->res);
238 call_rcu(&f->rcu, mall_destroy_filter);
239 return 0;
240} 222}
241 223
242static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg) 224static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg)
243{ 225{
244 struct cls_mall_head *head = rtnl_dereference(tp->root); 226 struct cls_mall_head *head = rtnl_dereference(tp->root);
245 struct cls_mall_filter *f = head->filter;
246 227
247 if (arg->count < arg->skip) 228 if (arg->count < arg->skip)
248 goto skip; 229 goto skip;
249 if (arg->fn(tp, (unsigned long) f, arg) < 0) 230 if (arg->fn(tp, (unsigned long) head, arg) < 0)
250 arg->stop = 1; 231 arg->stop = 1;
251skip: 232skip:
252 arg->count++; 233 arg->count++;
@@ -255,28 +236,31 @@ skip:
255static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, 236static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
256 struct sk_buff *skb, struct tcmsg *t) 237 struct sk_buff *skb, struct tcmsg *t)
257{ 238{
258 struct cls_mall_filter *f = (struct cls_mall_filter *) fh; 239 struct cls_mall_head *head = (struct cls_mall_head *) fh;
259 struct nlattr *nest; 240 struct nlattr *nest;
260 241
261 if (!f) 242 if (!head)
262 return skb->len; 243 return skb->len;
263 244
264 t->tcm_handle = f->handle; 245 t->tcm_handle = head->handle;
265 246
266 nest = nla_nest_start(skb, TCA_OPTIONS); 247 nest = nla_nest_start(skb, TCA_OPTIONS);
267 if (!nest) 248 if (!nest)
268 goto nla_put_failure; 249 goto nla_put_failure;
269 250
270 if (f->res.classid && 251 if (head->res.classid &&
271 nla_put_u32(skb, TCA_MATCHALL_CLASSID, f->res.classid)) 252 nla_put_u32(skb, TCA_MATCHALL_CLASSID, head->res.classid))
253 goto nla_put_failure;
254
255 if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags))
272 goto nla_put_failure; 256 goto nla_put_failure;
273 257
274 if (tcf_exts_dump(skb, &f->exts)) 258 if (tcf_exts_dump(skb, &head->exts))
275 goto nla_put_failure; 259 goto nla_put_failure;
276 260
277 nla_nest_end(skb, nest); 261 nla_nest_end(skb, nest);
278 262
279 if (tcf_exts_dump_stats(skb, &f->exts) < 0) 263 if (tcf_exts_dump_stats(skb, &head->exts) < 0)
280 goto nla_put_failure; 264 goto nla_put_failure;
281 265
282 return skb->len; 266 return skb->len;
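
cls_matchall collapses the separate cls_mall_filter into a single cls_mall_head per tcf_proto: mall_change() builds a new head, publishes it with rcu_assign_pointer(tp->root, new) and retires the old one through call_rcu(). The publish-then-retire hand-off, minus real RCU, can be sketched with C11 atomics; a genuine implementation must also wait for readers before freeing, which this sketch does not do.

/* Userspace sketch of "build new state, publish the pointer, retire the old":
 * C11 atomics stand in for rcu_assign_pointer(); the kernel defers the free
 * with call_rcu() instead of freeing immediately as done here.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct head {
	unsigned int handle;
	unsigned int flags;
};

static _Atomic(struct head *) root;

static int replace_head(unsigned int handle, unsigned int flags)
{
	struct head *new, *old;

	new = calloc(1, sizeof(*new));
	if (!new)
		return -1;
	new->handle = handle;
	new->flags = flags;

	old = atomic_exchange(&root, new);	/* publish the new head */
	free(old);				/* kernel would defer this until readers are done */
	return 0;
}

int main(void)
{
	replace_head(1, 0);
	replace_head(1, 0x8);
	printf("handle=%u\n", atomic_load(&root)->handle);
	free(atomic_load(&root));
	return 0;
}
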
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index ae83c3aec308..4dbe0c680fe6 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -334,7 +334,6 @@ static int u32_init(struct tcf_proto *tp)
334 if (root_ht == NULL) 334 if (root_ht == NULL)
335 return -ENOBUFS; 335 return -ENOBUFS;
336 336
337 root_ht->divisor = 0;
338 root_ht->refcnt++; 337 root_ht->refcnt++;
339 root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; 338 root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
340 root_ht->prio = tp->prio; 339 root_ht->prio = tp->prio;
@@ -524,6 +523,10 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
524 523
525 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, 524 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
526 tp->protocol, &offload); 525 tp->protocol, &offload);
526
527 if (!err)
528 n->flags |= TCA_CLS_FLAGS_IN_HW;
529
527 if (tc_skip_sw(flags)) 530 if (tc_skip_sw(flags))
528 return err; 531 return err;
529 532
@@ -896,6 +899,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
896 return err; 899 return err;
897 } 900 }
898 901
902 if (!tc_in_hw(new->flags))
903 new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
904
899 u32_replace_knode(tp, tp_c, new); 905 u32_replace_knode(tp, tp_c, new);
900 tcf_unbind_filter(tp, &n->res); 906 tcf_unbind_filter(tp, &n->res);
901 call_rcu(&n->rcu, u32_delete_key_rcu); 907 call_rcu(&n->rcu, u32_delete_key_rcu);
@@ -1015,6 +1021,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
1015 if (err) 1021 if (err)
1016 goto errhw; 1022 goto errhw;
1017 1023
1024 if (!tc_in_hw(n->flags))
1025 n->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
1026
1018 ins = &ht->ht[TC_U32_HASH(handle)]; 1027 ins = &ht->ht[TC_U32_HASH(handle)];
1019 for (pins = rtnl_dereference(*ins); pins; 1028 for (pins = rtnl_dereference(*ins); pins;
1020 ins = &pins->next, pins = rtnl_dereference(*ins)) 1029 ins = &pins->next, pins = rtnl_dereference(*ins))
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 41c80b6c3906..ae7e4f5b348b 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -63,6 +63,7 @@
63#include <linux/types.h> 63#include <linux/types.h>
64#include <linux/kernel.h> 64#include <linux/kernel.h>
65#include <linux/sched.h> 65#include <linux/sched.h>
66#include <linux/sched/loadavg.h>
66#include <linux/string.h> 67#include <linux/string.h>
67#include <linux/skbuff.h> 68#include <linux/skbuff.h>
68#include <linux/random.h> 69#include <linux/random.h>
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d7b93429f0cc..bcf49cd22786 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -440,7 +440,6 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab)
440EXPORT_SYMBOL(qdisc_put_rtab); 440EXPORT_SYMBOL(qdisc_put_rtab);
441 441
442static LIST_HEAD(qdisc_stab_list); 442static LIST_HEAD(qdisc_stab_list);
443static DEFINE_SPINLOCK(qdisc_stab_lock);
444 443
445static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { 444static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
446 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, 445 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
@@ -474,20 +473,15 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
474 if (tsize != s->tsize || (!tab && tsize > 0)) 473 if (tsize != s->tsize || (!tab && tsize > 0))
475 return ERR_PTR(-EINVAL); 474 return ERR_PTR(-EINVAL);
476 475
477 spin_lock(&qdisc_stab_lock);
478
479 list_for_each_entry(stab, &qdisc_stab_list, list) { 476 list_for_each_entry(stab, &qdisc_stab_list, list) {
480 if (memcmp(&stab->szopts, s, sizeof(*s))) 477 if (memcmp(&stab->szopts, s, sizeof(*s)))
481 continue; 478 continue;
482 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16))) 479 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
483 continue; 480 continue;
484 stab->refcnt++; 481 stab->refcnt++;
485 spin_unlock(&qdisc_stab_lock);
486 return stab; 482 return stab;
487 } 483 }
488 484
489 spin_unlock(&qdisc_stab_lock);
490
491 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL); 485 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
492 if (!stab) 486 if (!stab)
493 return ERR_PTR(-ENOMEM); 487 return ERR_PTR(-ENOMEM);
@@ -497,9 +491,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
497 if (tsize > 0) 491 if (tsize > 0)
498 memcpy(stab->data, tab, tsize * sizeof(u16)); 492 memcpy(stab->data, tab, tsize * sizeof(u16));
499 493
500 spin_lock(&qdisc_stab_lock);
501 list_add_tail(&stab->list, &qdisc_stab_list); 494 list_add_tail(&stab->list, &qdisc_stab_list);
502 spin_unlock(&qdisc_stab_lock);
503 495
504 return stab; 496 return stab;
505} 497}
@@ -514,14 +506,10 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
514 if (!tab) 506 if (!tab)
515 return; 507 return;
516 508
517 spin_lock(&qdisc_stab_lock);
518
519 if (--tab->refcnt == 0) { 509 if (--tab->refcnt == 0) {
520 list_del(&tab->list); 510 list_del(&tab->list);
521 call_rcu_bh(&tab->rcu, stab_kfree_rcu); 511 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
522 } 512 }
523
524 spin_unlock(&qdisc_stab_lock);
525} 513}
526EXPORT_SYMBOL(qdisc_put_stab); 514EXPORT_SYMBOL(qdisc_put_stab);
527 515
@@ -1019,6 +1007,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
1019 1007
1020 return sch; 1008 return sch;
1021 } 1009 }
1010 /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1011 ops->destroy(sch);
1022err_out3: 1012err_out3:
1023 dev_put(dev); 1013 dev_put(dev);
1024 kfree((char *) sch - sch->padded); 1014 kfree((char *) sch - sch->padded);
@@ -1861,6 +1851,7 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
1861{ 1851{
1862 __be16 protocol = tc_skb_protocol(skb); 1852 __be16 protocol = tc_skb_protocol(skb);
1863#ifdef CONFIG_NET_CLS_ACT 1853#ifdef CONFIG_NET_CLS_ACT
1854 const int max_reclassify_loop = 4;
1864 const struct tcf_proto *old_tp = tp; 1855 const struct tcf_proto *old_tp = tp;
1865 int limit = 0; 1856 int limit = 0;
1866 1857
@@ -1885,7 +1876,7 @@ reclassify:
1885 return TC_ACT_UNSPEC; /* signal: continue lookup */ 1876 return TC_ACT_UNSPEC; /* signal: continue lookup */
1886#ifdef CONFIG_NET_CLS_ACT 1877#ifdef CONFIG_NET_CLS_ACT
1887reset: 1878reset:
1888 if (unlikely(limit++ >= MAX_REC_LOOP)) { 1879 if (unlikely(limit++ >= max_reclassify_loop)) {
1889 net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", 1880 net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
1890 tp->q->ops->id, tp->prio & 0xffff, 1881 tp->q->ops->id, tp->prio & 0xffff,
1891 ntohs(tp->protocol)); 1882 ntohs(tp->protocol));
@@ -1899,28 +1890,6 @@ reset:
1899} 1890}
1900EXPORT_SYMBOL(tc_classify); 1891EXPORT_SYMBOL(tc_classify);
1901 1892
1902bool tcf_destroy(struct tcf_proto *tp, bool force)
1903{
1904 if (tp->ops->destroy(tp, force)) {
1905 module_put(tp->ops->owner);
1906 kfree_rcu(tp, rcu);
1907 return true;
1908 }
1909
1910 return false;
1911}
1912
1913void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1914{
1915 struct tcf_proto *tp;
1916
1917 while ((tp = rtnl_dereference(*fl)) != NULL) {
1918 RCU_INIT_POINTER(*fl, tp->next);
1919 tcf_destroy(tp, true);
1920 }
1921}
1922EXPORT_SYMBOL(tcf_destroy_chain);
1923
1924#ifdef CONFIG_PROC_FS 1893#ifdef CONFIG_PROC_FS
1925static int psched_show(struct seq_file *seq, void *v) 1894static int psched_show(struct seq_file *seq, void *v)
1926{ 1895{
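
In the sch_api.c hunk the qdisc_stab_list spinlock goes away (RTNL already serialises those paths), qdisc_create() learns to call ->destroy() when ->init() fails, the reclassification limit becomes a local max_reclassify_loop constant of 4, and the tcf_destroy helpers move out of the file (renamed tcf_proto_destroy in the filter hunk above). The loop guard itself is simple enough to show in isolation; classify_once() below is a stand-in that always asks for another pass so the guard fires.

/* Sketch of the reclassify guard: retry classification at most
 * max_reclassify_loop times before treating the rule set as looping.
 */
#include <stdio.h>

#define RESULT_RECLASSIFY -2
#define RESULT_SHOT	  -1

static int classify_once(void)
{
	return RESULT_RECLASSIFY;		/* always bounce, to show the guard firing */
}

static int classify(void)
{
	const int max_reclassify_loop = 4;
	int limit = 0;
	int verdict;

	for (;;) {
		verdict = classify_once();
		if (verdict != RESULT_RECLASSIFY)
			return verdict;
		if (++limit >= max_reclassify_loop) {
			fprintf(stderr, "reclassify loop detected\n");
			return RESULT_SHOT;	/* drop instead of spinning forever */
		}
	}
}

int main(void)
{
	printf("verdict=%d\n", classify());
	return 0;
}
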
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 481e4f12aeb4..2209c2ddacbf 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -15,6 +15,7 @@
15#include <linux/file.h> /* for fput */ 15#include <linux/file.h> /* for fput */
16#include <net/netlink.h> 16#include <net/netlink.h>
17#include <net/pkt_sched.h> 17#include <net/pkt_sched.h>
18#include <net/pkt_cls.h>
18 19
19/* 20/*
20 * The ATM queuing discipline provides a framework for invoking classifiers 21 * The ATM queuing discipline provides a framework for invoking classifiers
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index f1207582cbf3..d6ca18dc04c3 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -19,6 +19,7 @@
19#include <linux/skbuff.h> 19#include <linux/skbuff.h>
20#include <net/netlink.h> 20#include <net/netlink.h>
21#include <net/pkt_sched.h> 21#include <net/pkt_sched.h>
22#include <net/pkt_cls.h>
22 23
23 24
24/* Class-Based Queueing (CBQ) algorithm. 25/* Class-Based Queueing (CBQ) algorithm.
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 3b6d5bd69101..3b86a97bc67c 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -16,6 +16,7 @@
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <linux/vmalloc.h> 17#include <linux/vmalloc.h>
18#include <net/pkt_sched.h> 18#include <net/pkt_sched.h>
19#include <net/pkt_cls.h>
19#include <net/inet_ecn.h> 20#include <net/inet_ecn.h>
20#include <net/red.h> 21#include <net/red.h>
21#include <net/flow_dissector.h> 22#include <net/flow_dissector.h>
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 1308bbf460f7..5334e309f17f 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -13,6 +13,7 @@
13#include <linux/rtnetlink.h> 13#include <linux/rtnetlink.h>
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <net/pkt_sched.h> 15#include <net/pkt_sched.h>
16#include <net/pkt_cls.h>
16#include <net/dsfield.h> 17#include <net/dsfield.h>
17#include <net/inet_ecn.h> 18#include <net/inet_ecn.h>
18#include <asm/byteorder.h> 19#include <asm/byteorder.h>
@@ -200,9 +201,13 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
200 pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p); 201 pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
201 202
202 if (p->set_tc_index) { 203 if (p->set_tc_index) {
204 int wlen = skb_network_offset(skb);
205
203 switch (tc_skb_protocol(skb)) { 206 switch (tc_skb_protocol(skb)) {
204 case htons(ETH_P_IP): 207 case htons(ETH_P_IP):
205 if (skb_cow_head(skb, sizeof(struct iphdr))) 208 wlen += sizeof(struct iphdr);
209 if (!pskb_may_pull(skb, wlen) ||
210 skb_try_make_writable(skb, wlen))
206 goto drop; 211 goto drop;
207 212
208 skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) 213 skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
@@ -210,7 +215,9 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
210 break; 215 break;
211 216
212 case htons(ETH_P_IPV6): 217 case htons(ETH_P_IPV6):
213 if (skb_cow_head(skb, sizeof(struct ipv6hdr))) 218 wlen += sizeof(struct ipv6hdr);
219 if (!pskb_may_pull(skb, wlen) ||
220 skb_try_make_writable(skb, wlen))
214 goto drop; 221 goto drop;
215 222
216 skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) 223 skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
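
sch_dsmark now computes wlen as the network offset plus the header size and insists on pskb_may_pull() plus skb_try_make_writable() before reading the DS field, a stricter test than the old skb_cow_head() call. The guard boils down to "are the bytes I am about to rewrite actually present"; the sketch below checks only the length part and uses a simplified struct pkt rather than sk_buff.

/* Sketch of the length guard: refuse to touch the IP header unless the packet
 * really contains network_offset + sizeof(header) bytes. The real code also
 * makes the data writable (unshared), which is omitted here.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct pkt {
	unsigned char *data;
	size_t len;
	size_t network_offset;
};

static bool may_write(const struct pkt *p, size_t hdr_len)
{
	size_t wlen = p->network_offset + hdr_len;

	return wlen <= p->len;
}

int main(void)
{
	unsigned char buf[34] = { 0 };
	struct pkt p = { .data = buf, .len = sizeof(buf), .network_offset = 14 };

	printf("ipv4 header writable: %s\n", may_write(&p, 20) ? "yes" : "no");
	printf("ipv6 header writable: %s\n", may_write(&p, 40) ? "yes" : "no");
	return 0;
}
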
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index a5ea0e9b6be4..9f3a884d1590 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -23,6 +23,7 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <net/netlink.h> 24#include <net/netlink.h>
25#include <net/pkt_sched.h> 25#include <net/pkt_sched.h>
26#include <net/pkt_cls.h>
26#include <net/codel.h> 27#include <net/codel.h>
27#include <net/codel_impl.h> 28#include <net/codel_impl.h>
28#include <net/codel_qdisc.h> 29#include <net/codel_qdisc.h>
@@ -57,7 +58,6 @@ struct fq_codel_sched_data {
57 struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ 58 struct fq_codel_flow *flows; /* Flows table [flows_cnt] */
58 u32 *backlogs; /* backlog table [flows_cnt] */ 59 u32 *backlogs; /* backlog table [flows_cnt] */
59 u32 flows_cnt; /* number of flows */ 60 u32 flows_cnt; /* number of flows */
60 u32 perturbation; /* hash perturbation */
61 u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ 61 u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
62 u32 drop_batch_size; 62 u32 drop_batch_size;
63 u32 memory_limit; 63 u32 memory_limit;
@@ -75,9 +75,7 @@ struct fq_codel_sched_data {
75static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, 75static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
76 struct sk_buff *skb) 76 struct sk_buff *skb)
77{ 77{
78 u32 hash = skb_get_hash_perturb(skb, q->perturbation); 78 return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
79
80 return reciprocal_scale(hash, q->flows_cnt);
81} 79}
82 80
83static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, 81static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -482,7 +480,6 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
482 q->memory_limit = 32 << 20; /* 32 MBytes */ 480 q->memory_limit = 32 << 20; /* 32 MBytes */
483 q->drop_batch_size = 64; 481 q->drop_batch_size = 64;
484 q->quantum = psched_mtu(qdisc_dev(sch)); 482 q->quantum = psched_mtu(qdisc_dev(sch));
485 q->perturbation = prandom_u32();
486 INIT_LIST_HEAD(&q->new_flows); 483 INIT_LIST_HEAD(&q->new_flows);
487 INIT_LIST_HEAD(&q->old_flows); 484 INIT_LIST_HEAD(&q->old_flows);
488 codel_params_init(&q->cparams); 485 codel_params_init(&q->cparams);
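
sch_fq_codel drops its private hash perturbation and maps skb_get_hash() straight onto a flow bucket with reciprocal_scale(), a multiply-and-shift that spreads a 32-bit hash over [0, flows_cnt) without a division. The helper's formula is shown below; the sample hash value is arbitrary.

/* reciprocal_scale(): map a 32-bit value into [0, ep_ro) without a modulo,
 * mirroring the kernel helper's multiply-and-shift formula.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	uint32_t flows_cnt = 1024;
	uint32_t hash = 0xdeadbeef;	/* stand-in for skb_get_hash() */

	printf("bucket=%u of %u\n", reciprocal_scale(hash, flows_cnt), flows_cnt);
	return 0;
}
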
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e88519..1a2f9e964330 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
247 247
248void __qdisc_run(struct Qdisc *q) 248void __qdisc_run(struct Qdisc *q)
249{ 249{
250 int quota = weight_p; 250 int quota = dev_tx_weight;
251 int packets; 251 int packets;
252 252
253 while (qdisc_restart(q, &packets)) { 253 while (qdisc_restart(q, &packets)) {
@@ -794,7 +794,7 @@ static void attach_default_qdiscs(struct net_device *dev)
794 } 794 }
795 } 795 }
796#ifdef CONFIG_NET_SCHED 796#ifdef CONFIG_NET_SCHED
797 if (dev->qdisc) 797 if (dev->qdisc != &noop_qdisc)
798 qdisc_hash_add(dev->qdisc); 798 qdisc_hash_add(dev->qdisc);
799#endif 799#endif
800} 800}
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index e3d0458af17b..2fae8b5f1b80 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -627,7 +627,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
627 q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * 627 q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
628 sizeof(u32)); 628 sizeof(u32));
629 if (!q->hhf_arrays[i]) { 629 if (!q->hhf_arrays[i]) {
630 hhf_destroy(sch); 630 /* Note: hhf_destroy() will be called
631 * by our caller.
632 */
631 return -ENOMEM; 633 return -ENOMEM;
632 } 634 }
633 } 635 }
@@ -638,7 +640,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
638 q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / 640 q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
639 BITS_PER_BYTE); 641 BITS_PER_BYTE);
640 if (!q->hhf_valid_bits[i]) { 642 if (!q->hhf_valid_bits[i]) {
641 hhf_destroy(sch); 643 /* Note: hhf_destroy() will be called
644 * by our caller.
645 */
642 return -ENOMEM; 646 return -ENOMEM;
643 } 647 }
644 } 648 }
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 760f39e7caee..4cd5fb134bc9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -40,6 +40,7 @@
40#include <net/netlink.h> 40#include <net/netlink.h>
41#include <net/sch_generic.h> 41#include <net/sch_generic.h>
42#include <net/pkt_sched.h> 42#include <net/pkt_sched.h>
43#include <net/pkt_cls.h>
43 44
44/* HTB algorithm. 45/* HTB algorithm.
45 Author: devik@cdi.cz 46 Author: devik@cdi.cz
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 8fe6999b642a..3bab5f66c392 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -16,6 +16,7 @@
16 16
17#include <net/netlink.h> 17#include <net/netlink.h>
18#include <net/pkt_sched.h> 18#include <net/pkt_sched.h>
19#include <net/pkt_cls.h>
19 20
20static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) 21static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
21{ 22{
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 2bc8d7f8df16..20b7f1646f69 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -52,7 +52,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
52 /* pre-allocate qdiscs, attachment can't fail */ 52 /* pre-allocate qdiscs, attachment can't fail */
53 priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), 53 priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
54 GFP_KERNEL); 54 GFP_KERNEL);
55 if (priv->qdiscs == NULL) 55 if (!priv->qdiscs)
56 return -ENOMEM; 56 return -ENOMEM;
57 57
58 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { 58 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
@@ -60,18 +60,14 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
60 qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx), 60 qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
61 TC_H_MAKE(TC_H_MAJ(sch->handle), 61 TC_H_MAKE(TC_H_MAJ(sch->handle),
62 TC_H_MIN(ntx + 1))); 62 TC_H_MIN(ntx + 1)));
63 if (qdisc == NULL) 63 if (!qdisc)
64 goto err; 64 return -ENOMEM;
65 priv->qdiscs[ntx] = qdisc; 65 priv->qdiscs[ntx] = qdisc;
66 qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; 66 qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
67 } 67 }
68 68
69 sch->flags |= TCQ_F_MQROOT; 69 sch->flags |= TCQ_F_MQROOT;
70 return 0; 70 return 0;
71
72err:
73 mq_destroy(sch);
74 return -ENOMEM;
75} 71}
76 72
77static void mq_attach(struct Qdisc *sch) 73static void mq_attach(struct Qdisc *sch)
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index b5c502c78143..922683418e53 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -118,10 +118,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
118 /* pre-allocate qdisc, attachment can't fail */ 118 /* pre-allocate qdisc, attachment can't fail */
119 priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), 119 priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
120 GFP_KERNEL); 120 GFP_KERNEL);
121 if (priv->qdiscs == NULL) { 121 if (!priv->qdiscs)
122 err = -ENOMEM; 122 return -ENOMEM;
123 goto err;
124 }
125 123
126 for (i = 0; i < dev->num_tx_queues; i++) { 124 for (i = 0; i < dev->num_tx_queues; i++) {
127 dev_queue = netdev_get_tx_queue(dev, i); 125 dev_queue = netdev_get_tx_queue(dev, i);
@@ -129,10 +127,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
129 get_default_qdisc_ops(dev, i), 127 get_default_qdisc_ops(dev, i),
130 TC_H_MAKE(TC_H_MAJ(sch->handle), 128 TC_H_MAKE(TC_H_MAJ(sch->handle),
131 TC_H_MIN(i + 1))); 129 TC_H_MIN(i + 1)));
132 if (qdisc == NULL) { 130 if (!qdisc)
133 err = -ENOMEM; 131 return -ENOMEM;
134 goto err; 132
135 }
136 priv->qdiscs[i] = qdisc; 133 priv->qdiscs[i] = qdisc;
137 qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; 134 qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
138 } 135 }
@@ -148,7 +145,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
148 priv->hw_owned = 1; 145 priv->hw_owned = 1;
149 err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); 146 err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
150 if (err) 147 if (err)
151 goto err; 148 return err;
152 } else { 149 } else {
153 netdev_set_num_tc(dev, qopt->num_tc); 150 netdev_set_num_tc(dev, qopt->num_tc);
154 for (i = 0; i < qopt->num_tc; i++) 151 for (i = 0; i < qopt->num_tc; i++)
@@ -162,10 +159,6 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
162 159
163 sch->flags |= TCQ_F_MQROOT; 160 sch->flags |= TCQ_F_MQROOT;
164 return 0; 161 return 0;
165
166err:
167 mqprio_destroy(sch);
168 return err;
169} 162}
170 163
171static void mqprio_attach(struct Qdisc *sch) 164static void mqprio_attach(struct Qdisc *sch)
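
The mq and mqprio hunks, like the hhf change above and the sfq one below, rely on the same new contract: when a qdisc's ->init() fails it simply returns the error, and qdisc_create() (see the sch_api hunk) invokes ->destroy() on its behalf, so the destroy path must tolerate a half-initialised qdisc. A sketch of that contract with made-up qd_init()/qd_destroy() names:

/* Sketch of the "caller cleans up" contract: init() never calls destroy()
 * itself; on failure the creator does, so destroy() must cope with a
 * partially initialised object (NULL members).
 */
#include <stdio.h>
#include <stdlib.h>

struct qd {
	int *table;
	int *backlog;
};

static void qd_destroy(struct qd *q)
{
	free(q->table);			/* free(NULL) is a no-op, so partial init is fine */
	free(q->backlog);
}

static int qd_init(struct qd *q, int n)
{
	q->table = calloc(n, sizeof(int));
	if (!q->table)
		return -1;		/* no cleanup here: the caller will run qd_destroy() */
	q->backlog = calloc(n, sizeof(int));
	if (!q->backlog)
		return -1;
	return 0;
}

int main(void)
{
	struct qd q = { 0 };

	if (qd_init(&q, 1024) < 0)
		fprintf(stderr, "init failed\n");
	qd_destroy(&q);			/* safe in both the success and the failure case */
	printf("done\n");
	return 0;
}
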
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 9ffbb025b37e..e7839a0d0eaa 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -25,7 +25,7 @@
25#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <net/netlink.h> 26#include <net/netlink.h>
27#include <net/pkt_sched.h> 27#include <net/pkt_sched.h>
28 28#include <net/pkt_cls.h>
29 29
30struct multiq_sched_data { 30struct multiq_sched_data {
31 u16 bands; 31 u16 bands;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bcfadfdea8e0..c8bb62a1e744 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -626,7 +626,7 @@ deliver:
626 * If it's at ingress let's pretend the delay is 626 * If it's at ingress let's pretend the delay is
627 * from the network (tstamp will be updated). 627 * from the network (tstamp will be updated).
628 */ 628 */
629 if (G_TC_FROM(skb->tc_verd) & AT_INGRESS) 629 if (skb->tc_redirected && skb->tc_from_ingress)
630 skb->tstamp = 0; 630 skb->tstamp = 0;
631#endif 631#endif
632 632
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 8f575899adfa..d4d7db267b6e 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -20,7 +20,7 @@
20#include <linux/skbuff.h> 20#include <linux/skbuff.h>
21#include <net/netlink.h> 21#include <net/netlink.h>
22#include <net/pkt_sched.h> 22#include <net/pkt_sched.h>
23 23#include <net/pkt_cls.h>
24 24
25struct prio_sched_data { 25struct prio_sched_data {
26 int bands; 26 int bands;
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 20a350bd1b1d..fe6963d21519 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -25,6 +25,7 @@
25#include <linux/jhash.h> 25#include <linux/jhash.h>
26#include <net/ip.h> 26#include <net/ip.h>
27#include <net/pkt_sched.h> 27#include <net/pkt_sched.h>
28#include <net/pkt_cls.h>
28#include <net/inet_ecn.h> 29#include <net/inet_ecn.h>
29 30
30/* 31/*
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 7f195ed4d568..42e8c8615e65 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -23,6 +23,7 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <net/netlink.h> 24#include <net/netlink.h>
25#include <net/pkt_sched.h> 25#include <net/pkt_sched.h>
26#include <net/pkt_cls.h>
26#include <net/red.h> 27#include <net/red.h>
27 28
28 29
@@ -742,9 +743,10 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
742 q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); 743 q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
743 q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); 744 q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
744 if (!q->ht || !q->slots) { 745 if (!q->ht || !q->slots) {
745 sfq_destroy(sch); 746 /* Note: sfq_destroy() will be called by our caller */
746 return -ENOMEM; 747 return -ENOMEM;
747 } 748 }
749
748 for (i = 0; i < q->divisor; i++) 750 for (i = 0; i < q->divisor; i++)
749 q->ht[i] = SFQ_EMPTY_SLOT; 751 q->ht[i] = SFQ_EMPTY_SLOT;
750 752
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index b0196366d58d..9fe6b427afed 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -401,8 +401,8 @@ static int teql_master_close(struct net_device *dev)
401 return 0; 401 return 0;
402} 402}
403 403
404static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev, 404static void teql_master_stats64(struct net_device *dev,
405 struct rtnl_link_stats64 *stats) 405 struct rtnl_link_stats64 *stats)
406{ 406{
407 struct teql_master *m = netdev_priv(dev); 407 struct teql_master *m = netdev_priv(dev);
408 408
@@ -410,7 +410,6 @@ static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
410 stats->tx_bytes = m->tx_bytes; 410 stats->tx_bytes = m->tx_bytes;
411 stats->tx_errors = m->tx_errors; 411 stats->tx_errors = m->tx_errors;
412 stats->tx_dropped = m->tx_dropped; 412 stats->tx_dropped = m->tx_dropped;
413 return stats;
414} 413}
415 414
416static int teql_master_mtu(struct net_device *dev, int new_mtu) 415static int teql_master_mtu(struct net_device *dev, int new_mtu)
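
teql_master_stats64() is adapted to the ndo_get_stats64 prototype that fills the caller-supplied rtnl_link_stats64 and returns void rather than a pointer. The fill-in-place shape, with invented struct names, is just:

/* Sketch of a fill-in-place stats callback: the caller owns the buffer, the
 * callback writes into it and returns nothing.
 */
#include <stdint.h>
#include <stdio.h>

struct link_stats {
	uint64_t tx_packets;
	uint64_t tx_bytes;
	uint64_t tx_errors;
	uint64_t tx_dropped;
};

struct master {
	uint64_t tx_packets, tx_bytes, tx_errors, tx_dropped;
};

static void master_stats64(const struct master *m, struct link_stats *stats)
{
	stats->tx_packets = m->tx_packets;
	stats->tx_bytes = m->tx_bytes;
	stats->tx_errors = m->tx_errors;
	stats->tx_dropped = m->tx_dropped;
}

int main(void)
{
	struct master m = { .tx_packets = 10, .tx_bytes = 1500 };
	struct link_stats s = { 0 };

	master_stats64(&m, &s);
	printf("%llu packets, %llu bytes\n",
	       (unsigned long long)s.tx_packets, (unsigned long long)s.tx_bytes);
	return 0;
}
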
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 6c4f7496cec6..70f1b570bab9 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -11,7 +11,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
11 transport.o chunk.o sm_make_chunk.o ulpevent.o \ 11 transport.o chunk.o sm_make_chunk.o ulpevent.o \
12 inqueue.o outqueue.o ulpqueue.o \ 12 inqueue.o outqueue.o ulpqueue.o \
13 tsnmap.o bind_addr.o socket.o primitive.o \ 13 tsnmap.o bind_addr.o socket.o primitive.o \
14 output.o input.o debug.o ssnmap.o auth.o \ 14 output.o input.o debug.o stream.o auth.o \
15 offload.o 15 offload.o
16 16
17sctp_probe-y := probe.o 17sctp_probe-y := probe.o
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index d3cc30c25c41..a9708da28eb5 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -71,9 +71,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
71{ 71{
72 struct net *net = sock_net(sk); 72 struct net *net = sock_net(sk);
73 struct sctp_sock *sp; 73 struct sctp_sock *sp;
74 int i;
75 sctp_paramhdr_t *p; 74 sctp_paramhdr_t *p;
76 int err; 75 int i;
77 76
78 /* Retrieve the SCTP per socket area. */ 77 /* Retrieve the SCTP per socket area. */
79 sp = sctp_sk((struct sock *)sk); 78 sp = sctp_sk((struct sock *)sk);
@@ -207,6 +206,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
207 * association to the same value as the initial TSN. 206 * association to the same value as the initial TSN.
208 */ 207 */
209 asoc->addip_serial = asoc->c.initial_tsn; 208 asoc->addip_serial = asoc->c.initial_tsn;
209 asoc->strreset_outseq = asoc->c.initial_tsn;
210 210
211 INIT_LIST_HEAD(&asoc->addip_chunk_list); 211 INIT_LIST_HEAD(&asoc->addip_chunk_list);
212 INIT_LIST_HEAD(&asoc->asconf_ack_list); 212 INIT_LIST_HEAD(&asoc->asconf_ack_list);
@@ -246,6 +246,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
246 if (!sctp_ulpq_init(&asoc->ulpq, asoc)) 246 if (!sctp_ulpq_init(&asoc->ulpq, asoc))
247 goto fail_init; 247 goto fail_init;
248 248
249 if (sctp_stream_new(asoc, gfp))
250 goto fail_init;
251
249 /* Assume that peer would support both address types unless we are 252 /* Assume that peer would support both address types unless we are
250 * told otherwise. 253 * told otherwise.
251 */ 254 */
@@ -263,12 +266,13 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
263 266
264 /* AUTH related initializations */ 267 /* AUTH related initializations */
265 INIT_LIST_HEAD(&asoc->endpoint_shared_keys); 268 INIT_LIST_HEAD(&asoc->endpoint_shared_keys);
266 err = sctp_auth_asoc_copy_shkeys(ep, asoc, gfp); 269 if (sctp_auth_asoc_copy_shkeys(ep, asoc, gfp))
267 if (err) 270 goto stream_free;
268 goto fail_init;
269 271
270 asoc->active_key_id = ep->active_key_id; 272 asoc->active_key_id = ep->active_key_id;
271 asoc->prsctp_enable = ep->prsctp_enable; 273 asoc->prsctp_enable = ep->prsctp_enable;
274 asoc->reconf_enable = ep->reconf_enable;
275 asoc->strreset_enable = ep->strreset_enable;
272 276
273 /* Save the hmacs and chunks list into this association */ 277 /* Save the hmacs and chunks list into this association */
274 if (ep->auth_hmacs_list) 278 if (ep->auth_hmacs_list)
@@ -286,6 +290,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
286 290
287 return asoc; 291 return asoc;
288 292
293stream_free:
294 sctp_stream_free(asoc->stream);
289fail_init: 295fail_init:
290 sock_put(asoc->base.sk); 296 sock_put(asoc->base.sk);
291 sctp_endpoint_put(asoc->ep); 297 sctp_endpoint_put(asoc->ep);
@@ -358,8 +364,11 @@ void sctp_association_free(struct sctp_association *asoc)
358 364
359 sctp_tsnmap_free(&asoc->peer.tsn_map); 365 sctp_tsnmap_free(&asoc->peer.tsn_map);
360 366
361 /* Free ssnmap storage. */ 367 /* Free stream information. */
362 sctp_ssnmap_free(asoc->ssnmap); 368 sctp_stream_free(asoc->stream);
369
370 if (asoc->strreset_chunk)
371 sctp_chunk_free(asoc->strreset_chunk);
363 372
364 /* Clean up the bound address list. */ 373 /* Clean up the bound address list. */
365 sctp_bind_addr_free(&asoc->base.bind_addr); 374 sctp_bind_addr_free(&asoc->base.bind_addr);
@@ -519,6 +528,12 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
519 if (asoc->peer.last_data_from == peer) 528 if (asoc->peer.last_data_from == peer)
520 asoc->peer.last_data_from = transport; 529 asoc->peer.last_data_from = transport;
521 530
531 if (asoc->strreset_chunk &&
532 asoc->strreset_chunk->transport == peer) {
533 asoc->strreset_chunk->transport = transport;
534 sctp_transport_reset_reconf_timer(transport);
535 }
536
522 /* If we remove the transport an INIT was last sent to, set it to 537 /* If we remove the transport an INIT was last sent to, set it to
523 * NULL. Combined with the update of the retran path above, this 538 * NULL. Combined with the update of the retran path above, this
524 * will cause the next INIT to be sent to the next available 539 * will cause the next INIT to be sent to the next available
@@ -820,8 +835,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
820 if (transport->state != SCTP_UNCONFIRMED) 835 if (transport->state != SCTP_UNCONFIRMED)
821 transport->state = SCTP_INACTIVE; 836 transport->state = SCTP_INACTIVE;
822 else { 837 else {
823 dst_release(transport->dst); 838 sctp_transport_dst_release(transport);
824 transport->dst = NULL;
825 ulp_notify = false; 839 ulp_notify = false;
826 } 840 }
827 841
@@ -1137,7 +1151,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
1137 /* Reinitialize SSN for both local streams 1151 /* Reinitialize SSN for both local streams
1138 * and peer's streams. 1152 * and peer's streams.
1139 */ 1153 */
1140 sctp_ssnmap_clear(asoc->ssnmap); 1154 sctp_stream_clear(asoc->stream);
1141 1155
1142 /* Flush the ULP reassembly and ordered queue. 1156 /* Flush the ULP reassembly and ordered queue.
1143 * Any data there will now be stale and will 1157 * Any data there will now be stale and will
@@ -1162,10 +1176,9 @@ void sctp_assoc_update(struct sctp_association *asoc,
1162 1176
1163 asoc->ctsn_ack_point = asoc->next_tsn - 1; 1177 asoc->ctsn_ack_point = asoc->next_tsn - 1;
1164 asoc->adv_peer_ack_point = asoc->ctsn_ack_point; 1178 asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
1165 if (!asoc->ssnmap) { 1179 if (!asoc->stream) {
1166 /* Move the ssnmap. */ 1180 asoc->stream = new->stream;
1167 asoc->ssnmap = new->ssnmap; 1181 new->stream = NULL;
1168 new->ssnmap = NULL;
1169 } 1182 }
1170 1183
1171 if (!asoc->assoc_id) { 1184 if (!asoc->assoc_id) {
@@ -1399,7 +1412,7 @@ sctp_assoc_choose_alter_transport(struct sctp_association *asoc,
1399/* Update the association's pmtu and frag_point by going through all the 1412/* Update the association's pmtu and frag_point by going through all the
1400 * transports. This routine is called when a transport's PMTU has changed. 1413 * transports. This routine is called when a transport's PMTU has changed.
1401 */ 1414 */
1402void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc) 1415void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
1403{ 1416{
1404 struct sctp_transport *t; 1417 struct sctp_transport *t;
1405 __u32 pmtu = 0; 1418 __u32 pmtu = 0;
@@ -1411,8 +1424,8 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
1411 list_for_each_entry(t, &asoc->peer.transport_addr_list, 1424 list_for_each_entry(t, &asoc->peer.transport_addr_list,
1412 transports) { 1425 transports) {
1413 if (t->pmtu_pending && t->dst) { 1426 if (t->pmtu_pending && t->dst) {
1414 sctp_transport_update_pmtu(sk, t, 1427 sctp_transport_update_pmtu(
1415 SCTP_TRUNC4(dst_mtu(t->dst))); 1428 t, SCTP_TRUNC4(dst_mtu(t->dst)));
1416 t->pmtu_pending = 0; 1429 t->pmtu_pending = 0;
1417 } 1430 }
1418 if (!pmtu || (t->pathmtu < pmtu)) 1431 if (!pmtu || (t->pathmtu < pmtu))
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 615f0ddd41df..e3621cb4827f 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -165,14 +165,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
165 struct sctp_sndrcvinfo *sinfo, 165 struct sctp_sndrcvinfo *sinfo,
166 struct iov_iter *from) 166 struct iov_iter *from)
167{ 167{
168 int max, whole, i, offset, over, err; 168 size_t len, first_len, max_data, remaining;
169 int len, first_len; 169 size_t msg_len = iov_iter_count(from);
170 int max_data; 170 struct list_head *pos, *temp;
171 struct sctp_chunk *chunk; 171 struct sctp_chunk *chunk;
172 struct sctp_datamsg *msg; 172 struct sctp_datamsg *msg;
173 struct list_head *pos, *temp; 173 int err;
174 size_t msg_len = iov_iter_count(from);
175 __u8 frag;
176 174
177 msg = sctp_datamsg_new(GFP_KERNEL); 175 msg = sctp_datamsg_new(GFP_KERNEL);
178 if (!msg) 176 if (!msg)
@@ -185,7 +183,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
185 (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) || 183 (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) ||
186 !SCTP_PR_POLICY(sinfo->sinfo_flags))) 184 !SCTP_PR_POLICY(sinfo->sinfo_flags)))
187 msg->expires_at = jiffies + 185 msg->expires_at = jiffies +
188 msecs_to_jiffies(sinfo->sinfo_timetolive); 186 msecs_to_jiffies(sinfo->sinfo_timetolive);
189 187
190 /* This is the biggest possible DATA chunk that can fit into 188 /* This is the biggest possible DATA chunk that can fit into
191 * the packet 189 * the packet
@@ -195,7 +193,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
195 sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk); 193 sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
196 max_data = SCTP_TRUNC4(max_data); 194 max_data = SCTP_TRUNC4(max_data);
197 195
198 max = asoc->frag_point;
199 /* If the the peer requested that we authenticate DATA chunks 196 /* If the the peer requested that we authenticate DATA chunks
200 * we need to account for bundling of the AUTH chunks along with 197 * we need to account for bundling of the AUTH chunks along with
201 * DATA. 198 * DATA.
@@ -208,12 +205,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
208 hmac_desc->hmac_len); 205 hmac_desc->hmac_len);
209 } 206 }
210 207
211 /* Now, check if we need to reduce our max */ 208 /* Check what's our max considering the above */
212 if (max > max_data) 209 max_data = min_t(size_t, max_data, asoc->frag_point);
213 max = max_data;
214 210
215 whole = 0; 211 /* Set first_len and then account for possible bundles on first frag */
216 first_len = max; 212 first_len = max_data;
217 213
218 /* Check to see if we have a pending SACK and try to let it be bundled 214 /* Check to see if we have a pending SACK and try to let it be bundled
219 * with this message. Do this if we don't have any data queued already. 215 * with this message. Do this if we don't have any data queued already.
@@ -224,40 +220,38 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
224 if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) && 220 if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) &&
225 asoc->outqueue.out_qlen == 0 && 221 asoc->outqueue.out_qlen == 0 &&
226 list_empty(&asoc->outqueue.retransmit) && 222 list_empty(&asoc->outqueue.retransmit) &&
227 msg_len > max) 223 msg_len > max_data)
228 max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); 224 first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
229 225
230 /* Encourage Cookie-ECHO bundling. */ 226 /* Encourage Cookie-ECHO bundling. */
231 if (asoc->state < SCTP_STATE_COOKIE_ECHOED) 227 if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
232 max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; 228 first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
233
234 /* Now that we adjusted completely, reset first_len */
235 if (first_len > max_data)
236 first_len = max_data;
237 229
238 /* Account for a different sized first fragment */ 230 /* Account for a different sized first fragment */
239 if (msg_len >= first_len) { 231 if (msg_len >= first_len) {
240 msg_len -= first_len;
241 whole = 1;
242 msg->can_delay = 0; 232 msg->can_delay = 0;
243 }
244
245 /* How many full sized? How many bytes leftover? */
246 whole += msg_len / max;
247 over = msg_len % max;
248 offset = 0;
249
250 if ((whole > 1) || (whole && over))
251 SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS); 233 SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
234 } else {
235 /* Which may be the only one... */
236 first_len = msg_len;
237 }
252 238
253 /* Create chunks for all the full sized DATA chunks. */ 239 /* Create chunks for all DATA chunks. */
254 for (i = 0, len = first_len; i < whole; i++) { 240 for (remaining = msg_len; remaining; remaining -= len) {
255 frag = SCTP_DATA_MIDDLE_FRAG; 241 u8 frag = SCTP_DATA_MIDDLE_FRAG;
256 242
257 if (0 == i) 243 if (remaining == msg_len) {
244 /* First frag, which may also be the last */
258 frag |= SCTP_DATA_FIRST_FRAG; 245 frag |= SCTP_DATA_FIRST_FRAG;
246 len = first_len;
247 } else {
248 /* Middle frags */
249 len = max_data;
250 }
259 251
260 if ((i == (whole - 1)) && !over) { 252 if (len >= remaining) {
253 /* Last frag, which may also be the first */
254 len = remaining;
261 frag |= SCTP_DATA_LAST_FRAG; 255 frag |= SCTP_DATA_LAST_FRAG;
262 256
263 /* The application requests to set the I-bit of the 257 /* The application requests to set the I-bit of the
@@ -271,7 +265,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
271 265
272 chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 266 chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
273 0, GFP_KERNEL); 267 0, GFP_KERNEL);
274
275 if (!chunk) { 268 if (!chunk) {
276 err = -ENOMEM; 269 err = -ENOMEM;
277 goto errout; 270 goto errout;
@@ -282,45 +275,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
282 goto errout_chunk_free; 275 goto errout_chunk_free;
283 276
284 /* Put the chunk->skb back into the form expected by send. */ 277 /* Put the chunk->skb back into the form expected by send. */
285 __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr 278 __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr -
286 - (__u8 *)chunk->skb->data); 279 chunk->skb->data);
287
288 sctp_datamsg_assign(msg, chunk);
289 list_add_tail(&chunk->frag_list, &msg->chunks);
290
291 /* The first chunk, the first chunk was likely short
292 * to allow bundling, so reset to full size.
293 */
294 if (0 == i)
295 len = max;
296 }
297
298 /* .. now the leftover bytes. */
299 if (over) {
300 if (!whole)
301 frag = SCTP_DATA_NOT_FRAG;
302 else
303 frag = SCTP_DATA_LAST_FRAG;
304
305 if ((sinfo->sinfo_flags & SCTP_EOF) ||
306 (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
307 frag |= SCTP_DATA_SACK_IMM;
308
309 chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag,
310 0, GFP_KERNEL);
311
312 if (!chunk) {
313 err = -ENOMEM;
314 goto errout;
315 }
316
317 err = sctp_user_addto_chunk(chunk, over, from);
318
319 /* Put the chunk->skb back into the form expected by send. */
320 __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
321 - (__u8 *)chunk->skb->data);
322 if (err < 0)
323 goto errout_chunk_free;
324 280
325 sctp_datamsg_assign(msg, chunk); 281 sctp_datamsg_assign(msg, chunk);
326 list_add_tail(&chunk->frag_list, &msg->chunks); 282 list_add_tail(&chunk->frag_list, &msg->chunks);
@@ -338,6 +294,7 @@ errout:
338 sctp_chunk_free(chunk); 294 sctp_chunk_free(chunk);
339 } 295 }
340 sctp_datamsg_put(msg); 296 sctp_datamsg_put(msg);
297
341 return ERR_PTR(err); 298 return ERR_PTR(err);
342} 299}
343 300
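The rewritten sctp_datamsg_from_user() above replaces the whole/over bookkeeping with a single remaining/len loop. The sketch below is an illustrative userspace reduction of that sizing logic only; MAX_DATA, FIRST_LEN and the message size are made-up stand-ins for asoc->frag_point, the bundling-adjusted first-fragment limit and the user message, not kernel values.

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-ins; the real limits come from the association. */
#define MAX_DATA   1452   /* hypothetical per-chunk payload limit */
#define FIRST_LEN  1400   /* hypothetical first-fragment limit after bundling */

enum frag { MIDDLE = 0, FIRST = 1, LAST = 2 };

int main(void)
{
	size_t msg_len = 5000;            /* hypothetical user message size */
	size_t first_len = FIRST_LEN;
	size_t remaining, len;

	if (msg_len < first_len)
		first_len = msg_len;      /* single, unfragmented message */

	/* Same loop shape as the patched kernel code: the first pass uses
	 * first_len, later passes use MAX_DATA, and the last pass takes
	 * whatever is left over.
	 */
	for (remaining = msg_len; remaining; remaining -= len) {
		int flags = MIDDLE;

		if (remaining == msg_len) {
			flags |= FIRST;
			len = first_len;
		} else {
			len = MAX_DATA;
		}
		if (len >= remaining) {
			len = remaining;
			flags |= LAST;
		}
		printf("fragment: len=%zu flags=%d\n", len, flags);
	}
	return 0;
}

Run on a 5000-byte message this yields one FIRST fragment of 1400 bytes, two middle fragments of 1452, and a LAST fragment of 696, the same carving the kernel loop would produce under these assumed limits.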
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index 95d7b15dad21..2e47eb2f05cb 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -159,6 +159,7 @@ static const char *const sctp_timer_tbl[] = {
159 "TIMEOUT_T4_RTO", 159 "TIMEOUT_T4_RTO",
160 "TIMEOUT_T5_SHUTDOWN_GUARD", 160 "TIMEOUT_T5_SHUTDOWN_GUARD",
161 "TIMEOUT_HEARTBEAT", 161 "TIMEOUT_HEARTBEAT",
162 "TIMEOUT_RECONF",
162 "TIMEOUT_SACK", 163 "TIMEOUT_SACK",
163 "TIMEOUT_AUTOCLOSE", 164 "TIMEOUT_AUTOCLOSE",
164}; 165};
@@ -166,7 +167,9 @@ static const char *const sctp_timer_tbl[] = {
166/* Lookup timer debug name. */ 167/* Lookup timer debug name. */
167const char *sctp_tname(const sctp_subtype_t id) 168const char *sctp_tname(const sctp_subtype_t id)
168{ 169{
169 if (id.timeout <= SCTP_EVENT_TIMEOUT_MAX) 170 BUILD_BUG_ON(SCTP_EVENT_TIMEOUT_MAX + 1 != ARRAY_SIZE(sctp_timer_tbl));
171
172 if (id.timeout < ARRAY_SIZE(sctp_timer_tbl))
170 return sctp_timer_tbl[id.timeout]; 173 return sctp_timer_tbl[id.timeout];
171 return "unknown_timer"; 174 return "unknown_timer";
172} 175}
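The sctp_tname() change above bounds the lookup by the table size and adds a build-time check that the table and the timeout enum stay in sync. A generic sketch of that idiom, using _Static_assert in place of the kernel's BUILD_BUG_ON() and a hypothetical timer enum:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Hypothetical timer ids; T_MAX mirrors the last valid entry. */
enum { T_T1, T_T2, T_HEARTBEAT, T_RECONF, T_SACK, T_MAX = T_SACK };

static const char *const timer_tbl[] = {
	"T1", "T2", "HEARTBEAT", "RECONF", "SACK",
};

/* Build-time check that the enum and the table stay in sync. */
_Static_assert(T_MAX + 1 == ARRAY_SIZE(timer_tbl), "timer table out of sync");

static const char *tname(unsigned int id)
{
	/* Bound by the table size rather than a separately maintained max. */
	if (id < ARRAY_SIZE(timer_tbl))
		return timer_tbl[id];
	return "unknown_timer";
}

int main(void)
{
	printf("%s %s\n", tname(T_RECONF), tname(42));
	return 0;
}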
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 410ddc1e3443..8c589230794f 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -164,6 +164,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
164 ep->auth_hmacs_list = auth_hmacs; 164 ep->auth_hmacs_list = auth_hmacs;
165 ep->auth_chunk_list = auth_chunks; 165 ep->auth_chunk_list = auth_chunks;
166 ep->prsctp_enable = net->sctp.prsctp_enable; 166 ep->prsctp_enable = net->sctp.prsctp_enable;
167 ep->reconf_enable = net->sctp.reconf_enable;
167 168
168 return ep; 169 return ep;
169 170
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 458e506ef84b..0e06a278d2a9 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -401,10 +401,10 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
401 401
402 if (t->param_flags & SPP_PMTUD_ENABLE) { 402 if (t->param_flags & SPP_PMTUD_ENABLE) {
403 /* Update transports view of the MTU */ 403 /* Update transports view of the MTU */
404 sctp_transport_update_pmtu(sk, t, pmtu); 404 sctp_transport_update_pmtu(t, pmtu);
405 405
406 /* Update association pmtu. */ 406 /* Update association pmtu. */
407 sctp_assoc_sync_pmtu(sk, asoc); 407 sctp_assoc_sync_pmtu(asoc);
408 } 408 }
409 409
410 /* Retransmit with the new pmtu setting. 410 /* Retransmit with the new pmtu setting.
@@ -872,6 +872,8 @@ void sctp_transport_hashtable_destroy(void)
872 872
873int sctp_hash_transport(struct sctp_transport *t) 873int sctp_hash_transport(struct sctp_transport *t)
874{ 874{
875 struct sctp_transport *transport;
876 struct rhlist_head *tmp, *list;
875 struct sctp_hash_cmp_arg arg; 877 struct sctp_hash_cmp_arg arg;
876 int err; 878 int err;
877 879
@@ -882,8 +884,22 @@ int sctp_hash_transport(struct sctp_transport *t)
882 arg.paddr = &t->ipaddr; 884 arg.paddr = &t->ipaddr;
883 arg.lport = htons(t->asoc->base.bind_addr.port); 885 arg.lport = htons(t->asoc->base.bind_addr.port);
884 886
887 rcu_read_lock();
888 list = rhltable_lookup(&sctp_transport_hashtable, &arg,
889 sctp_hash_params);
890
891 rhl_for_each_entry_rcu(transport, tmp, list, node)
892 if (transport->asoc->ep == t->asoc->ep) {
893 rcu_read_unlock();
894 err = -EEXIST;
895 goto out;
896 }
897 rcu_read_unlock();
898
885 err = rhltable_insert_key(&sctp_transport_hashtable, &arg, 899 err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
886 &t->node, sctp_hash_params); 900 &t->node, sctp_hash_params);
901
902out:
887 if (err) 903 if (err)
888 pr_err_once("insert transport fail, errno %d\n", err); 904 pr_err_once("insert transport fail, errno %d\n", err);
889 905
@@ -1229,13 +1245,26 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
1229 struct sctp_association *asoc; 1245 struct sctp_association *asoc;
1230 1246
1231 asoc = __sctp_lookup_association(net, laddr, paddr, transportp); 1247 asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
1248 if (asoc)
1249 goto out;
1232 1250
1233 /* Further lookup for INIT/INIT-ACK packets. 1251 /* Further lookup for INIT/INIT-ACK packets.
1234 * SCTP Implementors Guide, 2.18 Handling of address 1252 * SCTP Implementors Guide, 2.18 Handling of address
1235 * parameters within the INIT or INIT-ACK. 1253 * parameters within the INIT or INIT-ACK.
1236 */ 1254 */
1237 if (!asoc) 1255 asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp);
1238 asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp); 1256 if (asoc)
1257 goto out;
1258
1259 if (paddr->sa.sa_family == AF_INET)
1260 pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n",
1261 &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port),
1262 &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port));
1263 else
1264 pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n",
1265 &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port),
1266 &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port));
1239 1267
1268out:
1240 return asoc; 1269 return asoc;
1241} 1270}
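The sctp_hash_transport() change above walks the rhltable bucket before inserting and returns -EEXIST when the same endpoint already has an entry for that peer address. The sketch below reduces that lookup-before-insert pattern to a plain linked list in userspace; struct transport and hash_transport() here are illustrative stand-ins, not the kernel types.

#include <stdio.h>
#include <string.h>
#include <errno.h>

struct transport {
	int ep;                  /* owning endpoint id, stands in for t->asoc->ep */
	const char *addr;        /* peer address, stands in for t->ipaddr */
	struct transport *next;
};

static struct transport *table;      /* single bucket for the sketch */

/* Mirror of the patched flow: scan the matching entries first and fail
 * with -EEXIST if the same endpoint already hashed this peer address.
 */
static int hash_transport(struct transport *t)
{
	struct transport *cur;

	for (cur = table; cur; cur = cur->next)
		if (!strcmp(cur->addr, t->addr) && cur->ep == t->ep)
			return -EEXIST;

	t->next = table;
	table = t;
	return 0;
}

int main(void)
{
	struct transport a = { 0, "192.0.2.1", NULL };
	struct transport b = { 0, "192.0.2.1", NULL };

	printf("%d %d\n", hash_transport(&a), hash_transport(&b)); /* 0 -17 */
	return 0;
}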
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 64dfd35ccdcc..961ee59f696a 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -413,22 +413,20 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist,
413static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, 413static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb,
414 int is_saddr) 414 int is_saddr)
415{ 415{
416 __be16 *port; 416 /* Always called on head skb, so this is safe */
417 struct sctphdr *sh; 417 struct sctphdr *sh = sctp_hdr(skb);
418 struct sockaddr_in6 *sa = &addr->v6;
418 419
419 port = &addr->v6.sin6_port;
420 addr->v6.sin6_family = AF_INET6; 420 addr->v6.sin6_family = AF_INET6;
421 addr->v6.sin6_flowinfo = 0; /* FIXME */ 421 addr->v6.sin6_flowinfo = 0; /* FIXME */
422 addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; 422 addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif;
423 423
424 /* Always called on head skb, so this is safe */
425 sh = sctp_hdr(skb);
426 if (is_saddr) { 424 if (is_saddr) {
427 *port = sh->source; 425 sa->sin6_port = sh->source;
428 addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; 426 sa->sin6_addr = ipv6_hdr(skb)->saddr;
429 } else { 427 } else {
430 *port = sh->dest; 428 sa->sin6_port = sh->dest;
431 addr->v6.sin6_addr = ipv6_hdr(skb)->daddr; 429 sa->sin6_addr = ipv6_hdr(skb)->daddr;
432 } 430 }
433} 431}
434 432
@@ -642,14 +640,15 @@ static sctp_scope_t sctp_v6_scope(union sctp_addr *addr)
642 640
643/* Create and initialize a new sk for the socket to be returned by accept(). */ 641/* Create and initialize a new sk for the socket to be returned by accept(). */
644static struct sock *sctp_v6_create_accept_sk(struct sock *sk, 642static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
645 struct sctp_association *asoc) 643 struct sctp_association *asoc,
644 bool kern)
646{ 645{
647 struct sock *newsk; 646 struct sock *newsk;
648 struct ipv6_pinfo *newnp, *np = inet6_sk(sk); 647 struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
649 struct sctp6_sock *newsctp6sk; 648 struct sctp6_sock *newsctp6sk;
650 struct ipv6_txoptions *opt; 649 struct ipv6_txoptions *opt;
651 650
652 newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0); 651 newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern);
653 if (!newsk) 652 if (!newsk)
654 goto out; 653 goto out;
655 654
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 40e7fac96c41..105ac3327b28 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -51,7 +51,6 @@ SCTP_DBG_OBJCNT(bind_addr);
51SCTP_DBG_OBJCNT(bind_bucket); 51SCTP_DBG_OBJCNT(bind_bucket);
52SCTP_DBG_OBJCNT(chunk); 52SCTP_DBG_OBJCNT(chunk);
53SCTP_DBG_OBJCNT(addr); 53SCTP_DBG_OBJCNT(addr);
54SCTP_DBG_OBJCNT(ssnmap);
55SCTP_DBG_OBJCNT(datamsg); 54SCTP_DBG_OBJCNT(datamsg);
56SCTP_DBG_OBJCNT(keys); 55SCTP_DBG_OBJCNT(keys);
57 56
@@ -67,7 +66,6 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
67 SCTP_DBG_OBJCNT_ENTRY(bind_addr), 66 SCTP_DBG_OBJCNT_ENTRY(bind_addr),
68 SCTP_DBG_OBJCNT_ENTRY(bind_bucket), 67 SCTP_DBG_OBJCNT_ENTRY(bind_bucket),
69 SCTP_DBG_OBJCNT_ENTRY(addr), 68 SCTP_DBG_OBJCNT_ENTRY(addr),
70 SCTP_DBG_OBJCNT_ENTRY(ssnmap),
71 SCTP_DBG_OBJCNT_ENTRY(datamsg), 69 SCTP_DBG_OBJCNT_ENTRY(datamsg),
72 SCTP_DBG_OBJCNT_ENTRY(keys), 70 SCTP_DBG_OBJCNT_ENTRY(keys),
73}; 71};
diff --git a/net/sctp/output.c b/net/sctp/output.c
index f5320a87341e..1409a875ad8e 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -81,56 +81,64 @@ static void sctp_packet_reset(struct sctp_packet *packet)
81/* Config a packet. 81/* Config a packet.
82 * This appears to be a followup set of initializations. 82 * This appears to be a followup set of initializations.
83 */ 83 */
84struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, 84void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
85 __u32 vtag, int ecn_capable) 85 int ecn_capable)
86{ 86{
87 struct sctp_transport *tp = packet->transport; 87 struct sctp_transport *tp = packet->transport;
88 struct sctp_association *asoc = tp->asoc; 88 struct sctp_association *asoc = tp->asoc;
89 struct sock *sk;
89 90
90 pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); 91 pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
91
92 packet->vtag = vtag; 92 packet->vtag = vtag;
93 93
94 if (asoc && tp->dst) { 94 /* do the following jobs only once for a flush schedule */
95 struct sock *sk = asoc->base.sk; 95 if (!sctp_packet_empty(packet))
96 96 return;
97 rcu_read_lock();
98 if (__sk_dst_get(sk) != tp->dst) {
99 dst_hold(tp->dst);
100 sk_setup_caps(sk, tp->dst);
101 }
102
103 if (sk_can_gso(sk)) {
104 struct net_device *dev = tp->dst->dev;
105 97
106 packet->max_size = dev->gso_max_size; 98 /* set packet max_size with pathmtu */
107 } else { 99 packet->max_size = tp->pathmtu;
108 packet->max_size = asoc->pathmtu; 100 if (!asoc)
109 } 101 return;
110 rcu_read_unlock();
111 102
112 } else { 103 /* update dst or transport pathmtu if in need */
113 packet->max_size = tp->pathmtu; 104 sk = asoc->base.sk;
105 if (!sctp_transport_dst_check(tp)) {
106 sctp_transport_route(tp, NULL, sctp_sk(sk));
107 if (asoc->param_flags & SPP_PMTUD_ENABLE)
108 sctp_assoc_sync_pmtu(asoc);
109 } else if (!sctp_transport_pmtu_check(tp)) {
110 if (asoc->param_flags & SPP_PMTUD_ENABLE)
111 sctp_assoc_sync_pmtu(asoc);
114 } 112 }
115 113
116 if (ecn_capable && sctp_packet_empty(packet)) { 114 /* If there a is a prepend chunk stick it on the list before
117 struct sctp_chunk *chunk; 115 * any other chunks get appended.
116 */
117 if (ecn_capable) {
118 struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc);
118 119
119 /* If there a is a prepend chunk stick it on the list before
120 * any other chunks get appended.
121 */
122 chunk = sctp_get_ecne_prepend(asoc);
123 if (chunk) 120 if (chunk)
124 sctp_packet_append_chunk(packet, chunk); 121 sctp_packet_append_chunk(packet, chunk);
125 } 122 }
126 123
127 return packet; 124 if (!tp->dst)
125 return;
126
127 /* set packet max_size with gso_max_size if gso is enabled*/
128 rcu_read_lock();
129 if (__sk_dst_get(sk) != tp->dst) {
130 dst_hold(tp->dst);
131 sk_setup_caps(sk, tp->dst);
132 }
133 packet->max_size = sk_can_gso(sk) ? tp->dst->dev->gso_max_size
134 : asoc->pathmtu;
135 rcu_read_unlock();
128} 136}
129 137
130/* Initialize the packet structure. */ 138/* Initialize the packet structure. */
131struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, 139void sctp_packet_init(struct sctp_packet *packet,
132 struct sctp_transport *transport, 140 struct sctp_transport *transport,
133 __u16 sport, __u16 dport) 141 __u16 sport, __u16 dport)
134{ 142{
135 struct sctp_association *asoc = transport->asoc; 143 struct sctp_association *asoc = transport->asoc;
136 size_t overhead; 144 size_t overhead;
@@ -151,8 +159,6 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
151 packet->overhead = overhead; 159 packet->overhead = overhead;
152 sctp_packet_reset(packet); 160 sctp_packet_reset(packet);
153 packet->vtag = 0; 161 packet->vtag = 0;
154
155 return packet;
156} 162}
157 163
158/* Free a packet. */ 164/* Free a packet. */
@@ -181,7 +187,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
181{ 187{
182 sctp_xmit_t retval; 188 sctp_xmit_t retval;
183 189
184 pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__, 190 pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__,
185 packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); 191 packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1);
186 192
187 switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { 193 switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
@@ -586,12 +592,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
586 sh->vtag = htonl(packet->vtag); 592 sh->vtag = htonl(packet->vtag);
587 sh->checksum = 0; 593 sh->checksum = 0;
588 594
589 /* update dst if in need */ 595 /* drop packet if no dst */
590 if (!sctp_transport_dst_check(tp)) {
591 sctp_transport_route(tp, NULL, sctp_sk(sk));
592 if (asoc && asoc->param_flags & SPP_PMTUD_ENABLE)
593 sctp_assoc_sync_pmtu(sk, asoc);
594 }
595 dst = dst_clone(tp->dst); 596 dst = dst_clone(tp->dst);
596 if (!dst) { 597 if (!dst) {
597 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 598 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -628,7 +629,14 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
628 asoc->peer.last_sent_to = tp; 629 asoc->peer.last_sent_to = tp;
629 } 630 }
630 head->ignore_df = packet->ipfragok; 631 head->ignore_df = packet->ipfragok;
631 tp->af_specific->sctp_xmit(head, tp); 632 if (tp->dst_pending_confirm)
633 skb_set_dst_pending_confirm(head, 1);
634 /* neighbour should be confirmed on successful transmission or
635 * positive error
636 */
637 if (tp->af_specific->sctp_xmit(head, tp) >= 0 &&
638 tp->dst_pending_confirm)
639 tp->dst_pending_confirm = 0;
632 640
633out: 641out:
634 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { 642 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
@@ -700,18 +708,15 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
700 * unacknowledged. 708 * unacknowledged.
701 */ 709 */
702 710
703 if (sctp_sk(asoc->base.sk)->nodelay) 711 if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) &&
704 /* Nagle disabled */ 712 !asoc->force_delay)
713 /* Nothing unacked */
705 return SCTP_XMIT_OK; 714 return SCTP_XMIT_OK;
706 715
707 if (!sctp_packet_empty(packet)) 716 if (!sctp_packet_empty(packet))
708 /* Append to packet */ 717 /* Append to packet */
709 return SCTP_XMIT_OK; 718 return SCTP_XMIT_OK;
710 719
711 if (inflight == 0)
712 /* Nothing unacked */
713 return SCTP_XMIT_OK;
714
715 if (!sctp_state(asoc, ESTABLISHED)) 720 if (!sctp_state(asoc, ESTABLISHED))
716 return SCTP_XMIT_OK; 721 return SCTP_XMIT_OK;
717 722
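The sctp_packet_can_append_data() change above merges the old "Nagle disabled" and "nothing unacked" branches and gates them with the new force_delay flag. An illustrative standalone predicate with the same shape (the field names are assumptions for the sketch, not the kernel layout, and the real function goes on to apply further checks):

#include <stdbool.h>
#include <stdio.h>

struct assoc {
	bool nodelay;       /* SCTP_NODELAY socket option */
	bool force_delay;   /* new flag that forces Nagle-style delay */
};

/* true = transmit the data chunk now, false = hold it back for bundling */
static bool can_transmit_now(const struct assoc *a, unsigned int inflight,
			     bool packet_has_data)
{
	/* Patched rule: send if Nagle is off or nothing is in flight,
	 * unless force_delay overrides both.
	 */
	if ((a->nodelay || inflight == 0) && !a->force_delay)
		return true;

	/* Appending to a packet that already carries data is always fine. */
	return packet_has_data;
}

int main(void)
{
	struct assoc a = { .nodelay = false, .force_delay = false };

	printf("%d\n", can_transmit_now(&a, 0, false));  /* 1: nothing unacked */
	printf("%d\n", can_transmit_now(&a, 3, false));  /* 0: Nagle holds it */
	return 0;
}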
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 34efaa4ef2f6..8081476ed313 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -382,17 +382,18 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
382} 382}
383 383
384static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, 384static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
385 struct sctp_sndrcvinfo *sinfo, 385 struct sctp_sndrcvinfo *sinfo, int msg_len)
386 struct list_head *queue, int msg_len)
387{ 386{
387 struct sctp_outq *q = &asoc->outqueue;
388 struct sctp_chunk *chk, *temp; 388 struct sctp_chunk *chk, *temp;
389 389
390 list_for_each_entry_safe(chk, temp, queue, list) { 390 list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) {
391 if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || 391 if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
392 chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) 392 chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
393 continue; 393 continue;
394 394
395 list_del_init(&chk->list); 395 list_del_init(&chk->list);
396 q->out_qlen -= chk->skb->len;
396 asoc->sent_cnt_removable--; 397 asoc->sent_cnt_removable--;
397 asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; 398 asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
398 399
@@ -431,9 +432,7 @@ void sctp_prsctp_prune(struct sctp_association *asoc,
431 return; 432 return;
432 } 433 }
433 434
434 sctp_prsctp_prune_unsent(asoc, sinfo, 435 sctp_prsctp_prune_unsent(asoc, sinfo, msg_len);
435 &asoc->outqueue.out_chunk_list,
436 msg_len);
437} 436}
438 437
439/* Mark all the eligible packets on a transport for retransmission. */ 438/* Mark all the eligible packets on a transport for retransmission. */
@@ -915,22 +914,28 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
915 case SCTP_CID_ECN_ECNE: 914 case SCTP_CID_ECN_ECNE:
916 case SCTP_CID_ASCONF: 915 case SCTP_CID_ASCONF:
917 case SCTP_CID_FWD_TSN: 916 case SCTP_CID_FWD_TSN:
917 case SCTP_CID_RECONF:
918 status = sctp_packet_transmit_chunk(packet, chunk, 918 status = sctp_packet_transmit_chunk(packet, chunk,
919 one_packet, gfp); 919 one_packet, gfp);
920 if (status != SCTP_XMIT_OK) { 920 if (status != SCTP_XMIT_OK) {
921 /* put the chunk back */ 921 /* put the chunk back */
922 list_add(&chunk->list, &q->control_chunk_list); 922 list_add(&chunk->list, &q->control_chunk_list);
923 } else { 923 break;
924 asoc->stats.octrlchunks++; 924 }
925 /* PR-SCTP C5) If a FORWARD TSN is sent, the 925
926 * sender MUST assure that at least one T3-rtx 926 asoc->stats.octrlchunks++;
927 * timer is running. 927 /* PR-SCTP C5) If a FORWARD TSN is sent, the
928 */ 928 * sender MUST assure that at least one T3-rtx
929 if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) { 929 * timer is running.
930 sctp_transport_reset_t3_rtx(transport); 930 */
931 transport->last_time_sent = jiffies; 931 if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
932 } 932 sctp_transport_reset_t3_rtx(transport);
933 transport->last_time_sent = jiffies;
933 } 934 }
935
936 if (chunk == asoc->strreset_chunk)
937 sctp_transport_reset_reconf_timer(transport);
938
934 break; 939 break;
935 940
936 default: 941 default:
@@ -1016,11 +1021,12 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
1016 1021
1017 /* Finally, transmit new packets. */ 1022 /* Finally, transmit new packets. */
1018 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { 1023 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
1024 __u32 sid = ntohs(chunk->subh.data_hdr->stream);
1025
1019 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid 1026 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid
1020 * stream identifier. 1027 * stream identifier.
1021 */ 1028 */
1022 if (chunk->sinfo.sinfo_stream >= 1029 if (chunk->sinfo.sinfo_stream >= asoc->stream->outcnt) {
1023 asoc->c.sinit_num_ostreams) {
1024 1030
1025 /* Mark as failed send. */ 1031 /* Mark as failed send. */
1026 sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); 1032 sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
@@ -1038,6 +1044,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
1038 continue; 1044 continue;
1039 } 1045 }
1040 1046
1047 if (asoc->stream->out[sid].state == SCTP_STREAM_CLOSED) {
1048 sctp_outq_head_data(q, chunk);
1049 goto sctp_flush_out;
1050 }
1051
1041 /* If there is a specified transport, use it. 1052 /* If there is a specified transport, use it.
1042 * Otherwise, we want to use the active path. 1053 * Otherwise, we want to use the active path.
1043 */ 1054 */
@@ -1641,7 +1652,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1641 1652
1642 if (forward_progress) { 1653 if (forward_progress) {
1643 if (transport->dst) 1654 if (transport->dst)
1644 dst_confirm(transport->dst); 1655 sctp_transport_dst_confirm(transport);
1645 } 1656 }
1646 } 1657 }
1647 1658
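The flush loop above now derives the stream id from the DATA chunk header, fails chunks whose id is outside stream->outcnt, and requeues and stops flushing when the outgoing stream is mid-reset (SCTP_STREAM_CLOSED). A reduced sketch of those two checks, with invented types standing in for the kernel's stream structures:

#include <stdio.h>

enum stream_state { STREAM_OPEN, STREAM_CLOSED };

struct stream_out { enum stream_state state; };

struct stream {
	unsigned int outcnt;
	struct stream_out out[4];
};

enum verdict { SEND, FAIL_INVALID_STREAM, REQUEUE_AND_STOP };

/* Mirrors the flush-time checks: an out-of-range sid fails the chunk,
 * a closed (being reset) stream puts it back and stops the flush.
 */
static enum verdict check_data_chunk(const struct stream *s, unsigned int sid)
{
	if (sid >= s->outcnt)
		return FAIL_INVALID_STREAM;
	if (s->out[sid].state == STREAM_CLOSED)
		return REQUEUE_AND_STOP;
	return SEND;
}

int main(void)
{
	struct stream s = { .outcnt = 2 };

	s.out[1].state = STREAM_CLOSED;
	printf("%d %d %d\n", check_data_chunk(&s, 0),
	       check_data_chunk(&s, 1), check_data_chunk(&s, 3));
	return 0;
}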
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index ab8d9f96a177..f0553a022859 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -211,3 +211,6 @@ DECLARE_PRIMITIVE(REQUESTHEARTBEAT);
211*/ 211*/
212 212
213DECLARE_PRIMITIVE(ASCONF); 213DECLARE_PRIMITIVE(ASCONF);
214
215/* RE-CONFIG 5.1 */
216DECLARE_PRIMITIVE(RECONF);
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 206377fe91ec..a0b29d43627f 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -361,8 +361,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
361 sctp_seq_dump_remote_addrs(seq, assoc); 361 sctp_seq_dump_remote_addrs(seq, assoc);
362 seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d " 362 seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d "
363 "%8d %8d %8d %8d", 363 "%8d %8d %8d %8d",
364 assoc->hbinterval, assoc->c.sinit_max_instreams, 364 assoc->hbinterval, assoc->stream->incnt,
365 assoc->c.sinit_num_ostreams, assoc->max_retrans, 365 assoc->stream->outcnt, assoc->max_retrans,
366 assoc->init_retries, assoc->shutdown_retries, 366 assoc->init_retries, assoc->shutdown_retries,
367 assoc->rtx_data_chunks, 367 assoc->rtx_data_chunks,
368 atomic_read(&sk->sk_wmem_alloc), 368 atomic_read(&sk->sk_wmem_alloc),
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 616a9428e0c4..989a900383b5 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -199,6 +199,7 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
199 sctp_scope_t scope, gfp_t gfp, int copy_flags) 199 sctp_scope_t scope, gfp_t gfp, int copy_flags)
200{ 200{
201 struct sctp_sockaddr_entry *addr; 201 struct sctp_sockaddr_entry *addr;
202 union sctp_addr laddr;
202 int error = 0; 203 int error = 0;
203 204
204 rcu_read_lock(); 205 rcu_read_lock();
@@ -220,7 +221,10 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
220 !(copy_flags & SCTP_ADDR6_PEERSUPP))) 221 !(copy_flags & SCTP_ADDR6_PEERSUPP)))
221 continue; 222 continue;
222 223
223 if (sctp_bind_addr_state(bp, &addr->a) != -1) 224 laddr = addr->a;
225 /* also works for setting ipv6 address port */
226 laddr.v4.sin_port = htons(bp->port);
227 if (sctp_bind_addr_state(bp, &laddr) != -1)
224 continue; 228 continue;
225 229
226 error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a), 230 error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a),
@@ -237,23 +241,19 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
237static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, 241static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
238 int is_saddr) 242 int is_saddr)
239{ 243{
240 void *from; 244 /* Always called on head skb, so this is safe */
241 __be16 *port; 245 struct sctphdr *sh = sctp_hdr(skb);
242 struct sctphdr *sh; 246 struct sockaddr_in *sa = &addr->v4;
243 247
244 port = &addr->v4.sin_port;
245 addr->v4.sin_family = AF_INET; 248 addr->v4.sin_family = AF_INET;
246 249
247 /* Always called on head skb, so this is safe */
248 sh = sctp_hdr(skb);
249 if (is_saddr) { 250 if (is_saddr) {
250 *port = sh->source; 251 sa->sin_port = sh->source;
251 from = &ip_hdr(skb)->saddr; 252 sa->sin_addr.s_addr = ip_hdr(skb)->saddr;
252 } else { 253 } else {
253 *port = sh->dest; 254 sa->sin_port = sh->dest;
254 from = &ip_hdr(skb)->daddr; 255 sa->sin_addr.s_addr = ip_hdr(skb)->daddr;
255 } 256 }
256 memcpy(&addr->v4.sin_addr.s_addr, from, sizeof(struct in_addr));
257} 257}
258 258
259/* Initialize an sctp_addr from a socket. */ 259/* Initialize an sctp_addr from a socket. */
@@ -575,10 +575,11 @@ static int sctp_v4_is_ce(const struct sk_buff *skb)
575 575
576/* Create and initialize a new sk for the socket returned by accept(). */ 576/* Create and initialize a new sk for the socket returned by accept(). */
577static struct sock *sctp_v4_create_accept_sk(struct sock *sk, 577static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
578 struct sctp_association *asoc) 578 struct sctp_association *asoc,
579 bool kern)
579{ 580{
580 struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, 581 struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL,
581 sk->sk_prot, 0); 582 sk->sk_prot, kern);
582 struct inet_sock *newinet; 583 struct inet_sock *newinet;
583 584
584 if (!newsk) 585 if (!newsk)
@@ -1262,6 +1263,9 @@ static int __net_init sctp_defaults_init(struct net *net)
1262 /* Enable PR-SCTP by default. */ 1263 /* Enable PR-SCTP by default. */
1263 net->sctp.prsctp_enable = 1; 1264 net->sctp.prsctp_enable = 1;
1264 1265
1266 /* Disable RECONF by default. */
1267 net->sctp.reconf_enable = 0;
1268
1265 /* Disable AUTH by default. */ 1269 /* Disable AUTH by default. */
1266 net->sctp.auth_enable = 0; 1270 net->sctp.auth_enable = 0;
1267 1271
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9e9690b7afe1..118faff6a332 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -270,6 +270,11 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
270 num_ext += 2; 270 num_ext += 2;
271 } 271 }
272 272
273 if (asoc->reconf_enable) {
274 extensions[num_ext] = SCTP_CID_RECONF;
275 num_ext += 1;
276 }
277
273 if (sp->adaptation_ind) 278 if (sp->adaptation_ind)
274 chunksize += sizeof(aiparam); 279 chunksize += sizeof(aiparam);
275 280
@@ -434,6 +439,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
434 num_ext += 2; 439 num_ext += 2;
435 } 440 }
436 441
442 if (asoc->peer.reconf_capable) {
443 extensions[num_ext] = SCTP_CID_RECONF;
444 num_ext += 1;
445 }
446
437 if (sp->adaptation_ind) 447 if (sp->adaptation_ind)
438 chunksize += sizeof(aiparam); 448 chunksize += sizeof(aiparam);
439 449
@@ -1536,7 +1546,7 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
1536 1546
1537 /* All fragments will be on the same stream */ 1547 /* All fragments will be on the same stream */
1538 sid = ntohs(chunk->subh.data_hdr->stream); 1548 sid = ntohs(chunk->subh.data_hdr->stream);
1539 stream = &chunk->asoc->ssnmap->out; 1549 stream = chunk->asoc->stream;
1540 1550
1541 /* Now assign the sequence number to the entire message. 1551 /* Now assign the sequence number to the entire message.
1542 * All fragments must have the same stream sequence number. 1552 * All fragments must have the same stream sequence number.
@@ -1547,9 +1557,9 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
1547 ssn = 0; 1557 ssn = 0;
1548 } else { 1558 } else {
1549 if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) 1559 if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
1550 ssn = sctp_ssn_next(stream, sid); 1560 ssn = sctp_ssn_next(stream, out, sid);
1551 else 1561 else
1552 ssn = sctp_ssn_peek(stream, sid); 1562 ssn = sctp_ssn_peek(stream, out, sid);
1553 } 1563 }
1554 1564
1555 lchunk->subh.data_hdr->ssn = htons(ssn); 1565 lchunk->subh.data_hdr->ssn = htons(ssn);
@@ -1844,6 +1854,7 @@ no_hmac:
1844 retval->next_tsn = retval->c.initial_tsn; 1854 retval->next_tsn = retval->c.initial_tsn;
1845 retval->ctsn_ack_point = retval->next_tsn - 1; 1855 retval->ctsn_ack_point = retval->next_tsn - 1;
1846 retval->addip_serial = retval->c.initial_tsn; 1856 retval->addip_serial = retval->c.initial_tsn;
1857 retval->strreset_outseq = retval->c.initial_tsn;
1847 retval->adv_peer_ack_point = retval->ctsn_ack_point; 1858 retval->adv_peer_ack_point = retval->ctsn_ack_point;
1848 retval->peer.prsctp_capable = retval->c.prsctp_capable; 1859 retval->peer.prsctp_capable = retval->c.prsctp_capable;
1849 retval->peer.adaptation_ind = retval->c.adaptation_ind; 1860 retval->peer.adaptation_ind = retval->c.adaptation_ind;
@@ -2011,6 +2022,11 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
2011 2022
2012 for (i = 0; i < num_ext; i++) { 2023 for (i = 0; i < num_ext; i++) {
2013 switch (param.ext->chunks[i]) { 2024 switch (param.ext->chunks[i]) {
2025 case SCTP_CID_RECONF:
2026 if (asoc->reconf_enable &&
2027 !asoc->peer.reconf_capable)
2028 asoc->peer.reconf_capable = 1;
2029 break;
2014 case SCTP_CID_FWD_TSN: 2030 case SCTP_CID_FWD_TSN:
2015 if (asoc->prsctp_enable && !asoc->peer.prsctp_capable) 2031 if (asoc->prsctp_enable && !asoc->peer.prsctp_capable)
2016 asoc->peer.prsctp_capable = 1; 2032 asoc->peer.prsctp_capable = 1;
@@ -2387,6 +2403,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
2387 asoc->peer.i.initial_tsn = 2403 asoc->peer.i.initial_tsn =
2388 ntohl(peer_init->init_hdr.initial_tsn); 2404 ntohl(peer_init->init_hdr.initial_tsn);
2389 2405
2406 asoc->strreset_inseq = asoc->peer.i.initial_tsn;
2407
2390 /* Apply the upper bounds for output streams based on peer's 2408 /* Apply the upper bounds for output streams based on peer's
2391 * number of inbound streams. 2409 * number of inbound streams.
2392 */ 2410 */
@@ -2442,15 +2460,10 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
2442 * association. 2460 * association.
2443 */ 2461 */
2444 if (!asoc->temp) { 2462 if (!asoc->temp) {
2445 int error; 2463 if (sctp_stream_init(asoc, gfp))
2446
2447 asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams,
2448 asoc->c.sinit_num_ostreams, gfp);
2449 if (!asoc->ssnmap)
2450 goto clean_up; 2464 goto clean_up;
2451 2465
2452 error = sctp_assoc_set_id(asoc, gfp); 2466 if (sctp_assoc_set_id(asoc, gfp))
2453 if (error)
2454 goto clean_up; 2467 goto clean_up;
2455 } 2468 }
2456 2469
@@ -3210,7 +3223,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
3210 union sctp_params param; 3223 union sctp_params param;
3211 sctp_addiphdr_t *hdr; 3224 sctp_addiphdr_t *hdr;
3212 union sctp_addr_param *addr_param; 3225 union sctp_addr_param *addr_param;
3213 sctp_addip_param_t *asconf_param;
3214 struct sctp_chunk *asconf_ack; 3226 struct sctp_chunk *asconf_ack;
3215 __be16 err_code; 3227 __be16 err_code;
3216 int length = 0; 3228 int length = 0;
@@ -3230,7 +3242,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
3230 * asconf parameter. 3242 * asconf parameter.
3231 */ 3243 */
3232 length = ntohs(addr_param->p.length); 3244 length = ntohs(addr_param->p.length);
3233 asconf_param = (void *)addr_param + length;
3234 chunk_len -= length; 3245 chunk_len -= length;
3235 3246
3236 /* create an ASCONF_ACK chunk. 3247 /* create an ASCONF_ACK chunk.
@@ -3317,8 +3328,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
3317 local_bh_enable(); 3328 local_bh_enable();
3318 list_for_each_entry(transport, &asoc->peer.transport_addr_list, 3329 list_for_each_entry(transport, &asoc->peer.transport_addr_list,
3319 transports) { 3330 transports) {
3320 dst_release(transport->dst); 3331 sctp_transport_dst_release(transport);
3321 transport->dst = NULL;
3322 } 3332 }
3323 break; 3333 break;
3324 case SCTP_PARAM_DEL_IP: 3334 case SCTP_PARAM_DEL_IP:
@@ -3332,8 +3342,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
3332 local_bh_enable(); 3342 local_bh_enable();
3333 list_for_each_entry(transport, &asoc->peer.transport_addr_list, 3343 list_for_each_entry(transport, &asoc->peer.transport_addr_list,
3334 transports) { 3344 transports) {
3335 dst_release(transport->dst); 3345 sctp_transport_dst_release(transport);
3336 transport->dst = NULL;
3337 } 3346 }
3338 break; 3347 break;
3339 default: 3348 default:
@@ -3526,3 +3535,323 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
3526 3535
3527 return retval; 3536 return retval;
3528} 3537}
3538
3539/* RE-CONFIG 3.1 (RE-CONFIG chunk)
3540 * 0 1 2 3
3541 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3542 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3543 * | Type = 130 | Chunk Flags | Chunk Length |
3544 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3545 * \ \
3546 * / Re-configuration Parameter /
3547 * \ \
3548 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3549 * \ \
3550 * / Re-configuration Parameter (optional) /
3551 * \ \
3552 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3553 */
3554static struct sctp_chunk *sctp_make_reconf(
3555 const struct sctp_association *asoc,
3556 int length)
3557{
3558 struct sctp_reconf_chunk *reconf;
3559 struct sctp_chunk *retval;
3560
3561 retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length,
3562 GFP_ATOMIC);
3563 if (!retval)
3564 return NULL;
3565
3566 reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr;
3567 retval->param_hdr.v = reconf->params;
3568
3569 return retval;
3570}
3571
3572/* RE-CONFIG 4.1 (STREAM OUT RESET)
3573 * 0 1 2 3
3574 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3575 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3576 * | Parameter Type = 13 | Parameter Length = 16 + 2 * N |
3577 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3578 * | Re-configuration Request Sequence Number |
3579 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3580 * | Re-configuration Response Sequence Number |
3581 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3582 * | Sender's Last Assigned TSN |
3583 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3584 * | Stream Number 1 (optional) | Stream Number 2 (optional) |
3585 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3586 * / ...... /
3587 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3588 * | Stream Number N-1 (optional) | Stream Number N (optional) |
3589 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3590 *
3591 * RE-CONFIG 4.2 (STREAM IN RESET)
3592 * 0 1 2 3
3593 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3594 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3595 * | Parameter Type = 14 | Parameter Length = 8 + 2 * N |
3596 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3597 * | Re-configuration Request Sequence Number |
3598 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3599 * | Stream Number 1 (optional) | Stream Number 2 (optional) |
3600 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3601 * / ...... /
3602 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3603 * | Stream Number N-1 (optional) | Stream Number N (optional) |
3604 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3605 */
3606struct sctp_chunk *sctp_make_strreset_req(
3607 const struct sctp_association *asoc,
3608 __u16 stream_num, __u16 *stream_list,
3609 bool out, bool in)
3610{
3611 struct sctp_strreset_outreq outreq;
3612 __u16 stream_len = stream_num * 2;
3613 struct sctp_strreset_inreq inreq;
3614 struct sctp_chunk *retval;
3615 __u16 outlen, inlen;
3616
3617 outlen = (sizeof(outreq) + stream_len) * out;
3618 inlen = (sizeof(inreq) + stream_len) * in;
3619
3620 retval = sctp_make_reconf(asoc, outlen + inlen);
3621 if (!retval)
3622 return NULL;
3623
3624 if (outlen) {
3625 outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST;
3626 outreq.param_hdr.length = htons(outlen);
3627 outreq.request_seq = htonl(asoc->strreset_outseq);
3628 outreq.response_seq = htonl(asoc->strreset_inseq - 1);
3629 outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1);
3630
3631 sctp_addto_chunk(retval, sizeof(outreq), &outreq);
3632
3633 if (stream_len)
3634 sctp_addto_chunk(retval, stream_len, stream_list);
3635 }
3636
3637 if (inlen) {
3638 inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST;
3639 inreq.param_hdr.length = htons(inlen);
3640 inreq.request_seq = htonl(asoc->strreset_outseq + out);
3641
3642 sctp_addto_chunk(retval, sizeof(inreq), &inreq);
3643
3644 if (stream_len)
3645 sctp_addto_chunk(retval, stream_len, stream_list);
3646 }
3647
3648 return retval;
3649}
3650
3651/* RE-CONFIG 4.3 (SSN/TSN RESET ALL)
3652 * 0 1 2 3
3653 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3654 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3655 * | Parameter Type = 15 | Parameter Length = 8 |
3656 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3657 * | Re-configuration Request Sequence Number |
3658 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3659 */
3660struct sctp_chunk *sctp_make_strreset_tsnreq(
3661 const struct sctp_association *asoc)
3662{
3663 struct sctp_strreset_tsnreq tsnreq;
3664 __u16 length = sizeof(tsnreq);
3665 struct sctp_chunk *retval;
3666
3667 retval = sctp_make_reconf(asoc, length);
3668 if (!retval)
3669 return NULL;
3670
3671 tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST;
3672 tsnreq.param_hdr.length = htons(length);
3673 tsnreq.request_seq = htonl(asoc->strreset_outseq);
3674
3675 sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq);
3676
3677 return retval;
3678}
3679
3680/* RE-CONFIG 4.5/4.6 (ADD STREAM)
3681 * 0 1 2 3
3682 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3683 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3684 * | Parameter Type = 17 | Parameter Length = 12 |
3685 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3686 * | Re-configuration Request Sequence Number |
3687 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3688 * | Number of new streams | Reserved |
3689 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3690 */
3691struct sctp_chunk *sctp_make_strreset_addstrm(
3692 const struct sctp_association *asoc,
3693 __u16 out, __u16 in)
3694{
3695 struct sctp_strreset_addstrm addstrm;
3696 __u16 size = sizeof(addstrm);
3697 struct sctp_chunk *retval;
3698
3699 retval = sctp_make_reconf(asoc, (!!out + !!in) * size);
3700 if (!retval)
3701 return NULL;
3702
3703 if (out) {
3704 addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS;
3705 addstrm.param_hdr.length = htons(size);
3706 addstrm.number_of_streams = htons(out);
3707 addstrm.request_seq = htonl(asoc->strreset_outseq);
3708 addstrm.reserved = 0;
3709
3710 sctp_addto_chunk(retval, size, &addstrm);
3711 }
3712
3713 if (in) {
3714 addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS;
3715 addstrm.param_hdr.length = htons(size);
3716 addstrm.number_of_streams = htons(in);
3717 addstrm.request_seq = htonl(asoc->strreset_outseq + !!out);
3718 addstrm.reserved = 0;
3719
3720 sctp_addto_chunk(retval, size, &addstrm);
3721 }
3722
3723 return retval;
3724}
3725
3726/* RE-CONFIG 4.4 (RESP)
3727 * 0 1 2 3
3728 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3729 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3730 * | Parameter Type = 16 | Parameter Length |
3731 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3732 * | Re-configuration Response Sequence Number |
3733 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3734 * | Result |
3735 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3736 */
3737struct sctp_chunk *sctp_make_strreset_resp(
3738 const struct sctp_association *asoc,
3739 __u32 result, __u32 sn)
3740{
3741 struct sctp_strreset_resp resp;
3742 __u16 length = sizeof(resp);
3743 struct sctp_chunk *retval;
3744
3745 retval = sctp_make_reconf(asoc, length);
3746 if (!retval)
3747 return NULL;
3748
3749 resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
3750 resp.param_hdr.length = htons(length);
3751 resp.response_seq = htonl(sn);
3752 resp.result = htonl(result);
3753
3754 sctp_addto_chunk(retval, sizeof(resp), &resp);
3755
3756 return retval;
3757}
3758
3759/* RE-CONFIG 4.4 OPTIONAL (TSNRESP)
3760 * 0 1 2 3
3761 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3762 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3763 * | Parameter Type = 16 | Parameter Length |
3764 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3765 * | Re-configuration Response Sequence Number |
3766 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3767 * | Result |
3768 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3769 * | Sender's Next TSN (optional) |
3770 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3771 * | Receiver's Next TSN (optional) |
3772 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3773 */
3774struct sctp_chunk *sctp_make_strreset_tsnresp(
3775 struct sctp_association *asoc,
3776 __u32 result, __u32 sn,
3777 __u32 sender_tsn, __u32 receiver_tsn)
3778{
3779 struct sctp_strreset_resptsn tsnresp;
3780 __u16 length = sizeof(tsnresp);
3781 struct sctp_chunk *retval;
3782
3783 retval = sctp_make_reconf(asoc, length);
3784 if (!retval)
3785 return NULL;
3786
3787 tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
3788 tsnresp.param_hdr.length = htons(length);
3789
3790 tsnresp.response_seq = htonl(sn);
3791 tsnresp.result = htonl(result);
3792 tsnresp.senders_next_tsn = htonl(sender_tsn);
3793 tsnresp.receivers_next_tsn = htonl(receiver_tsn);
3794
3795 sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp);
3796
3797 return retval;
3798}
3799
3800bool sctp_verify_reconf(const struct sctp_association *asoc,
3801 struct sctp_chunk *chunk,
3802 struct sctp_paramhdr **errp)
3803{
3804 struct sctp_reconf_chunk *hdr;
3805 union sctp_params param;
3806 __u16 last = 0, cnt = 0;
3807
3808 hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
3809 sctp_walk_params(param, hdr, params) {
3810 __u16 length = ntohs(param.p->length);
3811
3812 *errp = param.p;
3813 if (cnt++ > 2)
3814 return false;
3815 switch (param.p->type) {
3816 case SCTP_PARAM_RESET_OUT_REQUEST:
3817 if (length < sizeof(struct sctp_strreset_outreq) ||
3818 (last && last != SCTP_PARAM_RESET_RESPONSE &&
3819 last != SCTP_PARAM_RESET_IN_REQUEST))
3820 return false;
3821 break;
3822 case SCTP_PARAM_RESET_IN_REQUEST:
3823 if (length < sizeof(struct sctp_strreset_inreq) ||
3824 (last && last != SCTP_PARAM_RESET_OUT_REQUEST))
3825 return false;
3826 break;
3827 case SCTP_PARAM_RESET_RESPONSE:
3828 if ((length != sizeof(struct sctp_strreset_resp) &&
3829 length != sizeof(struct sctp_strreset_resptsn)) ||
3830 (last && last != SCTP_PARAM_RESET_RESPONSE &&
3831 last != SCTP_PARAM_RESET_OUT_REQUEST))
3832 return false;
3833 break;
3834 case SCTP_PARAM_RESET_TSN_REQUEST:
3835 if (length !=
3836 sizeof(struct sctp_strreset_tsnreq) || last)
3837 return false;
3838 break;
3839 case SCTP_PARAM_RESET_ADD_IN_STREAMS:
3840 if (length != sizeof(struct sctp_strreset_addstrm) ||
3841 (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS))
3842 return false;
3843 break;
3844 case SCTP_PARAM_RESET_ADD_OUT_STREAMS:
3845 if (length != sizeof(struct sctp_strreset_addstrm) ||
3846 (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS))
3847 return false;
3848 break;
3849 default:
3850 return false;
3851 }
3852
3853 last = param.p->type;
3854 }
3855
3856 return true;
3857}
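sctp_make_strreset_req() above sizes the chunk as (sizeof(outreq) + 2*N)*out + (sizeof(inreq) + 2*N)*in, where N is the number of 16-bit stream numbers, matching the "Parameter Length = 16 + 2 * N" and "8 + 2 * N" lines in the RE-CONFIG 4.1/4.2 diagrams. A small arithmetic sketch of that computation (constants taken from the diagrams, not from the kernel headers):

#include <stdio.h>
#include <stdint.h>

#define OUTREQ_HDR 16   /* RE-CONFIG 4.1 fixed part */
#define INREQ_HDR   8   /* RE-CONFIG 4.2 fixed part */

/* Total parameter bytes a combined out/in stream-reset request occupies,
 * mirroring the outlen/inlen computation in the patch.
 */
static size_t strreset_req_len(uint16_t stream_num, int out, int in)
{
	size_t stream_len = (size_t)stream_num * 2;

	return (OUTREQ_HDR + stream_len) * !!out +
	       (INREQ_HDR + stream_len) * !!in;
}

int main(void)
{
	/* Resetting streams {0,1,2} in both directions: 22 + 14 = 36 bytes */
	printf("%zu\n", strreset_req_len(3, 1, 1));
	return 0;
}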
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c345bf153bed..25384fa82ba9 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -436,6 +436,37 @@ out_unlock:
436 sctp_association_put(asoc); 436 sctp_association_put(asoc);
437} 437}
438 438
439 /* Handle the timeout of the RE-CONFIG timer. */
440void sctp_generate_reconf_event(unsigned long data)
441{
442 struct sctp_transport *transport = (struct sctp_transport *)data;
443 struct sctp_association *asoc = transport->asoc;
444 struct sock *sk = asoc->base.sk;
445 struct net *net = sock_net(sk);
446 int error = 0;
447
448 bh_lock_sock(sk);
449 if (sock_owned_by_user(sk)) {
450 pr_debug("%s: sock is busy\n", __func__);
451
452 /* Try again later. */
453 if (!mod_timer(&transport->reconf_timer, jiffies + (HZ / 20)))
454 sctp_transport_hold(transport);
455 goto out_unlock;
456 }
457
458 error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
459 SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF),
460 asoc->state, asoc->ep, asoc,
461 transport, GFP_ATOMIC);
462
463 if (error)
464 sk->sk_err = -error;
465
466out_unlock:
467 bh_unlock_sock(sk);
468 sctp_transport_put(transport);
469}
439 470
440/* Inject a SACK Timeout event into the state machine. */ 471/* Inject a SACK Timeout event into the state machine. */
441static void sctp_generate_sack_event(unsigned long data) 472static void sctp_generate_sack_event(unsigned long data)
@@ -453,6 +484,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
453 sctp_generate_t4_rto_event, 484 sctp_generate_t4_rto_event,
454 sctp_generate_t5_shutdown_guard_event, 485 sctp_generate_t5_shutdown_guard_event,
455 NULL, 486 NULL,
487 NULL,
456 sctp_generate_sack_event, 488 sctp_generate_sack_event,
457 sctp_generate_autoclose_event, 489 sctp_generate_autoclose_event,
458}; 490};
@@ -723,7 +755,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
723 * forward progress. 755 * forward progress.
724 */ 756 */
725 if (t->dst) 757 if (t->dst)
726 dst_confirm(t->dst); 758 sctp_transport_dst_confirm(t);
727 759
728 /* The receiver of the HEARTBEAT ACK should also perform an 760 /* The receiver of the HEARTBEAT ACK should also perform an
729 * RTT measurement for that destination transport address 761 * RTT measurement for that destination transport address
@@ -840,6 +872,10 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
840 if (!sctp_style(sk, UDP)) 872 if (!sctp_style(sk, UDP))
841 sk->sk_state_change(sk); 873 sk->sk_state_change(sk);
842 } 874 }
875
876 if (sctp_state(asoc, SHUTDOWN_PENDING) &&
877 !sctp_outq_is_empty(&asoc->outqueue))
878 sctp_outq_uncork(&asoc->outqueue, GFP_ATOMIC);
843} 879}
844 880
845/* Helper function to delete an association. */ 881/* Helper function to delete an association. */
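sctp_generate_reconf_event() above follows the usual SCTP timer pattern: if the socket is currently owned by user context it re-arms the timer for HZ/20 later and keeps a transport reference, otherwise it runs the RECONF timeout through sctp_do_sm(). The userspace sketch below only mimics that control flow; the refcount, timer flag and busy flag are illustrative, not the kernel primitives.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel objects involved. */
struct transport { int refcnt; bool timer_armed; };

static bool sock_busy;           /* stands in for sock_owned_by_user() */

static void rearm_shortly(struct transport *t)
{
	/* mod_timer(..., jiffies + HZ/20) equivalent: retry soon and keep
	 * a reference for the newly pending timer.
	 */
	if (!t->timer_armed) {
		t->timer_armed = true;
		t->refcnt++;
	}
}

/* Shape of sctp_generate_reconf_event(): defer while the socket is owned
 * by user context, otherwise run the RECONF timeout state machine.
 */
static void reconf_timer_fired(struct transport *t)
{
	if (sock_busy)
		rearm_shortly(t);
	else
		printf("run SCTP_EVENT_TIMEOUT_RECONF state machine\n");
	t->refcnt--;             /* drop the reference held for this run */
}

int main(void)
{
	struct transport t = { .refcnt = 1 };

	sock_busy = true;
	reconf_timer_fired(&t);   /* deferred */
	sock_busy = false;
	t.timer_armed = false;
	reconf_timer_fired(&t);   /* runs the timeout handler */
	printf("refcnt=%d\n", t.refcnt);
	return 0;
}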
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8ec20a64a3f8..24c6ccce7539 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -160,23 +160,22 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net,
160/* Small helper function that checks if the chunk length 160/* Small helper function that checks if the chunk length
161 * is of the appropriate length. The 'required_length' argument 161 * is of the appropriate length. The 'required_length' argument
162 * is set to be the size of a specific chunk we are testing. 162 * is set to be the size of a specific chunk we are testing.
163 * Return Values: 1 = Valid length 163 * Return Values: true = Valid length
164 * 0 = Invalid length 164 * false = Invalid length
165 * 165 *
166 */ 166 */
167static inline int 167static inline bool
168sctp_chunk_length_valid(struct sctp_chunk *chunk, 168sctp_chunk_length_valid(struct sctp_chunk *chunk, __u16 required_length)
169 __u16 required_length)
170{ 169{
171 __u16 chunk_length = ntohs(chunk->chunk_hdr->length); 170 __u16 chunk_length = ntohs(chunk->chunk_hdr->length);
172 171
173 /* Previously already marked? */ 172 /* Previously already marked? */
174 if (unlikely(chunk->pdiscard)) 173 if (unlikely(chunk->pdiscard))
175 return 0; 174 return false;
176 if (unlikely(chunk_length < required_length)) 175 if (unlikely(chunk_length < required_length))
177 return 0; 176 return false;
178 177
179 return 1; 178 return true;
180} 179}
181 180
182/********************************************************** 181/**********************************************************
@@ -1022,6 +1021,34 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net,
1022 return SCTP_DISPOSITION_CONSUME; 1021 return SCTP_DISPOSITION_CONSUME;
1023} 1022}
1024 1023
1024/* resend asoc strreset_chunk. */
1025sctp_disposition_t sctp_sf_send_reconf(struct net *net,
1026 const struct sctp_endpoint *ep,
1027 const struct sctp_association *asoc,
1028 const sctp_subtype_t type, void *arg,
1029 sctp_cmd_seq_t *commands)
1030{
1031 struct sctp_transport *transport = arg;
1032
1033 if (asoc->overall_error_count >= asoc->max_retrans) {
1034 sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
1035 SCTP_ERROR(ETIMEDOUT));
1036 /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
1037 sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
1038 SCTP_PERR(SCTP_ERROR_NO_ERROR));
1039 SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
1040 SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
1041 return SCTP_DISPOSITION_DELETE_TCB;
1042 }
1043
1044 sctp_chunk_hold(asoc->strreset_chunk);
1045 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
1046 SCTP_CHUNK(asoc->strreset_chunk));
1047 sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(transport));
1048
1049 return SCTP_DISPOSITION_CONSUME;
1050}
1051
1025/* 1052/*
1026 * Process an heartbeat request. 1053 * Process an heartbeat request.
1027 * 1054 *
@@ -3237,36 +3264,34 @@ static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net,
3237 struct sctp_chunk *abort; 3264 struct sctp_chunk *abort;
3238 3265
3239 packet = sctp_ootb_pkt_new(net, asoc, chunk); 3266 packet = sctp_ootb_pkt_new(net, asoc, chunk);
3267 if (!packet)
3268 return SCTP_DISPOSITION_NOMEM;
3240 3269
3241 if (packet) { 3270 /* Make an ABORT. The T bit will be set if the asoc
3242 /* Make an ABORT. The T bit will be set if the asoc 3271 * is NULL.
3243 * is NULL. 3272 */
3244 */ 3273 abort = sctp_make_abort(asoc, chunk, 0);
3245 abort = sctp_make_abort(asoc, chunk, 0); 3274 if (!abort) {
3246 if (!abort) { 3275 sctp_ootb_pkt_free(packet);
3247 sctp_ootb_pkt_free(packet); 3276 return SCTP_DISPOSITION_NOMEM;
3248 return SCTP_DISPOSITION_NOMEM; 3277 }
3249 }
3250
3251 /* Reflect vtag if T-Bit is set */
3252 if (sctp_test_T_bit(abort))
3253 packet->vtag = ntohl(chunk->sctp_hdr->vtag);
3254 3278
3255 /* Set the skb to the belonging sock for accounting. */ 3279 /* Reflect vtag if T-Bit is set */
3256 abort->skb->sk = ep->base.sk; 3280 if (sctp_test_T_bit(abort))
3281 packet->vtag = ntohl(chunk->sctp_hdr->vtag);
3257 3282
3258 sctp_packet_append_chunk(packet, abort); 3283 /* Set the skb to the belonging sock for accounting. */
3284 abort->skb->sk = ep->base.sk;
3259 3285
3260 sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, 3286 sctp_packet_append_chunk(packet, abort);
3261 SCTP_PACKET(packet));
3262 3287
3263 SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); 3288 sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
3289 SCTP_PACKET(packet));
3264 3290
3265 sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); 3291 SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
3266 return SCTP_DISPOSITION_CONSUME;
3267 }
3268 3292
3269 return SCTP_DISPOSITION_NOMEM; 3293 sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3294 return SCTP_DISPOSITION_CONSUME;
3270} 3295}
3271 3296
3272/* 3297/*
@@ -3503,45 +3528,43 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net,
3503 struct sctp_chunk *shut; 3528 struct sctp_chunk *shut;
3504 3529
3505 packet = sctp_ootb_pkt_new(net, asoc, chunk); 3530 packet = sctp_ootb_pkt_new(net, asoc, chunk);
3531 if (!packet)
3532 return SCTP_DISPOSITION_NOMEM;
3506 3533
3507 if (packet) { 3534 /* Make an SHUTDOWN_COMPLETE.
3508 /* Make an SHUTDOWN_COMPLETE. 3535 * The T bit will be set if the asoc is NULL.
3509 * The T bit will be set if the asoc is NULL. 3536 */
3510 */ 3537 shut = sctp_make_shutdown_complete(asoc, chunk);
3511 shut = sctp_make_shutdown_complete(asoc, chunk); 3538 if (!shut) {
3512 if (!shut) { 3539 sctp_ootb_pkt_free(packet);
3513 sctp_ootb_pkt_free(packet); 3540 return SCTP_DISPOSITION_NOMEM;
3514 return SCTP_DISPOSITION_NOMEM; 3541 }
3515 }
3516
3517 /* Reflect vtag if T-Bit is set */
3518 if (sctp_test_T_bit(shut))
3519 packet->vtag = ntohl(chunk->sctp_hdr->vtag);
3520 3542
3521 /* Set the skb to the belonging sock for accounting. */ 3543 /* Reflect vtag if T-Bit is set */
3522 shut->skb->sk = ep->base.sk; 3544 if (sctp_test_T_bit(shut))
3545 packet->vtag = ntohl(chunk->sctp_hdr->vtag);
3523 3546
3524 sctp_packet_append_chunk(packet, shut); 3547 /* Set the skb to the belonging sock for accounting. */
3548 shut->skb->sk = ep->base.sk;
3525 3549
3526 sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, 3550 sctp_packet_append_chunk(packet, shut);
3527 SCTP_PACKET(packet));
3528 3551
3529 SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); 3552 sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
3553 SCTP_PACKET(packet));
3530 3554
3531 /* If the chunk length is invalid, we don't want to process 3555 SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
 3532 * the rest of the packet.
3533 */
3534 if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
3535 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3536 3556
3537 /* We need to discard the rest of the packet to prevent 3557 /* If the chunk length is invalid, we don't want to process
 3538 * potential bombing attacks from additional bundled chunks. 3558 * the rest of the packet.
3539 * This is documented in SCTP Threats ID. 3559 */
3540 */ 3560 if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
3541 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); 3561 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3542 }
3543 3562
3544 return SCTP_DISPOSITION_NOMEM; 3563 /* We need to discard the rest of the packet to prevent
 3564 * potential bombing attacks from additional bundled chunks.
3565 * This is documented in SCTP Threats ID.
3566 */
3567 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3545} 3568}
3546 3569
3547/* 3570/*
@@ -3811,6 +3834,60 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net,
3811 return SCTP_DISPOSITION_DISCARD; 3834 return SCTP_DISPOSITION_DISCARD;
3812} 3835}
3813 3836
 3837/* RE-CONFIG Section 5.2 Upon reception of a RECONF Chunk. */
3838sctp_disposition_t sctp_sf_do_reconf(struct net *net,
3839 const struct sctp_endpoint *ep,
3840 const struct sctp_association *asoc,
3841 const sctp_subtype_t type, void *arg,
3842 sctp_cmd_seq_t *commands)
3843{
3844 struct sctp_paramhdr *err_param = NULL;
3845 struct sctp_chunk *chunk = arg;
3846 struct sctp_reconf_chunk *hdr;
3847 union sctp_params param;
3848
3849 if (!sctp_vtag_verify(chunk, asoc)) {
3850 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
3851 SCTP_NULL());
3852 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3853 }
3854
3855 /* Make sure that the RECONF chunk has a valid length. */
3856 if (!sctp_chunk_length_valid(chunk, sizeof(*hdr)))
3857 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
3858 commands);
3859
3860 if (!sctp_verify_reconf(asoc, chunk, &err_param))
3861 return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
3862 (void *)err_param, commands);
3863
3864 hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
3865 sctp_walk_params(param, hdr, params) {
3866 struct sctp_chunk *reply = NULL;
3867 struct sctp_ulpevent *ev = NULL;
3868
3869 if (param.p->type == SCTP_PARAM_RESET_OUT_REQUEST)
3870 reply = sctp_process_strreset_outreq(
3871 (struct sctp_association *)asoc, param, &ev);
3872 else if (param.p->type == SCTP_PARAM_RESET_IN_REQUEST)
3873 reply = sctp_process_strreset_inreq(
3874 (struct sctp_association *)asoc, param, &ev);
 3875 /* More handlers for other types will be added here; for now it
3876 * just ignores other types.
3877 */
3878
3879 if (ev)
3880 sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
3881 SCTP_ULPEVENT(ev));
3882
3883 if (reply)
3884 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
3885 SCTP_CHUNK(reply));
3886 }
3887
3888 return SCTP_DISPOSITION_CONSUME;
3889}
3890
3814/* 3891/*
3815 * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP 3892 * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP
3816 * 3893 *
@@ -3844,6 +3921,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net,
3844 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); 3921 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3845 } 3922 }
3846 3923
3924 if (!asoc->peer.prsctp_capable)
3925 return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
3926
3847 /* Make sure that the FORWARD_TSN chunk has valid length. */ 3927 /* Make sure that the FORWARD_TSN chunk has valid length. */
3848 if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) 3928 if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
3849 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, 3929 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -3866,7 +3946,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net,
3866 3946
3867 /* Silently discard the chunk if stream-id is not valid */ 3947 /* Silently discard the chunk if stream-id is not valid */
3868 sctp_walk_fwdtsn(skip, chunk) { 3948 sctp_walk_fwdtsn(skip, chunk) {
3869 if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams) 3949 if (ntohs(skip->stream) >= asoc->stream->incnt)
3870 goto discard_noforce; 3950 goto discard_noforce;
3871 } 3951 }
3872 3952
@@ -3912,6 +3992,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast(
3912 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); 3992 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3913 } 3993 }
3914 3994
3995 if (!asoc->peer.prsctp_capable)
3996 return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
3997
3915 /* Make sure that the FORWARD_TSN chunk has a valid length. */ 3998 /* Make sure that the FORWARD_TSN chunk has a valid length. */
3916 if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) 3999 if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
3917 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, 4000 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -3934,7 +4017,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast(
3934 4017
3935 /* Silently discard the chunk if stream-id is not valid */ 4018 /* Silently discard the chunk if stream-id is not valid */
3936 sctp_walk_fwdtsn(skip, chunk) { 4019 sctp_walk_fwdtsn(skip, chunk) {
3937 if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams) 4020 if (ntohs(skip->stream) >= asoc->stream->incnt)
3938 goto gen_shutdown; 4021 goto gen_shutdown;
3939 } 4022 }
3940 4023
@@ -5162,6 +5245,19 @@ sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net,
5162 return SCTP_DISPOSITION_CONSUME; 5245 return SCTP_DISPOSITION_CONSUME;
5163} 5246}
5164 5247
5248/* RE-CONFIG Section 5.1 RECONF Chunk Procedures */
5249sctp_disposition_t sctp_sf_do_prm_reconf(struct net *net,
5250 const struct sctp_endpoint *ep,
5251 const struct sctp_association *asoc,
5252 const sctp_subtype_t type,
5253 void *arg, sctp_cmd_seq_t *commands)
5254{
5255 struct sctp_chunk *chunk = arg;
5256
5257 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk));
5258 return SCTP_DISPOSITION_CONSUME;
5259}
5260
5165/* 5261/*
5166 * Ignore the primitive event 5262 * Ignore the primitive event
5167 * 5263 *
@@ -6036,8 +6132,9 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net,
6036 sctp_transport_route(transport, (union sctp_addr *)&chunk->dest, 6132 sctp_transport_route(transport, (union sctp_addr *)&chunk->dest,
6037 sctp_sk(net->sctp.ctl_sock)); 6133 sctp_sk(net->sctp.ctl_sock));
6038 6134
6039 packet = sctp_packet_init(&transport->packet, transport, sport, dport); 6135 packet = &transport->packet;
6040 packet = sctp_packet_config(packet, vtag, 0); 6136 sctp_packet_init(packet, transport, sport, dport);
6137 sctp_packet_config(packet, vtag, 0);
6041 6138
6042 return packet; 6139 return packet;
6043 6140
@@ -6256,7 +6353,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
6256 * and discard the DATA chunk. 6353 * and discard the DATA chunk.
6257 */ 6354 */
6258 sid = ntohs(data_hdr->stream); 6355 sid = ntohs(data_hdr->stream);
6259 if (sid >= asoc->c.sinit_max_instreams) { 6356 if (sid >= asoc->stream->incnt) {
6260 /* Mark tsn as received even though we drop it */ 6357 /* Mark tsn as received even though we drop it */
6261 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn)); 6358 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));
6262 6359
@@ -6278,9 +6375,8 @@ static int sctp_eat_data(const struct sctp_association *asoc,
6278 * and is invalid. 6375 * and is invalid.
6279 */ 6376 */
6280 ssn = ntohs(data_hdr->ssn); 6377 ssn = ntohs(data_hdr->ssn);
6281 if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->ssnmap->in, sid))) { 6378 if (ordered && SSN_lt(ssn, sctp_ssn_peek(asoc->stream, in, sid)))
6282 return SCTP_IERROR_PROTO_VIOLATION; 6379 return SCTP_IERROR_PROTO_VIOLATION;
6283 }
6284 6380
6285 /* Send the data up to the user. Note: Schedule the 6381 /* Send the data up to the user. Note: Schedule the
6286 * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK 6382 * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index a987d54b379c..419b18ebb056 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -482,6 +482,32 @@ static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUN
482 TYPE_SCTP_FWD_TSN, 482 TYPE_SCTP_FWD_TSN,
483}; /*state_fn_t prsctp_chunk_event_table[][] */ 483}; /*state_fn_t prsctp_chunk_event_table[][] */
484 484
485#define TYPE_SCTP_RECONF { \
486 /* SCTP_STATE_CLOSED */ \
487 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
488 /* SCTP_STATE_COOKIE_WAIT */ \
489 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
490 /* SCTP_STATE_COOKIE_ECHOED */ \
491 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
492 /* SCTP_STATE_ESTABLISHED */ \
493 TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
494 /* SCTP_STATE_SHUTDOWN_PENDING */ \
495 TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
496 /* SCTP_STATE_SHUTDOWN_SENT */ \
497 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
498 /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
499 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
500 /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
501 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
502} /* TYPE_SCTP_RECONF */
503
504/* The primary index for this table is the chunk type.
505 * The secondary index for this table is the state.
506 */
507static const sctp_sm_table_entry_t reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
508 TYPE_SCTP_RECONF,
509}; /*state_fn_t reconf_chunk_event_table[][] */
510
485#define TYPE_SCTP_AUTH { \ 511#define TYPE_SCTP_AUTH { \
486 /* SCTP_STATE_CLOSED */ \ 512 /* SCTP_STATE_CLOSED */ \
487 TYPE_SCTP_FUNC(sctp_sf_ootb), \ 513 TYPE_SCTP_FUNC(sctp_sf_ootb), \
@@ -643,6 +669,25 @@ chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = {
643 TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ 669 TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
644} /* TYPE_SCTP_PRIMITIVE_ASCONF */ 670} /* TYPE_SCTP_PRIMITIVE_ASCONF */
645 671
672#define TYPE_SCTP_PRIMITIVE_RECONF { \
673 /* SCTP_STATE_CLOSED */ \
674 TYPE_SCTP_FUNC(sctp_sf_error_closed), \
675 /* SCTP_STATE_COOKIE_WAIT */ \
676 TYPE_SCTP_FUNC(sctp_sf_error_closed), \
677 /* SCTP_STATE_COOKIE_ECHOED */ \
678 TYPE_SCTP_FUNC(sctp_sf_error_closed), \
679 /* SCTP_STATE_ESTABLISHED */ \
680 TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
681 /* SCTP_STATE_SHUTDOWN_PENDING */ \
682 TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
683 /* SCTP_STATE_SHUTDOWN_SENT */ \
684 TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
685 /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
686 TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
687 /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
688 TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
689} /* TYPE_SCTP_PRIMITIVE_RECONF */
690
646/* The primary index for this table is the primitive type. 691/* The primary index for this table is the primitive type.
647 * The secondary index for this table is the state. 692 * The secondary index for this table is the state.
648 */ 693 */
@@ -653,6 +698,7 @@ static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPE
653 TYPE_SCTP_PRIMITIVE_SEND, 698 TYPE_SCTP_PRIMITIVE_SEND,
654 TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT, 699 TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT,
655 TYPE_SCTP_PRIMITIVE_ASCONF, 700 TYPE_SCTP_PRIMITIVE_ASCONF,
701 TYPE_SCTP_PRIMITIVE_RECONF,
656}; 702};
657 703
658#define TYPE_SCTP_OTHER_NO_PENDING_TSN { \ 704#define TYPE_SCTP_OTHER_NO_PENDING_TSN { \
@@ -888,6 +934,25 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
888 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ 934 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
889} 935}
890 936
937#define TYPE_SCTP_EVENT_TIMEOUT_RECONF { \
938 /* SCTP_STATE_CLOSED */ \
939 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
940 /* SCTP_STATE_COOKIE_WAIT */ \
941 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
942 /* SCTP_STATE_COOKIE_ECHOED */ \
943 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
944 /* SCTP_STATE_ESTABLISHED */ \
945 TYPE_SCTP_FUNC(sctp_sf_send_reconf), \
946 /* SCTP_STATE_SHUTDOWN_PENDING */ \
947 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
948 /* SCTP_STATE_SHUTDOWN_SENT */ \
949 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
950 /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
951 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
952 /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
953 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
954}
955
891static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { 956static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = {
892 TYPE_SCTP_EVENT_TIMEOUT_NONE, 957 TYPE_SCTP_EVENT_TIMEOUT_NONE,
893 TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE, 958 TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE,
@@ -897,6 +962,7 @@ static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][S
897 TYPE_SCTP_EVENT_TIMEOUT_T4_RTO, 962 TYPE_SCTP_EVENT_TIMEOUT_T4_RTO,
898 TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, 963 TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD,
899 TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT, 964 TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT,
965 TYPE_SCTP_EVENT_TIMEOUT_RECONF,
900 TYPE_SCTP_EVENT_TIMEOUT_SACK, 966 TYPE_SCTP_EVENT_TIMEOUT_SACK,
901 TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, 967 TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE,
902}; 968};
@@ -924,6 +990,10 @@ static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(struct net *net,
924 return &addip_chunk_event_table[1][state]; 990 return &addip_chunk_event_table[1][state];
925 } 991 }
926 992
993 if (net->sctp.reconf_enable)
994 if (cid == SCTP_CID_RECONF)
995 return &reconf_chunk_event_table[0][state];
996
927 if (net->sctp.auth_enable) { 997 if (net->sctp.auth_enable) {
928 if (cid == SCTP_CID_AUTH) 998 if (cid == SCTP_CID_AUTH)
929 return &auth_chunk_event_table[0][state]; 999 return &auth_chunk_event_table[0][state];
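
The TYPE_SCTP_EVENT_TIMEOUT_RECONF entries above only matter once something arms and fires the per-transport reconf timer; that callback is not part of the hunks shown on this page. A minimal sketch of what the generator presumably looks like, assuming it mirrors the existing sctp_generate_heartbeat_event pattern in sm_sideeffect.c (the body below is a reconstruction for illustration, not quoted from the patch):

	static void sctp_generate_reconf_event(unsigned long data)
	{
		struct sctp_transport *transport = (struct sctp_transport *)data;
		struct sctp_association *asoc = transport->asoc;
		struct sock *sk = asoc->base.sk;
		struct net *net = sock_net(sk);
		int error = 0;

		bh_lock_sock(sk);
		if (sock_owned_by_user(sk)) {
			/* Socket is busy: retry shortly and keep the timer reference. */
			if (!mod_timer(&transport->reconf_timer, jiffies + (HZ / 20)))
				sctp_transport_hold(transport);
			goto out_unlock;
		}

		/* Dispatch into the state machine; in ESTABLISHED this resolves
		 * to sctp_sf_send_reconf() through the timeout_event_table above.
		 */
		error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
				   SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF),
				   asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC);
		if (error)
			sk->sk_err = -error;

	out_unlock:
		bh_unlock_sock(sk);
		sctp_transport_put(transport);
	}
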
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 37eeab7899fc..d9d4c92e06b3 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -57,6 +57,7 @@
57#include <linux/kernel.h> 57#include <linux/kernel.h>
58#include <linux/wait.h> 58#include <linux/wait.h>
59#include <linux/time.h> 59#include <linux/time.h>
60#include <linux/sched/signal.h>
60#include <linux/ip.h> 61#include <linux/ip.h>
61#include <linux/capability.h> 62#include <linux/capability.h>
62#include <linux/fcntl.h> 63#include <linux/fcntl.h>
@@ -239,7 +240,7 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
239 union sctp_addr *laddr = (union sctp_addr *)addr; 240 union sctp_addr *laddr = (union sctp_addr *)addr;
240 struct sctp_transport *transport; 241 struct sctp_transport *transport;
241 242
242 if (sctp_verify_addr(sk, laddr, af->sockaddr_len)) 243 if (!af || sctp_verify_addr(sk, laddr, af->sockaddr_len))
243 return NULL; 244 return NULL;
244 245
245 addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep, 246 addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep,
@@ -364,7 +365,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
364 } 365 }
365 } 366 }
366 367
367 if (snum && snum < PROT_SOCK && 368 if (snum && snum < inet_prot_sock(net) &&
368 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) 369 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
369 return -EACCES; 370 return -EACCES;
370 371
@@ -592,7 +593,7 @@ static int sctp_send_asconf_add_ip(struct sock *sk,
592 list_for_each_entry(trans, 593 list_for_each_entry(trans,
593 &asoc->peer.transport_addr_list, transports) { 594 &asoc->peer.transport_addr_list, transports) {
594 /* Clear the source and route cache */ 595 /* Clear the source and route cache */
595 dst_release(trans->dst); 596 sctp_transport_dst_release(trans);
596 trans->cwnd = min(4*asoc->pathmtu, max_t(__u32, 597 trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
597 2*asoc->pathmtu, 4380)); 598 2*asoc->pathmtu, 4380));
598 trans->ssthresh = asoc->peer.i.a_rwnd; 599 trans->ssthresh = asoc->peer.i.a_rwnd;
@@ -843,7 +844,7 @@ skip_mkasconf:
843 */ 844 */
844 list_for_each_entry(transport, &asoc->peer.transport_addr_list, 845 list_for_each_entry(transport, &asoc->peer.transport_addr_list,
845 transports) { 846 transports) {
846 dst_release(transport->dst); 847 sctp_transport_dst_release(transport);
847 sctp_transport_route(transport, NULL, 848 sctp_transport_route(transport, NULL,
848 sctp_sk(asoc->base.sk)); 849 sctp_sk(asoc->base.sk));
849 } 850 }
@@ -1156,8 +1157,10 @@ static int __sctp_connect(struct sock *sk,
1156 * accept new associations, but it SHOULD NOT 1157 * accept new associations, but it SHOULD NOT
1157 * be permitted to open new associations. 1158 * be permitted to open new associations.
1158 */ 1159 */
1159 if (ep->base.bind_addr.port < PROT_SOCK && 1160 if (ep->base.bind_addr.port <
1160 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) { 1161 inet_prot_sock(net) &&
1162 !ns_capable(net->user_ns,
1163 CAP_NET_BIND_SERVICE)) {
1161 err = -EACCES; 1164 err = -EACCES;
1162 goto out_free; 1165 goto out_free;
1163 } 1166 }
@@ -1822,7 +1825,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
1822 * but it SHOULD NOT be permitted to open new 1825 * but it SHOULD NOT be permitted to open new
1823 * associations. 1826 * associations.
1824 */ 1827 */
1825 if (ep->base.bind_addr.port < PROT_SOCK && 1828 if (ep->base.bind_addr.port < inet_prot_sock(net) &&
1826 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) { 1829 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) {
1827 err = -EACCES; 1830 err = -EACCES;
1828 goto out_unlock; 1831 goto out_unlock;
@@ -1904,7 +1907,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
1904 } 1907 }
1905 1908
1906 if (asoc->pmtu_pending) 1909 if (asoc->pmtu_pending)
1907 sctp_assoc_pending_pmtu(sk, asoc); 1910 sctp_assoc_pending_pmtu(asoc);
1908 1911
1909 /* If fragmentation is disabled and the message length exceeds the 1912 /* If fragmentation is disabled and the message length exceeds the
1910 * association fragmentation point, return EMSGSIZE. The I-D 1913 * association fragmentation point, return EMSGSIZE. The I-D
@@ -1917,7 +1920,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
1917 } 1920 }
1918 1921
1919 /* Check for invalid stream. */ 1922 /* Check for invalid stream. */
1920 if (sinfo->sinfo_stream >= asoc->c.sinit_num_ostreams) { 1923 if (sinfo->sinfo_stream >= asoc->stream->outcnt) {
1921 err = -EINVAL; 1924 err = -EINVAL;
1922 goto out_free; 1925 goto out_free;
1923 } 1926 }
@@ -1962,6 +1965,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
1962 err = PTR_ERR(datamsg); 1965 err = PTR_ERR(datamsg);
1963 goto out_free; 1966 goto out_free;
1964 } 1967 }
1968 asoc->force_delay = !!(msg->msg_flags & MSG_MORE);
1965 1969
1966 /* Now send the (possibly) fragmented message. */ 1970 /* Now send the (possibly) fragmented message. */
1967 list_for_each_entry(chunk, &datamsg->chunks, frag_list) { 1971 list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
@@ -2431,10 +2435,9 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
2431 if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) { 2435 if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) {
2432 if (trans) { 2436 if (trans) {
2433 trans->pathmtu = params->spp_pathmtu; 2437 trans->pathmtu = params->spp_pathmtu;
2434 sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc); 2438 sctp_assoc_sync_pmtu(asoc);
2435 } else if (asoc) { 2439 } else if (asoc) {
2436 asoc->pathmtu = params->spp_pathmtu; 2440 asoc->pathmtu = params->spp_pathmtu;
2437 sctp_frag_point(asoc, params->spp_pathmtu);
2438 } else { 2441 } else {
2439 sp->pathmtu = params->spp_pathmtu; 2442 sp->pathmtu = params->spp_pathmtu;
2440 } 2443 }
@@ -2448,7 +2451,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
2448 (trans->param_flags & ~SPP_PMTUD) | pmtud_change; 2451 (trans->param_flags & ~SPP_PMTUD) | pmtud_change;
2449 if (update) { 2452 if (update) {
2450 sctp_transport_pmtu(trans, sctp_opt2sk(sp)); 2453 sctp_transport_pmtu(trans, sctp_opt2sk(sp));
2451 sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc); 2454 sctp_assoc_sync_pmtu(asoc);
2452 } 2455 }
2453 } else if (asoc) { 2456 } else if (asoc) {
2454 asoc->param_flags = 2457 asoc->param_flags =
@@ -3755,6 +3758,120 @@ out:
3755 return retval; 3758 return retval;
3756} 3759}
3757 3760
3761static int sctp_setsockopt_enable_strreset(struct sock *sk,
3762 char __user *optval,
3763 unsigned int optlen)
3764{
3765 struct sctp_assoc_value params;
3766 struct sctp_association *asoc;
3767 int retval = -EINVAL;
3768
3769 if (optlen != sizeof(params))
3770 goto out;
3771
3772 if (copy_from_user(&params, optval, optlen)) {
3773 retval = -EFAULT;
3774 goto out;
3775 }
3776
3777 if (params.assoc_value & (~SCTP_ENABLE_STRRESET_MASK))
3778 goto out;
3779
3780 asoc = sctp_id2assoc(sk, params.assoc_id);
3781 if (asoc) {
3782 asoc->strreset_enable = params.assoc_value;
3783 } else if (!params.assoc_id) {
3784 struct sctp_sock *sp = sctp_sk(sk);
3785
3786 sp->ep->strreset_enable = params.assoc_value;
3787 } else {
3788 goto out;
3789 }
3790
3791 retval = 0;
3792
3793out:
3794 return retval;
3795}
3796
3797static int sctp_setsockopt_reset_streams(struct sock *sk,
3798 char __user *optval,
3799 unsigned int optlen)
3800{
3801 struct sctp_reset_streams *params;
3802 struct sctp_association *asoc;
3803 int retval = -EINVAL;
3804
3805 if (optlen < sizeof(struct sctp_reset_streams))
3806 return -EINVAL;
3807
3808 params = memdup_user(optval, optlen);
3809 if (IS_ERR(params))
3810 return PTR_ERR(params);
3811
3812 asoc = sctp_id2assoc(sk, params->srs_assoc_id);
3813 if (!asoc)
3814 goto out;
3815
3816 retval = sctp_send_reset_streams(asoc, params);
3817
3818out:
3819 kfree(params);
3820 return retval;
3821}
3822
3823static int sctp_setsockopt_reset_assoc(struct sock *sk,
3824 char __user *optval,
3825 unsigned int optlen)
3826{
3827 struct sctp_association *asoc;
3828 sctp_assoc_t associd;
3829 int retval = -EINVAL;
3830
3831 if (optlen != sizeof(associd))
3832 goto out;
3833
3834 if (copy_from_user(&associd, optval, optlen)) {
3835 retval = -EFAULT;
3836 goto out;
3837 }
3838
3839 asoc = sctp_id2assoc(sk, associd);
3840 if (!asoc)
3841 goto out;
3842
3843 retval = sctp_send_reset_assoc(asoc);
3844
3845out:
3846 return retval;
3847}
3848
3849static int sctp_setsockopt_add_streams(struct sock *sk,
3850 char __user *optval,
3851 unsigned int optlen)
3852{
3853 struct sctp_association *asoc;
3854 struct sctp_add_streams params;
3855 int retval = -EINVAL;
3856
3857 if (optlen != sizeof(params))
3858 goto out;
3859
3860 if (copy_from_user(&params, optval, optlen)) {
3861 retval = -EFAULT;
3862 goto out;
3863 }
3864
3865 asoc = sctp_id2assoc(sk, params.sas_assoc_id);
3866 if (!asoc)
3867 goto out;
3868
3869 retval = sctp_send_add_streams(asoc, &params);
3870
3871out:
3872 return retval;
3873}
3874
3758/* API 6.2 setsockopt(), getsockopt() 3875/* API 6.2 setsockopt(), getsockopt()
3759 * 3876 *
3760 * Applications use setsockopt() and getsockopt() to set or retrieve 3877 * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3921,6 +4038,18 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
3921 case SCTP_DEFAULT_PRINFO: 4038 case SCTP_DEFAULT_PRINFO:
3922 retval = sctp_setsockopt_default_prinfo(sk, optval, optlen); 4039 retval = sctp_setsockopt_default_prinfo(sk, optval, optlen);
3923 break; 4040 break;
4041 case SCTP_ENABLE_STREAM_RESET:
4042 retval = sctp_setsockopt_enable_strreset(sk, optval, optlen);
4043 break;
4044 case SCTP_RESET_STREAMS:
4045 retval = sctp_setsockopt_reset_streams(sk, optval, optlen);
4046 break;
4047 case SCTP_RESET_ASSOC:
4048 retval = sctp_setsockopt_reset_assoc(sk, optval, optlen);
4049 break;
4050 case SCTP_ADD_STREAMS:
4051 retval = sctp_setsockopt_add_streams(sk, optval, optlen);
4052 break;
3924 default: 4053 default:
3925 retval = -ENOPROTOOPT; 4054 retval = -ENOPROTOOPT;
3926 break; 4055 break;
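
To make the intent of these new cases concrete, here is a small userspace sketch of driving them. It is hedged: it assumes the constants and structures used in this patch (SCTP_ENABLE_STREAM_RESET, SCTP_RESET_STREAMS, SCTP_ENABLE_RESET_STREAM_REQ, SCTP_STREAM_RESET_OUTGOING, struct sctp_assoc_value, struct sctp_reset_streams) are exported through the uapi sctp header, and error handling is kept minimal:

	#include <errno.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/sctp.h>

	/* Enable stream-reset requests, then ask the peer to reset two of our
	 * outgoing streams.  sd is a connected one-to-one SCTP socket.
	 */
	static int reset_two_streams(int sd)
	{
		struct sctp_assoc_value enable = {
			.assoc_id    = 0,	/* endpoint default / current assoc */
			.assoc_value = SCTP_ENABLE_RESET_STREAM_REQ,
		};
		struct sctp_reset_streams *srs;
		size_t len;
		int ret;

		if (setsockopt(sd, IPPROTO_SCTP, SCTP_ENABLE_STREAM_RESET,
			       &enable, sizeof(enable)) < 0)
			return -errno;

		/* Variable-length option: header plus the list of stream ids. */
		len = sizeof(*srs) + 2 * sizeof(uint16_t);
		srs = calloc(1, len);
		if (!srs)
			return -ENOMEM;

		srs->srs_assoc_id = 0;
		srs->srs_flags = SCTP_STREAM_RESET_OUTGOING;
		srs->srs_number_streams = 2;
		srs->srs_stream_list[0] = 1;
		srs->srs_stream_list[1] = 3;

		ret = setsockopt(sd, IPPROTO_SCTP, SCTP_RESET_STREAMS, srs, len);
		free(srs);
		return ret < 0 ? -errno : 0;
	}

On a one-to-one (TCP-style) socket the assoc_id is effectively ignored by sctp_id2assoc(), so an id of 0 resolves to the current association; one-to-many sockets would pass a real association id instead.
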
@@ -3987,7 +4116,7 @@ static int sctp_disconnect(struct sock *sk, int flags)
3987 * descriptor will be returned from accept() to represent the newly 4116 * descriptor will be returned from accept() to represent the newly
3988 * formed association. 4117 * formed association.
3989 */ 4118 */
3990static struct sock *sctp_accept(struct sock *sk, int flags, int *err) 4119static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern)
3991{ 4120{
3992 struct sctp_sock *sp; 4121 struct sctp_sock *sp;
3993 struct sctp_endpoint *ep; 4122 struct sctp_endpoint *ep;
@@ -4022,7 +4151,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err)
4022 */ 4151 */
4023 asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); 4152 asoc = list_entry(ep->asocs.next, struct sctp_association, asocs);
4024 4153
4025 newsk = sp->pf->create_accept_sk(sk, asoc); 4154 newsk = sp->pf->create_accept_sk(sk, asoc, kern);
4026 if (!newsk) { 4155 if (!newsk) {
4027 error = -ENOMEM; 4156 error = -ENOMEM;
4028 goto out; 4157 goto out;
@@ -4332,8 +4461,8 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
4332 info->sctpi_rwnd = asoc->a_rwnd; 4461 info->sctpi_rwnd = asoc->a_rwnd;
4333 info->sctpi_unackdata = asoc->unack_data; 4462 info->sctpi_unackdata = asoc->unack_data;
4334 info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); 4463 info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
4335 info->sctpi_instrms = asoc->c.sinit_max_instreams; 4464 info->sctpi_instrms = asoc->stream->incnt;
4336 info->sctpi_outstrms = asoc->c.sinit_num_ostreams; 4465 info->sctpi_outstrms = asoc->stream->outcnt;
4337 list_for_each(pos, &asoc->base.inqueue.in_chunk_list) 4466 list_for_each(pos, &asoc->base.inqueue.in_chunk_list)
4338 info->sctpi_inqueue++; 4467 info->sctpi_inqueue++;
4339 list_for_each(pos, &asoc->outqueue.out_chunk_list) 4468 list_for_each(pos, &asoc->outqueue.out_chunk_list)
@@ -4562,8 +4691,8 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
4562 status.sstat_unackdata = asoc->unack_data; 4691 status.sstat_unackdata = asoc->unack_data;
4563 4692
4564 status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); 4693 status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
4565 status.sstat_instrms = asoc->c.sinit_max_instreams; 4694 status.sstat_instrms = asoc->stream->incnt;
4566 status.sstat_outstrms = asoc->c.sinit_num_ostreams; 4695 status.sstat_outstrms = asoc->stream->outcnt;
4567 status.sstat_fragmentation_point = asoc->frag_point; 4696 status.sstat_fragmentation_point = asoc->frag_point;
4568 status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc); 4697 status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
4569 memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr, 4698 memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr,
@@ -4734,6 +4863,12 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
4734 if (!asoc) 4863 if (!asoc)
4735 return -EINVAL; 4864 return -EINVAL;
4736 4865
4866 /* If there is a thread waiting on more sndbuf space for
4867 * sending on this asoc, it cannot be peeled.
4868 */
4869 if (waitqueue_active(&asoc->wait))
4870 return -EBUSY;
4871
4737 /* An association cannot be branched off from an already peeled-off 4872 /* An association cannot be branched off from an already peeled-off
4738 * socket, nor is this supported for tcp style sockets. 4873 * socket, nor is this supported for tcp style sockets.
4739 */ 4874 */
@@ -6405,6 +6540,47 @@ out:
6405 return retval; 6540 return retval;
6406} 6541}
6407 6542
6543static int sctp_getsockopt_enable_strreset(struct sock *sk, int len,
6544 char __user *optval,
6545 int __user *optlen)
6546{
6547 struct sctp_assoc_value params;
6548 struct sctp_association *asoc;
6549 int retval = -EFAULT;
6550
6551 if (len < sizeof(params)) {
6552 retval = -EINVAL;
6553 goto out;
6554 }
6555
6556 len = sizeof(params);
6557 if (copy_from_user(&params, optval, len))
6558 goto out;
6559
6560 asoc = sctp_id2assoc(sk, params.assoc_id);
6561 if (asoc) {
6562 params.assoc_value = asoc->strreset_enable;
6563 } else if (!params.assoc_id) {
6564 struct sctp_sock *sp = sctp_sk(sk);
6565
6566 params.assoc_value = sp->ep->strreset_enable;
6567 } else {
6568 retval = -EINVAL;
6569 goto out;
6570 }
6571
6572 if (put_user(len, optlen))
6573 goto out;
6574
6575 if (copy_to_user(optval, &params, len))
6576 goto out;
6577
6578 retval = 0;
6579
6580out:
6581 return retval;
6582}
6583
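
A matching read-back sketch for the new getsockopt() path (same assumptions about the uapi names as the setsockopt example above):

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/sctp.h>

	/* Query which stream-reconfiguration request types are currently enabled. */
	static int get_strreset_mask(int sd, unsigned int *mask)
	{
		struct sctp_assoc_value val = { .assoc_id = 0 };
		socklen_t len = sizeof(val);

		if (getsockopt(sd, IPPROTO_SCTP, SCTP_ENABLE_STREAM_RESET,
			       &val, &len) < 0)
			return -1;

		*mask = val.assoc_value;
		return 0;
	}
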
6408static int sctp_getsockopt(struct sock *sk, int level, int optname, 6584static int sctp_getsockopt(struct sock *sk, int level, int optname,
6409 char __user *optval, int __user *optlen) 6585 char __user *optval, int __user *optlen)
6410{ 6586{
@@ -6572,6 +6748,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
6572 retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, 6748 retval = sctp_getsockopt_pr_assocstatus(sk, len, optval,
6573 optlen); 6749 optlen);
6574 break; 6750 break;
6751 case SCTP_ENABLE_STREAM_RESET:
6752 retval = sctp_getsockopt_enable_strreset(sk, len, optval,
6753 optlen);
6754 break;
6575 default: 6755 default:
6576 retval = -ENOPROTOOPT; 6756 retval = -ENOPROTOOPT;
6577 break; 6757 break;
@@ -6854,6 +7034,9 @@ int sctp_inet_listen(struct socket *sock, int backlog)
6854 if (sock->state != SS_UNCONNECTED) 7034 if (sock->state != SS_UNCONNECTED)
6855 goto out; 7035 goto out;
6856 7036
7037 if (!sctp_sstate(sk, LISTENING) && !sctp_sstate(sk, CLOSED))
7038 goto out;
7039
6857 /* If backlog is zero, disable listening. */ 7040 /* If backlog is zero, disable listening. */
6858 if (!backlog) { 7041 if (!backlog) {
6859 if (sctp_sstate(sk, CLOSED)) 7042 if (sctp_sstate(sk, CLOSED))
@@ -7426,7 +7609,6 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
7426 */ 7609 */
7427 release_sock(sk); 7610 release_sock(sk);
7428 current_timeo = schedule_timeout(current_timeo); 7611 current_timeo = schedule_timeout(current_timeo);
7429 BUG_ON(sk != asoc->base.sk);
7430 lock_sock(sk); 7612 lock_sock(sk);
7431 7613
7432 *timeo_p = current_timeo; 7614 *timeo_p = current_timeo;
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
deleted file mode 100644
index b9c8521c1a98..000000000000
--- a/net/sctp/ssnmap.c
+++ /dev/null
@@ -1,125 +0,0 @@
1/* SCTP kernel implementation
2 * Copyright (c) 2003 International Business Machines, Corp.
3 *
4 * This file is part of the SCTP kernel implementation
5 *
6 * These functions manipulate sctp SSN tracker.
7 *
8 * This SCTP implementation is free software;
9 * you can redistribute it and/or modify it under the terms of
10 * the GNU General Public License as published by
11 * the Free Software Foundation; either version 2, or (at your option)
12 * any later version.
13 *
14 * This SCTP implementation is distributed in the hope that it
15 * will be useful, but WITHOUT ANY WARRANTY; without even the implied
16 * ************************
17 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18 * See the GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with GNU CC; see the file COPYING. If not, see
22 * <http://www.gnu.org/licenses/>.
23 *
24 * Please send any bug reports or fixes you make to the
25 * email address(es):
26 * lksctp developers <linux-sctp@vger.kernel.org>
27 *
28 * Written or modified by:
29 * Jon Grimm <jgrimm@us.ibm.com>
30 */
31
32#include <linux/types.h>
33#include <linux/slab.h>
34#include <net/sctp/sctp.h>
35#include <net/sctp/sm.h>
36
37static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
38 __u16 out);
39
40/* Storage size needed for map includes 2 headers and then the
41 * specific needs of in or out streams.
42 */
43static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
44{
45 return sizeof(struct sctp_ssnmap) + (in + out) * sizeof(__u16);
46}
47
48
49/* Create a new sctp_ssnmap.
50 * Allocate room to store at least 'len' contiguous TSNs.
51 */
52struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
53 gfp_t gfp)
54{
55 struct sctp_ssnmap *retval;
56 int size;
57
58 size = sctp_ssnmap_size(in, out);
59 if (size <= KMALLOC_MAX_SIZE)
60 retval = kmalloc(size, gfp);
61 else
62 retval = (struct sctp_ssnmap *)
63 __get_free_pages(gfp, get_order(size));
64 if (!retval)
65 goto fail;
66
67 if (!sctp_ssnmap_init(retval, in, out))
68 goto fail_map;
69
70 SCTP_DBG_OBJCNT_INC(ssnmap);
71
72 return retval;
73
74fail_map:
75 if (size <= KMALLOC_MAX_SIZE)
76 kfree(retval);
77 else
78 free_pages((unsigned long)retval, get_order(size));
79fail:
80 return NULL;
81}
82
83
84/* Initialize a block of memory as a ssnmap. */
85static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
86 __u16 out)
87{
88 memset(map, 0x00, sctp_ssnmap_size(in, out));
89
90 /* Start 'in' stream just after the map header. */
91 map->in.ssn = (__u16 *)&map[1];
92 map->in.len = in;
93
94 /* Start 'out' stream just after 'in'. */
95 map->out.ssn = &map->in.ssn[in];
96 map->out.len = out;
97
98 return map;
99}
100
101/* Clear out the ssnmap streams. */
102void sctp_ssnmap_clear(struct sctp_ssnmap *map)
103{
104 size_t size;
105
106 size = (map->in.len + map->out.len) * sizeof(__u16);
107 memset(map->in.ssn, 0x00, size);
108}
109
110/* Dispose of a ssnmap. */
111void sctp_ssnmap_free(struct sctp_ssnmap *map)
112{
113 int size;
114
115 if (unlikely(!map))
116 return;
117
118 size = sctp_ssnmap_size(map->in.len, map->out.len);
119 if (size <= KMALLOC_MAX_SIZE)
120 kfree(map);
121 else
122 free_pages((unsigned long)map, get_order(size));
123
124 SCTP_DBG_OBJCNT_DEC(ssnmap);
125}
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
new file mode 100644
index 000000000000..bbed997e1c5f
--- /dev/null
+++ b/net/sctp/stream.c
@@ -0,0 +1,506 @@
1/* SCTP kernel implementation
2 * (C) Copyright IBM Corp. 2001, 2004
3 * Copyright (c) 1999-2000 Cisco, Inc.
4 * Copyright (c) 1999-2001 Motorola, Inc.
5 * Copyright (c) 2001 Intel Corp.
6 *
7 * This file is part of the SCTP kernel implementation
8 *
 9 * These functions manipulate sctp stream structures.
10 *
11 * This SCTP implementation is free software;
12 * you can redistribute it and/or modify it under the terms of
13 * the GNU General Public License as published by
14 * the Free Software Foundation; either version 2, or (at your option)
15 * any later version.
16 *
17 * This SCTP implementation is distributed in the hope that it
18 * will be useful, but WITHOUT ANY WARRANTY; without even the implied
19 * ************************
20 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
21 * See the GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with GNU CC; see the file COPYING. If not, see
25 * <http://www.gnu.org/licenses/>.
26 *
27 * Please send any bug reports or fixes you make to the
28 * email address(es):
29 * lksctp developers <linux-sctp@vger.kernel.org>
30 *
31 * Written or modified by:
32 * Xin Long <lucien.xin@gmail.com>
33 */
34
35#include <net/sctp/sctp.h>
36#include <net/sctp/sm.h>
37
38int sctp_stream_new(struct sctp_association *asoc, gfp_t gfp)
39{
40 struct sctp_stream *stream;
41 int i;
42
43 stream = kzalloc(sizeof(*stream), gfp);
44 if (!stream)
45 return -ENOMEM;
46
47 stream->outcnt = asoc->c.sinit_num_ostreams;
48 stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp);
49 if (!stream->out) {
50 kfree(stream);
51 return -ENOMEM;
52 }
53 for (i = 0; i < stream->outcnt; i++)
54 stream->out[i].state = SCTP_STREAM_OPEN;
55
56 asoc->stream = stream;
57
58 return 0;
59}
60
61int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp)
62{
63 struct sctp_stream *stream = asoc->stream;
64 int i;
65
66 /* Initial stream->out size may be very big, so free it and alloc
67 * a new one with new outcnt to save memory.
68 */
69 kfree(stream->out);
70 stream->outcnt = asoc->c.sinit_num_ostreams;
71 stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp);
72 if (!stream->out)
73 goto nomem;
74
75 for (i = 0; i < stream->outcnt; i++)
76 stream->out[i].state = SCTP_STREAM_OPEN;
77
78 stream->incnt = asoc->c.sinit_max_instreams;
79 stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp);
80 if (!stream->in) {
81 kfree(stream->out);
82 goto nomem;
83 }
84
85 return 0;
86
87nomem:
88 asoc->stream = NULL;
89 kfree(stream);
90
91 return -ENOMEM;
92}
93
94void sctp_stream_free(struct sctp_stream *stream)
95{
96 if (unlikely(!stream))
97 return;
98
99 kfree(stream->out);
100 kfree(stream->in);
101 kfree(stream);
102}
103
104void sctp_stream_clear(struct sctp_stream *stream)
105{
106 int i;
107
108 for (i = 0; i < stream->outcnt; i++)
109 stream->out[i].ssn = 0;
110
111 for (i = 0; i < stream->incnt; i++)
112 stream->in[i].ssn = 0;
113}
114
115static int sctp_send_reconf(struct sctp_association *asoc,
116 struct sctp_chunk *chunk)
117{
118 struct net *net = sock_net(asoc->base.sk);
119 int retval = 0;
120
121 retval = sctp_primitive_RECONF(net, asoc, chunk);
122 if (retval)
123 sctp_chunk_free(chunk);
124
125 return retval;
126}
127
128int sctp_send_reset_streams(struct sctp_association *asoc,
129 struct sctp_reset_streams *params)
130{
131 struct sctp_stream *stream = asoc->stream;
132 __u16 i, str_nums, *str_list;
133 struct sctp_chunk *chunk;
134 int retval = -EINVAL;
135 bool out, in;
136
137 if (!asoc->peer.reconf_capable ||
138 !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) {
139 retval = -ENOPROTOOPT;
140 goto out;
141 }
142
143 if (asoc->strreset_outstanding) {
144 retval = -EINPROGRESS;
145 goto out;
146 }
147
148 out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING;
149 in = params->srs_flags & SCTP_STREAM_RESET_INCOMING;
150 if (!out && !in)
151 goto out;
152
153 str_nums = params->srs_number_streams;
154 str_list = params->srs_stream_list;
155 if (out && str_nums)
156 for (i = 0; i < str_nums; i++)
157 if (str_list[i] >= stream->outcnt)
158 goto out;
159
160 if (in && str_nums)
161 for (i = 0; i < str_nums; i++)
162 if (str_list[i] >= stream->incnt)
163 goto out;
164
165 for (i = 0; i < str_nums; i++)
166 str_list[i] = htons(str_list[i]);
167
168 chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in);
169
170 for (i = 0; i < str_nums; i++)
171 str_list[i] = ntohs(str_list[i]);
172
173 if (!chunk) {
174 retval = -ENOMEM;
175 goto out;
176 }
177
178 if (out) {
179 if (str_nums)
180 for (i = 0; i < str_nums; i++)
181 stream->out[str_list[i]].state =
182 SCTP_STREAM_CLOSED;
183 else
184 for (i = 0; i < stream->outcnt; i++)
185 stream->out[i].state = SCTP_STREAM_CLOSED;
186 }
187
188 asoc->strreset_chunk = chunk;
189 sctp_chunk_hold(asoc->strreset_chunk);
190
191 retval = sctp_send_reconf(asoc, chunk);
192 if (retval) {
193 sctp_chunk_put(asoc->strreset_chunk);
194 asoc->strreset_chunk = NULL;
195 if (!out)
196 goto out;
197
198 if (str_nums)
199 for (i = 0; i < str_nums; i++)
200 stream->out[str_list[i]].state =
201 SCTP_STREAM_OPEN;
202 else
203 for (i = 0; i < stream->outcnt; i++)
204 stream->out[i].state = SCTP_STREAM_OPEN;
205
206 goto out;
207 }
208
209 asoc->strreset_outstanding = out + in;
210
211out:
212 return retval;
213}
214
215int sctp_send_reset_assoc(struct sctp_association *asoc)
216{
217 struct sctp_chunk *chunk = NULL;
218 int retval;
219 __u16 i;
220
221 if (!asoc->peer.reconf_capable ||
222 !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ))
223 return -ENOPROTOOPT;
224
225 if (asoc->strreset_outstanding)
226 return -EINPROGRESS;
227
228 chunk = sctp_make_strreset_tsnreq(asoc);
229 if (!chunk)
230 return -ENOMEM;
231
232 /* Block further xmit of data until this request is completed */
233 for (i = 0; i < asoc->stream->outcnt; i++)
234 asoc->stream->out[i].state = SCTP_STREAM_CLOSED;
235
236 asoc->strreset_chunk = chunk;
237 sctp_chunk_hold(asoc->strreset_chunk);
238
239 retval = sctp_send_reconf(asoc, chunk);
240 if (retval) {
241 sctp_chunk_put(asoc->strreset_chunk);
242 asoc->strreset_chunk = NULL;
243
244 for (i = 0; i < asoc->stream->outcnt; i++)
245 asoc->stream->out[i].state = SCTP_STREAM_OPEN;
246
247 return retval;
248 }
249
250 asoc->strreset_outstanding = 1;
251
252 return 0;
253}
254
255int sctp_send_add_streams(struct sctp_association *asoc,
256 struct sctp_add_streams *params)
257{
258 struct sctp_stream *stream = asoc->stream;
259 struct sctp_chunk *chunk = NULL;
260 int retval = -ENOMEM;
261 __u32 outcnt, incnt;
262 __u16 out, in;
263
264 if (!asoc->peer.reconf_capable ||
265 !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) {
266 retval = -ENOPROTOOPT;
267 goto out;
268 }
269
270 if (asoc->strreset_outstanding) {
271 retval = -EINPROGRESS;
272 goto out;
273 }
274
275 out = params->sas_outstrms;
276 in = params->sas_instrms;
277 outcnt = stream->outcnt + out;
278 incnt = stream->incnt + in;
279 if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM ||
280 (!out && !in)) {
281 retval = -EINVAL;
282 goto out;
283 }
284
285 if (out) {
286 struct sctp_stream_out *streamout;
287
288 streamout = krealloc(stream->out, outcnt * sizeof(*streamout),
289 GFP_KERNEL);
290 if (!streamout)
291 goto out;
292
293 memset(streamout + stream->outcnt, 0, out * sizeof(*streamout));
294 stream->out = streamout;
295 }
296
297 if (in) {
298 struct sctp_stream_in *streamin;
299
300 streamin = krealloc(stream->in, incnt * sizeof(*streamin),
301 GFP_KERNEL);
302 if (!streamin)
303 goto out;
304
305 memset(streamin + stream->incnt, 0, in * sizeof(*streamin));
306 stream->in = streamin;
307 }
308
309 chunk = sctp_make_strreset_addstrm(asoc, out, in);
310 if (!chunk)
311 goto out;
312
313 asoc->strreset_chunk = chunk;
314 sctp_chunk_hold(asoc->strreset_chunk);
315
316 retval = sctp_send_reconf(asoc, chunk);
317 if (retval) {
318 sctp_chunk_put(asoc->strreset_chunk);
319 asoc->strreset_chunk = NULL;
320 goto out;
321 }
322
323 stream->incnt = incnt;
324 stream->outcnt = outcnt;
325
326 asoc->strreset_outstanding = !!out + !!in;
327
328out:
329 return retval;
330}
331
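
sctp_send_add_streams() is reached from the SCTP_ADD_STREAMS setsockopt added in socket.c above; a hedged userspace sketch follows (the field names are the ones used in this function, while the struct layout and the need to first enable SCTP_ENABLE_CHANGE_ASSOC_REQ are assumptions about the uapi side):

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/sctp.h>

	/* Ask the peer to grow the association by two extra outbound streams.
	 * Assumes SCTP_ENABLE_CHANGE_ASSOC_REQ has already been set via
	 * SCTP_ENABLE_STREAM_RESET, otherwise the kernel returns ENOPROTOOPT.
	 */
	static int grow_outbound_streams(int sd)
	{
		struct sctp_add_streams sas = {
			.sas_assoc_id = 0,	/* one-to-one socket: current assoc */
			.sas_outstrms = 2,
			.sas_instrms  = 0,
		};

		return setsockopt(sd, IPPROTO_SCTP, SCTP_ADD_STREAMS,
				  &sas, sizeof(sas));
	}
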
332static sctp_paramhdr_t *sctp_chunk_lookup_strreset_param(
333 struct sctp_association *asoc, __u32 resp_seq)
334{
335 struct sctp_chunk *chunk = asoc->strreset_chunk;
336 struct sctp_reconf_chunk *hdr;
337 union sctp_params param;
338
339 if (ntohl(resp_seq) != asoc->strreset_outseq || !chunk)
340 return NULL;
341
342 hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
343 sctp_walk_params(param, hdr, params) {
344 /* sctp_strreset_tsnreq is actually the basic structure
345 * of all stream reconf params, so it's safe to use it
346 * to access request_seq.
347 */
348 struct sctp_strreset_tsnreq *req = param.v;
349
350 if (req->request_seq == resp_seq)
351 return param.v;
352 }
353
354 return NULL;
355}
356
357struct sctp_chunk *sctp_process_strreset_outreq(
358 struct sctp_association *asoc,
359 union sctp_params param,
360 struct sctp_ulpevent **evp)
361{
362 struct sctp_strreset_outreq *outreq = param.v;
363 struct sctp_stream *stream = asoc->stream;
364 __u16 i, nums, flags = 0, *str_p = NULL;
365 __u32 result = SCTP_STRRESET_DENIED;
366 __u32 request_seq;
367
368 request_seq = ntohl(outreq->request_seq);
369
370 if (ntohl(outreq->send_reset_at_tsn) >
371 sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) {
372 result = SCTP_STRRESET_IN_PROGRESS;
373 goto out;
374 }
375
376 if (request_seq > asoc->strreset_inseq) {
377 result = SCTP_STRRESET_ERR_BAD_SEQNO;
378 goto out;
379 } else if (request_seq == asoc->strreset_inseq) {
380 asoc->strreset_inseq++;
381 }
382
 383 /* Check strreset_enable after the inseq increment: from a response
 384 * with result "denied" the sender cannot tell that the peer has
 385 * strreset disabled; this also keeps the behaviour consistent with BSD.
386 */
387 if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
388 goto out;
389
390 if (asoc->strreset_chunk) {
391 sctp_paramhdr_t *param_hdr;
392 struct sctp_transport *t;
393
394 param_hdr = sctp_chunk_lookup_strreset_param(
395 asoc, outreq->response_seq);
396 if (!param_hdr || param_hdr->type !=
397 SCTP_PARAM_RESET_IN_REQUEST) {
 398 /* same handling as when strreset_outstanding isn't 0 */
399 result = SCTP_STRRESET_ERR_IN_PROGRESS;
400 goto out;
401 }
402
403 asoc->strreset_outstanding--;
404 asoc->strreset_outseq++;
405
406 if (!asoc->strreset_outstanding) {
407 t = asoc->strreset_chunk->transport;
408 if (del_timer(&t->reconf_timer))
409 sctp_transport_put(t);
410
411 sctp_chunk_put(asoc->strreset_chunk);
412 asoc->strreset_chunk = NULL;
413 }
414
415 flags = SCTP_STREAM_RESET_INCOMING_SSN;
416 }
417
418 nums = (ntohs(param.p->length) - sizeof(*outreq)) / 2;
419 if (nums) {
420 str_p = outreq->list_of_streams;
421 for (i = 0; i < nums; i++) {
422 if (ntohs(str_p[i]) >= stream->incnt) {
423 result = SCTP_STRRESET_ERR_WRONG_SSN;
424 goto out;
425 }
426 }
427
428 for (i = 0; i < nums; i++)
429 stream->in[ntohs(str_p[i])].ssn = 0;
430 } else {
431 for (i = 0; i < stream->incnt; i++)
432 stream->in[i].ssn = 0;
433 }
434
435 result = SCTP_STRRESET_PERFORMED;
436
437 *evp = sctp_ulpevent_make_stream_reset_event(asoc,
438 flags | SCTP_STREAM_RESET_OUTGOING_SSN, nums, str_p,
439 GFP_ATOMIC);
440
441out:
442 return sctp_make_strreset_resp(asoc, result, request_seq);
443}
444
445struct sctp_chunk *sctp_process_strreset_inreq(
446 struct sctp_association *asoc,
447 union sctp_params param,
448 struct sctp_ulpevent **evp)
449{
450 struct sctp_strreset_inreq *inreq = param.v;
451 struct sctp_stream *stream = asoc->stream;
452 __u32 result = SCTP_STRRESET_DENIED;
453 struct sctp_chunk *chunk = NULL;
454 __u16 i, nums, *str_p;
455 __u32 request_seq;
456
457 request_seq = ntohl(inreq->request_seq);
458 if (request_seq > asoc->strreset_inseq) {
459 result = SCTP_STRRESET_ERR_BAD_SEQNO;
460 goto out;
461 } else if (request_seq == asoc->strreset_inseq) {
462 asoc->strreset_inseq++;
463 }
464
465 if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
466 goto out;
467
468 if (asoc->strreset_outstanding) {
469 result = SCTP_STRRESET_ERR_IN_PROGRESS;
470 goto out;
471 }
472
473 nums = (ntohs(param.p->length) - sizeof(*inreq)) / 2;
474 str_p = inreq->list_of_streams;
475 for (i = 0; i < nums; i++) {
476 if (ntohs(str_p[i]) >= stream->outcnt) {
477 result = SCTP_STRRESET_ERR_WRONG_SSN;
478 goto out;
479 }
480 }
481
482 chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0);
483 if (!chunk)
484 goto out;
485
486 if (nums)
487 for (i = 0; i < nums; i++)
488 stream->out[ntohs(str_p[i])].state =
489 SCTP_STREAM_CLOSED;
490 else
491 for (i = 0; i < stream->outcnt; i++)
492 stream->out[i].state = SCTP_STREAM_CLOSED;
493
494 asoc->strreset_chunk = chunk;
495 asoc->strreset_outstanding = 1;
496 sctp_chunk_hold(asoc->strreset_chunk);
497
498 *evp = sctp_ulpevent_make_stream_reset_event(asoc,
499 SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC);
500
501out:
502 if (!chunk)
503 chunk = sctp_make_strreset_resp(asoc, result, request_seq);
504
505 return chunk;
506}
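
The sm_statefuns.c hunks above and the ulpqueue.c hunks below replace ssnmap accesses with sctp_ssn_peek()/sctp_ssn_next() operating on these per-stream arrays. The new accessor definitions are not part of the hunks shown; presumably they reduce to simple token-pasting macros along these lines (an assumed sketch of the structs.h side, not quoted from the patch):

	/* 'type' is the literal token 'in' or 'out', selecting stream->in[]
	 * or stream->out[]; one macro therefore serves both directions.
	 */
	#define sctp_ssn_peek(stream, type, sid)	((stream)->type[sid].ssn)
	#define sctp_ssn_next(stream, type, sid)	((stream)->type[sid].ssn++)
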
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index a1652ab63918..721eeebfcd8a 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -88,9 +88,11 @@ static struct sctp_transport *sctp_transport_init(struct net *net,
88 INIT_LIST_HEAD(&peer->transports); 88 INIT_LIST_HEAD(&peer->transports);
89 89
90 setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, 90 setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event,
91 (unsigned long)peer); 91 (unsigned long)peer);
92 setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event, 92 setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event,
93 (unsigned long)peer); 93 (unsigned long)peer);
94 setup_timer(&peer->reconf_timer, sctp_generate_reconf_event,
95 (unsigned long)peer);
94 setup_timer(&peer->proto_unreach_timer, 96 setup_timer(&peer->proto_unreach_timer,
95 sctp_generate_proto_unreach_event, (unsigned long)peer); 97 sctp_generate_proto_unreach_event, (unsigned long)peer);
96 98
@@ -144,6 +146,9 @@ void sctp_transport_free(struct sctp_transport *transport)
144 if (del_timer(&transport->T3_rtx_timer)) 146 if (del_timer(&transport->T3_rtx_timer))
145 sctp_transport_put(transport); 147 sctp_transport_put(transport);
146 148
149 if (del_timer(&transport->reconf_timer))
150 sctp_transport_put(transport);
151
147 /* Delete the ICMP proto unreachable timer if it's active. */ 152 /* Delete the ICMP proto unreachable timer if it's active. */
148 if (del_timer(&transport->proto_unreach_timer)) 153 if (del_timer(&transport->proto_unreach_timer))
149 sctp_association_put(transport->asoc); 154 sctp_association_put(transport->asoc);
@@ -211,6 +216,14 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport)
211 sctp_transport_hold(transport); 216 sctp_transport_hold(transport);
212} 217}
213 218
219void sctp_transport_reset_reconf_timer(struct sctp_transport *transport)
220{
221 if (!timer_pending(&transport->reconf_timer))
222 if (!mod_timer(&transport->reconf_timer,
223 jiffies + transport->rto))
224 sctp_transport_hold(transport);
225}
226
214/* This transport has been assigned to an association. 227/* This transport has been assigned to an association.
215 * Initialize fields from the association or from the sock itself. 228 * Initialize fields from the association or from the sock itself.
216 * Register the reference count in the association. 229 * Register the reference count in the association.
@@ -227,7 +240,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
227{ 240{
228 /* If we don't have a fresh route, look one up */ 241 /* If we don't have a fresh route, look one up */
229 if (!transport->dst || transport->dst->obsolete) { 242 if (!transport->dst || transport->dst->obsolete) {
230 dst_release(transport->dst); 243 sctp_transport_dst_release(transport);
231 transport->af_specific->get_dst(transport, &transport->saddr, 244 transport->af_specific->get_dst(transport, &transport->saddr,
232 &transport->fl, sk); 245 &transport->fl, sk);
233 } 246 }
@@ -238,14 +251,13 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
238 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; 251 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
239} 252}
240 253
241void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 pmtu) 254void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
242{ 255{
243 struct dst_entry *dst; 256 struct dst_entry *dst = sctp_transport_dst_check(t);
244 257
245 if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { 258 if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
246 pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n", 259 pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n",
247 __func__, pmtu, 260 __func__, pmtu, SCTP_DEFAULT_MINSEGMENT);
248 SCTP_DEFAULT_MINSEGMENT);
249 /* Use default minimum segment size and disable 261 /* Use default minimum segment size and disable
250 * pmtu discovery on this transport. 262 * pmtu discovery on this transport.
251 */ 263 */
@@ -254,17 +266,13 @@ void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 p
254 t->pathmtu = pmtu; 266 t->pathmtu = pmtu;
255 } 267 }
256 268
257 dst = sctp_transport_dst_check(t);
258 if (!dst)
259 t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
260
261 if (dst) { 269 if (dst) {
262 dst->ops->update_pmtu(dst, sk, NULL, pmtu); 270 dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu);
263
264 dst = sctp_transport_dst_check(t); 271 dst = sctp_transport_dst_check(t);
265 if (!dst)
266 t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
267 } 272 }
273
274 if (!dst)
275 t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk);
268} 276}
269 277
270/* Caches the dst entry and source address for a transport's destination 278/* Caches the dst entry and source address for a transport's destination
@@ -630,9 +638,7 @@ void sctp_transport_reset(struct sctp_transport *t)
630 t->srtt = 0; 638 t->srtt = 0;
631 t->rttvar = 0; 639 t->rttvar = 0;
632 640
633 /* Reset these additional varibles so that we have a clean 641 /* Reset these additional variables so that we have a clean slate. */
634 * slate.
635 */
636 t->partial_bytes_acked = 0; 642 t->partial_bytes_acked = 0;
637 t->flight_size = 0; 643 t->flight_size = 0;
638 t->error_count = 0; 644 t->error_count = 0;
@@ -659,3 +665,17 @@ void sctp_transport_immediate_rtx(struct sctp_transport *t)
659 sctp_transport_hold(t); 665 sctp_transport_hold(t);
660 } 666 }
661} 667}
668
669/* Drop dst */
670void sctp_transport_dst_release(struct sctp_transport *t)
671{
672 dst_release(t->dst);
673 t->dst = NULL;
674 t->dst_pending_confirm = 0;
675}
676
677/* Schedule neighbour confirm */
678void sctp_transport_dst_confirm(struct sctp_transport *t)
679{
680 t->dst_pending_confirm = 1;
681}
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index bea00058ce35..c8881bc542a0 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -854,6 +854,35 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
854 return event; 854 return event;
855} 855}
856 856
857struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event(
858 const struct sctp_association *asoc, __u16 flags, __u16 stream_num,
859 __u16 *stream_list, gfp_t gfp)
860{
861 struct sctp_stream_reset_event *sreset;
862 struct sctp_ulpevent *event;
863 struct sk_buff *skb;
864 int length, i;
865
866 length = sizeof(struct sctp_stream_reset_event) + 2 * stream_num;
867 event = sctp_ulpevent_new(length, MSG_NOTIFICATION, gfp);
868 if (!event)
869 return NULL;
870
871 skb = sctp_event2skb(event);
872 sreset = (struct sctp_stream_reset_event *)skb_put(skb, length);
873
874 sreset->strreset_type = SCTP_STREAM_RESET_EVENT;
875 sreset->strreset_flags = flags;
876 sreset->strreset_length = length;
877 sctp_ulpevent_set_owner(event, asoc);
878 sreset->strreset_assoc_id = sctp_assoc2id(asoc);
879
880 for (i = 0; i < stream_num; i++)
881 sreset->strreset_stream_list[i] = ntohs(stream_list[i]);
882
883 return event;
884}
885
857/* Return the notification type, assuming this is a notification 886/* Return the notification type, assuming this is a notification
858 * event. 887 * event.
859 */ 888 */
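
For context, the notification built by sctp_ulpevent_make_stream_reset_event() reaches user space like any other SCTP notification. A minimal, hedged user-space sketch follows; it assumes the matching SCTP_STREAM_RESET_EVENT / struct sctp_stream_reset_event definitions from this series are available in the uapi headers and that the application has subscribed to the event (link with -lsctp for sctp_recvmsg()):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/sctp.h>

/* sketch: drain one message from an SCTP socket and report reset streams */
static void report_stream_resets(int fd)
{
	char buf[8192];
	struct sctp_stream_reset_event *ev;
	int i, flags = 0;
	ssize_t n;

	n = sctp_recvmsg(fd, buf, sizeof(buf), NULL, NULL, NULL, &flags);
	if (n <= 0 || !(flags & MSG_NOTIFICATION))
		return;

	ev = (struct sctp_stream_reset_event *)buf;
	if (ev->strreset_type != SCTP_STREAM_RESET_EVENT)
		return;

	/* strreset_length covers the header plus one __u16 per stream */
	for (i = 0; i < (ev->strreset_length - sizeof(*ev)) / 2; i++)
		printf("stream %u was reset\n", ev->strreset_stream_list[i]);
}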
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 84d0fdaf7de9..aa3624d50278 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -760,11 +760,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
760 struct sk_buff_head *event_list; 760 struct sk_buff_head *event_list;
761 struct sk_buff *pos, *tmp; 761 struct sk_buff *pos, *tmp;
762 struct sctp_ulpevent *cevent; 762 struct sctp_ulpevent *cevent;
763 struct sctp_stream *in; 763 struct sctp_stream *stream;
764 __u16 sid, csid, cssn; 764 __u16 sid, csid, cssn;
765 765
766 sid = event->stream; 766 sid = event->stream;
767 in = &ulpq->asoc->ssnmap->in; 767 stream = ulpq->asoc->stream;
768 768
769 event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; 769 event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
770 770
@@ -782,11 +782,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
782 if (csid < sid) 782 if (csid < sid)
783 continue; 783 continue;
784 784
785 if (cssn != sctp_ssn_peek(in, sid)) 785 if (cssn != sctp_ssn_peek(stream, in, sid))
786 break; 786 break;
787 787
788 /* Found it, so mark in the ssnmap. */ 788 /* Found it, so mark in the stream. */
789 sctp_ssn_next(in, sid); 789 sctp_ssn_next(stream, in, sid);
790 790
791 __skb_unlink(pos, &ulpq->lobby); 791 __skb_unlink(pos, &ulpq->lobby);
792 792
@@ -849,7 +849,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
849 struct sctp_ulpevent *event) 849 struct sctp_ulpevent *event)
850{ 850{
851 __u16 sid, ssn; 851 __u16 sid, ssn;
852 struct sctp_stream *in; 852 struct sctp_stream *stream;
853 853
854 /* Check if this message needs ordering. */ 854 /* Check if this message needs ordering. */
855 if (SCTP_DATA_UNORDERED & event->msg_flags) 855 if (SCTP_DATA_UNORDERED & event->msg_flags)
@@ -858,10 +858,10 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
858 /* Note: The stream ID must be verified before this routine. */ 858 /* Note: The stream ID must be verified before this routine. */
859 sid = event->stream; 859 sid = event->stream;
860 ssn = event->ssn; 860 ssn = event->ssn;
861 in = &ulpq->asoc->ssnmap->in; 861 stream = ulpq->asoc->stream;
862 862
863 /* Is this the expected SSN for this stream ID? */ 863 /* Is this the expected SSN for this stream ID? */
864 if (ssn != sctp_ssn_peek(in, sid)) { 864 if (ssn != sctp_ssn_peek(stream, in, sid)) {
865 /* We've received something out of order, so find where it 865 /* We've received something out of order, so find where it
866 * needs to be placed. We order by stream and then by SSN. 866 * needs to be placed. We order by stream and then by SSN.
867 */ 867 */
@@ -870,7 +870,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
870 } 870 }
871 871
872 /* Mark that the next chunk has been found. */ 872 /* Mark that the next chunk has been found. */
873 sctp_ssn_next(in, sid); 873 sctp_ssn_next(stream, in, sid);
874 874
875 /* Go find any other chunks that were waiting for 875 /* Go find any other chunks that were waiting for
876 * ordering. 876 * ordering.
@@ -888,12 +888,12 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
888 struct sk_buff *pos, *tmp; 888 struct sk_buff *pos, *tmp;
889 struct sctp_ulpevent *cevent; 889 struct sctp_ulpevent *cevent;
890 struct sctp_ulpevent *event; 890 struct sctp_ulpevent *event;
891 struct sctp_stream *in; 891 struct sctp_stream *stream;
892 struct sk_buff_head temp; 892 struct sk_buff_head temp;
893 struct sk_buff_head *lobby = &ulpq->lobby; 893 struct sk_buff_head *lobby = &ulpq->lobby;
894 __u16 csid, cssn; 894 __u16 csid, cssn;
895 895
896 in = &ulpq->asoc->ssnmap->in; 896 stream = ulpq->asoc->stream;
897 897
898 /* We are holding the chunks by stream, by SSN. */ 898 /* We are holding the chunks by stream, by SSN. */
899 skb_queue_head_init(&temp); 899 skb_queue_head_init(&temp);
@@ -912,7 +912,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
912 continue; 912 continue;
913 913
914 /* see if this ssn has been marked by skipping */ 914 /* see if this ssn has been marked by skipping */
915 if (!SSN_lt(cssn, sctp_ssn_peek(in, csid))) 915 if (!SSN_lt(cssn, sctp_ssn_peek(stream, in, csid)))
916 break; 916 break;
917 917
918 __skb_unlink(pos, lobby); 918 __skb_unlink(pos, lobby);
@@ -932,8 +932,8 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
932 csid = cevent->stream; 932 csid = cevent->stream;
933 cssn = cevent->ssn; 933 cssn = cevent->ssn;
934 934
935 if (csid == sid && cssn == sctp_ssn_peek(in, csid)) { 935 if (csid == sid && cssn == sctp_ssn_peek(stream, in, csid)) {
936 sctp_ssn_next(in, csid); 936 sctp_ssn_next(stream, in, csid);
937 __skb_unlink(pos, lobby); 937 __skb_unlink(pos, lobby);
938 __skb_queue_tail(&temp, pos); 938 __skb_queue_tail(&temp, pos);
939 event = sctp_skb2event(pos); 939 event = sctp_skb2event(pos);
@@ -955,17 +955,17 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
955 */ 955 */
956void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn) 956void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
957{ 957{
958 struct sctp_stream *in; 958 struct sctp_stream *stream;
959 959
960 /* Note: The stream ID must be verified before this routine. */ 960 /* Note: The stream ID must be verified before this routine. */
961 in = &ulpq->asoc->ssnmap->in; 961 stream = ulpq->asoc->stream;
962 962
963 /* Is this an old SSN? If so ignore. */ 963 /* Is this an old SSN? If so ignore. */
964 if (SSN_lt(ssn, sctp_ssn_peek(in, sid))) 964 if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)))
965 return; 965 return;
966 966
967 /* Mark that we are no longer expecting this SSN or lower. */ 967 /* Mark that we are no longer expecting this SSN or lower. */
968 sctp_ssn_skip(in, sid, ssn); 968 sctp_ssn_skip(stream, in, sid, ssn);
969 969
970 /* Go find any other chunks that were waiting for 970 /* Go find any other chunks that were waiting for
971 * ordering and deliver them if needed. 971 * ordering and deliver them if needed.
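
The ulpqueue conversion above replaces the old per-direction ssnmap with the per-association stream and passes the direction token (in) to every SSN accessor. A rough sketch of the assumed shape of those accessors after the change (the real macros live in the SCTP headers and may differ in detail):

/* assumed accessor shape after the ssnmap -> stream conversion;
 * "type" is the direction token, i.e. in or out
 */
#define sctp_ssn_peek(stream, type, sid)	((stream)->type[sid].ssn)
#define sctp_ssn_next(stream, type, sid)	((stream)->type[sid].ssn++)
#define sctp_ssn_skip(stream, type, sid, ssn)	((stream)->type[sid].ssn = ssn + 1)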
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 000000000000..c717ef0896aa
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,20 @@
1config SMC
2 tristate "SMC socket protocol family"
3 depends on INET && INFINIBAND
4 ---help---
5 SMC-R provides a "sockets over RDMA" solution making use of
6 RDMA over Converged Ethernet (RoCE) technology to upgrade
7 AF_INET TCP connections transparently.
8 The Linux implementation of the SMC-R solution is designed as
9 a separate socket family SMC.
10
11 Select this option if you want to run SMC socket applications
12
13config SMC_DIAG
14 tristate "SMC: socket monitoring interface"
15 depends on SMC
16 ---help---
17 Support for the SMC socket monitoring interface used by tools such as
18 smcss.
19
20 If unsure, say Y.
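
As a usage note for the entries above, SMC_DIAG depends on SMC, so a .config fragment enabling both (here as modules) would simply be:

CONFIG_SMC=m
CONFIG_SMC_DIAG=m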
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 000000000000..188104654b54
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,4 @@
1obj-$(CONFIG_SMC) += smc.o
2obj-$(CONFIG_SMC_DIAG) += smc_diag.o
3smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
4smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 000000000000..093803786eac
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,1409 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
5 * applies to SOCK_STREAM sockets only
6 * offers an alternative communication option for TCP-protocol sockets
7 * applicable with RoCE-cards only
8 *
9 * Initial restrictions:
10 * - non-blocking connect postponed
11 * - IPv6 support postponed
12 * - support for alternate links postponed
13 * - partial support for non-blocking sockets only
14 * - support for urgent data postponed
15 *
16 * Copyright IBM Corp. 2016
17 *
18 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
19 * based on prototype from Frank Blaschka
20 */
21
22#define KMSG_COMPONENT "smc"
23#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24
25#include <linux/module.h>
26#include <linux/socket.h>
27#include <linux/inetdevice.h>
28#include <linux/workqueue.h>
29#include <linux/in.h>
30#include <linux/sched/signal.h>
31
32#include <net/sock.h>
33#include <net/tcp.h>
34#include <net/smc.h>
35
36#include "smc.h"
37#include "smc_clc.h"
38#include "smc_llc.h"
39#include "smc_cdc.h"
40#include "smc_core.h"
41#include "smc_ib.h"
42#include "smc_pnet.h"
43#include "smc_tx.h"
44#include "smc_rx.h"
45#include "smc_close.h"
46
47static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
48 * creation
49 */
50
51struct smc_lgr_list smc_lgr_list = { /* established link groups */
52 .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
53 .list = LIST_HEAD_INIT(smc_lgr_list.list),
54};
55
56static void smc_tcp_listen_work(struct work_struct *);
57
58static void smc_set_keepalive(struct sock *sk, int val)
59{
60 struct smc_sock *smc = smc_sk(sk);
61
62 smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
63}
64
65static struct smc_hashinfo smc_v4_hashinfo = {
66 .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
67};
68
69int smc_hash_sk(struct sock *sk)
70{
71 struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
72 struct hlist_head *head;
73
74 head = &h->ht;
75
76 write_lock_bh(&h->lock);
77 sk_add_node(sk, head);
78 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
79 write_unlock_bh(&h->lock);
80
81 return 0;
82}
83EXPORT_SYMBOL_GPL(smc_hash_sk);
84
85void smc_unhash_sk(struct sock *sk)
86{
87 struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
88
89 write_lock_bh(&h->lock);
90 if (sk_del_node_init(sk))
91 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
92 write_unlock_bh(&h->lock);
93}
94EXPORT_SYMBOL_GPL(smc_unhash_sk);
95
96struct proto smc_proto = {
97 .name = "SMC",
98 .owner = THIS_MODULE,
99 .keepalive = smc_set_keepalive,
100 .hash = smc_hash_sk,
101 .unhash = smc_unhash_sk,
102 .obj_size = sizeof(struct smc_sock),
103 .h.smc_hash = &smc_v4_hashinfo,
104 .slab_flags = SLAB_DESTROY_BY_RCU,
105};
106EXPORT_SYMBOL_GPL(smc_proto);
107
108static int smc_release(struct socket *sock)
109{
110 struct sock *sk = sock->sk;
111 struct smc_sock *smc;
112 int rc = 0;
113
114 if (!sk)
115 goto out;
116
117 smc = smc_sk(sk);
118 sock_hold(sk);
119 if (sk->sk_state == SMC_LISTEN)
120 /* smc_close_non_accepted() is called and acquires
121 * sock lock for child sockets again
122 */
123 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
124 else
125 lock_sock(sk);
126
127 if (smc->use_fallback) {
128 sk->sk_state = SMC_CLOSED;
129 sk->sk_state_change(sk);
130 } else {
131 rc = smc_close_active(smc);
132 sock_set_flag(sk, SOCK_DEAD);
133 sk->sk_shutdown |= SHUTDOWN_MASK;
134 }
135 if (smc->clcsock) {
136 sock_release(smc->clcsock);
137 smc->clcsock = NULL;
138 }
139
140 /* detach socket */
141 sock_orphan(sk);
142 sock->sk = NULL;
143 if (smc->use_fallback) {
144 schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
145 } else if (sk->sk_state == SMC_CLOSED) {
146 smc_conn_free(&smc->conn);
147 schedule_delayed_work(&smc->sock_put_work,
148 SMC_CLOSE_SOCK_PUT_DELAY);
149 }
150 sk->sk_prot->unhash(sk);
151 release_sock(sk);
152
153 sock_put(sk);
154out:
155 return rc;
156}
157
158static void smc_destruct(struct sock *sk)
159{
160 if (sk->sk_state != SMC_CLOSED)
161 return;
162 if (!sock_flag(sk, SOCK_DEAD))
163 return;
164
165 sk_refcnt_debug_dec(sk);
166}
167
168static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
169{
170 struct smc_sock *smc;
171 struct sock *sk;
172
173 sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
174 if (!sk)
175 return NULL;
176
177 sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
178 sk->sk_state = SMC_INIT;
179 sk->sk_destruct = smc_destruct;
180 sk->sk_protocol = SMCPROTO_SMC;
181 smc = smc_sk(sk);
182 INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
183 INIT_LIST_HEAD(&smc->accept_q);
184 spin_lock_init(&smc->accept_q_lock);
185 INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
186 sk->sk_prot->hash(sk);
187 sk_refcnt_debug_inc(sk);
188
189 return sk;
190}
191
192static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
193 int addr_len)
194{
195 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
196 struct sock *sk = sock->sk;
197 struct smc_sock *smc;
198 int rc;
199
200 smc = smc_sk(sk);
201
202 /* replicate tests from inet_bind(), to be safe wrt. future changes */
203 rc = -EINVAL;
204 if (addr_len < sizeof(struct sockaddr_in))
205 goto out;
206
207 rc = -EAFNOSUPPORT;
208 /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
209 if ((addr->sin_family != AF_INET) &&
210 ((addr->sin_family != AF_UNSPEC) ||
211 (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
212 goto out;
213
214 lock_sock(sk);
215
216 /* Check if socket is already active */
217 rc = -EINVAL;
218 if (sk->sk_state != SMC_INIT)
219 goto out_rel;
220
221 smc->clcsock->sk->sk_reuse = sk->sk_reuse;
222 rc = kernel_bind(smc->clcsock, uaddr, addr_len);
223
224out_rel:
225 release_sock(sk);
226out:
227 return rc;
228}
229
230static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
231 unsigned long mask)
232{
233 /* options we don't get control of via setsockopt */
234 nsk->sk_type = osk->sk_type;
235 nsk->sk_sndbuf = osk->sk_sndbuf;
236 nsk->sk_rcvbuf = osk->sk_rcvbuf;
237 nsk->sk_sndtimeo = osk->sk_sndtimeo;
238 nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
239 nsk->sk_mark = osk->sk_mark;
240 nsk->sk_priority = osk->sk_priority;
241 nsk->sk_rcvlowat = osk->sk_rcvlowat;
242 nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
243 nsk->sk_err = osk->sk_err;
244
245 nsk->sk_flags &= ~mask;
246 nsk->sk_flags |= osk->sk_flags & mask;
247}
248
249#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
250 (1UL << SOCK_KEEPOPEN) | \
251 (1UL << SOCK_LINGER) | \
252 (1UL << SOCK_BROADCAST) | \
253 (1UL << SOCK_TIMESTAMP) | \
254 (1UL << SOCK_DBG) | \
255 (1UL << SOCK_RCVTSTAMP) | \
256 (1UL << SOCK_RCVTSTAMPNS) | \
257 (1UL << SOCK_LOCALROUTE) | \
258 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
259 (1UL << SOCK_RXQ_OVFL) | \
260 (1UL << SOCK_WIFI_STATUS) | \
261 (1UL << SOCK_NOFCS) | \
262 (1UL << SOCK_FILTER_LOCKED))
263/* copy only relevant settings and flags of SOL_SOCKET level from smc to
264 * clc socket (since smc is not called for these options from net/core)
265 */
266static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
267{
268 smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
269}
270
271#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
272 (1UL << SOCK_KEEPOPEN) | \
273 (1UL << SOCK_LINGER) | \
274 (1UL << SOCK_DBG))
275/* copy only settings and flags relevant for smc from clc to smc socket */
276static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
277{
278 smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
279}
280
281/* determine subnet and mask of internal TCP socket */
282int smc_netinfo_by_tcpsk(struct socket *clcsock,
283 __be32 *subnet, u8 *prefix_len)
284{
285 struct dst_entry *dst = sk_dst_get(clcsock->sk);
286 struct sockaddr_in addr;
287 int rc = -ENOENT;
288 int len;
289
290 if (!dst) {
291 rc = -ENOTCONN;
292 goto out;
293 }
294 if (!dst->dev) {
295 rc = -ENODEV;
296 goto out_rel;
297 }
298
299 /* get address to which the internal TCP socket is bound */
300 kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
301 /* analyze IPv4 specific data of net_device belonging to TCP socket */
302 for_ifa(dst->dev->ip_ptr) {
303 if (ifa->ifa_address != addr.sin_addr.s_addr)
304 continue;
305 *prefix_len = inet_mask_len(ifa->ifa_mask);
306 *subnet = ifa->ifa_address & ifa->ifa_mask;
307 rc = 0;
308 break;
309 } endfor_ifa(dst->dev->ip_ptr);
310
311out_rel:
312 dst_release(dst);
313out:
314 return rc;
315}
316
317static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
318{
319 struct smc_link_group *lgr = smc->conn.lgr;
320 struct smc_link *link;
321 int rest;
322 int rc;
323
324 link = &lgr->lnk[SMC_SINGLE_LINK];
325 /* receive CONFIRM LINK request from server over RoCE fabric */
326 rest = wait_for_completion_interruptible_timeout(
327 &link->llc_confirm,
328 SMC_LLC_WAIT_FIRST_TIME);
329 if (rest <= 0) {
330 struct smc_clc_msg_decline dclc;
331
332 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
333 SMC_CLC_DECLINE);
334 return rc;
335 }
336
337 rc = smc_ib_modify_qp_rts(link);
338 if (rc)
339 return SMC_CLC_DECL_INTERR;
340
341 smc_wr_remember_qp_attr(link);
342 /* send CONFIRM LINK response over RoCE fabric */
343 rc = smc_llc_send_confirm_link(link,
344 link->smcibdev->mac[link->ibport - 1],
345 gid, SMC_LLC_RESP);
346 if (rc < 0)
347 return SMC_CLC_DECL_TCL;
348
349 return rc;
350}
351
352static void smc_conn_save_peer_info(struct smc_sock *smc,
353 struct smc_clc_msg_accept_confirm *clc)
354{
355 smc->conn.peer_conn_idx = clc->conn_idx;
356 smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
357 smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
358 atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
359}
360
361static void smc_link_save_peer_info(struct smc_link *link,
362 struct smc_clc_msg_accept_confirm *clc)
363{
364 link->peer_qpn = ntoh24(clc->qpn);
365 memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
366 memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
367 link->peer_psn = ntoh24(clc->psn);
368 link->peer_mtu = clc->qp_mtu;
369}
370
371/* setup for RDMA connection of client */
372static int smc_connect_rdma(struct smc_sock *smc)
373{
374 struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
375 struct smc_clc_msg_accept_confirm aclc;
376 int local_contact = SMC_FIRST_CONTACT;
377 struct smc_ib_device *smcibdev;
378 struct smc_link *link;
379 u8 srv_first_contact;
380 int reason_code = 0;
381 int rc = 0;
382 u8 ibport;
383
384 /* IPSec connections opt out of SMC-R optimizations */
385 if (using_ipsec(smc)) {
386 reason_code = SMC_CLC_DECL_IPSEC;
387 goto decline_rdma;
388 }
389
390 /* PNET table lookup: search active ib_device and port
391 * within same PNETID that also contains the ethernet device
392 * used for the internal TCP socket
393 */
394 smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
395 if (!smcibdev) {
396 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
397 goto decline_rdma;
398 }
399
400 /* do inband token exchange */
401 reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
402 if (reason_code < 0) {
403 rc = reason_code;
404 goto out_err;
405 }
406 if (reason_code > 0) /* configuration error */
407 goto decline_rdma;
408 /* receive SMC Accept CLC message */
409 reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
410 SMC_CLC_ACCEPT);
411 if (reason_code < 0) {
412 rc = reason_code;
413 goto out_err;
414 }
415 if (reason_code > 0)
416 goto decline_rdma;
417
418 srv_first_contact = aclc.hdr.flag;
419 mutex_lock(&smc_create_lgr_pending);
420 local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
421 ibport, &aclc.lcl, srv_first_contact);
422 if (local_contact < 0) {
423 rc = local_contact;
424 if (rc == -ENOMEM)
425 reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
426 else if (rc == -ENOLINK)
427 reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
428 goto decline_rdma_unlock;
429 }
430 link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
431
432 smc_conn_save_peer_info(smc, &aclc);
433
434 rc = smc_sndbuf_create(smc);
435 if (rc) {
436 reason_code = SMC_CLC_DECL_MEM;
437 goto decline_rdma_unlock;
438 }
439 rc = smc_rmb_create(smc);
440 if (rc) {
441 reason_code = SMC_CLC_DECL_MEM;
442 goto decline_rdma_unlock;
443 }
444
445 if (local_contact == SMC_FIRST_CONTACT)
446 smc_link_save_peer_info(link, &aclc);
447
448 rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
449 if (rc) {
450 reason_code = SMC_CLC_DECL_INTERR;
451 goto decline_rdma_unlock;
452 }
453
454 if (local_contact == SMC_FIRST_CONTACT) {
455 rc = smc_ib_ready_link(link);
456 if (rc) {
457 reason_code = SMC_CLC_DECL_INTERR;
458 goto decline_rdma_unlock;
459 }
460 }
461
462 rc = smc_clc_send_confirm(smc);
463 if (rc)
464 goto out_err_unlock;
465
466 if (local_contact == SMC_FIRST_CONTACT) {
467 /* QP confirmation over RoCE fabric */
468 reason_code = smc_clnt_conf_first_link(
469 smc, &smcibdev->gid[ibport - 1]);
470 if (reason_code < 0) {
471 rc = reason_code;
472 goto out_err_unlock;
473 }
474 if (reason_code > 0)
475 goto decline_rdma_unlock;
476 }
477
478 mutex_unlock(&smc_create_lgr_pending);
479 smc_tx_init(smc);
480 smc_rx_init(smc);
481
482out_connected:
483 smc_copy_sock_settings_to_clc(smc);
484 if (smc->sk.sk_state == SMC_INIT)
485 smc->sk.sk_state = SMC_ACTIVE;
486
487 return rc ? rc : local_contact;
488
489decline_rdma_unlock:
490 mutex_unlock(&smc_create_lgr_pending);
491 smc_conn_free(&smc->conn);
492decline_rdma:
493 /* RDMA setup failed, switch back to TCP */
494 smc->use_fallback = true;
495 if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
496 rc = smc_clc_send_decline(smc, reason_code, 0);
497 if (rc < sizeof(struct smc_clc_msg_decline))
498 goto out_err;
499 }
500 goto out_connected;
501
502out_err_unlock:
503 mutex_unlock(&smc_create_lgr_pending);
504 smc_conn_free(&smc->conn);
505out_err:
506 return rc;
507}
508
509static int smc_connect(struct socket *sock, struct sockaddr *addr,
510 int alen, int flags)
511{
512 struct sock *sk = sock->sk;
513 struct smc_sock *smc;
514 int rc = -EINVAL;
515
516 smc = smc_sk(sk);
517
518 /* separate smc parameter checking to be safe */
519 if (alen < sizeof(addr->sa_family))
520 goto out_err;
521 if (addr->sa_family != AF_INET)
522 goto out_err;
523 smc->addr = addr; /* needed for nonblocking connect */
524
525 lock_sock(sk);
526 switch (sk->sk_state) {
527 default:
528 goto out;
529 case SMC_ACTIVE:
530 rc = -EISCONN;
531 goto out;
532 case SMC_INIT:
533 rc = 0;
534 break;
535 }
536
537 smc_copy_sock_settings_to_clc(smc);
538 rc = kernel_connect(smc->clcsock, addr, alen, flags);
539 if (rc)
540 goto out;
541
542 /* setup RDMA connection */
543 rc = smc_connect_rdma(smc);
544 if (rc < 0)
545 goto out;
546 else
547 rc = 0; /* success cases including fallback */
548
549out:
550 release_sock(sk);
551out_err:
552 return rc;
553}
554
555static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
556{
557 struct sock *sk = &lsmc->sk;
558 struct socket *new_clcsock;
559 struct sock *new_sk;
560 int rc;
561
562 release_sock(&lsmc->sk);
563 new_sk = smc_sock_alloc(sock_net(sk), NULL);
564 if (!new_sk) {
565 rc = -ENOMEM;
566 lsmc->sk.sk_err = ENOMEM;
567 *new_smc = NULL;
568 lock_sock(&lsmc->sk);
569 goto out;
570 }
571 *new_smc = smc_sk(new_sk);
572
573 rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
574 lock_sock(&lsmc->sk);
575 if (rc < 0) {
576 lsmc->sk.sk_err = -rc;
577 new_sk->sk_state = SMC_CLOSED;
578 sock_set_flag(new_sk, SOCK_DEAD);
579 sk->sk_prot->unhash(new_sk);
580 sock_put(new_sk);
581 *new_smc = NULL;
582 goto out;
583 }
584 if (lsmc->sk.sk_state == SMC_CLOSED) {
585 if (new_clcsock)
586 sock_release(new_clcsock);
587 new_sk->sk_state = SMC_CLOSED;
588 sock_set_flag(new_sk, SOCK_DEAD);
589 sk->sk_prot->unhash(new_sk);
590 sock_put(new_sk);
591 *new_smc = NULL;
592 goto out;
593 }
594
595 (*new_smc)->clcsock = new_clcsock;
596out:
597 return rc;
598}
599
600/* add a just created sock to the accept queue of the listen sock as
601 * candidate for a following socket accept call from user space
602 */
603static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
604{
605 struct smc_sock *par = smc_sk(parent);
606
607 sock_hold(sk);
608 spin_lock(&par->accept_q_lock);
609 list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
610 spin_unlock(&par->accept_q_lock);
611 sk_acceptq_added(parent);
612}
613
614/* remove a socket from the accept queue of its parent listening socket */
615static void smc_accept_unlink(struct sock *sk)
616{
617 struct smc_sock *par = smc_sk(sk)->listen_smc;
618
619 spin_lock(&par->accept_q_lock);
620 list_del_init(&smc_sk(sk)->accept_q);
621 spin_unlock(&par->accept_q_lock);
622 sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
623 sock_put(sk);
624}
625
626/* remove a sock from the accept queue to bind it to a new socket created
627 * for a socket accept call from user space
628 */
629struct sock *smc_accept_dequeue(struct sock *parent,
630 struct socket *new_sock)
631{
632 struct smc_sock *isk, *n;
633 struct sock *new_sk;
634
635 list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
636 new_sk = (struct sock *)isk;
637
638 smc_accept_unlink(new_sk);
639 if (new_sk->sk_state == SMC_CLOSED) {
640 /* tbd in follow-on patch: close this sock */
641 continue;
642 }
643 if (new_sock)
644 sock_graft(new_sk, new_sock);
645 return new_sk;
646 }
647 return NULL;
648}
649
650/* clean up for a created but never accepted sock */
651void smc_close_non_accepted(struct sock *sk)
652{
653 struct smc_sock *smc = smc_sk(sk);
654
655 sock_hold(sk);
656 lock_sock(sk);
657 if (!sk->sk_lingertime)
658 /* wait for peer closing */
659 sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
660 if (!smc->use_fallback)
661 smc_close_active(smc);
662 if (smc->clcsock) {
663 struct socket *tcp;
664
665 tcp = smc->clcsock;
666 smc->clcsock = NULL;
667 sock_release(tcp);
668 }
669 sock_set_flag(sk, SOCK_DEAD);
670 sk->sk_shutdown |= SHUTDOWN_MASK;
671 if (smc->use_fallback) {
672 schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
673 } else {
674 smc_conn_free(&smc->conn);
675 schedule_delayed_work(&smc->sock_put_work,
676 SMC_CLOSE_SOCK_PUT_DELAY);
677 }
678 release_sock(sk);
679 sock_put(sk);
680}
681
682static int smc_serv_conf_first_link(struct smc_sock *smc)
683{
684 struct smc_link_group *lgr = smc->conn.lgr;
685 struct smc_link *link;
686 int rest;
687 int rc;
688
689 link = &lgr->lnk[SMC_SINGLE_LINK];
690 /* send CONFIRM LINK request to client over the RoCE fabric */
691 rc = smc_llc_send_confirm_link(link,
692 link->smcibdev->mac[link->ibport - 1],
693 &link->smcibdev->gid[link->ibport - 1],
694 SMC_LLC_REQ);
695 if (rc < 0)
696 return SMC_CLC_DECL_TCL;
697
698 /* receive CONFIRM LINK response from client over the RoCE fabric */
699 rest = wait_for_completion_interruptible_timeout(
700 &link->llc_confirm_resp,
701 SMC_LLC_WAIT_FIRST_TIME);
702 if (rest <= 0) {
703 struct smc_clc_msg_decline dclc;
704
705 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
706 SMC_CLC_DECLINE);
707 }
708
709 return rc;
710}
711
712/* setup for RDMA connection of server */
713static void smc_listen_work(struct work_struct *work)
714{
715 struct smc_sock *new_smc = container_of(work, struct smc_sock,
716 smc_listen_work);
717 struct socket *newclcsock = new_smc->clcsock;
718 struct smc_sock *lsmc = new_smc->listen_smc;
719 struct smc_clc_msg_accept_confirm cclc;
720 int local_contact = SMC_REUSE_CONTACT;
721 struct sock *newsmcsk = &new_smc->sk;
722 struct smc_clc_msg_proposal pclc;
723 struct smc_ib_device *smcibdev;
724 struct sockaddr_in peeraddr;
725 struct smc_link *link;
726 int reason_code = 0;
727 int rc = 0, len;
728 __be32 subnet;
729 u8 prefix_len;
730 u8 ibport;
731
732 /* do inband token exchange -
733 * wait for and receive SMC Proposal CLC message
734 */
735 reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
736 SMC_CLC_PROPOSAL);
737 if (reason_code < 0)
738 goto out_err;
739 if (reason_code > 0)
740 goto decline_rdma;
741
742 /* IPSec connections opt out of SMC-R optimizations */
743 if (using_ipsec(new_smc)) {
744 reason_code = SMC_CLC_DECL_IPSEC;
745 goto decline_rdma;
746 }
747
748 /* PNET table lookup: search active ib_device and port
749 * within same PNETID that also contains the ethernet device
750 * used for the internal TCP socket
751 */
752 smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
753 if (!smcibdev) {
754 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
755 goto decline_rdma;
756 }
757
758 /* determine subnet and mask from internal TCP socket */
759 rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
760 if (rc) {
761 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
762 goto decline_rdma;
763 }
764 if ((pclc.outgoing_subnet != subnet) ||
765 (pclc.prefix_len != prefix_len)) {
766 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
767 goto decline_rdma;
768 }
769
770 /* get address of the peer connected to the internal TCP socket */
771 kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
772
773 /* allocate connection / link group */
774 mutex_lock(&smc_create_lgr_pending);
775 local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
776 smcibdev, ibport, &pclc.lcl, 0);
777 if (local_contact == SMC_REUSE_CONTACT)
778 /* lock no longer needed; release it because of the following
779 * smc_clc_wait_msg() call
780 */
781 mutex_unlock(&smc_create_lgr_pending);
782 if (local_contact < 0) {
783 rc = local_contact;
784 if (rc == -ENOMEM)
785 reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
786 else if (rc == -ENOLINK)
787 reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
788 goto decline_rdma;
789 }
790 link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
791
792 rc = smc_sndbuf_create(new_smc);
793 if (rc) {
794 reason_code = SMC_CLC_DECL_MEM;
795 goto decline_rdma;
796 }
797 rc = smc_rmb_create(new_smc);
798 if (rc) {
799 reason_code = SMC_CLC_DECL_MEM;
800 goto decline_rdma;
801 }
802
803 rc = smc_clc_send_accept(new_smc, local_contact);
804 if (rc)
805 goto out_err;
806
807 /* receive SMC Confirm CLC message */
808 reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
809 SMC_CLC_CONFIRM);
810 if (reason_code < 0)
811 goto out_err;
812 if (reason_code > 0)
813 goto decline_rdma;
814 smc_conn_save_peer_info(new_smc, &cclc);
815 if (local_contact == SMC_FIRST_CONTACT)
816 smc_link_save_peer_info(link, &cclc);
817
818 rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
819 if (rc) {
820 reason_code = SMC_CLC_DECL_INTERR;
821 goto decline_rdma;
822 }
823
824 if (local_contact == SMC_FIRST_CONTACT) {
825 rc = smc_ib_ready_link(link);
826 if (rc) {
827 reason_code = SMC_CLC_DECL_INTERR;
828 goto decline_rdma;
829 }
830 /* QP confirmation over RoCE fabric */
831 reason_code = smc_serv_conf_first_link(new_smc);
832 if (reason_code < 0) {
833 /* peer is not aware of a problem */
834 rc = reason_code;
835 goto out_err;
836 }
837 if (reason_code > 0)
838 goto decline_rdma;
839 }
840
841 smc_tx_init(new_smc);
842 smc_rx_init(new_smc);
843
844out_connected:
845 sk_refcnt_debug_inc(newsmcsk);
846 if (newsmcsk->sk_state == SMC_INIT)
847 newsmcsk->sk_state = SMC_ACTIVE;
848enqueue:
849 if (local_contact == SMC_FIRST_CONTACT)
850 mutex_unlock(&smc_create_lgr_pending);
851 lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
852 if (lsmc->sk.sk_state == SMC_LISTEN) {
853 smc_accept_enqueue(&lsmc->sk, newsmcsk);
854 } else { /* no longer listening */
855 smc_close_non_accepted(newsmcsk);
856 }
857 release_sock(&lsmc->sk);
858
859 /* Wake up accept */
860 lsmc->sk.sk_data_ready(&lsmc->sk);
861 sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
862 return;
863
864decline_rdma:
865 /* RDMA setup failed, switch back to TCP */
866 smc_conn_free(&new_smc->conn);
867 new_smc->use_fallback = true;
868 if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
869 rc = smc_clc_send_decline(new_smc, reason_code, 0);
870 if (rc < sizeof(struct smc_clc_msg_decline))
871 goto out_err;
872 }
873 goto out_connected;
874
875out_err:
876 newsmcsk->sk_state = SMC_CLOSED;
877 smc_conn_free(&new_smc->conn);
878 goto enqueue; /* queue new sock with sk_err set */
879}
880
881static void smc_tcp_listen_work(struct work_struct *work)
882{
883 struct smc_sock *lsmc = container_of(work, struct smc_sock,
884 tcp_listen_work);
885 struct smc_sock *new_smc;
886 int rc = 0;
887
888 lock_sock(&lsmc->sk);
889 while (lsmc->sk.sk_state == SMC_LISTEN) {
890 rc = smc_clcsock_accept(lsmc, &new_smc);
891 if (rc)
892 goto out;
893 if (!new_smc)
894 continue;
895
896 new_smc->listen_smc = lsmc;
897 new_smc->use_fallback = false; /* assume rdma capability first*/
898 sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
899 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
900 smc_copy_sock_settings_to_smc(new_smc);
901 schedule_work(&new_smc->smc_listen_work);
902 }
903
904out:
905 release_sock(&lsmc->sk);
906 lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
907}
908
909static int smc_listen(struct socket *sock, int backlog)
910{
911 struct sock *sk = sock->sk;
912 struct smc_sock *smc;
913 int rc;
914
915 smc = smc_sk(sk);
916 lock_sock(sk);
917
918 rc = -EINVAL;
919 if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
920 goto out;
921
922 rc = 0;
923 if (sk->sk_state == SMC_LISTEN) {
924 sk->sk_max_ack_backlog = backlog;
925 goto out;
926 }
927 /* some socket options are handled in core, so we cannot apply
928 * them to the clc socket -- copy smc socket options to clc socket
929 */
930 smc_copy_sock_settings_to_clc(smc);
931
932 rc = kernel_listen(smc->clcsock, backlog);
933 if (rc)
934 goto out;
935 sk->sk_max_ack_backlog = backlog;
936 sk->sk_ack_backlog = 0;
937 sk->sk_state = SMC_LISTEN;
938 INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
939 schedule_work(&smc->tcp_listen_work);
940
941out:
942 release_sock(sk);
943 return rc;
944}
945
946static int smc_accept(struct socket *sock, struct socket *new_sock,
947 int flags, bool kern)
948{
949 struct sock *sk = sock->sk, *nsk;
950 DECLARE_WAITQUEUE(wait, current);
951 struct smc_sock *lsmc;
952 long timeo;
953 int rc = 0;
954
955 lsmc = smc_sk(sk);
956 lock_sock(sk);
957
958 if (lsmc->sk.sk_state != SMC_LISTEN) {
959 rc = -EINVAL;
960 goto out;
961 }
962
963 /* Wait for an incoming connection */
964 timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
965 add_wait_queue_exclusive(sk_sleep(sk), &wait);
966 while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
967 set_current_state(TASK_INTERRUPTIBLE);
968 if (!timeo) {
969 rc = -EAGAIN;
970 break;
971 }
972 release_sock(sk);
973 timeo = schedule_timeout(timeo);
974 /* wakeup by sk_data_ready in smc_listen_work() */
975 sched_annotate_sleep();
976 lock_sock(sk);
977 if (signal_pending(current)) {
978 rc = sock_intr_errno(timeo);
979 break;
980 }
981 }
982 set_current_state(TASK_RUNNING);
983 remove_wait_queue(sk_sleep(sk), &wait);
984
985 if (!rc)
986 rc = sock_error(nsk);
987
988out:
989 release_sock(sk);
990 return rc;
991}
992
993static int smc_getname(struct socket *sock, struct sockaddr *addr,
994 int *len, int peer)
995{
996 struct smc_sock *smc;
997
998 if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
999 (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1000 return -ENOTCONN;
1001
1002 smc = smc_sk(sock->sk);
1003
1004 return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
1005}
1006
1007static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1008{
1009 struct sock *sk = sock->sk;
1010 struct smc_sock *smc;
1011 int rc = -EPIPE;
1012
1013 smc = smc_sk(sk);
1014 lock_sock(sk);
1015 if ((sk->sk_state != SMC_ACTIVE) &&
1016 (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1017 (sk->sk_state != SMC_INIT))
1018 goto out;
1019 if (smc->use_fallback)
1020 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1021 else
1022 rc = smc_tx_sendmsg(smc, msg, len);
1023out:
1024 release_sock(sk);
1025 return rc;
1026}
1027
1028static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1029 int flags)
1030{
1031 struct sock *sk = sock->sk;
1032 struct smc_sock *smc;
1033 int rc = -ENOTCONN;
1034
1035 smc = smc_sk(sk);
1036 lock_sock(sk);
1037 if ((sk->sk_state == SMC_INIT) ||
1038 (sk->sk_state == SMC_LISTEN) ||
1039 (sk->sk_state == SMC_CLOSED))
1040 goto out;
1041
1042 if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1043 rc = 0;
1044 goto out;
1045 }
1046
1047 if (smc->use_fallback)
1048 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1049 else
1050 rc = smc_rx_recvmsg(smc, msg, len, flags);
1051
1052out:
1053 release_sock(sk);
1054 return rc;
1055}
1056
1057static unsigned int smc_accept_poll(struct sock *parent)
1058{
1059 struct smc_sock *isk;
1060 struct sock *sk;
1061
1062 lock_sock(parent);
1063 list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
1064 sk = (struct sock *)isk;
1065
1066 if (sk->sk_state == SMC_ACTIVE) {
1067 release_sock(parent);
1068 return POLLIN | POLLRDNORM;
1069 }
1070 }
1071 release_sock(parent);
1072
1073 return 0;
1074}
1075
1076static unsigned int smc_poll(struct file *file, struct socket *sock,
1077 poll_table *wait)
1078{
1079 struct sock *sk = sock->sk;
1080 unsigned int mask = 0;
1081 struct smc_sock *smc;
1082 int rc;
1083
1084 smc = smc_sk(sock->sk);
1085 if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
1086 /* delegate to CLC child sock */
1087 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1088 /* if non-blocking connect finished ... */
1089 lock_sock(sk);
1090 if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
1091 sk->sk_err = smc->clcsock->sk->sk_err;
1092 if (sk->sk_err) {
1093 mask |= POLLERR;
1094 } else {
1095 rc = smc_connect_rdma(smc);
1096 if (rc < 0)
1097 mask |= POLLERR;
1098 else
1099 /* success cases including fallback */
1100 mask |= POLLOUT | POLLWRNORM;
1101 }
1102 }
1103 release_sock(sk);
1104 } else {
1105 sock_poll_wait(file, sk_sleep(sk), wait);
1106 if (sk->sk_state == SMC_LISTEN)
1107 /* woken up by sk_data_ready in smc_listen_work() */
1108 mask |= smc_accept_poll(sk);
1109 if (sk->sk_err)
1110 mask |= POLLERR;
1111 if (atomic_read(&smc->conn.sndbuf_space) ||
1112 (sk->sk_shutdown & SEND_SHUTDOWN)) {
1113 mask |= POLLOUT | POLLWRNORM;
1114 } else {
1115 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1116 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1117 }
1118 if (atomic_read(&smc->conn.bytes_to_rcv))
1119 mask |= POLLIN | POLLRDNORM;
1120 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1121 (sk->sk_state == SMC_CLOSED))
1122 mask |= POLLHUP;
1123 if (sk->sk_shutdown & RCV_SHUTDOWN)
1124 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
1125 if (sk->sk_state == SMC_APPCLOSEWAIT1)
1126 mask |= POLLIN;
1127
1128 }
1129
1130 return mask;
1131}
1132
1133static int smc_shutdown(struct socket *sock, int how)
1134{
1135 struct sock *sk = sock->sk;
1136 struct smc_sock *smc;
1137 int rc = -EINVAL;
1138 int rc1 = 0;
1139
1140 smc = smc_sk(sk);
1141
1142 if ((how < SHUT_RD) || (how > SHUT_RDWR))
1143 return rc;
1144
1145 lock_sock(sk);
1146
1147 rc = -ENOTCONN;
1148 if ((sk->sk_state != SMC_LISTEN) &&
1149 (sk->sk_state != SMC_ACTIVE) &&
1150 (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1151 (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1152 (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1153 (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1154 (sk->sk_state != SMC_APPFINCLOSEWAIT))
1155 goto out;
1156 if (smc->use_fallback) {
1157 rc = kernel_sock_shutdown(smc->clcsock, how);
1158 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1159 if (sk->sk_shutdown == SHUTDOWN_MASK)
1160 sk->sk_state = SMC_CLOSED;
1161 goto out;
1162 }
1163 switch (how) {
1164 case SHUT_RDWR: /* shutdown in both directions */
1165 rc = smc_close_active(smc);
1166 break;
1167 case SHUT_WR:
1168 rc = smc_close_shutdown_write(smc);
1169 break;
1170 case SHUT_RD:
1171 if (sk->sk_state == SMC_LISTEN)
1172 rc = smc_close_active(smc);
1173 else
1174 rc = 0;
1175 /* nothing more to do because peer is not involved */
1176 break;
1177 }
1178 rc1 = kernel_sock_shutdown(smc->clcsock, how);
1179 /* map sock_shutdown_cmd constants to sk_shutdown value range */
1180 sk->sk_shutdown |= how + 1;
1181
1182out:
1183 release_sock(sk);
1184 return rc ? rc : rc1;
1185}
1186
1187static int smc_setsockopt(struct socket *sock, int level, int optname,
1188 char __user *optval, unsigned int optlen)
1189{
1190 struct sock *sk = sock->sk;
1191 struct smc_sock *smc;
1192
1193 smc = smc_sk(sk);
1194
1195 /* generic setsockopts reaching us here always apply to the
1196 * CLC socket
1197 */
1198 return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1199 optval, optlen);
1200}
1201
1202static int smc_getsockopt(struct socket *sock, int level, int optname,
1203 char __user *optval, int __user *optlen)
1204{
1205 struct smc_sock *smc;
1206
1207 smc = smc_sk(sock->sk);
1208 /* socket options apply to the CLC socket */
1209 return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1210 optval, optlen);
1211}
1212
1213static int smc_ioctl(struct socket *sock, unsigned int cmd,
1214 unsigned long arg)
1215{
1216 struct smc_sock *smc;
1217
1218 smc = smc_sk(sock->sk);
1219 if (smc->use_fallback)
1220 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1221 else
1222 return sock_no_ioctl(sock, cmd, arg);
1223}
1224
1225static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1226 int offset, size_t size, int flags)
1227{
1228 struct sock *sk = sock->sk;
1229 struct smc_sock *smc;
1230 int rc = -EPIPE;
1231
1232 smc = smc_sk(sk);
1233 lock_sock(sk);
1234 if (sk->sk_state != SMC_ACTIVE)
1235 goto out;
1236 if (smc->use_fallback)
1237 rc = kernel_sendpage(smc->clcsock, page, offset,
1238 size, flags);
1239 else
1240 rc = sock_no_sendpage(sock, page, offset, size, flags);
1241
1242out:
1243 release_sock(sk);
1244 return rc;
1245}
1246
1247static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1248 struct pipe_inode_info *pipe, size_t len,
1249 unsigned int flags)
1250{
1251 struct sock *sk = sock->sk;
1252 struct smc_sock *smc;
1253 int rc = -ENOTCONN;
1254
1255 smc = smc_sk(sk);
1256 lock_sock(sk);
1257 if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1258 goto out;
1259 if (smc->use_fallback) {
1260 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1261 pipe, len, flags);
1262 } else {
1263 rc = -EOPNOTSUPP;
1264 }
1265out:
1266 release_sock(sk);
1267 return rc;
1268}
1269
1270/* must look like tcp */
1271static const struct proto_ops smc_sock_ops = {
1272 .family = PF_SMC,
1273 .owner = THIS_MODULE,
1274 .release = smc_release,
1275 .bind = smc_bind,
1276 .connect = smc_connect,
1277 .socketpair = sock_no_socketpair,
1278 .accept = smc_accept,
1279 .getname = smc_getname,
1280 .poll = smc_poll,
1281 .ioctl = smc_ioctl,
1282 .listen = smc_listen,
1283 .shutdown = smc_shutdown,
1284 .setsockopt = smc_setsockopt,
1285 .getsockopt = smc_getsockopt,
1286 .sendmsg = smc_sendmsg,
1287 .recvmsg = smc_recvmsg,
1288 .mmap = sock_no_mmap,
1289 .sendpage = smc_sendpage,
1290 .splice_read = smc_splice_read,
1291};
1292
1293static int smc_create(struct net *net, struct socket *sock, int protocol,
1294 int kern)
1295{
1296 struct smc_sock *smc;
1297 struct sock *sk;
1298 int rc;
1299
1300 rc = -ESOCKTNOSUPPORT;
1301 if (sock->type != SOCK_STREAM)
1302 goto out;
1303
1304 rc = -EPROTONOSUPPORT;
1305 if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
1306 goto out;
1307
1308 rc = -ENOBUFS;
1309 sock->ops = &smc_sock_ops;
1310 sk = smc_sock_alloc(net, sock);
1311 if (!sk)
1312 goto out;
1313
1314 /* create internal TCP socket for CLC handshake and fallback */
1315 smc = smc_sk(sk);
1316 smc->use_fallback = false; /* assume rdma capability first */
1317 rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
1318 IPPROTO_TCP, &smc->clcsock);
1319 if (rc)
1320 sk_common_release(sk);
1321 smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1322 smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1323
1324out:
1325 return rc;
1326}
1327
1328static const struct net_proto_family smc_sock_family_ops = {
1329 .family = PF_SMC,
1330 .owner = THIS_MODULE,
1331 .create = smc_create,
1332};
1333
1334static int __init smc_init(void)
1335{
1336 int rc;
1337
1338 rc = smc_pnet_init();
1339 if (rc)
1340 return rc;
1341
1342 rc = smc_llc_init();
1343 if (rc) {
1344 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1345 goto out_pnet;
1346 }
1347
1348 rc = smc_cdc_init();
1349 if (rc) {
1350 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1351 goto out_pnet;
1352 }
1353
1354 rc = proto_register(&smc_proto, 1);
1355 if (rc) {
1356 pr_err("%s: proto_register fails with %d\n", __func__, rc);
1357 goto out_pnet;
1358 }
1359
1360 rc = sock_register(&smc_sock_family_ops);
1361 if (rc) {
1362 pr_err("%s: sock_register fails with %d\n", __func__, rc);
1363 goto out_proto;
1364 }
1365 INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1366
1367 rc = smc_ib_register_client();
1368 if (rc) {
1369 pr_err("%s: ib_register fails with %d\n", __func__, rc);
1370 goto out_sock;
1371 }
1372
1373 return 0;
1374
1375out_sock:
1376 sock_unregister(PF_SMC);
1377out_proto:
1378 proto_unregister(&smc_proto);
1379out_pnet:
1380 smc_pnet_exit();
1381 return rc;
1382}
1383
1384static void __exit smc_exit(void)
1385{
1386 struct smc_link_group *lgr, *lg;
1387 LIST_HEAD(lgr_freeing_list);
1388
1389 spin_lock_bh(&smc_lgr_list.lock);
1390 if (!list_empty(&smc_lgr_list.list))
1391 list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
1392 spin_unlock_bh(&smc_lgr_list.lock);
1393 list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
1394 list_del_init(&lgr->list);
1395 smc_lgr_free(lgr); /* free link group */
1396 }
1397 smc_ib_unregister_client();
1398 sock_unregister(PF_SMC);
1399 proto_unregister(&smc_proto);
1400 smc_pnet_exit();
1401}
1402
1403module_init(smc_init);
1404module_exit(smc_exit);
1405
1406MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1407MODULE_DESCRIPTION("smc socket address family");
1408MODULE_LICENSE("GPL");
1409MODULE_ALIAS_NETPROTO(PF_SMC);
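
Since smc_bind()/smc_connect() above only accept AF_INET addresses and smc_create() accepts IPPROTO_IP or IPPROTO_TCP on SOCK_STREAM, a minimal user-space client would look roughly like the sketch below. It assumes AF_SMC is exported by the uapi headers (the fallback value is only a guess for older libc headers) and omits all error handling:

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef AF_SMC
#define AF_SMC 43	/* assumed: value reserved for PF_SMC in the kernel headers */
#endif

int main(void)
{
	/* SOCK_STREAM + IPPROTO_TCP matches the checks in smc_create() */
	int fd = socket(AF_SMC, SOCK_STREAM, IPPROTO_TCP);
	struct sockaddr_in sa = {
		.sin_family = AF_INET,	/* smc_connect() insists on AF_INET */
		.sin_port   = htons(12345),
	};

	inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);
	if (fd >= 0 && connect(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0)
		write(fd, "hello", 5);	/* falls back to TCP if RDMA setup fails */
	close(fd);
	return 0;
}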
diff --git a/net/smc/smc.h b/net/smc/smc.h
new file mode 100644
index 000000000000..ee5fbea24549
--- /dev/null
+++ b/net/smc/smc.h
@@ -0,0 +1,274 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Definitions for the SMC module (socket related)
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10#ifndef __SMC_H
11#define __SMC_H
12
13#include <linux/socket.h>
14#include <linux/types.h>
15#include <linux/compiler.h> /* __aligned */
16#include <net/sock.h>
17
18#include "smc_ib.h"
19
20#define SMCPROTO_SMC 0 /* SMC protocol */
21
22#define SMC_MAX_PORTS 2 /* Max # of ports */
23
24extern struct proto smc_proto;
25
26#ifdef ATOMIC64_INIT
27#define KERNEL_HAS_ATOMIC64
28#endif
29
30enum smc_state { /* possible states of an SMC socket */
31 SMC_ACTIVE = 1,
32 SMC_INIT = 2,
33 SMC_CLOSED = 7,
34 SMC_LISTEN = 10,
35 /* normal close */
36 SMC_PEERCLOSEWAIT1 = 20,
37 SMC_PEERCLOSEWAIT2 = 21,
38 SMC_APPFINCLOSEWAIT = 24,
39 SMC_APPCLOSEWAIT1 = 22,
40 SMC_APPCLOSEWAIT2 = 23,
41 SMC_PEERFINCLOSEWAIT = 25,
42 /* abnormal close */
43 SMC_PEERABORTWAIT = 26,
44 SMC_PROCESSABORT = 27,
45};
46
47struct smc_link_group;
48
49struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
50 u8 type;
51} __aligned(1);
52
53struct smc_cdc_conn_state_flags {
54#if defined(__BIG_ENDIAN_BITFIELD)
55 u8 peer_done_writing : 1; /* Sending done indicator */
56 u8 peer_conn_closed : 1; /* Peer connection closed indicator */
57 u8 peer_conn_abort : 1; /* Abnormal close indicator */
58 u8 reserved : 5;
59#elif defined(__LITTLE_ENDIAN_BITFIELD)
60 u8 reserved : 5;
61 u8 peer_conn_abort : 1;
62 u8 peer_conn_closed : 1;
63 u8 peer_done_writing : 1;
64#endif
65};
66
67struct smc_cdc_producer_flags {
68#if defined(__BIG_ENDIAN_BITFIELD)
69 u8 write_blocked : 1; /* Writing Blocked, no rx buf space */
70 u8 urg_data_pending : 1; /* Urgent Data Pending */
71 u8 urg_data_present : 1; /* Urgent Data Present */
72 u8 cons_curs_upd_req : 1; /* cursor update requested */
73 u8 failover_validation : 1;/* message replay due to failover */
74 u8 reserved : 3;
75#elif defined(__LITTLE_ENDIAN_BITFIELD)
76 u8 reserved : 3;
77 u8 failover_validation : 1;
78 u8 cons_curs_upd_req : 1;
79 u8 urg_data_present : 1;
80 u8 urg_data_pending : 1;
81 u8 write_blocked : 1;
82#endif
83};
84
85/* in host byte order */
86union smc_host_cursor { /* SMC cursor - an offset in an RMBE */
87 struct {
88 u16 reserved;
89 u16 wrap; /* window wrap sequence number */
90 u32 count; /* cursor (= offset) part */
91 };
92#ifdef KERNEL_HAS_ATOMIC64
93 atomic64_t acurs; /* for atomic processing */
94#else
95 u64 acurs; /* for atomic processing */
96#endif
97} __aligned(8);
98
99/* in host byte order, except for flag bitfields in network byte order */
100struct smc_host_cdc_msg { /* Connection Data Control message */
101 struct smc_wr_rx_hdr common; /* .type = 0xFE */
102 u8 len; /* length = 44 */
103 u16 seqno; /* connection seq # */
104 u32 token; /* alert_token */
105 union smc_host_cursor prod; /* producer cursor */
106 union smc_host_cursor cons; /* consumer cursor,
107 * piggy backed "ack"
108 */
109 struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */
110 struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/
111 u8 reserved[18];
112} __aligned(8);
113
114struct smc_connection {
115 struct rb_node alert_node;
116 struct smc_link_group *lgr; /* link group of connection */
117 u32 alert_token_local; /* unique conn. id */
118 u8 peer_conn_idx; /* from tcp handshake */
119 int peer_rmbe_size; /* size of peer rx buffer */
120 atomic_t peer_rmbe_space;/* remaining free bytes in peer
121 * rmbe
122 */
123 int rtoken_idx; /* idx to peer RMB rkey/addr */
124
125 struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
126 int sndbuf_size; /* sndbuf size <== sock wmem */
127 struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
128 int rmbe_size; /* RMBE size <== sock rmem */
129 int rmbe_size_short;/* compressed notation */
130 int rmbe_update_limit;
131 /* lower limit for consumer
132 * cursor update
133 */
134
135 struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging
136 * buffer for CDC msg send
137 * .prod cf. TCP snd_nxt
138 * .cons cf. TCP sends ack
139 */
140 union smc_host_cursor tx_curs_prep; /* tx - prepared data
141 * snd_max..wmem_alloc
142 */
143 union smc_host_cursor tx_curs_sent; /* tx - sent data
144 * snd_nxt ?
145 */
146 union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer
147 * snd-wnd-begin ?
148 */
149 atomic_t sndbuf_space; /* remaining space in sndbuf */
150 u16 tx_cdc_seq; /* sequence # for CDC send */
151 spinlock_t send_lock; /* protect wr_sends */
152 struct work_struct tx_work; /* retry of smc_cdc_msg_send */
153
154 struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
155 * .prod cf. TCP rcv_nxt
156 * .cons cf. TCP snd_una
157 */
158 union smc_host_cursor rx_curs_confirmed; /* confirmed to peer
159 * source of snd_una ?
160 */
161 atomic_t bytes_to_rcv; /* arrived data,
162 * not yet received
163 */
164#ifndef KERNEL_HAS_ATOMIC64
165 spinlock_t acurs_lock; /* protect cursors */
166#endif
167};
168
169struct smc_sock { /* smc sock container */
170 struct sock sk;
171 struct socket *clcsock; /* internal tcp socket */
172 struct smc_connection conn; /* smc connection */
173 struct sockaddr *addr; /* inet connect address */
174 struct smc_sock *listen_smc; /* listen parent */
175 struct work_struct tcp_listen_work;/* handle tcp socket accepts */
176 struct work_struct smc_listen_work;/* prepare new accept socket */
177 struct list_head accept_q; /* sockets to be accepted */
178 spinlock_t accept_q_lock; /* protects accept_q */
179 struct delayed_work sock_put_work; /* final socket freeing */
180 bool use_fallback; /* fallback to tcp */
181 u8 wait_close_tx_prepared : 1;
182 /* shutdown wr or close
183 * started, waiting for unsent
184 * data to be sent
185 */
186};
187
188static inline struct smc_sock *smc_sk(const struct sock *sk)
189{
190 return (struct smc_sock *)sk;
191}
192
193#define SMC_SYSTEMID_LEN 8
194
195extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
196
197/* convert a u32 value into network byte order, store it into a 3 byte field */
198static inline void hton24(u8 *net, u32 host)
199{
200 __be32 t;
201
202 t = cpu_to_be32(host);
203 memcpy(net, ((u8 *)&t) + 1, 3);
204}
205
206/* convert a received 3 byte field into host byte order */
207static inline u32 ntoh24(u8 *net)
208{
209 __be32 t = 0;
210
211 memcpy(((u8 *)&t) + 1, net, 3);
212 return be32_to_cpu(t);
213}
214
215#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
216
217#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */
218/* theoretically, the RFC states that the largest size would be 512K,
219 * i.e. a compressed value of 5 and thus 6 sizes (0..5), despite
220 * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
221 */
222
223/* convert the RMB size into the compressed notation - minimum 16K.
224 * In contrast to plain ilog2, this rounds towards the next power of 2,
225 * so the socket application gets at least its desired sndbuf / rcvbuf size.
226 */
227static inline u8 smc_compress_bufsize(int size)
228{
229 u8 compressed;
230
231 if (size <= SMC_BUF_MIN_SIZE)
232 return 0;
233
234 size = (size - 1) >> 14;
235 compressed = ilog2(size) + 1;
236 if (compressed >= SMC_RMBE_SIZES)
237 compressed = SMC_RMBE_SIZES - 1;
238 return compressed;
239}
240
241/* convert the RMB size from compressed notation into integer */
242static inline int smc_uncompress_bufsize(u8 compressed)
243{
244 u32 size;
245
246 size = 0x00000001 << (((int)compressed) + 14);
247 return (int)size;
248}
249
250#ifdef CONFIG_XFRM
251static inline bool using_ipsec(struct smc_sock *smc)
252{
253 return (smc->clcsock->sk->sk_policy[0] ||
254 smc->clcsock->sk->sk_policy[1]) ? 1 : 0;
255}
256#else
257static inline bool using_ipsec(struct smc_sock *smc)
258{
259 return 0;
260}
261#endif
262
263struct smc_clc_msg_local;
264
265int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
266 u8 *prefix_len);
267void smc_conn_free(struct smc_connection *conn);
268int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
269 struct smc_ib_device *smcibdev, u8 ibport,
270 struct smc_clc_msg_local *lcl, int srv_first_contact);
271struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
272void smc_close_non_accepted(struct sock *sk);
273
274#endif /* __SMC_H */
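
The buffer-size helpers near the end of smc.h are easiest to follow with concrete numbers. The user-space cross-check below re-implements them verbatim (with a trivial stand-in for the kernel's ilog2()) purely so the values can be verified outside the kernel:

#include <assert.h>
#include <stdint.h>

static int ilog2(uint32_t v) { int l = -1; while (v) { v >>= 1; l++; } return l; }

static uint8_t smc_compress_bufsize(int size)
{
	uint8_t compressed;

	if (size <= 16384)		/* SMC_BUF_MIN_SIZE */
		return 0;
	size = (size - 1) >> 14;
	compressed = ilog2(size) + 1;
	if (compressed >= 16)		/* SMC_RMBE_SIZES */
		compressed = 15;
	return compressed;
}

static int smc_uncompress_bufsize(uint8_t compressed)
{
	return 1 << (compressed + 14);
}

int main(void)
{
	assert(smc_compress_bufsize(16384) == 0 && smc_uncompress_bufsize(0) == 16384);
	assert(smc_compress_bufsize(65536) == 2 && smc_uncompress_bufsize(2) == 65536);
	/* rounds up: an application asking for 100000 bytes ends up with 128K */
	assert(smc_compress_bufsize(100000) == 3 && smc_uncompress_bufsize(3) == 131072);
	return 0;
}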
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
new file mode 100644
index 000000000000..5a339493872e
--- /dev/null
+++ b/net/smc/smc_cdc.c
@@ -0,0 +1,304 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Connection Data Control (CDC)
5 * handles flow control
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#include <linux/spinlock.h>
13
14#include "smc.h"
15#include "smc_wr.h"
16#include "smc_cdc.h"
17#include "smc_tx.h"
18#include "smc_rx.h"
19#include "smc_close.h"
20
21/********************************** send *************************************/
22
23struct smc_cdc_tx_pend {
24 struct smc_connection *conn; /* socket connection */
25 union smc_host_cursor cursor; /* tx sndbuf cursor sent */
26 union smc_host_cursor p_cursor; /* rx RMBE cursor produced */
27 u16 ctrl_seq; /* conn. tx sequence # */
28};
29
30/* handler for send/transmission completion of a CDC msg */
31static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
32 struct smc_link *link,
33 enum ib_wc_status wc_status)
34{
35 struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
36 struct smc_sock *smc;
37 int diff;
38
39 if (!cdcpend->conn)
40 /* already dismissed */
41 return;
42
43 smc = container_of(cdcpend->conn, struct smc_sock, conn);
44 bh_lock_sock(&smc->sk);
45 if (!wc_status) {
46 diff = smc_curs_diff(cdcpend->conn->sndbuf_size,
47 &cdcpend->conn->tx_curs_fin,
48 &cdcpend->cursor);
49 /* sndbuf_space is decreased in smc_sendmsg */
50 smp_mb__before_atomic();
51 atomic_add(diff, &cdcpend->conn->sndbuf_space);
52 /* guarantee 0 <= sndbuf_space <= sndbuf_size */
53 smp_mb__after_atomic();
54 smc_curs_write(&cdcpend->conn->tx_curs_fin,
55 smc_curs_read(&cdcpend->cursor, cdcpend->conn),
56 cdcpend->conn);
57 }
58 smc_tx_sndbuf_nonfull(smc);
59 if (smc->sk.sk_state != SMC_ACTIVE)
60 /* wake up smc_close_wait_tx_pends() */
61 smc->sk.sk_state_change(&smc->sk);
62 bh_unlock_sock(&smc->sk);
63}
64
65int smc_cdc_get_free_slot(struct smc_link *link,
66 struct smc_wr_buf **wr_buf,
67 struct smc_cdc_tx_pend **pend)
68{
69 return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
70 (struct smc_wr_tx_pend_priv **)pend);
71}
72
73static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
74 struct smc_cdc_tx_pend *pend)
75{
76 BUILD_BUG_ON_MSG(
77 sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
78 "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
79 BUILD_BUG_ON_MSG(
80 offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
81 "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
82 BUILD_BUG_ON_MSG(
83 sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
84 "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)");
85 pend->conn = conn;
86 pend->cursor = conn->tx_curs_sent;
87 pend->p_cursor = conn->local_tx_ctrl.prod;
88 pend->ctrl_seq = conn->tx_cdc_seq;
89}
90
91int smc_cdc_msg_send(struct smc_connection *conn,
92 struct smc_wr_buf *wr_buf,
93 struct smc_cdc_tx_pend *pend)
94{
95 struct smc_link *link;
96 int rc;
97
98 link = &conn->lgr->lnk[SMC_SINGLE_LINK];
99
100 smc_cdc_add_pending_send(conn, pend);
101
102 conn->tx_cdc_seq++;
103 conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
104 smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf,
105 &conn->local_tx_ctrl, conn);
106 rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
107 if (!rc)
108 smc_curs_write(&conn->rx_curs_confirmed,
109 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
110 conn);
111
112 return rc;
113}
114
115int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
116{
117 struct smc_cdc_tx_pend *pend;
118 struct smc_wr_buf *wr_buf;
119 int rc;
120
121 rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
122 &pend);
123 if (rc)
124 return rc;
125
126 return smc_cdc_msg_send(conn, wr_buf, pend);
127}
128
129static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
130 unsigned long data)
131{
132 struct smc_connection *conn = (struct smc_connection *)data;
133 struct smc_cdc_tx_pend *cdc_pend =
134 (struct smc_cdc_tx_pend *)tx_pend;
135
136 return cdc_pend->conn == conn;
137}
138
139static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend)
140{
141 struct smc_cdc_tx_pend *cdc_pend =
142 (struct smc_cdc_tx_pend *)tx_pend;
143
144 cdc_pend->conn = NULL;
145}
146
147void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
148{
149 struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
150
151 smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE,
152 smc_cdc_tx_filter, smc_cdc_tx_dismisser,
153 (unsigned long)conn);
154}
155
156bool smc_cdc_tx_has_pending(struct smc_connection *conn)
157{
158 struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
159
160 return smc_wr_tx_has_pending(link, SMC_CDC_MSG_TYPE,
161 smc_cdc_tx_filter, (unsigned long)conn);
162}
163
164/********************************* receive ***********************************/
165
166static inline bool smc_cdc_before(u16 seq1, u16 seq2)
167{
168 return (s16)(seq1 - seq2) < 0;
169}
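
/* Editorial note (not part of the original patch): the signed 16-bit cast
 * above implements serial-number style comparison, so sequence numbers stay
 * correctly ordered across the u16 wrap as long as the two values are less
 * than 2^15 apart.  For example, smc_cdc_before(0xFFFF, 0x0001) is true
 * because (s16)(0xFFFF - 0x0001) == (s16)0xFFFE == -2, i.e. 0xFFFF still
 * counts as "before" the post-wrap value 0x0001.
 */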
170
171static void smc_cdc_msg_recv_action(struct smc_sock *smc,
172 struct smc_link *link,
173 struct smc_cdc_msg *cdc)
174{
175 union smc_host_cursor cons_old, prod_old;
176 struct smc_connection *conn = &smc->conn;
177 int diff_cons, diff_prod;
178
179 if (!cdc->prod_flags.failover_validation) {
180 if (smc_cdc_before(ntohs(cdc->seqno),
181 conn->local_rx_ctrl.seqno))
182 /* received seqno is old */
183 return;
184 }
185 smc_curs_write(&prod_old,
186 smc_curs_read(&conn->local_rx_ctrl.prod, conn),
187 conn);
188 smc_curs_write(&cons_old,
189 smc_curs_read(&conn->local_rx_ctrl.cons, conn),
190 conn);
191 smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
192
193 diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
194 &conn->local_rx_ctrl.cons);
195 if (diff_cons) {
196 /* peer_rmbe_space is decreased during data transfer with RDMA
197 * write
198 */
199 smp_mb__before_atomic();
200 atomic_add(diff_cons, &conn->peer_rmbe_space);
201 /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
202 smp_mb__after_atomic();
203 }
204
205 diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old,
206 &conn->local_rx_ctrl.prod);
207 if (diff_prod) {
208 /* bytes_to_rcv is decreased in smc_recvmsg */
209 smp_mb__before_atomic();
210 atomic_add(diff_prod, &conn->bytes_to_rcv);
211 /* guarantee 0 <= bytes_to_rcv <= rmbe_size */
212 smp_mb__after_atomic();
213 smc->sk.sk_data_ready(&smc->sk);
214 }
215
216 if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
217 smc->sk.sk_err = ECONNRESET;
218 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
219 }
220 if (smc_cdc_rxed_any_close_or_senddone(conn))
221 smc_close_passive_received(smc);
222
223 /* piggybacked tx info */
224 /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
225 if (diff_cons && smc_tx_prepared_sends(conn)) {
226 smc_tx_sndbuf_nonempty(conn);
227 /* trigger socket release if connection closed */
228 smc_close_wake_tx_prepared(smc);
229 }
230
231 /* subsequent patch: trigger socket release if connection closed */
232
233 /* socket connected but not accepted */
234 if (!smc->sk.sk_socket)
235 return;
236
237 /* data available */
238 if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
239 (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
240 smc_tx_consumer_update(conn);
241}
242
243/* called under tasklet context */
244static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc,
245 struct smc_link *link, u64 wr_id)
246{
247 struct smc_link_group *lgr = container_of(link, struct smc_link_group,
248 lnk[SMC_SINGLE_LINK]);
249 struct smc_connection *connection;
250 struct smc_sock *smc;
251
252 /* lookup connection */
253 read_lock_bh(&lgr->conns_lock);
254 connection = smc_lgr_find_conn(ntohl(cdc->token), lgr);
255 if (!connection) {
256 read_unlock_bh(&lgr->conns_lock);
257 return;
258 }
259 smc = container_of(connection, struct smc_sock, conn);
260 sock_hold(&smc->sk);
261 read_unlock_bh(&lgr->conns_lock);
262 bh_lock_sock(&smc->sk);
263 smc_cdc_msg_recv_action(smc, link, cdc);
264 bh_unlock_sock(&smc->sk);
265 sock_put(&smc->sk); /* no free sk in softirq-context */
266}
267
268/***************************** init, exit, misc ******************************/
269
270static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
271{
272 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
273 struct smc_cdc_msg *cdc = buf;
274
275 if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
276 return; /* short message */
277 if (cdc->len != sizeof(*cdc))
278 return; /* invalid message */
279 smc_cdc_msg_recv(cdc, link, wc->wr_id);
280}
281
282static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
283 {
284 .handler = smc_cdc_rx_handler,
285 .type = SMC_CDC_MSG_TYPE
286 },
287 {
288 .handler = NULL,
289 }
290};
291
292int __init smc_cdc_init(void)
293{
294 struct smc_wr_rx_handler *handler;
295 int rc = 0;
296
297 for (handler = smc_cdc_rx_handlers; handler->handler; handler++) {
298 INIT_HLIST_NODE(&handler->list);
299 rc = smc_wr_rx_register_handler(handler);
300 if (rc)
301 break;
302 }
303 return rc;
304}
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
new file mode 100644
index 000000000000..8e1d76f26007
--- /dev/null
+++ b/net/smc/smc_cdc.h
@@ -0,0 +1,218 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Connection Data Control (CDC)
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef SMC_CDC_H
12#define SMC_CDC_H
13
14#include <linux/kernel.h> /* max_t */
15#include <linux/atomic.h>
16#include <linux/in.h>
17#include <linux/compiler.h>
18
19#include "smc.h"
20#include "smc_core.h"
21#include "smc_wr.h"
22
23#define SMC_CDC_MSG_TYPE 0xFE
24
25/* in network byte order */
26union smc_cdc_cursor { /* SMC cursor */
27 struct {
28 __be16 reserved;
29 __be16 wrap;
30 __be32 count;
31 };
32#ifdef KERNEL_HAS_ATOMIC64
33 atomic64_t acurs; /* for atomic processing */
34#else
35 u64 acurs; /* for atomic processing */
36#endif
37} __aligned(8);
38
39/* in network byte order */
40struct smc_cdc_msg {
41 struct smc_wr_rx_hdr common; /* .type = 0xFE */
42 u8 len; /* 44 */
43 __be16 seqno;
44 __be32 token;
45 union smc_cdc_cursor prod;
46 union smc_cdc_cursor cons; /* piggybacked "ack" */
47 struct smc_cdc_producer_flags prod_flags;
48 struct smc_cdc_conn_state_flags conn_state_flags;
49 u8 reserved[18];
50} __aligned(8);
51
52static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
53{
54 return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort ||
55 conn->local_rx_ctrl.conn_state_flags.peer_conn_closed;
56}
57
58static inline bool smc_cdc_rxed_any_close_or_senddone(
59 struct smc_connection *conn)
60{
61 return smc_cdc_rxed_any_close(conn) ||
62 conn->local_rx_ctrl.conn_state_flags.peer_done_writing;
63}
64
65static inline void smc_curs_add(int size, union smc_host_cursor *curs,
66 int value)
67{
68 curs->count += value;
69 if (curs->count >= size) {
70 curs->wrap++;
71 curs->count -= size;
72 }
73}
74
75/* SMC cursors are 8 bytes long and require atomic reading and writing */
76static inline u64 smc_curs_read(union smc_host_cursor *curs,
77 struct smc_connection *conn)
78{
79#ifndef KERNEL_HAS_ATOMIC64
80 unsigned long flags;
81 u64 ret;
82
83 spin_lock_irqsave(&conn->acurs_lock, flags);
84 ret = curs->acurs;
85 spin_unlock_irqrestore(&conn->acurs_lock, flags);
86 return ret;
87#else
88 return atomic64_read(&curs->acurs);
89#endif
90}
91
92static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs,
93 struct smc_connection *conn)
94{
95#ifndef KERNEL_HAS_ATOMIC64
96 unsigned long flags;
97 u64 ret;
98
99 spin_lock_irqsave(&conn->acurs_lock, flags);
100 ret = curs->acurs;
101 spin_unlock_irqrestore(&conn->acurs_lock, flags);
102 return ret;
103#else
104 return atomic64_read(&curs->acurs);
105#endif
106}
107
108static inline void smc_curs_write(union smc_host_cursor *curs, u64 val,
109 struct smc_connection *conn)
110{
111#ifndef KERNEL_HAS_ATOMIC64
112 unsigned long flags;
113
114 spin_lock_irqsave(&conn->acurs_lock, flags);
115 curs->acurs = val;
116 spin_unlock_irqrestore(&conn->acurs_lock, flags);
117#else
118 atomic64_set(&curs->acurs, val);
119#endif
120}
121
122static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val,
123 struct smc_connection *conn)
124{
125#ifndef KERNEL_HAS_ATOMIC64
126 unsigned long flags;
127
128 spin_lock_irqsave(&conn->acurs_lock, flags);
129 curs->acurs = val;
130 spin_unlock_irqrestore(&conn->acurs_lock, flags);
131#else
132 atomic64_set(&curs->acurs, val);
133#endif
134}
135
136/* calculate cursor difference between old and new, where old <= new */
137static inline int smc_curs_diff(unsigned int size,
138 union smc_host_cursor *old,
139 union smc_host_cursor *new)
140{
141 if (old->wrap != new->wrap)
142 return max_t(int, 0,
143 ((size - old->count) + new->count));
144
145 return max_t(int, 0, (new->count - old->count));
146}
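
/* Editorial example (not part of the original patch): with a 64K buffer,
 * old = {wrap = 3, count = 60000} and new = {wrap = 4, count = 1000},
 * smc_curs_diff() returns (65536 - 60000) + 1000 = 6536 - the 5536 bytes up
 * to the wrap point plus the 1000 bytes written after wrapping.  The result
 * is clamped at 0, matching the precondition that old never lies ahead of
 * new.
 */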
147
148static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
149 union smc_host_cursor *local,
150 struct smc_connection *conn)
151{
152 union smc_host_cursor temp;
153
154 smc_curs_write(&temp, smc_curs_read(local, conn), conn);
155 peer->count = htonl(temp.count);
156 peer->wrap = htons(temp.wrap);
157 /* peer->reserved = htons(0); must be ensured by caller */
158}
159
160static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer,
161 struct smc_host_cdc_msg *local,
162 struct smc_connection *conn)
163{
164 peer->common.type = local->common.type;
165 peer->len = local->len;
166 peer->seqno = htons(local->seqno);
167 peer->token = htonl(local->token);
168 smc_host_cursor_to_cdc(&peer->prod, &local->prod, conn);
169 smc_host_cursor_to_cdc(&peer->cons, &local->cons, conn);
170 peer->prod_flags = local->prod_flags;
171 peer->conn_state_flags = local->conn_state_flags;
172}
173
174static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
175 union smc_cdc_cursor *peer,
176 struct smc_connection *conn)
177{
178 union smc_host_cursor temp, old;
179 union smc_cdc_cursor net;
180
181 smc_curs_write(&old, smc_curs_read(local, conn), conn);
182 smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn);
183 temp.count = ntohl(net.count);
184 temp.wrap = ntohs(net.wrap);
185 if ((old.wrap > temp.wrap) && temp.wrap)
186 return;
187 if ((old.wrap == temp.wrap) &&
188 (old.count > temp.count))
189 return;
190 smc_curs_write(local, smc_curs_read(&temp, conn), conn);
191}
192
193static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
194 struct smc_cdc_msg *peer,
195 struct smc_connection *conn)
196{
197 local->common.type = peer->common.type;
198 local->len = peer->len;
199 local->seqno = ntohs(peer->seqno);
200 local->token = ntohl(peer->token);
201 smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn);
202 smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn);
203 local->prod_flags = peer->prod_flags;
204 local->conn_state_flags = peer->conn_state_flags;
205}
206
207struct smc_cdc_tx_pend;
208
209int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf,
210 struct smc_cdc_tx_pend **pend);
211void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
212int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
213 struct smc_cdc_tx_pend *pend);
214int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
215bool smc_cdc_tx_has_pending(struct smc_connection *conn);
216int smc_cdc_init(void) __init;
217
218#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
new file mode 100644
index 000000000000..e41f594a1e1d
--- /dev/null
+++ b/net/smc/smc_clc.c
@@ -0,0 +1,282 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * CLC (connection layer control) handshake over initial TCP socket to
5 * prepare for RDMA traffic
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#include <linux/in.h>
13#include <linux/if_ether.h>
14#include <linux/sched/signal.h>
15
16#include <net/sock.h>
17#include <net/tcp.h>
18
19#include "smc.h"
20#include "smc_core.h"
21#include "smc_clc.h"
22#include "smc_ib.h"
23
24/* Wait for data on the tcp-socket, analyze received data
25 * Returns:
26 * 0 on success, if the received message was not a decline.
27 * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send.
28 * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
29 */
30int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
31 u8 expected_type)
32{
33 struct sock *clc_sk = smc->clcsock->sk;
34 struct smc_clc_msg_hdr *clcm = buf;
35 struct msghdr msg = {NULL, 0};
36 int reason_code = 0;
37 struct kvec vec;
38 int len, datlen;
39 int krflags;
40
41 /* peek the first few bytes to determine length of data to receive
42 * so we don't consume any subsequent CLC message or payload data
43 * in the TCP byte stream
44 */
45 vec.iov_base = buf;
46 vec.iov_len = buflen;
47 krflags = MSG_PEEK | MSG_WAITALL;
48 smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
49 len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1,
50 sizeof(struct smc_clc_msg_hdr), krflags);
51 if (signal_pending(current)) {
52 reason_code = -EINTR;
53 clc_sk->sk_err = EINTR;
54 smc->sk.sk_err = EINTR;
55 goto out;
56 }
57 if (clc_sk->sk_err) {
58 reason_code = -clc_sk->sk_err;
59 smc->sk.sk_err = clc_sk->sk_err;
60 goto out;
61 }
62 if (!len) { /* peer has performed orderly shutdown */
63 smc->sk.sk_err = ECONNRESET;
64 reason_code = -ECONNRESET;
65 goto out;
66 }
67 if (len < 0) {
68 smc->sk.sk_err = -len;
69 reason_code = len;
70 goto out;
71 }
72 datlen = ntohs(clcm->length);
73 if ((len < sizeof(struct smc_clc_msg_hdr)) ||
74 (datlen < sizeof(struct smc_clc_msg_decline)) ||
75 (datlen > sizeof(struct smc_clc_msg_accept_confirm)) ||
76 memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
77 ((clcm->type != SMC_CLC_DECLINE) &&
78 (clcm->type != expected_type))) {
79 smc->sk.sk_err = EPROTO;
80 reason_code = -EPROTO;
81 goto out;
82 }
83
84 /* receive the complete CLC message */
85 vec.iov_base = buf;
86 vec.iov_len = buflen;
87 memset(&msg, 0, sizeof(struct msghdr));
88 krflags = MSG_WAITALL;
89 smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
90 len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
91 if (len < datlen) {
92 smc->sk.sk_err = EPROTO;
93 reason_code = -EPROTO;
94 goto out;
95 }
96 if (clcm->type == SMC_CLC_DECLINE) {
97 reason_code = SMC_CLC_DECL_REPLY;
98 if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis)
99 == SMC_CLC_DECL_SYNCERR)
100 smc->conn.lgr->sync_err = true;
101 }
102
103out:
104 return reason_code;
105}
106
107/* send CLC DECLINE message across internal TCP socket */
108int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
109 u8 out_of_sync)
110{
111 struct smc_clc_msg_decline dclc;
112 struct msghdr msg;
113 struct kvec vec;
114 int len;
115
116 memset(&dclc, 0, sizeof(dclc));
117 memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
118 dclc.hdr.type = SMC_CLC_DECLINE;
119 dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
120 dclc.hdr.version = SMC_CLC_V1;
121 dclc.hdr.flag = out_of_sync ? 1 : 0;
122 memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
123 dclc.peer_diagnosis = htonl(peer_diag_info);
124 memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
125
126 memset(&msg, 0, sizeof(msg));
127 vec.iov_base = &dclc;
128 vec.iov_len = sizeof(struct smc_clc_msg_decline);
129 len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
130 sizeof(struct smc_clc_msg_decline));
131 if (len < sizeof(struct smc_clc_msg_decline))
132 smc->sk.sk_err = EPROTO;
133 if (len < 0)
134 smc->sk.sk_err = -len;
135 return len;
136}
137
138/* send CLC PROPOSAL message across internal TCP socket */
139int smc_clc_send_proposal(struct smc_sock *smc,
140 struct smc_ib_device *smcibdev,
141 u8 ibport)
142{
143 struct smc_clc_msg_proposal pclc;
144 int reason_code = 0;
145 struct msghdr msg;
146 struct kvec vec;
147 int len, rc;
148
149 /* send SMC Proposal CLC message */
150 memset(&pclc, 0, sizeof(pclc));
151 memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
152 pclc.hdr.type = SMC_CLC_PROPOSAL;
153 pclc.hdr.length = htons(sizeof(pclc));
154 pclc.hdr.version = SMC_CLC_V1; /* SMC version */
155 memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
156 memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
157 memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
158
159 /* determine subnet and mask from internal TCP socket */
160 rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
161 &pclc.prefix_len);
162 if (rc)
163 return SMC_CLC_DECL_CNFERR; /* configuration error */
164 memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
165 memset(&msg, 0, sizeof(msg));
166 vec.iov_base = &pclc;
167 vec.iov_len = sizeof(pclc);
168 /* due to the few bytes needed for the CLC handshake this cannot block */
169 len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
170 if (len < sizeof(pclc)) {
171 if (len >= 0) {
172 reason_code = -ENETUNREACH;
173 smc->sk.sk_err = -reason_code;
174 } else {
175 smc->sk.sk_err = smc->clcsock->sk->sk_err;
176 reason_code = -smc->sk.sk_err;
177 }
178 }
179
180 return reason_code;
181}
182
183/* send CLC CONFIRM message across internal TCP socket */
184int smc_clc_send_confirm(struct smc_sock *smc)
185{
186 struct smc_connection *conn = &smc->conn;
187 struct smc_clc_msg_accept_confirm cclc;
188 struct smc_link *link;
189 int reason_code = 0;
190 struct msghdr msg;
191 struct kvec vec;
192 int len;
193
194 link = &conn->lgr->lnk[SMC_SINGLE_LINK];
195 /* send SMC Confirm CLC msg */
196 memset(&cclc, 0, sizeof(cclc));
197 memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
198 cclc.hdr.type = SMC_CLC_CONFIRM;
199 cclc.hdr.length = htons(sizeof(cclc));
200 cclc.hdr.version = SMC_CLC_V1; /* SMC version */
201 memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
202 memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
203 SMC_GID_SIZE);
204 memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
205 hton24(cclc.qpn, link->roce_qp->qp_num);
206 cclc.rmb_rkey =
207 htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
208 cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
209 cclc.rmbe_alert_token = htonl(conn->alert_token_local);
210 cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
211 cclc.rmbe_size = conn->rmbe_size_short;
212 cclc.rmb_dma_addr =
213 cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
214 hton24(cclc.psn, link->psn_initial);
215
216 memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
217
218 memset(&msg, 0, sizeof(msg));
219 vec.iov_base = &cclc;
220 vec.iov_len = sizeof(cclc);
221 len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
222 if (len < sizeof(cclc)) {
223 if (len >= 0) {
224 reason_code = -ENETUNREACH;
225 smc->sk.sk_err = -reason_code;
226 } else {
227 smc->sk.sk_err = smc->clcsock->sk->sk_err;
228 reason_code = -smc->sk.sk_err;
229 }
230 }
231 return reason_code;
232}
233
234/* send CLC ACCEPT message across internal TCP socket */
235int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
236{
237 struct smc_connection *conn = &new_smc->conn;
238 struct smc_clc_msg_accept_confirm aclc;
239 struct smc_link *link;
240 struct msghdr msg;
241 struct kvec vec;
242 int rc = 0;
243 int len;
244
245 link = &conn->lgr->lnk[SMC_SINGLE_LINK];
246 memset(&aclc, 0, sizeof(aclc));
247 memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
248 aclc.hdr.type = SMC_CLC_ACCEPT;
249 aclc.hdr.length = htons(sizeof(aclc));
250 aclc.hdr.version = SMC_CLC_V1; /* SMC version */
251 if (srv_first_contact)
252 aclc.hdr.flag = 1;
253 memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
254 memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
255 SMC_GID_SIZE);
256 memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
257 hton24(aclc.qpn, link->roce_qp->qp_num);
258 aclc.rmb_rkey =
259 htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
260 aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
261 aclc.rmbe_alert_token = htonl(conn->alert_token_local);
262 aclc.qp_mtu = link->path_mtu;
263	aclc.rmbe_size = conn->rmbe_size_short;
264 aclc.rmb_dma_addr =
265 cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
266 hton24(aclc.psn, link->psn_initial);
267 memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
268
269 memset(&msg, 0, sizeof(msg));
270 vec.iov_base = &aclc;
271 vec.iov_len = sizeof(aclc);
272 len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
273 if (len < sizeof(aclc)) {
274 if (len >= 0)
275 new_smc->sk.sk_err = EPROTO;
276 else
277 new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err;
278 rc = sock_error(&new_smc->sk);
279 }
280
281 return rc;
282}
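
/* Editorial sketch (not part of the original patch): the expected client-side
 * ordering of the helpers in this file, assuming the caller has already
 * selected an SMC-capable RoCE device and port; error handling and the link
 * group / RMB setup between the steps are elided.  The function name is
 * purely illustrative.
 */
static int smc_clc_client_handshake_sketch(struct smc_sock *smc,
					   struct smc_ib_device *smcibdev,
					   u8 ibport)
{
	struct smc_clc_msg_accept_confirm aclc;
	int rc;

	rc = smc_clc_send_proposal(smc, smcibdev, ibport); /* CLC proposal */
	if (rc)
		return rc;
	rc = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
			      SMC_CLC_ACCEPT);		/* wait for accept */
	if (rc)
		return rc;
	/* ... create link group, RMB and rtoken from aclc here ... */
	return smc_clc_send_confirm(smc);		/* CLC confirm */
}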
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
new file mode 100644
index 000000000000..13db8ce177c9
--- /dev/null
+++ b/net/smc/smc_clc.h
@@ -0,0 +1,116 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * CLC (connection layer control) handshake over initial TCP socket to
5 * prepare for RDMA traffic
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#ifndef _SMC_CLC_H
13#define _SMC_CLC_H
14
15#include <rdma/ib_verbs.h>
16
17#include "smc.h"
18
19#define SMC_CLC_PROPOSAL 0x01
20#define SMC_CLC_ACCEPT 0x02
21#define SMC_CLC_CONFIRM 0x03
22#define SMC_CLC_DECLINE 0x04
23
24/* eye catcher "SMCR" EBCDIC for CLC messages */
25static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
26
27#define SMC_CLC_V1 0x1 /* SMC version */
28#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
29#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
30#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */
31#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
32#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */
33#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
34#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */
35#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */
36#define SMC_CLC_DECL_TCL 0x02040000 /* timeout waiting for QP confirm */
37#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */
38
39struct smc_clc_msg_hdr { /* header1 of clc messages */
40 u8 eyecatcher[4]; /* eye catcher */
41 u8 type; /* proposal / accept / confirm / decline */
42 __be16 length;
43#if defined(__BIG_ENDIAN_BITFIELD)
44 u8 version : 4,
45 flag : 1,
46 rsvd : 3;
47#elif defined(__LITTLE_ENDIAN_BITFIELD)
48 u8 rsvd : 3,
49 flag : 1,
50 version : 4;
51#endif
52} __packed; /* format defined in RFC7609 */
53
54struct smc_clc_msg_trail { /* trailer of clc messages */
55 u8 eyecatcher[4];
56};
57
58struct smc_clc_msg_local { /* header2 of clc messages */
59 u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
60 u8 gid[16]; /* gid of ib_device port */
61 u8 mac[6]; /* mac of ib_device port */
62};
63
64struct smc_clc_msg_proposal { /* clc proposal message */
65 struct smc_clc_msg_hdr hdr;
66 struct smc_clc_msg_local lcl;
67 __be16 iparea_offset; /* offset to IP address information area */
68 __be32 outgoing_subnet; /* subnet mask */
69 u8 prefix_len; /* number of significant bits in mask */
70 u8 reserved[2];
71 u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
72 struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
73} __aligned(4);
74
75struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
76 struct smc_clc_msg_hdr hdr;
77 struct smc_clc_msg_local lcl;
78 u8 qpn[3]; /* QP number */
79 __be32 rmb_rkey; /* RMB rkey */
80 u8 conn_idx; /* Connection index, which RMBE in RMB */
81 __be32 rmbe_alert_token;/* unique connection id */
82#if defined(__BIG_ENDIAN_BITFIELD)
83 u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */
84 qp_mtu : 4; /* QP mtu */
85#elif defined(__LITTLE_ENDIAN_BITFIELD)
86 u8 qp_mtu : 4,
87 rmbe_size : 4;
88#endif
89 u8 reserved;
90 __be64 rmb_dma_addr; /* RMB virtual address */
91 u8 reserved2;
92 u8 psn[3]; /* initial packet sequence number */
93 struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
94} __packed; /* format defined in RFC7609 */
95
96struct smc_clc_msg_decline { /* clc decline message */
97 struct smc_clc_msg_hdr hdr;
98 u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
99 __be32 peer_diagnosis; /* diagnosis information */
100 u8 reserved2[4];
101 struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
102} __aligned(4);
103
104struct smc_sock;
105struct smc_ib_device;
106
107int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
108 u8 expected_type);
109int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
110 u8 out_of_sync);
111int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
112 u8 ibport);
113int smc_clc_send_confirm(struct smc_sock *smc);
114int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
115
116#endif
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
new file mode 100644
index 000000000000..67a71d170bed
--- /dev/null
+++ b/net/smc/smc_close.c
@@ -0,0 +1,444 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Socket Closing - normal and abnormal
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#include <linux/workqueue.h>
12#include <linux/sched/signal.h>
13
14#include <net/sock.h>
15
16#include "smc.h"
17#include "smc_tx.h"
18#include "smc_cdc.h"
19#include "smc_close.h"
20
21#define SMC_CLOSE_WAIT_TX_PENDS_TIME (5 * HZ)
22
23static void smc_close_cleanup_listen(struct sock *parent)
24{
25 struct sock *sk;
26
27 /* Close non-accepted connections */
28 while ((sk = smc_accept_dequeue(parent, NULL)))
29 smc_close_non_accepted(sk);
30}
31
32static void smc_close_wait_tx_pends(struct smc_sock *smc)
33{
34 DEFINE_WAIT_FUNC(wait, woken_wake_function);
35 struct sock *sk = &smc->sk;
36 signed long timeout;
37
38 timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME;
39 add_wait_queue(sk_sleep(sk), &wait);
40 while (!signal_pending(current) && timeout) {
41 int rc;
42
43 rc = sk_wait_event(sk, &timeout,
44 !smc_cdc_tx_has_pending(&smc->conn),
45 &wait);
46 if (rc)
47 break;
48 }
49 remove_wait_queue(sk_sleep(sk), &wait);
50}
51
52/* wait for sndbuf data being transmitted */
53static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
54{
55 DEFINE_WAIT_FUNC(wait, woken_wake_function);
56 struct sock *sk = &smc->sk;
57
58 if (!timeout)
59 return;
60
61 if (!smc_tx_prepared_sends(&smc->conn))
62 return;
63
64 smc->wait_close_tx_prepared = 1;
65 add_wait_queue(sk_sleep(sk), &wait);
66 while (!signal_pending(current) && timeout) {
67 int rc;
68
69 rc = sk_wait_event(sk, &timeout,
70 !smc_tx_prepared_sends(&smc->conn) ||
71 (sk->sk_err == ECONNABORTED) ||
72 (sk->sk_err == ECONNRESET),
73 &wait);
74 if (rc)
75 break;
76 }
77 remove_wait_queue(sk_sleep(sk), &wait);
78 smc->wait_close_tx_prepared = 0;
79}
80
81void smc_close_wake_tx_prepared(struct smc_sock *smc)
82{
83 if (smc->wait_close_tx_prepared)
84 /* wake up socket closing */
85 smc->sk.sk_state_change(&smc->sk);
86}
87
88static int smc_close_wr(struct smc_connection *conn)
89{
90 conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1;
91
92 return smc_cdc_get_slot_and_msg_send(conn);
93}
94
95static int smc_close_final(struct smc_connection *conn)
96{
97 if (atomic_read(&conn->bytes_to_rcv))
98 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
99 else
100 conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1;
101
102 return smc_cdc_get_slot_and_msg_send(conn);
103}
104
105static int smc_close_abort(struct smc_connection *conn)
106{
107 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
108
109 return smc_cdc_get_slot_and_msg_send(conn);
110}
111
112/* terminate smc socket abnormally - active abort
113 * RDMA communication no longer possible
114 */
115void smc_close_active_abort(struct smc_sock *smc)
116{
117 struct smc_cdc_conn_state_flags *txflags =
118 &smc->conn.local_tx_ctrl.conn_state_flags;
119
120 bh_lock_sock(&smc->sk);
121 smc->sk.sk_err = ECONNABORTED;
122 if (smc->clcsock && smc->clcsock->sk) {
123 smc->clcsock->sk->sk_err = ECONNABORTED;
124 smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
125 }
126 switch (smc->sk.sk_state) {
127 case SMC_INIT:
128 smc->sk.sk_state = SMC_PEERABORTWAIT;
129 break;
130 case SMC_APPCLOSEWAIT1:
131 case SMC_APPCLOSEWAIT2:
132 txflags->peer_conn_abort = 1;
133 sock_release(smc->clcsock);
134 if (!smc_cdc_rxed_any_close(&smc->conn))
135 smc->sk.sk_state = SMC_PEERABORTWAIT;
136 else
137 smc->sk.sk_state = SMC_CLOSED;
138 break;
139 case SMC_PEERCLOSEWAIT1:
140 case SMC_PEERCLOSEWAIT2:
141 if (!txflags->peer_conn_closed) {
142 smc->sk.sk_state = SMC_PEERABORTWAIT;
143 txflags->peer_conn_abort = 1;
144 sock_release(smc->clcsock);
145 } else {
146 smc->sk.sk_state = SMC_CLOSED;
147 }
148 break;
149 case SMC_PROCESSABORT:
150 case SMC_APPFINCLOSEWAIT:
151 if (!txflags->peer_conn_closed) {
152 txflags->peer_conn_abort = 1;
153 sock_release(smc->clcsock);
154 }
155 smc->sk.sk_state = SMC_CLOSED;
156 break;
157 case SMC_PEERFINCLOSEWAIT:
158 case SMC_PEERABORTWAIT:
159 case SMC_CLOSED:
160 break;
161 }
162
163 sock_set_flag(&smc->sk, SOCK_DEAD);
164 bh_unlock_sock(&smc->sk);
165 smc->sk.sk_state_change(&smc->sk);
166}
167
168int smc_close_active(struct smc_sock *smc)
169{
170 struct smc_cdc_conn_state_flags *txflags =
171 &smc->conn.local_tx_ctrl.conn_state_flags;
172 long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
173 struct smc_connection *conn = &smc->conn;
174 struct sock *sk = &smc->sk;
175 int old_state;
176 int rc = 0;
177
178 if (sock_flag(sk, SOCK_LINGER) &&
179 !(current->flags & PF_EXITING))
180 timeout = sk->sk_lingertime;
181
182again:
183 old_state = sk->sk_state;
184 switch (old_state) {
185 case SMC_INIT:
186 sk->sk_state = SMC_CLOSED;
187 if (smc->smc_listen_work.func)
188 flush_work(&smc->smc_listen_work);
189 sock_put(sk);
190 break;
191 case SMC_LISTEN:
192 sk->sk_state = SMC_CLOSED;
193 sk->sk_state_change(sk); /* wake up accept */
194 if (smc->clcsock && smc->clcsock->sk) {
195 rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
196 /* wake up kernel_accept of smc_tcp_listen_worker */
197 smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
198 }
199 release_sock(sk);
200 smc_close_cleanup_listen(sk);
201 flush_work(&smc->tcp_listen_work);
202 lock_sock(sk);
203 break;
204 case SMC_ACTIVE:
205 smc_close_stream_wait(smc, timeout);
206 release_sock(sk);
207 cancel_work_sync(&conn->tx_work);
208 lock_sock(sk);
209 if (sk->sk_state == SMC_ACTIVE) {
210 /* send close request */
211 rc = smc_close_final(conn);
212 sk->sk_state = SMC_PEERCLOSEWAIT1;
213 } else {
214 /* peer event has changed the state */
215 goto again;
216 }
217 break;
218 case SMC_APPFINCLOSEWAIT:
219 /* socket already shutdown wr or both (active close) */
220 if (txflags->peer_done_writing &&
221 !txflags->peer_conn_closed) {
222 /* just shutdown wr done, send close request */
223 rc = smc_close_final(conn);
224 }
225 sk->sk_state = SMC_CLOSED;
226 smc_close_wait_tx_pends(smc);
227 break;
228 case SMC_APPCLOSEWAIT1:
229 case SMC_APPCLOSEWAIT2:
230 if (!smc_cdc_rxed_any_close(conn))
231 smc_close_stream_wait(smc, timeout);
232 release_sock(sk);
233 cancel_work_sync(&conn->tx_work);
234 lock_sock(sk);
235 if (sk->sk_err != ECONNABORTED) {
236 /* confirm close from peer */
237 rc = smc_close_final(conn);
238 if (rc)
239 break;
240 }
241 if (smc_cdc_rxed_any_close(conn))
242 /* peer has closed the socket already */
243 sk->sk_state = SMC_CLOSED;
244 else
245 /* peer has just issued a shutdown write */
246 sk->sk_state = SMC_PEERFINCLOSEWAIT;
247 smc_close_wait_tx_pends(smc);
248 break;
249 case SMC_PEERCLOSEWAIT1:
250 case SMC_PEERCLOSEWAIT2:
251 case SMC_PEERFINCLOSEWAIT:
252 /* peer sending PeerConnectionClosed will cause transition */
253 break;
254 case SMC_PROCESSABORT:
255 cancel_work_sync(&conn->tx_work);
256 smc_close_abort(conn);
257 sk->sk_state = SMC_CLOSED;
258 smc_close_wait_tx_pends(smc);
259 break;
260 case SMC_PEERABORTWAIT:
261 case SMC_CLOSED:
262 /* nothing to do, add tracing in future patch */
263 break;
264 }
265
266 if (old_state != sk->sk_state)
267 sk->sk_state_change(&smc->sk);
268 return rc;
269}
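
/* Editorial summary (not part of the original patch) of the active-close
 * transitions handled above:
 *   SMC_INIT / SMC_LISTEN     -> SMC_CLOSED (no CDC close message needed)
 *   SMC_ACTIVE                -> send close, then SMC_PEERCLOSEWAIT1
 *   SMC_APPCLOSEWAIT1/2       -> confirm close, then SMC_PEERFINCLOSEWAIT,
 *                                or SMC_CLOSED if the peer already closed
 *   SMC_APPFINCLOSEWAIT       -> send close if still needed, then SMC_CLOSED
 *   SMC_PROCESSABORT          -> send abort, then SMC_CLOSED
 *   SMC_PEERCLOSEWAIT1/2, SMC_PEERFINCLOSEWAIT, SMC_PEERABORTWAIT, SMC_CLOSED
 *                             -> wait for the peer / nothing to do
 */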
270
271static void smc_close_passive_abort_received(struct smc_sock *smc)
272{
273 struct smc_cdc_conn_state_flags *txflags =
274 &smc->conn.local_tx_ctrl.conn_state_flags;
275 struct sock *sk = &smc->sk;
276
277 switch (sk->sk_state) {
278 case SMC_ACTIVE:
279 case SMC_APPFINCLOSEWAIT:
280 case SMC_APPCLOSEWAIT1:
281 case SMC_APPCLOSEWAIT2:
282 smc_close_abort(&smc->conn);
283 sk->sk_state = SMC_PROCESSABORT;
284 break;
285 case SMC_PEERCLOSEWAIT1:
286 case SMC_PEERCLOSEWAIT2:
287 if (txflags->peer_done_writing &&
288 !txflags->peer_conn_closed) {
289 /* just shutdown, but not yet closed locally */
290 smc_close_abort(&smc->conn);
291 sk->sk_state = SMC_PROCESSABORT;
292 } else {
293 sk->sk_state = SMC_CLOSED;
294 }
295 break;
296 case SMC_PEERFINCLOSEWAIT:
297 case SMC_PEERABORTWAIT:
298 sk->sk_state = SMC_CLOSED;
299 break;
300 case SMC_INIT:
301 case SMC_PROCESSABORT:
302 /* nothing to do, add tracing in future patch */
303 break;
304 }
305}
306
307/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort,
308 * or peer_done_writing.
309 * Called under tasklet context.
310 */
311void smc_close_passive_received(struct smc_sock *smc)
312{
313 struct smc_cdc_conn_state_flags *rxflags =
314 &smc->conn.local_rx_ctrl.conn_state_flags;
315 struct sock *sk = &smc->sk;
316 int old_state;
317
318 sk->sk_shutdown |= RCV_SHUTDOWN;
319 if (smc->clcsock && smc->clcsock->sk)
320 smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
321 sock_set_flag(&smc->sk, SOCK_DONE);
322
323 old_state = sk->sk_state;
324
325 if (rxflags->peer_conn_abort) {
326 smc_close_passive_abort_received(smc);
327 goto wakeup;
328 }
329
330 switch (sk->sk_state) {
331 case SMC_INIT:
332 if (atomic_read(&smc->conn.bytes_to_rcv) ||
333 (rxflags->peer_done_writing &&
334 !rxflags->peer_conn_closed))
335 sk->sk_state = SMC_APPCLOSEWAIT1;
336 else
337 sk->sk_state = SMC_CLOSED;
338 break;
339 case SMC_ACTIVE:
340 sk->sk_state = SMC_APPCLOSEWAIT1;
341 break;
342 case SMC_PEERCLOSEWAIT1:
343 if (rxflags->peer_done_writing)
344 sk->sk_state = SMC_PEERCLOSEWAIT2;
345 /* fall through to check for closing */
346 case SMC_PEERCLOSEWAIT2:
347 case SMC_PEERFINCLOSEWAIT:
348 if (!smc_cdc_rxed_any_close(&smc->conn))
349 break;
350 if (sock_flag(sk, SOCK_DEAD) &&
351 (sk->sk_shutdown == SHUTDOWN_MASK)) {
352 /* smc_release has already been called locally */
353 sk->sk_state = SMC_CLOSED;
354 } else {
355 /* just shutdown, but not yet closed locally */
356 sk->sk_state = SMC_APPFINCLOSEWAIT;
357 }
358 break;
359 case SMC_APPCLOSEWAIT1:
360 case SMC_APPCLOSEWAIT2:
361 case SMC_APPFINCLOSEWAIT:
362 case SMC_PEERABORTWAIT:
363 case SMC_PROCESSABORT:
364 case SMC_CLOSED:
365 /* nothing to do, add tracing in future patch */
366 break;
367 }
368
369wakeup:
370 if (old_state != sk->sk_state)
371 sk->sk_state_change(sk);
372 sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */
373 sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */
374
375 if ((sk->sk_state == SMC_CLOSED) &&
376 (sock_flag(sk, SOCK_DEAD) || (old_state == SMC_INIT))) {
377 smc_conn_free(&smc->conn);
378 schedule_delayed_work(&smc->sock_put_work,
379 SMC_CLOSE_SOCK_PUT_DELAY);
380 }
381}
382
383void smc_close_sock_put_work(struct work_struct *work)
384{
385 struct smc_sock *smc = container_of(to_delayed_work(work),
386 struct smc_sock,
387 sock_put_work);
388
389 smc->sk.sk_prot->unhash(&smc->sk);
390 sock_put(&smc->sk);
391}
392
393int smc_close_shutdown_write(struct smc_sock *smc)
394{
395 struct smc_connection *conn = &smc->conn;
396 long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
397 struct sock *sk = &smc->sk;
398 int old_state;
399 int rc = 0;
400
401 if (sock_flag(sk, SOCK_LINGER))
402 timeout = sk->sk_lingertime;
403
404again:
405 old_state = sk->sk_state;
406 switch (old_state) {
407 case SMC_ACTIVE:
408 smc_close_stream_wait(smc, timeout);
409 release_sock(sk);
410 cancel_work_sync(&conn->tx_work);
411 lock_sock(sk);
412 /* send close wr request */
413 rc = smc_close_wr(conn);
414 if (sk->sk_state == SMC_ACTIVE)
415 sk->sk_state = SMC_PEERCLOSEWAIT1;
416 else
417 goto again;
418 break;
419 case SMC_APPCLOSEWAIT1:
420 /* passive close */
421 if (!smc_cdc_rxed_any_close(conn))
422 smc_close_stream_wait(smc, timeout);
423 release_sock(sk);
424 cancel_work_sync(&conn->tx_work);
425 lock_sock(sk);
426 /* confirm close from peer */
427 rc = smc_close_wr(conn);
428 sk->sk_state = SMC_APPCLOSEWAIT2;
429 break;
430 case SMC_APPCLOSEWAIT2:
431 case SMC_PEERFINCLOSEWAIT:
432 case SMC_PEERCLOSEWAIT1:
433 case SMC_PEERCLOSEWAIT2:
434 case SMC_APPFINCLOSEWAIT:
435 case SMC_PROCESSABORT:
436 case SMC_PEERABORTWAIT:
437 /* nothing to do, add tracing in future patch */
438 break;
439 }
440
441 if (old_state != sk->sk_state)
442 sk->sk_state_change(&smc->sk);
443 return rc;
444}
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
new file mode 100644
index 000000000000..bc9a2df3633c
--- /dev/null
+++ b/net/smc/smc_close.h
@@ -0,0 +1,28 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Socket Closing
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef SMC_CLOSE_H
12#define SMC_CLOSE_H
13
14#include <linux/workqueue.h>
15
16#include "smc.h"
17
18#define SMC_MAX_STREAM_WAIT_TIMEOUT (2 * HZ)
19#define SMC_CLOSE_SOCK_PUT_DELAY HZ
20
21void smc_close_wake_tx_prepared(struct smc_sock *smc);
22void smc_close_active_abort(struct smc_sock *smc);
23int smc_close_active(struct smc_sock *smc);
24void smc_close_passive_received(struct smc_sock *smc);
25void smc_close_sock_put_work(struct work_struct *work);
26int smc_close_shutdown_write(struct smc_sock *smc);
27
28#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
new file mode 100644
index 000000000000..0eac633fb354
--- /dev/null
+++ b/net/smc/smc_core.c
@@ -0,0 +1,682 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Basic Transport Functions exploiting Infiniband API
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#include <linux/socket.h>
12#include <linux/if_vlan.h>
13#include <linux/random.h>
14#include <linux/workqueue.h>
15#include <net/tcp.h>
16#include <net/sock.h>
17#include <rdma/ib_verbs.h>
18
19#include "smc.h"
20#include "smc_clc.h"
21#include "smc_core.h"
22#include "smc_ib.h"
23#include "smc_wr.h"
24#include "smc_llc.h"
25#include "smc_cdc.h"
26#include "smc_close.h"
27
28#define SMC_LGR_NUM_INCR 256
29#define SMC_LGR_FREE_DELAY (600 * HZ)
30
31static u32 smc_lgr_num; /* unique link group number */
32
33/* Register connection's alert token in our lookup structure.
34 * To use rbtrees we have to implement our own insert core.
35 * Requires @conns_lock
36 * @smc connection to register
37 * Note: the insert itself cannot fail, hence no return value.
38 */
39static void smc_lgr_add_alert_token(struct smc_connection *conn)
40{
41 struct rb_node **link, *parent = NULL;
42 u32 token = conn->alert_token_local;
43
44 link = &conn->lgr->conns_all.rb_node;
45 while (*link) {
46 struct smc_connection *cur = rb_entry(*link,
47 struct smc_connection, alert_node);
48
49 parent = *link;
50 if (cur->alert_token_local > token)
51 link = &parent->rb_left;
52 else
53 link = &parent->rb_right;
54 }
55 /* Put the new node there */
56 rb_link_node(&conn->alert_node, parent, link);
57 rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
58}
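
/* Editorial sketch (not part of the original patch): a lookup consistent with
 * the insert above descends the same way - smaller tokens to the left, larger
 * ones to the right.  The helper actually used elsewhere in this file,
 * smc_lgr_find_conn(), lives in smc_core.h; this is only an illustration.
 */
static inline struct smc_connection *smc_lgr_find_conn_sketch(
	u32 token, struct smc_link_group *lgr)
{
	struct rb_node *node = lgr->conns_all.rb_node;

	while (node) {
		struct smc_connection *cur = rb_entry(node,
					struct smc_connection, alert_node);

		if (cur->alert_token_local > token)
			node = node->rb_left;
		else if (cur->alert_token_local < token)
			node = node->rb_right;
		else
			return cur;	/* token found */
	}
	return NULL;
}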
59
60/* Register connection in link group by assigning an alert token
61 * registered in a search tree.
62 * Requires @conns_lock
63 * Note that '0' is a reserved value and not assigned.
64 */
65static void smc_lgr_register_conn(struct smc_connection *conn)
66{
67 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
68 static atomic_t nexttoken = ATOMIC_INIT(0);
69
70 /* find a new alert_token_local value not yet used by some connection
71 * in this link group
72 */
73 sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
74 while (!conn->alert_token_local) {
75 conn->alert_token_local = atomic_inc_return(&nexttoken);
76 if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
77 conn->alert_token_local = 0;
78 }
79 smc_lgr_add_alert_token(conn);
80 conn->lgr->conns_num++;
81}
82
83/* Unregister connection and reset the alert token of the given connection
84 */
85static void __smc_lgr_unregister_conn(struct smc_connection *conn)
86{
87 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
88 struct smc_link_group *lgr = conn->lgr;
89
90 rb_erase(&conn->alert_node, &lgr->conns_all);
91 lgr->conns_num--;
92 conn->alert_token_local = 0;
93 conn->lgr = NULL;
94 sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
95}
96
97/* Unregister connection and trigger lgr freeing if applicable
98 */
99static void smc_lgr_unregister_conn(struct smc_connection *conn)
100{
101 struct smc_link_group *lgr = conn->lgr;
102 int reduced = 0;
103
104 write_lock_bh(&lgr->conns_lock);
105 if (conn->alert_token_local) {
106 reduced = 1;
107 __smc_lgr_unregister_conn(conn);
108 }
109 write_unlock_bh(&lgr->conns_lock);
110 if (reduced && !lgr->conns_num)
111 schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
112}
113
114static void smc_lgr_free_work(struct work_struct *work)
115{
116 struct smc_link_group *lgr = container_of(to_delayed_work(work),
117 struct smc_link_group,
118 free_work);
119 bool conns;
120
121 spin_lock_bh(&smc_lgr_list.lock);
122 read_lock_bh(&lgr->conns_lock);
123 conns = RB_EMPTY_ROOT(&lgr->conns_all);
124 read_unlock_bh(&lgr->conns_lock);
125 if (!conns) { /* number of lgr connections is no longer zero */
126 spin_unlock_bh(&smc_lgr_list.lock);
127 return;
128 }
129 list_del_init(&lgr->list); /* remove from smc_lgr_list */
130 spin_unlock_bh(&smc_lgr_list.lock);
131 smc_lgr_free(lgr);
132}
133
134/* create a new SMC link group */
135static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
136 struct smc_ib_device *smcibdev, u8 ibport,
137 char *peer_systemid, unsigned short vlan_id)
138{
139 struct smc_link_group *lgr;
140 struct smc_link *lnk;
141 u8 rndvec[3];
142 int rc = 0;
143 int i;
144
145 lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
146 if (!lgr) {
147 rc = -ENOMEM;
148 goto out;
149 }
150 lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
151 lgr->sync_err = false;
152 lgr->daddr = peer_in_addr;
153 memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
154 lgr->vlan_id = vlan_id;
155 rwlock_init(&lgr->sndbufs_lock);
156 rwlock_init(&lgr->rmbs_lock);
157 for (i = 0; i < SMC_RMBE_SIZES; i++) {
158 INIT_LIST_HEAD(&lgr->sndbufs[i]);
159 INIT_LIST_HEAD(&lgr->rmbs[i]);
160 }
161 smc_lgr_num += SMC_LGR_NUM_INCR;
162 memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE);
163 INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
164 lgr->conns_all = RB_ROOT;
165
166 lnk = &lgr->lnk[SMC_SINGLE_LINK];
167 /* initialize link */
168 lnk->smcibdev = smcibdev;
169 lnk->ibport = ibport;
170 lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
171 if (!smcibdev->initialized)
172 smc_ib_setup_per_ibdev(smcibdev);
173 get_random_bytes(rndvec, sizeof(rndvec));
174 lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
175 rc = smc_wr_alloc_link_mem(lnk);
176 if (rc)
177 goto free_lgr;
178 init_waitqueue_head(&lnk->wr_tx_wait);
179 rc = smc_ib_create_protection_domain(lnk);
180 if (rc)
181 goto free_link_mem;
182 rc = smc_ib_create_queue_pair(lnk);
183 if (rc)
184 goto dealloc_pd;
185 rc = smc_wr_create_link(lnk);
186 if (rc)
187 goto destroy_qp;
188 init_completion(&lnk->llc_confirm);
189 init_completion(&lnk->llc_confirm_resp);
190
191 smc->conn.lgr = lgr;
192 rwlock_init(&lgr->conns_lock);
193 spin_lock_bh(&smc_lgr_list.lock);
194 list_add(&lgr->list, &smc_lgr_list.list);
195 spin_unlock_bh(&smc_lgr_list.lock);
196 return 0;
197
198destroy_qp:
199 smc_ib_destroy_queue_pair(lnk);
200dealloc_pd:
201 smc_ib_dealloc_protection_domain(lnk);
202free_link_mem:
203 smc_wr_free_link_mem(lnk);
204free_lgr:
205 kfree(lgr);
206out:
207 return rc;
208}
209
210static void smc_sndbuf_unuse(struct smc_connection *conn)
211{
212 if (conn->sndbuf_desc) {
213 conn->sndbuf_desc->used = 0;
214 conn->sndbuf_size = 0;
215 }
216}
217
218static void smc_rmb_unuse(struct smc_connection *conn)
219{
220 if (conn->rmb_desc) {
221 conn->rmb_desc->used = 0;
222 conn->rmbe_size = 0;
223 }
224}
225
226/* remove a finished connection from its link group */
227void smc_conn_free(struct smc_connection *conn)
228{
229 struct smc_link_group *lgr = conn->lgr;
230
231 if (!lgr)
232 return;
233 smc_cdc_tx_dismiss_slots(conn);
234 smc_lgr_unregister_conn(conn);
235 smc_rmb_unuse(conn);
236 smc_sndbuf_unuse(conn);
237}
238
239static void smc_link_clear(struct smc_link *lnk)
240{
241 lnk->peer_qpn = 0;
242 smc_ib_modify_qp_reset(lnk);
243 smc_wr_free_link(lnk);
244 smc_ib_destroy_queue_pair(lnk);
245 smc_ib_dealloc_protection_domain(lnk);
246 smc_wr_free_link_mem(lnk);
247}
248
249static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
250{
251 struct smc_buf_desc *sndbuf_desc, *bf_desc;
252 int i;
253
254 for (i = 0; i < SMC_RMBE_SIZES; i++) {
255 list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i],
256 list) {
257 list_del(&sndbuf_desc->list);
258 smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
259 smc_uncompress_bufsize(i),
260 sndbuf_desc, DMA_TO_DEVICE);
261 kfree(sndbuf_desc->cpu_addr);
262 kfree(sndbuf_desc);
263 }
264 }
265}
266
267static void smc_lgr_free_rmbs(struct smc_link_group *lgr)
268{
269 struct smc_buf_desc *rmb_desc, *bf_desc;
270 struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
271 int i;
272
273 for (i = 0; i < SMC_RMBE_SIZES; i++) {
274 list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i],
275 list) {
276 list_del(&rmb_desc->list);
277 smc_ib_buf_unmap(lnk->smcibdev,
278 smc_uncompress_bufsize(i),
279 rmb_desc, DMA_FROM_DEVICE);
280 kfree(rmb_desc->cpu_addr);
281 kfree(rmb_desc);
282 }
283 }
284}
285
286/* remove a link group */
287void smc_lgr_free(struct smc_link_group *lgr)
288{
289 smc_lgr_free_rmbs(lgr);
290 smc_lgr_free_sndbufs(lgr);
291 smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
292 kfree(lgr);
293}
294
295/* terminate linkgroup abnormally */
296void smc_lgr_terminate(struct smc_link_group *lgr)
297{
298 struct smc_connection *conn;
299 struct smc_sock *smc;
300 struct rb_node *node;
301
302 spin_lock_bh(&smc_lgr_list.lock);
303 if (list_empty(&lgr->list)) {
304 /* termination already triggered */
305 spin_unlock_bh(&smc_lgr_list.lock);
306 return;
307 }
308 /* do not use this link group for new connections */
309 list_del_init(&lgr->list);
310 spin_unlock_bh(&smc_lgr_list.lock);
311
312 write_lock_bh(&lgr->conns_lock);
313 node = rb_first(&lgr->conns_all);
314 while (node) {
315 conn = rb_entry(node, struct smc_connection, alert_node);
316 smc = container_of(conn, struct smc_sock, conn);
317 sock_hold(&smc->sk);
318 __smc_lgr_unregister_conn(conn);
319 smc_close_active_abort(smc);
320 sock_put(&smc->sk);
321 node = rb_first(&lgr->conns_all);
322 }
323 write_unlock_bh(&lgr->conns_lock);
324}
325
326/* Determine vlan of internal TCP socket.
327 * @vlan_id: address to store the determined vlan id into
328 */
329static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
330{
331 struct dst_entry *dst = sk_dst_get(clcsock->sk);
332 int rc = 0;
333
334 *vlan_id = 0;
335 if (!dst) {
336 rc = -ENOTCONN;
337 goto out;
338 }
339 if (!dst->dev) {
340 rc = -ENODEV;
341 goto out_rel;
342 }
343
344 if (is_vlan_dev(dst->dev))
345 *vlan_id = vlan_dev_vlan_id(dst->dev);
346
347out_rel:
348 dst_release(dst);
349out:
350 return rc;
351}
352
353/* determine the link gid matching the vlan id of the link group */
354static int smc_link_determine_gid(struct smc_link_group *lgr)
355{
356 struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
357 struct ib_gid_attr gattr;
358 union ib_gid gid;
359 int i;
360
361 if (!lgr->vlan_id) {
362 lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
363 return 0;
364 }
365
366 for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
367 i++) {
368 if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
369 &gattr))
370 continue;
371 if (gattr.ndev &&
372 (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
373 lnk->gid = gid;
374 return 0;
375 }
376 }
377 return -ENODEV;
378}
379
380/* create a new SMC connection (and a new link group if necessary) */
381int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
382 struct smc_ib_device *smcibdev, u8 ibport,
383 struct smc_clc_msg_local *lcl, int srv_first_contact)
384{
385 struct smc_connection *conn = &smc->conn;
386 struct smc_link_group *lgr;
387 unsigned short vlan_id;
388 enum smc_lgr_role role;
389 int local_contact = SMC_FIRST_CONTACT;
390 int rc = 0;
391
392 role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
393 rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
394 if (rc)
395 return rc;
396
397 if ((role == SMC_CLNT) && srv_first_contact)
398 /* create new link group as well */
399 goto create;
400
401 /* determine if an existing link group can be reused */
402 spin_lock_bh(&smc_lgr_list.lock);
403 list_for_each_entry(lgr, &smc_lgr_list.list, list) {
404 write_lock_bh(&lgr->conns_lock);
405 if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
406 SMC_SYSTEMID_LEN) &&
407 !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
408 SMC_GID_SIZE) &&
409 !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
410 sizeof(lcl->mac)) &&
411 !lgr->sync_err &&
412 (lgr->role == role) &&
413 (lgr->vlan_id == vlan_id) &&
414 ((role == SMC_CLNT) ||
415 (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
416 /* link group found */
417 local_contact = SMC_REUSE_CONTACT;
418 conn->lgr = lgr;
419 smc_lgr_register_conn(conn); /* add smc conn to lgr */
420 write_unlock_bh(&lgr->conns_lock);
421 break;
422 }
423 write_unlock_bh(&lgr->conns_lock);
424 }
425 spin_unlock_bh(&smc_lgr_list.lock);
426
427 if (role == SMC_CLNT && !srv_first_contact &&
428 (local_contact == SMC_FIRST_CONTACT)) {
429 /* Server reuses a link group, but Client wants to start
430 * a new one
431 * send out_of_sync decline, reason synchr. error
432 */
433 return -ENOLINK;
434 }
435
436create:
437 if (local_contact == SMC_FIRST_CONTACT) {
438 rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport,
439 lcl->id_for_peer, vlan_id);
440 if (rc)
441 goto out;
442 smc_lgr_register_conn(conn); /* add smc conn to lgr */
443 rc = smc_link_determine_gid(conn->lgr);
444 }
445 conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
446 conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg);
447#ifndef KERNEL_HAS_ATOMIC64
448 spin_lock_init(&conn->acurs_lock);
449#endif
450
451out:
452 return rc ? rc : local_contact;
453}
454
455/* try to reuse a sndbuf description slot of the sndbufs list for a certain
456 * buf_size; if not available, return NULL
457 */
458static inline
459struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr,
460 int compressed_bufsize)
461{
462 struct smc_buf_desc *sndbuf_slot;
463
464 read_lock_bh(&lgr->sndbufs_lock);
465 list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize],
466 list) {
467 if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) {
468 read_unlock_bh(&lgr->sndbufs_lock);
469 return sndbuf_slot;
470 }
471 }
472 read_unlock_bh(&lgr->sndbufs_lock);
473 return NULL;
474}
475
476/* try to reuse an rmb description slot of the rmbs list for a certain
477 * rmbe_size; if not available, return NULL
478 */
479static inline
480struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
481 int compressed_bufsize)
482{
483 struct smc_buf_desc *rmb_slot;
484
485 read_lock_bh(&lgr->rmbs_lock);
486 list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize],
487 list) {
488 if (cmpxchg(&rmb_slot->used, 0, 1) == 0) {
489 read_unlock_bh(&lgr->rmbs_lock);
490 return rmb_slot;
491 }
492 }
493 read_unlock_bh(&lgr->rmbs_lock);
494 return NULL;
495}
496
497/* one of the conditions for announcing a receiver's current window size is
498 * that it "results in a minimum increase in the window size of 10% of the
499 * receive buffer space" [RFC7609]
500 */
501static inline int smc_rmb_wnd_update_limit(int rmbe_size)
502{
503 return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
504}
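
/* Editorial example (not part of the original patch): with a 64K RMB the 10%
 * figure is 6553 bytes; the helper takes the smaller of that and
 * SOCK_MIN_SNDBUF / 2, so consumer-cursor updates are deferred until at least
 * that many bytes of receive buffer space have been freed, limiting the
 * number of window-update CDC messages sent to the peer.
 */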
505
506/* create the tx buffer for an SMC socket */
507int smc_sndbuf_create(struct smc_sock *smc)
508{
509 struct smc_connection *conn = &smc->conn;
510 struct smc_link_group *lgr = conn->lgr;
511 int tmp_bufsize, tmp_bufsize_short;
512 struct smc_buf_desc *sndbuf_desc;
513 int rc;
514
515 /* use socket send buffer size (w/o overhead) as start value */
516 for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
517 tmp_bufsize_short >= 0; tmp_bufsize_short--) {
518 tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
519 /* check for reusable sndbuf_slot in the link group */
520 sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short);
521 if (sndbuf_desc) {
522 memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize);
523 break; /* found reusable slot */
524 }
525 /* try to alloc a new send buffer */
526 sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL);
527 if (!sndbuf_desc)
528 break; /* give up with -ENOMEM */
529 sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize,
530 GFP_KERNEL | __GFP_NOWARN |
531 __GFP_NOMEMALLOC |
532 __GFP_NORETRY);
533 if (!sndbuf_desc->cpu_addr) {
534 kfree(sndbuf_desc);
535 sndbuf_desc = NULL;
536 /* if send buffer allocation has failed,
537 * try a smaller one
538 */
539 continue;
540 }
541 rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
542 tmp_bufsize, sndbuf_desc,
543 DMA_TO_DEVICE);
544 if (rc) {
545 kfree(sndbuf_desc->cpu_addr);
546 kfree(sndbuf_desc);
547 sndbuf_desc = NULL;
548 continue; /* if mapping failed, try smaller one */
549 }
550 sndbuf_desc->used = 1;
551 write_lock_bh(&lgr->sndbufs_lock);
552 list_add(&sndbuf_desc->list,
553 &lgr->sndbufs[tmp_bufsize_short]);
554 write_unlock_bh(&lgr->sndbufs_lock);
555 break;
556 }
557 if (sndbuf_desc && sndbuf_desc->cpu_addr) {
558 conn->sndbuf_desc = sndbuf_desc;
559 conn->sndbuf_size = tmp_bufsize;
560 smc->sk.sk_sndbuf = tmp_bufsize * 2;
561 atomic_set(&conn->sndbuf_space, tmp_bufsize);
562 return 0;
563 } else {
564 return -ENOMEM;
565 }
566}
567
568/* create the RMB for an SMC socket (even though the SMC protocol
569 * allows more than one RMB-element per RMB, the Linux implementation
570 * uses just one RMB-element per RMB, i.e. uses an extra RMB for every
571 * connection in a link group)
572 */
573int smc_rmb_create(struct smc_sock *smc)
574{
575 struct smc_connection *conn = &smc->conn;
576 struct smc_link_group *lgr = conn->lgr;
577 int tmp_bufsize, tmp_bufsize_short;
578 struct smc_buf_desc *rmb_desc;
579 int rc;
580
581 /* use socket recv buffer size (w/o overhead) as start value */
582 for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2);
583 tmp_bufsize_short >= 0; tmp_bufsize_short--) {
584 tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
585 /* check for reusable rmb_slot in the link group */
586 rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
587 if (rmb_desc) {
588 memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
589 break; /* found reusable slot */
590 }
591 /* try to alloc a new RMB */
592 rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL);
593 if (!rmb_desc)
594 break; /* give up with -ENOMEM */
595 rmb_desc->cpu_addr = kzalloc(tmp_bufsize,
596 GFP_KERNEL | __GFP_NOWARN |
597 __GFP_NOMEMALLOC |
598 __GFP_NORETRY);
599 if (!rmb_desc->cpu_addr) {
600 kfree(rmb_desc);
601 rmb_desc = NULL;
602 /* if RMB allocation has failed,
603 * try a smaller one
604 */
605 continue;
606 }
607 rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
608 tmp_bufsize, rmb_desc,
609 DMA_FROM_DEVICE);
610 if (rc) {
611 kfree(rmb_desc->cpu_addr);
612 kfree(rmb_desc);
613 rmb_desc = NULL;
614 continue; /* if mapping failed, try smaller one */
615 }
616 rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd,
617 IB_ACCESS_REMOTE_WRITE |
618 IB_ACCESS_LOCAL_WRITE,
619 &rmb_desc->mr_rx[SMC_SINGLE_LINK]);
620 if (rc) {
621 smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
622 tmp_bufsize, rmb_desc,
623 DMA_FROM_DEVICE);
624 kfree(rmb_desc->cpu_addr);
625 kfree(rmb_desc);
626 rmb_desc = NULL;
627 continue;
628 }
629 rmb_desc->used = 1;
630 write_lock_bh(&lgr->rmbs_lock);
631 list_add(&rmb_desc->list,
632 &lgr->rmbs[tmp_bufsize_short]);
633 write_unlock_bh(&lgr->rmbs_lock);
634 break;
635 }
636 if (rmb_desc && rmb_desc->cpu_addr) {
637 conn->rmb_desc = rmb_desc;
638 conn->rmbe_size = tmp_bufsize;
639 conn->rmbe_size_short = tmp_bufsize_short;
640 smc->sk.sk_rcvbuf = tmp_bufsize * 2;
641 atomic_set(&conn->bytes_to_rcv, 0);
642 conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize);
643 return 0;
644 } else {
645 return -ENOMEM;
646 }
647}
648
649static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
650{
651 int i;
652
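	/* the clear-bit scan is only a starting hint; test_and_set_bit claims
	 * the index atomically in case another connection races for it
	 */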
653 for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
654 if (!test_and_set_bit(i, lgr->rtokens_used_mask))
655 return i;
656 }
657 return -ENOSPC;
658}
659
660/* save rkey and dma_addr received from peer during clc handshake */
661int smc_rmb_rtoken_handling(struct smc_connection *conn,
662 struct smc_clc_msg_accept_confirm *clc)
663{
664 u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr);
665 struct smc_link_group *lgr = conn->lgr;
666 u32 rkey = ntohl(clc->rmb_rkey);
667 int i;
668
669 for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
670 if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
671 test_bit(i, lgr->rtokens_used_mask)) {
672 conn->rtoken_idx = i;
673 return 0;
674 }
675 }
676 conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr);
677 if (conn->rtoken_idx < 0)
678 return conn->rtoken_idx;
679 lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey;
680 lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr;
681 return 0;
682}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
new file mode 100644
index 000000000000..27eb38056a27
--- /dev/null
+++ b/net/smc/smc_core.h
@@ -0,0 +1,181 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Definitions for SMC Connections, Link Groups and Links
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef _SMC_CORE_H
12#define _SMC_CORE_H
13
14#include <linux/atomic.h>
15#include <rdma/ib_verbs.h>
16
17#include "smc.h"
18#include "smc_ib.h"
19
20#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */
21
22struct smc_lgr_list { /* list of link group definition */
23 struct list_head list;
24 spinlock_t lock; /* protects list of link groups */
25};
26
27extern struct smc_lgr_list smc_lgr_list; /* list of link groups */
28
29enum smc_lgr_role { /* possible roles of a link group */
30 SMC_CLNT, /* client */
31 SMC_SERV /* server */
32};
33
34#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
35
36struct smc_wr_buf {
37 u8 raw[SMC_WR_BUF_SIZE];
38};
39
40struct smc_link {
41 struct smc_ib_device *smcibdev; /* ib-device */
42 u8 ibport; /* port - values 1 | 2 */
43 struct ib_pd *roce_pd; /* IB protection domain,
44 * unique for every RoCE QP
45 */
46 struct ib_qp *roce_qp; /* IB queue pair */
47 struct ib_qp_attr qp_attr; /* IB queue pair attributes */
48
49 struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
50 struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
51 struct ib_sge *wr_tx_sges; /* WR send gather meta data */
52 struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
53 /* above four vectors have wr_tx_cnt elements and use the same index */
54 dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
55 atomic_long_t wr_tx_id; /* seq # of last sent WR */
56 unsigned long *wr_tx_mask; /* bit mask of used indexes */
57 u32 wr_tx_cnt; /* number of WR send buffers */
58 wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
59
60 struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
61 struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
62 struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
63 /* above three vectors have wr_rx_cnt elements and use the same index */
64 dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
65 u64 wr_rx_id; /* seq # of last recv WR */
66 u32 wr_rx_cnt; /* number of WR recv buffers */
67
68 union ib_gid gid; /* gid matching used vlan id */
69 u32 peer_qpn; /* QP number of peer */
70 enum ib_mtu path_mtu; /* used mtu */
71 enum ib_mtu peer_mtu; /* mtu size of peer */
72 u32 psn_initial; /* QP tx initial packet seqno */
73 u32 peer_psn; /* QP rx initial packet seqno */
74 u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
75 u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/
76 u8 link_id; /* unique # within link group */
77 struct completion llc_confirm; /* wait for rx of conf link */
78 struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
79};
80
81/* For now we just allow one parallel link per link group. The SMC protocol
82 * allows more (up to 8).
83 */
84#define SMC_LINKS_PER_LGR_MAX 1
85#define SMC_SINGLE_LINK 0
86
87#define SMC_FIRST_CONTACT 1 /* first contact to a peer */
88#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/
89
90/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
91struct smc_buf_desc {
92 struct list_head list;
93 u64 dma_addr[SMC_LINKS_PER_LGR_MAX];
94 /* mapped address of buffer */
95 void *cpu_addr; /* virtual address of buffer */
96 struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
97 /* for rmb only:
98 * rkey provided to peer
99 */
100 u32 used; /* currently used / unused */
101};
102
103struct smc_rtoken { /* address/key of remote RMB */
104 u64 dma_addr;
105 u32 rkey;
106};
107
108#define SMC_LGR_ID_SIZE 4
109
110struct smc_link_group {
111 struct list_head list;
112 enum smc_lgr_role role; /* client or server */
113 __be32 daddr; /* destination ip address */
114 struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */
115 char peer_systemid[SMC_SYSTEMID_LEN];
116 /* unique system_id of peer */
117 struct rb_root conns_all; /* connection tree */
118 rwlock_t conns_lock; /* protects conns_all */
119 unsigned int conns_num; /* current # of connections */
120 unsigned short vlan_id; /* vlan id of link group */
121
122 struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
123 rwlock_t sndbufs_lock; /* protects tx buffers */
124 struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
125 rwlock_t rmbs_lock; /* protects rx buffers */
126 struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
127 [SMC_LINKS_PER_LGR_MAX];
128 /* remote addr/key pairs */
129 unsigned long rtokens_used_mask[BITS_TO_LONGS(
130 SMC_RMBS_PER_LGR_MAX)];
131 /* used rtoken elements */
132
133 u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
134 struct delayed_work free_work; /* delayed freeing of an lgr */
135 bool sync_err; /* lgr no longer fits to peer */
136};
137
138/* Find the connection associated with the given alert token in the link group.
139 * To use rbtrees we have to implement our own search core.
140 * Requires @conns_lock
141 * @token alert token to search for
142 * @lgr link group to search in
143 * Returns connection associated with token if found, NULL otherwise.
144 */
145static inline struct smc_connection *smc_lgr_find_conn(
146 u32 token, struct smc_link_group *lgr)
147{
148 struct smc_connection *res = NULL;
149 struct rb_node *node;
150
151 node = lgr->conns_all.rb_node;
152 while (node) {
153 struct smc_connection *cur = rb_entry(node,
154 struct smc_connection, alert_node);
155
156 if (cur->alert_token_local > token) {
157 node = node->rb_left;
158 } else {
159 if (cur->alert_token_local < token) {
160 node = node->rb_right;
161 } else {
162 res = cur;
163 break;
164 }
165 }
166 }
167
168 return res;
169}
170
171struct smc_sock;
172struct smc_clc_msg_accept_confirm;
173
174void smc_lgr_free(struct smc_link_group *lgr);
175void smc_lgr_terminate(struct smc_link_group *lgr);
176int smc_sndbuf_create(struct smc_sock *smc);
177int smc_rmb_create(struct smc_sock *smc);
178int smc_rmb_rtoken_handling(struct smc_connection *conn,
179 struct smc_clc_msg_accept_confirm *clc);
180
181#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
new file mode 100644
index 000000000000..d2d01cf70224
--- /dev/null
+++ b/net/smc/smc_diag.c
@@ -0,0 +1,215 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Monitoring SMC transport protocol sockets
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/init.h>
15#include <linux/sock_diag.h>
16#include <linux/inet_diag.h>
17#include <linux/smc_diag.h>
18#include <net/netlink.h>
19#include <net/smc.h>
20
21#include "smc.h"
22#include "smc_core.h"
23
24static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
25{
26 sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
27 be16_to_cpu(((__be16 *)gid_raw)[0]),
28 be16_to_cpu(((__be16 *)gid_raw)[1]),
29 be16_to_cpu(((__be16 *)gid_raw)[2]),
30 be16_to_cpu(((__be16 *)gid_raw)[3]),
31 be16_to_cpu(((__be16 *)gid_raw)[4]),
32 be16_to_cpu(((__be16 *)gid_raw)[5]),
33 be16_to_cpu(((__be16 *)gid_raw)[6]),
34 be16_to_cpu(((__be16 *)gid_raw)[7]));
35}
36
37static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
38{
39 struct smc_sock *smc = smc_sk(sk);
40
41 r->diag_family = sk->sk_family;
42 if (!smc->clcsock)
43 return;
44 r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
45 r->id.idiag_dport = smc->clcsock->sk->sk_dport;
46 r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
47 sock_diag_save_cookie(sk, r->id.idiag_cookie);
48 memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
49 memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
50 r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
51 r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
52}
53
54static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
55 struct smc_diag_msg *r,
56 struct user_namespace *user_ns)
57{
58 if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown))
59 return 1;
60
61 r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
62 r->diag_inode = sock_i_ino(sk);
63 return 0;
64}
65
66static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
67 struct netlink_callback *cb,
68 const struct smc_diag_req *req,
69 struct nlattr *bc)
70{
71 struct smc_sock *smc = smc_sk(sk);
72 struct user_namespace *user_ns;
73 struct smc_diag_msg *r;
74 struct nlmsghdr *nlh;
75
76 nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
77 cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI);
78 if (!nlh)
79 return -EMSGSIZE;
80
81 r = nlmsg_data(nlh);
82 smc_diag_msg_common_fill(r, sk);
83 r->diag_state = sk->sk_state;
84 r->diag_fallback = smc->use_fallback;
85 user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
86 if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
87 goto errout;
88
89 if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) {
90 struct smc_connection *conn = &smc->conn;
91 struct smc_diag_conninfo cinfo = {
92 .token = conn->alert_token_local,
93 .sndbuf_size = conn->sndbuf_size,
94 .rmbe_size = conn->rmbe_size,
95 .peer_rmbe_size = conn->peer_rmbe_size,
96
97 .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
98 .rx_prod.count = conn->local_rx_ctrl.prod.count,
99 .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap,
100 .rx_cons.count = conn->local_rx_ctrl.cons.count,
101
102 .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap,
103 .tx_prod.count = conn->local_tx_ctrl.prod.count,
104 .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap,
105 .tx_cons.count = conn->local_tx_ctrl.cons.count,
106
107 .tx_prod_flags =
108 *(u8 *)&conn->local_tx_ctrl.prod_flags,
109 .tx_conn_state_flags =
110 *(u8 *)&conn->local_tx_ctrl.conn_state_flags,
111 .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags,
112 .rx_conn_state_flags =
113 *(u8 *)&conn->local_rx_ctrl.conn_state_flags,
114
115 .tx_prep.wrap = conn->tx_curs_prep.wrap,
116 .tx_prep.count = conn->tx_curs_prep.count,
117 .tx_sent.wrap = conn->tx_curs_sent.wrap,
118 .tx_sent.count = conn->tx_curs_sent.count,
119 .tx_fin.wrap = conn->tx_curs_fin.wrap,
120 .tx_fin.count = conn->tx_curs_fin.count,
121 };
122
123 if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0)
124 goto errout;
125 }
126
127 if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) {
128 struct smc_diag_lgrinfo linfo = {
129 .role = smc->conn.lgr->role,
130 .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
131 .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id,
132 };
133
134 memcpy(linfo.lnk[0].ibname,
135 smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
136 sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
137 smc_gid_be16_convert(linfo.lnk[0].gid,
138 smc->conn.lgr->lnk[0].gid.raw);
139 smc_gid_be16_convert(linfo.lnk[0].peer_gid,
140 smc->conn.lgr->lnk[0].peer_gid);
141
142 if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
143 goto errout;
144 }
145
146 nlmsg_end(skb, nlh);
147 return 0;
148
149errout:
150 nlmsg_cancel(skb, nlh);
151 return -EMSGSIZE;
152}
153
154static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
155{
156 struct net *net = sock_net(skb->sk);
157 struct nlattr *bc = NULL;
158 struct hlist_head *head;
159 struct sock *sk;
160 int rc = 0;
161
162 read_lock(&smc_proto.h.smc_hash->lock);
163 head = &smc_proto.h.smc_hash->ht;
164 if (hlist_empty(head))
165 goto out;
166
167 sk_for_each(sk, head) {
168 if (!net_eq(sock_net(sk), net))
169 continue;
170 rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
171 if (rc)
172 break;
173 }
174
175out:
176 read_unlock(&smc_proto.h.smc_hash->lock);
177 return rc;
178}
179
180static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
181{
182 struct net *net = sock_net(skb->sk);
183
184 if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
185 h->nlmsg_flags & NLM_F_DUMP) {
186 {
187 struct netlink_dump_control c = {
188 .dump = smc_diag_dump,
189 .min_dump_alloc = SKB_WITH_OVERHEAD(32768),
190 };
191 return netlink_dump_start(net->diag_nlsk, skb, h, &c);
192 }
193 }
194 return 0;
195}
196
197static const struct sock_diag_handler smc_diag_handler = {
198 .family = AF_SMC,
199 .dump = smc_diag_handler_dump,
200};
201
202static int __init smc_diag_init(void)
203{
204 return sock_diag_register(&smc_diag_handler);
205}
206
207static void __exit smc_diag_exit(void)
208{
209 sock_diag_unregister(&smc_diag_handler);
210}
211
212module_init(smc_diag_init);
213module_exit(smc_diag_exit);
214MODULE_LICENSE("GPL");
215MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
new file mode 100644
index 000000000000..e6743c008ac5
--- /dev/null
+++ b/net/smc/smc_ib.c
@@ -0,0 +1,466 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * IB infrastructure:
5 * Establish SMC-R as an Infiniband Client to be notified about added and
6 * removed IB devices of type RDMA.
7 * Determine device and port characteristics for these IB devices.
8 *
9 * Copyright IBM Corp. 2016
10 *
11 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
12 */
13
14#include <linux/random.h>
15#include <linux/workqueue.h>
16#include <rdma/ib_verbs.h>
17
18#include "smc_pnet.h"
19#include "smc_ib.h"
20#include "smc_core.h"
21#include "smc_wr.h"
22#include "smc.h"
23
24#define SMC_QP_MIN_RNR_TIMER 5
25#define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */
26#define SMC_QP_RETRY_CNT 7 /* 7: infinite */
27#define SMC_QP_RNR_RETRY 7 /* 7: infinite */
28
29struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
30 .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
31 .list = LIST_HEAD_INIT(smc_ib_devices.list),
32};
33
34#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%"
35
36u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
37 * identifier
38 */
39
40int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
41 struct ib_mr **mr)
42{
43 int rc;
44
45 if (*mr)
46 return 0; /* already done */
47
48 /* obtain unique key -
49 * next invocation of get_dma_mr returns a different key!
50 */
51 *mr = pd->device->get_dma_mr(pd, access_flags);
52 rc = PTR_ERR_OR_ZERO(*mr);
53 if (IS_ERR(*mr))
54 *mr = NULL;
55 return rc;
56}
57
58static int smc_ib_modify_qp_init(struct smc_link *lnk)
59{
60 struct ib_qp_attr qp_attr;
61
62 memset(&qp_attr, 0, sizeof(qp_attr));
63 qp_attr.qp_state = IB_QPS_INIT;
64 qp_attr.pkey_index = 0;
65 qp_attr.port_num = lnk->ibport;
66 qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
67 | IB_ACCESS_REMOTE_WRITE;
68 return ib_modify_qp(lnk->roce_qp, &qp_attr,
69 IB_QP_STATE | IB_QP_PKEY_INDEX |
70 IB_QP_ACCESS_FLAGS | IB_QP_PORT);
71}
72
73static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
74{
75 enum ib_qp_attr_mask qp_attr_mask =
76 IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
77 IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
78 struct ib_qp_attr qp_attr;
79
80 memset(&qp_attr, 0, sizeof(qp_attr));
81 qp_attr.qp_state = IB_QPS_RTR;
82 qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
83 qp_attr.ah_attr.port_num = lnk->ibport;
84 qp_attr.ah_attr.ah_flags = IB_AH_GRH;
85 qp_attr.ah_attr.grh.hop_limit = 1;
86 memcpy(&qp_attr.ah_attr.grh.dgid, lnk->peer_gid,
87 sizeof(lnk->peer_gid));
88 memcpy(&qp_attr.ah_attr.dmac, lnk->peer_mac,
89 sizeof(lnk->peer_mac));
90 qp_attr.dest_qp_num = lnk->peer_qpn;
91 qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
92 qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
93 * requests
94 */
95 qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
96
97 return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
98}
99
100int smc_ib_modify_qp_rts(struct smc_link *lnk)
101{
102 struct ib_qp_attr qp_attr;
103
104 memset(&qp_attr, 0, sizeof(qp_attr));
105 qp_attr.qp_state = IB_QPS_RTS;
106 qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */
107 qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */
108 qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */
109 qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */
110 qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and
111 * atomic ops allowed
112 */
113 return ib_modify_qp(lnk->roce_qp, &qp_attr,
114 IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
115 IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
116 IB_QP_MAX_QP_RD_ATOMIC);
117}
118
119int smc_ib_modify_qp_reset(struct smc_link *lnk)
120{
121 struct ib_qp_attr qp_attr;
122
123 memset(&qp_attr, 0, sizeof(qp_attr));
124 qp_attr.qp_state = IB_QPS_RESET;
125 return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
126}
127
128int smc_ib_ready_link(struct smc_link *lnk)
129{
130 struct smc_link_group *lgr =
131 container_of(lnk, struct smc_link_group, lnk[0]);
132 int rc = 0;
133
134 rc = smc_ib_modify_qp_init(lnk);
135 if (rc)
136 goto out;
137
138 rc = smc_ib_modify_qp_rtr(lnk);
139 if (rc)
140 goto out;
141 smc_wr_remember_qp_attr(lnk);
142 rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
143 IB_CQ_SOLICITED_MASK);
144 if (rc)
145 goto out;
146 rc = smc_wr_rx_post_init(lnk);
147 if (rc)
148 goto out;
149 smc_wr_remember_qp_attr(lnk);
150
151 if (lgr->role == SMC_SERV) {
152 rc = smc_ib_modify_qp_rts(lnk);
153 if (rc)
154 goto out;
155 smc_wr_remember_qp_attr(lnk);
156 }
157out:
158 return rc;
159}
160
161/* process context wrapper for might_sleep smc_ib_remember_port_attr */
162static void smc_ib_port_event_work(struct work_struct *work)
163{
164 struct smc_ib_device *smcibdev = container_of(
165 work, struct smc_ib_device, port_event_work);
166 u8 port_idx;
167
168 for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
169 smc_ib_remember_port_attr(smcibdev, port_idx + 1);
170 clear_bit(port_idx, &smcibdev->port_event_mask);
171 }
172}
173
174/* can be called in IRQ context */
175static void smc_ib_global_event_handler(struct ib_event_handler *handler,
176 struct ib_event *ibevent)
177{
178 struct smc_ib_device *smcibdev;
179 u8 port_idx;
180
181 smcibdev = container_of(handler, struct smc_ib_device, event_handler);
182 if (!smc_pnet_find_ib(smcibdev->ibdev->name))
183 return;
184
185 switch (ibevent->event) {
186 case IB_EVENT_PORT_ERR:
187 port_idx = ibevent->element.port_num - 1;
188 set_bit(port_idx, &smcibdev->port_event_mask);
189 schedule_work(&smcibdev->port_event_work);
190 /* fall through */
191 case IB_EVENT_DEVICE_FATAL:
192 /* tbd in follow-on patch:
193 * abnormal close of corresponding connections
194 */
195 break;
196 case IB_EVENT_PORT_ACTIVE:
197 port_idx = ibevent->element.port_num - 1;
198 set_bit(port_idx, &smcibdev->port_event_mask);
199 schedule_work(&smcibdev->port_event_work);
200 break;
201 default:
202 break;
203 }
204}
205
206void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
207{
208 ib_dealloc_pd(lnk->roce_pd);
209 lnk->roce_pd = NULL;
210}
211
212int smc_ib_create_protection_domain(struct smc_link *lnk)
213{
214 int rc;
215
216 lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
217 rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
218 if (IS_ERR(lnk->roce_pd))
219 lnk->roce_pd = NULL;
220 return rc;
221}
222
223static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
224{
225 switch (ibevent->event) {
226 case IB_EVENT_DEVICE_FATAL:
227 case IB_EVENT_GID_CHANGE:
228 case IB_EVENT_PORT_ERR:
229 case IB_EVENT_QP_ACCESS_ERR:
230 /* tbd in follow-on patch:
231 * abnormal close of corresponding connections
232 */
233 break;
234 default:
235 break;
236 }
237}
238
239void smc_ib_destroy_queue_pair(struct smc_link *lnk)
240{
241 ib_destroy_qp(lnk->roce_qp);
242 lnk->roce_qp = NULL;
243}
244
245/* create a queue pair within the protection domain for a link */
246int smc_ib_create_queue_pair(struct smc_link *lnk)
247{
248 struct ib_qp_init_attr qp_attr = {
249 .event_handler = smc_ib_qp_event_handler,
250 .qp_context = lnk,
251 .send_cq = lnk->smcibdev->roce_cq_send,
252 .recv_cq = lnk->smcibdev->roce_cq_recv,
253 .srq = NULL,
254 .cap = {
255 .max_send_wr = SMC_WR_BUF_CNT,
256 /* include unsolicited rdma_writes as well,
257 * there are max. 2 RDMA_WRITE per 1 WR_SEND
258 */
259 .max_recv_wr = SMC_WR_BUF_CNT * 3,
260 .max_send_sge = SMC_IB_MAX_SEND_SGE,
261 .max_recv_sge = 1,
262 .max_inline_data = SMC_WR_TX_SIZE,
263 },
264 .sq_sig_type = IB_SIGNAL_REQ_WR,
265 .qp_type = IB_QPT_RC,
266 };
267 int rc;
268
269 lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
270 rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
271 if (IS_ERR(lnk->roce_qp))
272 lnk->roce_qp = NULL;
273 else
274 smc_wr_remember_qp_attr(lnk);
275 return rc;
276}
277
278/* map a new TX or RX buffer to DMA */
279int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
280 struct smc_buf_desc *buf_slot,
281 enum dma_data_direction data_direction)
282{
283 int rc = 0;
284
285 if (buf_slot->dma_addr[SMC_SINGLE_LINK])
286 return rc; /* already mapped */
287 buf_slot->dma_addr[SMC_SINGLE_LINK] =
288 ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr,
289 buf_size, data_direction);
290 if (ib_dma_mapping_error(smcibdev->ibdev,
291 buf_slot->dma_addr[SMC_SINGLE_LINK]))
292 rc = -EIO;
293 return rc;
294}
295
296void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size,
297 struct smc_buf_desc *buf_slot,
298 enum dma_data_direction data_direction)
299{
300 if (!buf_slot->dma_addr[SMC_SINGLE_LINK])
301 return; /* already unmapped */
302 ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size,
303 data_direction);
304 buf_slot->dma_addr[SMC_SINGLE_LINK] = 0;
305}
306
307static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
308{
309 struct net_device *ndev;
310 int rc;
311
312 rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
313 &smcibdev->gid[ibport - 1], NULL);
314 /* the SMC protocol requires specification of the roce MAC address;
315 * if net_device cannot be determined, it can be derived from gid 0
316 */
317 ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
318 if (ndev) {
319		memcpy(&smcibdev->mac[ibport - 1][0], ndev->dev_addr, ETH_ALEN);
320 } else if (!rc) {
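		/* derive the MAC from the EUI-64 style interface id in gid 0:
		 * take raw bytes 8..10 and 13..15 (skipping the 0xfffe filler)
		 * and clear the locally administered bit
		 */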
321 memcpy(&smcibdev->mac[ibport - 1][0],
322 &smcibdev->gid[ibport - 1].raw[8], 3);
323 memcpy(&smcibdev->mac[ibport - 1][3],
324 &smcibdev->gid[ibport - 1].raw[13], 3);
325 smcibdev->mac[ibport - 1][0] &= ~0x02;
326 }
327 return rc;
328}
329
330/* Create an identifier unique for this instance of SMC-R.
331 * The MAC-address of the first active registered IB device
332 * plus a random 2-byte number is used to create this identifier.
333 * This name is delivered to the peer during connection initialization.
334 */
335static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
336 u8 ibport)
337{
338 memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
339 sizeof(smcibdev->mac[ibport - 1]));
340 get_random_bytes(&local_systemid[0], 2);
341}
342
343bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
344{
345 return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
346}
347
348int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
349{
350 int rc;
351
352 memset(&smcibdev->pattr[ibport - 1], 0,
353 sizeof(smcibdev->pattr[ibport - 1]));
354 rc = ib_query_port(smcibdev->ibdev, ibport,
355 &smcibdev->pattr[ibport - 1]);
356 if (rc)
357 goto out;
358 rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
359 if (rc)
360 goto out;
361 if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
362 sizeof(local_systemid)) &&
363 smc_ib_port_active(smcibdev, ibport))
364 /* create unique system identifier */
365 smc_ib_define_local_systemid(smcibdev, ibport);
366out:
367 return rc;
368}
369
370long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
371{
372 struct ib_cq_init_attr cqattr = {
373 .cqe = SMC_WR_MAX_CQE, .comp_vector = 0 };
374 long rc;
375
376 smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
377 smc_wr_tx_cq_handler, NULL,
378 smcibdev, &cqattr);
379 rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
380 if (IS_ERR(smcibdev->roce_cq_send)) {
381 smcibdev->roce_cq_send = NULL;
382 return rc;
383 }
384 smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
385 smc_wr_rx_cq_handler, NULL,
386 smcibdev, &cqattr);
387 rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
388 if (IS_ERR(smcibdev->roce_cq_recv)) {
389 smcibdev->roce_cq_recv = NULL;
390 goto err;
391 }
392 INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
393 smc_ib_global_event_handler);
394 ib_register_event_handler(&smcibdev->event_handler);
395 smc_wr_add_dev(smcibdev);
396 smcibdev->initialized = 1;
397 return rc;
398
399err:
400 ib_destroy_cq(smcibdev->roce_cq_send);
401 return rc;
402}
403
404static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
405{
406 if (!smcibdev->initialized)
407 return;
408 smc_wr_remove_dev(smcibdev);
409 ib_unregister_event_handler(&smcibdev->event_handler);
410 ib_destroy_cq(smcibdev->roce_cq_recv);
411 ib_destroy_cq(smcibdev->roce_cq_send);
412}
413
414static struct ib_client smc_ib_client;
415
416/* callback function for ib_register_client() */
417static void smc_ib_add_dev(struct ib_device *ibdev)
418{
419 struct smc_ib_device *smcibdev;
420
421 if (ibdev->node_type != RDMA_NODE_IB_CA)
422 return;
423
424 smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
425 if (!smcibdev)
426 return;
427
428 smcibdev->ibdev = ibdev;
429 INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
430
431 spin_lock(&smc_ib_devices.lock);
432 list_add_tail(&smcibdev->list, &smc_ib_devices.list);
433 spin_unlock(&smc_ib_devices.lock);
434 ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
435}
436
437/* callback function for ib_register_client() */
438static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
439{
440 struct smc_ib_device *smcibdev;
441
442 smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
443 ib_set_client_data(ibdev, &smc_ib_client, NULL);
444 spin_lock(&smc_ib_devices.lock);
445 list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
446 spin_unlock(&smc_ib_devices.lock);
447 smc_pnet_remove_by_ibdev(smcibdev);
448 smc_ib_cleanup_per_ibdev(smcibdev);
449 kfree(smcibdev);
450}
451
452static struct ib_client smc_ib_client = {
453 .name = "smc_ib",
454 .add = smc_ib_add_dev,
455 .remove = smc_ib_remove_dev,
456};
457
458int __init smc_ib_register_client(void)
459{
460 return ib_register_client(&smc_ib_client);
461}
462
463void smc_ib_unregister_client(void)
464{
465 ib_unregister_client(&smc_ib_client);
466}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
new file mode 100644
index 000000000000..a95f74bb5569
--- /dev/null
+++ b/net/smc/smc_ib.h
@@ -0,0 +1,71 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Definitions for IB environment
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef _SMC_IB_H
12#define _SMC_IB_H
13
14#include <linux/if_ether.h>
15#include <rdma/ib_verbs.h>
16
17#define SMC_MAX_PORTS 2 /* Max # of ports */
18#define SMC_GID_SIZE sizeof(union ib_gid)
19
20#define SMC_IB_MAX_SEND_SGE 2
21
22struct smc_ib_devices { /* list of smc ib devices definition */
23 struct list_head list;
24 spinlock_t lock; /* protects list of smc ib devices */
25};
26
27extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */
28
29struct smc_ib_device { /* ib-device infos for smc */
30 struct list_head list;
31 struct ib_device *ibdev;
32 struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
33 struct ib_event_handler event_handler; /* global ib_event handler */
34 struct ib_cq *roce_cq_send; /* send completion queue */
35 struct ib_cq *roce_cq_recv; /* recv completion queue */
36 struct tasklet_struct send_tasklet; /* called by send cq handler */
37 struct tasklet_struct recv_tasklet; /* called by recv cq handler */
38 char mac[SMC_MAX_PORTS][ETH_ALEN];
39 /* mac address per port*/
40 union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
41 u8 initialized : 1; /* ib dev CQ, evthdl done */
42 struct work_struct port_event_work;
43 unsigned long port_event_mask;
44};
45
46struct smc_buf_desc;
47struct smc_link;
48
49int smc_ib_register_client(void) __init;
50void smc_ib_unregister_client(void);
51bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
52int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
53int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
54 struct smc_buf_desc *buf_slot,
55 enum dma_data_direction data_direction);
56void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int bufsize,
57 struct smc_buf_desc *buf_slot,
58 enum dma_data_direction data_direction);
59void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
60int smc_ib_create_protection_domain(struct smc_link *lnk);
61void smc_ib_destroy_queue_pair(struct smc_link *lnk);
62int smc_ib_create_queue_pair(struct smc_link *lnk);
63int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
64 struct ib_mr **mr);
65int smc_ib_ready_link(struct smc_link *lnk);
66int smc_ib_modify_qp_rts(struct smc_link *lnk);
67int smc_ib_modify_qp_reset(struct smc_link *lnk);
68long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
69
70
71#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
new file mode 100644
index 000000000000..c2f9165d13ef
--- /dev/null
+++ b/net/smc/smc_llc.c
@@ -0,0 +1,158 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Link Layer Control (LLC)
5 *
6 * For now, we only support the necessary "confirm link" functionality
7 * which happens for the first RoCE link after successful CLC handshake.
8 *
9 * Copyright IBM Corp. 2016
10 *
11 * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
12 * Ursula Braun <ubraun@linux.vnet.ibm.com>
13 */
14
15#include <net/tcp.h>
16#include <rdma/ib_verbs.h>
17
18#include "smc.h"
19#include "smc_core.h"
20#include "smc_clc.h"
21#include "smc_llc.h"
22
23/********************************** send *************************************/
24
25struct smc_llc_tx_pend {
26};
27
28/* handler for send/transmission completion of an LLC msg */
29static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend,
30 struct smc_link *link,
31 enum ib_wc_status wc_status)
32{
33 /* future work: handle wc_status error for recovery and failover */
34}
35
36/**
37 * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits
38 * @link: Pointer to SMC link used for sending LLC control message.
39 * @wr_buf: Out variable returning pointer to work request payload buffer.
40 * @pend: Out variable returning pointer to private pending WR tracking.
41 * It's the context the transmit complete handler will get.
42 *
43 * Reserves and pre-fills an entry for a pending work request send/tx.
44 * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx.
45 * Can sleep due to smc_wr_tx_get_free_slot (if not in softirq context).
46 *
47 * Return: 0 on success, otherwise an error value.
48 */
49static int smc_llc_add_pending_send(struct smc_link *link,
50 struct smc_wr_buf **wr_buf,
51 struct smc_wr_tx_pend_priv **pend)
52{
53 int rc;
54
55 rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend);
56 if (rc < 0)
57 return rc;
58 BUILD_BUG_ON_MSG(
59 sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE,
60 "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)");
61 BUILD_BUG_ON_MSG(
62 sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE,
63 "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
64 BUILD_BUG_ON_MSG(
65 sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
66 "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)");
67 return 0;
68}
69
70/* high-level API to send LLC confirm link */
71int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
72 union ib_gid *gid,
73 enum smc_llc_reqresp reqresp)
74{
75 struct smc_link_group *lgr = container_of(link, struct smc_link_group,
76 lnk[SMC_SINGLE_LINK]);
77 struct smc_llc_msg_confirm_link *confllc;
78 struct smc_wr_tx_pend_priv *pend;
79 struct smc_wr_buf *wr_buf;
80 int rc;
81
82 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
83 if (rc)
84 return rc;
85 confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
86 memset(confllc, 0, sizeof(*confllc));
87 confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
88 confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
89 if (reqresp == SMC_LLC_RESP)
90 confllc->hd.flags |= SMC_LLC_FLAG_RESP;
91 memcpy(confllc->sender_mac, mac, ETH_ALEN);
92 memcpy(confllc->sender_gid, gid, SMC_GID_SIZE);
93 hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
94 /* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */
95 memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE);
96 confllc->max_links = SMC_LINKS_PER_LGR_MAX;
97 /* send llc message */
98 rc = smc_wr_tx_send(link, pend);
99 return rc;
100}
101
102/********************************* receive ***********************************/
103
104static void smc_llc_rx_confirm_link(struct smc_link *link,
105 struct smc_llc_msg_confirm_link *llc)
106{
107 struct smc_link_group *lgr;
108
109 lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
110 if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
111 if (lgr->role == SMC_SERV)
112 complete(&link->llc_confirm_resp);
113 } else {
114 if (lgr->role == SMC_CLNT) {
115 link->link_id = llc->link_num;
116 complete(&link->llc_confirm);
117 }
118 }
119}
120
121static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
122{
123 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
124 union smc_llc_msg *llc = buf;
125
126 if (wc->byte_len < sizeof(*llc))
127 return; /* short message */
128 if (llc->raw.hdr.length != sizeof(*llc))
129 return; /* invalid message */
130 if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK)
131 smc_llc_rx_confirm_link(link, &llc->confirm_link);
132}
133
134/***************************** init, exit, misc ******************************/
135
136static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
137 {
138 .handler = smc_llc_rx_handler,
139 .type = SMC_LLC_CONFIRM_LINK
140 },
141 {
142 .handler = NULL,
143 }
144};
145
146int __init smc_llc_init(void)
147{
148 struct smc_wr_rx_handler *handler;
149 int rc = 0;
150
151 for (handler = smc_llc_rx_handlers; handler->handler; handler++) {
152 INIT_HLIST_NODE(&handler->list);
153 rc = smc_wr_rx_register_handler(handler);
154 if (rc)
155 break;
156 }
157 return rc;
158}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
new file mode 100644
index 000000000000..b472f853953a
--- /dev/null
+++ b/net/smc/smc_llc.h
@@ -0,0 +1,63 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Definitions for LLC (link layer control) message handling
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
9 * Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#ifndef SMC_LLC_H
13#define SMC_LLC_H
14
15#include "smc_wr.h"
16
17#define SMC_LLC_FLAG_RESP 0x80
18
19#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ)
20
21enum smc_llc_reqresp {
22 SMC_LLC_REQ,
23 SMC_LLC_RESP
24};
25
26enum smc_llc_msg_type {
27 SMC_LLC_CONFIRM_LINK = 0x01,
28};
29
30#define SMC_LLC_DATA_LEN 40
31
32struct smc_llc_hdr {
33 struct smc_wr_rx_hdr common;
34 u8 length; /* 44 */
35 u8 reserved;
36 u8 flags;
37};
38
39struct smc_llc_msg_confirm_link { /* type 0x01 */
40 struct smc_llc_hdr hd;
41 u8 sender_mac[ETH_ALEN];
42 u8 sender_gid[SMC_GID_SIZE];
43 u8 sender_qp_num[3];
44 u8 link_num;
45 u8 link_uid[SMC_LGR_ID_SIZE];
46 u8 max_links;
47 u8 reserved[9];
48};
49
50union smc_llc_msg {
51 struct smc_llc_msg_confirm_link confirm_link;
52 struct {
53 struct smc_llc_hdr hdr;
54 u8 data[SMC_LLC_DATA_LEN];
55 } raw;
56};
57
58/* transmit */
59int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid,
60 enum smc_llc_reqresp reqresp);
61int smc_llc_init(void) __init;
62
63#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
new file mode 100644
index 000000000000..9d3e7fb8348d
--- /dev/null
+++ b/net/smc/smc_pnet.c
@@ -0,0 +1,534 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Generic netlink support functions to configure an SMC-R PNET table
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
9 */
10
11#include <linux/module.h>
12#include <linux/list.h>
13#include <linux/ctype.h>
14#include <net/netlink.h>
15#include <net/genetlink.h>
16
17#include <uapi/linux/if.h>
18#include <uapi/linux/smc.h>
19
20#include <rdma/ib_verbs.h>
21
22#include "smc_pnet.h"
23#include "smc_ib.h"
24
25#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */
26
27static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
28 [SMC_PNETID_NAME] = {
29 .type = NLA_NUL_STRING,
30 .len = SMC_MAX_PNET_ID_LEN - 1
31 },
32 [SMC_PNETID_ETHNAME] = {
33 .type = NLA_NUL_STRING,
34 .len = IFNAMSIZ - 1
35 },
36 [SMC_PNETID_IBNAME] = {
37 .type = NLA_NUL_STRING,
38 .len = IB_DEVICE_NAME_MAX - 1
39 },
40 [SMC_PNETID_IBPORT] = { .type = NLA_U8 }
41};
42
43static struct genl_family smc_pnet_nl_family;
44
45/**
46 * struct smc_pnettable - SMC PNET table anchor
47 * @lock: Lock for list action
48 * @pnetlist: List of PNETIDs
49 */
50static struct smc_pnettable {
51 rwlock_t lock;
52 struct list_head pnetlist;
53} smc_pnettable = {
54 .pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist),
55 .lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock)
56};
57
58/**
59 * struct smc_pnetentry - pnet identifier name entry
60 * @list: List node.
61 * @pnet_name: Pnet identifier name
62 * @ndev: pointer to network device.
63 * @smcibdev: Pointer to IB device.
64 */
65struct smc_pnetentry {
66 struct list_head list;
67 char pnet_name[SMC_MAX_PNET_ID_LEN + 1];
68 struct net_device *ndev;
69 struct smc_ib_device *smcibdev;
70 u8 ib_port;
71};
72
73/* Check if two RDMA device entries are identical. Use device name and port
74 * number for comparison.
75 */
76static bool smc_pnet_same_ibname(struct smc_pnetentry *pnetelem, char *ibname,
77 u8 ibport)
78{
79 return pnetelem->ib_port == ibport &&
80 !strncmp(pnetelem->smcibdev->ibdev->name, ibname,
81 sizeof(pnetelem->smcibdev->ibdev->name));
82}
83
84/* Find a pnetid in the pnet table.
85 */
86static struct smc_pnetentry *smc_pnet_find_pnetid(char *pnet_name)
87{
88 struct smc_pnetentry *pnetelem, *found_pnetelem = NULL;
89
90 read_lock(&smc_pnettable.lock);
91 list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
92 if (!strncmp(pnetelem->pnet_name, pnet_name,
93 sizeof(pnetelem->pnet_name))) {
94 found_pnetelem = pnetelem;
95 break;
96 }
97 }
98 read_unlock(&smc_pnettable.lock);
99 return found_pnetelem;
100}
101
102/* Remove a pnetid from the pnet table.
103 */
104static int smc_pnet_remove_by_pnetid(char *pnet_name)
105{
106 struct smc_pnetentry *pnetelem, *tmp_pe;
107 int rc = -ENOENT;
108
109 write_lock(&smc_pnettable.lock);
110 list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
111 list) {
112 if (!strncmp(pnetelem->pnet_name, pnet_name,
113 sizeof(pnetelem->pnet_name))) {
114 list_del(&pnetelem->list);
115 dev_put(pnetelem->ndev);
116 kfree(pnetelem);
117 rc = 0;
118 break;
119 }
120 }
121 write_unlock(&smc_pnettable.lock);
122 return rc;
123}
124
125/* Remove a pnet entry mentioning a given network device from the pnet table.
126 */
127static int smc_pnet_remove_by_ndev(struct net_device *ndev)
128{
129 struct smc_pnetentry *pnetelem, *tmp_pe;
130 int rc = -ENOENT;
131
132 write_lock(&smc_pnettable.lock);
133 list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
134 list) {
135 if (pnetelem->ndev == ndev) {
136 list_del(&pnetelem->list);
137 dev_put(pnetelem->ndev);
138 kfree(pnetelem);
139 rc = 0;
140 break;
141 }
142 }
143 write_unlock(&smc_pnettable.lock);
144 return rc;
145}
146
147/* Remove a pnet entry mentioning a given ib device from the pnet table.
148 */
149int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev)
150{
151 struct smc_pnetentry *pnetelem, *tmp_pe;
152 int rc = -ENOENT;
153
154 write_lock(&smc_pnettable.lock);
155 list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
156 list) {
157 if (pnetelem->smcibdev == ibdev) {
158 list_del(&pnetelem->list);
159 dev_put(pnetelem->ndev);
160 kfree(pnetelem);
161 rc = 0;
162 break;
163 }
164 }
165 write_unlock(&smc_pnettable.lock);
166 return rc;
167}
168
169/* Append a pnetid to the end of the pnet table if not already on this list.
170 */
171static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem)
172{
173 struct smc_pnetentry *pnetelem;
174 int rc = -EEXIST;
175
176 write_lock(&smc_pnettable.lock);
177 list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
178 if (!strncmp(pnetelem->pnet_name, new_pnetelem->pnet_name,
179 sizeof(new_pnetelem->pnet_name)) ||
180 !strncmp(pnetelem->ndev->name, new_pnetelem->ndev->name,
181 sizeof(new_pnetelem->ndev->name)) ||
182 smc_pnet_same_ibname(pnetelem,
183 new_pnetelem->smcibdev->ibdev->name,
184 new_pnetelem->ib_port))
185 goto found;
186 }
187 list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist);
188 rc = 0;
189found:
190 write_unlock(&smc_pnettable.lock);
191 return rc;
192}
193
194/* The limit for pnetid is 16 characters.
195 * Valid characters should be (single-byte character set) a-z, A-Z, 0-9.
196 * Lower case letters are converted to upper case.
197 * Interior blanks should not be used.
198 */
199static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
200{
201 char *bf = skip_spaces(pnet_name);
202 size_t len = strlen(bf);
203 char *end = bf + len;
204
205 if (!len)
206 return false;
207 while (--end >= bf && isspace(*end))
208 ;
209 if (end - bf >= SMC_MAX_PNET_ID_LEN)
210 return false;
211 while (bf <= end) {
212 if (!isalnum(*bf))
213 return false;
214 *pnetid++ = islower(*bf) ? toupper(*bf) : *bf;
215 bf++;
216 }
217 *pnetid = '\0';
218 return true;
219}
220
221/* Find an infiniband device by a given name. The device might not exist. */
222struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
223{
224 struct smc_ib_device *ibdev;
225
226 spin_lock(&smc_ib_devices.lock);
227 list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
228 if (!strncmp(ibdev->ibdev->name, ib_name,
229 sizeof(ibdev->ibdev->name))) {
230 goto out;
231 }
232 }
233 ibdev = NULL;
234out:
235 spin_unlock(&smc_ib_devices.lock);
236 return ibdev;
237}
238
239/* Parse the supplied netlink attributes and fill a pnetentry structure.
240 * For ethernet and infiniband device names verify that the devices exist.
241 */
242static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem,
243 struct nlattr *tb[])
244{
245 char *string, *ibname = NULL;
246 int rc = 0;
247
248 memset(pnetelem, 0, sizeof(*pnetelem));
249 INIT_LIST_HEAD(&pnetelem->list);
250 if (tb[SMC_PNETID_NAME]) {
251 string = (char *)nla_data(tb[SMC_PNETID_NAME]);
252 if (!smc_pnetid_valid(string, pnetelem->pnet_name)) {
253 rc = -EINVAL;
254 goto error;
255 }
256 }
257 if (tb[SMC_PNETID_ETHNAME]) {
258 string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
259 pnetelem->ndev = dev_get_by_name(net, string);
260 if (!pnetelem->ndev)
261 return -ENOENT;
262 }
263 if (tb[SMC_PNETID_IBNAME]) {
264 ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
265 ibname = strim(ibname);
266 pnetelem->smcibdev = smc_pnet_find_ib(ibname);
267 if (!pnetelem->smcibdev) {
268 rc = -ENOENT;
269 goto error;
270 }
271 }
272 if (tb[SMC_PNETID_IBPORT]) {
273 pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
274 if (pnetelem->ib_port > SMC_MAX_PORTS) {
275 rc = -EINVAL;
276 goto error;
277 }
278 }
279 return 0;
280
281error:
282 if (pnetelem->ndev)
283 dev_put(pnetelem->ndev);
284 return rc;
285}
286
287/* Convert an smc_pnetentry to a netlink attribute sequence */
288static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem)
289{
290 if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name) ||
291 nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->ndev->name) ||
292 nla_put_string(msg, SMC_PNETID_IBNAME,
293 pnetelem->smcibdev->ibdev->name) ||
294 nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port))
295 return -1;
296 return 0;
297}
298
299/* Retrieve one PNETID entry */
300static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
301{
302 struct smc_pnetentry *pnetelem;
303 struct sk_buff *msg;
304 void *hdr;
305 int rc;
306
307 pnetelem = smc_pnet_find_pnetid(
308 (char *)nla_data(info->attrs[SMC_PNETID_NAME]));
309 if (!pnetelem)
310 return -ENOENT;
311 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
312 if (!msg)
313 return -ENOMEM;
314
315 hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
316 &smc_pnet_nl_family, 0, SMC_PNETID_GET);
317 if (!hdr) {
318 rc = -EMSGSIZE;
319 goto err_out;
320 }
321
322 if (smc_pnet_set_nla(msg, pnetelem)) {
323 rc = -ENOBUFS;
324 goto err_out;
325 }
326
327 genlmsg_end(msg, hdr);
328 return genlmsg_reply(msg, info);
329
330err_out:
331 nlmsg_free(msg);
332 return rc;
333}
334
335static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
336{
337 struct net *net = genl_info_net(info);
338 struct smc_pnetentry *pnetelem;
339 int rc;
340
341 pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL);
342 if (!pnetelem)
343 return -ENOMEM;
344 rc = smc_pnet_fill_entry(net, pnetelem, info->attrs);
345 if (!rc)
346 rc = smc_pnet_enter(pnetelem);
347 if (rc) {
348 kfree(pnetelem);
349 return rc;
350 }
351 rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port);
352 if (rc)
353 smc_pnet_remove_by_pnetid(pnetelem->pnet_name);
354 return rc;
355}
356
357static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
358{
359 return smc_pnet_remove_by_pnetid(
360 (char *)nla_data(info->attrs[SMC_PNETID_NAME]));
361}
362
363static int smc_pnet_dump_start(struct netlink_callback *cb)
364{
365 cb->args[0] = 0;
366 return 0;
367}
368
369static int smc_pnet_dumpinfo(struct sk_buff *skb,
370 u32 portid, u32 seq, u32 flags,
371 struct smc_pnetentry *pnetelem)
372{
373 void *hdr;
374
375 hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family,
376 flags, SMC_PNETID_GET);
377 if (!hdr)
378 return -ENOMEM;
379 if (smc_pnet_set_nla(skb, pnetelem) < 0) {
380 genlmsg_cancel(skb, hdr);
381 return -EMSGSIZE;
382 }
383 genlmsg_end(skb, hdr);
384 return 0;
385}
386
387static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb)
388{
389 struct smc_pnetentry *pnetelem;
390 int idx = 0;
391
392 read_lock(&smc_pnettable.lock);
393 list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
394 if (idx++ < cb->args[0])
395 continue;
396 if (smc_pnet_dumpinfo(skb, NETLINK_CB(cb->skb).portid,
397 cb->nlh->nlmsg_seq, NLM_F_MULTI,
398 pnetelem)) {
399 --idx;
400 break;
401 }
402 }
403 cb->args[0] = idx;
404 read_unlock(&smc_pnettable.lock);
405 return skb->len;
406}
407
408/* Remove and delete all pnetids from pnet table.
409 */
410static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info)
411{
412 struct smc_pnetentry *pnetelem, *tmp_pe;
413
414 write_lock(&smc_pnettable.lock);
415 list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
416 list) {
417 list_del(&pnetelem->list);
418 dev_put(pnetelem->ndev);
419 kfree(pnetelem);
420 }
421 write_unlock(&smc_pnettable.lock);
422 return 0;
423}
424
425/* SMC_PNETID generic netlink operation definition */
426static const struct genl_ops smc_pnet_ops[] = {
427 {
428 .cmd = SMC_PNETID_GET,
429 .flags = GENL_ADMIN_PERM,
430 .policy = smc_pnet_policy,
431 .doit = smc_pnet_get,
432 .dumpit = smc_pnet_dump,
433 .start = smc_pnet_dump_start
434 },
435 {
436 .cmd = SMC_PNETID_ADD,
437 .flags = GENL_ADMIN_PERM,
438 .policy = smc_pnet_policy,
439 .doit = smc_pnet_add
440 },
441 {
442 .cmd = SMC_PNETID_DEL,
443 .flags = GENL_ADMIN_PERM,
444 .policy = smc_pnet_policy,
445 .doit = smc_pnet_del
446 },
447 {
448 .cmd = SMC_PNETID_FLUSH,
449 .flags = GENL_ADMIN_PERM,
450 .policy = smc_pnet_policy,
451 .doit = smc_pnet_flush
452 }
453};
454
455/* SMC_PNETID family definition */
456static struct genl_family smc_pnet_nl_family = {
457 .hdrsize = 0,
458 .name = SMCR_GENL_FAMILY_NAME,
459 .version = SMCR_GENL_FAMILY_VERSION,
460 .maxattr = SMC_PNETID_MAX,
461 .netnsok = true,
462 .module = THIS_MODULE,
463 .ops = smc_pnet_ops,
464 .n_ops = ARRAY_SIZE(smc_pnet_ops)
465};
466
467static int smc_pnet_netdev_event(struct notifier_block *this,
468 unsigned long event, void *ptr)
469{
470 struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
471
472 switch (event) {
473 case NETDEV_REBOOT:
474 case NETDEV_UNREGISTER:
475 smc_pnet_remove_by_ndev(event_dev);
476 default:
477 break;
478 }
479 return NOTIFY_DONE;
480}
481
482static struct notifier_block smc_netdev_notifier = {
483 .notifier_call = smc_pnet_netdev_event
484};
485
486int __init smc_pnet_init(void)
487{
488 int rc;
489
490 rc = genl_register_family(&smc_pnet_nl_family);
491 if (rc)
492 return rc;
493 rc = register_netdevice_notifier(&smc_netdev_notifier);
494 if (rc)
495 genl_unregister_family(&smc_pnet_nl_family);
496 return rc;
497}
498
499void smc_pnet_exit(void)
500{
501 smc_pnet_flush(NULL, NULL);
502 unregister_netdevice_notifier(&smc_netdev_notifier);
503 genl_unregister_family(&smc_pnet_nl_family);
504}
505
506/* PNET table analysis for a given sock:
507 * determine the ib_device and port belonging to the ethernet interface
508 * used by the internal TCP socket.
509 */
510void smc_pnet_find_roce_resource(struct sock *sk,
511 struct smc_ib_device **smcibdev, u8 *ibport)
512{
513 struct dst_entry *dst = sk_dst_get(sk);
514 struct smc_pnetentry *pnetelem;
515
516 *smcibdev = NULL;
517 *ibport = 0;
518
519 if (!dst)
520 return;
521 if (!dst->dev)
522 goto out_rel;
523 read_lock(&smc_pnettable.lock);
524 list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
525 if (dst->dev == pnetelem->ndev) {
526 *smcibdev = pnetelem->smcibdev;
527 *ibport = pnetelem->ib_port;
528 break;
529 }
530 }
531 read_unlock(&smc_pnettable.lock);
532out_rel:
533 dst_release(dst);
534}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
new file mode 100644
index 000000000000..32ab3df928ca
--- /dev/null
+++ b/net/smc/smc_pnet.h
@@ -0,0 +1,23 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * PNET table queries
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
9 */
10
11#ifndef _SMC_PNET_H
12#define _SMC_PNET_H
13
14struct smc_ib_device;
15
16int smc_pnet_init(void) __init;
17void smc_pnet_exit(void);
18int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev);
19struct smc_ib_device *smc_pnet_find_ib(char *ib_name);
20void smc_pnet_find_roce_resource(struct sock *sk,
21 struct smc_ib_device **smcibdev, u8 *ibport);
22
23#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
new file mode 100644
index 000000000000..c4ef9a4ec569
--- /dev/null
+++ b/net/smc/smc_rx.c
@@ -0,0 +1,219 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Manage RMBE
5 * copy new RMBE data into user space
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#include <linux/net.h>
13#include <linux/rcupdate.h>
14#include <linux/sched/signal.h>
15
16#include <net/sock.h>
17
18#include "smc.h"
19#include "smc_core.h"
20#include "smc_cdc.h"
21#include "smc_tx.h" /* smc_tx_consumer_update() */
22#include "smc_rx.h"
23
24/* callback implementation for sk.sk_data_ready()
25 * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
26 * indirectly called by smc_cdc_msg_recv_action().
27 */
28static void smc_rx_data_ready(struct sock *sk)
29{
30 struct socket_wq *wq;
31
32 /* derived from sock_def_readable() */
33 /* called already in smc_listen_work() */
34 rcu_read_lock();
35 wq = rcu_dereference(sk->sk_wq);
36 if (skwq_has_sleeper(wq))
37 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
38 POLLRDNORM | POLLRDBAND);
39 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
40 (sk->sk_state == SMC_CLOSED))
41 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
42 else
43 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
44 rcu_read_unlock();
45}
46
47/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
48 * @smc smc socket
49 * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout
50 * Returns:
51 * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
52 * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
53 */
54static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
55{
56 DEFINE_WAIT_FUNC(wait, woken_wake_function);
57 struct smc_connection *conn = &smc->conn;
58 struct sock *sk = &smc->sk;
59 int rc;
60
61 if (atomic_read(&conn->bytes_to_rcv))
62 return 1;
63 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
64 add_wait_queue(sk_sleep(sk), &wait);
65 rc = sk_wait_event(sk, timeo,
66 sk->sk_err ||
67 sk->sk_shutdown & RCV_SHUTDOWN ||
68 sock_flag(sk, SOCK_DONE) ||
69 atomic_read(&conn->bytes_to_rcv) ||
70 smc_cdc_rxed_any_close_or_senddone(conn),
71 &wait);
72 remove_wait_queue(sk_sleep(sk), &wait);
73 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
74 return rc;
75}
76
77/* rcvbuf consumer: main API called by socket layer.
78 * called under sk lock.
79 */
80int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
81 int flags)
82{
83 size_t copylen, read_done = 0, read_remaining = len;
84 size_t chunk_len, chunk_off, chunk_len_sum;
85 struct smc_connection *conn = &smc->conn;
86 union smc_host_cursor cons;
87 int readable, chunk;
88 char *rcvbuf_base;
89 struct sock *sk;
90 long timeo;
91 int target; /* Read at least these many bytes */
92 int rc;
93
94 if (unlikely(flags & MSG_ERRQUEUE))
95 return -EINVAL; /* future work for sk.sk_family == AF_SMC */
96 if (flags & MSG_OOB)
97 return -EINVAL; /* future work */
98
99 sk = &smc->sk;
100 if (sk->sk_state == SMC_LISTEN)
101 return -ENOTCONN;
102 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
103 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
104
105 msg->msg_namelen = 0;
106 /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
107 rcvbuf_base = conn->rmb_desc->cpu_addr;
108
109 do { /* while (read_remaining) */
110 if (read_done >= target)
111 break;
112
113 if (atomic_read(&conn->bytes_to_rcv))
114 goto copy;
115
116 if (read_done) {
117 if (sk->sk_err ||
118 sk->sk_state == SMC_CLOSED ||
119 (sk->sk_shutdown & RCV_SHUTDOWN) ||
120 !timeo ||
121 signal_pending(current) ||
122 smc_cdc_rxed_any_close_or_senddone(conn) ||
123 conn->local_tx_ctrl.conn_state_flags.
124 peer_conn_abort)
125 break;
126 } else {
127 if (sock_flag(sk, SOCK_DONE))
128 break;
129 if (sk->sk_err) {
130 read_done = sock_error(sk);
131 break;
132 }
133 if (sk->sk_shutdown & RCV_SHUTDOWN ||
134 smc_cdc_rxed_any_close_or_senddone(conn) ||
135 conn->local_tx_ctrl.conn_state_flags.
136 peer_conn_abort)
137 break;
138 if (sk->sk_state == SMC_CLOSED) {
139 if (!sock_flag(sk, SOCK_DONE)) {
140 /* This occurs when user tries to read
141 * from never connected socket.
142 */
143 read_done = -ENOTCONN;
144 break;
145 }
146 break;
147 }
148 if (signal_pending(current)) {
149 read_done = sock_intr_errno(timeo);
150 break;
151 }
152 }
153
154 if (!atomic_read(&conn->bytes_to_rcv)) {
155 smc_rx_wait_data(smc, &timeo);
156 continue;
157 }
158
159copy:
160 /* initialize variables for 1st iteration of subsequent loop */
161 /* could be just 1 byte, even after smc_rx_wait_data above */
162 readable = atomic_read(&conn->bytes_to_rcv);
163 /* not more than what user space asked for */
164 copylen = min_t(size_t, read_remaining, readable);
165 smc_curs_write(&cons,
166 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
167 conn);
168 /* determine chunks where to read from rcvbuf */
169 /* either unwrapped case, or 1st chunk of wrapped case */
170 chunk_len = min_t(size_t,
171 copylen, conn->rmbe_size - cons.count);
172 chunk_len_sum = chunk_len;
173 chunk_off = cons.count;
174 for (chunk = 0; chunk < 2; chunk++) {
175 if (!(flags & MSG_TRUNC)) {
176 rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
177 chunk_len);
178 if (rc) {
179 if (!read_done)
180 read_done = -EFAULT;
181 goto out;
182 }
183 }
184 read_remaining -= chunk_len;
185 read_done += chunk_len;
186
187 if (chunk_len_sum == copylen)
188 break; /* either on 1st or 2nd iteration */
189 /* prepare next (== 2nd) iteration */
190 chunk_len = copylen - chunk_len; /* remainder */
191 chunk_len_sum += chunk_len;
192 chunk_off = 0; /* modulo offset in recv ring buffer */
193 }
194
195 /* update cursors */
196 if (!(flags & MSG_PEEK)) {
197 smc_curs_add(conn->rmbe_size, &cons, copylen);
198 /* increased in recv tasklet smc_cdc_msg_rcv() */
199 smp_mb__before_atomic();
200 atomic_sub(copylen, &conn->bytes_to_rcv);
201 /* guarantee 0 <= bytes_to_rcv <= rmbe_size */
202 smp_mb__after_atomic();
203 smc_curs_write(&conn->local_tx_ctrl.cons,
204 smc_curs_read(&cons, conn),
205 conn);
206 /* send consumer cursor update if required */
207 /* similar to advertising new TCP rcv_wnd if required */
208 smc_tx_consumer_update(conn);
209 }
210 } while (read_remaining);
211out:
212 return read_done;
213}
214
215/* Initialize receive properties on connection establishment. NB: not __init! */
216void smc_rx_init(struct smc_sock *smc)
217{
218 smc->sk.sk_data_ready = smc_rx_data_ready;
219}
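
Note: the copy loop in smc_rx_recvmsg() above splits every read into at most two chunks because the readable data may wrap around the end of the RMBE ring buffer. A self-contained sketch of that chunking with a toy 8-byte ring:

#include <stdio.h>
#include <string.h>

#define RING_SIZE 8

/* copy 'len' bytes starting at ring offset 'cons' into 'dst' */
static void ring_copy(const char ring[RING_SIZE], size_t cons, size_t len,
                      char *dst)
{
        size_t first = len < RING_SIZE - cons ? len : RING_SIZE - cons;

        memcpy(dst, ring + cons, first);        /* 1st (possibly only) chunk */
        memcpy(dst + first, ring, len - first); /* 2nd chunk after the wrap */
}

int main(void)
{
        /* producer wrapped: logical data "ABCDEFGH" starts at offset 4 */
        const char ring[RING_SIZE] = { 'E', 'F', 'G', 'H', 'A', 'B', 'C', 'D' };
        char out[RING_SIZE + 1] = { 0 };

        ring_copy(ring, 4, 8, out);
        printf("%s\n", out);            /* prints ABCDEFGH */
        return 0;
}
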
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
new file mode 100644
index 000000000000..b5b80e1f8b0f
--- /dev/null
+++ b/net/smc/smc_rx.h
@@ -0,0 +1,23 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Manage RMBE
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef SMC_RX_H
12#define SMC_RX_H
13
14#include <linux/socket.h>
15#include <linux/types.h>
16
17#include "smc.h"
18
19void smc_rx_init(struct smc_sock *smc);
20int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
21 int flags);
22
23#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
new file mode 100644
index 000000000000..69a0013dd25c
--- /dev/null
+++ b/net/smc/smc_tx.c
@@ -0,0 +1,485 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Manage send buffer.
5 * Producer:
6 * Copy user space data into send buffer, if send buffer space available.
7 * Consumer:
8 * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
9 *
10 * Copyright IBM Corp. 2016
11 *
12 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
13 */
14
15#include <linux/net.h>
16#include <linux/rcupdate.h>
17#include <linux/workqueue.h>
18#include <linux/sched/signal.h>
19
20#include <net/sock.h>
21
22#include "smc.h"
23#include "smc_wr.h"
24#include "smc_cdc.h"
25#include "smc_tx.h"
26
27/***************************** sndbuf producer *******************************/
28
29/* callback implementation for sk.sk_write_space()
30 * to wakeup sndbuf producers that blocked with smc_tx_wait_memory().
31 * called under sk_socket lock.
32 */
33static void smc_tx_write_space(struct sock *sk)
34{
35 struct socket *sock = sk->sk_socket;
36 struct smc_sock *smc = smc_sk(sk);
37 struct socket_wq *wq;
38
39 /* similar to sk_stream_write_space */
40 if (atomic_read(&smc->conn.sndbuf_space) && sock) {
41 clear_bit(SOCK_NOSPACE, &sock->flags);
42 rcu_read_lock();
43 wq = rcu_dereference(sk->sk_wq);
44 if (skwq_has_sleeper(wq))
45 wake_up_interruptible_poll(&wq->wait,
46 POLLOUT | POLLWRNORM |
47 POLLWRBAND);
48 if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
49 sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
50 rcu_read_unlock();
51 }
52}
53
54/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
55 * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
56 */
57void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
58{
59 if (smc->sk.sk_socket &&
60 test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
61 smc->sk.sk_write_space(&smc->sk);
62}
63
64/* blocks sndbuf producer until at least one byte of free space available */
65static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
66{
67 DEFINE_WAIT_FUNC(wait, woken_wake_function);
68 struct smc_connection *conn = &smc->conn;
69 struct sock *sk = &smc->sk;
70 bool noblock;
71 long timeo;
72 int rc = 0;
73
74 /* similar to sk_stream_wait_memory */
75 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
76 noblock = timeo ? false : true;
77 add_wait_queue(sk_sleep(sk), &wait);
78 while (1) {
79 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
80 if (sk->sk_err ||
81 (sk->sk_shutdown & SEND_SHUTDOWN) ||
82 conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
83 rc = -EPIPE;
84 break;
85 }
86 if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
87 rc = -ECONNRESET;
88 break;
89 }
90 if (!timeo) {
91 if (noblock)
92 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
93 rc = -EAGAIN;
94 break;
95 }
96 if (signal_pending(current)) {
97 rc = sock_intr_errno(timeo);
98 break;
99 }
100 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
101 if (atomic_read(&conn->sndbuf_space))
102 break; /* at least 1 byte of free space available */
103 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
104 sk->sk_write_pending++;
105 sk_wait_event(sk, &timeo,
106 sk->sk_err ||
107 (sk->sk_shutdown & SEND_SHUTDOWN) ||
108 smc_cdc_rxed_any_close_or_senddone(conn) ||
109 atomic_read(&conn->sndbuf_space),
110 &wait);
111 sk->sk_write_pending--;
112 }
113 remove_wait_queue(sk_sleep(sk), &wait);
114 return rc;
115}
116
117/* sndbuf producer: main API called by socket layer.
118 * called under sock lock.
119 */
120int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
121{
122 size_t copylen, send_done = 0, send_remaining = len;
123 size_t chunk_len, chunk_off, chunk_len_sum;
124 struct smc_connection *conn = &smc->conn;
125 union smc_host_cursor prep;
126 struct sock *sk = &smc->sk;
127 char *sndbuf_base;
128 int tx_cnt_prep;
129 int writespace;
130 int rc, chunk;
131
132 /* This should be in poll */
133 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
134
135 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
136 rc = -EPIPE;
137 goto out_err;
138 }
139
140 while (msg_data_left(msg)) {
141 if (sk->sk_state == SMC_INIT)
142 return -ENOTCONN;
143 if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
144 (smc->sk.sk_err == ECONNABORTED) ||
145 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
146 return -EPIPE;
147 if (smc_cdc_rxed_any_close(conn))
148 return send_done ?: -ECONNRESET;
149
150 if (!atomic_read(&conn->sndbuf_space)) {
151 rc = smc_tx_wait_memory(smc, msg->msg_flags);
152 if (rc) {
153 if (send_done)
154 return send_done;
155 goto out_err;
156 }
157 continue;
158 }
159
160 /* initialize variables for 1st iteration of subsequent loop */
161 /* could be just 1 byte, even after smc_tx_wait_memory above */
162 writespace = atomic_read(&conn->sndbuf_space);
163 /* not more than what user space asked for */
164 copylen = min_t(size_t, send_remaining, writespace);
165 /* determine start of sndbuf */
166 sndbuf_base = conn->sndbuf_desc->cpu_addr;
167 smc_curs_write(&prep,
168 smc_curs_read(&conn->tx_curs_prep, conn),
169 conn);
170 tx_cnt_prep = prep.count;
171 /* determine chunks where to write into sndbuf */
172 /* either unwrapped case, or 1st chunk of wrapped case */
173 chunk_len = min_t(size_t,
174 copylen, conn->sndbuf_size - tx_cnt_prep);
175 chunk_len_sum = chunk_len;
176 chunk_off = tx_cnt_prep;
177 for (chunk = 0; chunk < 2; chunk++) {
178 rc = memcpy_from_msg(sndbuf_base + chunk_off,
179 msg, chunk_len);
180 if (rc) {
181 if (send_done)
182 return send_done;
183 goto out_err;
184 }
185 send_done += chunk_len;
186 send_remaining -= chunk_len;
187
188 if (chunk_len_sum == copylen)
189 break; /* either on 1st or 2nd iteration */
190 /* prepare next (== 2nd) iteration */
191 chunk_len = copylen - chunk_len; /* remainder */
192 chunk_len_sum += chunk_len;
193 chunk_off = 0; /* modulo offset in send ring buffer */
194 }
195 /* update cursors */
196 smc_curs_add(conn->sndbuf_size, &prep, copylen);
197 smc_curs_write(&conn->tx_curs_prep,
198 smc_curs_read(&prep, conn),
199 conn);
200 /* increased in send tasklet smc_cdc_tx_handler() */
201 smp_mb__before_atomic();
202 atomic_sub(copylen, &conn->sndbuf_space);
203 /* guarantee 0 <= sndbuf_space <= sndbuf_size */
204 smp_mb__after_atomic();
205 /* since we just produced more new data into sndbuf,
206 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
207 */
208 smc_tx_sndbuf_nonempty(conn);
209 } /* while (msg_data_left(msg)) */
210
211 return send_done;
212
213out_err:
214 rc = sk_stream_error(sk, msg->msg_flags, rc);
215 /* make sure we wake any epoll edge trigger waiter */
216 if (unlikely(rc == -EAGAIN))
217 sk->sk_write_space(sk);
218 return rc;
219}
220
221/***************************** sndbuf consumer *******************************/
222
223/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
224static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
225 int num_sges, struct ib_sge sges[])
226{
227 struct smc_link_group *lgr = conn->lgr;
228 struct ib_send_wr *failed_wr = NULL;
229 struct ib_rdma_wr rdma_wr;
230 struct smc_link *link;
231 int rc;
232
233 memset(&rdma_wr, 0, sizeof(rdma_wr));
234 link = &lgr->lnk[SMC_SINGLE_LINK];
235 rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link);
236 rdma_wr.wr.sg_list = sges;
237 rdma_wr.wr.num_sge = num_sges;
238 rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
239 rdma_wr.remote_addr =
240 lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
241 /* RMBE within RMB */
242 ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) +
243 /* offset within RMBE */
244 peer_rmbe_offset;
245 rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
246 rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr);
247 if (rc)
248 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
249 return rc;
250}
251
252/* sndbuf consumer */
253static inline void smc_tx_advance_cursors(struct smc_connection *conn,
254 union smc_host_cursor *prod,
255 union smc_host_cursor *sent,
256 size_t len)
257{
258 smc_curs_add(conn->peer_rmbe_size, prod, len);
259 /* increased in recv tasklet smc_cdc_msg_rcv() */
260 smp_mb__before_atomic();
261 /* data in flight reduces usable snd_wnd */
262 atomic_sub(len, &conn->peer_rmbe_space);
263 /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
264 smp_mb__after_atomic();
265 smc_curs_add(conn->sndbuf_size, sent, len);
266}
267
268/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
269 * usable snd_wnd as max transmit
270 */
271static int smc_tx_rdma_writes(struct smc_connection *conn)
272{
273 size_t src_off, src_len, dst_off, dst_len; /* current chunk values */
274 size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk;
275 union smc_host_cursor sent, prep, prod, cons;
276 struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
277 struct smc_link_group *lgr = conn->lgr;
278 int to_send, rmbespace;
279 struct smc_link *link;
280 int num_sges;
281 int rc;
282
283 /* source: sndbuf */
284 smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
285 smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
286 /* cf. wmem_alloc - (snd_max - snd_una) */
287 to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
288 if (to_send <= 0)
289 return 0;
290
291 /* destination: RMBE */
292 /* cf. snd_wnd */
293 rmbespace = atomic_read(&conn->peer_rmbe_space);
294 if (rmbespace <= 0)
295 return 0;
296 smc_curs_write(&prod,
297 smc_curs_read(&conn->local_tx_ctrl.prod, conn),
298 conn);
299 smc_curs_write(&cons,
300 smc_curs_read(&conn->local_rx_ctrl.cons, conn),
301 conn);
302
303 /* if usable snd_wnd closes ask peer to advertise once it opens again */
304 conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
305 /* cf. usable snd_wnd */
306 len = min(to_send, rmbespace);
307
308 /* initialize variables for first iteration of subsequent nested loop */
309 link = &lgr->lnk[SMC_SINGLE_LINK];
310 dst_off = prod.count;
311 if (prod.wrap == cons.wrap) {
312 /* the filled destination area is unwrapped,
313 * hence the available free destination space is wrapped
314 * and we need 2 destination chunks of sum len; start with 1st
315 * which is limited by what's available in sndbuf
316 */
317 dst_len = min_t(size_t,
318 conn->peer_rmbe_size - prod.count, len);
319 } else {
320 /* the filled destination area is wrapped,
321 * hence the available free destination space is unwrapped
322 * and we need a single destination chunk of entire len
323 */
324 dst_len = len;
325 }
326 dst_len_sum = dst_len;
327 src_off = sent.count;
328 /* dst_len determines the maximum src_len */
329 if (sent.count + dst_len <= conn->sndbuf_size) {
330 /* unwrapped src case: single chunk of entire dst_len */
331 src_len = dst_len;
332 } else {
333 /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
334 src_len = conn->sndbuf_size - sent.count;
335 }
336 src_len_sum = src_len;
337 for (dstchunk = 0; dstchunk < 2; dstchunk++) {
338 num_sges = 0;
339 for (srcchunk = 0; srcchunk < 2; srcchunk++) {
340 sges[srcchunk].addr =
341 conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] +
342 src_off;
343 sges[srcchunk].length = src_len;
344 sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
345 num_sges++;
346 src_off += src_len;
347 if (src_off >= conn->sndbuf_size)
348 src_off -= conn->sndbuf_size;
349 /* modulo in send ring */
350 if (src_len_sum == dst_len)
351 break; /* either on 1st or 2nd iteration */
352 /* prepare next (== 2nd) iteration */
353 src_len = dst_len - src_len; /* remainder */
354 src_len_sum += src_len;
355 }
356 rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
357 if (rc)
358 return rc;
359 if (dst_len_sum == len)
360 break; /* either on 1st or 2nd iteration */
361 /* prepare next (== 2nd) iteration */
362 dst_off = 0; /* modulo offset in RMBE ring buffer */
363 dst_len = len - dst_len; /* remainder */
364 dst_len_sum += dst_len;
365 src_len = min_t(int,
366 dst_len, conn->sndbuf_size - sent.count);
367 src_len_sum = src_len;
368 }
369
370 smc_tx_advance_cursors(conn, &prod, &sent, len);
371 /* update connection's cursors with advanced local cursors */
372 smc_curs_write(&conn->local_tx_ctrl.prod,
373 smc_curs_read(&prod, conn),
374 conn);
375 /* dst: peer RMBE */
376 smc_curs_write(&conn->tx_curs_sent,
377 smc_curs_read(&sent, conn),
378 conn);
379 /* src: local sndbuf */
380
381 return 0;
382}
383
384/* Wakeup sndbuf consumers from any context (IRQ or process)
385 * since there is more data to transmit; usable snd_wnd as max transmit
386 */
387int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
388{
389 struct smc_cdc_tx_pend *pend;
390 struct smc_wr_buf *wr_buf;
391 int rc;
392
393 spin_lock_bh(&conn->send_lock);
394 rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
395 &pend);
396 if (rc < 0) {
397 if (rc == -EBUSY) {
398 struct smc_sock *smc =
399 container_of(conn, struct smc_sock, conn);
400
401 if (smc->sk.sk_err == ECONNABORTED) {
402 rc = sock_error(&smc->sk);
403 goto out_unlock;
404 }
405 rc = 0;
406 schedule_work(&conn->tx_work);
407 }
408 goto out_unlock;
409 }
410
411 rc = smc_tx_rdma_writes(conn);
412 if (rc) {
413 smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
414 (struct smc_wr_tx_pend_priv *)pend);
415 goto out_unlock;
416 }
417
418 rc = smc_cdc_msg_send(conn, wr_buf, pend);
419
420out_unlock:
421 spin_unlock_bh(&conn->send_lock);
422 return rc;
423}
424
425/* Wakeup sndbuf consumers from process context
426 * since there is more data to transmit
427 */
428static void smc_tx_work(struct work_struct *work)
429{
430 struct smc_connection *conn = container_of(work,
431 struct smc_connection,
432 tx_work);
433 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
434
435 lock_sock(&smc->sk);
436 smc_tx_sndbuf_nonempty(conn);
437 release_sock(&smc->sk);
438}
439
440void smc_tx_consumer_update(struct smc_connection *conn)
441{
442 union smc_host_cursor cfed, cons;
443 struct smc_cdc_tx_pend *pend;
444 struct smc_wr_buf *wr_buf;
445 int to_confirm, rc;
446
447 smc_curs_write(&cons,
448 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
449 conn);
450 smc_curs_write(&cfed,
451 smc_curs_read(&conn->rx_curs_confirmed, conn),
452 conn);
453 to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
454
455 if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
456 ((to_confirm > conn->rmbe_update_limit) &&
457 ((to_confirm > (conn->rmbe_size / 2)) ||
458 conn->local_rx_ctrl.prod_flags.write_blocked))) {
459 rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
460 &wr_buf, &pend);
461 if (!rc)
462 rc = smc_cdc_msg_send(conn, wr_buf, pend);
463 if (rc < 0) {
464 schedule_work(&conn->tx_work);
465 return;
466 }
467 smc_curs_write(&conn->rx_curs_confirmed,
468 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
469 conn);
470 conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
471 }
472 if (conn->local_rx_ctrl.prod_flags.write_blocked &&
473 !atomic_read(&conn->bytes_to_rcv))
474 conn->local_rx_ctrl.prod_flags.write_blocked = 0;
475}
476
477/***************************** send initialize *******************************/
478
479/* Initialize send properties on connection establishment. NB: not __init! */
480void smc_tx_init(struct smc_sock *smc)
481{
482 smc->sk.sk_write_space = smc_tx_write_space;
483 INIT_WORK(&smc->conn.tx_work, smc_tx_work);
484 spin_lock_init(&smc->conn.send_lock);
485}
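
Note: the send path above tracks positions with (wrap, count) cursors; smc_curs_diff() gives the bytes produced into the sndbuf but not yet sent, and the RDMA write is bounded by min(to_send, rmbespace), the usable window in the peer RMBE. A standalone model of the cursor arithmetic with made-up sizes, not the kernel definitions:

#include <assert.h>
#include <stdio.h>

struct cursor {
        unsigned int wrap;      /* how often the cursor wrapped around */
        unsigned int count;     /* offset within the ring buffer */
};

/* advance a cursor by len bytes in a ring of the given size (len < size) */
static void curs_add(unsigned int size, struct cursor *c, unsigned int len)
{
        c->count += len;
        if (c->count >= size) {
                c->wrap++;
                c->count -= size;
        }
}

/* bytes between two cursors, new at most one wrap ahead of old */
static unsigned int curs_diff(unsigned int size, const struct cursor *old,
                              const struct cursor *new)
{
        if (old->wrap != new->wrap)
                return size - old->count + new->count;
        return new->count - old->count;
}

int main(void)
{
        struct cursor sent = { 0, 6 };  /* consumer: sent up to offset 6 */
        struct cursor prep = { 0, 6 };  /* producer: same position */

        curs_add(8, &prep, 5);          /* produce 5 bytes, wraps the ring */
        assert(prep.wrap == 1 && prep.count == 3);
        printf("to_send = %u\n", curs_diff(8, &sent, &prep));  /* 5 */
        return 0;
}
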
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
new file mode 100644
index 000000000000..1d6a0dcdcfe6
--- /dev/null
+++ b/net/smc/smc_tx.h
@@ -0,0 +1,35 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Manage send buffer
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef SMC_TX_H
12#define SMC_TX_H
13
14#include <linux/socket.h>
15#include <linux/types.h>
16
17#include "smc.h"
18#include "smc_cdc.h"
19
20static inline int smc_tx_prepared_sends(struct smc_connection *conn)
21{
22 union smc_host_cursor sent, prep;
23
24 smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
25 smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
26 return smc_curs_diff(conn->sndbuf_size, &sent, &prep);
27}
28
29void smc_tx_init(struct smc_sock *smc);
30int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
31int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
32void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
33void smc_tx_consumer_update(struct smc_connection *conn);
34
35#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
new file mode 100644
index 000000000000..eadf157418dc
--- /dev/null
+++ b/net/smc/smc_wr.c
@@ -0,0 +1,614 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Work Requests exploiting Infiniband API
5 *
6 * Work requests (WR) of type ib_post_send or ib_post_recv respectively
7 * are submitted to either RC SQ or RC RQ respectively
8 * (reliably connected send/receive queue)
9 * and become work queue entries (WQEs).
10 * While an SQ WR/WQE is pending, we track it until transmission completion.
11 * Through a send or receive completion queue (CQ) respectively,
12 * we get completion queue entries (CQEs) [aka work completions (WCs)].
13 * Since the CQ callback is called from IRQ context, we split work by using
14 * bottom halves implemented by tasklets.
15 *
16 * SMC uses this to exchange LLC (link layer control)
17 * and CDC (connection data control) messages.
18 *
19 * Copyright IBM Corp. 2016
20 *
21 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
22 */
23
24#include <linux/atomic.h>
25#include <linux/hashtable.h>
26#include <linux/wait.h>
27#include <rdma/ib_verbs.h>
28#include <asm/div64.h>
29
30#include "smc.h"
31#include "smc_wr.h"
32
33#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
34
35#define SMC_WR_RX_HASH_BITS 4
36static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
37static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
38
39struct smc_wr_tx_pend { /* control data for a pending send request */
40 u64 wr_id; /* work request id sent */
41 smc_wr_tx_handler handler;
42 enum ib_wc_status wc_status; /* CQE status */
43 struct smc_link *link;
44 u32 idx;
45 struct smc_wr_tx_pend_priv priv;
46};
47
48/******************************** send queue *********************************/
49
50/*------------------------------- completion --------------------------------*/
51
52static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
53{
54 u32 i;
55
56 for (i = 0; i < link->wr_tx_cnt; i++) {
57 if (link->wr_tx_pends[i].wr_id == wr_id)
58 return i;
59 }
60 return link->wr_tx_cnt;
61}
62
63static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
64{
65 struct smc_wr_tx_pend pnd_snd;
66 struct smc_link *link;
67 u32 pnd_snd_idx;
68 int i;
69
70 link = wc->qp->qp_context;
71 pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
72 if (pnd_snd_idx == link->wr_tx_cnt)
73 return;
74 link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
75 memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
76 /* clear the full struct smc_wr_tx_pend including .priv */
77 memset(&link->wr_tx_pends[pnd_snd_idx], 0,
78 sizeof(link->wr_tx_pends[pnd_snd_idx]));
79 memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
80 sizeof(link->wr_tx_bufs[pnd_snd_idx]));
81 if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
82 return;
83 if (wc->status) {
84 struct smc_link_group *lgr;
85
86 for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
87 /* clear full struct smc_wr_tx_pend including .priv */
88 memset(&link->wr_tx_pends[i], 0,
89 sizeof(link->wr_tx_pends[i]));
90 memset(&link->wr_tx_bufs[i], 0,
91 sizeof(link->wr_tx_bufs[i]));
92 clear_bit(i, link->wr_tx_mask);
93 }
94 /* terminate connections of this link group abnormally */
95 lgr = container_of(link, struct smc_link_group,
96 lnk[SMC_SINGLE_LINK]);
97 smc_lgr_terminate(lgr);
98 }
99 if (pnd_snd.handler)
100 pnd_snd.handler(&pnd_snd.priv, link, wc->status);
101 wake_up(&link->wr_tx_wait);
102}
103
104static void smc_wr_tx_tasklet_fn(unsigned long data)
105{
106 struct smc_ib_device *dev = (struct smc_ib_device *)data;
107 struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
108 int i = 0, rc;
109 int polled = 0;
110
111again:
112 polled++;
113 do {
114 rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
115 if (polled == 1) {
116 ib_req_notify_cq(dev->roce_cq_send,
117 IB_CQ_NEXT_COMP |
118 IB_CQ_REPORT_MISSED_EVENTS);
119 }
120 if (!rc)
121 break;
122 for (i = 0; i < rc; i++)
123 smc_wr_tx_process_cqe(&wc[i]);
124 } while (rc > 0);
125 if (polled == 1)
126 goto again;
127}
128
129void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
130{
131 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
132
133 tasklet_schedule(&dev->send_tasklet);
134}
135
136/*---------------------------- request submission ---------------------------*/
137
138static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
139{
140 *idx = link->wr_tx_cnt;
141 for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
142 if (!test_and_set_bit(*idx, link->wr_tx_mask))
143 return 0;
144 }
145 *idx = link->wr_tx_cnt;
146 return -EBUSY;
147}
148
149/**
150 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
151 * and sets info for pending transmit tracking
152 * @link: Pointer to smc_link used to later send the message.
153 * @handler: Send completion handler function pointer.
154 * @wr_buf: Out value returns pointer to message buffer.
155 * @wr_pend_priv: Out value returns pointer serving as handler context.
156 *
157 * Return: 0 on success, or -errno on error.
158 */
159int smc_wr_tx_get_free_slot(struct smc_link *link,
160 smc_wr_tx_handler handler,
161 struct smc_wr_buf **wr_buf,
162 struct smc_wr_tx_pend_priv **wr_pend_priv)
163{
164 struct smc_wr_tx_pend *wr_pend;
165 struct ib_send_wr *wr_ib;
166 u64 wr_id;
167 u32 idx;
168 int rc;
169
170 *wr_buf = NULL;
171 *wr_pend_priv = NULL;
172 if (in_softirq()) {
173 rc = smc_wr_tx_get_free_slot_index(link, &idx);
174 if (rc)
175 return rc;
176 } else {
177 rc = wait_event_interruptible_timeout(
178 link->wr_tx_wait,
179 (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
180 SMC_WR_TX_WAIT_FREE_SLOT_TIME);
181 if (!rc) {
182 /* timeout - terminate connections */
183 struct smc_link_group *lgr;
184
185 lgr = container_of(link, struct smc_link_group,
186 lnk[SMC_SINGLE_LINK]);
187 smc_lgr_terminate(lgr);
188 return -EPIPE;
189 }
190 if (rc == -ERESTARTSYS)
191 return -EINTR;
192 if (idx == link->wr_tx_cnt)
193 return -EPIPE;
194 }
195 wr_id = smc_wr_tx_get_next_wr_id(link);
196 wr_pend = &link->wr_tx_pends[idx];
197 wr_pend->wr_id = wr_id;
198 wr_pend->handler = handler;
199 wr_pend->link = link;
200 wr_pend->idx = idx;
201 wr_ib = &link->wr_tx_ibs[idx];
202 wr_ib->wr_id = wr_id;
203 *wr_buf = &link->wr_tx_bufs[idx];
204 *wr_pend_priv = &wr_pend->priv;
205 return 0;
206}
207
208int smc_wr_tx_put_slot(struct smc_link *link,
209 struct smc_wr_tx_pend_priv *wr_pend_priv)
210{
211 struct smc_wr_tx_pend *pend;
212
213 pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
214 if (pend->idx < link->wr_tx_cnt) {
215 /* clear the full struct smc_wr_tx_pend including .priv */
216 memset(&link->wr_tx_pends[pend->idx], 0,
217 sizeof(link->wr_tx_pends[pend->idx]));
218 memset(&link->wr_tx_bufs[pend->idx], 0,
219 sizeof(link->wr_tx_bufs[pend->idx]));
220 test_and_clear_bit(pend->idx, link->wr_tx_mask);
221 return 1;
222 }
223
224 return 0;
225}
226
227/* Send prepared WR slot via ib_post_send.
228 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
229 */
230int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
231{
232 struct ib_send_wr *failed_wr = NULL;
233 struct smc_wr_tx_pend *pend;
234 int rc;
235
236 ib_req_notify_cq(link->smcibdev->roce_cq_send,
237 IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS);
238 pend = container_of(priv, struct smc_wr_tx_pend, priv);
239 rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
240 &failed_wr);
241 if (rc)
242 smc_wr_tx_put_slot(link, priv);
243 return rc;
244}
245
246void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
247 smc_wr_tx_filter filter,
248 smc_wr_tx_dismisser dismisser,
249 unsigned long data)
250{
251 struct smc_wr_tx_pend_priv *tx_pend;
252 struct smc_wr_rx_hdr *wr_rx;
253 int i;
254
255 for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
256 wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
257 if (wr_rx->type != wr_rx_hdr_type)
258 continue;
259 tx_pend = &link->wr_tx_pends[i].priv;
260 if (filter(tx_pend, data))
261 dismisser(tx_pend);
262 }
263}
264
265bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
266 smc_wr_tx_filter filter, unsigned long data)
267{
268 struct smc_wr_tx_pend_priv *tx_pend;
269 struct smc_wr_rx_hdr *wr_rx;
270 int i;
271
272 for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
273 wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
274 if (wr_rx->type != wr_rx_hdr_type)
275 continue;
276 tx_pend = &link->wr_tx_pends[i].priv;
277 if (filter(tx_pend, data))
278 return true;
279 }
280 return false;
281}
282
283/****************************** receive queue ********************************/
284
285int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
286{
287 struct smc_wr_rx_handler *h_iter;
288 int rc = 0;
289
290 spin_lock(&smc_wr_rx_hash_lock);
291 hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
292 if (h_iter->type == handler->type) {
293 rc = -EEXIST;
294 goto out_unlock;
295 }
296 }
297 hash_add(smc_wr_rx_hash, &handler->list, handler->type);
298out_unlock:
299 spin_unlock(&smc_wr_rx_hash_lock);
300 return rc;
301}
302
303/* Demultiplex a received work request based on the message type to its handler.
304 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
305 * and not being modified any more afterwards so we don't need to lock it.
306 */
307static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
308{
309 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
310 struct smc_wr_rx_handler *handler;
311 struct smc_wr_rx_hdr *wr_rx;
312 u64 temp_wr_id;
313 u32 index;
314
315 if (wc->byte_len < sizeof(*wr_rx))
316 return; /* short message */
317 temp_wr_id = wc->wr_id;
318 index = do_div(temp_wr_id, link->wr_rx_cnt);
319 wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
320 hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
321 if (handler->type == wr_rx->type)
322 handler->handler(wc, wr_rx);
323 }
324}
325
326static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
327{
328 struct smc_link *link;
329 int i;
330
331 for (i = 0; i < num; i++) {
332 link = wc[i].qp->qp_context;
333 if (wc[i].status == IB_WC_SUCCESS) {
334 smc_wr_rx_demultiplex(&wc[i]);
335 smc_wr_rx_post(link); /* refill WR RX */
336 } else {
337 struct smc_link_group *lgr;
338
339 /* handle status errors */
340 switch (wc[i].status) {
341 case IB_WC_RETRY_EXC_ERR:
342 case IB_WC_RNR_RETRY_EXC_ERR:
343 case IB_WC_WR_FLUSH_ERR:
344 /* terminate connections of this link group
345 * abnormally
346 */
347 lgr = container_of(link, struct smc_link_group,
348 lnk[SMC_SINGLE_LINK]);
349 smc_lgr_terminate(lgr);
350 break;
351 default:
352 smc_wr_rx_post(link); /* refill WR RX */
353 break;
354 }
355 }
356 }
357}
358
359static void smc_wr_rx_tasklet_fn(unsigned long data)
360{
361 struct smc_ib_device *dev = (struct smc_ib_device *)data;
362 struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
363 int polled = 0;
364 int rc;
365
366again:
367 polled++;
368 do {
369 memset(&wc, 0, sizeof(wc));
370 rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
371 if (polled == 1) {
372 ib_req_notify_cq(dev->roce_cq_recv,
373 IB_CQ_SOLICITED_MASK
374 | IB_CQ_REPORT_MISSED_EVENTS);
375 }
376 if (!rc)
377 break;
378 smc_wr_rx_process_cqes(&wc[0], rc);
379 } while (rc > 0);
380 if (polled == 1)
381 goto again;
382}
383
384void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
385{
386 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
387
388 tasklet_schedule(&dev->recv_tasklet);
389}
390
391int smc_wr_rx_post_init(struct smc_link *link)
392{
393 u32 i;
394 int rc = 0;
395
396 for (i = 0; i < link->wr_rx_cnt; i++)
397 rc = smc_wr_rx_post(link);
398 return rc;
399}
400
401/***************************** init, exit, misc ******************************/
402
403void smc_wr_remember_qp_attr(struct smc_link *lnk)
404{
405 struct ib_qp_attr *attr = &lnk->qp_attr;
406 struct ib_qp_init_attr init_attr;
407
408 memset(attr, 0, sizeof(*attr));
409 memset(&init_attr, 0, sizeof(init_attr));
410 ib_query_qp(lnk->roce_qp, attr,
411 IB_QP_STATE |
412 IB_QP_CUR_STATE |
413 IB_QP_PKEY_INDEX |
414 IB_QP_PORT |
415 IB_QP_QKEY |
416 IB_QP_AV |
417 IB_QP_PATH_MTU |
418 IB_QP_TIMEOUT |
419 IB_QP_RETRY_CNT |
420 IB_QP_RNR_RETRY |
421 IB_QP_RQ_PSN |
422 IB_QP_ALT_PATH |
423 IB_QP_MIN_RNR_TIMER |
424 IB_QP_SQ_PSN |
425 IB_QP_PATH_MIG_STATE |
426 IB_QP_CAP |
427 IB_QP_DEST_QPN,
428 &init_attr);
429
430 lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
431 lnk->qp_attr.cap.max_send_wr);
432 lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
433 lnk->qp_attr.cap.max_recv_wr);
434}
435
436static void smc_wr_init_sge(struct smc_link *lnk)
437{
438 u32 i;
439
440 for (i = 0; i < lnk->wr_tx_cnt; i++) {
441 lnk->wr_tx_sges[i].addr =
442 lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
443 lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
444 lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
445 lnk->wr_tx_ibs[i].next = NULL;
446 lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
447 lnk->wr_tx_ibs[i].num_sge = 1;
448 lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
449 lnk->wr_tx_ibs[i].send_flags =
450 IB_SEND_SIGNALED | IB_SEND_SOLICITED | IB_SEND_INLINE;
451 }
452 for (i = 0; i < lnk->wr_rx_cnt; i++) {
453 lnk->wr_rx_sges[i].addr =
454 lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
455 lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
456 lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
457 lnk->wr_rx_ibs[i].next = NULL;
458 lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
459 lnk->wr_rx_ibs[i].num_sge = 1;
460 }
461}
462
463void smc_wr_free_link(struct smc_link *lnk)
464{
465 struct ib_device *ibdev;
466
467 memset(lnk->wr_tx_mask, 0,
468 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
469
470 if (!lnk->smcibdev)
471 return;
472 ibdev = lnk->smcibdev->ibdev;
473
474 if (lnk->wr_rx_dma_addr) {
475 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
476 SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
477 DMA_FROM_DEVICE);
478 lnk->wr_rx_dma_addr = 0;
479 }
480 if (lnk->wr_tx_dma_addr) {
481 ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
482 SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
483 DMA_TO_DEVICE);
484 lnk->wr_tx_dma_addr = 0;
485 }
486}
487
488void smc_wr_free_link_mem(struct smc_link *lnk)
489{
490 kfree(lnk->wr_tx_pends);
491 lnk->wr_tx_pends = NULL;
492 kfree(lnk->wr_tx_mask);
493 lnk->wr_tx_mask = NULL;
494 kfree(lnk->wr_tx_sges);
495 lnk->wr_tx_sges = NULL;
496 kfree(lnk->wr_rx_sges);
497 lnk->wr_rx_sges = NULL;
498 kfree(lnk->wr_rx_ibs);
499 lnk->wr_rx_ibs = NULL;
500 kfree(lnk->wr_tx_ibs);
501 lnk->wr_tx_ibs = NULL;
502 kfree(lnk->wr_tx_bufs);
503 lnk->wr_tx_bufs = NULL;
504 kfree(lnk->wr_rx_bufs);
505 lnk->wr_rx_bufs = NULL;
506}
507
508int smc_wr_alloc_link_mem(struct smc_link *link)
509{
510 /* allocate link related memory */
511 link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
512 if (!link->wr_tx_bufs)
513 goto no_mem;
514 link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
515 GFP_KERNEL);
516 if (!link->wr_rx_bufs)
517 goto no_mem_wr_tx_bufs;
518 link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
519 GFP_KERNEL);
520 if (!link->wr_tx_ibs)
521 goto no_mem_wr_rx_bufs;
522 link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
523 sizeof(link->wr_rx_ibs[0]),
524 GFP_KERNEL);
525 if (!link->wr_rx_ibs)
526 goto no_mem_wr_tx_ibs;
527 link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
528 GFP_KERNEL);
529 if (!link->wr_tx_sges)
530 goto no_mem_wr_rx_ibs;
531 link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
532 sizeof(link->wr_rx_sges[0]),
533 GFP_KERNEL);
534 if (!link->wr_rx_sges)
535 goto no_mem_wr_tx_sges;
536 link->wr_tx_mask = kzalloc(
537 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask),
538 GFP_KERNEL);
539 if (!link->wr_tx_mask)
540 goto no_mem_wr_rx_sges;
541 link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
542 sizeof(link->wr_tx_pends[0]),
543 GFP_KERNEL);
544 if (!link->wr_tx_pends)
545 goto no_mem_wr_tx_mask;
546 return 0;
547
548no_mem_wr_tx_mask:
549 kfree(link->wr_tx_mask);
550no_mem_wr_rx_sges:
551 kfree(link->wr_rx_sges);
552no_mem_wr_tx_sges:
553 kfree(link->wr_tx_sges);
554no_mem_wr_rx_ibs:
555 kfree(link->wr_rx_ibs);
556no_mem_wr_tx_ibs:
557 kfree(link->wr_tx_ibs);
558no_mem_wr_rx_bufs:
559 kfree(link->wr_rx_bufs);
560no_mem_wr_tx_bufs:
561 kfree(link->wr_tx_bufs);
562no_mem:
563 return -ENOMEM;
564}
565
566void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
567{
568 tasklet_kill(&smcibdev->recv_tasklet);
569 tasklet_kill(&smcibdev->send_tasklet);
570}
571
572void smc_wr_add_dev(struct smc_ib_device *smcibdev)
573{
574 tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
575 (unsigned long)smcibdev);
576 tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
577 (unsigned long)smcibdev);
578}
579
580int smc_wr_create_link(struct smc_link *lnk)
581{
582 struct ib_device *ibdev = lnk->smcibdev->ibdev;
583 int rc = 0;
584
585 smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
586 lnk->wr_rx_id = 0;
587 lnk->wr_rx_dma_addr = ib_dma_map_single(
588 ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
589 DMA_FROM_DEVICE);
590 if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
591 lnk->wr_rx_dma_addr = 0;
592 rc = -EIO;
593 goto out;
594 }
595 lnk->wr_tx_dma_addr = ib_dma_map_single(
596 ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
597 DMA_TO_DEVICE);
598 if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
599 rc = -EIO;
600 goto dma_unmap;
601 }
602 smc_wr_init_sge(lnk);
603 memset(lnk->wr_tx_mask, 0,
604 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
605 return rc;
606
607dma_unmap:
608 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
609 SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
610 DMA_FROM_DEVICE);
611 lnk->wr_rx_dma_addr = 0;
612out:
613 return rc;
614}
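
Note: receive completions above find their buffer by reducing the monotonically increasing 64-bit work-request id modulo the number of posted receive buffers; that is what the do_div() calls in smc_wr_rx_demultiplex() and smc_wr_rx_post() compute. A tiny standalone illustration:

#include <stdint.h>
#include <stdio.h>

#define WR_RX_CNT 48    /* e.g. SMC_WR_BUF_CNT * 3 receive buffers */

/* map a work-request id onto a receive buffer index */
static unsigned int wr_rx_index(uint64_t wr_id)
{
        return (unsigned int)(wr_id % WR_RX_CNT);
}

int main(void)
{
        uint64_t wr_id;

        /* ids around the first wrap reuse buffers 0, 1, 2, ... again */
        for (wr_id = 46; wr_id <= 50; wr_id++)
                printf("wr_id=%llu -> buffer %u\n",
                       (unsigned long long)wr_id, wr_rx_index(wr_id));
        return 0;
}
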
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
new file mode 100644
index 000000000000..0b9beeda6053
--- /dev/null
+++ b/net/smc/smc_wr.h
@@ -0,0 +1,106 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Work Requests exploiting Infiniband API
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
9 */
10
11#ifndef SMC_WR_H
12#define SMC_WR_H
13
14#include <linux/atomic.h>
15#include <rdma/ib_verbs.h>
16#include <asm/div64.h>
17
18#include "smc.h"
19#include "smc_core.h"
20
21#define SMC_WR_MAX_CQE 32768 /* max. # of completion queue elements */
22#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
23
24#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
25#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
26
27#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
28
29#define SMC_WR_TX_PEND_PRIV_SIZE 32
30
31struct smc_wr_tx_pend_priv {
32 u8 priv[SMC_WR_TX_PEND_PRIV_SIZE];
33};
34
35typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
36 struct smc_link *,
37 enum ib_wc_status);
38
39typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *,
40 unsigned long);
41
42typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *);
43
44struct smc_wr_rx_handler {
45 struct hlist_node list; /* hash table collision resolution */
46 void (*handler)(struct ib_wc *, void *);
47 u8 type;
48};
49
50/* Only used by RDMA write WRs.
51 * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
52 */
53static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
54{
55 return atomic_long_inc_return(&link->wr_tx_id);
56}
57
58static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
59{
60 atomic_long_set(wr_tx_id, val);
61}
62
63/* post a new receive work request to fill a completed old work request entry */
64static inline int smc_wr_rx_post(struct smc_link *link)
65{
66 struct ib_recv_wr *bad_recv_wr = NULL;
67 int rc;
68 u64 wr_id, temp_wr_id;
69 u32 index;
70
71 wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
72 temp_wr_id = wr_id;
73 index = do_div(temp_wr_id, link->wr_rx_cnt);
74 link->wr_rx_ibs[index].wr_id = wr_id;
75 rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr);
76 return rc;
77}
78
79int smc_wr_create_link(struct smc_link *lnk);
80int smc_wr_alloc_link_mem(struct smc_link *lnk);
81void smc_wr_free_link(struct smc_link *lnk);
82void smc_wr_free_link_mem(struct smc_link *lnk);
83void smc_wr_remember_qp_attr(struct smc_link *lnk);
84void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
85void smc_wr_add_dev(struct smc_ib_device *smcibdev);
86
87int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
88 struct smc_wr_buf **wr_buf,
89 struct smc_wr_tx_pend_priv **wr_pend_priv);
90int smc_wr_tx_put_slot(struct smc_link *link,
91 struct smc_wr_tx_pend_priv *wr_pend_priv);
92int smc_wr_tx_send(struct smc_link *link,
93 struct smc_wr_tx_pend_priv *wr_pend_priv);
94void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
95bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
96 smc_wr_tx_filter filter, unsigned long data);
97void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
98 smc_wr_tx_filter filter,
99 smc_wr_tx_dismisser dismisser,
100 unsigned long data);
101
102int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
103int smc_wr_rx_post_init(struct smc_link *link);
104void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
105
106#endif /* SMC_WR_H */
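
Note: the get/put slot functions declared above hand out one of the SMC_WR_BUF_CNT per-link send buffers, tracked by a bit mask (the kernel code uses atomic test_and_set_bit()/test_and_clear_bit() for this). A standalone, non-atomic model of that bookkeeping:

#include <stdbool.h>
#include <stdio.h>

#define BUF_CNT 16      /* mirrors SMC_WR_BUF_CNT */

static bool busy[BUF_CNT];

/* claim the first free slot; -1 corresponds to -EBUSY in the kernel code */
static int get_free_slot(void)
{
        int i;

        for (i = 0; i < BUF_CNT; i++)
                if (!busy[i]) {
                        busy[i] = true;
                        return i;
                }
        return -1;
}

/* release a slot so the next sender can reuse its buffer */
static void put_slot(int i)
{
        if (i >= 0 && i < BUF_CNT)
                busy[i] = false;
}

int main(void)
{
        int a = get_free_slot();
        int b = get_free_slot();

        printf("got slots %d and %d\n", a, b);          /* 0 and 1 */
        put_slot(a);
        printf("next free slot: %d\n", get_free_slot());        /* 0 again */
        return 0;
}
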
diff --git a/net/socket.c b/net/socket.c
index 0758e13754e2..985ef06792d6 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -287,7 +287,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static void init_inodecache(void)
 {
 	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
 					      sizeof(struct socket_alloc),
@@ -296,9 +296,7 @@ static int init_inodecache(void)
 					       SLAB_RECLAIM_ACCOUNT |
 					       SLAB_MEM_SPREAD | SLAB_ACCOUNT),
 					      init_once);
-	if (sock_inode_cachep == NULL)
-		return -ENOMEM;
-	return 0;
+	BUG_ON(sock_inode_cachep == NULL);
 }
 
 static const struct super_operations sockfs_ops = {
@@ -654,6 +652,16 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
 }
 EXPORT_SYMBOL(kernel_sendmsg);
 
+static bool skb_is_err_queue(const struct sk_buff *skb)
+{
+	/* pkt_type of skbs enqueued on the error queue are set to
+	 * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do
+	 * in recvmsg, since skbs received on a local socket will never
+	 * have a pkt_type of PACKET_OUTGOING.
+	 */
+	return skb->pkt_type == PACKET_OUTGOING;
+}
+
 /*
  * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
  */
@@ -697,7 +705,8 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 		put_cmsg(msg, SOL_SOCKET,
 			 SCM_TIMESTAMPING, sizeof(tss), &tss);
 
-		if (skb->len && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS))
+		if (skb_is_err_queue(skb) && skb->len &&
+		    SKB_EXT_ERR(skb)->opt_stats)
 			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
 				 skb->len, skb->data);
 	}
@@ -1508,7 +1517,7 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
 	if (err)
 		goto out_fd;
 
-	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
+	err = sock->ops->accept(sock, newsock, sock->file->f_flags, false);
 	if (err < 0)
 		goto out_fd;
 
@@ -1733,6 +1742,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
 	/* We assume all kernel code knows the size of sockaddr_storage */
 	msg.msg_namelen = 0;
 	msg.msg_iocb = NULL;
+	msg.msg_flags = 0;
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
 	err = sock_recvmsg(sock, &msg, flags);
@@ -1948,6 +1958,8 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 		ctl_buf = msg_sys->msg_control;
 		ctl_len = msg_sys->msg_controllen;
 	} else if (ctl_len) {
+		BUILD_BUG_ON(sizeof(struct cmsghdr) !=
+			     CMSG_ALIGN(sizeof(struct cmsghdr)));
 		if (ctl_len > sizeof(ctl)) {
 			ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
 			if (ctl_buf == NULL)
@@ -2228,8 +2240,10 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 		return err;
 
 	err = sock_error(sock->sk);
-	if (err)
+	if (err) {
+		datagrams = err;
 		goto out_put;
+	}
 
 	entry = mmsg;
 	compat_entry = (struct compat_mmsghdr __user *)mmsg;
@@ -3236,7 +3250,7 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
 	if (err < 0)
 		goto done;
 
-	err = sock->ops->accept(sock, *newsock, flags);
+	err = sock->ops->accept(sock, *newsock, flags, true);
 	if (err < 0) {
 		sock_release(*newsock);
 		*newsock = NULL;
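
Note: the two accept() call-site changes above add a trailing flag that is false on the user-space accept4() path and true in kernel_accept(), so the callback can tell in-kernel accepts apart from user-initiated ones. A standalone illustration of the call shape; the parameter name "kern" is an assumption, since the callback's declaration is not part of this diff.

#include <stdbool.h>
#include <stdio.h>

/* stand-in for the per-protocol accept callback */
static int demo_accept(int flags, bool kern)
{
        printf("accept: flags=%d kern=%d\n", flags, kern);
        return 0;
}

int main(void)
{
        demo_accept(0, false);  /* as called from the accept4() syscall path */
        demo_accept(0, true);   /* as called from kernel_accept() */
        return 0;
}
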
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 41adf362936d..b5c279b22680 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -504,6 +504,7 @@ static int __init strp_mod_init(void)
 
 static void __exit strp_mod_exit(void)
 {
+	destroy_workqueue(strp_wq);
 }
 module_init(strp_mod_init);
 module_exit(strp_mod_exit);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 2bff63a73cf8..d2623b9f23d6 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -8,6 +8,7 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
@@ -464,8 +465,10 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 		 * Note that the cred_unused list must be time-ordered.
 		 */
 		if (time_in_range(cred->cr_expire, expired, jiffies) &&
-		    test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0)
+		    test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
+			freed = SHRINK_STOP;
 			break;
+		}
 
 		list_del_init(&cred->cr_lru);
 		number_cred_unused--;
@@ -520,7 +523,7 @@ static unsigned long
 rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 
 {
-	return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
+	return number_cred_unused * sysctl_vfs_cache_pressure / 100;
 }
 
 static void
@@ -646,9 +649,6 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 	cred->cr_auth = auth;
 	cred->cr_ops = ops;
 	cred->cr_expire = jiffies;
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-	cred->cr_magic = RPCAUTH_CRED_MAGIC;
-#endif
 	cred->cr_uid = acred->uid;
 }
 EXPORT_SYMBOL_GPL(rpcauth_init_cred);
@@ -876,8 +876,12 @@ int __init rpcauth_init_module(void)
 	err = rpc_init_generic_auth();
 	if (err < 0)
 		goto out2;
-	register_shrinker(&rpc_cred_shrinker);
+	err = register_shrinker(&rpc_cred_shrinker);
+	if (err < 0)
+		goto out3;
 	return 0;
+out3:
+	rpc_destroy_generic_auth();
 out2:
 	rpc_destroy_authunix();
 out1:
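
Note: the reordering in rpcauth_cache_shrink_count() above matters because integer division truncates; dividing by 100 first reports zero freeable objects whenever fewer than 100 credentials are unused, while multiplying first keeps small counts visible to the shrinker. A quick standalone check with made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned long number_cred_unused = 50;  /* fewer than 100 cached creds */
        unsigned long pressure = 100;           /* default vfs_cache_pressure */

        printf("old: %lu\n", (number_cred_unused / 100) * pressure);   /* 0 */
        printf("new: %lu\n", number_cred_unused * pressure / 100);     /* 50 */
        return 0;
}
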
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index cdeb1d814833..4f16953e4954 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -763,7 +763,7 @@ err_put_ctx:
 err:
 	kfree(buf);
 out:
-	dprintk("RPC: %s returning %Zd\n", __func__, err);
+	dprintk("RPC: %s returning %zd\n", __func__, err);
 	return err;
 }
 
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index dc6fb79a361f..25d9a9cf7b66 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -260,7 +260,7 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
 	if (!oa->data)
 		return -ENOMEM;
 
-	creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL);
+	creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL);
 	if (!creds) {
 		kfree(oa->data);
 		return -ENOMEM;
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 153082598522..a54a7a3d28f5 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
 	case RPC_GSS_PROC_DESTROY:
 		if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
 			goto auth_err;
-		rsci->h.expiry_time = seconds_since_boot();
-		set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+		/* Delete the entry from the cache_list and call cache_put */
+		sunrpc_cache_unhash(sn->rsc_cache, &rsci->h);
 		if (resv->iov_len + 4 > PAGE_SIZE)
 			goto drop;
 		svc_putnl(resv, RPC_SUCCESS);
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 4d17376b2acb..5f3d527dff65 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -139,7 +139,4 @@ struct rpc_cred null_cred = {
 	.cr_ops = &null_credops,
 	.cr_count = ATOMIC_INIT(1),
 	.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE,
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-	.cr_magic = RPCAUTH_CRED_MAGIC,
-#endif
 };
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 306fc0f54596..82337e1ec9cd 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -14,12 +14,10 @@
 #include <linux/sunrpc/auth.h>
 #include <linux/user_namespace.h>
 
-#define NFS_NGROUPS	16
-
 struct unx_cred {
 	struct rpc_cred uc_base;
 	kgid_t uc_gid;
-	kgid_t uc_gids[NFS_NGROUPS];
+	kgid_t uc_gids[UNX_NGROUPS];
 };
 #define uc_uid uc_base.cr_uid
 
@@ -82,13 +80,13 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
 
 	if (acred->group_info != NULL)
 		groups = acred->group_info->ngroups;
-	if (groups > NFS_NGROUPS)
-		groups = NFS_NGROUPS;
+	if (groups > UNX_NGROUPS)
+		groups = UNX_NGROUPS;
 
 	cred->uc_gid = acred->gid;
 	for (i = 0; i < groups; i++)
 		cred->uc_gids[i] = acred->group_info->gid[i];
-	if (i < NFS_NGROUPS)
+	if (i < UNX_NGROUPS)
 		cred->uc_gids[i] = INVALID_GID;
 
 	return &cred->uc_base;
@@ -132,12 +130,12 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
 
 	if (acred->group_info != NULL)
 		groups = acred->group_info->ngroups;
-	if (groups > NFS_NGROUPS)
-		groups = NFS_NGROUPS;
+	if (groups > UNX_NGROUPS)
+		groups = UNX_NGROUPS;
 	for (i = 0; i < groups ; i++)
 		if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
 			return 0;
-	if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
+	if (groups < UNX_NGROUPS && gid_valid(cred->uc_gids[groups]))
 		return 0;
 	return 1;
 }
@@ -166,7 +164,7 @@ unx_marshal(struct rpc_task *task, __be32 *p)
 	*p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid));
 	*p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid));
 	hold = p++;
-	for (i = 0; i < 16 && gid_valid(cred->uc_gids[i]); i++)
+	for (i = 0; i < UNX_NGROUPS && gid_valid(cred->uc_gids[i]); i++)
 		*p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i]));
 	*hold = htonl(p - hold - 1);	/* gid array length */
 	*base = htonl((p - base - 1) << 2);	/* cred length */
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 8147e8d56eb2..79d55d949d9a 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -362,11 +362,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
 	cache_purge(cd);
 	spin_lock(&cache_list_lock);
 	write_lock(&cd->hash_lock);
-	if (cd->entries) {
-		write_unlock(&cd->hash_lock);
-		spin_unlock(&cache_list_lock);
-		goto out;
-	}
 	if (current_detail == cd)
 		current_detail = NULL;
 	list_del_init(&cd->others);
@@ -376,9 +371,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
376 /* module must be being unloaded so it's safe to kill the worker */ 371 /* module must be being unloaded so it's safe to kill the worker */
377 cancel_delayed_work_sync(&cache_cleaner); 372 cancel_delayed_work_sync(&cache_cleaner);
378 } 373 }
379 return;
380out:
381 printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name);
382} 374}
383EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail); 375EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail);
384 376
@@ -497,13 +489,32 @@ EXPORT_SYMBOL_GPL(cache_flush);
497 489
498void cache_purge(struct cache_detail *detail) 490void cache_purge(struct cache_detail *detail)
499{ 491{
500 time_t now = seconds_since_boot(); 492 struct cache_head *ch = NULL;
501 if (detail->flush_time >= now) 493 struct hlist_head *head = NULL;
502 now = detail->flush_time + 1; 494 struct hlist_node *tmp = NULL;
503 /* 'now' is the maximum value any 'last_refresh' can have */ 495 int i = 0;
504 detail->flush_time = now; 496
505 detail->nextcheck = seconds_since_boot(); 497 write_lock(&detail->hash_lock);
506 cache_flush(); 498 if (!detail->entries) {
499 write_unlock(&detail->hash_lock);
500 return;
501 }
502
503 dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name);
504 for (i = 0; i < detail->hash_size; i++) {
505 head = &detail->hash_table[i];
506 hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
507 hlist_del_init(&ch->cache_list);
508 detail->entries--;
509
510 set_bit(CACHE_CLEANED, &ch->flags);
511 write_unlock(&detail->hash_lock);
512 cache_fresh_unlocked(ch, detail);
513 cache_put(ch, detail);
514 write_lock(&detail->hash_lock);
515 }
516 }
517 write_unlock(&detail->hash_lock);
507} 518}
508EXPORT_SYMBOL_GPL(cache_purge); 519EXPORT_SYMBOL_GPL(cache_purge);
509 520
@@ -717,7 +728,7 @@ void cache_clean_deferred(void *owner)
717/* 728/*
718 * communicate with user-space 729 * communicate with user-space
719 * 730 *
720 * We have a magic /proc file - /proc/sunrpc/<cachename>/channel. 731 * We have a magic /proc file - /proc/net/rpc/<cachename>/channel.
721 * On read, you get a full request, or block. 732 * On read, you get a full request, or block.
722 * On write, an update request is processed. 733 * On write, an update request is processed.
723 * Poll works if anything to read, and always allows write. 734 * Poll works if anything to read, and always allows write.
@@ -1272,7 +1283,7 @@ EXPORT_SYMBOL_GPL(qword_get);
1272 1283
1273 1284
1274/* 1285/*
1275 * support /proc/sunrpc/cache/$CACHENAME/content 1286 * support /proc/net/rpc/$CACHENAME/content
1276 * as a seqfile. 1287 * as a seqfile.
1277 * We call ->cache_show passing NULL for the item to 1288 * We call ->cache_show passing NULL for the item to
1278 * get a header, then pass each real item in the cache 1289 * get a header, then pass each real item in the cache
@@ -1358,7 +1369,7 @@ static int c_show(struct seq_file *m, void *p)
1358 ifdebug(CACHE) 1369 ifdebug(CACHE)
1359 seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", 1370 seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
1360 convert_to_wallclock(cp->expiry_time), 1371 convert_to_wallclock(cp->expiry_time),
1361 atomic_read(&cp->ref.refcount), cp->flags); 1372 kref_read(&cp->ref), cp->flags);
1362 cache_get(cp); 1373 cache_get(cp);
1363 if (cache_check(cd, cp, NULL)) 1374 if (cache_check(cd, cp, NULL))
1364 /* cache_check does a cache_put on failure */ 1375 /* cache_check does a cache_put on failure */
@@ -1427,20 +1438,11 @@ static ssize_t read_flush(struct file *file, char __user *buf,
1427 struct cache_detail *cd) 1438 struct cache_detail *cd)
1428{ 1439{
1429 char tbuf[22]; 1440 char tbuf[22];
1430 unsigned long p = *ppos;
1431 size_t len; 1441 size_t len;
1432 1442
1433 snprintf(tbuf, sizeof(tbuf), "%lu\n", convert_to_wallclock(cd->flush_time)); 1443 len = snprintf(tbuf, sizeof(tbuf), "%lu\n",
1434 len = strlen(tbuf); 1444 convert_to_wallclock(cd->flush_time));
1435 if (p >= len) 1445 return simple_read_from_buffer(buf, count, ppos, tbuf, len);
1436 return 0;
1437 len -= p;
1438 if (len > count)
1439 len = count;
1440 if (copy_to_user(buf, (void*)(tbuf+p), len))
1441 return -EFAULT;
1442 *ppos += len;
1443 return len;
1444} 1446}
1445 1447
1446static ssize_t write_flush(struct file *file, const char __user *buf, 1448static ssize_t write_flush(struct file *file, const char __user *buf,
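The read_flush() rewrite above replaces hand-rolled *ppos/copy_to_user() bookkeeping with simple_read_from_buffer(), which handles the offset, short reads and the user copy itself. A minimal sketch of that pattern for a generic single-value file, assuming a module context; demo_value, demo_read and demo_fops are made-up names, not part of the sunrpc code.

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/module.h>

static unsigned long demo_value;

static ssize_t demo_read(struct file *file, char __user *buf,
			 size_t count, loff_t *ppos)
{
	char tbuf[22];
	size_t len;

	/* Format into a small stack buffer, then let the helper do the
	 * *ppos accounting and the copy to user space. */
	len = snprintf(tbuf, sizeof(tbuf), "%lu\n", demo_value);
	return simple_read_from_buffer(buf, count, ppos, tbuf, len);
}

static const struct file_operations demo_fops = {
	.owner	= THIS_MODULE,
	.read	= demo_read,
};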
@@ -1600,21 +1602,12 @@ static const struct file_operations cache_flush_operations_procfs = {
1600 .llseek = no_llseek, 1602 .llseek = no_llseek,
1601}; 1603};
1602 1604
1603static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net) 1605static void remove_cache_proc_entries(struct cache_detail *cd)
1604{ 1606{
1605 struct sunrpc_net *sn; 1607 if (cd->procfs) {
1606 1608 proc_remove(cd->procfs);
1607 if (cd->u.procfs.proc_ent == NULL) 1609 cd->procfs = NULL;
1608 return; 1610 }
1609 if (cd->u.procfs.flush_ent)
1610 remove_proc_entry("flush", cd->u.procfs.proc_ent);
1611 if (cd->u.procfs.channel_ent)
1612 remove_proc_entry("channel", cd->u.procfs.proc_ent);
1613 if (cd->u.procfs.content_ent)
1614 remove_proc_entry("content", cd->u.procfs.proc_ent);
1615 cd->u.procfs.proc_ent = NULL;
1616 sn = net_generic(net, sunrpc_net_id);
1617 remove_proc_entry(cd->name, sn->proc_net_rpc);
1618} 1611}
1619 1612
1620#ifdef CONFIG_PROC_FS 1613#ifdef CONFIG_PROC_FS
@@ -1624,38 +1617,30 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
1624 struct sunrpc_net *sn; 1617 struct sunrpc_net *sn;
1625 1618
1626 sn = net_generic(net, sunrpc_net_id); 1619 sn = net_generic(net, sunrpc_net_id);
1627 cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc); 1620 cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc);
1628 if (cd->u.procfs.proc_ent == NULL) 1621 if (cd->procfs == NULL)
1629 goto out_nomem; 1622 goto out_nomem;
1630 cd->u.procfs.channel_ent = NULL;
1631 cd->u.procfs.content_ent = NULL;
1632 1623
1633 p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR, 1624 p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
1634 cd->u.procfs.proc_ent, 1625 cd->procfs, &cache_flush_operations_procfs, cd);
1635 &cache_flush_operations_procfs, cd);
1636 cd->u.procfs.flush_ent = p;
1637 if (p == NULL) 1626 if (p == NULL)
1638 goto out_nomem; 1627 goto out_nomem;
1639 1628
1640 if (cd->cache_request || cd->cache_parse) { 1629 if (cd->cache_request || cd->cache_parse) {
1641 p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR, 1630 p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
1642 cd->u.procfs.proc_ent, 1631 cd->procfs, &cache_file_operations_procfs, cd);
1643 &cache_file_operations_procfs, cd);
1644 cd->u.procfs.channel_ent = p;
1645 if (p == NULL) 1632 if (p == NULL)
1646 goto out_nomem; 1633 goto out_nomem;
1647 } 1634 }
1648 if (cd->cache_show) { 1635 if (cd->cache_show) {
1649 p = proc_create_data("content", S_IFREG|S_IRUSR, 1636 p = proc_create_data("content", S_IFREG|S_IRUSR,
1650 cd->u.procfs.proc_ent, 1637 cd->procfs, &content_file_operations_procfs, cd);
1651 &content_file_operations_procfs, cd);
1652 cd->u.procfs.content_ent = p;
1653 if (p == NULL) 1638 if (p == NULL)
1654 goto out_nomem; 1639 goto out_nomem;
1655 } 1640 }
1656 return 0; 1641 return 0;
1657out_nomem: 1642out_nomem:
1658 remove_cache_proc_entries(cd, net); 1643 remove_cache_proc_entries(cd);
1659 return -ENOMEM; 1644 return -ENOMEM;
1660} 1645}
1661#else /* CONFIG_PROC_FS */ 1646#else /* CONFIG_PROC_FS */
@@ -1684,7 +1669,7 @@ EXPORT_SYMBOL_GPL(cache_register_net);
1684 1669
1685void cache_unregister_net(struct cache_detail *cd, struct net *net) 1670void cache_unregister_net(struct cache_detail *cd, struct net *net)
1686{ 1671{
1687 remove_cache_proc_entries(cd, net); 1672 remove_cache_proc_entries(cd);
1688 sunrpc_destroy_cache_detail(cd); 1673 sunrpc_destroy_cache_detail(cd);
1689} 1674}
1690EXPORT_SYMBOL_GPL(cache_unregister_net); 1675EXPORT_SYMBOL_GPL(cache_unregister_net);
@@ -1843,15 +1828,29 @@ int sunrpc_cache_register_pipefs(struct dentry *parent,
1843 struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd); 1828 struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd);
1844 if (IS_ERR(dir)) 1829 if (IS_ERR(dir))
1845 return PTR_ERR(dir); 1830 return PTR_ERR(dir);
1846 cd->u.pipefs.dir = dir; 1831 cd->pipefs = dir;
1847 return 0; 1832 return 0;
1848} 1833}
1849EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs); 1834EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs);
1850 1835
1851void sunrpc_cache_unregister_pipefs(struct cache_detail *cd) 1836void sunrpc_cache_unregister_pipefs(struct cache_detail *cd)
1852{ 1837{
1853 rpc_remove_cache_dir(cd->u.pipefs.dir); 1838 if (cd->pipefs) {
1854 cd->u.pipefs.dir = NULL; 1839 rpc_remove_cache_dir(cd->pipefs);
1840 cd->pipefs = NULL;
1841 }
1855} 1842}
1856EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); 1843EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
1857 1844
1845void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
1846{
1847 write_lock(&cd->hash_lock);
1848 if (!hlist_unhashed(&h->cache_list)) {
1849 hlist_del_init(&h->cache_list);
1850 cd->entries--;
1851 write_unlock(&cd->hash_lock);
1852 cache_put(h, cd);
1853 } else
1854 write_unlock(&cd->hash_lock);
1855}
1856EXPORT_SYMBOL_GPL(sunrpc_cache_unhash);
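sunrpc_cache_unhash() above lets a caller that already holds a reference to a cache_head unhash it so new lookups miss, dropping the hash table's reference in the process; its declaration lives in the sunrpc headers, outside this net/ diff. A hypothetical caller might look like the sketch below; my_cache, item and demo_invalidate are invented names.

#include <linux/sunrpc/cache.h>

static void demo_invalidate(struct cache_detail *my_cache,
			    struct cache_head *item)
{
	/* Unhash the entry so new lookups cannot find it; this also puts
	 * the hash table's reference.  Our own reference is still valid. */
	sunrpc_cache_unhash(my_cache, item);

	/* ... finish using "item" ... */

	cache_put(item, my_cache);	/* release our own reference */
}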
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 1dc9f3bac099..52da3ce54bb5 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1453,21 +1453,6 @@ size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
1453EXPORT_SYMBOL_GPL(rpc_max_bc_payload); 1453EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
1454 1454
1455/** 1455/**
1456 * rpc_get_timeout - Get timeout for transport in units of HZ
1457 * @clnt: RPC client to query
1458 */
1459unsigned long rpc_get_timeout(struct rpc_clnt *clnt)
1460{
1461 unsigned long ret;
1462
1463 rcu_read_lock();
1464 ret = rcu_dereference(clnt->cl_xprt)->timeout->to_initval;
1465 rcu_read_unlock();
1466 return ret;
1467}
1468EXPORT_SYMBOL_GPL(rpc_get_timeout);
1469
1470/**
1471 * rpc_force_rebind - force transport to check that remote port is unchanged 1456 * rpc_force_rebind - force transport to check that remote port is unchanged
1472 * @clnt: client to rebind 1457 * @clnt: client to rebind
1473 * 1458 *
@@ -2699,6 +2684,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
2699{ 2684{
2700 struct rpc_xprt_switch *xps; 2685 struct rpc_xprt_switch *xps;
2701 struct rpc_xprt *xprt; 2686 struct rpc_xprt *xprt;
2687 unsigned long connect_timeout;
2702 unsigned long reconnect_timeout; 2688 unsigned long reconnect_timeout;
2703 unsigned char resvport; 2689 unsigned char resvport;
2704 int ret = 0; 2690 int ret = 0;
@@ -2711,6 +2697,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
2711 return -EAGAIN; 2697 return -EAGAIN;
2712 } 2698 }
2713 resvport = xprt->resvport; 2699 resvport = xprt->resvport;
2700 connect_timeout = xprt->connect_timeout;
2714 reconnect_timeout = xprt->max_reconnect_timeout; 2701 reconnect_timeout = xprt->max_reconnect_timeout;
2715 rcu_read_unlock(); 2702 rcu_read_unlock();
2716 2703
@@ -2720,7 +2707,10 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
2720 goto out_put_switch; 2707 goto out_put_switch;
2721 } 2708 }
2722 xprt->resvport = resvport; 2709 xprt->resvport = resvport;
2723 xprt->max_reconnect_timeout = reconnect_timeout; 2710 if (xprt->ops->set_connect_timeout != NULL)
2711 xprt->ops->set_connect_timeout(xprt,
2712 connect_timeout,
2713 reconnect_timeout);
2724 2714
2725 rpc_xprt_switch_set_roundrobin(xps); 2715 rpc_xprt_switch_set_roundrobin(xps);
2726 if (setup) { 2716 if (setup) {
@@ -2737,26 +2727,39 @@ out_put_switch:
2737} 2727}
2738EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt); 2728EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
2739 2729
2730struct connect_timeout_data {
2731 unsigned long connect_timeout;
2732 unsigned long reconnect_timeout;
2733};
2734
2740static int 2735static int
2741rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt, 2736rpc_xprt_set_connect_timeout(struct rpc_clnt *clnt,
2742 struct rpc_xprt *xprt, 2737 struct rpc_xprt *xprt,
2743 void *data) 2738 void *data)
2744{ 2739{
2745 unsigned long timeout = *((unsigned long *)data); 2740 struct connect_timeout_data *timeo = data;
2746 2741
2747 if (timeout < xprt->max_reconnect_timeout) 2742 if (xprt->ops->set_connect_timeout)
2748 xprt->max_reconnect_timeout = timeout; 2743 xprt->ops->set_connect_timeout(xprt,
2744 timeo->connect_timeout,
2745 timeo->reconnect_timeout);
2749 return 0; 2746 return 0;
2750} 2747}
2751 2748
2752void 2749void
2753rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) 2750rpc_set_connect_timeout(struct rpc_clnt *clnt,
2751 unsigned long connect_timeout,
2752 unsigned long reconnect_timeout)
2754{ 2753{
2754 struct connect_timeout_data timeout = {
2755 .connect_timeout = connect_timeout,
2756 .reconnect_timeout = reconnect_timeout,
2757 };
2755 rpc_clnt_iterate_for_each_xprt(clnt, 2758 rpc_clnt_iterate_for_each_xprt(clnt,
2756 rpc_xprt_cap_max_reconnect_timeout, 2759 rpc_xprt_set_connect_timeout,
2757 &timeo); 2760 &timeout);
2758} 2761}
2759EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); 2762EXPORT_SYMBOL_GPL(rpc_set_connect_timeout);
2760 2763
2761void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) 2764void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
2762{ 2765{
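rpc_cap_max_reconnect_timeout() could only lower the reconnect backoff cap; the new rpc_set_connect_timeout() pushes both a connect timeout and a reconnect cap to every transport through the transport's ->set_connect_timeout op (and rpc_clnt_add_xprt() now copies both onto newly added transports). Below is a hedged sketch of a hypothetical upper-layer caller converting seconds to jiffies; the function's declaration lands in the sunrpc client headers as part of this change and demo_retune_timeouts is an invented name.

#include <linux/jiffies.h>
#include <linux/sunrpc/clnt.h>

static void demo_retune_timeouts(struct rpc_clnt *clnt,
				 unsigned int connect_secs,
				 unsigned int reconnect_secs)
{
	/* Apply the new timeouts to every transport of this client. */
	rpc_set_connect_timeout(clnt,
				connect_secs * HZ,
				reconnect_secs * HZ);
}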
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index e7b4d93566df..c8fd0b6c1618 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -16,11 +16,6 @@ static struct dentry *rpc_xprt_dir;
16 16
17unsigned int rpc_inject_disconnect; 17unsigned int rpc_inject_disconnect;
18 18
19struct rpc_clnt_iter {
20 struct rpc_clnt *clnt;
21 loff_t pos;
22};
23
24static int 19static int
25tasks_show(struct seq_file *f, void *v) 20tasks_show(struct seq_file *f, void *v)
26{ 21{
@@ -47,12 +42,10 @@ static void *
47tasks_start(struct seq_file *f, loff_t *ppos) 42tasks_start(struct seq_file *f, loff_t *ppos)
48 __acquires(&clnt->cl_lock) 43 __acquires(&clnt->cl_lock)
49{ 44{
50 struct rpc_clnt_iter *iter = f->private; 45 struct rpc_clnt *clnt = f->private;
51 loff_t pos = *ppos; 46 loff_t pos = *ppos;
52 struct rpc_clnt *clnt = iter->clnt;
53 struct rpc_task *task; 47 struct rpc_task *task;
54 48
55 iter->pos = pos + 1;
56 spin_lock(&clnt->cl_lock); 49 spin_lock(&clnt->cl_lock);
57 list_for_each_entry(task, &clnt->cl_tasks, tk_task) 50 list_for_each_entry(task, &clnt->cl_tasks, tk_task)
58 if (pos-- == 0) 51 if (pos-- == 0)
@@ -63,12 +56,10 @@ tasks_start(struct seq_file *f, loff_t *ppos)
63static void * 56static void *
64tasks_next(struct seq_file *f, void *v, loff_t *pos) 57tasks_next(struct seq_file *f, void *v, loff_t *pos)
65{ 58{
66 struct rpc_clnt_iter *iter = f->private; 59 struct rpc_clnt *clnt = f->private;
67 struct rpc_clnt *clnt = iter->clnt;
68 struct rpc_task *task = v; 60 struct rpc_task *task = v;
69 struct list_head *next = task->tk_task.next; 61 struct list_head *next = task->tk_task.next;
70 62
71 ++iter->pos;
72 ++*pos; 63 ++*pos;
73 64
74 /* If there's another task on list, return it */ 65 /* If there's another task on list, return it */
@@ -81,9 +72,7 @@ static void
81tasks_stop(struct seq_file *f, void *v) 72tasks_stop(struct seq_file *f, void *v)
82 __releases(&clnt->cl_lock) 73 __releases(&clnt->cl_lock)
83{ 74{
84 struct rpc_clnt_iter *iter = f->private; 75 struct rpc_clnt *clnt = f->private;
85 struct rpc_clnt *clnt = iter->clnt;
86
87 spin_unlock(&clnt->cl_lock); 76 spin_unlock(&clnt->cl_lock);
88} 77}
89 78
@@ -96,17 +85,13 @@ static const struct seq_operations tasks_seq_operations = {
96 85
97static int tasks_open(struct inode *inode, struct file *filp) 86static int tasks_open(struct inode *inode, struct file *filp)
98{ 87{
99 int ret = seq_open_private(filp, &tasks_seq_operations, 88 int ret = seq_open(filp, &tasks_seq_operations);
100 sizeof(struct rpc_clnt_iter));
101
102 if (!ret) { 89 if (!ret) {
103 struct seq_file *seq = filp->private_data; 90 struct seq_file *seq = filp->private_data;
104 struct rpc_clnt_iter *iter = seq->private; 91 struct rpc_clnt *clnt = seq->private = inode->i_private;
105
106 iter->clnt = inode->i_private;
107 92
108 if (!atomic_inc_not_zero(&iter->clnt->cl_count)) { 93 if (!atomic_inc_not_zero(&clnt->cl_count)) {
109 seq_release_private(inode, filp); 94 seq_release(inode, filp);
110 ret = -EINVAL; 95 ret = -EINVAL;
111 } 96 }
112 } 97 }
@@ -118,10 +103,10 @@ static int
118tasks_release(struct inode *inode, struct file *filp) 103tasks_release(struct inode *inode, struct file *filp)
119{ 104{
120 struct seq_file *seq = filp->private_data; 105 struct seq_file *seq = filp->private_data;
121 struct rpc_clnt_iter *iter = seq->private; 106 struct rpc_clnt *clnt = seq->private;
122 107
123 rpc_release_client(iter->clnt); 108 rpc_release_client(clnt);
124 return seq_release_private(inode, filp); 109 return seq_release(inode, filp);
125} 110}
126 111
127static const struct file_operations tasks_fops = { 112static const struct file_operations tasks_fops = {
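The debugfs hunks drop the rpc_clnt_iter wrapper because the seq_file's ->private can point straight at the rpc_clnt stashed in inode->i_private. The generic shape of that pattern is sketched below; demo_open, demo_release, demo_fops and demo_seq_ops are placeholders, not sunrpc symbols.

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

extern const struct seq_operations demo_seq_ops;

static int demo_open(struct inode *inode, struct file *filp)
{
	int ret = seq_open(filp, &demo_seq_ops);

	if (!ret) {
		struct seq_file *seq = filp->private_data;

		seq->private = inode->i_private;	/* object being shown */
	}
	return ret;
}

static int demo_release(struct inode *inode, struct file *filp)
{
	return seq_release(inode, filp);
}

static const struct file_operations demo_fops = {
	.owner		= THIS_MODULE,
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= demo_release,
};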
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 75f290bddca1..a08aeb56b8e4 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -11,7 +11,7 @@
11 */ 11 */
12 12
13#include <linux/linkage.h> 13#include <linux/linkage.h>
14#include <linux/sched.h> 14#include <linux/sched/signal.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/in.h> 17#include <linux/in.h>
@@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv)
385 for (i = 0; i < progp->pg_nvers; i++) { 385 for (i = 0; i < progp->pg_nvers; i++) {
386 if (progp->pg_vers[i] == NULL) 386 if (progp->pg_vers[i] == NULL)
387 continue; 387 continue;
388 if (progp->pg_vers[i]->vs_hidden == 0) 388 if (!progp->pg_vers[i]->vs_hidden)
389 return 1; 389 return 1;
390 } 390 }
391 } 391 }
@@ -976,6 +976,13 @@ int svc_register(const struct svc_serv *serv, struct net *net,
976 if (vers->vs_hidden) 976 if (vers->vs_hidden)
977 continue; 977 continue;
978 978
979 /*
980 * Don't register a UDP port if we need congestion
981 * control.
982 */
983 if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP)
984 continue;
985
979 error = __svc_register(net, progp->pg_name, progp->pg_prog, 986 error = __svc_register(net, progp->pg_name, progp->pg_prog,
980 i, family, proto, port); 987 i, family, proto, port);
981 988
@@ -1169,6 +1176,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
1169 !(versp = progp->pg_vers[vers])) 1176 !(versp = progp->pg_vers[vers]))
1170 goto err_bad_vers; 1177 goto err_bad_vers;
1171 1178
1179 /*
1180 * Some protocol versions (namely NFSv4) require some form of
1181 * congestion control. (See RFC 7530 section 3.1 paragraph 2)
1182 * In other words, UDP is not allowed. We mark those when setting
1183 * up the svc_xprt, and verify that here.
1184 *
1185 * The spec is not very clear about what error should be returned
1186 * when someone tries to access a server that is listening on UDP
1187 * for lower versions. RPC_PROG_MISMATCH seems to be the closest
1188 * fit.
1189 */
1190 if (versp->vs_need_cong_ctrl &&
1191 !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
1192 goto err_bad_vers;
1193
1172 procp = versp->vs_proc + proc; 1194 procp = versp->vs_proc + proc;
1173 if (proc >= versp->vs_nproc || !procp->pc_func) 1195 if (proc >= versp->vs_nproc || !procp->pc_func)
1174 goto err_bad_proc; 1196 goto err_bad_proc;
@@ -1260,7 +1282,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
1260 return 0; 1282 return 0;
1261 1283
1262err_short_len: 1284err_short_len:
1263 svc_printk(rqstp, "short len %Zd, dropping request\n", 1285 svc_printk(rqstp, "short len %zd, dropping request\n",
1264 argv->iov_len); 1286 argv->iov_len);
1265 goto close; 1287 goto close;
1266 1288
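The svc changes above introduce a per-version vs_need_cong_ctrl flag: such versions are skipped when registering UDP ports with rpcbind, and requests for them are rejected unless the transport has XPT_CONG_CTRL set (the TCP and RDMA transports set that bit in companion hunks). A sketch of how a program version might opt in follows; the struct svc_version field names are assumed from this series, and the procedure table and xdrsize are placeholders.

#include <linux/sunrpc/svc.h>

extern struct svc_procedure demo_procedures4[];

static struct svc_version demo_version4 = {
	.vs_vers		= 4,
	.vs_nproc		= 2,
	.vs_proc		= demo_procedures4,
	.vs_xdrsize		= 1024,
	.vs_need_cong_ctrl	= true,	/* no UDP; require XPT_CONG_CTRL */
};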
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 9c9db55a0c1e..7bfe1fb42add 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
490 svc_xprt_get(xprt); 490 svc_xprt_get(xprt);
491 491
492 dprintk("svc: transport %p dequeued, inuse=%d\n", 492 dprintk("svc: transport %p dequeued, inuse=%d\n",
493 xprt, atomic_read(&xprt->xpt_ref.refcount)); 493 xprt, kref_read(&xprt->xpt_ref));
494 } 494 }
495 spin_unlock_bh(&pool->sp_lock); 495 spin_unlock_bh(&pool->sp_lock);
496out: 496out:
@@ -822,7 +822,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
822 /* XPT_DATA|XPT_DEFERRED case: */ 822 /* XPT_DATA|XPT_DEFERRED case: */
823 dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", 823 dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
824 rqstp, rqstp->rq_pool->sp_id, xprt, 824 rqstp, rqstp->rq_pool->sp_id, xprt,
825 atomic_read(&xprt->xpt_ref.refcount)); 825 kref_read(&xprt->xpt_ref));
826 rqstp->rq_deferred = svc_deferred_dequeue(xprt); 826 rqstp->rq_deferred = svc_deferred_dequeue(xprt);
827 if (rqstp->rq_deferred) 827 if (rqstp->rq_deferred)
828 len = svc_deferred_recv(rqstp); 828 len = svc_deferred_recv(rqstp);
@@ -980,7 +980,7 @@ static void svc_age_temp_xprts(unsigned long closure)
980 * through, close it. */ 980 * through, close it. */
981 if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) 981 if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
982 continue; 982 continue;
983 if (atomic_read(&xprt->xpt_ref.refcount) > 1 || 983 if (kref_read(&xprt->xpt_ref) > 1 ||
984 test_bit(XPT_BUSY, &xprt->xpt_flags)) 984 test_bit(XPT_BUSY, &xprt->xpt_flags))
985 continue; 985 continue;
986 list_del_init(le); 986 list_del_init(le);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index e112da8005b5..bb8db3cb8032 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -126,13 +126,18 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister);
126static struct hlist_head auth_domain_table[DN_HASHMAX]; 126static struct hlist_head auth_domain_table[DN_HASHMAX];
127static DEFINE_SPINLOCK(auth_domain_lock); 127static DEFINE_SPINLOCK(auth_domain_lock);
128 128
129static void auth_domain_release(struct kref *kref)
130{
131 struct auth_domain *dom = container_of(kref, struct auth_domain, ref);
132
133 hlist_del(&dom->hash);
134 dom->flavour->domain_release(dom);
135 spin_unlock(&auth_domain_lock);
136}
137
129void auth_domain_put(struct auth_domain *dom) 138void auth_domain_put(struct auth_domain *dom)
130{ 139{
131 if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) { 140 kref_put_lock(&dom->ref, auth_domain_release, &auth_domain_lock);
132 hlist_del(&dom->hash);
133 dom->flavour->domain_release(dom);
134 spin_unlock(&auth_domain_lock);
135 }
136} 141}
137EXPORT_SYMBOL_GPL(auth_domain_put); 142EXPORT_SYMBOL_GPL(auth_domain_put);
138 143
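auth_domain_put() now uses kref_put_lock(), which takes the spinlock only when the refcount is about to reach zero and invokes the release callback with the lock held, matching the old atomic_dec_and_lock() open-coding. The general pattern, on a made-up demo type:

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);

struct demo_obj {
	struct kref		ref;
	struct list_head	list;
};

static void demo_release(struct kref *kref)
{
	struct demo_obj *obj = container_of(kref, struct demo_obj, ref);

	list_del(&obj->list);		/* still under demo_lock */
	spin_unlock(&demo_lock);	/* release callback drops the lock */
	kfree(obj);
}

static void demo_put(struct demo_obj *obj)
{
	/* Takes demo_lock only if this put drops the last reference. */
	kref_put_lock(&obj->ref, demo_release, &demo_lock);
}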
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 64af4f034de6..f81eaa8e0888 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -403,7 +403,7 @@ svcauth_unix_info_release(struct svc_xprt *xpt)
403/**************************************************************************** 403/****************************************************************************
404 * auth.unix.gid cache 404 * auth.unix.gid cache
405 * simple cache to map a UID to a list of GIDs 405 * simple cache to map a UID to a list of GIDs
406 * because AUTH_UNIX aka AUTH_SYS has a max of 16 406 * because AUTH_UNIX aka AUTH_SYS has a max of UNX_NGROUPS
407 */ 407 */
408#define GID_HASHBITS 8 408#define GID_HASHBITS 8
409#define GID_HASHMAX (1<<GID_HASHBITS) 409#define GID_HASHMAX (1<<GID_HASHBITS)
@@ -810,7 +810,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
810 cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */ 810 cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */
811 cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */ 811 cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */
812 slen = svc_getnl(argv); /* gids length */ 812 slen = svc_getnl(argv); /* gids length */
813 if (slen > 16 || (len -= (slen + 2)*4) < 0) 813 if (slen > UNX_NGROUPS || (len -= (slen + 2)*4) < 0)
814 goto badcred; 814 goto badcred;
815 cred->cr_group_info = groups_alloc(slen); 815 cred->cr_group_info = groups_alloc(slen);
816 if (cred->cr_group_info == NULL) 816 if (cred->cr_group_info == NULL)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index de066acdb34e..2b720fa35c4f 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -278,7 +278,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
278 rqstp->rq_respages[0], tailoff); 278 rqstp->rq_respages[0], tailoff);
279 279
280out: 280out:
281 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", 281 dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n",
282 svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, 282 svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
283 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); 283 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
284 284
@@ -346,7 +346,7 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
346 if (len == buflen) 346 if (len == buflen)
347 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); 347 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
348 348
349 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", 349 dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n",
350 svsk, iov[0].iov_base, iov[0].iov_len, len); 350 svsk, iov[0].iov_base, iov[0].iov_len, len);
351 return len; 351 return len;
352} 352}
@@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
1306 svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class, 1306 svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
1307 &svsk->sk_xprt, serv); 1307 &svsk->sk_xprt, serv);
1308 set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); 1308 set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
1309 set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
1309 if (sk->sk_state == TCP_LISTEN) { 1310 if (sk->sk_state == TCP_LISTEN) {
1310 dprintk("setting up TCP socket for listening\n"); 1311 dprintk("setting up TCP socket for listening\n");
1311 set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); 1312 set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
@@ -1634,6 +1635,7 @@ static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv,
1634 1635
1635 xprt = &svsk->sk_xprt; 1636 xprt = &svsk->sk_xprt;
1636 svc_xprt_init(net, &svc_tcp_bc_class, xprt, serv); 1637 svc_xprt_init(net, &svc_tcp_bc_class, xprt, serv);
1638 set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
1637 1639
1638 serv->sv_bc_xprt = xprt; 1640 serv->sv_bc_xprt = xprt;
1639 1641
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 7f1071e103ca..1f7082144e01 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1518,3 +1518,37 @@ out:
1518} 1518}
1519EXPORT_SYMBOL_GPL(xdr_process_buf); 1519EXPORT_SYMBOL_GPL(xdr_process_buf);
1520 1520
1521/**
1522 * xdr_stream_decode_string_dup - Decode and duplicate variable length string
1523 * @xdr: pointer to xdr_stream
1524 * @str: location to store pointer to string
1525 * @maxlen: maximum acceptable string length
1526 * @gfp_flags: GFP mask to use
1527 *
1528 * Return values:
1529 * On success, returns length of NUL-terminated string stored in *@str
1530 * %-EBADMSG on XDR buffer overflow
1531 * %-EMSGSIZE if the size of the string would exceed @maxlen
1532 * %-ENOMEM on memory allocation failure
1533 */
1534ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
1535 size_t maxlen, gfp_t gfp_flags)
1536{
1537 void *p;
1538 ssize_t ret;
1539
1540 ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
1541 if (ret > 0) {
1542 char *s = kmalloc(ret + 1, gfp_flags);
1543 if (s != NULL) {
1544 memcpy(s, p, ret);
1545 s[ret] = '\0';
1546 *str = s;
1547 return strlen(s);
1548 }
1549 ret = -ENOMEM;
1550 }
1551 *str = NULL;
1552 return ret;
1553}
1554EXPORT_SYMBOL_GPL(xdr_stream_decode_string_dup);
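A hedged usage sketch for xdr_stream_decode_string_dup(): decode a counted string into a freshly allocated, NUL-terminated buffer that the caller later kfree()s. The xdr_stream is assumed to have been set up by the caller; DEMO_MAXLEN and demo_decode_name are invented names, and the declaration itself lands in the sunrpc XDR headers as part of this change.

#include <linux/slab.h>
#include <linux/sunrpc/xdr.h>

#define DEMO_MAXLEN 1024

static int demo_decode_name(struct xdr_stream *xdr, char **namep)
{
	ssize_t len;

	len = xdr_stream_decode_string_dup(xdr, namep, DEMO_MAXLEN,
					   GFP_KERNEL);
	if (len < 0)
		return len;	/* -EBADMSG, -EMSGSIZE or -ENOMEM */

	/* *namep now points to a NUL-terminated copy of len bytes;
	 * the caller eventually kfree()s it. */
	return 0;
}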
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 9a6be030ca7d..b530a2852ba8 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -897,13 +897,11 @@ static void xprt_timer(struct rpc_task *task)
897 return; 897 return;
898 dprintk("RPC: %5u xprt_timer\n", task->tk_pid); 898 dprintk("RPC: %5u xprt_timer\n", task->tk_pid);
899 899
900 spin_lock_bh(&xprt->transport_lock);
901 if (!req->rq_reply_bytes_recvd) { 900 if (!req->rq_reply_bytes_recvd) {
902 if (xprt->ops->timer) 901 if (xprt->ops->timer)
903 xprt->ops->timer(xprt, task); 902 xprt->ops->timer(xprt, task);
904 } else 903 } else
905 task->tk_status = 0; 904 task->tk_status = 0;
906 spin_unlock_bh(&xprt->transport_lock);
907} 905}
908 906
909/** 907/**
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 1ebb09e1ac4f..59e64025ed96 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -310,10 +310,7 @@ fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
310 struct rpcrdma_mw *mw; 310 struct rpcrdma_mw *mw;
311 311
312 while (!list_empty(&req->rl_registered)) { 312 while (!list_empty(&req->rl_registered)) {
313 mw = list_first_entry(&req->rl_registered, 313 mw = rpcrdma_pop_mw(&req->rl_registered);
314 struct rpcrdma_mw, mw_list);
315 list_del_init(&mw->mw_list);
316
317 if (sync) 314 if (sync)
318 fmr_op_recover_mr(mw); 315 fmr_op_recover_mr(mw);
319 else 316 else
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 47bed5333c7f..f81dd93176c0 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -466,8 +466,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
466 struct ib_send_wr *first, **prev, *last, *bad_wr; 466 struct ib_send_wr *first, **prev, *last, *bad_wr;
467 struct rpcrdma_rep *rep = req->rl_reply; 467 struct rpcrdma_rep *rep = req->rl_reply;
468 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 468 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
469 struct rpcrdma_mw *mw, *tmp;
470 struct rpcrdma_frmr *f; 469 struct rpcrdma_frmr *f;
470 struct rpcrdma_mw *mw;
471 int count, rc; 471 int count, rc;
472 472
473 dprintk("RPC: %s: req %p\n", __func__, req); 473 dprintk("RPC: %s: req %p\n", __func__, req);
@@ -534,10 +534,10 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
534 * them to the free MW list. 534 * them to the free MW list.
535 */ 535 */
536unmap: 536unmap:
537 list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { 537 while (!list_empty(&req->rl_registered)) {
538 mw = rpcrdma_pop_mw(&req->rl_registered);
538 dprintk("RPC: %s: DMA unmapping frmr %p\n", 539 dprintk("RPC: %s: DMA unmapping frmr %p\n",
539 __func__, &mw->frmr); 540 __func__, &mw->frmr);
540 list_del_init(&mw->mw_list);
541 ib_dma_unmap_sg(ia->ri_device, 541 ib_dma_unmap_sg(ia->ri_device,
542 mw->mw_sg, mw->mw_nents, mw->mw_dir); 542 mw->mw_sg, mw->mw_nents, mw->mw_dir);
543 rpcrdma_put_mw(r_xprt, mw); 543 rpcrdma_put_mw(r_xprt, mw);
@@ -571,10 +571,7 @@ frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
571 struct rpcrdma_mw *mw; 571 struct rpcrdma_mw *mw;
572 572
573 while (!list_empty(&req->rl_registered)) { 573 while (!list_empty(&req->rl_registered)) {
574 mw = list_first_entry(&req->rl_registered, 574 mw = rpcrdma_pop_mw(&req->rl_registered);
575 struct rpcrdma_mw, mw_list);
576 list_del_init(&mw->mw_list);
577
578 if (sync) 575 if (sync)
579 frwr_op_recover_mr(mw); 576 frwr_op_recover_mr(mw);
580 else 577 else
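The fmr and frwr hunks above replace open-coded list_first_entry()/list_del_init() sequences with rpcrdma_pop_mw()/rpcrdma_push_mw() helpers defined in xprt_rdma.h, which is outside the hunks shown here. Their presumed shape, on a stand-in demo_mw type rather than the real struct rpcrdma_mw, is roughly:

#include <linux/list.h>

struct demo_mw {
	struct list_head mw_list;
	/* ... registration state elided ... */
};

static inline struct demo_mw *demo_pop_mw(struct list_head *list)
{
	struct demo_mw *mw;

	mw = list_first_entry(list, struct demo_mw, mw_list);
	list_del(&mw->mw_list);
	return mw;
}

static inline void demo_push_mw(struct demo_mw *mw, struct list_head *list)
{
	list_add_tail(&mw->mw_list, list);
}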
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c52e0f2ffe52..a044be2d6ad7 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -125,14 +125,34 @@ void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
125/* The client can send a request inline as long as the RPCRDMA header 125/* The client can send a request inline as long as the RPCRDMA header
126 * plus the RPC call fit under the transport's inline limit. If the 126 * plus the RPC call fit under the transport's inline limit. If the
127 * combined call message size exceeds that limit, the client must use 127 * combined call message size exceeds that limit, the client must use
128 * the read chunk list for this operation. 128 * a Read chunk for this operation.
129 *
130 * A Read chunk is also required if sending the RPC call inline would
131 * exceed this device's max_sge limit.
129 */ 132 */
130static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, 133static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
131 struct rpc_rqst *rqst) 134 struct rpc_rqst *rqst)
132{ 135{
133 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 136 struct xdr_buf *xdr = &rqst->rq_snd_buf;
137 unsigned int count, remaining, offset;
138
139 if (xdr->len > r_xprt->rx_ia.ri_max_inline_write)
140 return false;
141
142 if (xdr->page_len) {
143 remaining = xdr->page_len;
144 offset = xdr->page_base & ~PAGE_MASK;
145 count = 0;
146 while (remaining) {
147 remaining -= min_t(unsigned int,
148 PAGE_SIZE - offset, remaining);
149 offset = 0;
150 if (++count > r_xprt->rx_ia.ri_max_send_sges)
151 return false;
152 }
153 }
134 154
135 return rqst->rq_snd_buf.len <= ia->ri_max_inline_write; 155 return true;
136} 156}
137 157
138/* The client can't know how large the actual reply will be. Thus it 158/* The client can't know how large the actual reply will be. Thus it
@@ -186,9 +206,9 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
186 */ 206 */
187 207
188static int 208static int
189rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, 209rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
190 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, 210 unsigned int pos, enum rpcrdma_chunktype type,
191 bool reminv_expected) 211 struct rpcrdma_mr_seg *seg)
192{ 212{
193 int len, n, p, page_base; 213 int len, n, p, page_base;
194 struct page **ppages; 214 struct page **ppages;
@@ -226,22 +246,21 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
226 if (len && n == RPCRDMA_MAX_SEGS) 246 if (len && n == RPCRDMA_MAX_SEGS)
227 goto out_overflow; 247 goto out_overflow;
228 248
229 /* When encoding the read list, the tail is always sent inline */ 249 /* When encoding a Read chunk, the tail iovec contains an
230 if (type == rpcrdma_readch) 250 * XDR pad and may be omitted.
251 */
252 if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
231 return n; 253 return n;
232 254
233 /* When encoding the Write list, some servers need to see an extra 255 /* When encoding a Write chunk, some servers need to see an
234 * segment for odd-length Write chunks. The upper layer provides 256 * extra segment for non-XDR-aligned Write chunks. The upper
235 * space in the tail iovec for this purpose. 257 * layer provides space in the tail iovec that may be used
258 * for this purpose.
236 */ 259 */
237 if (type == rpcrdma_writech && reminv_expected) 260 if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
238 return n; 261 return n;
239 262
240 if (xdrbuf->tail[0].iov_len) { 263 if (xdrbuf->tail[0].iov_len) {
241 /* the rpcrdma protocol allows us to omit any trailing
242 * xdr pad bytes, saving the server an RDMA operation. */
243 if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
244 return n;
245 n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); 264 n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
246 if (n == RPCRDMA_MAX_SEGS) 265 if (n == RPCRDMA_MAX_SEGS)
247 goto out_overflow; 266 goto out_overflow;
@@ -293,7 +312,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
293 if (rtype == rpcrdma_areadch) 312 if (rtype == rpcrdma_areadch)
294 pos = 0; 313 pos = 0;
295 seg = req->rl_segments; 314 seg = req->rl_segments;
296 nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false); 315 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
316 rtype, seg);
297 if (nsegs < 0) 317 if (nsegs < 0)
298 return ERR_PTR(nsegs); 318 return ERR_PTR(nsegs);
299 319
@@ -302,7 +322,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
302 false, &mw); 322 false, &mw);
303 if (n < 0) 323 if (n < 0)
304 return ERR_PTR(n); 324 return ERR_PTR(n);
305 list_add(&mw->mw_list, &req->rl_registered); 325 rpcrdma_push_mw(mw, &req->rl_registered);
306 326
307 *iptr++ = xdr_one; /* item present */ 327 *iptr++ = xdr_one; /* item present */
308 328
@@ -355,10 +375,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
355 } 375 }
356 376
357 seg = req->rl_segments; 377 seg = req->rl_segments;
358 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 378 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
359 rqst->rq_rcv_buf.head[0].iov_len, 379 rqst->rq_rcv_buf.head[0].iov_len,
360 wtype, seg, 380 wtype, seg);
361 r_xprt->rx_ia.ri_reminv_expected);
362 if (nsegs < 0) 381 if (nsegs < 0)
363 return ERR_PTR(nsegs); 382 return ERR_PTR(nsegs);
364 383
@@ -371,7 +390,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
371 true, &mw); 390 true, &mw);
372 if (n < 0) 391 if (n < 0)
373 return ERR_PTR(n); 392 return ERR_PTR(n);
374 list_add(&mw->mw_list, &req->rl_registered); 393 rpcrdma_push_mw(mw, &req->rl_registered);
375 394
376 iptr = xdr_encode_rdma_segment(iptr, mw); 395 iptr = xdr_encode_rdma_segment(iptr, mw);
377 396
@@ -423,8 +442,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
423 } 442 }
424 443
425 seg = req->rl_segments; 444 seg = req->rl_segments;
426 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, 445 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
427 r_xprt->rx_ia.ri_reminv_expected);
428 if (nsegs < 0) 446 if (nsegs < 0)
429 return ERR_PTR(nsegs); 447 return ERR_PTR(nsegs);
430 448
@@ -437,7 +455,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
437 true, &mw); 455 true, &mw);
438 if (n < 0) 456 if (n < 0)
439 return ERR_PTR(n); 457 return ERR_PTR(n);
440 list_add(&mw->mw_list, &req->rl_registered); 458 rpcrdma_push_mw(mw, &req->rl_registered);
441 459
442 iptr = xdr_encode_rdma_segment(iptr, mw); 460 iptr = xdr_encode_rdma_segment(iptr, mw);
443 461
@@ -741,13 +759,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
741 iptr = headerp->rm_body.rm_chunks; 759 iptr = headerp->rm_body.rm_chunks;
742 iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); 760 iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
743 if (IS_ERR(iptr)) 761 if (IS_ERR(iptr))
744 goto out_unmap; 762 goto out_err;
745 iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); 763 iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
746 if (IS_ERR(iptr)) 764 if (IS_ERR(iptr))
747 goto out_unmap; 765 goto out_err;
748 iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); 766 iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
749 if (IS_ERR(iptr)) 767 if (IS_ERR(iptr))
750 goto out_unmap; 768 goto out_err;
751 hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; 769 hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
752 770
753 dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", 771 dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
@@ -758,12 +776,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
758 if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, 776 if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
759 &rqst->rq_snd_buf, rtype)) { 777 &rqst->rq_snd_buf, rtype)) {
760 iptr = ERR_PTR(-EIO); 778 iptr = ERR_PTR(-EIO);
761 goto out_unmap; 779 goto out_err;
762 } 780 }
763 return 0; 781 return 0;
764 782
765out_unmap: 783out_err:
766 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); 784 pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
785 PTR_ERR(iptr));
786 r_xprt->rx_stats.failed_marshal_count++;
767 return PTR_ERR(iptr); 787 return PTR_ERR(iptr);
768} 788}
769 789
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 288e35c2d8f4..ff1df40f0d26 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -4,6 +4,7 @@
4 * Support for backward direction RPCs on RPC/RDMA (server-side). 4 * Support for backward direction RPCs on RPC/RDMA (server-side).
5 */ 5 */
6 6
7#include <linux/module.h>
7#include <linux/sunrpc/svc_rdma.h> 8#include <linux/sunrpc/svc_rdma.h>
8#include "xprt_rdma.h" 9#include "xprt_rdma.h"
9 10
@@ -200,19 +201,20 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
200{ 201{
201 struct rpc_xprt *xprt = rqst->rq_xprt; 202 struct rpc_xprt *xprt = rqst->rq_xprt;
202 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 203 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
203 struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; 204 __be32 *p;
204 int rc; 205 int rc;
205 206
206 /* Space in the send buffer for an RPC/RDMA header is reserved 207 /* Space in the send buffer for an RPC/RDMA header is reserved
207 * via xprt->tsh_size. 208 * via xprt->tsh_size.
208 */ 209 */
209 headerp->rm_xid = rqst->rq_xid; 210 p = rqst->rq_buffer;
210 headerp->rm_vers = rpcrdma_version; 211 *p++ = rqst->rq_xid;
211 headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); 212 *p++ = rpcrdma_version;
212 headerp->rm_type = rdma_msg; 213 *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
213 headerp->rm_body.rm_chunks[0] = xdr_zero; 214 *p++ = rdma_msg;
214 headerp->rm_body.rm_chunks[1] = xdr_zero; 215 *p++ = xdr_zero;
215 headerp->rm_body.rm_chunks[2] = xdr_zero; 216 *p++ = xdr_zero;
217 *p = xdr_zero;
216 218
217#ifdef SVCRDMA_BACKCHANNEL_DEBUG 219#ifdef SVCRDMA_BACKCHANNEL_DEBUG
218 pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); 220 pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 0ba9887f3e22..1c4aabf0f657 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -1,4 +1,5 @@
1/* 1/*
2 * Copyright (c) 2016 Oracle. All rights reserved.
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 3 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 * 4 *
4 * This software is available to you under a choice of one of two 5 * This software is available to you under a choice of one of two
@@ -47,102 +48,43 @@
47 48
48#define RPCDBG_FACILITY RPCDBG_SVCXPRT 49#define RPCDBG_FACILITY RPCDBG_SVCXPRT
49 50
50/* 51static __be32 *xdr_check_read_list(__be32 *p, __be32 *end)
51 * Decodes a read chunk list. The expected format is as follows:
52 * descrim : xdr_one
53 * position : __be32 offset into XDR stream
54 * handle : __be32 RKEY
55 * . . .
56 * end-of-list: xdr_zero
57 */
58static __be32 *decode_read_list(__be32 *va, __be32 *vaend)
59{ 52{
60 struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; 53 __be32 *next;
61 54
62 while (ch->rc_discrim != xdr_zero) { 55 while (*p++ != xdr_zero) {
63 if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > 56 next = p + rpcrdma_readchunk_maxsz - 1;
64 (unsigned long)vaend) { 57 if (next > end)
65 dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
66 return NULL; 58 return NULL;
67 } 59 p = next;
68 ch++;
69 } 60 }
70 return &ch->rc_position; 61 return p;
71} 62}
72 63
73/* 64static __be32 *xdr_check_write_list(__be32 *p, __be32 *end)
74 * Decodes a write chunk list. The expected format is as follows:
75 * descrim : xdr_one
76 * nchunks : <count>
77 * handle : __be32 RKEY ---+
78 * length : __be32 <len of segment> |
79 * offset : remove va + <count>
80 * . . . |
81 * ---+
82 */
83static __be32 *decode_write_list(__be32 *va, __be32 *vaend)
84{ 65{
85 unsigned long start, end; 66 __be32 *next;
86 int nchunks;
87
88 struct rpcrdma_write_array *ary =
89 (struct rpcrdma_write_array *)va;
90 67
91 /* Check for not write-array */ 68 while (*p++ != xdr_zero) {
92 if (ary->wc_discrim == xdr_zero) 69 next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
93 return &ary->wc_nchunks; 70 if (next > end)
94 71 return NULL;
95 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > 72 p = next;
96 (unsigned long)vaend) {
97 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
98 return NULL;
99 }
100 nchunks = be32_to_cpu(ary->wc_nchunks);
101
102 start = (unsigned long)&ary->wc_array[0];
103 end = (unsigned long)vaend;
104 if (nchunks < 0 ||
105 nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
106 (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
107 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
108 ary, nchunks, vaend);
109 return NULL;
110 } 73 }
111 /* 74 return p;
112 * rs_length is the 2nd 4B field in wc_target and taking its
113 * address skips the list terminator
114 */
115 return &ary->wc_array[nchunks].wc_target.rs_length;
116} 75}
117 76
118static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) 77static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end)
119{ 78{
120 unsigned long start, end; 79 __be32 *next;
121 int nchunks; 80
122 struct rpcrdma_write_array *ary = 81 if (*p++ != xdr_zero) {
123 (struct rpcrdma_write_array *)va; 82 next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
124 83 if (next > end)
125 /* Check for no reply-array */ 84 return NULL;
126 if (ary->wc_discrim == xdr_zero) 85 p = next;
127 return &ary->wc_nchunks;
128
129 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
130 (unsigned long)vaend) {
131 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
132 return NULL;
133 }
134 nchunks = be32_to_cpu(ary->wc_nchunks);
135
136 start = (unsigned long)&ary->wc_array[0];
137 end = (unsigned long)vaend;
138 if (nchunks < 0 ||
139 nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
140 (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
141 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
142 ary, nchunks, vaend);
143 return NULL;
144 } 86 }
145 return (__be32 *)&ary->wc_array[nchunks]; 87 return p;
146} 88}
147 89
148/** 90/**
@@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
158 */ 100 */
159int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) 101int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
160{ 102{
161 struct rpcrdma_msg *rmsgp; 103 __be32 *p, *end, *rdma_argp;
162 __be32 *va, *vaend; 104 unsigned int hdr_len;
163 unsigned int len;
164 u32 hdr_len;
165 105
166 /* Verify that there's enough bytes for header + something */ 106 /* Verify that there's enough bytes for header + something */
167 if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) { 107 if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
168 dprintk("svcrdma: header too short = %d\n", 108 goto out_short;
169 rq_arg->len);
170 return -EINVAL;
171 }
172 109
173 rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base; 110 rdma_argp = rq_arg->head[0].iov_base;
174 if (rmsgp->rm_vers != rpcrdma_version) { 111 if (*(rdma_argp + 1) != rpcrdma_version)
175 dprintk("%s: bad version %u\n", __func__, 112 goto out_version;
176 be32_to_cpu(rmsgp->rm_vers));
177 return -EPROTONOSUPPORT;
178 }
179 113
180 switch (be32_to_cpu(rmsgp->rm_type)) { 114 switch (*(rdma_argp + 3)) {
181 case RDMA_MSG: 115 case rdma_msg:
182 case RDMA_NOMSG: 116 case rdma_nomsg:
183 break; 117 break;
184 118
185 case RDMA_DONE: 119 case rdma_done:
186 /* Just drop it */ 120 goto out_drop;
187 dprintk("svcrdma: dropping RDMA_DONE message\n");
188 return 0;
189
190 case RDMA_ERROR:
191 /* Possible if this is a backchannel reply.
192 * XXX: We should cancel this XID, though.
193 */
194 dprintk("svcrdma: dropping RDMA_ERROR message\n");
195 return 0;
196
197 case RDMA_MSGP:
198 /* Pull in the extra for the padded case, bump our pointer */
199 rmsgp->rm_body.rm_padded.rm_align =
200 be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
201 rmsgp->rm_body.rm_padded.rm_thresh =
202 be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
203
204 va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
205 rq_arg->head[0].iov_base = va;
206 len = (u32)((unsigned long)va - (unsigned long)rmsgp);
207 rq_arg->head[0].iov_len -= len;
208 if (len > rq_arg->len)
209 return -EINVAL;
210 return len;
211 default:
212 dprintk("svcrdma: bad rdma procedure (%u)\n",
213 be32_to_cpu(rmsgp->rm_type));
214 return -EINVAL;
215 }
216 121
217 /* The chunk list may contain either a read chunk list or a write 122 case rdma_error:
218 * chunk list and a reply chunk list. 123 goto out_drop;
219 */ 124
220 va = &rmsgp->rm_body.rm_chunks[0]; 125 default:
221 vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len); 126 goto out_proc;
222 va = decode_read_list(va, vaend);
223 if (!va) {
224 dprintk("svcrdma: failed to decode read list\n");
225 return -EINVAL;
226 }
227 va = decode_write_list(va, vaend);
228 if (!va) {
229 dprintk("svcrdma: failed to decode write list\n");
230 return -EINVAL;
231 }
232 va = decode_reply_array(va, vaend);
233 if (!va) {
234 dprintk("svcrdma: failed to decode reply chunk\n");
235 return -EINVAL;
236 } 127 }
237 128
238 rq_arg->head[0].iov_base = va; 129 end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
239 hdr_len = (unsigned long)va - (unsigned long)rmsgp; 130 p = xdr_check_read_list(rdma_argp + 4, end);
131 if (!p)
132 goto out_inval;
133 p = xdr_check_write_list(p, end);
134 if (!p)
135 goto out_inval;
136 p = xdr_check_reply_chunk(p, end);
137 if (!p)
138 goto out_inval;
139 if (p > end)
140 goto out_inval;
141
142 rq_arg->head[0].iov_base = p;
143 hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
240 rq_arg->head[0].iov_len -= hdr_len; 144 rq_arg->head[0].iov_len -= hdr_len;
241 return hdr_len; 145 return hdr_len;
146
147out_short:
148 dprintk("svcrdma: header too short = %d\n", rq_arg->len);
149 return -EINVAL;
150
151out_version:
152 dprintk("svcrdma: bad xprt version: %u\n",
153 be32_to_cpup(rdma_argp + 1));
154 return -EPROTONOSUPPORT;
155
156out_drop:
157 dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n");
158 return 0;
159
160out_proc:
161 dprintk("svcrdma: bad rdma procedure (%u)\n",
162 be32_to_cpup(rdma_argp + 3));
163 return -EINVAL;
164
165out_inval:
166 dprintk("svcrdma: failed to parse transport header\n");
167 return -EINVAL;
242} 168}
243 169
244int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, 170int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
@@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
249 175
250 *va++ = rmsgp->rm_xid; 176 *va++ = rmsgp->rm_xid;
251 *va++ = rmsgp->rm_vers; 177 *va++ = rmsgp->rm_vers;
252 *va++ = cpu_to_be32(xprt->sc_max_requests); 178 *va++ = xprt->sc_fc_credits;
253 *va++ = rdma_error; 179 *va++ = rdma_error;
254 *va++ = cpu_to_be32(err); 180 *va++ = cpu_to_be32(err);
255 if (err == ERR_VERS) { 181 if (err == ERR_VERS) {
@@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
260 return (int)((unsigned long)va - (unsigned long)startp); 186 return (int)((unsigned long)va - (unsigned long)startp);
261} 187}
262 188
263int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) 189/**
190 * svc_rdma_xdr_get_reply_hdr_len - Get length of Reply transport header
191 * @rdma_resp: buffer containing Reply transport header
192 *
193 * Returns length of transport header, in bytes.
194 */
195unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
264{ 196{
265 struct rpcrdma_write_array *wr_ary; 197 unsigned int nsegs;
198 __be32 *p;
266 199
267 /* There is no read-list in a reply */ 200 p = rdma_resp;
268 201
269 /* skip write list */ 202 /* RPC-over-RDMA V1 replies never have a Read list. */
270 wr_ary = (struct rpcrdma_write_array *) 203 p += rpcrdma_fixed_maxsz + 1;
271 &rmsgp->rm_body.rm_chunks[1]; 204
272 if (wr_ary->wc_discrim) 205 /* Skip Write list. */
273 wr_ary = (struct rpcrdma_write_array *) 206 while (*p++ != xdr_zero) {
274 &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]. 207 nsegs = be32_to_cpup(p++);
275 wc_target.rs_length; 208 p += nsegs * rpcrdma_segment_maxsz;
276 else 209 }
277 wr_ary = (struct rpcrdma_write_array *) 210
278 &wr_ary->wc_nchunks; 211 /* Skip Reply chunk. */
279 212 if (*p++ != xdr_zero) {
280 /* skip reply array */ 213 nsegs = be32_to_cpup(p++);
281 if (wr_ary->wc_discrim) 214 p += nsegs * rpcrdma_segment_maxsz;
282 wr_ary = (struct rpcrdma_write_array *) 215 }
283 &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]; 216
284 else 217 return (unsigned long)p - (unsigned long)rdma_resp;
285 wr_ary = (struct rpcrdma_write_array *)
286 &wr_ary->wc_nchunks;
287
288 return (unsigned long) wr_ary - (unsigned long) rmsgp;
289} 218}
290 219
291void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) 220void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
@@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
326 seg->rs_offset = rs_offset; 255 seg->rs_offset = rs_offset;
327 seg->rs_length = cpu_to_be32(write_len); 256 seg->rs_length = cpu_to_be32(write_len);
328} 257}
329
330void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
331 struct rpcrdma_msg *rdma_argp,
332 struct rpcrdma_msg *rdma_resp,
333 enum rpcrdma_proc rdma_type)
334{
335 rdma_resp->rm_xid = rdma_argp->rm_xid;
336 rdma_resp->rm_vers = rdma_argp->rm_vers;
337 rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
338 rdma_resp->rm_type = cpu_to_be32(rdma_type);
339
340 /* Encode <nul> chunks lists */
341 rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
342 rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
343 rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
344}
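svc_rdma_xdr_get_reply_hdr_len() sizes the Reply transport header by skipping the fixed header, the (always empty) Read list, the Write list and the Reply chunk. Assuming rpcrdma_fixed_maxsz = 4 and rpcrdma_segment_maxsz = 4 XDR words, a reply carrying one two-segment Write chunk and no Reply chunk comes to 4 + 1 + (1 + 1 + 2*4) + 1 + 1 = 17 words, i.e. 68 bytes. A user-space check of that walk, with those constants treated as assumptions:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define FIXED_MAXSZ   4
#define SEGMENT_MAXSZ 4

static unsigned int demo_reply_hdr_len(const uint32_t *rdma_resp)
{
	const uint32_t *p = rdma_resp;
	uint32_t nsegs;

	p += FIXED_MAXSZ + 1;		/* fixed header + empty Read list */
	while (*p++ != 0) {		/* Write list */
		nsegs = ntohl(*p++);
		p += nsegs * SEGMENT_MAXSZ;
	}
	if (*p++ != 0) {		/* Reply chunk */
		nsegs = ntohl(*p++);
		p += nsegs * SEGMENT_MAXSZ;
	}
	return (unsigned int)((p - rdma_resp) * sizeof(uint32_t));
}

int main(void)
{
	uint32_t hdr[32] = { 0 };	/* words 0..3: xid, vers, credits, type */

	hdr[4] = 0;			/* empty Read list */
	hdr[5] = htonl(1);		/* Write list: one chunk present */
	hdr[6] = htonl(2);		/* ... with two segments */
	/* hdr[7..14]: the 2 * 4 segment words; contents irrelevant here */
	hdr[15] = 0;			/* Write list terminator */
	hdr[16] = 0;			/* no Reply chunk */
	printf("header length = %u bytes\n", demo_reply_hdr_len(hdr)); /* 68 */
	return 0;
}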
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 172b537f8cfc..f7b2daf72a86 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -606,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
606 606
607 dprintk("svcrdma: rqstp=%p\n", rqstp); 607 dprintk("svcrdma: rqstp=%p\n", rqstp);
608 608
609 spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); 609 spin_lock(&rdma_xprt->sc_rq_dto_lock);
610 if (!list_empty(&rdma_xprt->sc_read_complete_q)) { 610 if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
611 ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, 611 ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q,
612 struct svc_rdma_op_ctxt, 612 struct svc_rdma_op_ctxt, list);
613 dto_q); 613 list_del(&ctxt->list);
614 list_del_init(&ctxt->dto_q); 614 spin_unlock(&rdma_xprt->sc_rq_dto_lock);
615 spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
616 rdma_read_complete(rqstp, ctxt); 615 rdma_read_complete(rqstp, ctxt);
617 goto complete; 616 goto complete;
618 } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { 617 } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
619 ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, 618 ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q,
620 struct svc_rdma_op_ctxt, 619 struct svc_rdma_op_ctxt, list);
621 dto_q); 620 list_del(&ctxt->list);
622 list_del_init(&ctxt->dto_q);
623 } else { 621 } else {
624 atomic_inc(&rdma_stat_rq_starve); 622 atomic_inc(&rdma_stat_rq_starve);
625 clear_bit(XPT_DATA, &xprt->xpt_flags); 623 clear_bit(XPT_DATA, &xprt->xpt_flags);
626 ctxt = NULL; 624 ctxt = NULL;
627 } 625 }
628 spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); 626 spin_unlock(&rdma_xprt->sc_rq_dto_lock);
629 if (!ctxt) { 627 if (!ctxt) {
630 /* This is the EAGAIN path. The svc_recv routine will 628 /* This is the EAGAIN path. The svc_recv routine will
631 * return -EAGAIN, the nfsd thread will go to call into 629 * return -EAGAIN, the nfsd thread will go to call into
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index ad4d286a83c5..515221b16d09 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -476,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
476 476
477 /* Prepare the SGE for the RPCRDMA Header */ 477 /* Prepare the SGE for the RPCRDMA Header */
478 ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; 478 ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
479 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); 479 ctxt->sge[0].length =
480 svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
480 ctxt->sge[0].addr = 481 ctxt->sge[0].addr =
481 ib_dma_map_page(rdma->sc_cm_id->device, page, 0, 482 ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
482 ctxt->sge[0].length, DMA_TO_DEVICE); 483 ctxt->sge[0].length, DMA_TO_DEVICE);
@@ -559,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
559 struct rpcrdma_msg *rdma_argp; 560 struct rpcrdma_msg *rdma_argp;
560 struct rpcrdma_msg *rdma_resp; 561 struct rpcrdma_msg *rdma_resp;
561 struct rpcrdma_write_array *wr_ary, *rp_ary; 562 struct rpcrdma_write_array *wr_ary, *rp_ary;
562 enum rpcrdma_proc reply_type;
563 int ret; 563 int ret;
564 int inline_bytes; 564 int inline_bytes;
565 struct page *res_page; 565 struct page *res_page;
566 struct svc_rdma_req_map *vec; 566 struct svc_rdma_req_map *vec;
567 u32 inv_rkey; 567 u32 inv_rkey;
568 __be32 *p;
568 569
569 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); 570 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
570 571
@@ -596,12 +597,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
596 if (!res_page) 597 if (!res_page)
597 goto err0; 598 goto err0;
598 rdma_resp = page_address(res_page); 599 rdma_resp = page_address(res_page);
599 if (rp_ary) 600
600 reply_type = RDMA_NOMSG; 601 p = &rdma_resp->rm_xid;
601 else 602 *p++ = rdma_argp->rm_xid;
602 reply_type = RDMA_MSG; 603 *p++ = rdma_argp->rm_vers;
603 svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, 604 *p++ = rdma->sc_fc_credits;
604 rdma_resp, reply_type); 605 *p++ = rp_ary ? rdma_nomsg : rdma_msg;
606
607 /* Start with empty chunks */
608 *p++ = xdr_zero;
609 *p++ = xdr_zero;
610 *p = xdr_zero;
605 611
606 /* Send any write-chunk data and build resp write-list */ 612 /* Send any write-chunk data and build resp write-list */
607 if (wr_ary) { 613 if (wr_ary) {
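The sendto path now open-codes the same fixed reply header with a bare __be32 pointer walk instead of calling the reply-header encoder. A hedged sketch of that pattern, assuming the rdma_msg/rdma_nomsg and xdr_zero definitions from the sunrpc headers; sketch_encode_reply and its parameters are illustrative.

#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/xdr.h>

static void sketch_encode_reply(__be32 *p, __be32 xid, __be32 vers,
				__be32 credits, bool have_reply_chunk)
{
	*p++ = xid;				/* echo the call's XID */
	*p++ = vers;				/* echo the call's version */
	*p++ = credits;				/* pre-encoded credit grant */
	*p++ = have_reply_chunk ? rdma_nomsg : rdma_msg;
	*p++ = xdr_zero;			/* empty read list */
	*p++ = xdr_zero;			/* empty write list */
	*p   = xdr_zero;			/* empty reply chunk */
}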
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index ca2799af05a6..fc8f14c7bfec 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -127,6 +127,7 @@ static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv,
127 xprt = &cma_xprt->sc_xprt; 127 xprt = &cma_xprt->sc_xprt;
128 128
129 svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv); 129 svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv);
130 set_bit(XPT_CONG_CTRL, &xprt->xpt_flags);
130 serv->sv_bc_xprt = xprt; 131 serv->sv_bc_xprt = xprt;
131 132
132 dprintk("svcrdma: %s(%p)\n", __func__, xprt); 133 dprintk("svcrdma: %s(%p)\n", __func__, xprt);
@@ -157,8 +158,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
157 ctxt = kmalloc(sizeof(*ctxt), flags); 158 ctxt = kmalloc(sizeof(*ctxt), flags);
158 if (ctxt) { 159 if (ctxt) {
159 ctxt->xprt = xprt; 160 ctxt->xprt = xprt;
160 INIT_LIST_HEAD(&ctxt->free); 161 INIT_LIST_HEAD(&ctxt->list);
161 INIT_LIST_HEAD(&ctxt->dto_q);
162 } 162 }
163 return ctxt; 163 return ctxt;
164} 164}
@@ -180,7 +180,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
180 dprintk("svcrdma: No memory for RDMA ctxt\n"); 180 dprintk("svcrdma: No memory for RDMA ctxt\n");
181 return false; 181 return false;
182 } 182 }
183 list_add(&ctxt->free, &xprt->sc_ctxts); 183 list_add(&ctxt->list, &xprt->sc_ctxts);
184 } 184 }
185 return true; 185 return true;
186} 186}
@@ -189,15 +189,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
189{ 189{
190 struct svc_rdma_op_ctxt *ctxt = NULL; 190 struct svc_rdma_op_ctxt *ctxt = NULL;
191 191
192 spin_lock_bh(&xprt->sc_ctxt_lock); 192 spin_lock(&xprt->sc_ctxt_lock);
193 xprt->sc_ctxt_used++; 193 xprt->sc_ctxt_used++;
194 if (list_empty(&xprt->sc_ctxts)) 194 if (list_empty(&xprt->sc_ctxts))
195 goto out_empty; 195 goto out_empty;
196 196
197 ctxt = list_first_entry(&xprt->sc_ctxts, 197 ctxt = list_first_entry(&xprt->sc_ctxts,
198 struct svc_rdma_op_ctxt, free); 198 struct svc_rdma_op_ctxt, list);
199 list_del_init(&ctxt->free); 199 list_del(&ctxt->list);
200 spin_unlock_bh(&xprt->sc_ctxt_lock); 200 spin_unlock(&xprt->sc_ctxt_lock);
201 201
202out: 202out:
203 ctxt->count = 0; 203 ctxt->count = 0;
@@ -209,15 +209,15 @@ out_empty:
209 /* Either pre-allocation missed the mark, or send 209 /* Either pre-allocation missed the mark, or send
210 * queue accounting is broken. 210 * queue accounting is broken.
211 */ 211 */
212 spin_unlock_bh(&xprt->sc_ctxt_lock); 212 spin_unlock(&xprt->sc_ctxt_lock);
213 213
214 ctxt = alloc_ctxt(xprt, GFP_NOIO); 214 ctxt = alloc_ctxt(xprt, GFP_NOIO);
215 if (ctxt) 215 if (ctxt)
216 goto out; 216 goto out;
217 217
218 spin_lock_bh(&xprt->sc_ctxt_lock); 218 spin_lock(&xprt->sc_ctxt_lock);
219 xprt->sc_ctxt_used--; 219 xprt->sc_ctxt_used--;
220 spin_unlock_bh(&xprt->sc_ctxt_lock); 220 spin_unlock(&xprt->sc_ctxt_lock);
221 WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); 221 WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
222 return NULL; 222 return NULL;
223} 223}
@@ -254,10 +254,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
254 for (i = 0; i < ctxt->count; i++) 254 for (i = 0; i < ctxt->count; i++)
255 put_page(ctxt->pages[i]); 255 put_page(ctxt->pages[i]);
256 256
257 spin_lock_bh(&xprt->sc_ctxt_lock); 257 spin_lock(&xprt->sc_ctxt_lock);
258 xprt->sc_ctxt_used--; 258 xprt->sc_ctxt_used--;
259 list_add(&ctxt->free, &xprt->sc_ctxts); 259 list_add(&ctxt->list, &xprt->sc_ctxts);
260 spin_unlock_bh(&xprt->sc_ctxt_lock); 260 spin_unlock(&xprt->sc_ctxt_lock);
261} 261}
262 262
263static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) 263static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
@@ -266,8 +266,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
266 struct svc_rdma_op_ctxt *ctxt; 266 struct svc_rdma_op_ctxt *ctxt;
267 267
268 ctxt = list_first_entry(&xprt->sc_ctxts, 268 ctxt = list_first_entry(&xprt->sc_ctxts,
269 struct svc_rdma_op_ctxt, free); 269 struct svc_rdma_op_ctxt, list);
270 list_del(&ctxt->free); 270 list_del(&ctxt->list);
271 kfree(ctxt); 271 kfree(ctxt);
272 } 272 }
273} 273}
@@ -404,7 +404,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
404 /* All wc fields are now known to be valid */ 404 /* All wc fields are now known to be valid */
405 ctxt->byte_len = wc->byte_len; 405 ctxt->byte_len = wc->byte_len;
406 spin_lock(&xprt->sc_rq_dto_lock); 406 spin_lock(&xprt->sc_rq_dto_lock);
407 list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); 407 list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q);
408 spin_unlock(&xprt->sc_rq_dto_lock); 408 spin_unlock(&xprt->sc_rq_dto_lock);
409 409
410 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); 410 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
@@ -525,7 +525,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc)
525 525
526 read_hdr = ctxt->read_hdr; 526 read_hdr = ctxt->read_hdr;
527 spin_lock(&xprt->sc_rq_dto_lock); 527 spin_lock(&xprt->sc_rq_dto_lock);
528 list_add_tail(&read_hdr->dto_q, 528 list_add_tail(&read_hdr->list,
529 &xprt->sc_read_complete_q); 529 &xprt->sc_read_complete_q);
530 spin_unlock(&xprt->sc_rq_dto_lock); 530 spin_unlock(&xprt->sc_rq_dto_lock);
531 531
@@ -557,7 +557,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
557 return NULL; 557 return NULL;
558 svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); 558 svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
559 INIT_LIST_HEAD(&cma_xprt->sc_accept_q); 559 INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
560 INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
561 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); 560 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
562 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); 561 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
563 INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); 562 INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
@@ -571,6 +570,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
571 spin_lock_init(&cma_xprt->sc_ctxt_lock); 570 spin_lock_init(&cma_xprt->sc_ctxt_lock);
572 spin_lock_init(&cma_xprt->sc_map_lock); 571 spin_lock_init(&cma_xprt->sc_map_lock);
573 572
573 /*
574 * Note that this implies that the underlying transport support
575 * has some form of congestion control (see RFC 7530 section 3.1
576 * paragraph 2). For now, we assume that all supported RDMA
577 * transports are suitable here.
578 */
579 set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
580
574 if (listener) 581 if (listener)
575 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); 582 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
576 583
@@ -923,14 +930,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
923{ 930{
924 struct svc_rdma_fastreg_mr *frmr = NULL; 931 struct svc_rdma_fastreg_mr *frmr = NULL;
925 932
926 spin_lock_bh(&rdma->sc_frmr_q_lock); 933 spin_lock(&rdma->sc_frmr_q_lock);
927 if (!list_empty(&rdma->sc_frmr_q)) { 934 if (!list_empty(&rdma->sc_frmr_q)) {
928 frmr = list_entry(rdma->sc_frmr_q.next, 935 frmr = list_entry(rdma->sc_frmr_q.next,
929 struct svc_rdma_fastreg_mr, frmr_list); 936 struct svc_rdma_fastreg_mr, frmr_list);
930 list_del_init(&frmr->frmr_list); 937 list_del_init(&frmr->frmr_list);
931 frmr->sg_nents = 0; 938 frmr->sg_nents = 0;
932 } 939 }
933 spin_unlock_bh(&rdma->sc_frmr_q_lock); 940 spin_unlock(&rdma->sc_frmr_q_lock);
934 if (frmr) 941 if (frmr)
935 return frmr; 942 return frmr;
936 943
@@ -943,10 +950,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
943 if (frmr) { 950 if (frmr) {
944 ib_dma_unmap_sg(rdma->sc_cm_id->device, 951 ib_dma_unmap_sg(rdma->sc_cm_id->device,
945 frmr->sg, frmr->sg_nents, frmr->direction); 952 frmr->sg, frmr->sg_nents, frmr->direction);
946 spin_lock_bh(&rdma->sc_frmr_q_lock); 953 spin_lock(&rdma->sc_frmr_q_lock);
947 WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); 954 WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
948 list_add(&frmr->frmr_list, &rdma->sc_frmr_q); 955 list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
949 spin_unlock_bh(&rdma->sc_frmr_q_lock); 956 spin_unlock(&rdma->sc_frmr_q_lock);
950 } 957 }
951} 958}
952 959
@@ -1002,6 +1009,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1002 newxprt->sc_max_req_size = svcrdma_max_req_size; 1009 newxprt->sc_max_req_size = svcrdma_max_req_size;
1003 newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, 1010 newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
1004 svcrdma_max_requests); 1011 svcrdma_max_requests);
1012 newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
1005 newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, 1013 newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
1006 svcrdma_max_bc_requests); 1014 svcrdma_max_bc_requests);
1007 newxprt->sc_rq_depth = newxprt->sc_max_requests + 1015 newxprt->sc_rq_depth = newxprt->sc_max_requests +
@@ -1027,13 +1035,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1027 goto errout; 1035 goto errout;
1028 } 1036 }
1029 newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth, 1037 newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
1030 0, IB_POLL_SOFTIRQ); 1038 0, IB_POLL_WORKQUEUE);
1031 if (IS_ERR(newxprt->sc_sq_cq)) { 1039 if (IS_ERR(newxprt->sc_sq_cq)) {
1032 dprintk("svcrdma: error creating SQ CQ for connect request\n"); 1040 dprintk("svcrdma: error creating SQ CQ for connect request\n");
1033 goto errout; 1041 goto errout;
1034 } 1042 }
1035 newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth, 1043 newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
1036 0, IB_POLL_SOFTIRQ); 1044 0, IB_POLL_WORKQUEUE);
1037 if (IS_ERR(newxprt->sc_rq_cq)) { 1045 if (IS_ERR(newxprt->sc_rq_cq)) {
1038 dprintk("svcrdma: error creating RQ CQ for connect request\n"); 1046 dprintk("svcrdma: error creating RQ CQ for connect request\n");
1039 goto errout; 1047 goto errout;
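Both completion queues move from IB_POLL_SOFTIRQ to IB_POLL_WORKQUEUE, so handlers such as svc_rdma_wc_receive() now run in process context; that is what lets every sc_rq_dto_lock, sc_ctxt_lock and sc_frmr_q_lock user in this patch drop the _bh spinlock variants. A minimal allocation sketch; dev, priv and depth are placeholders.

#include <rdma/ib_verbs.h>

static struct ib_cq *sketch_alloc_cq(struct ib_device *dev, void *priv, int depth)
{
	/* comp_vector 0; completions are dispatched from the RDMA core's
	 * workqueue rather than from soft IRQ context */
	return ib_alloc_cq(dev, priv, depth, 0, IB_POLL_WORKQUEUE);
}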
@@ -1201,9 +1209,9 @@ static void __svc_rdma_free(struct work_struct *work)
1201 ib_drain_qp(rdma->sc_qp); 1209 ib_drain_qp(rdma->sc_qp);
1202 1210
1203 /* We should only be called from kref_put */ 1211 /* We should only be called from kref_put */
1204 if (atomic_read(&xprt->xpt_ref.refcount) != 0) 1212 if (kref_read(&xprt->xpt_ref) != 0)
1205 pr_err("svcrdma: sc_xprt still in use? (%d)\n", 1213 pr_err("svcrdma: sc_xprt still in use? (%d)\n",
1206 atomic_read(&xprt->xpt_ref.refcount)); 1214 kref_read(&xprt->xpt_ref));
1207 1215
1208 /* 1216 /*
1209 * Destroy queued, but not processed read completions. Note 1217 * Destroy queued, but not processed read completions. Note
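struct kref no longer exposes its counter as an atomic_t that callers may peek at, so the diagnostic read goes through kref_read(). A minimal sketch of the converted check; sketch_warn_if_busy is an illustrative helper, not a sunrpc symbol.

#include <linux/kref.h>
#include <linux/printk.h>

static void sketch_warn_if_busy(struct kref *ref)
{
	unsigned int count = kref_read(ref);

	if (count != 0)
		pr_err("object still in use? (%u)\n", count);
}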
@@ -1213,20 +1221,18 @@ static void __svc_rdma_free(struct work_struct *work)
1213 */ 1221 */
1214 while (!list_empty(&rdma->sc_read_complete_q)) { 1222 while (!list_empty(&rdma->sc_read_complete_q)) {
1215 struct svc_rdma_op_ctxt *ctxt; 1223 struct svc_rdma_op_ctxt *ctxt;
1216 ctxt = list_entry(rdma->sc_read_complete_q.next, 1224 ctxt = list_first_entry(&rdma->sc_read_complete_q,
1217 struct svc_rdma_op_ctxt, 1225 struct svc_rdma_op_ctxt, list);
1218 dto_q); 1226 list_del(&ctxt->list);
1219 list_del_init(&ctxt->dto_q);
1220 svc_rdma_put_context(ctxt, 1); 1227 svc_rdma_put_context(ctxt, 1);
1221 } 1228 }
1222 1229
1223 /* Destroy queued, but not processed recv completions */ 1230 /* Destroy queued, but not processed recv completions */
1224 while (!list_empty(&rdma->sc_rq_dto_q)) { 1231 while (!list_empty(&rdma->sc_rq_dto_q)) {
1225 struct svc_rdma_op_ctxt *ctxt; 1232 struct svc_rdma_op_ctxt *ctxt;
1226 ctxt = list_entry(rdma->sc_rq_dto_q.next, 1233 ctxt = list_first_entry(&rdma->sc_rq_dto_q,
1227 struct svc_rdma_op_ctxt, 1234 struct svc_rdma_op_ctxt, list);
1228 dto_q); 1235 list_del(&ctxt->list);
1229 list_del_init(&ctxt->dto_q);
1230 svc_rdma_put_context(ctxt, 1); 1236 svc_rdma_put_context(ctxt, 1);
1231 } 1237 }
1232 1238
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 534c178d2a7e..c717f5410776 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -67,7 +67,7 @@ unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
67static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; 67static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
68static unsigned int xprt_rdma_inline_write_padding; 68static unsigned int xprt_rdma_inline_write_padding;
69static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; 69static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
70 int xprt_rdma_pad_optimize = 1; 70 int xprt_rdma_pad_optimize = 0;
71 71
72#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 72#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
73 73
@@ -709,10 +709,6 @@ xprt_rdma_send_request(struct rpc_task *task)
709 return 0; 709 return 0;
710 710
711failed_marshal: 711failed_marshal:
712 dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
713 __func__, rc);
714 if (rc == -EIO)
715 r_xprt->rx_stats.failed_marshal_count++;
716 if (rc != -ENOTCONN) 712 if (rc != -ENOTCONN)
717 return rc; 713 return rc;
718drop_connection: 714drop_connection:

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 11d07748f699..3b332b395045 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -54,6 +54,7 @@
54#include <linux/sunrpc/svc_rdma.h> 54#include <linux/sunrpc/svc_rdma.h>
55#include <asm/bitops.h> 55#include <asm/bitops.h>
56#include <linux/module.h> /* try_module_get()/module_put() */ 56#include <linux/module.h> /* try_module_get()/module_put() */
57#include <rdma/ib_cm.h>
57 58
58#include "xprt_rdma.h" 59#include "xprt_rdma.h"
59 60
@@ -208,6 +209,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
208 209
209 /* Default settings for RPC-over-RDMA Version One */ 210 /* Default settings for RPC-over-RDMA Version One */
210 r_xprt->rx_ia.ri_reminv_expected = false; 211 r_xprt->rx_ia.ri_reminv_expected = false;
212 r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
211 rsize = RPCRDMA_V1_DEF_INLINE_SIZE; 213 rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
212 wsize = RPCRDMA_V1_DEF_INLINE_SIZE; 214 wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
213 215
@@ -215,6 +217,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
215 pmsg->cp_magic == rpcrdma_cmp_magic && 217 pmsg->cp_magic == rpcrdma_cmp_magic &&
216 pmsg->cp_version == RPCRDMA_CMP_VERSION) { 218 pmsg->cp_version == RPCRDMA_CMP_VERSION) {
217 r_xprt->rx_ia.ri_reminv_expected = true; 219 r_xprt->rx_ia.ri_reminv_expected = true;
220 r_xprt->rx_ia.ri_implicit_roundup = true;
218 rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); 221 rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
219 wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); 222 wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
220 } 223 }
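With the module parameter now defaulting to off, XDR pad optimization (implicit roundup of the reply's tail) is applied only when the administrator turns xprt_rdma_pad_optimize back on or when the server presents a valid RPC-over-RDMA connect private message. A hedged sketch of that decision, assuming xprt_rdma_pad_optimize is visible as declared in xprt_rdma.h; sketch_use_implicit_roundup is illustrative.

static bool sketch_use_implicit_roundup(bool valid_connect_private_msg)
{
	/* default from the module parameter (now 0), overridden when the
	 * peer sends a valid connect private message */
	return xprt_rdma_pad_optimize || valid_connect_private_msg;
}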
@@ -277,7 +280,14 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
277 connstate = -ENETDOWN; 280 connstate = -ENETDOWN;
278 goto connected; 281 goto connected;
279 case RDMA_CM_EVENT_REJECTED: 282 case RDMA_CM_EVENT_REJECTED:
283#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
284 pr_info("rpcrdma: connection to %pIS:%u on %s rejected: %s\n",
285 sap, rpc_get_port(sap), ia->ri_device->name,
286 rdma_reject_msg(id, event->status));
287#endif
280 connstate = -ECONNREFUSED; 288 connstate = -ECONNREFUSED;
289 if (event->status == IB_CM_REJ_STALE_CONN)
290 connstate = -EAGAIN;
281 goto connected; 291 goto connected;
282 case RDMA_CM_EVENT_DISCONNECTED: 292 case RDMA_CM_EVENT_DISCONNECTED:
283 connstate = -ECONNABORTED; 293 connstate = -ECONNABORTED;
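A hedged sketch of the new rejection handling: log the human-readable reason provided by the RDMA CM and treat a stale-connection reject as transient so the connect logic retries once the stale state is torn down. sketch_reject_to_errno is an illustrative helper; rdma_reject_msg() and IB_CM_REJ_STALE_CONN come from the rdma headers included above.

#include <linux/printk.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>

static int sketch_reject_to_errno(struct rdma_cm_id *id,
				  struct rdma_cm_event *event)
{
	pr_info("rpcrdma: connection rejected: %s\n",
		rdma_reject_msg(id, event->status));

	/* a stale connection is worth one immediate reconnect attempt */
	if (event->status == IB_CM_REJ_STALE_CONN)
		return -EAGAIN;
	return -ECONNREFUSED;
}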
@@ -486,18 +496,20 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
486 */ 496 */
487int 497int
488rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, 498rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
489 struct rpcrdma_create_data_internal *cdata) 499 struct rpcrdma_create_data_internal *cdata)
490{ 500{
491 struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; 501 struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
502 unsigned int max_qp_wr, max_sge;
492 struct ib_cq *sendcq, *recvcq; 503 struct ib_cq *sendcq, *recvcq;
493 unsigned int max_qp_wr;
494 int rc; 504 int rc;
495 505
496 if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) { 506 max_sge = min_t(unsigned int, ia->ri_device->attrs.max_sge,
497 dprintk("RPC: %s: insufficient sge's available\n", 507 RPCRDMA_MAX_SEND_SGES);
498 __func__); 508 if (max_sge < RPCRDMA_MIN_SEND_SGES) {
509 pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
499 return -ENOMEM; 510 return -ENOMEM;
500 } 511 }
512 ia->ri_max_send_sges = max_sge - RPCRDMA_MIN_SEND_SGES;
501 513
502 if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) { 514 if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
503 dprintk("RPC: %s: insufficient wqe's available\n", 515 dprintk("RPC: %s: insufficient wqe's available\n",
@@ -522,7 +534,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
522 ep->rep_attr.cap.max_recv_wr = cdata->max_requests; 534 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
523 ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; 535 ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
524 ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */ 536 ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
525 ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES; 537 ep->rep_attr.cap.max_send_sge = max_sge;
526 ep->rep_attr.cap.max_recv_sge = 1; 538 ep->rep_attr.cap.max_recv_sge = 1;
527 ep->rep_attr.cap.max_inline_data = 0; 539 ep->rep_attr.cap.max_inline_data = 0;
528 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 540 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
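Instead of refusing any device whose max_sge is below RPCRDMA_MAX_SEND_SGES, the endpoint now clamps to the device limit and rejects only devices that cannot cover the three always-needed SGEs (transport header, xdr_buf head, xdr_buf tail). A minimal sketch of that provisioning, assuming the RPCRDMA_MIN/MAX_SEND_SGES constants from xprt_rdma.h; sketch_provision_sges is illustrative.

#include <linux/kernel.h>
#include <linux/errno.h>

static int sketch_provision_sges(unsigned int device_max_sge,
				 unsigned int *page_sges)
{
	unsigned int max_sge;

	max_sge = min_t(unsigned int, device_max_sge, RPCRDMA_MAX_SEND_SGES);
	if (max_sge < RPCRDMA_MIN_SEND_SGES)
		return -ENOMEM;		/* device too small for inline sends */

	/* whatever remains after header, head and tail carries page data */
	*page_sges = max_sge - RPCRDMA_MIN_SEND_SGES;
	return 0;
}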
@@ -640,20 +652,21 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
640int 652int
641rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 653rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
642{ 654{
655 struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
656 rx_ia);
643 struct rdma_cm_id *id, *old; 657 struct rdma_cm_id *id, *old;
658 struct sockaddr *sap;
659 unsigned int extras;
644 int rc = 0; 660 int rc = 0;
645 int retry_count = 0;
646 661
647 if (ep->rep_connected != 0) { 662 if (ep->rep_connected != 0) {
648 struct rpcrdma_xprt *xprt;
649retry: 663retry:
650 dprintk("RPC: %s: reconnecting...\n", __func__); 664 dprintk("RPC: %s: reconnecting...\n", __func__);
651 665
652 rpcrdma_ep_disconnect(ep, ia); 666 rpcrdma_ep_disconnect(ep, ia);
653 667
654 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); 668 sap = (struct sockaddr *)&r_xprt->rx_data.addr;
655 id = rpcrdma_create_id(xprt, ia, 669 id = rpcrdma_create_id(r_xprt, ia, sap);
656 (struct sockaddr *)&xprt->rx_data.addr);
657 if (IS_ERR(id)) { 670 if (IS_ERR(id)) {
658 rc = -EHOSTUNREACH; 671 rc = -EHOSTUNREACH;
659 goto out; 672 goto out;
@@ -708,51 +721,18 @@ retry:
708 } 721 }
709 722
710 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); 723 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
711
712 /*
713 * Check state. A non-peer reject indicates no listener
714 * (ECONNREFUSED), which may be a transient state. All
715 * others indicate a transport condition which has already
716 * undergone a best-effort.
717 */
718 if (ep->rep_connected == -ECONNREFUSED &&
719 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
720 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
721 goto retry;
722 }
723 if (ep->rep_connected <= 0) { 724 if (ep->rep_connected <= 0) {
724 /* Sometimes, the only way to reliably connect to remote 725 if (ep->rep_connected == -EAGAIN)
725 * CMs is to use same nonzero values for ORD and IRD. */
726 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
727 (ep->rep_remote_cma.responder_resources == 0 ||
728 ep->rep_remote_cma.initiator_depth !=
729 ep->rep_remote_cma.responder_resources)) {
730 if (ep->rep_remote_cma.responder_resources == 0)
731 ep->rep_remote_cma.responder_resources = 1;
732 ep->rep_remote_cma.initiator_depth =
733 ep->rep_remote_cma.responder_resources;
734 goto retry; 726 goto retry;
735 }
736 rc = ep->rep_connected; 727 rc = ep->rep_connected;
737 } else { 728 goto out;
738 struct rpcrdma_xprt *r_xprt;
739 unsigned int extras;
740
741 dprintk("RPC: %s: connected\n", __func__);
742
743 r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
744 extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
745
746 if (extras) {
747 rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
748 if (rc) {
749 pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
750 __func__, rc);
751 rc = 0;
752 }
753 }
754 } 729 }
755 730
731 dprintk("RPC: %s: connected\n", __func__);
732 extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
733 if (extras)
734 rpcrdma_ep_post_extra_recv(r_xprt, extras);
735
756out: 736out:
757 if (rc) 737 if (rc)
758 ep->rep_connected = rc; 738 ep->rep_connected = rc;
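The old ad-hoc retry heuristics (retrying ECONNREFUSED a fixed number of times, adjusting ORD/IRD) are gone; the only retriable outcome now is the -EAGAIN set above for a stale-connection reject. A hedged sketch of the resulting control flow; sketch_connect_once stands in for the disconnect/create-id/rdma_connect/wait sequence and is illustrative only.

static int sketch_connect_once(void);	/* returns 0, -EAGAIN, or another -errno */

static int sketch_reconnect(void)
{
	int rc;

	do {
		rc = sketch_connect_once();
	} while (rc == -EAGAIN);	/* stale connection: simply try again */
	return rc;
}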
@@ -797,9 +777,7 @@ rpcrdma_mr_recovery_worker(struct work_struct *work)
797 777
798 spin_lock(&buf->rb_recovery_lock); 778 spin_lock(&buf->rb_recovery_lock);
799 while (!list_empty(&buf->rb_stale_mrs)) { 779 while (!list_empty(&buf->rb_stale_mrs)) {
800 mw = list_first_entry(&buf->rb_stale_mrs, 780 mw = rpcrdma_pop_mw(&buf->rb_stale_mrs);
801 struct rpcrdma_mw, mw_list);
802 list_del_init(&mw->mw_list);
803 spin_unlock(&buf->rb_recovery_lock); 781 spin_unlock(&buf->rb_recovery_lock);
804 782
805 dprintk("RPC: %s: recovering MR %p\n", __func__, mw); 783 dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
@@ -817,7 +795,7 @@ rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
817 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 795 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
818 796
819 spin_lock(&buf->rb_recovery_lock); 797 spin_lock(&buf->rb_recovery_lock);
820 list_add(&mw->mw_list, &buf->rb_stale_mrs); 798 rpcrdma_push_mw(mw, &buf->rb_stale_mrs);
821 spin_unlock(&buf->rb_recovery_lock); 799 spin_unlock(&buf->rb_recovery_lock);
822 800
823 schedule_delayed_work(&buf->rb_recovery_worker, 0); 801 schedule_delayed_work(&buf->rb_recovery_worker, 0);
@@ -1093,11 +1071,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
1093 struct rpcrdma_mw *mw = NULL; 1071 struct rpcrdma_mw *mw = NULL;
1094 1072
1095 spin_lock(&buf->rb_mwlock); 1073 spin_lock(&buf->rb_mwlock);
1096 if (!list_empty(&buf->rb_mws)) { 1074 if (!list_empty(&buf->rb_mws))
1097 mw = list_first_entry(&buf->rb_mws, 1075 mw = rpcrdma_pop_mw(&buf->rb_mws);
1098 struct rpcrdma_mw, mw_list);
1099 list_del_init(&mw->mw_list);
1100 }
1101 spin_unlock(&buf->rb_mwlock); 1076 spin_unlock(&buf->rb_mwlock);
1102 1077
1103 if (!mw) 1078 if (!mw)
@@ -1120,7 +1095,7 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
1120 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1095 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1121 1096
1122 spin_lock(&buf->rb_mwlock); 1097 spin_lock(&buf->rb_mwlock);
1123 list_add_tail(&mw->mw_list, &buf->rb_mws); 1098 rpcrdma_push_mw(mw, &buf->rb_mws);
1124 spin_unlock(&buf->rb_mwlock); 1099 spin_unlock(&buf->rb_mwlock);
1125} 1100}
1126 1101
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index e35efd4ac1e4..171a35116de9 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -74,7 +74,9 @@ struct rpcrdma_ia {
74 unsigned int ri_max_frmr_depth; 74 unsigned int ri_max_frmr_depth;
75 unsigned int ri_max_inline_write; 75 unsigned int ri_max_inline_write;
76 unsigned int ri_max_inline_read; 76 unsigned int ri_max_inline_read;
77 unsigned int ri_max_send_sges;
77 bool ri_reminv_expected; 78 bool ri_reminv_expected;
79 bool ri_implicit_roundup;
78 enum ib_mr_type ri_mrtype; 80 enum ib_mr_type ri_mrtype;
79 struct ib_qp_attr ri_qp_attr; 81 struct ib_qp_attr ri_qp_attr;
80 struct ib_qp_init_attr ri_qp_init_attr; 82 struct ib_qp_init_attr ri_qp_init_attr;
@@ -303,15 +305,19 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
303 char *mr_offset; /* kva if no page, else offset */ 305 char *mr_offset; /* kva if no page, else offset */
304}; 306};
305 307
306/* Reserve enough Send SGEs to send a maximum size inline request: 308/* The Send SGE array is provisioned to send a maximum size
309 * inline request:
307 * - RPC-over-RDMA header 310 * - RPC-over-RDMA header
308 * - xdr_buf head iovec 311 * - xdr_buf head iovec
309 * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages 312 * - RPCRDMA_MAX_INLINE bytes, in pages
310 * - xdr_buf tail iovec 313 * - xdr_buf tail iovec
314 *
315 * The actual number of array elements consumed by each RPC
316 * depends on the device's max_sge limit.
311 */ 317 */
312enum { 318enum {
313 RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1, 319 RPCRDMA_MIN_SEND_SGES = 3,
314 RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1, 320 RPCRDMA_MAX_PAGE_SGES = RPCRDMA_MAX_INLINE >> PAGE_SHIFT,
315 RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1, 321 RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
316}; 322};
317 323
@@ -348,6 +354,22 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
348 return rqst->rq_xprtdata; 354 return rqst->rq_xprtdata;
349} 355}
350 356
357static inline void
358rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list)
359{
360 list_add_tail(&mw->mw_list, list);
361}
362
363static inline struct rpcrdma_mw *
364rpcrdma_pop_mw(struct list_head *list)
365{
366 struct rpcrdma_mw *mw;
367
368 mw = list_first_entry(list, struct rpcrdma_mw, mw_list);
369 list_del(&mw->mw_list);
370 return mw;
371}
372
351/* 373/*
352 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for 374 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
353 * inline requests/replies, and client/server credits. 375 * inline requests/replies, and client/server credits.
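A short usage note on the helpers added above: both are meant to be called with the list's lock already held, and rpcrdma_pop_mw() assumes the caller has verified the list is non-empty, since list_first_entry() on an empty list would hand back a bogus pointer. A hedged sketch of the contract as the callers in verbs.c use it; sketch_get_mw is illustrative.

static struct rpcrdma_mw *sketch_get_mw(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mw *mw = NULL;

	spin_lock(&buf->rb_mwlock);
	if (!list_empty(&buf->rb_mws))		/* required before pop */
		mw = rpcrdma_pop_mw(&buf->rb_mws);
	spin_unlock(&buf->rb_mwlock);
	return mw;
}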
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index af392d9b9cec..16aff8ddc16f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -52,6 +52,8 @@
52#include "sunrpc.h" 52#include "sunrpc.h"
53 53
54static void xs_close(struct rpc_xprt *xprt); 54static void xs_close(struct rpc_xprt *xprt);
55static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
56 struct socket *sock);
55 57
56/* 58/*
57 * xprtsock tunables 59 * xprtsock tunables
@@ -666,6 +668,9 @@ static int xs_tcp_send_request(struct rpc_task *task)
666 if (task->tk_flags & RPC_TASK_SENT) 668 if (task->tk_flags & RPC_TASK_SENT)
667 zerocopy = false; 669 zerocopy = false;
668 670
671 if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state))
672 xs_tcp_set_socket_timeouts(xprt, transport->sock);
673
669 /* Continue transmitting the packet/record. We must be careful 674 /* Continue transmitting the packet/record. We must be careful
670 * to cope with writespace callbacks arriving _after_ we have 675 * to cope with writespace callbacks arriving _after_ we have
671 * called sendmsg(). */ 676 * called sendmsg(). */
@@ -1188,7 +1193,7 @@ static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_r
1188 char *p; 1193 char *p;
1189 1194
1190 len = sizeof(transport->tcp_xid) - transport->tcp_offset; 1195 len = sizeof(transport->tcp_xid) - transport->tcp_offset;
1191 dprintk("RPC: reading XID (%Zu bytes)\n", len); 1196 dprintk("RPC: reading XID (%zu bytes)\n", len);
1192 p = ((char *) &transport->tcp_xid) + transport->tcp_offset; 1197 p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
1193 used = xdr_skb_read_bits(desc, p, len); 1198 used = xdr_skb_read_bits(desc, p, len);
1194 transport->tcp_offset += used; 1199 transport->tcp_offset += used;
@@ -1219,7 +1224,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
1219 */ 1224 */
1220 offset = transport->tcp_offset - sizeof(transport->tcp_xid); 1225 offset = transport->tcp_offset - sizeof(transport->tcp_xid);
1221 len = sizeof(transport->tcp_calldir) - offset; 1226 len = sizeof(transport->tcp_calldir) - offset;
1222 dprintk("RPC: reading CALL/REPLY flag (%Zu bytes)\n", len); 1227 dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len);
1223 p = ((char *) &transport->tcp_calldir) + offset; 1228 p = ((char *) &transport->tcp_calldir) + offset;
1224 used = xdr_skb_read_bits(desc, p, len); 1229 used = xdr_skb_read_bits(desc, p, len);
1225 transport->tcp_offset += used; 1230 transport->tcp_offset += used;
@@ -1310,7 +1315,7 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
1310 return; 1315 return;
1311 } 1316 }
1312 1317
1313 dprintk("RPC: XID %08x read %Zd bytes\n", 1318 dprintk("RPC: XID %08x read %zd bytes\n",
1314 ntohl(transport->tcp_xid), r); 1319 ntohl(transport->tcp_xid), r);
1315 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " 1320 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
1316 "tcp_reclen = %u\n", xprt, transport->tcp_copied, 1321 "tcp_reclen = %u\n", xprt, transport->tcp_copied,
@@ -1456,7 +1461,7 @@ static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_s
1456 desc->count -= len; 1461 desc->count -= len;
1457 desc->offset += len; 1462 desc->offset += len;
1458 transport->tcp_offset += len; 1463 transport->tcp_offset += len;
1459 dprintk("RPC: discarded %Zu bytes\n", len); 1464 dprintk("RPC: discarded %zu bytes\n", len);
1460 xs_tcp_check_fraghdr(transport); 1465 xs_tcp_check_fraghdr(transport);
1461} 1466}
1462 1467
@@ -1734,7 +1739,9 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t
1734 */ 1739 */
1735static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) 1740static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
1736{ 1741{
1742 spin_lock_bh(&xprt->transport_lock);
1737 xprt_adjust_cwnd(xprt, task, -ETIMEDOUT); 1743 xprt_adjust_cwnd(xprt, task, -ETIMEDOUT);
1744 spin_unlock_bh(&xprt->transport_lock);
1738} 1745}
1739 1746
1740static unsigned short xs_get_random_port(void) 1747static unsigned short xs_get_random_port(void)
@@ -2235,6 +2242,66 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
2235 xs_reset_transport(transport); 2242 xs_reset_transport(transport);
2236} 2243}
2237 2244
2245static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
2246 struct socket *sock)
2247{
2248 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
2249 unsigned int keepidle;
2250 unsigned int keepcnt;
2251 unsigned int opt_on = 1;
2252 unsigned int timeo;
2253
2254 spin_lock_bh(&xprt->transport_lock);
2255 keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ);
2256 keepcnt = xprt->timeout->to_retries + 1;
2257 timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
2258 (xprt->timeout->to_retries + 1);
2259 clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
2260 spin_unlock_bh(&xprt->transport_lock);
2261
2262 /* TCP Keepalive options */
2263 kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
2264 (char *)&opt_on, sizeof(opt_on));
2265 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
2266 (char *)&keepidle, sizeof(keepidle));
2267 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
2268 (char *)&keepidle, sizeof(keepidle));
2269 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
2270 (char *)&keepcnt, sizeof(keepcnt));
2271
2272 /* TCP user timeout (see RFC5482) */
2273 kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
2274 (char *)&timeo, sizeof(timeo));
2275}
2276
2277static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
2278 unsigned long connect_timeout,
2279 unsigned long reconnect_timeout)
2280{
2281 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
2282 struct rpc_timeout to;
2283 unsigned long initval;
2284
2285 spin_lock_bh(&xprt->transport_lock);
2286 if (reconnect_timeout < xprt->max_reconnect_timeout)
2287 xprt->max_reconnect_timeout = reconnect_timeout;
2288 if (connect_timeout < xprt->connect_timeout) {
2289 memcpy(&to, xprt->timeout, sizeof(to));
2290 initval = DIV_ROUND_UP(connect_timeout, to.to_retries + 1);
2291 /* Arbitrary lower limit */
2292 if (initval < XS_TCP_INIT_REEST_TO << 1)
2293 initval = XS_TCP_INIT_REEST_TO << 1;
2294 to.to_initval = initval;
2295 to.to_maxval = initval;
2296 memcpy(&transport->tcp_timeout, &to,
2297 sizeof(transport->tcp_timeout));
2298 xprt->timeout = &transport->tcp_timeout;
2299 xprt->connect_timeout = connect_timeout;
2300 }
2301 set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
2302 spin_unlock_bh(&xprt->transport_lock);
2303}
2304
2238static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) 2305static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2239{ 2306{
2240 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 2307 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
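The values programmed into the socket are all derived from the RPC timeout in force when the update flag is seen. As a worked example, assuming the common defaults of to_initval = 60 * HZ and to_retries = 2 (illustrative values; the real ones come from xprt->timeout):

	keepidle = DIV_ROUND_UP(60 * HZ, HZ)       = 60      (seconds, also used for TCP_KEEPINTVL)
	keepcnt  = to_retries + 1                  = 3       (probes)
	timeo    = jiffies_to_msecs(60 * HZ) * 3   = 180000  (ms for TCP_USER_TIMEOUT)

so the kernel declares the connection dead over roughly the interval the RPC layer would spend on its own retransmissions, and xs_tcp_set_connect_timeout() only needs to set XPRT_SOCK_UPD_TIMEOUT for the next xs_tcp_send_request() to re-derive and re-apply these options.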
@@ -2242,22 +2309,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2242 2309
2243 if (!transport->inet) { 2310 if (!transport->inet) {
2244 struct sock *sk = sock->sk; 2311 struct sock *sk = sock->sk;
2245 unsigned int keepidle = xprt->timeout->to_initval / HZ;
2246 unsigned int keepcnt = xprt->timeout->to_retries + 1;
2247 unsigned int opt_on = 1;
2248 unsigned int timeo;
2249 unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC; 2312 unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC;
2250 2313
2251 /* TCP Keepalive options */
2252 kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
2253 (char *)&opt_on, sizeof(opt_on));
2254 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
2255 (char *)&keepidle, sizeof(keepidle));
2256 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
2257 (char *)&keepidle, sizeof(keepidle));
2258 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
2259 (char *)&keepcnt, sizeof(keepcnt));
2260
2261 /* Avoid temporary address, they are bad for long-lived 2314 /* Avoid temporary address, they are bad for long-lived
2262 * connections such as NFS mounts. 2315 * connections such as NFS mounts.
2263 * RFC4941, section 3.6 suggests that: 2316 * RFC4941, section 3.6 suggests that:
@@ -2268,11 +2321,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2268 kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES, 2321 kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES,
2269 (char *)&addr_pref, sizeof(addr_pref)); 2322 (char *)&addr_pref, sizeof(addr_pref));
2270 2323
2271 /* TCP user timeout (see RFC5482) */ 2324 xs_tcp_set_socket_timeouts(xprt, sock);
2272 timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
2273 (xprt->timeout->to_retries + 1);
2274 kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
2275 (char *)&timeo, sizeof(timeo));
2276 2325
2277 write_lock_bh(&sk->sk_callback_lock); 2326 write_lock_bh(&sk->sk_callback_lock);
2278 2327
@@ -2721,6 +2770,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
2721 .set_retrans_timeout = xprt_set_retrans_timeout_def, 2770 .set_retrans_timeout = xprt_set_retrans_timeout_def,
2722 .close = xs_tcp_shutdown, 2771 .close = xs_tcp_shutdown,
2723 .destroy = xs_destroy, 2772 .destroy = xs_destroy,
2773 .set_connect_timeout = xs_tcp_set_connect_timeout,
2724 .print_stats = xs_tcp_print_stats, 2774 .print_stats = xs_tcp_print_stats,
2725 .enable_swap = xs_enable_swap, 2775 .enable_swap = xs_enable_swap,
2726 .disable_swap = xs_disable_swap, 2776 .disable_swap = xs_disable_swap,
@@ -3007,6 +3057,8 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
3007 xprt->timeout = &xs_tcp_default_timeout; 3057 xprt->timeout = &xs_tcp_default_timeout;
3008 3058
3009 xprt->max_reconnect_timeout = xprt->timeout->to_maxval; 3059 xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
3060 xprt->connect_timeout = xprt->timeout->to_initval *
3061 (xprt->timeout->to_retries + 1);
3010 3062
3011 INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn); 3063 INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
3012 INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); 3064 INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
@@ -3209,7 +3261,9 @@ static int param_set_uint_minmax(const char *val,
3209 if (!val) 3261 if (!val)
3210 return -EINVAL; 3262 return -EINVAL;
3211 ret = kstrtouint(val, 0, &num); 3263 ret = kstrtouint(val, 0, &num);
3212 if (ret == -EINVAL || num < min || num > max) 3264 if (ret)
3265 return ret;
3266 if (num < min || num > max)
3213 return -EINVAL; 3267 return -EINVAL;
3214 *((unsigned int *)kp->arg) = num; 3268 *((unsigned int *)kp->arg) = num;
3215 return 0; 3269 return 0;
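A minimal sketch of the corrected parsing order: any kstrtouint() failure (including -ERANGE on overflow) is propagated as-is instead of being tested only against -EINVAL, and the range check runs only on a successfully parsed value. sketch_parse_uint is an illustrative helper mirroring param_set_uint_minmax().

#include <linux/kernel.h>
#include <linux/errno.h>

static int sketch_parse_uint(const char *val, unsigned int min,
			     unsigned int max, unsigned int *res)
{
	unsigned int num;
	int ret;

	ret = kstrtouint(val, 0, &num);
	if (ret)
		return ret;		/* -EINVAL or -ERANGE from the parser */
	if (num < min || num > max)
		return -EINVAL;		/* parsed fine but out of bounds */
	*res = num;
	return 0;
}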
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index aa1babbea385..7d99029df342 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * net/tipc/bcast.c: TIPC broadcast code 2 * net/tipc/bcast.c: TIPC broadcast code
3 * 3 *
4 * Copyright (c) 2004-2006, 2014-2015, Ericsson AB 4 * Copyright (c) 2004-2006, 2014-2016, Ericsson AB
5 * Copyright (c) 2004, Intel Corporation. 5 * Copyright (c) 2004, Intel Corporation.
6 * Copyright (c) 2005, 2010-2011, Wind River Systems 6 * Copyright (c) 2005, 2010-2011, Wind River Systems
7 * All rights reserved. 7 * All rights reserved.
@@ -39,9 +39,8 @@
39#include "socket.h" 39#include "socket.h"
40#include "msg.h" 40#include "msg.h"
41#include "bcast.h" 41#include "bcast.h"
42#include "name_distr.h"
43#include "link.h" 42#include "link.h"
44#include "node.h" 43#include "name_table.h"
45 44
46#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */ 45#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */
47#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ 46#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */
@@ -54,12 +53,20 @@ const char tipc_bclink_name[] = "broadcast-link";
54 * @inputq: data input queue; will only carry SOCK_WAKEUP messages 53 * @inputq: data input queue; will only carry SOCK_WAKEUP messages
55 * @dest: array keeping number of reachable destinations per bearer 54 * @dest: array keeping number of reachable destinations per bearer
56 * @primary_bearer: a bearer having links to all broadcast destinations, if any 55 * @primary_bearer: a bearer having links to all broadcast destinations, if any
56 * @bcast_support: indicates if primary bearer, if any, supports broadcast
57 * @rcast_support: indicates if all peer nodes support replicast
58 * @rc_ratio: dest count as percentage of cluster size where send method changes
 59 * @bc_threshold: calculated from rc_ratio; if dests > threshold use broadcast
57 */ 60 */
58struct tipc_bc_base { 61struct tipc_bc_base {
59 struct tipc_link *link; 62 struct tipc_link *link;
60 struct sk_buff_head inputq; 63 struct sk_buff_head inputq;
61 int dests[MAX_BEARERS]; 64 int dests[MAX_BEARERS];
62 int primary_bearer; 65 int primary_bearer;
66 bool bcast_support;
67 bool rcast_support;
68 int rc_ratio;
69 int bc_threshold;
63}; 70};
64 71
65static struct tipc_bc_base *tipc_bc_base(struct net *net) 72static struct tipc_bc_base *tipc_bc_base(struct net *net)
@@ -69,7 +76,20 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net)
69 76
70int tipc_bcast_get_mtu(struct net *net) 77int tipc_bcast_get_mtu(struct net *net)
71{ 78{
72 return tipc_link_mtu(tipc_bc_sndlink(net)); 79 return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE;
80}
81
82void tipc_bcast_disable_rcast(struct net *net)
83{
84 tipc_bc_base(net)->rcast_support = false;
85}
86
87static void tipc_bcbase_calc_bc_threshold(struct net *net)
88{
89 struct tipc_bc_base *bb = tipc_bc_base(net);
90 int cluster_size = tipc_link_bc_peers(tipc_bc_sndlink(net));
91
92 bb->bc_threshold = 1 + (cluster_size * bb->rc_ratio / 100);
73} 93}
74 94
75/* tipc_bcbase_select_primary(): find a bearer with links to all destinations, 95/* tipc_bcbase_select_primary(): find a bearer with links to all destinations,
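As a worked example of the threshold above: with the default rc_ratio of 25 (set in tipc_bcast_init() further down) and an illustrative cluster of 20 reachable broadcast peers, bc_threshold = 1 + (20 * 25 / 100) = 6, so a multicast with six or fewer destinations is replicated as unicasts while a larger one goes over the broadcast link, per the dests <= bb->bc_threshold test in tipc_bcast_select_xmit_method() below.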
@@ -79,9 +99,10 @@ static void tipc_bcbase_select_primary(struct net *net)
79{ 99{
80 struct tipc_bc_base *bb = tipc_bc_base(net); 100 struct tipc_bc_base *bb = tipc_bc_base(net);
81 int all_dests = tipc_link_bc_peers(bb->link); 101 int all_dests = tipc_link_bc_peers(bb->link);
82 int i, mtu; 102 int i, mtu, prim;
83 103
84 bb->primary_bearer = INVALID_BEARER_ID; 104 bb->primary_bearer = INVALID_BEARER_ID;
105 bb->bcast_support = true;
85 106
86 if (!all_dests) 107 if (!all_dests)
87 return; 108 return;
@@ -93,7 +114,7 @@ static void tipc_bcbase_select_primary(struct net *net)
93 mtu = tipc_bearer_mtu(net, i); 114 mtu = tipc_bearer_mtu(net, i);
94 if (mtu < tipc_link_mtu(bb->link)) 115 if (mtu < tipc_link_mtu(bb->link))
95 tipc_link_set_mtu(bb->link, mtu); 116 tipc_link_set_mtu(bb->link, mtu);
96 117 bb->bcast_support &= tipc_bearer_bcast_support(net, i);
97 if (bb->dests[i] < all_dests) 118 if (bb->dests[i] < all_dests)
98 continue; 119 continue;
99 120
@@ -103,6 +124,9 @@ static void tipc_bcbase_select_primary(struct net *net)
103 if ((i ^ tipc_own_addr(net)) & 1) 124 if ((i ^ tipc_own_addr(net)) & 1)
104 break; 125 break;
105 } 126 }
127 prim = bb->primary_bearer;
128 if (prim != INVALID_BEARER_ID)
129 bb->bcast_support = tipc_bearer_bcast_support(net, prim);
106} 130}
107 131
108void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id) 132void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id)
@@ -170,45 +194,131 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
170 __skb_queue_purge(&_xmitq); 194 __skb_queue_purge(&_xmitq);
171} 195}
172 196
173/* tipc_bcast_xmit - deliver buffer chain to all nodes in cluster 197static void tipc_bcast_select_xmit_method(struct net *net, int dests,
174 * and to identified node local sockets 198 struct tipc_mc_method *method)
199{
200 struct tipc_bc_base *bb = tipc_bc_base(net);
201 unsigned long exp = method->expires;
202
203 /* Broadcast supported by used bearer/bearers? */
204 if (!bb->bcast_support) {
205 method->rcast = true;
206 return;
207 }
208 /* Any destinations which don't support replicast ? */
209 if (!bb->rcast_support) {
210 method->rcast = false;
211 return;
212 }
213 /* Can current method be changed ? */
214 method->expires = jiffies + TIPC_METHOD_EXPIRE;
215 if (method->mandatory || time_before(jiffies, exp))
216 return;
217
218 /* Determine method to use now */
219 method->rcast = dests <= bb->bc_threshold;
220}
221
222/* tipc_bcast_xmit - broadcast the buffer chain to all external nodes
175 * @net: the applicable net namespace 223 * @net: the applicable net namespace
176 * @list: chain of buffers containing message 224 * @pkts: chain of buffers containing message
177 * Consumes the buffer chain, except when returning -ELINKCONG 225 * @cong_link_cnt: set to 1 if broadcast link is congested, otherwise 0
178 * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE 226 * Consumes the buffer chain.
227 * Returns 0 if success, otherwise errno: -EHOSTUNREACH,-EMSGSIZE
179 */ 228 */
180int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list) 229static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts,
230 u16 *cong_link_cnt)
181{ 231{
182 struct tipc_link *l = tipc_bc_sndlink(net); 232 struct tipc_link *l = tipc_bc_sndlink(net);
183 struct sk_buff_head xmitq, inputq, rcvq; 233 struct sk_buff_head xmitq;
184 int rc = 0; 234 int rc = 0;
185 235
186 __skb_queue_head_init(&rcvq);
187 __skb_queue_head_init(&xmitq); 236 __skb_queue_head_init(&xmitq);
188 skb_queue_head_init(&inputq);
189
190 /* Prepare message clone for local node */
191 if (unlikely(!tipc_msg_reassemble(list, &rcvq)))
192 return -EHOSTUNREACH;
193
194 tipc_bcast_lock(net); 237 tipc_bcast_lock(net);
195 if (tipc_link_bc_peers(l)) 238 if (tipc_link_bc_peers(l))
196 rc = tipc_link_xmit(l, list, &xmitq); 239 rc = tipc_link_xmit(l, pkts, &xmitq);
197 tipc_bcast_unlock(net); 240 tipc_bcast_unlock(net);
198 241 tipc_bcbase_xmit(net, &xmitq);
199 /* Don't send to local node if adding to link failed */ 242 __skb_queue_purge(pkts);
200 if (unlikely(rc)) { 243 if (rc == -ELINKCONG) {
201 __skb_queue_purge(&rcvq); 244 *cong_link_cnt = 1;
202 return rc; 245 rc = 0;
203 } 246 }
247 return rc;
248}
204 249
205 /* Broadcast to all nodes, inluding local node */ 250/* tipc_rcast_xmit - replicate and send a message to given destination nodes
206 tipc_bcbase_xmit(net, &xmitq); 251 * @net: the applicable net namespace
207 tipc_sk_mcast_rcv(net, &rcvq, &inputq); 252 * @pkts: chain of buffers containing message
208 __skb_queue_purge(list); 253 * @dests: list of destination nodes
254 * @cong_link_cnt: returns number of congested links
255 * @cong_links: returns identities of congested links
256 * Returns 0 if success, otherwise errno
257 */
258static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts,
259 struct tipc_nlist *dests, u16 *cong_link_cnt)
260{
261 struct sk_buff_head _pkts;
262 struct u32_item *n, *tmp;
263 u32 dst, selector;
264
265 selector = msg_link_selector(buf_msg(skb_peek(pkts)));
266 __skb_queue_head_init(&_pkts);
267
268 list_for_each_entry_safe(n, tmp, &dests->list, list) {
269 dst = n->value;
270 if (!tipc_msg_pskb_copy(dst, pkts, &_pkts))
271 return -ENOMEM;
272
273 /* Any other return value than -ELINKCONG is ignored */
274 if (tipc_node_xmit(net, &_pkts, dst, selector) == -ELINKCONG)
275 (*cong_link_cnt)++;
276 }
209 return 0; 277 return 0;
210} 278}
211 279
280/* tipc_mcast_xmit - deliver message to indicated destination nodes
281 * and to identified node local sockets
282 * @net: the applicable net namespace
283 * @pkts: chain of buffers containing message
284 * @method: send method to be used
285 * @dests: destination nodes for message.
286 * @cong_link_cnt: returns number of encountered congested destination links
287 * Consumes buffer chain.
288 * Returns 0 if success, otherwise errno
289 */
290int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
291 struct tipc_mc_method *method, struct tipc_nlist *dests,
292 u16 *cong_link_cnt)
293{
294 struct sk_buff_head inputq, localq;
295 int rc = 0;
296
297 skb_queue_head_init(&inputq);
298 skb_queue_head_init(&localq);
299
300 /* Clone packets before they are consumed by next call */
301 if (dests->local && !tipc_msg_reassemble(pkts, &localq)) {
302 rc = -ENOMEM;
303 goto exit;
304 }
305 /* Send according to determined transmit method */
306 if (dests->remote) {
307 tipc_bcast_select_xmit_method(net, dests->remote, method);
308 if (method->rcast)
309 rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt);
310 else
311 rc = tipc_bcast_xmit(net, pkts, cong_link_cnt);
312 }
313
314 if (dests->local)
315 tipc_sk_mcast_rcv(net, &localq, &inputq);
316exit:
317 /* This queue should normally be empty by now */
318 __skb_queue_purge(pkts);
319 return rc;
320}
321
212/* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link 322/* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link
213 * 323 *
214 * RCU is locked, no other locks set 324 * RCU is locked, no other locks set
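A hedged sketch of how a sender is expected to drive the new entry point: it keeps a struct tipc_mc_method cookie alive across sends so the broadcast layer can remember the previously chosen method and its expiry. sketch_send_mcast and its locals are illustrative; the function and field names match the declarations added to bcast.h below.

static int sketch_send_mcast(struct net *net, struct sk_buff_head *pkts,
			     struct tipc_nlist *dests,
			     struct tipc_mc_method *method)
{
	u16 cong_links = 0;
	int rc;

	/* method->rcast and method->expires persist between calls, so the
	 * broadcast/replicast choice is re-evaluated only after
	 * TIPC_METHOD_EXPIRE unless method->mandatory is set */
	rc = tipc_mcast_xmit(net, pkts, method, dests, &cong_links);

	/* cong_links reports how many destination links pushed back; a real
	 * sender uses it to decide whether to block before sending again */
	return rc;
}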
@@ -313,6 +423,7 @@ void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l,
313 tipc_bcast_lock(net); 423 tipc_bcast_lock(net);
314 tipc_link_add_bc_peer(snd_l, uc_l, xmitq); 424 tipc_link_add_bc_peer(snd_l, uc_l, xmitq);
315 tipc_bcbase_select_primary(net); 425 tipc_bcbase_select_primary(net);
426 tipc_bcbase_calc_bc_threshold(net);
316 tipc_bcast_unlock(net); 427 tipc_bcast_unlock(net);
317} 428}
318 429
@@ -331,6 +442,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l)
331 tipc_bcast_lock(net); 442 tipc_bcast_lock(net);
332 tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq); 443 tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq);
333 tipc_bcbase_select_primary(net); 444 tipc_bcbase_select_primary(net);
445 tipc_bcbase_calc_bc_threshold(net);
334 tipc_bcast_unlock(net); 446 tipc_bcast_unlock(net);
335 447
336 tipc_bcbase_xmit(net, &xmitq); 448 tipc_bcbase_xmit(net, &xmitq);
@@ -413,6 +525,8 @@ int tipc_bcast_init(struct net *net)
413 goto enomem; 525 goto enomem;
414 bb->link = l; 526 bb->link = l;
415 tn->bcl = l; 527 tn->bcl = l;
528 bb->rc_ratio = 25;
529 bb->rcast_support = true;
416 return 0; 530 return 0;
417enomem: 531enomem:
418 kfree(bb); 532 kfree(bb);
@@ -428,3 +542,33 @@ void tipc_bcast_stop(struct net *net)
428 kfree(tn->bcbase); 542 kfree(tn->bcbase);
429 kfree(tn->bcl); 543 kfree(tn->bcl);
430} 544}
545
546void tipc_nlist_init(struct tipc_nlist *nl, u32 self)
547{
548 memset(nl, 0, sizeof(*nl));
549 INIT_LIST_HEAD(&nl->list);
550 nl->self = self;
551}
552
553void tipc_nlist_add(struct tipc_nlist *nl, u32 node)
554{
555 if (node == nl->self)
556 nl->local = true;
557 else if (u32_push(&nl->list, node))
558 nl->remote++;
559}
560
561void tipc_nlist_del(struct tipc_nlist *nl, u32 node)
562{
563 if (node == nl->self)
564 nl->local = false;
565 else if (u32_del(&nl->list, node))
566 nl->remote--;
567}
568
569void tipc_nlist_purge(struct tipc_nlist *nl)
570{
571 u32_list_purge(&nl->list);
572 nl->remote = 0;
573 nl->local = 0;
574}
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 855d53c64ab3..751530ab0c49 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -42,9 +42,35 @@
42struct tipc_node; 42struct tipc_node;
43struct tipc_msg; 43struct tipc_msg;
44struct tipc_nl_msg; 44struct tipc_nl_msg;
45struct tipc_node_map; 45struct tipc_nlist;
46struct tipc_nitem;
46extern const char tipc_bclink_name[]; 47extern const char tipc_bclink_name[];
47 48
49#define TIPC_METHOD_EXPIRE msecs_to_jiffies(5000)
50
51struct tipc_nlist {
52 struct list_head list;
53 u32 self;
54 u16 remote;
55 bool local;
56};
57
58void tipc_nlist_init(struct tipc_nlist *nl, u32 self);
59void tipc_nlist_purge(struct tipc_nlist *nl);
60void tipc_nlist_add(struct tipc_nlist *nl, u32 node);
61void tipc_nlist_del(struct tipc_nlist *nl, u32 node);
62
63/* Cookie to be used between socket and broadcast layer
64 * @rcast: replicast (instead of broadcast) was used at previous xmit
65 * @mandatory: broadcast/replicast indication was set by user
66 * @expires: re-evaluate non-mandatory transmit method if we are past this
67 */
68struct tipc_mc_method {
69 bool rcast;
70 bool mandatory;
71 unsigned long expires;
72};
73
48int tipc_bcast_init(struct net *net); 74int tipc_bcast_init(struct net *net);
49void tipc_bcast_stop(struct net *net); 75void tipc_bcast_stop(struct net *net);
50void tipc_bcast_add_peer(struct net *net, struct tipc_link *l, 76void tipc_bcast_add_peer(struct net *net, struct tipc_link *l,
@@ -53,7 +79,10 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl);
53void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); 79void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id);
54void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); 80void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id);
55int tipc_bcast_get_mtu(struct net *net); 81int tipc_bcast_get_mtu(struct net *net);
56int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list); 82void tipc_bcast_disable_rcast(struct net *net);
83int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
84 struct tipc_mc_method *method, struct tipc_nlist *dests,
85 u16 *cong_link_cnt);
57int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); 86int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb);
58void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, 87void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l,
59 struct tipc_msg *hdr); 88 struct tipc_msg *hdr);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 52d74760fb68..33a5bdfbef76 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -431,7 +431,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
431 memset(&b->bcast_addr, 0, sizeof(b->bcast_addr)); 431 memset(&b->bcast_addr, 0, sizeof(b->bcast_addr));
432 memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len); 432 memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len);
433 b->bcast_addr.media_id = b->media->type_id; 433 b->bcast_addr.media_id = b->media->type_id;
434 b->bcast_addr.broadcast = 1; 434 b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
435 b->mtu = dev->mtu; 435 b->mtu = dev->mtu;
436 b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr); 436 b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr);
437 rcu_assign_pointer(dev->tipc_ptr, b); 437 rcu_assign_pointer(dev->tipc_ptr, b);
@@ -482,6 +482,19 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
482 return 0; 482 return 0;
483} 483}
484 484
485bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id)
486{
487 bool supp = false;
488 struct tipc_bearer *b;
489
490 rcu_read_lock();
491 b = bearer_get(net, bearer_id);
492 if (b)
493 supp = (b->bcast_addr.broadcast == TIPC_BROADCAST_SUPPORT);
494 rcu_read_unlock();
495 return supp;
496}
497
485int tipc_bearer_mtu(struct net *net, u32 bearer_id) 498int tipc_bearer_mtu(struct net *net, u32 bearer_id)
486{ 499{
487 int mtu = 0; 500 int mtu = 0;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 278ff7f616f9..635c9086e19a 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -60,9 +60,14 @@
60#define TIPC_MEDIA_TYPE_IB 2 60#define TIPC_MEDIA_TYPE_IB 2
61#define TIPC_MEDIA_TYPE_UDP 3 61#define TIPC_MEDIA_TYPE_UDP 3
62 62
63/* minimum bearer MTU */ 63/* Minimum bearer MTU */
64#define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE) 64#define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE)
65 65
66/* Identifiers for distinguishing between broadcast/multicast and replicast
67 */
68#define TIPC_BROADCAST_SUPPORT 1
69#define TIPC_REPLICAST_SUPPORT 2
70
66/** 71/**
67 * struct tipc_media_addr - destination address used by TIPC bearers 72 * struct tipc_media_addr - destination address used by TIPC bearers
68 * @value: address info (format defined by media) 73 * @value: address info (format defined by media)
@@ -210,6 +215,7 @@ int tipc_bearer_setup(void);
210void tipc_bearer_cleanup(void); 215void tipc_bearer_cleanup(void);
211void tipc_bearer_stop(struct net *net); 216void tipc_bearer_stop(struct net *net);
212int tipc_bearer_mtu(struct net *net, u32 bearer_id); 217int tipc_bearer_mtu(struct net *net, u32 bearer_id);
218bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id);
213void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, 219void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
214 struct sk_buff *skb, 220 struct sk_buff *skb,
215 struct tipc_media_addr *dest); 221 struct tipc_media_addr *dest);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 4e8647aef01c..ddd2dd6f77aa 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -515,6 +515,10 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
515 if (link_is_bc_sndlink(l)) 515 if (link_is_bc_sndlink(l))
516 l->state = LINK_ESTABLISHED; 516 l->state = LINK_ESTABLISHED;
517 517
518 /* Disable replicast if even a single peer doesn't support it */
519 if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST))
520 tipc_bcast_disable_rcast(net);
521
518 return true; 522 return true;
519} 523}
520 524
@@ -776,60 +780,47 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
776 780
777/** 781/**
778 * link_schedule_user - schedule a message sender for wakeup after congestion 782 * link_schedule_user - schedule a message sender for wakeup after congestion
779 * @link: congested link 783 * @l: congested link
780 * @list: message that was attempted sent 784 * @hdr: header of message that is being sent
781 * Create pseudo msg to send back to user when congestion abates 785 * Create pseudo msg to send back to user when congestion abates
782 * Does not consume buffer list
783 */ 786 */
784static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) 787static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
785{ 788{
786 struct tipc_msg *msg = buf_msg(skb_peek(list)); 789 u32 dnode = tipc_own_addr(l->net);
787 int imp = msg_importance(msg); 790 u32 dport = msg_origport(hdr);
788 u32 oport = msg_origport(msg);
789 u32 addr = tipc_own_addr(link->net);
790 struct sk_buff *skb; 791 struct sk_buff *skb;
791 792
792 /* This really cannot happen... */
793 if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
794 pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
795 return -ENOBUFS;
796 }
797 /* Non-blocking sender: */
798 if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
799 return -ELINKCONG;
800
801 /* Create and schedule wakeup pseudo message */ 793 /* Create and schedule wakeup pseudo message */
802 skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, 794 skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
803 addr, addr, oport, 0, 0); 795 dnode, l->addr, dport, 0, 0);
804 if (!skb) 796 if (!skb)
805 return -ENOBUFS; 797 return -ENOBUFS;
806 TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list); 798 msg_set_dest_droppable(buf_msg(skb), true);
807 TIPC_SKB_CB(skb)->chain_imp = imp; 799 TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr);
808 skb_queue_tail(&link->wakeupq, skb); 800 skb_queue_tail(&l->wakeupq, skb);
809 link->stats.link_congs++; 801 l->stats.link_congs++;
810 return -ELINKCONG; 802 return -ELINKCONG;
811} 803}
812 804
813/** 805/**
814 * link_prepare_wakeup - prepare users for wakeup after congestion 806 * link_prepare_wakeup - prepare users for wakeup after congestion
815 * @link: congested link 807 * @l: congested link
816 * Move a number of waiting users, as permitted by available space in 808 * Wake up a number of waiting users, as permitted by available space
817 * the send queue, from link wait queue to node wait queue for wakeup 809 * in the send queue
818 */ 810 */
819void link_prepare_wakeup(struct tipc_link *l) 811void link_prepare_wakeup(struct tipc_link *l)
820{ 812{
821 int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
822 int imp, lim;
823 struct sk_buff *skb, *tmp; 813 struct sk_buff *skb, *tmp;
814 int imp, i = 0;
824 815
825 skb_queue_walk_safe(&l->wakeupq, skb, tmp) { 816 skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
826 imp = TIPC_SKB_CB(skb)->chain_imp; 817 imp = TIPC_SKB_CB(skb)->chain_imp;
827 lim = l->backlog[imp].limit; 818 if (l->backlog[imp].len < l->backlog[imp].limit) {
828 pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; 819 skb_unlink(skb, &l->wakeupq);
829 if ((pnd[imp] + l->backlog[imp].len) >= lim) 820 skb_queue_tail(l->inputq, skb);
821 } else if (i++ > 10) {
830 break; 822 break;
831 skb_unlink(skb, &l->wakeupq); 823 }
832 skb_queue_tail(l->inputq, skb);
833 } 824 }
834} 825}
835 826
@@ -869,8 +860,7 @@ void tipc_link_reset(struct tipc_link *l)
869 * @list: chain of buffers containing message 860 * @list: chain of buffers containing message
870 * @xmitq: returned list of packets to be sent by caller 861 * @xmitq: returned list of packets to be sent by caller
871 * 862 *
872 * Consumes the buffer chain, except when returning -ELINKCONG, 863 * Consumes the buffer chain.
873 * since the caller then may want to make more send attempts.
874 * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS 864 * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
875 * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted 865 * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
876 */ 866 */
@@ -879,7 +869,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
879{ 869{
880 struct tipc_msg *hdr = buf_msg(skb_peek(list)); 870 struct tipc_msg *hdr = buf_msg(skb_peek(list));
881 unsigned int maxwin = l->window; 871 unsigned int maxwin = l->window;
882 unsigned int i, imp = msg_importance(hdr); 872 int imp = msg_importance(hdr);
883 unsigned int mtu = l->mtu; 873 unsigned int mtu = l->mtu;
884 u16 ack = l->rcv_nxt - 1; 874 u16 ack = l->rcv_nxt - 1;
885 u16 seqno = l->snd_nxt; 875 u16 seqno = l->snd_nxt;
@@ -888,19 +878,22 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
888 struct sk_buff_head *backlogq = &l->backlogq; 878 struct sk_buff_head *backlogq = &l->backlogq;
889 struct sk_buff *skb, *_skb, *bskb; 879 struct sk_buff *skb, *_skb, *bskb;
890 int pkt_cnt = skb_queue_len(list); 880 int pkt_cnt = skb_queue_len(list);
881 int rc = 0;
891 882
892 /* Match msg importance against this and all higher backlog limits: */
893 if (!skb_queue_empty(backlogq)) {
894 for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
895 if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
896 return link_schedule_user(l, list);
897 }
898 }
899 if (unlikely(msg_size(hdr) > mtu)) { 883 if (unlikely(msg_size(hdr) > mtu)) {
900 skb_queue_purge(list); 884 skb_queue_purge(list);
901 return -EMSGSIZE; 885 return -EMSGSIZE;
902 } 886 }
903 887
888 /* Allow oversubscription of one data msg per source at congestion */
889 if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
890 if (imp == TIPC_SYSTEM_IMPORTANCE) {
891 pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
892 return -ENOBUFS;
893 }
894 rc = link_schedule_user(l, hdr);
895 }
896
904 if (pkt_cnt > 1) { 897 if (pkt_cnt > 1) {
905 l->stats.sent_fragmented++; 898 l->stats.sent_fragmented++;
906 l->stats.sent_fragments += pkt_cnt; 899 l->stats.sent_fragments += pkt_cnt;
@@ -946,7 +939,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
946 skb_queue_splice_tail_init(list, backlogq); 939 skb_queue_splice_tail_init(list, backlogq);
947 } 940 }
948 l->snd_nxt = seqno; 941 l->snd_nxt = seqno;
949 return 0; 942 return rc;
950} 943}
951 944
952void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) 945void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
@@ -1043,11 +1036,17 @@ int tipc_link_retrans(struct tipc_link *l, u16 from, u16 to,
1043static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, 1036static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
1044 struct sk_buff_head *inputq) 1037 struct sk_buff_head *inputq)
1045{ 1038{
1046 switch (msg_user(buf_msg(skb))) { 1039 struct tipc_msg *hdr = buf_msg(skb);
1040
1041 switch (msg_user(hdr)) {
1047 case TIPC_LOW_IMPORTANCE: 1042 case TIPC_LOW_IMPORTANCE:
1048 case TIPC_MEDIUM_IMPORTANCE: 1043 case TIPC_MEDIUM_IMPORTANCE:
1049 case TIPC_HIGH_IMPORTANCE: 1044 case TIPC_HIGH_IMPORTANCE:
1050 case TIPC_CRITICAL_IMPORTANCE: 1045 case TIPC_CRITICAL_IMPORTANCE:
1046 if (unlikely(msg_type(hdr) == TIPC_MCAST_MSG)) {
1047 skb_queue_tail(l->bc_rcvlink->inputq, skb);
1048 return true;
1049 }
1051 case CONN_MANAGER: 1050 case CONN_MANAGER:
1052 skb_queue_tail(inputq, skb); 1051 skb_queue_tail(inputq, skb);
1053 return true; 1052 return true;
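
The reworked congestion handling in tipc_link_xmit()/link_schedule_user() replaces the old "reject the whole chain when the backlog is full" scheme with per-importance oversubscription: a sender whose level is full still gets the message accepted, but a SOCK_WAKEUP pseudo-message is queued and -ELINKCONG is returned so the socket can record the congested link. A small user-space model of that admission rule, with invented struct, limits and error stand-ins:

#include <stdio.h>

enum { LOW, MEDIUM, HIGH, CRITICAL, SYSTEM, IMP_LEVELS };

#define ELINKCONG 1             /* local stand-ins for the kernel codes */
#define ENOBUFS   2

struct mock_link {
    int backlog_len[IMP_LEVELS];
    int backlog_lim[IMP_LEVELS];
    int wakeups_pending;        /* stands in for the wakeupq */
};

/* Mirrors the new admission rule in tipc_link_xmit(): a full backlog no
 * longer rejects the message; the sender is oversubscribed by one message
 * and told (via -ELINKCONG) to wait for a SOCK_WAKEUP before sending more.
 * Only overflow at system importance is treated as a hard error.
 */
static int link_admit(struct mock_link *l, int imp)
{
    int rc = 0;

    if (l->backlog_len[imp] >= l->backlog_lim[imp]) {
        if (imp == SYSTEM)
            return -ENOBUFS;            /* link overflow, message dropped */
        l->wakeups_pending++;           /* schedule wakeup pseudo message */
        rc = -ELINKCONG;                /* caller counts the congested link */
    }
    l->backlog_len[imp]++;              /* message is accepted anyway */
    return rc;
}

int main(void)
{
    struct mock_link l = { .backlog_lim = { 2, 2, 2, 2, 2 } };
    int i;

    for (i = 0; i < 4; i++)
        printf("send %d -> rc %d, backlog %d\n",
               i, link_admit(&l, LOW), l.backlog_len[LOW]);
    return 0;
}
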
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index ab02d0742476..312ef7de57d7 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -607,6 +607,23 @@ error:
607 return false; 607 return false;
608} 608}
609 609
610bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
611 struct sk_buff_head *cpy)
612{
613 struct sk_buff *skb, *_skb;
614
615 skb_queue_walk(msg, skb) {
616 _skb = pskb_copy(skb, GFP_ATOMIC);
617 if (!_skb) {
618 __skb_queue_purge(cpy);
619 return false;
620 }
621 msg_set_destnode(buf_msg(_skb), dst);
622 __skb_queue_tail(cpy, _skb);
623 }
624 return true;
625}
626
610/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number 627/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number
611 * @list: list to be appended to 628 * @list: list to be appended to
612 * @seqno: sequence number of buffer to add 629 * @seqno: sequence number of buffer to add
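
tipc_msg_pskb_copy() duplicates an already-built buffer chain and stamps every copy with a new destination node; if any clone fails, the copies made so far are purged so the caller gets an all-or-nothing result. A stand-alone C sketch of the same pattern on a trivial buffer list (the types and helpers here are invented, not the skb API):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct buf {
    struct buf *next;
    unsigned int destnode;
    char data[32];
};

static void purge(struct buf **head)
{
    while (*head) {
        struct buf *b = *head;
        *head = b->next;
        free(b);
    }
}

/* Mirrors tipc_msg_pskb_copy(): duplicate every buffer in 'src' onto 'dst',
 * retargeting each copy to 'destnode'; on allocation failure the partial
 * copy list is purged and false is returned.
 */
static bool copy_chain(unsigned int destnode, struct buf *src, struct buf **dst)
{
    struct buf **tail = dst;

    for (; src; src = src->next) {
        struct buf *c = malloc(sizeof(*c));

        if (!c) {
            purge(dst);
            return false;
        }
        memcpy(c, src, sizeof(*c));
        c->next = NULL;
        c->destnode = destnode;
        *tail = c;
        tail = &c->next;
    }
    return true;
}

int main(void)
{
    struct buf b2 = { NULL, 0, "second" };
    struct buf b1 = { &b2, 0, "first" };
    struct buf *copy = NULL;

    if (copy_chain(0x1001, &b1, &copy))
        printf("copied, first dest=0x%x data=%s\n",
               copy->destnode, copy->data);
    purge(&copy);
    return 0;
}
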
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 2c3dc38abf9c..c843fd2bc48d 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -98,8 +98,6 @@ struct tipc_skb_cb {
98 u32 bytes_read; 98 u32 bytes_read;
99 struct sk_buff *tail; 99 struct sk_buff *tail;
100 bool validated; 100 bool validated;
101 bool wakeup_pending;
102 u16 chain_sz;
103 u16 chain_imp; 101 u16 chain_imp;
104 u16 ackers; 102 u16 ackers;
105}; 103};
@@ -633,14 +631,11 @@ static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id)
633 631
634static inline u32 msg_link_selector(struct tipc_msg *m) 632static inline u32 msg_link_selector(struct tipc_msg *m)
635{ 633{
634 if (msg_user(m) == MSG_FRAGMENTER)
635 m = (void *)msg_data(m);
636 return msg_bits(m, 4, 0, 1); 636 return msg_bits(m, 4, 0, 1);
637} 637}
638 638
639static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
640{
641 msg_set_bits(m, 4, 0, 1, n);
642}
643
644/* 639/*
645 * Word 5 640 * Word 5
646 */ 641 */
@@ -837,6 +832,8 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
837 int offset, int dsz, int mtu, struct sk_buff_head *list); 832 int offset, int dsz, int mtu, struct sk_buff_head *list);
838bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); 833bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
839bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); 834bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
835bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
836 struct sk_buff_head *cpy);
840void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, 837void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
841 struct sk_buff *skb); 838 struct sk_buff *skb);
842 839
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index e190460fe0d3..9be6592e4a6f 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -608,7 +608,7 @@ not_found:
608 * Returns non-zero if any off-node ports overlap 608 * Returns non-zero if any off-node ports overlap
609 */ 609 */
610int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, 610int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
611 u32 limit, struct tipc_plist *dports) 611 u32 limit, struct list_head *dports)
612{ 612{
613 struct name_seq *seq; 613 struct name_seq *seq;
614 struct sub_seq *sseq; 614 struct sub_seq *sseq;
@@ -633,7 +633,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
633 info = sseq->info; 633 info = sseq->info;
634 list_for_each_entry(publ, &info->node_list, node_list) { 634 list_for_each_entry(publ, &info->node_list, node_list) {
635 if (publ->scope <= limit) 635 if (publ->scope <= limit)
636 tipc_plist_push(dports, publ->ref); 636 u32_push(dports, publ->ref);
637 } 637 }
638 638
639 if (info->cluster_list_size != info->node_list_size) 639 if (info->cluster_list_size != info->node_list_size)
@@ -645,6 +645,39 @@ exit:
645 return res; 645 return res;
646} 646}
647 647
648/* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes
649 * - Creates list of nodes that overlap the given multicast address
650 * - Determines if any node local ports overlap
651 */
652void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
653 u32 upper, u32 domain,
654 struct tipc_nlist *nodes)
655{
656 struct sub_seq *sseq, *stop;
657 struct publication *publ;
658 struct name_info *info;
659 struct name_seq *seq;
660
661 rcu_read_lock();
662 seq = nametbl_find_seq(net, type);
663 if (!seq)
664 goto exit;
665
666 spin_lock_bh(&seq->lock);
667 sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
668 stop = seq->sseqs + seq->first_free;
669 for (; sseq->lower <= upper && sseq != stop; sseq++) {
670 info = sseq->info;
671 list_for_each_entry(publ, &info->zone_list, zone_list) {
672 if (tipc_in_scope(domain, publ->node))
673 tipc_nlist_add(nodes, publ->node);
674 }
675 }
676 spin_unlock_bh(&seq->lock);
677exit:
678 rcu_read_unlock();
679}
680
648/* 681/*
649 * tipc_nametbl_publish - add name publication to network name tables 682 * tipc_nametbl_publish - add name publication to network name tables
650 */ 683 */
@@ -1022,40 +1055,79 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
1022 return skb->len; 1055 return skb->len;
1023} 1056}
1024 1057
1025void tipc_plist_push(struct tipc_plist *pl, u32 port) 1058bool u32_find(struct list_head *l, u32 value)
1026{ 1059{
1027 struct tipc_plist *nl; 1060 struct u32_item *item;
1028 1061
1029 if (likely(!pl->port)) { 1062 list_for_each_entry(item, l, list) {
1030 pl->port = port; 1063 if (item->value == value)
1031 return; 1064 return true;
1032 } 1065 }
1033 if (pl->port == port) 1066 return false;
1034 return; 1067}
1035 list_for_each_entry(nl, &pl->list, list) { 1068
1036 if (nl->port == port) 1069bool u32_push(struct list_head *l, u32 value)
1037 return; 1070{
1071 struct u32_item *item;
1072
1073 list_for_each_entry(item, l, list) {
1074 if (item->value == value)
1075 return false;
1076 }
1077 item = kmalloc(sizeof(*item), GFP_ATOMIC);
1078 if (unlikely(!item))
1079 return false;
1080
1081 item->value = value;
1082 list_add(&item->list, l);
1083 return true;
1084}
1085
1086u32 u32_pop(struct list_head *l)
1087{
1088 struct u32_item *item;
1089 u32 value = 0;
1090
1091 if (list_empty(l))
1092 return 0;
1093 item = list_first_entry(l, typeof(*item), list);
1094 value = item->value;
1095 list_del(&item->list);
1096 kfree(item);
1097 return value;
1098}
1099
1100bool u32_del(struct list_head *l, u32 value)
1101{
1102 struct u32_item *item, *tmp;
1103
1104 list_for_each_entry_safe(item, tmp, l, list) {
1105 if (item->value != value)
1106 continue;
1107 list_del(&item->list);
1108 kfree(item);
1109 return true;
1038 } 1110 }
1039 nl = kmalloc(sizeof(*nl), GFP_ATOMIC); 1111 return false;
1040 if (nl) { 1112}
1041 nl->port = port; 1113
1042 list_add(&nl->list, &pl->list); 1114void u32_list_purge(struct list_head *l)
1115{
1116 struct u32_item *item, *tmp;
1117
1118 list_for_each_entry_safe(item, tmp, l, list) {
1119 list_del(&item->list);
1120 kfree(item);
1043 } 1121 }
1044} 1122}
1045 1123
1046u32 tipc_plist_pop(struct tipc_plist *pl) 1124int u32_list_len(struct list_head *l)
1047{ 1125{
1048 struct tipc_plist *nl; 1126 struct u32_item *item;
1049 u32 port = 0; 1127 int i = 0;
1050 1128
1051 if (likely(list_empty(&pl->list))) { 1129 list_for_each_entry(item, l, list) {
1052 port = pl->port; 1130 i++;
1053 pl->port = 0;
1054 return port;
1055 } 1131 }
1056 nl = list_first_entry(&pl->list, typeof(*nl), list); 1132 return i;
1057 port = nl->port;
1058 list_del(&nl->list);
1059 kfree(nl);
1060 return port;
1061} 1133}
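
The tipc_plist port list is replaced by generic u32_item lists, which the socket layer also reuses to track congested destination nodes. A user-space model of the same helpers on a plain singly linked list; the behaviour matches the kernel versions (push de-duplicates, pop removes the most recently pushed entry), but the list type and allocation are simplified for illustration:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct u32_item {
    struct u32_item *next;
    unsigned int value;
};

/* Mirrors u32_find(): true if 'value' is already on the list */
static bool u32_find(struct u32_item *l, unsigned int value)
{
    for (; l; l = l->next)
        if (l->value == value)
            return true;
    return false;
}

/* Mirrors u32_push(): add 'value' unless it is already present */
static bool u32_push(struct u32_item **l, unsigned int value)
{
    struct u32_item *item;

    if (u32_find(*l, value))
        return false;
    item = malloc(sizeof(*item));
    if (!item)
        return false;
    item->value = value;
    item->next = *l;
    *l = item;
    return true;
}

/* Mirrors u32_pop(): remove and return the head, or 0 if the list is empty */
static unsigned int u32_pop(struct u32_item **l)
{
    struct u32_item *item = *l;
    unsigned int value;

    if (!item)
        return 0;
    value = item->value;
    *l = item->next;
    free(item);
    return value;
}

int main(void)
{
    struct u32_item *dports = NULL;
    unsigned int port;

    u32_push(&dports, 4711);
    u32_push(&dports, 4712);
    u32_push(&dports, 4711);            /* duplicate, ignored */

    while ((port = u32_pop(&dports)))
        printf("deliver to port %u\n", port);
    return 0;
}
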
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 1524a73830f7..6ebdeb1d84a5 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -39,6 +39,7 @@
39 39
40struct tipc_subscription; 40struct tipc_subscription;
41struct tipc_plist; 41struct tipc_plist;
42struct tipc_nlist;
42 43
43/* 44/*
44 * TIPC name types reserved for internal TIPC use (both current and planned) 45 * TIPC name types reserved for internal TIPC use (both current and planned)
@@ -99,7 +100,10 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
99 100
100u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); 101u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
101int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, 102int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
102 u32 limit, struct tipc_plist *dports); 103 u32 limit, struct list_head *dports);
104void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
105 u32 upper, u32 domain,
106 struct tipc_nlist *nodes);
103struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, 107struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
104 u32 upper, u32 scope, u32 port_ref, 108 u32 upper, u32 scope, u32 port_ref,
105 u32 key); 109 u32 key);
@@ -116,18 +120,16 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
116int tipc_nametbl_init(struct net *net); 120int tipc_nametbl_init(struct net *net);
117void tipc_nametbl_stop(struct net *net); 121void tipc_nametbl_stop(struct net *net);
118 122
119struct tipc_plist { 123struct u32_item {
120 struct list_head list; 124 struct list_head list;
121 u32 port; 125 u32 value;
122}; 126};
123 127
124static inline void tipc_plist_init(struct tipc_plist *pl) 128bool u32_push(struct list_head *l, u32 value);
125{ 129u32 u32_pop(struct list_head *l);
126 INIT_LIST_HEAD(&pl->list); 130bool u32_find(struct list_head *l, u32 value);
127 pl->port = 0; 131bool u32_del(struct list_head *l, u32 value);
128} 132void u32_list_purge(struct list_head *l);
129 133int u32_list_len(struct list_head *l);
130void tipc_plist_push(struct tipc_plist *pl, u32 port);
131u32 tipc_plist_pop(struct tipc_plist *pl);
132 134
133#endif 135#endif
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 28bf4feeb81c..ab8a2d5d1e32 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -110,6 +110,10 @@ int tipc_net_start(struct net *net, u32 addr)
110 char addr_string[16]; 110 char addr_string[16];
111 111
112 tn->own_addr = addr; 112 tn->own_addr = addr;
113
114 /* Ensure that the new address is visible before we reinit. */
115 smp_mb();
116
113 tipc_named_reinit(net); 117 tipc_named_reinit(net);
114 tipc_sk_reinit(net); 118 tipc_sk_reinit(net);
115 119
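
One plausible reading of the smp_mb() pairing: tipc_net_start() stores the new address and then walks the existing sockets, while tipc_sk_create() publishes the new socket and then reads own_addr, so at least one side must observe the other's store and no socket can miss both the re-init walk and the new address. A user-space sketch of that store-buffering pattern using C11 fences; this models the ordering argument only, not the kernel code:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int own_addr;     /* set by the "tipc_net_start" side */
static atomic_int sk_published; /* set by the "tipc_sk_create" side */
static int walker_saw_sk, creator_saw_addr;

static void *net_start(void *arg)
{
    atomic_store_explicit(&own_addr, 1, memory_order_relaxed);
    atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
    walker_saw_sk = atomic_load_explicit(&sk_published, memory_order_relaxed);
    return NULL;
}

static void *sk_create(void *arg)
{
    atomic_store_explicit(&sk_published, 1, memory_order_relaxed);
    atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
    creator_saw_addr = atomic_load_explicit(&own_addr, memory_order_relaxed);
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, net_start, NULL);
    pthread_create(&b, NULL, sk_create, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);

    /* With both fences in place, 0/0 is a forbidden outcome */
    printf("walker_saw_sk=%d creator_saw_addr=%d\n",
           walker_saw_sk, creator_saw_addr);
    return 0;
}
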
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 27753325e06e..4512e83652b1 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1172,7 +1172,7 @@ msg_full:
1172 * @list: chain of buffers containing message 1172 * @list: chain of buffers containing message
1173 * @dnode: address of destination node 1173 * @dnode: address of destination node
1174 * @selector: a number used for deterministic link selection 1174 * @selector: a number used for deterministic link selection
1175 * Consumes the buffer chain, except when returning -ELINKCONG 1175 * Consumes the buffer chain.
1176 * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUFS 1176 * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUFS
1177 */ 1177 */
1178int tipc_node_xmit(struct net *net, struct sk_buff_head *list, 1178int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
@@ -1211,10 +1211,10 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
1211 spin_unlock_bh(&le->lock); 1211 spin_unlock_bh(&le->lock);
1212 tipc_node_read_unlock(n); 1212 tipc_node_read_unlock(n);
1213 1213
1214 if (likely(rc == 0)) 1214 if (unlikely(rc == -ENOBUFS))
1215 tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
1216 else if (rc == -ENOBUFS)
1217 tipc_node_link_down(n, bearer_id, false); 1215 tipc_node_link_down(n, bearer_id, false);
1216 else
1217 tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
1218 1218
1219 tipc_node_put(n); 1219 tipc_node_put(n);
1220 1220
@@ -1226,20 +1226,15 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
1226 * messages, which will not be rejected 1226 * messages, which will not be rejected
1227 * The only exception is datagram messages rerouted after secondary 1227 * The only exception is datagram messages rerouted after secondary
1228 * lookup, which are rare and safe to dispose of anyway. 1228 * lookup, which are rare and safe to dispose of anyway.
1229 * TODO: Return real return value, and let callers use
1230 * tipc_wait_for_sendpkt() where applicable
1231 */ 1229 */
1232int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, 1230int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
1233 u32 selector) 1231 u32 selector)
1234{ 1232{
1235 struct sk_buff_head head; 1233 struct sk_buff_head head;
1236 int rc;
1237 1234
1238 skb_queue_head_init(&head); 1235 skb_queue_head_init(&head);
1239 __skb_queue_tail(&head, skb); 1236 __skb_queue_tail(&head, skb);
1240 rc = tipc_node_xmit(net, &head, dnode, selector); 1237 tipc_node_xmit(net, &head, dnode, selector);
1241 if (rc == -ELINKCONG)
1242 kfree_skb(skb);
1243 return 0; 1238 return 0;
1244} 1239}
1245 1240
@@ -1267,6 +1262,19 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb)
1267 kfree_skb(skb); 1262 kfree_skb(skb);
1268} 1263}
1269 1264
1265static void tipc_node_mcast_rcv(struct tipc_node *n)
1266{
1267 struct tipc_bclink_entry *be = &n->bc_entry;
1268
1269 /* 'arrvq' is under inputq2's lock protection */
1270 spin_lock_bh(&be->inputq2.lock);
1271 spin_lock_bh(&be->inputq1.lock);
1272 skb_queue_splice_tail_init(&be->inputq1, &be->arrvq);
1273 spin_unlock_bh(&be->inputq1.lock);
1274 spin_unlock_bh(&be->inputq2.lock);
1275 tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2);
1276}
1277
1270static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, 1278static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr,
1271 int bearer_id, struct sk_buff_head *xmitq) 1279 int bearer_id, struct sk_buff_head *xmitq)
1272{ 1280{
@@ -1340,15 +1348,8 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
1340 if (!skb_queue_empty(&xmitq)) 1348 if (!skb_queue_empty(&xmitq))
1341 tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); 1349 tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
1342 1350
1343 /* Deliver. 'arrvq' is under inputq2's lock protection */ 1351 if (!skb_queue_empty(&be->inputq1))
1344 if (!skb_queue_empty(&be->inputq1)) { 1352 tipc_node_mcast_rcv(n);
1345 spin_lock_bh(&be->inputq2.lock);
1346 spin_lock_bh(&be->inputq1.lock);
1347 skb_queue_splice_tail_init(&be->inputq1, &be->arrvq);
1348 spin_unlock_bh(&be->inputq1.lock);
1349 spin_unlock_bh(&be->inputq2.lock);
1350 tipc_sk_mcast_rcv(net, &be->arrvq, &be->inputq2);
1351 }
1352 1353
1353 if (rc & TIPC_LINK_DOWN_EVT) { 1354 if (rc & TIPC_LINK_DOWN_EVT) {
1354 /* Reception reassembly failure => reset all links to peer */ 1355 /* Reception reassembly failure => reset all links to peer */
@@ -1504,19 +1505,21 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
1504{ 1505{
1505 struct sk_buff_head xmitq; 1506 struct sk_buff_head xmitq;
1506 struct tipc_node *n; 1507 struct tipc_node *n;
1507 struct tipc_msg *hdr = buf_msg(skb); 1508 struct tipc_msg *hdr;
1508 int usr = msg_user(hdr);
1509 int bearer_id = b->identity; 1509 int bearer_id = b->identity;
1510 struct tipc_link_entry *le; 1510 struct tipc_link_entry *le;
1511 u16 bc_ack = msg_bcast_ack(hdr);
1512 u32 self = tipc_own_addr(net); 1511 u32 self = tipc_own_addr(net);
1513 int rc = 0; 1512 int usr, rc = 0;
1513 u16 bc_ack;
1514 1514
1515 __skb_queue_head_init(&xmitq); 1515 __skb_queue_head_init(&xmitq);
1516 1516
1517 /* Ensure message is well-formed */ 1517 /* Ensure message is well-formed before touching the header */
1518 if (unlikely(!tipc_msg_validate(skb))) 1518 if (unlikely(!tipc_msg_validate(skb)))
1519 goto discard; 1519 goto discard;
1520 hdr = buf_msg(skb);
1521 usr = msg_user(hdr);
1522 bc_ack = msg_bcast_ack(hdr);
1520 1523
1521 /* Handle arrival of discovery or broadcast packet */ 1524 /* Handle arrival of discovery or broadcast packet */
1522 if (unlikely(msg_non_seq(hdr))) { 1525 if (unlikely(msg_non_seq(hdr))) {
@@ -1575,6 +1578,9 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
1575 if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) 1578 if (unlikely(!skb_queue_empty(&n->bc_entry.namedq)))
1576 tipc_named_rcv(net, &n->bc_entry.namedq); 1579 tipc_named_rcv(net, &n->bc_entry.namedq);
1577 1580
1581 if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1)))
1582 tipc_node_mcast_rcv(n);
1583
1578 if (!skb_queue_empty(&le->inputq)) 1584 if (!skb_queue_empty(&le->inputq))
1579 tipc_sk_rcv(net, &le->inputq); 1585 tipc_sk_rcv(net, &le->inputq);
1580 1586
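
The tipc_rcv() change above moves every header read (msg_user, msg_bcast_ack) behind the tipc_msg_validate() check, so no field is trusted before the buffer is known to be well formed. A tiny sketch of that validate-before-parse pattern on a raw buffer; the header layout is invented for the example:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct hdr {
    uint32_t size;      /* total message size, first header word */
    uint32_t user;
};

/* Reject the buffer before any header field is trusted, mirroring the
 * reordering in tipc_rcv(): validate first, only then parse.
 */
static bool msg_validate(const void *buf, size_t len)
{
    const struct hdr *h = buf;

    if (len < sizeof(*h))
        return false;               /* too short to hold a header */
    if (h->size < sizeof(*h) || h->size > len)
        return false;               /* claimed size is inconsistent */
    return true;
}

static void rcv(const void *buf, size_t len)
{
    const struct hdr *h;

    if (!msg_validate(buf, len)) {
        printf("discard: malformed\n");
        return;
    }
    h = buf;                        /* safe to read only after validation */
    printf("deliver: user=%u size=%u\n", h->user, h->size);
}

int main(void)
{
    struct hdr good = { sizeof(struct hdr), 7 };

    rcv(&good, sizeof(good));
    rcv(&good, 2);                  /* truncated: rejected before parsing */
    return 0;
}
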
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 39ef54c1f2ad..898c22916984 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -47,11 +47,13 @@
47enum { 47enum {
48 TIPC_BCAST_SYNCH = (1 << 1), 48 TIPC_BCAST_SYNCH = (1 << 1),
49 TIPC_BCAST_STATE_NACK = (1 << 2), 49 TIPC_BCAST_STATE_NACK = (1 << 2),
50 TIPC_BLOCK_FLOWCTL = (1 << 3) 50 TIPC_BLOCK_FLOWCTL = (1 << 3),
51 TIPC_BCAST_RCAST = (1 << 4)
51}; 52};
52 53
53#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ 54#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
54 TIPC_BCAST_STATE_NACK | \ 55 TIPC_BCAST_STATE_NACK | \
56 TIPC_BCAST_RCAST | \
55 TIPC_BLOCK_FLOWCTL) 57 TIPC_BLOCK_FLOWCTL)
56#define INVALID_BEARER_ID -1 58#define INVALID_BEARER_ID -1
57 59
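
TIPC_BCAST_RCAST is a new capability bit advertised via TIPC_NODE_CAPABILITIES, and the link.c hunk above disables replicast cluster-wide as soon as one peer lacks it. A tiny sketch of that negotiation over capability bitmaps; the peer table is made up for the example:

#include <stdbool.h>
#include <stdio.h>

#define TIPC_BCAST_SYNCH       (1 << 1)
#define TIPC_BCAST_STATE_NACK  (1 << 2)
#define TIPC_BLOCK_FLOWCTL     (1 << 3)
#define TIPC_BCAST_RCAST       (1 << 4)

/* Mirrors the check added to tipc_link_bc_create(): replicast may only be
 * used while every known peer advertises TIPC_BCAST_RCAST; a single legacy
 * peer forces the whole cluster back to true broadcast.
 */
static bool cluster_supports_rcast(const unsigned int *peer_caps, int n)
{
    int i;

    for (i = 0; i < n; i++)
        if (!(peer_caps[i] & TIPC_BCAST_RCAST))
            return false;
    return true;
}

int main(void)
{
    unsigned int all_new[] = {
        TIPC_BCAST_SYNCH | TIPC_BCAST_RCAST,
        TIPC_BCAST_SYNCH | TIPC_BCAST_RCAST,
    };
    unsigned int one_old[] = {
        TIPC_BCAST_SYNCH | TIPC_BCAST_RCAST,
        TIPC_BCAST_SYNCH,                     /* peer without the bit */
    };

    printf("all new peers: rcast %s\n",
           cluster_supports_rcast(all_new, 2) ? "allowed" : "disabled");
    printf("one old peer:  rcast %s\n",
           cluster_supports_rcast(one_old, 2) ? "allowed" : "disabled");
    return 0;
}
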
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 800caaa699a1..7130e73bd42c 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -35,6 +35,8 @@
35 */ 35 */
36 36
37#include <linux/rhashtable.h> 37#include <linux/rhashtable.h>
38#include <linux/sched/signal.h>
39
38#include "core.h" 40#include "core.h"
39#include "name_table.h" 41#include "name_table.h"
40#include "node.h" 42#include "node.h"
@@ -67,16 +69,19 @@ enum {
67 * @max_pkt: maximum packet size "hint" used when building messages sent by port 69 * @max_pkt: maximum packet size "hint" used when building messages sent by port
68 * @portid: unique port identity in TIPC socket hash table 70 * @portid: unique port identity in TIPC socket hash table
69 * @phdr: preformatted message header used when sending messages 71 * @phdr: preformatted message header used when sending messages
 72 * @cong_links: list of congested links
70 * @publications: list of publications for port 73 * @publications: list of publications for port
74 * @blocking_link: address of the congested link we are currently sleeping on
71 * @pub_count: total # of publications port has made during its lifetime 75 * @pub_count: total # of publications port has made during its lifetime
72 * @probing_state: 76 * @probing_state:
73 * @conn_timeout: the time we can wait for an unresponded setup request 77 * @conn_timeout: the time we can wait for an unresponded setup request
74 * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue 78 * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
75 * @link_cong: non-zero if owner must sleep because of link congestion 79 * @cong_link_cnt: number of congested links
76 * @sent_unacked: # messages sent by socket, and not yet acked by peer 80 * @sent_unacked: # messages sent by socket, and not yet acked by peer
77 * @rcv_unacked: # messages read by user, but not yet acked back to peer 81 * @rcv_unacked: # messages read by user, but not yet acked back to peer
78 * @peer: 'connected' peer for dgram/rdm 82 * @peer: 'connected' peer for dgram/rdm
79 * @node: hash table node 83 * @node: hash table node
84 * @mc_method: cookie for use between socket and broadcast layer
80 * @rcu: rcu struct for tipc_sock 85 * @rcu: rcu struct for tipc_sock
81 */ 86 */
82struct tipc_sock { 87struct tipc_sock {
@@ -87,13 +92,13 @@ struct tipc_sock {
87 u32 max_pkt; 92 u32 max_pkt;
88 u32 portid; 93 u32 portid;
89 struct tipc_msg phdr; 94 struct tipc_msg phdr;
90 struct list_head sock_list; 95 struct list_head cong_links;
91 struct list_head publications; 96 struct list_head publications;
92 u32 pub_count; 97 u32 pub_count;
93 uint conn_timeout; 98 uint conn_timeout;
94 atomic_t dupl_rcvcnt; 99 atomic_t dupl_rcvcnt;
95 bool probe_unacked; 100 bool probe_unacked;
96 bool link_cong; 101 u16 cong_link_cnt;
97 u16 snt_unacked; 102 u16 snt_unacked;
98 u16 snd_win; 103 u16 snd_win;
99 u16 peer_caps; 104 u16 peer_caps;
@@ -101,6 +106,7 @@ struct tipc_sock {
101 u16 rcv_win; 106 u16 rcv_win;
102 struct sockaddr_tipc peer; 107 struct sockaddr_tipc peer;
103 struct rhash_head node; 108 struct rhash_head node;
109 struct tipc_mc_method mc_method;
104 struct rcu_head rcu; 110 struct rcu_head rcu;
105}; 111};
106 112
@@ -109,8 +115,8 @@ static void tipc_data_ready(struct sock *sk);
109static void tipc_write_space(struct sock *sk); 115static void tipc_write_space(struct sock *sk);
110static void tipc_sock_destruct(struct sock *sk); 116static void tipc_sock_destruct(struct sock *sk);
111static int tipc_release(struct socket *sock); 117static int tipc_release(struct socket *sock);
112static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); 118static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
113static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); 119 bool kern);
114static void tipc_sk_timeout(unsigned long data); 120static void tipc_sk_timeout(unsigned long data);
115static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, 121static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
116 struct tipc_name_seq const *seq); 122 struct tipc_name_seq const *seq);
@@ -119,8 +125,7 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
119static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); 125static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid);
120static int tipc_sk_insert(struct tipc_sock *tsk); 126static int tipc_sk_insert(struct tipc_sock *tsk);
121static void tipc_sk_remove(struct tipc_sock *tsk); 127static void tipc_sk_remove(struct tipc_sock *tsk);
122static int __tipc_send_stream(struct socket *sock, struct msghdr *m, 128static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz);
123 size_t dsz);
124static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); 129static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz);
125 130
126static const struct proto_ops packet_ops; 131static const struct proto_ops packet_ops;
@@ -334,6 +339,49 @@ static int tipc_set_sk_state(struct sock *sk, int state)
334 return res; 339 return res;
335} 340}
336 341
342static int tipc_sk_sock_err(struct socket *sock, long *timeout)
343{
344 struct sock *sk = sock->sk;
345 int err = sock_error(sk);
346 int typ = sock->type;
347
348 if (err)
349 return err;
350 if (typ == SOCK_STREAM || typ == SOCK_SEQPACKET) {
351 if (sk->sk_state == TIPC_DISCONNECTING)
352 return -EPIPE;
353 else if (!tipc_sk_connected(sk))
354 return -ENOTCONN;
355 }
356 if (!*timeout)
357 return -EAGAIN;
358 if (signal_pending(current))
359 return sock_intr_errno(*timeout);
360
361 return 0;
362}
363
364#define tipc_wait_for_cond(sock_, timeout_, condition_) \
365({ \
366 int rc_ = 0; \
367 int done_ = 0; \
368 \
369 while (!(condition_) && !done_) { \
370 struct sock *sk_ = sock->sk; \
371 DEFINE_WAIT_FUNC(wait_, woken_wake_function); \
372 \
373 rc_ = tipc_sk_sock_err(sock_, timeout_); \
374 if (rc_) \
375 break; \
376 prepare_to_wait(sk_sleep(sk_), &wait_, \
377 TASK_INTERRUPTIBLE); \
378 done_ = sk_wait_event(sk_, timeout_, \
379 (condition_), &wait_); \
380 remove_wait_queue(sk_sleep(sk_), &wait_); \
381 } \
382 rc_; \
383})
384
337/** 385/**
338 * tipc_sk_create - create a TIPC socket 386 * tipc_sk_create - create a TIPC socket
339 * @net: network namespace (must be default network) 387 * @net: network namespace (must be default network)
@@ -382,10 +430,9 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
382 tsk = tipc_sk(sk); 430 tsk = tipc_sk(sk);
383 tsk->max_pkt = MAX_PKT_DEFAULT; 431 tsk->max_pkt = MAX_PKT_DEFAULT;
384 INIT_LIST_HEAD(&tsk->publications); 432 INIT_LIST_HEAD(&tsk->publications);
433 INIT_LIST_HEAD(&tsk->cong_links);
385 msg = &tsk->phdr; 434 msg = &tsk->phdr;
386 tn = net_generic(sock_net(sk), tipc_net_id); 435 tn = net_generic(sock_net(sk), tipc_net_id);
387 tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
388 NAMED_H_SIZE, 0);
389 436
390 /* Finish initializing socket data structures */ 437 /* Finish initializing socket data structures */
391 sock->ops = ops; 438 sock->ops = ops;
@@ -395,6 +442,13 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
395 pr_warn("Socket create failed; port number exhausted\n"); 442 pr_warn("Socket create failed; port number exhausted\n");
396 return -EINVAL; 443 return -EINVAL;
397 } 444 }
445
446 /* Ensure tsk is visible before we read own_addr. */
447 smp_mb();
448
449 tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
450 NAMED_H_SIZE, 0);
451
398 msg_set_origport(msg, tsk->portid); 452 msg_set_origport(msg, tsk->portid);
399 setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); 453 setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk);
400 sk->sk_shutdown = 0; 454 sk->sk_shutdown = 0;
@@ -432,9 +486,14 @@ static void __tipc_shutdown(struct socket *sock, int error)
432 struct sock *sk = sock->sk; 486 struct sock *sk = sock->sk;
433 struct tipc_sock *tsk = tipc_sk(sk); 487 struct tipc_sock *tsk = tipc_sk(sk);
434 struct net *net = sock_net(sk); 488 struct net *net = sock_net(sk);
489 long timeout = CONN_TIMEOUT_DEFAULT;
435 u32 dnode = tsk_peer_node(tsk); 490 u32 dnode = tsk_peer_node(tsk);
436 struct sk_buff *skb; 491 struct sk_buff *skb;
437 492
 493 /* Don't let hi-prio shutdown msgs bypass msgs in link wakeup queue */
494 tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt &&
495 !tsk_conn_cong(tsk)));
496
438 /* Reject all unreceived messages, except on an active connection 497 /* Reject all unreceived messages, except on an active connection
439 * (which disconnects locally & sends a 'FIN+' to peer). 498 * (which disconnects locally & sends a 'FIN+' to peer).
440 */ 499 */
@@ -505,7 +564,8 @@ static int tipc_release(struct socket *sock)
505 564
506 /* Reject any messages that accumulated in backlog queue */ 565 /* Reject any messages that accumulated in backlog queue */
507 release_sock(sk); 566 release_sock(sk);
508 567 u32_list_purge(&tsk->cong_links);
568 tsk->cong_link_cnt = 0;
509 call_rcu(&tsk->rcu, tipc_sk_callback); 569 call_rcu(&tsk->rcu, tipc_sk_callback);
510 sock->sk = NULL; 570 sock->sk = NULL;
511 571
@@ -648,7 +708,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
648 708
649 switch (sk->sk_state) { 709 switch (sk->sk_state) {
650 case TIPC_ESTABLISHED: 710 case TIPC_ESTABLISHED:
651 if (!tsk->link_cong && !tsk_conn_cong(tsk)) 711 if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
652 mask |= POLLOUT; 712 mask |= POLLOUT;
653 /* fall thru' */ 713 /* fall thru' */
654 case TIPC_LISTEN: 714 case TIPC_LISTEN:
@@ -657,7 +717,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
657 mask |= (POLLIN | POLLRDNORM); 717 mask |= (POLLIN | POLLRDNORM);
658 break; 718 break;
659 case TIPC_OPEN: 719 case TIPC_OPEN:
660 if (!tsk->link_cong) 720 if (!tsk->cong_link_cnt)
661 mask |= POLLOUT; 721 mask |= POLLOUT;
662 if (tipc_sk_type_connectionless(sk) && 722 if (tipc_sk_type_connectionless(sk) &&
663 (!skb_queue_empty(&sk->sk_receive_queue))) 723 (!skb_queue_empty(&sk->sk_receive_queue)))
@@ -676,63 +736,60 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
676 * @sock: socket structure 736 * @sock: socket structure
677 * @seq: destination address 737 * @seq: destination address
678 * @msg: message to send 738 * @msg: message to send
679 * @dsz: total length of message data 739 * @dlen: length of data to send
680 * @timeo: timeout to wait for wakeup 740 * @timeout: timeout to wait for wakeup
681 * 741 *
682 * Called from function tipc_sendmsg(), which has done all sanity checks 742 * Called from function tipc_sendmsg(), which has done all sanity checks
683 * Returns the number of bytes sent on success, or errno 743 * Returns the number of bytes sent on success, or errno
684 */ 744 */
685static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, 745static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
686 struct msghdr *msg, size_t dsz, long timeo) 746 struct msghdr *msg, size_t dlen, long timeout)
687{ 747{
688 struct sock *sk = sock->sk; 748 struct sock *sk = sock->sk;
689 struct tipc_sock *tsk = tipc_sk(sk); 749 struct tipc_sock *tsk = tipc_sk(sk);
750 struct tipc_msg *hdr = &tsk->phdr;
690 struct net *net = sock_net(sk); 751 struct net *net = sock_net(sk);
691 struct tipc_msg *mhdr = &tsk->phdr; 752 int mtu = tipc_bcast_get_mtu(net);
692 struct sk_buff_head pktchain; 753 struct tipc_mc_method *method = &tsk->mc_method;
693 struct iov_iter save = msg->msg_iter; 754 u32 domain = addr_domain(net, TIPC_CLUSTER_SCOPE);
694 uint mtu; 755 struct sk_buff_head pkts;
756 struct tipc_nlist dsts;
695 int rc; 757 int rc;
696 758
697 if (!timeo && tsk->link_cong) 759 /* Block or return if any destination link is congested */
698 return -ELINKCONG; 760 rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt);
761 if (unlikely(rc))
762 return rc;
699 763
700 msg_set_type(mhdr, TIPC_MCAST_MSG); 764 /* Lookup destination nodes */
701 msg_set_lookup_scope(mhdr, TIPC_CLUSTER_SCOPE); 765 tipc_nlist_init(&dsts, tipc_own_addr(net));
702 msg_set_destport(mhdr, 0); 766 tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower,
703 msg_set_destnode(mhdr, 0); 767 seq->upper, domain, &dsts);
704 msg_set_nametype(mhdr, seq->type); 768 if (!dsts.local && !dsts.remote)
705 msg_set_namelower(mhdr, seq->lower); 769 return -EHOSTUNREACH;
706 msg_set_nameupper(mhdr, seq->upper);
707 msg_set_hdr_sz(mhdr, MCAST_H_SIZE);
708 770
709 skb_queue_head_init(&pktchain); 771 /* Build message header */
772 msg_set_type(hdr, TIPC_MCAST_MSG);
773 msg_set_hdr_sz(hdr, MCAST_H_SIZE);
774 msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE);
775 msg_set_destport(hdr, 0);
776 msg_set_destnode(hdr, 0);
777 msg_set_nametype(hdr, seq->type);
778 msg_set_namelower(hdr, seq->lower);
779 msg_set_nameupper(hdr, seq->upper);
710 780
711new_mtu: 781 /* Build message as chain of buffers */
712 mtu = tipc_bcast_get_mtu(net); 782 skb_queue_head_init(&pkts);
713 rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain); 783 rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts);
714 if (unlikely(rc < 0))
715 return rc;
716 784
717 do { 785 /* Send message if build was successful */
718 rc = tipc_bcast_xmit(net, &pktchain); 786 if (unlikely(rc == dlen))
719 if (likely(!rc)) 787 rc = tipc_mcast_xmit(net, &pkts, method, &dsts,
720 return dsz; 788 &tsk->cong_link_cnt);
721 789
722 if (rc == -ELINKCONG) { 790 tipc_nlist_purge(&dsts);
723 tsk->link_cong = 1; 791
724 rc = tipc_wait_for_sndmsg(sock, &timeo); 792 return rc ? rc : dlen;
725 if (!rc)
726 continue;
727 }
728 __skb_queue_purge(&pktchain);
729 if (rc == -EMSGSIZE) {
730 msg->msg_iter = save;
731 goto new_mtu;
732 }
733 break;
734 } while (1);
735 return rc;
736} 793}
737 794
738/** 795/**
@@ -746,7 +803,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
746 struct sk_buff_head *inputq) 803 struct sk_buff_head *inputq)
747{ 804{
748 struct tipc_msg *msg; 805 struct tipc_msg *msg;
749 struct tipc_plist dports; 806 struct list_head dports;
750 u32 portid; 807 u32 portid;
751 u32 scope = TIPC_CLUSTER_SCOPE; 808 u32 scope = TIPC_CLUSTER_SCOPE;
752 struct sk_buff_head tmpq; 809 struct sk_buff_head tmpq;
@@ -754,7 +811,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
754 struct sk_buff *skb, *_skb; 811 struct sk_buff *skb, *_skb;
755 812
756 __skb_queue_head_init(&tmpq); 813 __skb_queue_head_init(&tmpq);
757 tipc_plist_init(&dports); 814 INIT_LIST_HEAD(&dports);
758 815
759 skb = tipc_skb_peek(arrvq, &inputq->lock); 816 skb = tipc_skb_peek(arrvq, &inputq->lock);
760 for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { 817 for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) {
@@ -768,8 +825,8 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
768 tipc_nametbl_mc_translate(net, 825 tipc_nametbl_mc_translate(net,
769 msg_nametype(msg), msg_namelower(msg), 826 msg_nametype(msg), msg_namelower(msg),
770 msg_nameupper(msg), scope, &dports); 827 msg_nameupper(msg), scope, &dports);
771 portid = tipc_plist_pop(&dports); 828 portid = u32_pop(&dports);
772 for (; portid; portid = tipc_plist_pop(&dports)) { 829 for (; portid; portid = u32_pop(&dports)) {
773 _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); 830 _skb = __pskb_copy(skb, hsz, GFP_ATOMIC);
774 if (_skb) { 831 if (_skb) {
775 msg_set_destport(buf_msg(_skb), portid); 832 msg_set_destport(buf_msg(_skb), portid);
@@ -830,31 +887,6 @@ exit:
830 kfree_skb(skb); 887 kfree_skb(skb);
831} 888}
832 889
833static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p)
834{
835 DEFINE_WAIT_FUNC(wait, woken_wake_function);
836 struct sock *sk = sock->sk;
837 struct tipc_sock *tsk = tipc_sk(sk);
838 int done;
839
840 do {
841 int err = sock_error(sk);
842 if (err)
843 return err;
844 if (sk->sk_shutdown & SEND_SHUTDOWN)
845 return -EPIPE;
846 if (!*timeo_p)
847 return -EAGAIN;
848 if (signal_pending(current))
849 return sock_intr_errno(*timeo_p);
850
851 add_wait_queue(sk_sleep(sk), &wait);
852 done = sk_wait_event(sk, timeo_p, !tsk->link_cong, &wait);
853 remove_wait_queue(sk_sleep(sk), &wait);
854 } while (!done);
855 return 0;
856}
857
858/** 890/**
859 * tipc_sendmsg - send message in connectionless manner 891 * tipc_sendmsg - send message in connectionless manner
860 * @sock: socket structure 892 * @sock: socket structure
@@ -881,35 +913,38 @@ static int tipc_sendmsg(struct socket *sock,
881 return ret; 913 return ret;
882} 914}
883 915
884static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) 916static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
885{ 917{
886 DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
887 struct sock *sk = sock->sk; 918 struct sock *sk = sock->sk;
888 struct tipc_sock *tsk = tipc_sk(sk);
889 struct net *net = sock_net(sk); 919 struct net *net = sock_net(sk);
890 struct tipc_msg *mhdr = &tsk->phdr; 920 struct tipc_sock *tsk = tipc_sk(sk);
891 u32 dnode, dport; 921 DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
892 struct sk_buff_head pktchain; 922 long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
893 bool is_connectionless = tipc_sk_type_connectionless(sk); 923 struct list_head *clinks = &tsk->cong_links;
894 struct sk_buff *skb; 924 bool syn = !tipc_sk_type_connectionless(sk);
925 struct tipc_msg *hdr = &tsk->phdr;
895 struct tipc_name_seq *seq; 926 struct tipc_name_seq *seq;
896 struct iov_iter save; 927 struct sk_buff_head pkts;
897 u32 mtu; 928 u32 type, inst, domain;
898 long timeo; 929 u32 dnode, dport;
899 int rc; 930 int mtu, rc;
900 931
901 if (dsz > TIPC_MAX_USER_MSG_SIZE) 932 if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
902 return -EMSGSIZE; 933 return -EMSGSIZE;
934
903 if (unlikely(!dest)) { 935 if (unlikely(!dest)) {
904 if (is_connectionless && tsk->peer.family == AF_TIPC) 936 dest = &tsk->peer;
905 dest = &tsk->peer; 937 if (!syn || dest->family != AF_TIPC)
906 else
907 return -EDESTADDRREQ; 938 return -EDESTADDRREQ;
908 } else if (unlikely(m->msg_namelen < sizeof(*dest)) ||
909 dest->family != AF_TIPC) {
910 return -EINVAL;
911 } 939 }
912 if (!is_connectionless) { 940
941 if (unlikely(m->msg_namelen < sizeof(*dest)))
942 return -EINVAL;
943
944 if (unlikely(dest->family != AF_TIPC))
945 return -EINVAL;
946
947 if (unlikely(syn)) {
913 if (sk->sk_state == TIPC_LISTEN) 948 if (sk->sk_state == TIPC_LISTEN)
914 return -EPIPE; 949 return -EPIPE;
915 if (sk->sk_state != TIPC_OPEN) 950 if (sk->sk_state != TIPC_OPEN)
@@ -921,102 +956,62 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
921 tsk->conn_instance = dest->addr.name.name.instance; 956 tsk->conn_instance = dest->addr.name.name.instance;
922 } 957 }
923 } 958 }
924 seq = &dest->addr.nameseq;
925 timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
926 959
927 if (dest->addrtype == TIPC_ADDR_MCAST) { 960 seq = &dest->addr.nameseq;
928 return tipc_sendmcast(sock, seq, m, dsz, timeo); 961 if (dest->addrtype == TIPC_ADDR_MCAST)
929 } else if (dest->addrtype == TIPC_ADDR_NAME) { 962 return tipc_sendmcast(sock, seq, m, dlen, timeout);
930 u32 type = dest->addr.name.name.type;
931 u32 inst = dest->addr.name.name.instance;
932 u32 domain = dest->addr.name.domain;
933 963
964 if (dest->addrtype == TIPC_ADDR_NAME) {
965 type = dest->addr.name.name.type;
966 inst = dest->addr.name.name.instance;
967 domain = dest->addr.name.domain;
934 dnode = domain; 968 dnode = domain;
935 msg_set_type(mhdr, TIPC_NAMED_MSG); 969 msg_set_type(hdr, TIPC_NAMED_MSG);
936 msg_set_hdr_sz(mhdr, NAMED_H_SIZE); 970 msg_set_hdr_sz(hdr, NAMED_H_SIZE);
937 msg_set_nametype(mhdr, type); 971 msg_set_nametype(hdr, type);
938 msg_set_nameinst(mhdr, inst); 972 msg_set_nameinst(hdr, inst);
939 msg_set_lookup_scope(mhdr, tipc_addr_scope(domain)); 973 msg_set_lookup_scope(hdr, tipc_addr_scope(domain));
940 dport = tipc_nametbl_translate(net, type, inst, &dnode); 974 dport = tipc_nametbl_translate(net, type, inst, &dnode);
941 msg_set_destnode(mhdr, dnode); 975 msg_set_destnode(hdr, dnode);
942 msg_set_destport(mhdr, dport); 976 msg_set_destport(hdr, dport);
943 if (unlikely(!dport && !dnode)) 977 if (unlikely(!dport && !dnode))
944 return -EHOSTUNREACH; 978 return -EHOSTUNREACH;
979
945 } else if (dest->addrtype == TIPC_ADDR_ID) { 980 } else if (dest->addrtype == TIPC_ADDR_ID) {
946 dnode = dest->addr.id.node; 981 dnode = dest->addr.id.node;
947 msg_set_type(mhdr, TIPC_DIRECT_MSG); 982 msg_set_type(hdr, TIPC_DIRECT_MSG);
948 msg_set_lookup_scope(mhdr, 0); 983 msg_set_lookup_scope(hdr, 0);
949 msg_set_destnode(mhdr, dnode); 984 msg_set_destnode(hdr, dnode);
950 msg_set_destport(mhdr, dest->addr.id.ref); 985 msg_set_destport(hdr, dest->addr.id.ref);
951 msg_set_hdr_sz(mhdr, BASIC_H_SIZE); 986 msg_set_hdr_sz(hdr, BASIC_H_SIZE);
952 } 987 }
953 988
954 skb_queue_head_init(&pktchain); 989 /* Block or return if destination link is congested */
955 save = m->msg_iter; 990 rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode));
956new_mtu: 991 if (unlikely(rc))
957 mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
958 rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain);
959 if (rc < 0)
960 return rc; 992 return rc;
961 993
962 do { 994 skb_queue_head_init(&pkts);
963 skb = skb_peek(&pktchain); 995 mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
964 TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; 996 rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
965 rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid); 997 if (unlikely(rc != dlen))
966 if (likely(!rc)) { 998 return rc;
967 if (!is_connectionless)
968 tipc_set_sk_state(sk, TIPC_CONNECTING);
969 return dsz;
970 }
971 if (rc == -ELINKCONG) {
972 tsk->link_cong = 1;
973 rc = tipc_wait_for_sndmsg(sock, &timeo);
974 if (!rc)
975 continue;
976 }
977 __skb_queue_purge(&pktchain);
978 if (rc == -EMSGSIZE) {
979 m->msg_iter = save;
980 goto new_mtu;
981 }
982 break;
983 } while (1);
984
985 return rc;
986}
987 999
988static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) 1000 rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
989{ 1001 if (unlikely(rc == -ELINKCONG)) {
990 DEFINE_WAIT_FUNC(wait, woken_wake_function); 1002 u32_push(clinks, dnode);
991 struct sock *sk = sock->sk; 1003 tsk->cong_link_cnt++;
992 struct tipc_sock *tsk = tipc_sk(sk); 1004 rc = 0;
993 int done; 1005 }
994 1006
995 do { 1007 if (unlikely(syn && !rc))
996 int err = sock_error(sk); 1008 tipc_set_sk_state(sk, TIPC_CONNECTING);
997 if (err)
998 return err;
999 if (sk->sk_state == TIPC_DISCONNECTING)
1000 return -EPIPE;
1001 else if (!tipc_sk_connected(sk))
1002 return -ENOTCONN;
1003 if (!*timeo_p)
1004 return -EAGAIN;
1005 if (signal_pending(current))
1006 return sock_intr_errno(*timeo_p);
1007 1009
1008 add_wait_queue(sk_sleep(sk), &wait); 1010 return rc ? rc : dlen;
1009 done = sk_wait_event(sk, timeo_p,
1010 (!tsk->link_cong &&
1011 !tsk_conn_cong(tsk)) ||
1012 !tipc_sk_connected(sk), &wait);
1013 remove_wait_queue(sk_sleep(sk), &wait);
1014 } while (!done);
1015 return 0;
1016} 1011}
1017 1012
1018/** 1013/**
1019 * tipc_send_stream - send stream-oriented data 1014 * tipc_sendstream - send stream-oriented data
1020 * @sock: socket structure 1015 * @sock: socket structure
1021 * @m: data to send 1016 * @m: data to send
1022 * @dsz: total length of data to be transmitted 1017 * @dsz: total length of data to be transmitted
@@ -1026,94 +1021,69 @@ static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p)
1026 * Returns the number of bytes sent on success (or partial success), 1021 * Returns the number of bytes sent on success (or partial success),
1027 * or errno if no data sent 1022 * or errno if no data sent
1028 */ 1023 */
1029static int tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) 1024static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz)
1030{ 1025{
1031 struct sock *sk = sock->sk; 1026 struct sock *sk = sock->sk;
1032 int ret; 1027 int ret;
1033 1028
1034 lock_sock(sk); 1029 lock_sock(sk);
1035 ret = __tipc_send_stream(sock, m, dsz); 1030 ret = __tipc_sendstream(sock, m, dsz);
1036 release_sock(sk); 1031 release_sock(sk);
1037 1032
1038 return ret; 1033 return ret;
1039} 1034}
1040 1035
1041static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) 1036static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
1042{ 1037{
1043 struct sock *sk = sock->sk; 1038 struct sock *sk = sock->sk;
1044 struct net *net = sock_net(sk);
1045 struct tipc_sock *tsk = tipc_sk(sk);
1046 struct tipc_msg *mhdr = &tsk->phdr;
1047 struct sk_buff_head pktchain;
1048 DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); 1039 DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
1049 u32 portid = tsk->portid; 1040 long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
1050 int rc = -EINVAL; 1041 struct tipc_sock *tsk = tipc_sk(sk);
1051 long timeo; 1042 struct tipc_msg *hdr = &tsk->phdr;
1052 u32 dnode; 1043 struct net *net = sock_net(sk);
1053 uint mtu, send, sent = 0; 1044 struct sk_buff_head pkts;
1054 struct iov_iter save; 1045 u32 dnode = tsk_peer_node(tsk);
1055 int hlen = MIN_H_SIZE; 1046 int send, sent = 0;
1056 1047 int rc = 0;
1057 /* Handle implied connection establishment */
1058 if (unlikely(dest)) {
1059 rc = __tipc_sendmsg(sock, m, dsz);
1060 hlen = msg_hdr_sz(mhdr);
1061 if (dsz && (dsz == rc))
1062 tsk->snt_unacked = tsk_inc(tsk, dsz + hlen);
1063 return rc;
1064 }
1065 if (dsz > (uint)INT_MAX)
1066 return -EMSGSIZE;
1067
1068 if (unlikely(!tipc_sk_connected(sk))) {
1069 if (sk->sk_state == TIPC_DISCONNECTING)
1070 return -EPIPE;
1071 else
1072 return -ENOTCONN;
1073 }
1074 1048
1075 timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); 1049 skb_queue_head_init(&pkts);
1076 if (!timeo && tsk->link_cong)
1077 return -ELINKCONG;
1078 1050
1079 dnode = tsk_peer_node(tsk); 1051 if (unlikely(dlen > INT_MAX))
1080 skb_queue_head_init(&pktchain); 1052 return -EMSGSIZE;
1081 1053
1082next: 1054 /* Handle implicit connection setup */
1083 save = m->msg_iter; 1055 if (unlikely(dest)) {
1084 mtu = tsk->max_pkt; 1056 rc = __tipc_sendmsg(sock, m, dlen);
1085 send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); 1057 if (dlen && (dlen == rc))
1086 rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain); 1058 tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr));
1087 if (unlikely(rc < 0))
1088 return rc; 1059 return rc;
1060 }
1089 1061
1090 do { 1062 do {
1091 if (likely(!tsk_conn_cong(tsk))) { 1063 rc = tipc_wait_for_cond(sock, &timeout,
1092 rc = tipc_node_xmit(net, &pktchain, dnode, portid); 1064 (!tsk->cong_link_cnt &&
1093 if (likely(!rc)) { 1065 !tsk_conn_cong(tsk) &&
1094 tsk->snt_unacked += tsk_inc(tsk, send + hlen); 1066 tipc_sk_connected(sk)));
1095 sent += send; 1067 if (unlikely(rc))
1096 if (sent == dsz) 1068 break;
1097 return dsz;
1098 goto next;
1099 }
1100 if (rc == -EMSGSIZE) {
1101 __skb_queue_purge(&pktchain);
1102 tsk->max_pkt = tipc_node_get_mtu(net, dnode,
1103 portid);
1104 m->msg_iter = save;
1105 goto next;
1106 }
1107 if (rc != -ELINKCONG)
1108 break;
1109 1069
1110 tsk->link_cong = 1; 1070 send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
1071 rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts);
1072 if (unlikely(rc != send))
1073 break;
1074
1075 rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
1076 if (unlikely(rc == -ELINKCONG)) {
1077 tsk->cong_link_cnt = 1;
1078 rc = 0;
1111 } 1079 }
1112 rc = tipc_wait_for_sndpkt(sock, &timeo); 1080 if (likely(!rc)) {
1113 } while (!rc); 1081 tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE);
1082 sent += send;
1083 }
1084 } while (sent < dlen && !rc);
1114 1085
1115 __skb_queue_purge(&pktchain); 1086 return rc ? rc : sent;
1116 return sent ? sent : rc;
1117} 1087}
1118 1088
1119/** 1089/**
@@ -1131,7 +1101,7 @@ static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz)
1131 if (dsz > TIPC_MAX_USER_MSG_SIZE) 1101 if (dsz > TIPC_MAX_USER_MSG_SIZE)
1132 return -EMSGSIZE; 1102 return -EMSGSIZE;
1133 1103
1134 return tipc_send_stream(sock, m, dsz); 1104 return tipc_sendstream(sock, m, dsz);
1135} 1105}
1136 1106
1137/* tipc_sk_finish_conn - complete the setup of a connection 1107/* tipc_sk_finish_conn - complete the setup of a connection
@@ -1698,6 +1668,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
1698 unsigned int limit = rcvbuf_limit(sk, skb); 1668 unsigned int limit = rcvbuf_limit(sk, skb);
1699 int err = TIPC_OK; 1669 int err = TIPC_OK;
1700 int usr = msg_user(hdr); 1670 int usr = msg_user(hdr);
1671 u32 onode;
1701 1672
1702 if (unlikely(msg_user(hdr) == CONN_MANAGER)) { 1673 if (unlikely(msg_user(hdr) == CONN_MANAGER)) {
1703 tipc_sk_proto_rcv(tsk, skb, xmitq); 1674 tipc_sk_proto_rcv(tsk, skb, xmitq);
@@ -1705,8 +1676,10 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
1705 } 1676 }
1706 1677
1707 if (unlikely(usr == SOCK_WAKEUP)) { 1678 if (unlikely(usr == SOCK_WAKEUP)) {
1679 onode = msg_orignode(hdr);
1708 kfree_skb(skb); 1680 kfree_skb(skb);
1709 tsk->link_cong = 0; 1681 u32_del(&tsk->cong_links, onode);
1682 tsk->cong_link_cnt--;
1710 sk->sk_write_space(sk); 1683 sk->sk_write_space(sk);
1711 return false; 1684 return false;
1712 } 1685 }
@@ -2057,7 +2030,8 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo)
2057 * 2030 *
2058 * Returns 0 on success, errno otherwise 2031 * Returns 0 on success, errno otherwise
2059 */ 2032 */
2060static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) 2033static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
2034 bool kern)
2061{ 2035{
2062 struct sock *new_sk, *sk = sock->sk; 2036 struct sock *new_sk, *sk = sock->sk;
2063 struct sk_buff *buf; 2037 struct sk_buff *buf;
@@ -2079,7 +2053,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
2079 2053
2080 buf = skb_peek(&sk->sk_receive_queue); 2054 buf = skb_peek(&sk->sk_receive_queue);
2081 2055
2082 res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, 0); 2056 res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, kern);
2083 if (res) 2057 if (res)
2084 goto exit; 2058 goto exit;
2085 security_sk_clone(sock->sk, new_sock->sk); 2059 security_sk_clone(sock->sk, new_sock->sk);
@@ -2114,7 +2088,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
2114 struct msghdr m = {NULL,}; 2088 struct msghdr m = {NULL,};
2115 2089
2116 tsk_advance_rx_queue(sk); 2090 tsk_advance_rx_queue(sk);
2117 __tipc_send_stream(new_sock, &m, 0); 2091 __tipc_sendstream(new_sock, &m, 0);
2118 } else { 2092 } else {
2119 __skb_dequeue(&sk->sk_receive_queue); 2093 __skb_dequeue(&sk->sk_receive_queue);
2120 __skb_queue_head(&new_sk->sk_receive_queue, buf); 2094 __skb_queue_head(&new_sk->sk_receive_queue, buf);
@@ -2269,24 +2243,27 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
2269void tipc_sk_reinit(struct net *net) 2243void tipc_sk_reinit(struct net *net)
2270{ 2244{
2271 struct tipc_net *tn = net_generic(net, tipc_net_id); 2245 struct tipc_net *tn = net_generic(net, tipc_net_id);
2272 const struct bucket_table *tbl; 2246 struct rhashtable_iter iter;
2273 struct rhash_head *pos;
2274 struct tipc_sock *tsk; 2247 struct tipc_sock *tsk;
2275 struct tipc_msg *msg; 2248 struct tipc_msg *msg;
2276 int i;
2277 2249
2278 rcu_read_lock(); 2250 rhashtable_walk_enter(&tn->sk_rht, &iter);
2279 tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht); 2251
2280 for (i = 0; i < tbl->size; i++) { 2252 do {
2281 rht_for_each_entry_rcu(tsk, pos, tbl, i, node) { 2253 tsk = ERR_PTR(rhashtable_walk_start(&iter));
2254 if (tsk)
2255 continue;
2256
2257 while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
2282 spin_lock_bh(&tsk->sk.sk_lock.slock); 2258 spin_lock_bh(&tsk->sk.sk_lock.slock);
2283 msg = &tsk->phdr; 2259 msg = &tsk->phdr;
2284 msg_set_prevnode(msg, tn->own_addr); 2260 msg_set_prevnode(msg, tn->own_addr);
2285 msg_set_orignode(msg, tn->own_addr); 2261 msg_set_orignode(msg, tn->own_addr);
2286 spin_unlock_bh(&tsk->sk.sk_lock.slock); 2262 spin_unlock_bh(&tsk->sk.sk_lock.slock);
2287 } 2263 }
2288 } 2264
2289 rcu_read_unlock(); 2265 rhashtable_walk_stop(&iter);
2266 } while (tsk == ERR_PTR(-EAGAIN));
2290} 2267}
2291 2268
2292static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) 2269static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)
@@ -2382,18 +2359,29 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
2382{ 2359{
2383 struct sock *sk = sock->sk; 2360 struct sock *sk = sock->sk;
2384 struct tipc_sock *tsk = tipc_sk(sk); 2361 struct tipc_sock *tsk = tipc_sk(sk);
2385 u32 value; 2362 u32 value = 0;
2386 int res; 2363 int res = 0;
2387 2364
2388 if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM)) 2365 if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM))
2389 return 0; 2366 return 0;
2390 if (lvl != SOL_TIPC) 2367 if (lvl != SOL_TIPC)
2391 return -ENOPROTOOPT; 2368 return -ENOPROTOOPT;
2392 if (ol < sizeof(value)) 2369
2393 return -EINVAL; 2370 switch (opt) {
2394 res = get_user(value, (u32 __user *)ov); 2371 case TIPC_IMPORTANCE:
2395 if (res) 2372 case TIPC_SRC_DROPPABLE:
2396 return res; 2373 case TIPC_DEST_DROPPABLE:
2374 case TIPC_CONN_TIMEOUT:
2375 if (ol < sizeof(value))
2376 return -EINVAL;
2377 res = get_user(value, (u32 __user *)ov);
2378 if (res)
2379 return res;
2380 break;
2381 default:
2382 if (ov || ol)
2383 return -EINVAL;
2384 }
2397 2385
2398 lock_sock(sk); 2386 lock_sock(sk);
2399 2387
@@ -2412,7 +2400,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
2412 break; 2400 break;
2413 case TIPC_CONN_TIMEOUT: 2401 case TIPC_CONN_TIMEOUT:
2414 tipc_sk(sk)->conn_timeout = value; 2402 tipc_sk(sk)->conn_timeout = value;
2415 /* no need to set "res", since already 0 at this point */ 2403 break;
2404 case TIPC_MCAST_BROADCAST:
2405 tsk->mc_method.rcast = false;
2406 tsk->mc_method.mandatory = true;
2407 break;
2408 case TIPC_MCAST_REPLICAST:
2409 tsk->mc_method.rcast = true;
2410 tsk->mc_method.mandatory = true;
2416 break; 2411 break;
2417 default: 2412 default:
2418 res = -EINVAL; 2413 res = -EINVAL;
@@ -2575,7 +2570,7 @@ static const struct proto_ops stream_ops = {
2575 .shutdown = tipc_shutdown, 2570 .shutdown = tipc_shutdown,
2576 .setsockopt = tipc_setsockopt, 2571 .setsockopt = tipc_setsockopt,
2577 .getsockopt = tipc_getsockopt, 2572 .getsockopt = tipc_getsockopt,
2578 .sendmsg = tipc_send_stream, 2573 .sendmsg = tipc_sendstream,
2579 .recvmsg = tipc_recv_stream, 2574 .recvmsg = tipc_recv_stream,
2580 .mmap = sock_no_mmap, 2575 .mmap = sock_no_mmap,
2581 .sendpage = sock_no_sendpage 2576 .sendpage = sock_no_sendpage
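
The two new multicast method options added to tipc_setsockopt() above take no
argument at all; with the reworked option-length handling, optval/optlen must
be NULL/0 for them. A minimal userspace sketch, assuming uapi headers new
enough to define TIPC_MCAST_REPLICAST (SOL_TIPC is guarded below in case the
libc headers do not provide it):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/tipc.h>

#ifndef SOL_TIPC
#define SOL_TIPC 271
#endif

int main(void)
{
	int sd = socket(AF_TIPC, SOCK_RDM, 0);

	if (sd < 0) {
		perror("socket(AF_TIPC)");
		return 1;
	}
	/* Force replicast (one unicast copy per destination) on this socket;
	 * the option carries no value, hence the NULL/0 arguments.
	 */
	if (setsockopt(sd, SOL_TIPC, TIPC_MCAST_REPLICAST, NULL, 0) < 0)
		perror("setsockopt(TIPC_MCAST_REPLICAST)");
	return 0;
}
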
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 9d94e65d0894..271cd66e4b3b 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -141,6 +141,11 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
141static void tipc_subscrp_timeout(unsigned long data) 141static void tipc_subscrp_timeout(unsigned long data)
142{ 142{
143 struct tipc_subscription *sub = (struct tipc_subscription *)data; 143 struct tipc_subscription *sub = (struct tipc_subscription *)data;
144 struct tipc_subscriber *subscriber = sub->subscriber;
145
146 spin_lock_bh(&subscriber->lock);
147 tipc_nametbl_unsubscribe(sub);
148 spin_unlock_bh(&subscriber->lock);
144 149
145 /* Notify subscriber of timeout */ 150 /* Notify subscriber of timeout */
146 tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, 151 tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper,
@@ -173,7 +178,6 @@ static void tipc_subscrp_kref_release(struct kref *kref)
173 struct tipc_subscriber *subscriber = sub->subscriber; 178 struct tipc_subscriber *subscriber = sub->subscriber;
174 179
175 spin_lock_bh(&subscriber->lock); 180 spin_lock_bh(&subscriber->lock);
176 tipc_nametbl_unsubscribe(sub);
177 list_del(&sub->subscrp_list); 181 list_del(&sub->subscrp_list);
178 atomic_dec(&tn->subscription_count); 182 atomic_dec(&tn->subscription_count);
179 spin_unlock_bh(&subscriber->lock); 183 spin_unlock_bh(&subscriber->lock);
@@ -205,6 +209,7 @@ static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber,
205 if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) 209 if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr)))
206 continue; 210 continue;
207 211
212 tipc_nametbl_unsubscribe(sub);
208 tipc_subscrp_get(sub); 213 tipc_subscrp_get(sub);
209 spin_unlock_bh(&subscriber->lock); 214 spin_unlock_bh(&subscriber->lock);
210 tipc_subscrp_delete(sub); 215 tipc_subscrp_delete(sub);
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index b58dc95f3d35..46061cf48cd1 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -113,7 +113,7 @@ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
113 memcpy(addr->value, ua, sizeof(struct udp_media_addr)); 113 memcpy(addr->value, ua, sizeof(struct udp_media_addr));
114 114
115 if (tipc_udp_is_mcast_addr(ua)) 115 if (tipc_udp_is_mcast_addr(ua))
116 addr->broadcast = 1; 116 addr->broadcast = TIPC_BROADCAST_SUPPORT;
117} 117}
118 118
119/* tipc_udp_addr2str - convert ip/udp address to string */ 119/* tipc_udp_addr2str - convert ip/udp address to string */
@@ -229,7 +229,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
229 goto out; 229 goto out;
230 } 230 }
231 231
232 if (!addr->broadcast || list_empty(&ub->rcast.list)) 232 if (addr->broadcast != TIPC_REPLICAST_SUPPORT)
233 return tipc_udp_xmit(net, skb, ub, src, dst); 233 return tipc_udp_xmit(net, skb, ub, src, dst);
234 234
235 /* Replicast, send an skb to each configured IP address */ 235 /* Replicast, send an skb to each configured IP address */
@@ -296,7 +296,7 @@ static int tipc_udp_rcast_add(struct tipc_bearer *b,
296 else if (ntohs(addr->proto) == ETH_P_IPV6) 296 else if (ntohs(addr->proto) == ETH_P_IPV6)
297 pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6); 297 pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6);
298#endif 298#endif
299 299 b->bcast_addr.broadcast = TIPC_REPLICAST_SUPPORT;
300 list_add_rcu(&rcast->list, &ub->rcast.list); 300 list_add_rcu(&rcast->list, &ub->rcast.list);
301 return 0; 301 return 0;
302} 302}
@@ -681,7 +681,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
681 goto err; 681 goto err;
682 682
683 b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP; 683 b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP;
684 b->bcast_addr.broadcast = 1; 684 b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
685 rcu_assign_pointer(b->media_ptr, ub); 685 rcu_assign_pointer(b->media_ptr, ub);
686 rcu_assign_pointer(ub->bearer, b); 686 rcu_assign_pointer(ub->bearer, b);
687 tipc_udp_media_addr_set(&b->addr, &local); 687 tipc_udp_media_addr_set(&b->addr, &local);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index cef79873b09d..928691c43408 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -85,7 +85,7 @@
85#include <linux/module.h> 85#include <linux/module.h>
86#include <linux/kernel.h> 86#include <linux/kernel.h>
87#include <linux/signal.h> 87#include <linux/signal.h>
88#include <linux/sched.h> 88#include <linux/sched/signal.h>
89#include <linux/errno.h> 89#include <linux/errno.h>
90#include <linux/string.h> 90#include <linux/string.h>
91#include <linux/stat.h> 91#include <linux/stat.h>
@@ -117,6 +117,7 @@
117#include <net/checksum.h> 117#include <net/checksum.h>
118#include <linux/security.h> 118#include <linux/security.h>
119#include <linux/freezer.h> 119#include <linux/freezer.h>
120#include <linux/file.h>
120 121
121struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; 122struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
122EXPORT_SYMBOL_GPL(unix_socket_table); 123EXPORT_SYMBOL_GPL(unix_socket_table);
@@ -635,7 +636,7 @@ static int unix_bind(struct socket *, struct sockaddr *, int);
635static int unix_stream_connect(struct socket *, struct sockaddr *, 636static int unix_stream_connect(struct socket *, struct sockaddr *,
636 int addr_len, int flags); 637 int addr_len, int flags);
637static int unix_socketpair(struct socket *, struct socket *); 638static int unix_socketpair(struct socket *, struct socket *);
638static int unix_accept(struct socket *, struct socket *, int); 639static int unix_accept(struct socket *, struct socket *, int, bool);
639static int unix_getname(struct socket *, struct sockaddr *, int *, int); 640static int unix_getname(struct socket *, struct sockaddr *, int *, int);
640static unsigned int unix_poll(struct file *, struct socket *, poll_table *); 641static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
641static unsigned int unix_dgram_poll(struct file *, struct socket *, 642static unsigned int unix_dgram_poll(struct file *, struct socket *,
@@ -1401,7 +1402,8 @@ static void unix_sock_inherit_flags(const struct socket *old,
1401 set_bit(SOCK_PASSSEC, &new->flags); 1402 set_bit(SOCK_PASSSEC, &new->flags);
1402} 1403}
1403 1404
1404static int unix_accept(struct socket *sock, struct socket *newsock, int flags) 1405static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1406 bool kern)
1405{ 1407{
1406 struct sock *sk = sock->sk; 1408 struct sock *sk = sock->sk;
1407 struct sock *tsk; 1409 struct sock *tsk;
@@ -2592,6 +2594,43 @@ long unix_outq_len(struct sock *sk)
2592} 2594}
2593EXPORT_SYMBOL_GPL(unix_outq_len); 2595EXPORT_SYMBOL_GPL(unix_outq_len);
2594 2596
2597static int unix_open_file(struct sock *sk)
2598{
2599 struct path path;
2600 struct file *f;
2601 int fd;
2602
2603 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2604 return -EPERM;
2605
2606 unix_state_lock(sk);
2607 path = unix_sk(sk)->path;
2608 if (!path.dentry) {
2609 unix_state_unlock(sk);
2610 return -ENOENT;
2611 }
2612
2613 path_get(&path);
2614 unix_state_unlock(sk);
2615
2616 fd = get_unused_fd_flags(O_CLOEXEC);
2617 if (fd < 0)
2618 goto out;
2619
2620 f = dentry_open(&path, O_PATH, current_cred());
2621 if (IS_ERR(f)) {
2622 put_unused_fd(fd);
2623 fd = PTR_ERR(f);
2624 goto out;
2625 }
2626
2627 fd_install(fd, f);
2628out:
2629 path_put(&path);
2630
2631 return fd;
2632}
2633
2595static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2634static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2596{ 2635{
2597 struct sock *sk = sock->sk; 2636 struct sock *sk = sock->sk;
@@ -2610,6 +2649,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2610 else 2649 else
2611 err = put_user(amount, (int __user *)arg); 2650 err = put_user(amount, (int __user *)arg);
2612 break; 2651 break;
2652 case SIOCUNIXFILE:
2653 err = unix_open_file(sk);
2654 break;
2613 default: 2655 default:
2614 err = -ENOIOCTLCMD; 2656 err = -ENOIOCTLCMD;
2615 break; 2657 break;
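
The SIOCUNIXFILE ioctl wired up above hands back a new descriptor opened
O_PATH on the filesystem node of a bound AF_UNIX socket; unix_open_file()
requires CAP_NET_ADMIN in the socket's network namespace. A minimal sketch;
the SIOCUNIXFILE value is derived from SIOCPROTOPRIVATE here in case the
installed uapi headers predate this change, and the socket path is only an
example:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <linux/sockios.h>

#ifndef SIOCUNIXFILE
#define SIOCUNIXFILE (SIOCPROTOPRIVATE + 0)
#endif

int main(void)
{
	struct sockaddr_un sun = { .sun_family = AF_UNIX };
	int sd, pathfd;

	sd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (sd < 0) {
		perror("socket(AF_UNIX)");
		return 1;
	}
	strcpy(sun.sun_path, "/tmp/siocunixfile.sock");
	unlink(sun.sun_path);
	if (bind(sd, (struct sockaddr *)&sun, sizeof(sun)) < 0) {
		perror("bind");
		return 1;
	}
	/* Returns an O_PATH fd on success, -1 with errno (e.g. EPERM) otherwise */
	pathfd = ioctl(sd, SIOCUNIXFILE);
	if (pathfd < 0)
		perror("ioctl(SIOCUNIXFILE)");
	else
		printf("O_PATH fd %d refers to %s\n", pathfd, sun.sun_path);
	return 0;
}
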
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 6a0d48525fcf..c36757e72844 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -146,6 +146,7 @@ void unix_notinflight(struct user_struct *user, struct file *fp)
146 if (s) { 146 if (s) {
147 struct unix_sock *u = unix_sk(s); 147 struct unix_sock *u = unix_sk(s);
148 148
149 BUG_ON(!atomic_long_read(&u->inflight));
149 BUG_ON(list_empty(&u->link)); 150 BUG_ON(list_empty(&u->link));
150 151
151 if (atomic_long_dec_and_test(&u->inflight)) 152 if (atomic_long_dec_and_test(&u->inflight))
@@ -341,6 +342,14 @@ void unix_gc(void)
341 } 342 }
342 list_del(&cursor); 343 list_del(&cursor);
343 344
345 /* Now gc_candidates contains only garbage. Restore original
346 * inflight counters for these as well, and remove the skbuffs
347 * which are creating the cycle(s).
348 */
349 skb_queue_head_init(&hitlist);
350 list_for_each_entry(u, &gc_candidates, link)
351 scan_children(&u->sk, inc_inflight, &hitlist);
352
344 /* not_cycle_list contains those sockets which do not make up a 353 /* not_cycle_list contains those sockets which do not make up a
345 * cycle. Restore these to the inflight list. 354 * cycle. Restore these to the inflight list.
346 */ 355 */
@@ -350,14 +359,6 @@ void unix_gc(void)
350 list_move_tail(&u->link, &gc_inflight_list); 359 list_move_tail(&u->link, &gc_inflight_list);
351 } 360 }
352 361
353 /* Now gc_candidates contains only garbage. Restore original
354 * inflight counters for these as well, and remove the skbuffs
355 * which are creating the cycle(s).
356 */
357 skb_queue_head_init(&hitlist);
358 list_for_each_entry(u, &gc_candidates, link)
359 scan_children(&u->sk, inc_inflight, &hitlist);
360
361 spin_unlock(&unix_gc_lock); 362 spin_unlock(&unix_gc_lock);
362 363
363 /* Here we are. Hitlist is filled. Die. */ 364 /* Here we are. Hitlist is filled. Die. */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 8a398b3fb532..6f7f6757ceef 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -90,6 +90,7 @@
90#include <linux/init.h> 90#include <linux/init.h>
91#include <linux/io.h> 91#include <linux/io.h>
92#include <linux/kernel.h> 92#include <linux/kernel.h>
93#include <linux/sched/signal.h>
93#include <linux/kmod.h> 94#include <linux/kmod.h>
94#include <linux/list.h> 95#include <linux/list.h>
95#include <linux/miscdevice.h> 96#include <linux/miscdevice.h>
@@ -1101,10 +1102,19 @@ static const struct proto_ops vsock_dgram_ops = {
1101 .sendpage = sock_no_sendpage, 1102 .sendpage = sock_no_sendpage,
1102}; 1103};
1103 1104
1105static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
1106{
1107 if (!transport->cancel_pkt)
1108 return -EOPNOTSUPP;
1109
1110 return transport->cancel_pkt(vsk);
1111}
1112
1104static void vsock_connect_timeout(struct work_struct *work) 1113static void vsock_connect_timeout(struct work_struct *work)
1105{ 1114{
1106 struct sock *sk; 1115 struct sock *sk;
1107 struct vsock_sock *vsk; 1116 struct vsock_sock *vsk;
1117 int cancel = 0;
1108 1118
1109 vsk = container_of(work, struct vsock_sock, dwork.work); 1119 vsk = container_of(work, struct vsock_sock, dwork.work);
1110 sk = sk_vsock(vsk); 1120 sk = sk_vsock(vsk);
@@ -1115,8 +1125,11 @@ static void vsock_connect_timeout(struct work_struct *work)
1115 sk->sk_state = SS_UNCONNECTED; 1125 sk->sk_state = SS_UNCONNECTED;
1116 sk->sk_err = ETIMEDOUT; 1126 sk->sk_err = ETIMEDOUT;
1117 sk->sk_error_report(sk); 1127 sk->sk_error_report(sk);
1128 cancel = 1;
1118 } 1129 }
1119 release_sock(sk); 1130 release_sock(sk);
1131 if (cancel)
1132 vsock_transport_cancel_pkt(vsk);
1120 1133
1121 sock_put(sk); 1134 sock_put(sk);
1122} 1135}
@@ -1223,11 +1236,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
1223 err = sock_intr_errno(timeout); 1236 err = sock_intr_errno(timeout);
1224 sk->sk_state = SS_UNCONNECTED; 1237 sk->sk_state = SS_UNCONNECTED;
1225 sock->state = SS_UNCONNECTED; 1238 sock->state = SS_UNCONNECTED;
1239 vsock_transport_cancel_pkt(vsk);
1226 goto out_wait; 1240 goto out_wait;
1227 } else if (timeout == 0) { 1241 } else if (timeout == 0) {
1228 err = -ETIMEDOUT; 1242 err = -ETIMEDOUT;
1229 sk->sk_state = SS_UNCONNECTED; 1243 sk->sk_state = SS_UNCONNECTED;
1230 sock->state = SS_UNCONNECTED; 1244 sock->state = SS_UNCONNECTED;
1245 vsock_transport_cancel_pkt(vsk);
1231 goto out_wait; 1246 goto out_wait;
1232 } 1247 }
1233 1248
@@ -1249,7 +1264,8 @@ out:
1249 return err; 1264 return err;
1250} 1265}
1251 1266
1252static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) 1267static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
1268 bool kern)
1253{ 1269{
1254 struct sock *listener; 1270 struct sock *listener;
1255 int err; 1271 int err;
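
The connect-timeout paths touched above (vsock_connect_timeout() and the
interrupted/ETIMEDOUT branches of vsock_stream_connect()) are the ones armed
by SO_VM_SOCKETS_CONNECT_TIMEOUT; the new vsock_transport_cancel_pkt() call
makes sure the queued connection request is dropped once that timer expires.
A minimal userspace sketch, assuming an AF_VSOCK-capable kernel and a peer
CID/port that simply never answers (both values below are placeholders):

#include <stdio.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <linux/vm_sockets.h>

int main(void)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
	struct sockaddr_vm svm = {
		.svm_family = AF_VSOCK,
		.svm_cid    = 42,	/* placeholder: an unresponsive peer CID */
		.svm_port   = 1234,	/* placeholder port */
	};
	int sd = socket(AF_VSOCK, SOCK_STREAM, 0);

	if (sd < 0) {
		perror("socket(AF_VSOCK)");
		return 1;
	}
	/* Shorten the connect timeout from the default to two seconds */
	setsockopt(sd, AF_VSOCK, SO_VM_SOCKETS_CONNECT_TIMEOUT, &tv, sizeof(tv));
	if (connect(sd, (struct sockaddr *)&svm, sizeof(svm)) < 0)
		perror("connect");	/* ETIMEDOUT once the timer fires */
	return 0;
}
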
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 6788264acc63..68675a151f22 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -213,6 +213,47 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
213 return len; 213 return len;
214} 214}
215 215
216static int
217virtio_transport_cancel_pkt(struct vsock_sock *vsk)
218{
219 struct virtio_vsock *vsock;
220 struct virtio_vsock_pkt *pkt, *n;
221 int cnt = 0;
222 LIST_HEAD(freeme);
223
224 vsock = virtio_vsock_get();
225 if (!vsock) {
226 return -ENODEV;
227 }
228
229 spin_lock_bh(&vsock->send_pkt_list_lock);
230 list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
231 if (pkt->vsk != vsk)
232 continue;
233 list_move(&pkt->list, &freeme);
234 }
235 spin_unlock_bh(&vsock->send_pkt_list_lock);
236
237 list_for_each_entry_safe(pkt, n, &freeme, list) {
238 if (pkt->reply)
239 cnt++;
240 list_del(&pkt->list);
241 virtio_transport_free_pkt(pkt);
242 }
243
244 if (cnt) {
245 struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
246 int new_cnt;
247
248 new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
249 if (new_cnt + cnt >= virtqueue_get_vring_size(rx_vq) &&
250 new_cnt < virtqueue_get_vring_size(rx_vq))
251 queue_work(virtio_vsock_workqueue, &vsock->rx_work);
252 }
253
254 return 0;
255}
256
216static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) 257static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
217{ 258{
218 int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; 259 int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
@@ -462,6 +503,7 @@ static struct virtio_transport virtio_transport = {
462 .release = virtio_transport_release, 503 .release = virtio_transport_release,
463 .connect = virtio_transport_connect, 504 .connect = virtio_transport_connect,
464 .shutdown = virtio_transport_shutdown, 505 .shutdown = virtio_transport_shutdown,
506 .cancel_pkt = virtio_transport_cancel_pkt,
465 507
466 .dgram_bind = virtio_transport_dgram_bind, 508 .dgram_bind = virtio_transport_dgram_bind,
467 .dgram_dequeue = virtio_transport_dgram_dequeue, 509 .dgram_dequeue = virtio_transport_dgram_dequeue,
@@ -532,7 +574,8 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
532 vsock->vdev = vdev; 574 vsock->vdev = vdev;
533 575
534 ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX, 576 ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX,
535 vsock->vqs, callbacks, names); 577 vsock->vqs, callbacks, names,
578 NULL);
536 if (ret < 0) 579 if (ret < 0)
537 goto out; 580 goto out;
538 581
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 849c4ad0411e..af087b44ceea 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -9,6 +9,7 @@
9 */ 9 */
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched/signal.h>
12#include <linux/ctype.h> 13#include <linux/ctype.h>
13#include <linux/list.h> 14#include <linux/list.h>
14#include <linux/virtio.h> 15#include <linux/virtio.h>
@@ -57,6 +58,7 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
57 pkt->len = len; 58 pkt->len = len;
58 pkt->hdr.len = cpu_to_le32(len); 59 pkt->hdr.len = cpu_to_le32(len);
59 pkt->reply = info->reply; 60 pkt->reply = info->reply;
61 pkt->vsk = info->vsk;
60 62
61 if (info->msg && len > 0) { 63 if (info->msg && len > 0) {
62 pkt->buf = kmalloc(len, GFP_KERNEL); 64 pkt->buf = kmalloc(len, GFP_KERNEL);
@@ -179,6 +181,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
179 struct virtio_vsock_pkt_info info = { 181 struct virtio_vsock_pkt_info info = {
180 .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, 182 .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
181 .type = type, 183 .type = type,
184 .vsk = vsk,
182 }; 185 };
183 186
184 return virtio_transport_send_pkt_info(vsk, &info); 187 return virtio_transport_send_pkt_info(vsk, &info);
@@ -518,6 +521,7 @@ int virtio_transport_connect(struct vsock_sock *vsk)
518 struct virtio_vsock_pkt_info info = { 521 struct virtio_vsock_pkt_info info = {
519 .op = VIRTIO_VSOCK_OP_REQUEST, 522 .op = VIRTIO_VSOCK_OP_REQUEST,
520 .type = VIRTIO_VSOCK_TYPE_STREAM, 523 .type = VIRTIO_VSOCK_TYPE_STREAM,
524 .vsk = vsk,
521 }; 525 };
522 526
523 return virtio_transport_send_pkt_info(vsk, &info); 527 return virtio_transport_send_pkt_info(vsk, &info);
@@ -533,6 +537,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
533 VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | 537 VIRTIO_VSOCK_SHUTDOWN_RCV : 0) |
534 (mode & SEND_SHUTDOWN ? 538 (mode & SEND_SHUTDOWN ?
535 VIRTIO_VSOCK_SHUTDOWN_SEND : 0), 539 VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
540 .vsk = vsk,
536 }; 541 };
537 542
538 return virtio_transport_send_pkt_info(vsk, &info); 543 return virtio_transport_send_pkt_info(vsk, &info);
@@ -559,6 +564,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
559 .type = VIRTIO_VSOCK_TYPE_STREAM, 564 .type = VIRTIO_VSOCK_TYPE_STREAM,
560 .msg = msg, 565 .msg = msg,
561 .pkt_len = len, 566 .pkt_len = len,
567 .vsk = vsk,
562 }; 568 };
563 569
564 return virtio_transport_send_pkt_info(vsk, &info); 570 return virtio_transport_send_pkt_info(vsk, &info);
@@ -580,6 +586,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
580 .op = VIRTIO_VSOCK_OP_RST, 586 .op = VIRTIO_VSOCK_OP_RST,
581 .type = VIRTIO_VSOCK_TYPE_STREAM, 587 .type = VIRTIO_VSOCK_TYPE_STREAM,
582 .reply = !!pkt, 588 .reply = !!pkt,
589 .vsk = vsk,
583 }; 590 };
584 591
585 /* Send RST only if the original pkt is not a RST pkt */ 592 /* Send RST only if the original pkt is not a RST pkt */
@@ -825,6 +832,7 @@ virtio_transport_send_response(struct vsock_sock *vsk,
825 .remote_cid = le64_to_cpu(pkt->hdr.src_cid), 832 .remote_cid = le64_to_cpu(pkt->hdr.src_cid),
826 .remote_port = le32_to_cpu(pkt->hdr.src_port), 833 .remote_port = le32_to_cpu(pkt->hdr.src_port),
827 .reply = true, 834 .reply = true,
835 .vsk = vsk,
828 }; 836 };
829 837
830 return virtio_transport_send_pkt_info(vsk, &info); 838 return virtio_transport_send_pkt_info(vsk, &info);
diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index 816c9331c8d2..d06e5015751a 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_WEXT_PRIV) += wext-priv.o
11 11
12cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o 12cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o
13cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o 13cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o
14cfg80211-$(CONFIG_OF) += of.o
14cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o 15cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o
15cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o 16cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o
16cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o 17cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 158c59ecf90a..e55e05bc4805 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -626,7 +626,8 @@ int wiphy_register(struct wiphy *wiphy)
626 626
627 if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) && 627 if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) &&
628 (!rdev->ops->start_nan || !rdev->ops->stop_nan || 628 (!rdev->ops->start_nan || !rdev->ops->stop_nan ||
629 !rdev->ops->add_nan_func || !rdev->ops->del_nan_func))) 629 !rdev->ops->add_nan_func || !rdev->ops->del_nan_func ||
630 !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ)))))
630 return -EINVAL; 631 return -EINVAL;
631 632
632#ifndef CONFIG_WIRELESS_WDS 633#ifndef CONFIG_WIRELESS_WDS
@@ -1142,6 +1143,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
1142 wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) 1143 wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
1143 dev->priv_flags |= IFF_DONT_BRIDGE; 1144 dev->priv_flags |= IFF_DONT_BRIDGE;
1144 1145
1146 INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk);
1147
1145 nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); 1148 nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
1146 break; 1149 break;
1147 case NETDEV_GOING_DOWN: 1150 case NETDEV_GOING_DOWN:
@@ -1230,6 +1233,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
1230#ifdef CONFIG_CFG80211_WEXT 1233#ifdef CONFIG_CFG80211_WEXT
1231 kzfree(wdev->wext.keys); 1234 kzfree(wdev->wext.keys);
1232#endif 1235#endif
1236 flush_work(&wdev->disconnect_wk);
1233 } 1237 }
1234 /* 1238 /*
1235 * synchronise (so that we won't find this netdev 1239 * synchronise (so that we won't find this netdev
diff --git a/net/wireless/core.h b/net/wireless/core.h
index af6e023020b1..58ca206982fe 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -228,6 +228,7 @@ struct cfg80211_event {
228 size_t resp_ie_len; 228 size_t resp_ie_len;
229 struct cfg80211_bss *bss; 229 struct cfg80211_bss *bss;
230 int status; /* -1 = failed; 0..65535 = status code */ 230 int status; /* -1 = failed; 0..65535 = status code */
231 enum nl80211_timeout_reason timeout_reason;
231 } cr; 232 } cr;
232 struct { 233 struct {
233 const u8 *req_ie; 234 const u8 *req_ie;
@@ -388,7 +389,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
388 const u8 *req_ie, size_t req_ie_len, 389 const u8 *req_ie, size_t req_ie_len,
389 const u8 *resp_ie, size_t resp_ie_len, 390 const u8 *resp_ie, size_t resp_ie_len,
390 int status, bool wextev, 391 int status, bool wextev,
391 struct cfg80211_bss *bss); 392 struct cfg80211_bss *bss,
393 enum nl80211_timeout_reason timeout_reason);
392void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, 394void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
393 size_t ie_len, u16 reason, bool from_ap); 395 size_t ie_len, u16 reason, bool from_ap);
394int cfg80211_disconnect(struct cfg80211_registered_device *rdev, 396int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
@@ -400,6 +402,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev,
400 const u8 *resp_ie, size_t resp_ie_len); 402 const u8 *resp_ie, size_t resp_ie_len);
401int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, 403int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
402 struct wireless_dev *wdev); 404 struct wireless_dev *wdev);
405void cfg80211_autodisconnect_wk(struct work_struct *work);
403 406
404/* SME implementation */ 407/* SME implementation */
405void cfg80211_conn_work(struct work_struct *work); 408void cfg80211_conn_work(struct work_struct *work);
@@ -430,6 +433,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
430void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); 433void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
431void cfg80211_process_wdev_events(struct wireless_dev *wdev); 434void cfg80211_process_wdev_events(struct wireless_dev *wdev);
432 435
436bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
437 u32 center_freq_khz, u32 bw_khz);
438
433/** 439/**
434 * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable 440 * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable
435 * @wiphy: the wiphy to validate against 441 * @wiphy: the wiphy to validate against
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index 5d453916a417..30fc6eb352bc 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -17,7 +17,7 @@
17static ssize_t name## _read(struct file *file, char __user *userbuf, \ 17static ssize_t name## _read(struct file *file, char __user *userbuf, \
18 size_t count, loff_t *ppos) \ 18 size_t count, loff_t *ppos) \
19{ \ 19{ \
20 struct wiphy *wiphy= file->private_data; \ 20 struct wiphy *wiphy = file->private_data; \
21 char buf[buflen]; \ 21 char buf[buflen]; \
22 int res; \ 22 int res; \
23 \ 23 \
@@ -29,14 +29,14 @@ static const struct file_operations name## _ops = { \
29 .read = name## _read, \ 29 .read = name## _read, \
30 .open = simple_open, \ 30 .open = simple_open, \
31 .llseek = generic_file_llseek, \ 31 .llseek = generic_file_llseek, \
32}; 32}
33 33
34DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d", 34DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
35 wiphy->rts_threshold) 35 wiphy->rts_threshold);
36DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d", 36DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d",
37 wiphy->frag_threshold); 37 wiphy->frag_threshold);
38DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d", 38DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d",
39 wiphy->retry_short) 39 wiphy->retry_short);
40DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d", 40DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d",
41 wiphy->retry_long); 41 wiphy->retry_long);
42 42
@@ -103,7 +103,7 @@ static const struct file_operations ht40allow_map_ops = {
103}; 103};
104 104
105#define DEBUGFS_ADD(name) \ 105#define DEBUGFS_ADD(name) \
106 debugfs_create_file(#name, S_IRUGO, phyd, &rdev->wiphy, &name## _ops); 106 debugfs_create_file(#name, 0444, phyd, &rdev->wiphy, &name## _ops)
107 107
108void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev) 108void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev)
109{ 109{
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 4646cf5695b9..22b3d9990065 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -48,7 +48,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss,
48 /* update current_bss etc., consumes the bss reference */ 48 /* update current_bss etc., consumes the bss reference */
49 __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs, 49 __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs,
50 status_code, 50 status_code,
51 status_code == WLAN_STATUS_SUCCESS, bss); 51 status_code == WLAN_STATUS_SUCCESS, bss,
52 NL80211_TIMEOUT_UNSPECIFIED);
52} 53}
53EXPORT_SYMBOL(cfg80211_rx_assoc_resp); 54EXPORT_SYMBOL(cfg80211_rx_assoc_resp);
54 55
@@ -345,6 +346,11 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
345 !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) 346 !ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
346 return 0; 347 return 0;
347 348
349 if (ether_addr_equal(wdev->disconnect_bssid, bssid) ||
350 (wdev->current_bss &&
351 ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
352 wdev->conn_owner_nlportid = 0;
353
348 return rdev_deauth(rdev, dev, &req); 354 return rdev_deauth(rdev, dev, &req);
349} 355}
350 356
@@ -657,8 +663,25 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
657 return err; 663 return err;
658 } 664 }
659 665
660 if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) 666 if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) {
661 return -EINVAL; 667 /* Allow random TA to be used with Public Action frames if the
668 * driver has indicated support for this. Otherwise, only allow
669 * the local address to be used.
670 */
671 if (!ieee80211_is_action(mgmt->frame_control) ||
672 mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
673 return -EINVAL;
674 if (!wdev->current_bss &&
675 !wiphy_ext_feature_isset(
676 &rdev->wiphy,
677 NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA))
678 return -EINVAL;
679 if (wdev->current_bss &&
680 !wiphy_ext_feature_isset(
681 &rdev->wiphy,
682 NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED))
683 return -EINVAL;
684 }
662 685
663 /* Transmit the Action frame as requested by user space */ 686 /* Transmit the Action frame as requested by user space */
664 return rdev_mgmt_tx(rdev, wdev, params, cookie); 687 return rdev_mgmt_tx(rdev, wdev, params, cookie);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5c1b267e22be..2312dc2ffdb9 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> 4 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
5 * Copyright 2013-2014 Intel Mobile Communications GmbH 5 * Copyright 2013-2014 Intel Mobile Communications GmbH
6 * Copyright 2015-2016 Intel Deutschland GmbH 6 * Copyright 2015-2017 Intel Deutschland GmbH
7 */ 7 */
8 8
9#include <linux/if.h> 9#include <linux/if.h>
@@ -398,13 +398,18 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
398 }, 398 },
399 [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, 399 [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
400 [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 }, 400 [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 },
401 [NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 }, 401 [NL80211_ATTR_BANDS] = { .type = NLA_U32 },
402 [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, 402 [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
403 [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, 403 [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY,
404 .len = FILS_MAX_KEK_LEN }, 404 .len = FILS_MAX_KEK_LEN },
405 [NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN }, 405 [NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN },
406 [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, }, 406 [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, },
407 [NL80211_ATTR_BSSID] = { .len = ETH_ALEN }, 407 [NL80211_ATTR_BSSID] = { .len = ETH_ALEN },
408 [NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 },
409 [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = {
410 .len = sizeof(struct nl80211_bss_select_rssi_adjust)
411 },
412 [NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 },
408}; 413};
409 414
410/* policy for the key attributes */ 415/* policy for the key attributes */
@@ -540,22 +545,18 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
540{ 545{
541 int err; 546 int err;
542 547
543 rtnl_lock();
544
545 if (!cb->args[0]) { 548 if (!cb->args[0]) {
546 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, 549 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
547 genl_family_attrbuf(&nl80211_fam), 550 genl_family_attrbuf(&nl80211_fam),
548 nl80211_fam.maxattr, nl80211_policy); 551 nl80211_fam.maxattr, nl80211_policy);
549 if (err) 552 if (err)
550 goto out_unlock; 553 return err;
551 554
552 *wdev = __cfg80211_wdev_from_attrs( 555 *wdev = __cfg80211_wdev_from_attrs(
553 sock_net(skb->sk), 556 sock_net(skb->sk),
554 genl_family_attrbuf(&nl80211_fam)); 557 genl_family_attrbuf(&nl80211_fam));
555 if (IS_ERR(*wdev)) { 558 if (IS_ERR(*wdev))
556 err = PTR_ERR(*wdev); 559 return PTR_ERR(*wdev);
557 goto out_unlock;
558 }
559 *rdev = wiphy_to_rdev((*wdev)->wiphy); 560 *rdev = wiphy_to_rdev((*wdev)->wiphy);
560 /* 0 is the first index - add 1 to parse only once */ 561 /* 0 is the first index - add 1 to parse only once */
561 cb->args[0] = (*rdev)->wiphy_idx + 1; 562 cb->args[0] = (*rdev)->wiphy_idx + 1;
@@ -565,10 +566,8 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
565 struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); 566 struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1);
566 struct wireless_dev *tmp; 567 struct wireless_dev *tmp;
567 568
568 if (!wiphy) { 569 if (!wiphy)
569 err = -ENODEV; 570 return -ENODEV;
570 goto out_unlock;
571 }
572 *rdev = wiphy_to_rdev(wiphy); 571 *rdev = wiphy_to_rdev(wiphy);
573 *wdev = NULL; 572 *wdev = NULL;
574 573
@@ -579,21 +578,11 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
579 } 578 }
580 } 579 }
581 580
582 if (!*wdev) { 581 if (!*wdev)
583 err = -ENODEV; 582 return -ENODEV;
584 goto out_unlock;
585 }
586 } 583 }
587 584
588 return 0; 585 return 0;
589 out_unlock:
590 rtnl_unlock();
591 return err;
592}
593
594static void nl80211_finish_wdev_dump(struct cfg80211_registered_device *rdev)
595{
596 rtnl_unlock();
597} 586}
598 587
599/* IE validation */ 588/* IE validation */
@@ -1881,6 +1870,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
1881 } 1870 }
1882 } 1871 }
1883 1872
1873 if (nla_put_u32(msg, NL80211_ATTR_BANDS,
1874 rdev->wiphy.nan_supported_bands))
1875 goto nla_put_failure;
1876
1884 /* done */ 1877 /* done */
1885 state->split_start = 0; 1878 state->split_start = 0;
1886 break; 1879 break;
@@ -2599,17 +2592,17 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
2599 int filter_wiphy = -1; 2592 int filter_wiphy = -1;
2600 struct cfg80211_registered_device *rdev; 2593 struct cfg80211_registered_device *rdev;
2601 struct wireless_dev *wdev; 2594 struct wireless_dev *wdev;
2595 int ret;
2602 2596
2603 rtnl_lock(); 2597 rtnl_lock();
2604 if (!cb->args[2]) { 2598 if (!cb->args[2]) {
2605 struct nl80211_dump_wiphy_state state = { 2599 struct nl80211_dump_wiphy_state state = {
2606 .filter_wiphy = -1, 2600 .filter_wiphy = -1,
2607 }; 2601 };
2608 int ret;
2609 2602
2610 ret = nl80211_dump_wiphy_parse(skb, cb, &state); 2603 ret = nl80211_dump_wiphy_parse(skb, cb, &state);
2611 if (ret) 2604 if (ret)
2612 return ret; 2605 goto out_unlock;
2613 2606
2614 filter_wiphy = state.filter_wiphy; 2607 filter_wiphy = state.filter_wiphy;
2615 2608
@@ -2654,12 +2647,14 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
2654 wp_idx++; 2647 wp_idx++;
2655 } 2648 }
2656 out: 2649 out:
2657 rtnl_unlock();
2658
2659 cb->args[0] = wp_idx; 2650 cb->args[0] = wp_idx;
2660 cb->args[1] = if_idx; 2651 cb->args[1] = if_idx;
2661 2652
2662 return skb->len; 2653 ret = skb->len;
2654 out_unlock:
2655 rtnl_unlock();
2656
2657 return ret;
2663} 2658}
2664 2659
2665static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) 2660static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info)
@@ -3738,6 +3733,49 @@ static int nl80211_parse_beacon(struct nlattr *attrs[],
3738 return 0; 3733 return 0;
3739} 3734}
3740 3735
3736static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
3737 const u8 *rates)
3738{
3739 int i;
3740
3741 if (!rates)
3742 return;
3743
3744 for (i = 0; i < rates[1]; i++) {
3745 if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY)
3746 params->ht_required = true;
3747 if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY)
3748 params->vht_required = true;
3749 }
3750}
3751
3752/*
3753 * Since the nl80211 API didn't include, from the beginning, attributes about
3754 * HT/VHT requirements/capabilities, we parse them out of the IEs for the
3755 * benefit of drivers that rebuild IEs in the firmware.
3756 */
3757static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
3758{
3759 const struct cfg80211_beacon_data *bcn = &params->beacon;
3760 size_t ies_len = bcn->beacon_ies_len;
3761 const u8 *ies = bcn->beacon_ies;
3762 const u8 *rates;
3763 const u8 *cap;
3764
3765 rates = cfg80211_find_ie(WLAN_EID_SUPP_RATES, ies, ies_len);
3766 nl80211_check_ap_rate_selectors(params, rates);
3767
3768 rates = cfg80211_find_ie(WLAN_EID_EXT_SUPP_RATES, ies, ies_len);
3769 nl80211_check_ap_rate_selectors(params, rates);
3770
3771 cap = cfg80211_find_ie(WLAN_EID_HT_CAPABILITY, ies, ies_len);
3772 if (cap && cap[1] >= sizeof(*params->ht_cap))
3773 params->ht_cap = (void *)(cap + 2);
3774 cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len);
3775 if (cap && cap[1] >= sizeof(*params->vht_cap))
3776 params->vht_cap = (void *)(cap + 2);
3777}
3778
3741static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev, 3779static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
3742 struct cfg80211_ap_settings *params) 3780 struct cfg80211_ap_settings *params)
3743{ 3781{
@@ -3966,6 +4004,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
3966 return PTR_ERR(params.acl); 4004 return PTR_ERR(params.acl);
3967 } 4005 }
3968 4006
4007 nl80211_calculate_ap_params(&params);
4008
3969 wdev_lock(wdev); 4009 wdev_lock(wdev);
3970 err = rdev_start_ap(rdev, dev, &params); 4010 err = rdev_start_ap(rdev, dev, &params);
3971 if (!err) { 4011 if (!err) {
@@ -4398,9 +4438,10 @@ static int nl80211_dump_station(struct sk_buff *skb,
4398 int sta_idx = cb->args[2]; 4438 int sta_idx = cb->args[2];
4399 int err; 4439 int err;
4400 4440
4441 rtnl_lock();
4401 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 4442 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
4402 if (err) 4443 if (err)
4403 return err; 4444 goto out_err;
4404 4445
4405 if (!wdev->netdev) { 4446 if (!wdev->netdev) {
4406 err = -EINVAL; 4447 err = -EINVAL;
@@ -4435,7 +4476,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
4435 cb->args[2] = sta_idx; 4476 cb->args[2] = sta_idx;
4436 err = skb->len; 4477 err = skb->len;
4437 out_err: 4478 out_err:
4438 nl80211_finish_wdev_dump(rdev); 4479 rtnl_unlock();
4439 4480
4440 return err; 4481 return err;
4441} 4482}
@@ -5221,9 +5262,10 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
5221 int path_idx = cb->args[2]; 5262 int path_idx = cb->args[2];
5222 int err; 5263 int err;
5223 5264
5265 rtnl_lock();
5224 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 5266 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
5225 if (err) 5267 if (err)
5226 return err; 5268 goto out_err;
5227 5269
5228 if (!rdev->ops->dump_mpath) { 5270 if (!rdev->ops->dump_mpath) {
5229 err = -EOPNOTSUPP; 5271 err = -EOPNOTSUPP;
@@ -5256,7 +5298,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
5256 cb->args[2] = path_idx; 5298 cb->args[2] = path_idx;
5257 err = skb->len; 5299 err = skb->len;
5258 out_err: 5300 out_err:
5259 nl80211_finish_wdev_dump(rdev); 5301 rtnl_unlock();
5260 return err; 5302 return err;
5261} 5303}
5262 5304
@@ -5416,9 +5458,10 @@ static int nl80211_dump_mpp(struct sk_buff *skb,
5416 int path_idx = cb->args[2]; 5458 int path_idx = cb->args[2];
5417 int err; 5459 int err;
5418 5460
5461 rtnl_lock();
5419 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 5462 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
5420 if (err) 5463 if (err)
5421 return err; 5464 goto out_err;
5422 5465
5423 if (!rdev->ops->dump_mpp) { 5466 if (!rdev->ops->dump_mpp) {
5424 err = -EOPNOTSUPP; 5467 err = -EOPNOTSUPP;
@@ -5451,7 +5494,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb,
5451 cb->args[2] = path_idx; 5494 cb->args[2] = path_idx;
5452 err = skb->len; 5495 err = skb->len;
5453 out_err: 5496 out_err:
5454 nl80211_finish_wdev_dump(rdev); 5497 rtnl_unlock();
5455 return err; 5498 return err;
5456} 5499}
5457 5500
@@ -5916,6 +5959,7 @@ do { \
5916 break; 5959 break;
5917 } 5960 }
5918 cfg->ht_opmode = ht_opmode; 5961 cfg->ht_opmode = ht_opmode;
5962 mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1));
5919 } 5963 }
5920 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout, 5964 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout,
5921 1, 65535, mask, 5965 1, 65535, mask,
@@ -6790,13 +6834,10 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
6790 6834
6791 /* 6835 /*
6792 * If scan plans are not specified, 6836 * If scan plans are not specified,
6793 * %NL80211_ATTR_SCHED_SCAN_INTERVAL must be specified. In this 6837 * %NL80211_ATTR_SCHED_SCAN_INTERVAL will be specified. In this
6794 * case one scan plan will be set with the specified scan 6838 * case one scan plan will be set with the specified scan
6795 * interval and infinite number of iterations. 6839 * interval and infinite number of iterations.
6796 */ 6840 */
6797 if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
6798 return -EINVAL;
6799
6800 interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]); 6841 interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]);
6801 if (!interval) 6842 if (!interval)
6802 return -EINVAL; 6843 return -EINVAL;
@@ -6865,7 +6906,7 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
6865 6906
6866static struct cfg80211_sched_scan_request * 6907static struct cfg80211_sched_scan_request *
6867nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, 6908nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
6868 struct nlattr **attrs) 6909 struct nlattr **attrs, int max_match_sets)
6869{ 6910{
6870 struct cfg80211_sched_scan_request *request; 6911 struct cfg80211_sched_scan_request *request;
6871 struct nlattr *attr; 6912 struct nlattr *attr;
@@ -6930,7 +6971,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
6930 if (!n_match_sets && default_match_rssi != NL80211_SCAN_RSSI_THOLD_OFF) 6971 if (!n_match_sets && default_match_rssi != NL80211_SCAN_RSSI_THOLD_OFF)
6931 n_match_sets = 1; 6972 n_match_sets = 1;
6932 6973
6933 if (n_match_sets > wiphy->max_match_sets) 6974 if (n_match_sets > max_match_sets)
6934 return ERR_PTR(-EINVAL); 6975 return ERR_PTR(-EINVAL);
6935 6976
6936 if (attrs[NL80211_ATTR_IE]) 6977 if (attrs[NL80211_ATTR_IE])
@@ -6968,6 +7009,12 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
6968 if (!n_plans || n_plans > wiphy->max_sched_scan_plans) 7009 if (!n_plans || n_plans > wiphy->max_sched_scan_plans)
6969 return ERR_PTR(-EINVAL); 7010 return ERR_PTR(-EINVAL);
6970 7011
7012 if (!wiphy_ext_feature_isset(
7013 wiphy, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI) &&
7014 (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] ||
7015 attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]))
7016 return ERR_PTR(-EINVAL);
7017
6971 request = kzalloc(sizeof(*request) 7018 request = kzalloc(sizeof(*request)
6972 + sizeof(*request->ssids) * n_ssids 7019 + sizeof(*request->ssids) * n_ssids
6973 + sizeof(*request->match_sets) * n_match_sets 7020 + sizeof(*request->match_sets) * n_match_sets
@@ -7174,6 +7221,26 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
7174 request->delay = 7221 request->delay =
7175 nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]); 7222 nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]);
7176 7223
7224 if (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]) {
7225 request->relative_rssi = nla_get_s8(
7226 attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]);
7227 request->relative_rssi_set = true;
7228 }
7229
7230 if (request->relative_rssi_set &&
7231 attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]) {
7232 struct nl80211_bss_select_rssi_adjust *rssi_adjust;
7233
7234 rssi_adjust = nla_data(
7235 attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]);
7236 request->rssi_adjust.band = rssi_adjust->band;
7237 request->rssi_adjust.delta = rssi_adjust->delta;
7238 if (!is_band_valid(wiphy, request->rssi_adjust.band)) {
7239 err = -EINVAL;
7240 goto out_free;
7241 }
7242 }
7243
7177 err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs); 7244 err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs);
7178 if (err) 7245 if (err)
7179 goto out_free; 7246 goto out_free;
@@ -7204,7 +7271,8 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
7204 return -EINPROGRESS; 7271 return -EINPROGRESS;
7205 7272
7206 sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev, 7273 sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev,
7207 info->attrs); 7274 info->attrs,
7275 rdev->wiphy.max_match_sets);
7208 7276
7209 err = PTR_ERR_OR_ZERO(sched_scan_req); 7277 err = PTR_ERR_OR_ZERO(sched_scan_req);
7210 if (err) 7278 if (err)
@@ -7595,9 +7663,12 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
7595 int start = cb->args[2], idx = 0; 7663 int start = cb->args[2], idx = 0;
7596 int err; 7664 int err;
7597 7665
7666 rtnl_lock();
7598 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 7667 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
7599 if (err) 7668 if (err) {
7669 rtnl_unlock();
7600 return err; 7670 return err;
7671 }
7601 7672
7602 wdev_lock(wdev); 7673 wdev_lock(wdev);
7603 spin_lock_bh(&rdev->bss_lock); 7674 spin_lock_bh(&rdev->bss_lock);
@@ -7620,7 +7691,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
7620 wdev_unlock(wdev); 7691 wdev_unlock(wdev);
7621 7692
7622 cb->args[2] = idx; 7693 cb->args[2] = idx;
7623 nl80211_finish_wdev_dump(rdev); 7694 rtnl_unlock();
7624 7695
7625 return skb->len; 7696 return skb->len;
7626} 7697}
@@ -7705,9 +7776,10 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
7705 int res; 7776 int res;
7706 bool radio_stats; 7777 bool radio_stats;
7707 7778
7779 rtnl_lock();
7708 res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 7780 res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
7709 if (res) 7781 if (res)
7710 return res; 7782 goto out_err;
7711 7783
7712 /* prepare_wdev_dump parsed the attributes */ 7784 /* prepare_wdev_dump parsed the attributes */
7713 radio_stats = attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS]; 7785 radio_stats = attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS];
@@ -7748,7 +7820,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
7748 cb->args[2] = survey_idx; 7820 cb->args[2] = survey_idx;
7749 res = skb->len; 7821 res = skb->len;
7750 out_err: 7822 out_err:
7751 nl80211_finish_wdev_dump(rdev); 7823 rtnl_unlock();
7752 return res; 7824 return res;
7753} 7825}
7754 7826
@@ -8068,8 +8140,17 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
8068 err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); 8140 err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
8069 if (!err) { 8141 if (!err) {
8070 wdev_lock(dev->ieee80211_ptr); 8142 wdev_lock(dev->ieee80211_ptr);
8143
8071 err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, 8144 err = cfg80211_mlme_assoc(rdev, dev, chan, bssid,
8072 ssid, ssid_len, &req); 8145 ssid, ssid_len, &req);
8146
8147 if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
8148 dev->ieee80211_ptr->conn_owner_nlportid =
8149 info->snd_portid;
8150 memcpy(dev->ieee80211_ptr->disconnect_bssid,
8151 bssid, ETH_ALEN);
8152 }
8153
8073 wdev_unlock(dev->ieee80211_ptr); 8154 wdev_unlock(dev->ieee80211_ptr);
8074 } 8155 }
8075 8156
@@ -8548,6 +8629,12 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
8548 * so we need to offset by 1. 8629 * so we need to offset by 1.
8549 */ 8630 */
8550 phy_idx = cb->args[0] - 1; 8631 phy_idx = cb->args[0] - 1;
8632
8633 rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
8634 if (!rdev) {
8635 err = -ENOENT;
8636 goto out_err;
8637 }
8551 } else { 8638 } else {
8552 struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam); 8639 struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
8553 8640
@@ -8562,7 +8649,6 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
8562 goto out_err; 8649 goto out_err;
8563 } 8650 }
8564 phy_idx = rdev->wiphy_idx; 8651 phy_idx = rdev->wiphy_idx;
8565 rdev = NULL;
8566 8652
8567 if (attrbuf[NL80211_ATTR_TESTDATA]) 8653 if (attrbuf[NL80211_ATTR_TESTDATA])
8568 cb->args[1] = (long)attrbuf[NL80211_ATTR_TESTDATA]; 8654 cb->args[1] = (long)attrbuf[NL80211_ATTR_TESTDATA];
@@ -8573,12 +8659,6 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
8573 data_len = nla_len((void *)cb->args[1]); 8659 data_len = nla_len((void *)cb->args[1]);
8574 } 8660 }
8575 8661
8576 rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
8577 if (!rdev) {
8578 err = -ENOENT;
8579 goto out_err;
8580 }
8581
8582 if (!rdev->ops->testmode_dump) { 8662 if (!rdev->ops->testmode_dump) {
8583 err = -EOPNOTSUPP; 8663 err = -EOPNOTSUPP;
8584 goto out_err; 8664 goto out_err;
@@ -8788,11 +8868,24 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
8788 } 8868 }
8789 8869
8790 wdev_lock(dev->ieee80211_ptr); 8870 wdev_lock(dev->ieee80211_ptr);
8871
8791 err = cfg80211_connect(rdev, dev, &connect, connkeys, 8872 err = cfg80211_connect(rdev, dev, &connect, connkeys,
8792 connect.prev_bssid); 8873 connect.prev_bssid);
8793 wdev_unlock(dev->ieee80211_ptr);
8794 if (err) 8874 if (err)
8795 kzfree(connkeys); 8875 kzfree(connkeys);
8876
8877 if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
8878 dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
8879 if (connect.bssid)
8880 memcpy(dev->ieee80211_ptr->disconnect_bssid,
8881 connect.bssid, ETH_ALEN);
8882 else
8883 memset(dev->ieee80211_ptr->disconnect_bssid,
8884 0, ETH_ALEN);
8885 }
8886
8887 wdev_unlock(dev->ieee80211_ptr);
8888
8796 return err; 8889 return err;
8797} 8890}
8798 8891
@@ -9379,6 +9472,7 @@ nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] = {
9379 [NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 }, 9472 [NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 },
9380 [NL80211_ATTR_CQM_TXE_PKTS] = { .type = NLA_U32 }, 9473 [NL80211_ATTR_CQM_TXE_PKTS] = { .type = NLA_U32 },
9381 [NL80211_ATTR_CQM_TXE_INTVL] = { .type = NLA_U32 }, 9474 [NL80211_ATTR_CQM_TXE_INTVL] = { .type = NLA_U32 },
9475 [NL80211_ATTR_CQM_RSSI_LEVEL] = { .type = NLA_S32 },
9382}; 9476};
9383 9477
9384static int nl80211_set_cqm_txe(struct genl_info *info, 9478static int nl80211_set_cqm_txe(struct genl_info *info,
@@ -9688,6 +9782,20 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
9688 if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay)) 9782 if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay))
9689 return -ENOBUFS; 9783 return -ENOBUFS;
9690 9784
9785 if (req->relative_rssi_set) {
9786 struct nl80211_bss_select_rssi_adjust rssi_adjust;
9787
9788 if (nla_put_s8(msg, NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI,
9789 req->relative_rssi))
9790 return -ENOBUFS;
9791
9792 rssi_adjust.band = req->rssi_adjust.band;
9793 rssi_adjust.delta = req->rssi_adjust.delta;
9794 if (nla_put(msg, NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST,
9795 sizeof(rssi_adjust), &rssi_adjust))
9796 return -ENOBUFS;
9797 }
9798
9691 freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES); 9799 freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES);
9692 if (!freqs) 9800 if (!freqs)
9693 return -ENOBUFS; 9801 return -ENOBUFS;
@@ -9981,7 +10089,8 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev,
9981 if (err) 10089 if (err)
9982 goto out; 10090 goto out;
9983 10091
9984 trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb); 10092 trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb,
10093 wowlan->max_nd_match_sets);
9985 err = PTR_ERR_OR_ZERO(trig->nd_config); 10094 err = PTR_ERR_OR_ZERO(trig->nd_config);
9986 if (err) 10095 if (err)
9987 trig->nd_config = NULL; 10096 trig->nd_config = NULL;
@@ -10666,15 +10775,22 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
10666 if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) 10775 if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
10667 return -EINVAL; 10776 return -EINVAL;
10668 10777
10669 if (!info->attrs[NL80211_ATTR_NAN_DUAL])
10670 return -EINVAL;
10671
10672 conf.master_pref = 10778 conf.master_pref =
10673 nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); 10779 nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
10674 if (!conf.master_pref) 10780 if (!conf.master_pref)
10675 return -EINVAL; 10781 return -EINVAL;
10676 10782
10677 conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]); 10783 if (info->attrs[NL80211_ATTR_BANDS]) {
10784 u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
10785
10786 if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
10787 return -EOPNOTSUPP;
10788
10789 if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
10790 return -EINVAL;
10791
10792 conf.bands = bands;
10793 }
10678 10794
10679 err = rdev_start_nan(rdev, wdev, &conf); 10795 err = rdev_start_nan(rdev, wdev, &conf);
10680 if (err) 10796 if (err)
@@ -11039,9 +11155,17 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
11039 changed |= CFG80211_NAN_CONF_CHANGED_PREF; 11155 changed |= CFG80211_NAN_CONF_CHANGED_PREF;
11040 } 11156 }
11041 11157
11042 if (info->attrs[NL80211_ATTR_NAN_DUAL]) { 11158 if (info->attrs[NL80211_ATTR_BANDS]) {
11043 conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]); 11159 u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
11044 changed |= CFG80211_NAN_CONF_CHANGED_DUAL; 11160
11161 if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
11162 return -EOPNOTSUPP;
11163
11164 if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
11165 return -EINVAL;
11166
11167 conf.bands = bands;
11168 changed |= CFG80211_NAN_CONF_CHANGED_BANDS;
11045 } 11169 }
11046 11170
11047 if (!changed) 11171 if (!changed)
@@ -11377,17 +11501,13 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
11377 void *data = NULL; 11501 void *data = NULL;
11378 unsigned int data_len = 0; 11502 unsigned int data_len = 0;
11379 11503
11380 rtnl_lock();
11381
11382 if (cb->args[0]) { 11504 if (cb->args[0]) {
11383 /* subtract the 1 again here */ 11505 /* subtract the 1 again here */
11384 struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); 11506 struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1);
11385 struct wireless_dev *tmp; 11507 struct wireless_dev *tmp;
11386 11508
11387 if (!wiphy) { 11509 if (!wiphy)
11388 err = -ENODEV; 11510 return -ENODEV;
11389 goto out_unlock;
11390 }
11391 *rdev = wiphy_to_rdev(wiphy); 11511 *rdev = wiphy_to_rdev(wiphy);
11392 *wdev = NULL; 11512 *wdev = NULL;
11393 11513
@@ -11407,23 +11527,19 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
11407 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, 11527 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
11408 attrbuf, nl80211_fam.maxattr, nl80211_policy); 11528 attrbuf, nl80211_fam.maxattr, nl80211_policy);
11409 if (err) 11529 if (err)
11410 goto out_unlock; 11530 return err;
11411 11531
11412 if (!attrbuf[NL80211_ATTR_VENDOR_ID] || 11532 if (!attrbuf[NL80211_ATTR_VENDOR_ID] ||
11413 !attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) { 11533 !attrbuf[NL80211_ATTR_VENDOR_SUBCMD])
11414 err = -EINVAL; 11534 return -EINVAL;
11415 goto out_unlock;
11416 }
11417 11535
11418 *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), attrbuf); 11536 *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), attrbuf);
11419 if (IS_ERR(*wdev)) 11537 if (IS_ERR(*wdev))
11420 *wdev = NULL; 11538 *wdev = NULL;
11421 11539
11422 *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf); 11540 *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf);
11423 if (IS_ERR(*rdev)) { 11541 if (IS_ERR(*rdev))
11424 err = PTR_ERR(*rdev); 11542 return PTR_ERR(*rdev);
11425 goto out_unlock;
11426 }
11427 11543
11428 vid = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_ID]); 11544 vid = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_ID]);
11429 subcmd = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_SUBCMD]); 11545 subcmd = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_SUBCMD]);
@@ -11436,19 +11552,15 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
11436 if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd) 11552 if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd)
11437 continue; 11553 continue;
11438 11554
11439 if (!vcmd->dumpit) { 11555 if (!vcmd->dumpit)
11440 err = -EOPNOTSUPP; 11556 return -EOPNOTSUPP;
11441 goto out_unlock;
11442 }
11443 11557
11444 vcmd_idx = i; 11558 vcmd_idx = i;
11445 break; 11559 break;
11446 } 11560 }
11447 11561
11448 if (vcmd_idx < 0) { 11562 if (vcmd_idx < 0)
11449 err = -EOPNOTSUPP; 11563 return -EOPNOTSUPP;
11450 goto out_unlock;
11451 }
11452 11564
11453 if (attrbuf[NL80211_ATTR_VENDOR_DATA]) { 11565 if (attrbuf[NL80211_ATTR_VENDOR_DATA]) {
11454 data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]); 11566 data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]);
@@ -11465,9 +11577,6 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
11465 11577
11466 /* keep rtnl locked in successful case */ 11578 /* keep rtnl locked in successful case */
11467 return 0; 11579 return 0;
11468 out_unlock:
11469 rtnl_unlock();
11470 return err;
11471} 11580}
11472 11581
11473static int nl80211_vendor_cmd_dump(struct sk_buff *skb, 11582static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
@@ -11482,9 +11591,10 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
11482 int err; 11591 int err;
11483 struct nlattr *vendor_data; 11592 struct nlattr *vendor_data;
11484 11593
11594 rtnl_lock();
11485 err = nl80211_prepare_vendor_dump(skb, cb, &rdev, &wdev); 11595 err = nl80211_prepare_vendor_dump(skb, cb, &rdev, &wdev);
11486 if (err) 11596 if (err)
11487 return err; 11597 goto out;
11488 11598
11489 vcmd_idx = cb->args[2]; 11599 vcmd_idx = cb->args[2];
11490 data = (void *)cb->args[3]; 11600 data = (void *)cb->args[3];
@@ -11493,15 +11603,21 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
11493 11603
11494 if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV | 11604 if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV |
11495 WIPHY_VENDOR_CMD_NEED_NETDEV)) { 11605 WIPHY_VENDOR_CMD_NEED_NETDEV)) {
11496 if (!wdev) 11606 if (!wdev) {
11497 return -EINVAL; 11607 err = -EINVAL;
11608 goto out;
11609 }
11498 if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV && 11610 if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV &&
11499 !wdev->netdev) 11611 !wdev->netdev) {
11500 return -EINVAL; 11612 err = -EINVAL;
11613 goto out;
11614 }
11501 11615
11502 if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) { 11616 if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) {
11503 if (!wdev_running(wdev)) 11617 if (!wdev_running(wdev)) {
11504 return -ENETDOWN; 11618 err = -ENETDOWN;
11619 goto out;
11620 }
11505 } 11621 }
11506 } 11622 }
11507 11623
@@ -11822,9 +11938,6 @@ static int nl80211_set_multicast_to_unicast(struct sk_buff *skb,
11822 const struct nlattr *nla; 11938 const struct nlattr *nla;
11823 bool enabled; 11939 bool enabled;
11824 11940
11825 if (netif_running(dev))
11826 return -EBUSY;
11827
11828 if (!rdev->ops->set_multicast_to_unicast) 11941 if (!rdev->ops->set_multicast_to_unicast)
11829 return -EOPNOTSUPP; 11942 return -EOPNOTSUPP;
11830 11943
@@ -12825,7 +12938,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg,
12825 return -ENOBUFS; 12938 return -ENOBUFS;
12826} 12939}
12827 12940
12828static int nl80211_send_scan_msg(struct sk_buff *msg, 12941static int nl80211_prep_scan_msg(struct sk_buff *msg,
12829 struct cfg80211_registered_device *rdev, 12942 struct cfg80211_registered_device *rdev,
12830 struct wireless_dev *wdev, 12943 struct wireless_dev *wdev,
12831 u32 portid, u32 seq, int flags, 12944 u32 portid, u32 seq, int flags,
@@ -12856,7 +12969,7 @@ static int nl80211_send_scan_msg(struct sk_buff *msg,
12856} 12969}
12857 12970
12858static int 12971static int
12859nl80211_send_sched_scan_msg(struct sk_buff *msg, 12972nl80211_prep_sched_scan_msg(struct sk_buff *msg,
12860 struct cfg80211_registered_device *rdev, 12973 struct cfg80211_registered_device *rdev,
12861 struct net_device *netdev, 12974 struct net_device *netdev,
12862 u32 portid, u32 seq, int flags, u32 cmd) 12975 u32 portid, u32 seq, int flags, u32 cmd)
@@ -12888,7 +13001,7 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
12888 if (!msg) 13001 if (!msg)
12889 return; 13002 return;
12890 13003
12891 if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, 13004 if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0,
12892 NL80211_CMD_TRIGGER_SCAN) < 0) { 13005 NL80211_CMD_TRIGGER_SCAN) < 0) {
12893 nlmsg_free(msg); 13006 nlmsg_free(msg);
12894 return; 13007 return;
@@ -12907,7 +13020,7 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
12907 if (!msg) 13020 if (!msg)
12908 return NULL; 13021 return NULL;
12909 13022
12910 if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, 13023 if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0,
12911 aborted ? NL80211_CMD_SCAN_ABORTED : 13024 aborted ? NL80211_CMD_SCAN_ABORTED :
12912 NL80211_CMD_NEW_SCAN_RESULTS) < 0) { 13025 NL80211_CMD_NEW_SCAN_RESULTS) < 0) {
12913 nlmsg_free(msg); 13026 nlmsg_free(msg);
@@ -12917,31 +13030,13 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
12917 return msg; 13030 return msg;
12918} 13031}
12919 13032
12920void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, 13033/* send message created by nl80211_build_scan_msg() */
12921 struct sk_buff *msg) 13034void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
12922{ 13035 struct sk_buff *msg)
12923 if (!msg)
12924 return;
12925
12926 genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
12927 NL80211_MCGRP_SCAN, GFP_KERNEL);
12928}
12929
12930void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
12931 struct net_device *netdev)
12932{ 13036{
12933 struct sk_buff *msg;
12934
12935 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
12936 if (!msg) 13037 if (!msg)
12937 return; 13038 return;
12938 13039
12939 if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0,
12940 NL80211_CMD_SCHED_SCAN_RESULTS) < 0) {
12941 nlmsg_free(msg);
12942 return;
12943 }
12944
12945 genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, 13040 genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
12946 NL80211_MCGRP_SCAN, GFP_KERNEL); 13041 NL80211_MCGRP_SCAN, GFP_KERNEL);
12947} 13042}
@@ -12955,7 +13050,7 @@ void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
12955 if (!msg) 13050 if (!msg)
12956 return; 13051 return;
12957 13052
12958 if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) { 13053 if (nl80211_prep_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) {
12959 nlmsg_free(msg); 13054 nlmsg_free(msg);
12960 return; 13055 return;
12961 } 13056 }
@@ -13057,7 +13152,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
13057 struct sk_buff *msg; 13152 struct sk_buff *msg;
13058 void *hdr; 13153 void *hdr;
13059 13154
13060 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); 13155 msg = nlmsg_new(100 + len, gfp);
13061 if (!msg) 13156 if (!msg)
13062 return; 13157 return;
13063 13158
@@ -13204,12 +13299,14 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
13204 struct net_device *netdev, const u8 *bssid, 13299 struct net_device *netdev, const u8 *bssid,
13205 const u8 *req_ie, size_t req_ie_len, 13300 const u8 *req_ie, size_t req_ie_len,
13206 const u8 *resp_ie, size_t resp_ie_len, 13301 const u8 *resp_ie, size_t resp_ie_len,
13207 int status, gfp_t gfp) 13302 int status,
13303 enum nl80211_timeout_reason timeout_reason,
13304 gfp_t gfp)
13208{ 13305{
13209 struct sk_buff *msg; 13306 struct sk_buff *msg;
13210 void *hdr; 13307 void *hdr;
13211 13308
13212 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); 13309 msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp);
13213 if (!msg) 13310 if (!msg)
13214 return; 13311 return;
13215 13312
@@ -13225,7 +13322,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
13225 nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, 13322 nla_put_u16(msg, NL80211_ATTR_STATUS_CODE,
13226 status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE : 13323 status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE :
13227 status) || 13324 status) ||
13228 (status < 0 && nla_put_flag(msg, NL80211_ATTR_TIMED_OUT)) || 13325 (status < 0 &&
13326 (nla_put_flag(msg, NL80211_ATTR_TIMED_OUT) ||
13327 nla_put_u32(msg, NL80211_ATTR_TIMEOUT_REASON, timeout_reason))) ||
13229 (req_ie && 13328 (req_ie &&
13230 nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) || 13329 nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) ||
13231 (resp_ie && 13330 (resp_ie &&
@@ -13251,7 +13350,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
13251 struct sk_buff *msg; 13350 struct sk_buff *msg;
13252 void *hdr; 13351 void *hdr;
13253 13352
13254 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); 13353 msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp);
13255 if (!msg) 13354 if (!msg)
13256 return; 13355 return;
13257 13356
@@ -13288,7 +13387,7 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
13288 struct sk_buff *msg; 13387 struct sk_buff *msg;
13289 void *hdr; 13388 void *hdr;
13290 13389
13291 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 13390 msg = nlmsg_new(100 + ie_len, GFP_KERNEL);
13292 if (!msg) 13391 if (!msg)
13293 return; 13392 return;
13294 13393
@@ -13364,7 +13463,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
13364 13463
13365 trace_cfg80211_notify_new_peer_candidate(dev, addr); 13464 trace_cfg80211_notify_new_peer_candidate(dev, addr);
13366 13465
13367 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); 13466 msg = nlmsg_new(100 + ie_len, gfp);
13368 if (!msg) 13467 if (!msg)
13369 return; 13468 return;
13370 13469
@@ -13735,7 +13834,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
13735 struct sk_buff *msg; 13834 struct sk_buff *msg;
13736 void *hdr; 13835 void *hdr;
13737 13836
13738 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); 13837 msg = nlmsg_new(100 + len, gfp);
13739 if (!msg) 13838 if (!msg)
13740 return -ENOMEM; 13839 return -ENOMEM;
13741 13840
@@ -13779,7 +13878,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
13779 13878
13780 trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); 13879 trace_cfg80211_mgmt_tx_status(wdev, cookie, ack);
13781 13880
13782 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); 13881 msg = nlmsg_new(100 + len, gfp);
13783 if (!msg) 13882 if (!msg)
13784 return; 13883 return;
13785 13884
@@ -13866,11 +13965,11 @@ static void cfg80211_send_cqm(struct sk_buff *msg, gfp_t gfp)
13866 13965
13867void cfg80211_cqm_rssi_notify(struct net_device *dev, 13966void cfg80211_cqm_rssi_notify(struct net_device *dev,
13868 enum nl80211_cqm_rssi_threshold_event rssi_event, 13967 enum nl80211_cqm_rssi_threshold_event rssi_event,
13869 gfp_t gfp) 13968 s32 rssi_level, gfp_t gfp)
13870{ 13969{
13871 struct sk_buff *msg; 13970 struct sk_buff *msg;
13872 13971
13873 trace_cfg80211_cqm_rssi_notify(dev, rssi_event); 13972 trace_cfg80211_cqm_rssi_notify(dev, rssi_event, rssi_level);
13874 13973
13875 if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW && 13974 if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW &&
13876 rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH)) 13975 rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH))
@@ -13884,6 +13983,10 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev,
13884 rssi_event)) 13983 rssi_event))
13885 goto nla_put_failure; 13984 goto nla_put_failure;
13886 13985
13986 if (rssi_level && nla_put_s32(msg, NL80211_ATTR_CQM_RSSI_LEVEL,
13987 rssi_level))
13988 goto nla_put_failure;
13989
13887 cfg80211_send_cqm(msg, gfp); 13990 cfg80211_send_cqm(msg, gfp);
13888 13991
13889 return; 13992 return;
@@ -14534,6 +14637,8 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
14534 14637
14535 if (wdev->owner_nlportid == notify->portid) 14638 if (wdev->owner_nlportid == notify->portid)
14536 schedule_destroy_work = true; 14639 schedule_destroy_work = true;
14640 else if (wdev->conn_owner_nlportid == notify->portid)
14641 schedule_work(&wdev->disconnect_wk);
14537 } 14642 }
14538 14643
14539 spin_lock_bh(&rdev->beacon_registrations_lock); 14644 spin_lock_bh(&rdev->beacon_registrations_lock);
@@ -14588,7 +14693,7 @@ void cfg80211_ft_event(struct net_device *netdev,
14588 if (!ft_event->target_ap) 14693 if (!ft_event->target_ap)
14589 return; 14694 return;
14590 14695
14591 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 14696 msg = nlmsg_new(100 + ft_event->ric_ies_len, GFP_KERNEL);
14592 if (!msg) 14697 if (!msg)
14593 return; 14698 return;
14594 14699
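The nl80211.c hunks above replace the old NL80211_ATTR_NAN_DUAL u8 attribute with an NL80211_ATTR_BANDS bitmask, in both nl80211_start_nan() and nl80211_nan_change_config(): the requested bands must be a subset of the wiphy's nan_supported_bands and, if non-zero, must include the 2.4 GHz band. A minimal sketch of that validation, lifted from the hunks above (names come from the diff; this is not a complete command handler):

#include <linux/bitops.h>
#include <net/cfg80211.h>
#include <net/genetlink.h>

/* Sketch of the NL80211_ATTR_BANDS validation added above; assumes info,
 * wdev and conf are set up as in nl80211_start_nan(). */
static int nan_parse_bands(struct genl_info *info, struct wireless_dev *wdev,
                           struct cfg80211_nan_conf *conf)
{
        u32 bands;

        if (!info->attrs[NL80211_ATTR_BANDS])
                return 0;               /* attribute is optional */

        bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);

        if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
                return -EOPNOTSUPP;     /* band not supported by the driver */

        if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
                return -EINVAL;         /* any non-empty set must include 2.4 GHz */

        conf->bands = bands;
        return 0;
}

The same nl80211.c changes also size event messages as nlmsg_new(100 + payload_len, ...) instead of NLMSG_DEFAULT_SIZE, and add NL80211_ATTR_TIMEOUT_REASON and NL80211_ATTR_CQM_RSSI_LEVEL to the connect-result and CQM notifications respectively.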
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 7e3821d7fcc5..e488dca87423 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -14,12 +14,10 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
14 struct wireless_dev *wdev); 14 struct wireless_dev *wdev);
15struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, 15struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
16 struct wireless_dev *wdev, bool aborted); 16 struct wireless_dev *wdev, bool aborted);
17void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, 17void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
18 struct sk_buff *msg); 18 struct sk_buff *msg);
19void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, 19void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
20 struct net_device *netdev, u32 cmd); 20 struct net_device *netdev, u32 cmd);
21void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
22 struct net_device *netdev);
23void nl80211_common_reg_change_event(enum nl80211_commands cmd_id, 21void nl80211_common_reg_change_event(enum nl80211_commands cmd_id,
24 struct regulatory_request *request); 22 struct regulatory_request *request);
25 23
@@ -58,7 +56,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
58 struct net_device *netdev, const u8 *bssid, 56 struct net_device *netdev, const u8 *bssid,
59 const u8 *req_ie, size_t req_ie_len, 57 const u8 *req_ie, size_t req_ie_len,
60 const u8 *resp_ie, size_t resp_ie_len, 58 const u8 *resp_ie, size_t resp_ie_len,
61 int status, gfp_t gfp); 59 int status,
60 enum nl80211_timeout_reason timeout_reason,
61 gfp_t gfp);
62void nl80211_send_roamed(struct cfg80211_registered_device *rdev, 62void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
63 struct net_device *netdev, const u8 *bssid, 63 struct net_device *netdev, const u8 *bssid,
64 const u8 *req_ie, size_t req_ie_len, 64 const u8 *req_ie, size_t req_ie_len,
diff --git a/net/wireless/of.c b/net/wireless/of.c
new file mode 100644
index 000000000000..de221f0edca5
--- /dev/null
+++ b/net/wireless/of.c
@@ -0,0 +1,138 @@
1/*
2 * Copyright (C) 2017 Rafał Miłecki <rafal@milecki.pl>
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17#include <linux/of.h>
18#include <net/cfg80211.h>
19#include "core.h"
20
21static bool wiphy_freq_limits_valid_chan(struct wiphy *wiphy,
22 struct ieee80211_freq_range *freq_limits,
23 unsigned int n_freq_limits,
24 struct ieee80211_channel *chan)
25{
26 u32 bw = MHZ_TO_KHZ(20);
27 int i;
28
29 for (i = 0; i < n_freq_limits; i++) {
30 struct ieee80211_freq_range *limit = &freq_limits[i];
31
32 if (cfg80211_does_bw_fit_range(limit,
33 MHZ_TO_KHZ(chan->center_freq),
34 bw))
35 return true;
36 }
37
38 return false;
39}
40
41static void wiphy_freq_limits_apply(struct wiphy *wiphy,
42 struct ieee80211_freq_range *freq_limits,
43 unsigned int n_freq_limits)
44{
45 enum nl80211_band band;
46 int i;
47
48 if (WARN_ON(!n_freq_limits))
49 return;
50
51 for (band = 0; band < NUM_NL80211_BANDS; band++) {
52 struct ieee80211_supported_band *sband = wiphy->bands[band];
53
54 if (!sband)
55 continue;
56
57 for (i = 0; i < sband->n_channels; i++) {
58 struct ieee80211_channel *chan = &sband->channels[i];
59
60 if (chan->flags & IEEE80211_CHAN_DISABLED)
61 continue;
62
63 if (!wiphy_freq_limits_valid_chan(wiphy, freq_limits,
64 n_freq_limits,
65 chan)) {
66 pr_debug("Disabling freq %d MHz as it's out of OF limits\n",
67 chan->center_freq);
68 chan->flags |= IEEE80211_CHAN_DISABLED;
69 }
70 }
71 }
72}
73
74void wiphy_read_of_freq_limits(struct wiphy *wiphy)
75{
76 struct device *dev = wiphy_dev(wiphy);
77 struct device_node *np;
78 struct property *prop;
79 struct ieee80211_freq_range *freq_limits;
80 unsigned int n_freq_limits;
81 const __be32 *p;
82 int len, i;
83 int err = 0;
84
85 if (!dev)
86 return;
87 np = dev_of_node(dev);
88 if (!np)
89 return;
90
91 prop = of_find_property(np, "ieee80211-freq-limit", &len);
92 if (!prop)
93 return;
94
95 if (!len || len % sizeof(u32) || len / sizeof(u32) % 2) {
96 dev_err(dev, "ieee80211-freq-limit wrong format");
97 return;
98 }
99 n_freq_limits = len / sizeof(u32) / 2;
100
101 freq_limits = kcalloc(n_freq_limits, sizeof(*freq_limits), GFP_KERNEL);
102 if (!freq_limits) {
103 err = -ENOMEM;
104 goto out_kfree;
105 }
106
107 p = NULL;
108 for (i = 0; i < n_freq_limits; i++) {
109 struct ieee80211_freq_range *limit = &freq_limits[i];
110
111 p = of_prop_next_u32(prop, p, &limit->start_freq_khz);
112 if (!p) {
113 err = -EINVAL;
114 goto out_kfree;
115 }
116
117 p = of_prop_next_u32(prop, p, &limit->end_freq_khz);
118 if (!p) {
119 err = -EINVAL;
120 goto out_kfree;
121 }
122
123 if (!limit->start_freq_khz ||
124 !limit->end_freq_khz ||
125 limit->start_freq_khz >= limit->end_freq_khz) {
126 err = -EINVAL;
127 goto out_kfree;
128 }
129 }
130
131 wiphy_freq_limits_apply(wiphy, freq_limits, n_freq_limits);
132
133out_kfree:
134 kfree(freq_limits);
135 if (err)
136 dev_err(dev, "Failed to get limits: %d\n", err);
137}
138EXPORT_SYMBOL(wiphy_read_of_freq_limits);
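The new net/wireless/of.c reads an "ieee80211-freq-limit" property (pairs of start/end frequencies in kHz) from the device node behind the wiphy and disables every enabled channel whose 20 MHz width does not fit one of the ranges; judging by the parser above, a property such as ieee80211-freq-limit = <2402000 2482000>; would leave only the 2.4 GHz channels enabled. A hedged sketch of how a driver might hook this in; the call site and ordering are an assumption, only wiphy_read_of_freq_limits() itself comes from the diff:

#include <net/cfg80211.h>

/* Hypothetical driver probe fragment: apply the OF frequency limits after
 * wiphy->bands[] is populated and before the wiphy is registered. */
static int example_driver_register_wiphy(struct wiphy *wiphy)
{
        /* ... driver fills in wiphy->bands[] here ... */

        wiphy_read_of_freq_limits(wiphy);       /* no-op without a DT node/property */

        return wiphy_register(wiphy);
}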
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 5dbac3749738..753efcd51fa3 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -748,21 +748,6 @@ static bool is_valid_rd(const struct ieee80211_regdomain *rd)
748 return true; 748 return true;
749} 749}
750 750
751static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range,
752 u32 center_freq_khz, u32 bw_khz)
753{
754 u32 start_freq_khz, end_freq_khz;
755
756 start_freq_khz = center_freq_khz - (bw_khz/2);
757 end_freq_khz = center_freq_khz + (bw_khz/2);
758
759 if (start_freq_khz >= freq_range->start_freq_khz &&
760 end_freq_khz <= freq_range->end_freq_khz)
761 return true;
762
763 return false;
764}
765
766/** 751/**
767 * freq_in_rule_band - tells us if a frequency is in a frequency band 752 * freq_in_rule_band - tells us if a frequency is in a frequency band
768 * @freq_range: frequency rule we want to query 753 * @freq_range: frequency rule we want to query
@@ -1070,7 +1055,7 @@ freq_reg_info_regd(u32 center_freq,
1070 if (!band_rule_found) 1055 if (!band_rule_found)
1071 band_rule_found = freq_in_rule_band(fr, center_freq); 1056 band_rule_found = freq_in_rule_band(fr, center_freq);
1072 1057
1073 bw_fits = reg_does_bw_fit(fr, center_freq, bw); 1058 bw_fits = cfg80211_does_bw_fit_range(fr, center_freq, bw);
1074 1059
1075 if (band_rule_found && bw_fits) 1060 if (band_rule_found && bw_fits)
1076 return rr; 1061 return rr;
@@ -1138,11 +1123,13 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd
1138 max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); 1123 max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
1139 1124
1140 /* If we get a reg_rule we can assume that at least 5Mhz fit */ 1125 /* If we get a reg_rule we can assume that at least 5Mhz fit */
1141 if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), 1126 if (!cfg80211_does_bw_fit_range(freq_range,
1142 MHZ_TO_KHZ(10))) 1127 MHZ_TO_KHZ(chan->center_freq),
1128 MHZ_TO_KHZ(10)))
1143 bw_flags |= IEEE80211_CHAN_NO_10MHZ; 1129 bw_flags |= IEEE80211_CHAN_NO_10MHZ;
1144 if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), 1130 if (!cfg80211_does_bw_fit_range(freq_range,
1145 MHZ_TO_KHZ(20))) 1131 MHZ_TO_KHZ(chan->center_freq),
1132 MHZ_TO_KHZ(20)))
1146 bw_flags |= IEEE80211_CHAN_NO_20MHZ; 1133 bw_flags |= IEEE80211_CHAN_NO_20MHZ;
1147 1134
1148 if (max_bandwidth_khz < MHZ_TO_KHZ(10)) 1135 if (max_bandwidth_khz < MHZ_TO_KHZ(10))
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 35ad69fd0838..21be56b3128e 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -227,7 +227,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
227 ASSERT_RTNL(); 227 ASSERT_RTNL();
228 228
229 if (rdev->scan_msg) { 229 if (rdev->scan_msg) {
230 nl80211_send_scan_result(rdev, rdev->scan_msg); 230 nl80211_send_scan_msg(rdev, rdev->scan_msg);
231 rdev->scan_msg = NULL; 231 rdev->scan_msg = NULL;
232 return; 232 return;
233 } 233 }
@@ -273,7 +273,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
273 if (!send_message) 273 if (!send_message)
274 rdev->scan_msg = msg; 274 rdev->scan_msg = msg;
275 else 275 else
276 nl80211_send_scan_result(rdev, msg); 276 nl80211_send_scan_msg(rdev, msg);
277} 277}
278 278
279void __cfg80211_scan_done(struct work_struct *wk) 279void __cfg80211_scan_done(struct work_struct *wk)
@@ -321,7 +321,8 @@ void __cfg80211_sched_scan_results(struct work_struct *wk)
321 spin_unlock_bh(&rdev->bss_lock); 321 spin_unlock_bh(&rdev->bss_lock);
322 request->scan_start = jiffies; 322 request->scan_start = jiffies;
323 } 323 }
324 nl80211_send_sched_scan_results(rdev, request->dev); 324 nl80211_send_sched_scan(rdev, request->dev,
325 NL80211_CMD_SCHED_SCAN_RESULTS);
325 } 326 }
326 327
327 rtnl_unlock(); 328 rtnl_unlock();
@@ -1147,7 +1148,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
1147 else 1148 else
1148 rcu_assign_pointer(tmp.pub.beacon_ies, ies); 1149 rcu_assign_pointer(tmp.pub.beacon_ies, ies);
1149 rcu_assign_pointer(tmp.pub.ies, ies); 1150 rcu_assign_pointer(tmp.pub.ies, ies);
1150 1151
1151 memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN); 1152 memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN);
1152 tmp.pub.channel = channel; 1153 tmp.pub.channel = channel;
1153 tmp.pub.scan_width = data->scan_width; 1154 tmp.pub.scan_width = data->scan_width;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 5e0d19380302..b347e63d7aaa 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -34,10 +34,11 @@ struct cfg80211_conn {
34 CFG80211_CONN_SCAN_AGAIN, 34 CFG80211_CONN_SCAN_AGAIN,
35 CFG80211_CONN_AUTHENTICATE_NEXT, 35 CFG80211_CONN_AUTHENTICATE_NEXT,
36 CFG80211_CONN_AUTHENTICATING, 36 CFG80211_CONN_AUTHENTICATING,
37 CFG80211_CONN_AUTH_FAILED, 37 CFG80211_CONN_AUTH_FAILED_TIMEOUT,
38 CFG80211_CONN_ASSOCIATE_NEXT, 38 CFG80211_CONN_ASSOCIATE_NEXT,
39 CFG80211_CONN_ASSOCIATING, 39 CFG80211_CONN_ASSOCIATING,
40 CFG80211_CONN_ASSOC_FAILED, 40 CFG80211_CONN_ASSOC_FAILED,
41 CFG80211_CONN_ASSOC_FAILED_TIMEOUT,
41 CFG80211_CONN_DEAUTH, 42 CFG80211_CONN_DEAUTH,
42 CFG80211_CONN_ABANDON, 43 CFG80211_CONN_ABANDON,
43 CFG80211_CONN_CONNECTED, 44 CFG80211_CONN_CONNECTED,
@@ -140,7 +141,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
140 return err; 141 return err;
141} 142}
142 143
143static int cfg80211_conn_do_work(struct wireless_dev *wdev) 144static int cfg80211_conn_do_work(struct wireless_dev *wdev,
145 enum nl80211_timeout_reason *treason)
144{ 146{
145 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); 147 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
146 struct cfg80211_connect_params *params; 148 struct cfg80211_connect_params *params;
@@ -171,7 +173,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
171 NULL, 0, 173 NULL, 0,
172 params->key, params->key_len, 174 params->key, params->key_len,
173 params->key_idx, NULL, 0); 175 params->key_idx, NULL, 0);
174 case CFG80211_CONN_AUTH_FAILED: 176 case CFG80211_CONN_AUTH_FAILED_TIMEOUT:
177 *treason = NL80211_TIMEOUT_AUTH;
175 return -ENOTCONN; 178 return -ENOTCONN;
176 case CFG80211_CONN_ASSOCIATE_NEXT: 179 case CFG80211_CONN_ASSOCIATE_NEXT:
177 if (WARN_ON(!rdev->ops->assoc)) 180 if (WARN_ON(!rdev->ops->assoc))
@@ -198,6 +201,9 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
198 WLAN_REASON_DEAUTH_LEAVING, 201 WLAN_REASON_DEAUTH_LEAVING,
199 false); 202 false);
200 return err; 203 return err;
204 case CFG80211_CONN_ASSOC_FAILED_TIMEOUT:
205 *treason = NL80211_TIMEOUT_ASSOC;
206 /* fall through */
201 case CFG80211_CONN_ASSOC_FAILED: 207 case CFG80211_CONN_ASSOC_FAILED:
202 cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, 208 cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid,
203 NULL, 0, 209 NULL, 0,
@@ -223,6 +229,7 @@ void cfg80211_conn_work(struct work_struct *work)
223 container_of(work, struct cfg80211_registered_device, conn_work); 229 container_of(work, struct cfg80211_registered_device, conn_work);
224 struct wireless_dev *wdev; 230 struct wireless_dev *wdev;
225 u8 bssid_buf[ETH_ALEN], *bssid = NULL; 231 u8 bssid_buf[ETH_ALEN], *bssid = NULL;
232 enum nl80211_timeout_reason treason;
226 233
227 rtnl_lock(); 234 rtnl_lock();
228 235
@@ -244,10 +251,12 @@ void cfg80211_conn_work(struct work_struct *work)
244 memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN); 251 memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN);
245 bssid = bssid_buf; 252 bssid = bssid_buf;
246 } 253 }
247 if (cfg80211_conn_do_work(wdev)) { 254 treason = NL80211_TIMEOUT_UNSPECIFIED;
255 if (cfg80211_conn_do_work(wdev, &treason)) {
248 __cfg80211_connect_result( 256 __cfg80211_connect_result(
249 wdev->netdev, bssid, 257 wdev->netdev, bssid,
250 NULL, 0, NULL, 0, -1, false, NULL); 258 NULL, 0, NULL, 0, -1, false, NULL,
259 treason);
251 } 260 }
252 wdev_unlock(wdev); 261 wdev_unlock(wdev);
253 } 262 }
@@ -352,7 +361,8 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len)
352 } else if (status_code != WLAN_STATUS_SUCCESS) { 361 } else if (status_code != WLAN_STATUS_SUCCESS) {
353 __cfg80211_connect_result(wdev->netdev, mgmt->bssid, 362 __cfg80211_connect_result(wdev->netdev, mgmt->bssid,
354 NULL, 0, NULL, 0, 363 NULL, 0, NULL, 0,
355 status_code, false, NULL); 364 status_code, false, NULL,
365 NL80211_TIMEOUT_UNSPECIFIED);
356 } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { 366 } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) {
357 wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; 367 wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT;
358 schedule_work(&rdev->conn_work); 368 schedule_work(&rdev->conn_work);
@@ -400,7 +410,7 @@ void cfg80211_sme_auth_timeout(struct wireless_dev *wdev)
400 if (!wdev->conn) 410 if (!wdev->conn)
401 return; 411 return;
402 412
403 wdev->conn->state = CFG80211_CONN_AUTH_FAILED; 413 wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT;
404 schedule_work(&rdev->conn_work); 414 schedule_work(&rdev->conn_work);
405} 415}
406 416
@@ -422,7 +432,7 @@ void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev)
422 if (!wdev->conn) 432 if (!wdev->conn)
423 return; 433 return;
424 434
425 wdev->conn->state = CFG80211_CONN_ASSOC_FAILED; 435 wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT;
426 schedule_work(&rdev->conn_work); 436 schedule_work(&rdev->conn_work);
427} 437}
428 438
@@ -564,7 +574,9 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev,
564 574
565 /* we're good if we have a matching bss struct */ 575 /* we're good if we have a matching bss struct */
566 if (bss) { 576 if (bss) {
567 err = cfg80211_conn_do_work(wdev); 577 enum nl80211_timeout_reason treason;
578
579 err = cfg80211_conn_do_work(wdev, &treason);
568 cfg80211_put_bss(wdev->wiphy, bss); 580 cfg80211_put_bss(wdev->wiphy, bss);
569 } else { 581 } else {
570 /* otherwise we'll need to scan for the AP first */ 582 /* otherwise we'll need to scan for the AP first */
@@ -661,7 +673,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
661 const u8 *req_ie, size_t req_ie_len, 673 const u8 *req_ie, size_t req_ie_len,
662 const u8 *resp_ie, size_t resp_ie_len, 674 const u8 *resp_ie, size_t resp_ie_len,
663 int status, bool wextev, 675 int status, bool wextev,
664 struct cfg80211_bss *bss) 676 struct cfg80211_bss *bss,
677 enum nl80211_timeout_reason timeout_reason)
665{ 678{
666 struct wireless_dev *wdev = dev->ieee80211_ptr; 679 struct wireless_dev *wdev = dev->ieee80211_ptr;
667 const u8 *country_ie; 680 const u8 *country_ie;
@@ -680,7 +693,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
680 nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, 693 nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev,
681 bssid, req_ie, req_ie_len, 694 bssid, req_ie, req_ie_len,
682 resp_ie, resp_ie_len, 695 resp_ie, resp_ie_len,
683 status, GFP_KERNEL); 696 status, timeout_reason, GFP_KERNEL);
684 697
685#ifdef CONFIG_CFG80211_WEXT 698#ifdef CONFIG_CFG80211_WEXT
686 if (wextev) { 699 if (wextev) {
@@ -727,6 +740,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
727 kzfree(wdev->connect_keys); 740 kzfree(wdev->connect_keys);
728 wdev->connect_keys = NULL; 741 wdev->connect_keys = NULL;
729 wdev->ssid_len = 0; 742 wdev->ssid_len = 0;
743 wdev->conn_owner_nlportid = 0;
730 if (bss) { 744 if (bss) {
731 cfg80211_unhold_bss(bss_from_pub(bss)); 745 cfg80211_unhold_bss(bss_from_pub(bss));
732 cfg80211_put_bss(wdev->wiphy, bss); 746 cfg80211_put_bss(wdev->wiphy, bss);
@@ -770,7 +784,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
770void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, 784void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
771 struct cfg80211_bss *bss, const u8 *req_ie, 785 struct cfg80211_bss *bss, const u8 *req_ie,
772 size_t req_ie_len, const u8 *resp_ie, 786 size_t req_ie_len, const u8 *resp_ie,
773 size_t resp_ie_len, int status, gfp_t gfp) 787 size_t resp_ie_len, int status, gfp_t gfp,
788 enum nl80211_timeout_reason timeout_reason)
774{ 789{
775 struct wireless_dev *wdev = dev->ieee80211_ptr; 790 struct wireless_dev *wdev = dev->ieee80211_ptr;
776 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); 791 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -810,6 +825,7 @@ void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
810 cfg80211_hold_bss(bss_from_pub(bss)); 825 cfg80211_hold_bss(bss_from_pub(bss));
811 ev->cr.bss = bss; 826 ev->cr.bss = bss;
812 ev->cr.status = status; 827 ev->cr.status = status;
828 ev->cr.timeout_reason = timeout_reason;
813 829
814 spin_lock_irqsave(&wdev->event_lock, flags); 830 spin_lock_irqsave(&wdev->event_lock, flags);
815 list_add_tail(&ev->list, &wdev->event_list); 831 list_add_tail(&ev->list, &wdev->event_list);
@@ -955,6 +971,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
955 971
956 wdev->current_bss = NULL; 972 wdev->current_bss = NULL;
957 wdev->ssid_len = 0; 973 wdev->ssid_len = 0;
974 wdev->conn_owner_nlportid = 0;
958 975
959 nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); 976 nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
960 977
@@ -1098,6 +1115,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
1098 kzfree(wdev->connect_keys); 1115 kzfree(wdev->connect_keys);
1099 wdev->connect_keys = NULL; 1116 wdev->connect_keys = NULL;
1100 1117
1118 wdev->conn_owner_nlportid = 0;
1119
1101 if (wdev->conn) 1120 if (wdev->conn)
1102 err = cfg80211_sme_disconnect(wdev, reason); 1121 err = cfg80211_sme_disconnect(wdev, reason);
1103 else if (!rdev->ops->disconnect) 1122 else if (!rdev->ops->disconnect)
@@ -1107,3 +1126,32 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
1107 1126
1108 return err; 1127 return err;
1109} 1128}
1129
1130/*
1131 * Used to clean up after the connection / connection attempt owner socket
1132 * disconnects
1133 */
1134void cfg80211_autodisconnect_wk(struct work_struct *work)
1135{
1136 struct wireless_dev *wdev =
1137 container_of(work, struct wireless_dev, disconnect_wk);
1138 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
1139
1140 wdev_lock(wdev);
1141
1142 if (wdev->conn_owner_nlportid) {
1143 /*
1144 * Use disconnect_bssid if still connecting and ops->disconnect
1145 * not implemented. Otherwise we can use cfg80211_disconnect.
1146 */
1147 if (rdev->ops->disconnect || wdev->current_bss)
1148 cfg80211_disconnect(rdev, wdev->netdev,
1149 WLAN_REASON_DEAUTH_LEAVING, true);
1150 else
1151 cfg80211_mlme_deauth(rdev, wdev->netdev,
1152 wdev->disconnect_bssid, NULL, 0,
1153 WLAN_REASON_DEAUTH_LEAVING, false);
1154 }
1155
1156 wdev_unlock(wdev);
1157}
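The sme.c changes split the timeout states (CFG80211_CONN_AUTH_FAILED_TIMEOUT, CFG80211_CONN_ASSOC_FAILED_TIMEOUT) so the connect result can say whether authentication or association timed out, thread that reason through cfg80211_connect_bss() and nl80211_send_connect_result(), and clear conn_owner_nlportid (with cfg80211_autodisconnect_wk() tearing the connection down when the owning socket goes away). A hedged example of a fullmac driver reporting an association timeout with the extended cfg80211_connect_bss() signature; the driver context is hypothetical, the argument order matches the definition in the sme.c hunk above (the matching header change is outside this net/ diff):

#include <net/cfg80211.h>

/* Hypothetical fullmac driver path: firmware reported that association timed
 * out, so report a failed connection (status < 0) with the new reason. */
static void example_report_assoc_timeout(struct net_device *dev, const u8 *bssid)
{
        cfg80211_connect_bss(dev, bssid, NULL,          /* no BSS entry held */
                             NULL, 0, NULL, 0,          /* no req/resp IEs */
                             -1,                        /* status < 0: timed out */
                             GFP_KERNEL,
                             NL80211_TIMEOUT_ASSOC);
}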
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 14b3f007826d..570a2b67ca10 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -39,9 +39,11 @@ SHOW_FMT(address_mask, "%pM", wiphy.addr_mask);
39 39
40static ssize_t name_show(struct device *dev, 40static ssize_t name_show(struct device *dev,
41 struct device_attribute *attr, 41 struct device_attribute *attr,
42 char *buf) { 42 char *buf)
43{
43 struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; 44 struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy;
44 return sprintf(buf, "%s\n", dev_name(&wiphy->dev)); 45
46 return sprintf(buf, "%s\n", wiphy_name(wiphy));
45} 47}
46static DEVICE_ATTR_RO(name); 48static DEVICE_ATTR_RO(name);
47 49
@@ -130,12 +132,10 @@ static int wiphy_resume(struct device *dev)
130 /* Age scan results with time spent in suspend */ 132 /* Age scan results with time spent in suspend */
131 cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at); 133 cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at);
132 134
133 if (rdev->ops->resume) { 135 rtnl_lock();
134 rtnl_lock(); 136 if (rdev->wiphy.registered && rdev->ops->resume)
135 if (rdev->wiphy.registered) 137 ret = rdev_resume(rdev);
136 ret = rdev_resume(rdev); 138 rtnl_unlock();
137 rtnl_unlock();
138 }
139 139
140 return ret; 140 return ret;
141} 141}
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index ea1b47e04fa4..776e80cef9b4 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1915,18 +1915,18 @@ TRACE_EVENT(rdev_start_nan,
1915 WIPHY_ENTRY 1915 WIPHY_ENTRY
1916 WDEV_ENTRY 1916 WDEV_ENTRY
1917 __field(u8, master_pref) 1917 __field(u8, master_pref)
1918 __field(u8, dual); 1918 __field(u8, bands);
1919 ), 1919 ),
1920 TP_fast_assign( 1920 TP_fast_assign(
1921 WIPHY_ASSIGN; 1921 WIPHY_ASSIGN;
1922 WDEV_ASSIGN; 1922 WDEV_ASSIGN;
1923 __entry->master_pref = conf->master_pref; 1923 __entry->master_pref = conf->master_pref;
1924 __entry->dual = conf->dual; 1924 __entry->bands = conf->bands;
1925 ), 1925 ),
1926 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT 1926 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
1927 ", master preference: %u, dual: %d", 1927 ", master preference: %u, bands: 0x%0x",
1928 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, 1928 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
1929 __entry->dual) 1929 __entry->bands)
1930); 1930);
1931 1931
1932TRACE_EVENT(rdev_nan_change_conf, 1932TRACE_EVENT(rdev_nan_change_conf,
@@ -1937,20 +1937,20 @@ TRACE_EVENT(rdev_nan_change_conf,
1937 WIPHY_ENTRY 1937 WIPHY_ENTRY
1938 WDEV_ENTRY 1938 WDEV_ENTRY
1939 __field(u8, master_pref) 1939 __field(u8, master_pref)
1940 __field(u8, dual); 1940 __field(u8, bands);
1941 __field(u32, changes); 1941 __field(u32, changes);
1942 ), 1942 ),
1943 TP_fast_assign( 1943 TP_fast_assign(
1944 WIPHY_ASSIGN; 1944 WIPHY_ASSIGN;
1945 WDEV_ASSIGN; 1945 WDEV_ASSIGN;
1946 __entry->master_pref = conf->master_pref; 1946 __entry->master_pref = conf->master_pref;
1947 __entry->dual = conf->dual; 1947 __entry->bands = conf->bands;
1948 __entry->changes = changes; 1948 __entry->changes = changes;
1949 ), 1949 ),
1950 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT 1950 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
1951 ", master preference: %u, dual: %d, changes: %x", 1951 ", master preference: %u, bands: 0x%0x, changes: %x",
1952 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, 1952 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
1953 __entry->dual, __entry->changes) 1953 __entry->bands, __entry->changes)
1954); 1954);
1955 1955
1956DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan, 1956DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
@@ -2490,18 +2490,21 @@ TRACE_EVENT(cfg80211_mgmt_tx_status,
2490 2490
2491TRACE_EVENT(cfg80211_cqm_rssi_notify, 2491TRACE_EVENT(cfg80211_cqm_rssi_notify,
2492 TP_PROTO(struct net_device *netdev, 2492 TP_PROTO(struct net_device *netdev,
2493 enum nl80211_cqm_rssi_threshold_event rssi_event), 2493 enum nl80211_cqm_rssi_threshold_event rssi_event,
2494 TP_ARGS(netdev, rssi_event), 2494 s32 rssi_level),
2495 TP_ARGS(netdev, rssi_event, rssi_level),
2495 TP_STRUCT__entry( 2496 TP_STRUCT__entry(
2496 NETDEV_ENTRY 2497 NETDEV_ENTRY
2497 __field(enum nl80211_cqm_rssi_threshold_event, rssi_event) 2498 __field(enum nl80211_cqm_rssi_threshold_event, rssi_event)
2499 __field(s32, rssi_level)
2498 ), 2500 ),
2499 TP_fast_assign( 2501 TP_fast_assign(
2500 NETDEV_ASSIGN; 2502 NETDEV_ASSIGN;
2501 __entry->rssi_event = rssi_event; 2503 __entry->rssi_event = rssi_event;
2504 __entry->rssi_level = rssi_level;
2502 ), 2505 ),
2503 TP_printk(NETDEV_PR_FMT ", rssi event: %d", 2506 TP_printk(NETDEV_PR_FMT ", rssi event: %d, level: %d",
2504 NETDEV_PR_ARG, __entry->rssi_event) 2507 NETDEV_PR_ARG, __entry->rssi_event, __entry->rssi_level)
2505); 2508);
2506 2509
2507TRACE_EVENT(cfg80211_reg_can_beacon, 2510TRACE_EVENT(cfg80211_reg_can_beacon,
diff --git a/net/wireless/util.c b/net/wireless/util.c
index e9d040d29846..68e5f2ecee1a 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -114,8 +114,7 @@ int ieee80211_frequency_to_channel(int freq)
114} 114}
115EXPORT_SYMBOL(ieee80211_frequency_to_channel); 115EXPORT_SYMBOL(ieee80211_frequency_to_channel);
116 116
117struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, 117struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq)
118 int freq)
119{ 118{
120 enum nl80211_band band; 119 enum nl80211_band band;
121 struct ieee80211_supported_band *sband; 120 struct ieee80211_supported_band *sband;
@@ -135,14 +134,13 @@ struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
135 134
136 return NULL; 135 return NULL;
137} 136}
138EXPORT_SYMBOL(__ieee80211_get_channel); 137EXPORT_SYMBOL(ieee80211_get_channel);
139 138
140static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, 139static void set_mandatory_flags_band(struct ieee80211_supported_band *sband)
141 enum nl80211_band band)
142{ 140{
143 int i, want; 141 int i, want;
144 142
145 switch (band) { 143 switch (sband->band) {
146 case NL80211_BAND_5GHZ: 144 case NL80211_BAND_5GHZ:
147 want = 3; 145 want = 3;
148 for (i = 0; i < sband->n_bitrates; i++) { 146 for (i = 0; i < sband->n_bitrates; i++) {
@@ -192,6 +190,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
192 WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); 190 WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e);
193 break; 191 break;
194 case NUM_NL80211_BANDS: 192 case NUM_NL80211_BANDS:
193 default:
195 WARN_ON(1); 194 WARN_ON(1);
196 break; 195 break;
197 } 196 }
@@ -203,7 +202,7 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
203 202
204 for (band = 0; band < NUM_NL80211_BANDS; band++) 203 for (band = 0; band < NUM_NL80211_BANDS; band++)
205 if (wiphy->bands[band]) 204 if (wiphy->bands[band])
206 set_mandatory_flags_band(wiphy->bands[band], band); 205 set_mandatory_flags_band(wiphy->bands[band]);
207} 206}
208 207
209bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher) 208bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher)
@@ -619,8 +618,6 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
619 618
620 if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC)) 619 if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC))
621 return -ENOMEM; 620 return -ENOMEM;
622
623 skb->truesize += head_need;
624 } 621 }
625 622
626 if (encaps_data) { 623 if (encaps_data) {
@@ -952,7 +949,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
952 ev->cr.resp_ie, ev->cr.resp_ie_len, 949 ev->cr.resp_ie, ev->cr.resp_ie_len,
953 ev->cr.status, 950 ev->cr.status,
954 ev->cr.status == WLAN_STATUS_SUCCESS, 951 ev->cr.status == WLAN_STATUS_SUCCESS,
955 ev->cr.bss); 952 ev->cr.bss, ev->cr.timeout_reason);
956 break; 953 break;
957 case EVENT_ROAMED: 954 case EVENT_ROAMED:
958 __cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie, 955 __cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie,
@@ -1848,6 +1845,21 @@ void cfg80211_free_nan_func(struct cfg80211_nan_func *f)
1848} 1845}
1849EXPORT_SYMBOL(cfg80211_free_nan_func); 1846EXPORT_SYMBOL(cfg80211_free_nan_func);
1850 1847
1848bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
1849 u32 center_freq_khz, u32 bw_khz)
1850{
1851 u32 start_freq_khz, end_freq_khz;
1852
1853 start_freq_khz = center_freq_khz - (bw_khz / 2);
1854 end_freq_khz = center_freq_khz + (bw_khz / 2);
1855
1856 if (start_freq_khz >= freq_range->start_freq_khz &&
1857 end_freq_khz <= freq_range->end_freq_khz)
1858 return true;
1859
1860 return false;
1861}
1862
1851/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ 1863/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
1852/* Ethernet-II snap header (RFC1042 for most EtherTypes) */ 1864/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
1853const unsigned char rfc1042_header[] __aligned(2) = 1865const unsigned char rfc1042_header[] __aligned(2) =
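reg_does_bw_fit() moves out of reg.c and becomes cfg80211_does_bw_fit_range() in util.c, so the regulatory code and the new OF frequency-limit code can share it (its declaration lands in cfg80211.h via a header change outside this net/ diff). The check is plain kHz arithmetic; a worked example against a rule covering 2400.0 to 2483.5 MHz:

#include <net/cfg80211.h>

/* Mirrors the helper added above: does a 20 MHz channel at 2472 MHz fit a
 * 2400.0-2483.5 MHz range? 2472000 kHz +/- 10000 kHz gives 2462000..2482000,
 * which lies inside the range, so this returns true. */
static bool example_bw_fit_check(void)
{
        const struct ieee80211_freq_range range = {
                .start_freq_khz = 2400000,
                .end_freq_khz = 2483500,
        };

        return cfg80211_does_bw_fit_range(&range, MHZ_TO_KHZ(2472),
                                          MHZ_TO_KHZ(20));
}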
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 6250b1cfcde5..1a4db6790e20 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -1119,3 +1119,70 @@ int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
1119 return ret; 1119 return ret;
1120} 1120}
1121#endif 1121#endif
1122
1123char *iwe_stream_add_event(struct iw_request_info *info, char *stream,
1124 char *ends, struct iw_event *iwe, int event_len)
1125{
1126 int lcp_len = iwe_stream_lcp_len(info);
1127
1128 event_len = iwe_stream_event_len_adjust(info, event_len);
1129
1130 /* Check if it's possible */
1131 if (likely((stream + event_len) < ends)) {
1132 iwe->len = event_len;
1133 /* Beware of alignment issues on 64 bits */
1134 memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
1135 memcpy(stream + lcp_len, &iwe->u,
1136 event_len - lcp_len);
1137 stream += event_len;
1138 }
1139
1140 return stream;
1141}
1142EXPORT_SYMBOL(iwe_stream_add_event);
1143
1144char *iwe_stream_add_point(struct iw_request_info *info, char *stream,
1145 char *ends, struct iw_event *iwe, char *extra)
1146{
1147 int event_len = iwe_stream_point_len(info) + iwe->u.data.length;
1148 int point_len = iwe_stream_point_len(info);
1149 int lcp_len = iwe_stream_lcp_len(info);
1150
1151 /* Check if it's possible */
1152 if (likely((stream + event_len) < ends)) {
1153 iwe->len = event_len;
1154 memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
1155 memcpy(stream + lcp_len,
1156 ((char *) &iwe->u) + IW_EV_POINT_OFF,
1157 IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN);
1158 if (iwe->u.data.length && extra)
1159 memcpy(stream + point_len, extra, iwe->u.data.length);
1160 stream += event_len;
1161 }
1162
1163 return stream;
1164}
1165EXPORT_SYMBOL(iwe_stream_add_point);
1166
1167char *iwe_stream_add_value(struct iw_request_info *info, char *event,
1168 char *value, char *ends, struct iw_event *iwe,
1169 int event_len)
1170{
1171 int lcp_len = iwe_stream_lcp_len(info);
1172
1173 /* Don't duplicate LCP */
1174 event_len -= IW_EV_LCP_LEN;
1175
1176 /* Check if it's possible */
1177 if (likely((value + event_len) < ends)) {
1178 /* Add new value */
1179 memcpy(value, &iwe->u, event_len);
1180 value += event_len;
1181 /* Patch LCP */
1182 iwe->len = value - event;
1183 memcpy(event, (char *) iwe, lcp_len);
1184 }
1185
1186 return value;
1187}
1188EXPORT_SYMBOL(iwe_stream_add_value);
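iwe_stream_add_event(), iwe_stream_add_point() and iwe_stream_add_value() are added to wext-core.c as exported, out-of-line functions. A hedged usage sketch of the event variant, in the style of a wireless-extensions scan-result formatter; the surrounding driver context is an assumption, the signature comes from the code above:

#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/string.h>
#include <linux/wireless.h>
#include <net/iw_handler.h>

/* Hypothetical fragment of a wext scan formatter: append one SIOCGIWAP
 * event (the BSSID) to the output stream and return the new stream position. */
static char *example_add_bssid(struct iw_request_info *info, char *stream,
                               char *ends, const u8 *bssid)
{
        struct iw_event iwe = {};

        iwe.cmd = SIOCGIWAP;
        iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
        memcpy(iwe.u.ap_addr.sa_data, bssid, ETH_ALEN);

        return iwe_stream_add_event(info, stream, ends, &iwe, IW_EV_ADDR_LEN);
}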
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index 995163830a61..c434f193f39a 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -105,30 +105,7 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
105 goto out; 105 goto out;
106 } 106 }
107 107
108
109 wdev->wext.connect.channel = chan; 108 wdev->wext.connect.channel = chan;
110
111 /*
112 * SSID is not set, we just want to switch monitor channel,
113 * this is really just backward compatibility, if the SSID
114 * is set then we use the channel to select the BSS to use
115 * to connect to instead. If we were connected on another
116 * channel we disconnected above and reconnect below.
117 */
118 if (chan && !wdev->wext.connect.ssid_len) {
119 struct cfg80211_chan_def chandef = {
120 .width = NL80211_CHAN_WIDTH_20_NOHT,
121 .center_freq1 = freq,
122 };
123
124 chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq);
125 if (chandef.chan)
126 err = cfg80211_set_monitor_channel(rdev, &chandef);
127 else
128 err = -EINVAL;
129 goto out;
130 }
131
132 err = cfg80211_mgd_wext_connect(rdev, wdev); 109 err = cfg80211_mgd_wext_connect(rdev, wdev);
133 out: 110 out:
134 wdev_unlock(wdev); 111 wdev_unlock(wdev);
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 079c883aa96e..8b911c29860e 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -41,7 +41,7 @@
41#include <linux/capability.h> 41#include <linux/capability.h>
42#include <linux/errno.h> 42#include <linux/errno.h>
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44#include <linux/sched.h> 44#include <linux/sched/signal.h>
45#include <linux/timer.h> 45#include <linux/timer.h>
46#include <linux/string.h> 46#include <linux/string.h>
47#include <linux/net.h> 47#include <linux/net.h>
@@ -852,7 +852,8 @@ static int x25_wait_for_data(struct sock *sk, long timeout)
852 return rc; 852 return rc;
853} 853}
854 854
855static int x25_accept(struct socket *sock, struct socket *newsock, int flags) 855static int x25_accept(struct socket *sock, struct socket *newsock, int flags,
856 bool kern)
856{ 857{
857 struct sock *sk = sock->sk; 858 struct sock *sk = sock->sk;
858 struct sock *newsk; 859 struct sock *newsk;
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index bda1a13628a8..286ed25c1a69 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -4,6 +4,11 @@
4config XFRM 4config XFRM
5 bool 5 bool
6 depends on NET 6 depends on NET
7 select GRO_CELLS
8
9config XFRM_OFFLOAD
10 bool
11 depends on XFRM
7 12
8config XFRM_ALGO 13config XFRM_ALGO
9 tristate 14 tristate
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 6e3f0254d8a1..46bdb4fbed0b 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -19,16 +19,18 @@
19static struct kmem_cache *secpath_cachep __read_mostly; 19static struct kmem_cache *secpath_cachep __read_mostly;
20 20
21static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); 21static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
22static struct xfrm_input_afinfo __rcu *xfrm_input_afinfo[NPROTO]; 22static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1];
23 23
24int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo) 24static struct gro_cells gro_cells;
25static struct net_device xfrm_napi_dev;
26
27int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo)
25{ 28{
26 int err = 0; 29 int err = 0;
27 30
28 if (unlikely(afinfo == NULL)) 31 if (WARN_ON(afinfo->family >= ARRAY_SIZE(xfrm_input_afinfo)))
29 return -EINVAL;
30 if (unlikely(afinfo->family >= NPROTO))
31 return -EAFNOSUPPORT; 32 return -EAFNOSUPPORT;
33
32 spin_lock_bh(&xfrm_input_afinfo_lock); 34 spin_lock_bh(&xfrm_input_afinfo_lock);
33 if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL)) 35 if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL))
34 err = -EEXIST; 36 err = -EEXIST;
@@ -39,14 +41,10 @@ int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
39} 41}
40EXPORT_SYMBOL(xfrm_input_register_afinfo); 42EXPORT_SYMBOL(xfrm_input_register_afinfo);
41 43
42int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo) 44int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo)
43{ 45{
44 int err = 0; 46 int err = 0;
45 47
46 if (unlikely(afinfo == NULL))
47 return -EINVAL;
48 if (unlikely(afinfo->family >= NPROTO))
49 return -EAFNOSUPPORT;
50 spin_lock_bh(&xfrm_input_afinfo_lock); 48 spin_lock_bh(&xfrm_input_afinfo_lock);
51 if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) { 49 if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) {
52 if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo)) 50 if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo))
@@ -60,12 +58,13 @@ int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo)
60} 58}
61EXPORT_SYMBOL(xfrm_input_unregister_afinfo); 59EXPORT_SYMBOL(xfrm_input_unregister_afinfo);
62 60
63static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family) 61static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
64{ 62{
65 struct xfrm_input_afinfo *afinfo; 63 const struct xfrm_input_afinfo *afinfo;
66 64
67 if (unlikely(family >= NPROTO)) 65 if (WARN_ON_ONCE(family >= ARRAY_SIZE(xfrm_input_afinfo)))
68 return NULL; 66 return NULL;
67
69 rcu_read_lock(); 68 rcu_read_lock();
70 afinfo = rcu_dereference(xfrm_input_afinfo[family]); 69 afinfo = rcu_dereference(xfrm_input_afinfo[family]);
71 if (unlikely(!afinfo)) 70 if (unlikely(!afinfo))
@@ -73,22 +72,17 @@ static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
73 return afinfo; 72 return afinfo;
74} 73}
75 74
76static void xfrm_input_put_afinfo(struct xfrm_input_afinfo *afinfo)
77{
78 rcu_read_unlock();
79}
80
81static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, 75static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol,
82 int err) 76 int err)
83{ 77{
84 int ret; 78 int ret;
85 struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family); 79 const struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family);
86 80
87 if (!afinfo) 81 if (!afinfo)
88 return -EAFNOSUPPORT; 82 return -EAFNOSUPPORT;
89 83
90 ret = afinfo->callback(skb, protocol, err); 84 ret = afinfo->callback(skb, protocol, err);
91 xfrm_input_put_afinfo(afinfo); 85 rcu_read_unlock();
92 86
93 return ret; 87 return ret;
94} 88}
@@ -111,6 +105,8 @@ struct sec_path *secpath_dup(struct sec_path *src)
111 return NULL; 105 return NULL;
112 106
113 sp->len = 0; 107 sp->len = 0;
108 sp->olen = 0;
109
114 if (src) { 110 if (src) {
115 int i; 111 int i;
116 112
@@ -123,6 +119,24 @@ struct sec_path *secpath_dup(struct sec_path *src)
123} 119}
124EXPORT_SYMBOL(secpath_dup); 120EXPORT_SYMBOL(secpath_dup);
125 121
122int secpath_set(struct sk_buff *skb)
123{
124 struct sec_path *sp;
125
126 /* Allocate new secpath or COW existing one. */
127 if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
128 sp = secpath_dup(skb->sp);
129 if (!sp)
130 return -ENOMEM;
131
132 if (skb->sp)
133 secpath_put(skb->sp);
134 skb->sp = sp;
135 }
136 return 0;
137}
138EXPORT_SYMBOL(secpath_set);
139
126/* Fetch spi and seq from ipsec header */ 140/* Fetch spi and seq from ipsec header */
127 141
128int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) 142int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
@@ -158,6 +172,7 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
158 *seq = *(__be32 *)(skb_transport_header(skb) + offset_seq); 172 *seq = *(__be32 *)(skb_transport_header(skb) + offset_seq);
159 return 0; 173 return 0;
160} 174}
175EXPORT_SYMBOL(xfrm_parse_spi);
161 176
162int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) 177int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
163{ 178{
@@ -192,14 +207,23 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
192 unsigned int family; 207 unsigned int family;
193 int decaps = 0; 208 int decaps = 0;
194 int async = 0; 209 int async = 0;
210 struct xfrm_offload *xo;
211 bool xfrm_gro = false;
195 212
196 /* A negative encap_type indicates async resumption. */
197 if (encap_type < 0) { 213 if (encap_type < 0) {
198 async = 1;
199 x = xfrm_input_state(skb); 214 x = xfrm_input_state(skb);
200 seq = XFRM_SKB_CB(skb)->seq.input.low;
201 family = x->outer_mode->afinfo->family; 215 family = x->outer_mode->afinfo->family;
202 goto resume; 216
217 /* An encap_type of -1 indicates async resumption. */
218 if (encap_type == -1) {
219 async = 1;
220 seq = XFRM_SKB_CB(skb)->seq.input.low;
221 goto resume;
222 }
223 /* encap_type < -1 indicates a GRO call. */
224 encap_type = 0;
225 seq = XFRM_SPI_SKB_CB(skb)->seq;
226 goto lock;
203 } 227 }
204 228
205 daddr = (xfrm_address_t *)(skb_network_header(skb) + 229 daddr = (xfrm_address_t *)(skb_network_header(skb) +
@@ -218,18 +242,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
218 break; 242 break;
219 } 243 }
220 244
221 /* Allocate new secpath or COW existing one. */ 245 err = secpath_set(skb);
222 if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { 246 if (err) {
223 struct sec_path *sp; 247 XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
224 248 goto drop;
225 sp = secpath_dup(skb->sp);
226 if (!sp) {
227 XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
228 goto drop;
229 }
230 if (skb->sp)
231 secpath_put(skb->sp);
232 skb->sp = sp;
233 } 249 }
234 250
235 seq = 0; 251 seq = 0;
@@ -253,6 +269,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
253 269
254 skb->sp->xvec[skb->sp->len++] = x; 270 skb->sp->xvec[skb->sp->len++] = x;
255 271
272lock:
256 spin_lock(&x->lock); 273 spin_lock(&x->lock);
257 274
258 if (unlikely(x->km.state != XFRM_STATE_VALID)) { 275 if (unlikely(x->km.state != XFRM_STATE_VALID)) {
@@ -371,10 +388,21 @@ resume:
371 388
372 if (decaps) { 389 if (decaps) {
373 skb_dst_drop(skb); 390 skb_dst_drop(skb);
374 netif_rx(skb); 391 gro_cells_receive(&gro_cells, skb);
375 return 0; 392 return 0;
376 } else { 393 } else {
377 return x->inner_mode->afinfo->transport_finish(skb, async); 394 xo = xfrm_offload(skb);
395 if (xo)
396 xfrm_gro = xo->flags & XFRM_GRO;
397
398 err = x->inner_mode->afinfo->transport_finish(skb, async);
399 if (xfrm_gro) {
400 skb_dst_drop(skb);
401 gro_cells_receive(&gro_cells, skb);
402 return err;
403 }
404
405 return err;
378 } 406 }
379 407
380drop_unlock: 408drop_unlock:
@@ -394,6 +422,13 @@ EXPORT_SYMBOL(xfrm_input_resume);
394 422
395void __init xfrm_input_init(void) 423void __init xfrm_input_init(void)
396{ 424{
425 int err;
426
427 init_dummy_netdev(&xfrm_napi_dev);
428 err = gro_cells_init(&gro_cells, &xfrm_napi_dev);
429 if (err)
430 gro_cells.cells = NULL;
431
397 secpath_cachep = kmem_cache_create("secpath_cache", 432 secpath_cachep = kmem_cache_create("secpath_cache",
398 sizeof(struct sec_path), 433 sizeof(struct sec_path),
399 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 434 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
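The xfrm_input.c changes factor the allocate-or-COW secpath logic into the exported secpath_set() and hand decapsulated packets to a static gro_cells instance via gro_cells_receive() instead of netif_rx(); an encap_type below -1 now marks a GRO resumption that re-enters the state machine at the new "lock" label. A minimal sketch of the secpath_set() caller pattern introduced above, as other input paths enabled by this series would use it (error handling and statistics are simplified compared to xfrm_input(); the secpath_set() declaration lives in a header change outside this net/ diff):

#include <linux/skbuff.h>
#include <net/xfrm.h>

/* Make sure the skb has a private sec_path, then record the xfrm_state that
 * handled it; the caller's state reference is transferred to the secpath. */
static int example_record_state(struct sk_buff *skb, struct xfrm_state *x)
{
        int err;

        err = secpath_set(skb);
        if (err)
                return err;

        if (skb->sp->len == XFRM_MAX_DEPTH)
                return -ENOBUFS;

        skb->sp->xvec[skb->sp->len++] = x;      /* consumes the caller's reference */
        return 0;
}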
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 637387bbaaea..8ba29fe58352 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -246,10 +246,8 @@ void xfrm_local_error(struct sk_buff *skb, int mtu)
246 return; 246 return;
247 247
248 afinfo = xfrm_state_get_afinfo(proto); 248 afinfo = xfrm_state_get_afinfo(proto);
249 if (!afinfo) 249 if (afinfo)
250 return; 250 afinfo->local_error(skb, mtu);
251 251 rcu_read_unlock();
252 afinfo->local_error(skb, mtu);
253 xfrm_state_put_afinfo(afinfo);
254} 252}
255EXPORT_SYMBOL_GPL(xfrm_local_error); 253EXPORT_SYMBOL_GPL(xfrm_local_error);
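
The xfrm_output.c hunk is one instance of a pattern repeated throughout this series: the afinfo lookup helper already returns with rcu_read_lock() held, so the dedicated *_put_afinfo() wrapper disappears and the caller issues rcu_read_unlock() directly. A minimal sketch of that caller-side pattern with hypothetical example_* names; in this sketch the lookup keeps the read lock held even on failure so the caller can always unlock, which is a simplification of the real xfrm helpers:

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/skbuff.h>
	#include <linux/socket.h>
	#include <linux/types.h>

	struct example_afinfo {
		void (*local_error)(struct sk_buff *skb, u32 mtu);
	};

	static struct example_afinfo __rcu *example_afinfo_table[AF_INET6 + 1];

	/* Returns with rcu_read_lock() held in all cases; the caller always
	 * pairs this with a single rcu_read_unlock(). */
	static struct example_afinfo *example_get_afinfo(unsigned short family)
	{
		rcu_read_lock();
		if (family >= ARRAY_SIZE(example_afinfo_table))
			return NULL;
		return rcu_dereference(example_afinfo_table[family]);
	}

	static void example_local_error(struct sk_buff *skb, u32 mtu,
					unsigned short family)
	{
		struct example_afinfo *afinfo = example_get_afinfo(family);

		if (afinfo)
			afinfo->local_error(skb, mtu);
		rcu_read_unlock();
	}
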
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 177e208e8ff5..236cbbc0ab9c 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,7 +45,7 @@ struct xfrm_flo {
45}; 45};
46 46
47static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); 47static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
48static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO] 48static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
49 __read_mostly; 49 __read_mostly;
50 50
51static struct kmem_cache *xfrm_dst_cache __read_mostly; 51static struct kmem_cache *xfrm_dst_cache __read_mostly;
@@ -103,11 +103,11 @@ bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl
103 return false; 103 return false;
104} 104}
105 105
106static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) 106static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
107{ 107{
108 struct xfrm_policy_afinfo *afinfo; 108 const struct xfrm_policy_afinfo *afinfo;
109 109
110 if (unlikely(family >= NPROTO)) 110 if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
111 return NULL; 111 return NULL;
112 rcu_read_lock(); 112 rcu_read_lock();
113 afinfo = rcu_dereference(xfrm_policy_afinfo[family]); 113 afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
@@ -116,18 +116,13 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
116 return afinfo; 116 return afinfo;
117} 117}
118 118
119static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
120{
121 rcu_read_unlock();
122}
123
124static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, 119static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
125 int tos, int oif, 120 int tos, int oif,
126 const xfrm_address_t *saddr, 121 const xfrm_address_t *saddr,
127 const xfrm_address_t *daddr, 122 const xfrm_address_t *daddr,
128 int family) 123 int family)
129{ 124{
130 struct xfrm_policy_afinfo *afinfo; 125 const struct xfrm_policy_afinfo *afinfo;
131 struct dst_entry *dst; 126 struct dst_entry *dst;
132 127
133 afinfo = xfrm_policy_get_afinfo(family); 128 afinfo = xfrm_policy_get_afinfo(family);
@@ -136,7 +131,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
136 131
137 dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr); 132 dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr);
138 133
139 xfrm_policy_put_afinfo(afinfo); 134 rcu_read_unlock();
140 135
141 return dst; 136 return dst;
142} 137}
@@ -330,7 +325,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
330} 325}
331EXPORT_SYMBOL(xfrm_policy_destroy); 326EXPORT_SYMBOL(xfrm_policy_destroy);
332 327
333/* Rule must be locked. Release descentant resources, announce 328/* Rule must be locked. Release descendant resources, announce
334 * entry dead. The rule must be unlinked from lists to the moment. 329 * entry dead. The rule must be unlinked from lists to the moment.
335 */ 330 */
336 331
@@ -1248,7 +1243,7 @@ static inline int policy_to_flow_dir(int dir)
1248} 1243}
1249 1244
1250static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, 1245static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
1251 const struct flowi *fl) 1246 const struct flowi *fl, u16 family)
1252{ 1247{
1253 struct xfrm_policy *pol; 1248 struct xfrm_policy *pol;
1254 1249
@@ -1256,8 +1251,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
1256 again: 1251 again:
1257 pol = rcu_dereference(sk->sk_policy[dir]); 1252 pol = rcu_dereference(sk->sk_policy[dir]);
1258 if (pol != NULL) { 1253 if (pol != NULL) {
1259 bool match = xfrm_selector_match(&pol->selector, fl, 1254 bool match = xfrm_selector_match(&pol->selector, fl, family);
1260 sk->sk_family);
1261 int err = 0; 1255 int err = 0;
1262 1256
1263 if (match) { 1257 if (match) {
@@ -1431,12 +1425,12 @@ xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
1431 xfrm_address_t *remote, unsigned short family) 1425 xfrm_address_t *remote, unsigned short family)
1432{ 1426{
1433 int err; 1427 int err;
1434 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 1428 const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
1435 1429
1436 if (unlikely(afinfo == NULL)) 1430 if (unlikely(afinfo == NULL))
1437 return -EINVAL; 1431 return -EINVAL;
1438 err = afinfo->get_saddr(net, oif, local, remote); 1432 err = afinfo->get_saddr(net, oif, local, remote);
1439 xfrm_policy_put_afinfo(afinfo); 1433 rcu_read_unlock();
1440 return err; 1434 return err;
1441} 1435}
1442 1436
@@ -1538,21 +1532,15 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
1538 1532
1539} 1533}
1540 1534
1541/* Check that the bundle accepts the flow and its components are 1535static int xfrm_get_tos(const struct flowi *fl, int family)
1542 * still valid.
1543 */
1544
1545static inline int xfrm_get_tos(const struct flowi *fl, int family)
1546{ 1536{
1547 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 1537 const struct xfrm_policy_afinfo *afinfo;
1548 int tos; 1538 int tos = 0;
1549
1550 if (!afinfo)
1551 return -EINVAL;
1552 1539
1553 tos = afinfo->get_tos(fl); 1540 afinfo = xfrm_policy_get_afinfo(family);
1541 tos = afinfo ? afinfo->get_tos(fl) : 0;
1554 1542
1555 xfrm_policy_put_afinfo(afinfo); 1543 rcu_read_unlock();
1556 1544
1557 return tos; 1545 return tos;
1558} 1546}
@@ -1609,7 +1597,7 @@ static const struct flow_cache_ops xfrm_bundle_fc_ops = {
1609 1597
1610static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) 1598static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
1611{ 1599{
1612 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 1600 const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
1613 struct dst_ops *dst_ops; 1601 struct dst_ops *dst_ops;
1614 struct xfrm_dst *xdst; 1602 struct xfrm_dst *xdst;
1615 1603
@@ -1638,7 +1626,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
1638 } else 1626 } else
1639 xdst = ERR_PTR(-ENOBUFS); 1627 xdst = ERR_PTR(-ENOBUFS);
1640 1628
1641 xfrm_policy_put_afinfo(afinfo); 1629 rcu_read_unlock();
1642 1630
1643 return xdst; 1631 return xdst;
1644} 1632}
@@ -1646,7 +1634,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
1646static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, 1634static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
1647 int nfheader_len) 1635 int nfheader_len)
1648{ 1636{
1649 struct xfrm_policy_afinfo *afinfo = 1637 const struct xfrm_policy_afinfo *afinfo =
1650 xfrm_policy_get_afinfo(dst->ops->family); 1638 xfrm_policy_get_afinfo(dst->ops->family);
1651 int err; 1639 int err;
1652 1640
@@ -1655,7 +1643,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
1655 1643
1656 err = afinfo->init_path(path, dst, nfheader_len); 1644 err = afinfo->init_path(path, dst, nfheader_len);
1657 1645
1658 xfrm_policy_put_afinfo(afinfo); 1646 rcu_read_unlock();
1659 1647
1660 return err; 1648 return err;
1661} 1649}
@@ -1663,7 +1651,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
1663static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, 1651static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
1664 const struct flowi *fl) 1652 const struct flowi *fl)
1665{ 1653{
1666 struct xfrm_policy_afinfo *afinfo = 1654 const struct xfrm_policy_afinfo *afinfo =
1667 xfrm_policy_get_afinfo(xdst->u.dst.ops->family); 1655 xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
1668 int err; 1656 int err;
1669 1657
@@ -1672,7 +1660,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
1672 1660
1673 err = afinfo->fill_dst(xdst, dev, fl); 1661 err = afinfo->fill_dst(xdst, dev, fl);
1674 1662
1675 xfrm_policy_put_afinfo(afinfo); 1663 rcu_read_unlock();
1676 1664
1677 return err; 1665 return err;
1678} 1666}
@@ -1705,9 +1693,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
1705 xfrm_flowi_addr_get(fl, &saddr, &daddr, family); 1693 xfrm_flowi_addr_get(fl, &saddr, &daddr, family);
1706 1694
1707 tos = xfrm_get_tos(fl, family); 1695 tos = xfrm_get_tos(fl, family);
1708 err = tos;
1709 if (tos < 0)
1710 goto put_states;
1711 1696
1712 dst_hold(dst); 1697 dst_hold(dst);
1713 1698
@@ -2215,7 +2200,7 @@ error:
2215static struct dst_entry *make_blackhole(struct net *net, u16 family, 2200static struct dst_entry *make_blackhole(struct net *net, u16 family,
2216 struct dst_entry *dst_orig) 2201 struct dst_entry *dst_orig)
2217{ 2202{
2218 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 2203 const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
2219 struct dst_entry *ret; 2204 struct dst_entry *ret;
2220 2205
2221 if (!afinfo) { 2206 if (!afinfo) {
@@ -2224,7 +2209,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family,
2224 } else { 2209 } else {
2225 ret = afinfo->blackhole_route(net, dst_orig); 2210 ret = afinfo->blackhole_route(net, dst_orig);
2226 } 2211 }
2227 xfrm_policy_put_afinfo(afinfo); 2212 rcu_read_unlock();
2228 2213
2229 return ret; 2214 return ret;
2230} 2215}
@@ -2253,7 +2238,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
2253 sk = sk_const_to_full_sk(sk); 2238 sk = sk_const_to_full_sk(sk);
2254 if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { 2239 if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
2255 num_pols = 1; 2240 num_pols = 1;
2256 pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); 2241 pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family);
2257 err = xfrm_expand_policies(fl, family, pols, 2242 err = xfrm_expand_policies(fl, family, pols,
2258 &num_pols, &num_xfrms); 2243 &num_pols, &num_xfrms);
2259 if (err < 0) 2244 if (err < 0)
@@ -2466,7 +2451,7 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star
2466int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, 2451int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
2467 unsigned int family, int reverse) 2452 unsigned int family, int reverse)
2468{ 2453{
2469 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 2454 const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
2470 int err; 2455 int err;
2471 2456
2472 if (unlikely(afinfo == NULL)) 2457 if (unlikely(afinfo == NULL))
@@ -2474,7 +2459,7 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
2474 2459
2475 afinfo->decode_session(skb, fl, reverse); 2460 afinfo->decode_session(skb, fl, reverse);
2476 err = security_xfrm_decode_session(skb, &fl->flowi_secid); 2461 err = security_xfrm_decode_session(skb, &fl->flowi_secid);
2477 xfrm_policy_put_afinfo(afinfo); 2462 rcu_read_unlock();
2478 return err; 2463 return err;
2479} 2464}
2480EXPORT_SYMBOL(__xfrm_decode_session); 2465EXPORT_SYMBOL(__xfrm_decode_session);
@@ -2532,7 +2517,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
2532 pol = NULL; 2517 pol = NULL;
2533 sk = sk_to_full_sk(sk); 2518 sk = sk_to_full_sk(sk);
2534 if (sk && sk->sk_policy[dir]) { 2519 if (sk && sk->sk_policy[dir]) {
2535 pol = xfrm_sk_policy_lookup(sk, dir, &fl); 2520 pol = xfrm_sk_policy_lookup(sk, dir, &fl, family);
2536 if (IS_ERR(pol)) { 2521 if (IS_ERR(pol)) {
2537 XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); 2522 XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
2538 return 0; 2523 return 0;
@@ -2742,10 +2727,11 @@ void xfrm_garbage_collect(struct net *net)
2742} 2727}
2743EXPORT_SYMBOL(xfrm_garbage_collect); 2728EXPORT_SYMBOL(xfrm_garbage_collect);
2744 2729
2745static void xfrm_garbage_collect_deferred(struct net *net) 2730void xfrm_garbage_collect_deferred(struct net *net)
2746{ 2731{
2747 flow_cache_flush_deferred(net); 2732 flow_cache_flush_deferred(net);
2748} 2733}
2734EXPORT_SYMBOL(xfrm_garbage_collect_deferred);
2749 2735
2750static void xfrm_init_pmtu(struct dst_entry *dst) 2736static void xfrm_init_pmtu(struct dst_entry *dst)
2751{ 2737{
@@ -2849,22 +2835,52 @@ static unsigned int xfrm_mtu(const struct dst_entry *dst)
2849 return mtu ? : dst_mtu(dst->path); 2835 return mtu ? : dst_mtu(dst->path);
2850} 2836}
2851 2837
2838static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
2839 const void *daddr)
2840{
2841 const struct dst_entry *path = dst->path;
2842
2843 for (; dst != path; dst = dst->child) {
2844 const struct xfrm_state *xfrm = dst->xfrm;
2845
2846 if (xfrm->props.mode == XFRM_MODE_TRANSPORT)
2847 continue;
2848 if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR)
2849 daddr = xfrm->coaddr;
2850 else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
2851 daddr = &xfrm->id.daddr;
2852 }
2853 return daddr;
2854}
2855
2852static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, 2856static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
2853 struct sk_buff *skb, 2857 struct sk_buff *skb,
2854 const void *daddr) 2858 const void *daddr)
2855{ 2859{
2856 return dst->path->ops->neigh_lookup(dst, skb, daddr); 2860 const struct dst_entry *path = dst->path;
2861
2862 if (!skb)
2863 daddr = xfrm_get_dst_nexthop(dst, daddr);
2864 return path->ops->neigh_lookup(path, skb, daddr);
2865}
2866
2867static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
2868{
2869 const struct dst_entry *path = dst->path;
2870
2871 daddr = xfrm_get_dst_nexthop(dst, daddr);
2872 path->ops->confirm_neigh(path, daddr);
2857} 2873}
2858 2874
2859int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) 2875int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family)
2860{ 2876{
2861 int err = 0; 2877 int err = 0;
2862 if (unlikely(afinfo == NULL)) 2878
2863 return -EINVAL; 2879 if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
2864 if (unlikely(afinfo->family >= NPROTO))
2865 return -EAFNOSUPPORT; 2880 return -EAFNOSUPPORT;
2881
2866 spin_lock(&xfrm_policy_afinfo_lock); 2882 spin_lock(&xfrm_policy_afinfo_lock);
2867 if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) 2883 if (unlikely(xfrm_policy_afinfo[family] != NULL))
2868 err = -EEXIST; 2884 err = -EEXIST;
2869 else { 2885 else {
2870 struct dst_ops *dst_ops = afinfo->dst_ops; 2886 struct dst_ops *dst_ops = afinfo->dst_ops;
@@ -2882,9 +2898,9 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
2882 dst_ops->link_failure = xfrm_link_failure; 2898 dst_ops->link_failure = xfrm_link_failure;
2883 if (likely(dst_ops->neigh_lookup == NULL)) 2899 if (likely(dst_ops->neigh_lookup == NULL))
2884 dst_ops->neigh_lookup = xfrm_neigh_lookup; 2900 dst_ops->neigh_lookup = xfrm_neigh_lookup;
2885 if (likely(afinfo->garbage_collect == NULL)) 2901 if (likely(!dst_ops->confirm_neigh))
2886 afinfo->garbage_collect = xfrm_garbage_collect_deferred; 2902 dst_ops->confirm_neigh = xfrm_confirm_neigh;
2887 rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo); 2903 rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo);
2888 } 2904 }
2889 spin_unlock(&xfrm_policy_afinfo_lock); 2905 spin_unlock(&xfrm_policy_afinfo_lock);
2890 2906
@@ -2892,34 +2908,24 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
2892} 2908}
2893EXPORT_SYMBOL(xfrm_policy_register_afinfo); 2909EXPORT_SYMBOL(xfrm_policy_register_afinfo);
2894 2910
2895int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) 2911void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
2896{ 2912{
2897 int err = 0; 2913 struct dst_ops *dst_ops = afinfo->dst_ops;
2898 if (unlikely(afinfo == NULL)) 2914 int i;
2899 return -EINVAL; 2915
2900 if (unlikely(afinfo->family >= NPROTO)) 2916 for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) {
2901 return -EAFNOSUPPORT; 2917 if (xfrm_policy_afinfo[i] != afinfo)
2902 spin_lock(&xfrm_policy_afinfo_lock); 2918 continue;
2903 if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) { 2919 RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL);
2904 if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo)) 2920 break;
2905 err = -EINVAL;
2906 else
2907 RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family],
2908 NULL);
2909 } 2921 }
2910 spin_unlock(&xfrm_policy_afinfo_lock);
2911 if (!err) {
2912 struct dst_ops *dst_ops = afinfo->dst_ops;
2913 2922
2914 synchronize_rcu(); 2923 synchronize_rcu();
2915 2924
2916 dst_ops->kmem_cachep = NULL; 2925 dst_ops->kmem_cachep = NULL;
2917 dst_ops->check = NULL; 2926 dst_ops->check = NULL;
2918 dst_ops->negative_advice = NULL; 2927 dst_ops->negative_advice = NULL;
2919 dst_ops->link_failure = NULL; 2928 dst_ops->link_failure = NULL;
2920 afinfo->garbage_collect = NULL;
2921 }
2922 return err;
2923} 2929}
2924EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); 2930EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
2925 2931
@@ -3062,6 +3068,11 @@ static int __net_init xfrm_net_init(struct net *net)
3062{ 3068{
3063 int rv; 3069 int rv;
3064 3070
3071 /* Initialize the per-net locks here */
3072 spin_lock_init(&net->xfrm.xfrm_state_lock);
3073 spin_lock_init(&net->xfrm.xfrm_policy_lock);
3074 mutex_init(&net->xfrm.xfrm_cfg_mutex);
3075
3065 rv = xfrm_statistics_init(net); 3076 rv = xfrm_statistics_init(net);
3066 if (rv < 0) 3077 if (rv < 0)
3067 goto out_statistics; 3078 goto out_statistics;
@@ -3078,11 +3089,6 @@ static int __net_init xfrm_net_init(struct net *net)
3078 if (rv < 0) 3089 if (rv < 0)
3079 goto out; 3090 goto out;
3080 3091
3081 /* Initialize the per-net locks here */
3082 spin_lock_init(&net->xfrm.xfrm_state_lock);
3083 spin_lock_init(&net->xfrm.xfrm_policy_lock);
3084 mutex_init(&net->xfrm.xfrm_cfg_mutex);
3085
3086 return 0; 3092 return 0;
3087 3093
3088out: 3094out:
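
The xfrm_policy.c hunks shrink the afinfo table to AF_INET6 + 1 entries, make the entries const, and reduce register/unregister to plain RCU publication keyed by an explicit family argument. A minimal sketch of that registration pattern with hypothetical example_* names; the real code additionally patches the family's dst_ops hooks (neigh_lookup, confirm_neigh, ...) while holding the lock:

	#include <linux/bug.h>
	#include <linux/errno.h>
	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/socket.h>
	#include <linux/spinlock.h>

	struct example_afinfo { int family; };

	static DEFINE_SPINLOCK(example_afinfo_lock);
	static const struct example_afinfo __rcu *example_afinfo[AF_INET6 + 1];

	int example_register_afinfo(const struct example_afinfo *afinfo, int family)
	{
		int err = 0;

		if (WARN_ON(family >= ARRAY_SIZE(example_afinfo)))
			return -EAFNOSUPPORT;

		spin_lock(&example_afinfo_lock);
		if (rcu_access_pointer(example_afinfo[family]))
			err = -EEXIST;
		else
			rcu_assign_pointer(example_afinfo[family], afinfo);
		spin_unlock(&example_afinfo_lock);
		return err;
	}

	void example_unregister_afinfo(const struct example_afinfo *afinfo)
	{
		int i;

		for (i = 0; i < ARRAY_SIZE(example_afinfo); i++) {
			if (rcu_access_pointer(example_afinfo[i]) != afinfo)
				continue;
			RCU_INIT_POINTER(example_afinfo[i], NULL);
			break;
		}
		/* Ensure no rcu_read_lock() section can still see the entry
		 * before the caller tears down what it points to. */
		synchronize_rcu();
	}
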
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 64e3c82eedf6..5a597dbbe564 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -192,7 +192,7 @@ int xfrm_register_type(const struct xfrm_type *type, unsigned short family)
192 else 192 else
193 err = -EEXIST; 193 err = -EEXIST;
194 spin_unlock_bh(&xfrm_type_lock); 194 spin_unlock_bh(&xfrm_type_lock);
195 xfrm_state_put_afinfo(afinfo); 195 rcu_read_unlock();
196 return err; 196 return err;
197} 197}
198EXPORT_SYMBOL(xfrm_register_type); 198EXPORT_SYMBOL(xfrm_register_type);
@@ -213,7 +213,7 @@ int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
213 else 213 else
214 typemap[type->proto] = NULL; 214 typemap[type->proto] = NULL;
215 spin_unlock_bh(&xfrm_type_lock); 215 spin_unlock_bh(&xfrm_type_lock);
216 xfrm_state_put_afinfo(afinfo); 216 rcu_read_unlock();
217 return err; 217 return err;
218} 218}
219EXPORT_SYMBOL(xfrm_unregister_type); 219EXPORT_SYMBOL(xfrm_unregister_type);
@@ -231,17 +231,18 @@ retry:
231 return NULL; 231 return NULL;
232 typemap = afinfo->type_map; 232 typemap = afinfo->type_map;
233 233
234 type = typemap[proto]; 234 type = READ_ONCE(typemap[proto]);
235 if (unlikely(type && !try_module_get(type->owner))) 235 if (unlikely(type && !try_module_get(type->owner)))
236 type = NULL; 236 type = NULL;
237
238 rcu_read_unlock();
239
237 if (!type && !modload_attempted) { 240 if (!type && !modload_attempted) {
238 xfrm_state_put_afinfo(afinfo);
239 request_module("xfrm-type-%d-%d", family, proto); 241 request_module("xfrm-type-%d-%d", family, proto);
240 modload_attempted = 1; 242 modload_attempted = 1;
241 goto retry; 243 goto retry;
242 } 244 }
243 245
244 xfrm_state_put_afinfo(afinfo);
245 return type; 246 return type;
246} 247}
247 248
@@ -280,7 +281,7 @@ int xfrm_register_mode(struct xfrm_mode *mode, int family)
280 281
281out: 282out:
282 spin_unlock_bh(&xfrm_mode_lock); 283 spin_unlock_bh(&xfrm_mode_lock);
283 xfrm_state_put_afinfo(afinfo); 284 rcu_read_unlock();
284 return err; 285 return err;
285} 286}
286EXPORT_SYMBOL(xfrm_register_mode); 287EXPORT_SYMBOL(xfrm_register_mode);
@@ -308,7 +309,7 @@ int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
308 } 309 }
309 310
310 spin_unlock_bh(&xfrm_mode_lock); 311 spin_unlock_bh(&xfrm_mode_lock);
311 xfrm_state_put_afinfo(afinfo); 312 rcu_read_unlock();
312 return err; 313 return err;
313} 314}
314EXPORT_SYMBOL(xfrm_unregister_mode); 315EXPORT_SYMBOL(xfrm_unregister_mode);
@@ -327,17 +328,17 @@ retry:
327 if (unlikely(afinfo == NULL)) 328 if (unlikely(afinfo == NULL))
328 return NULL; 329 return NULL;
329 330
330 mode = afinfo->mode_map[encap]; 331 mode = READ_ONCE(afinfo->mode_map[encap]);
331 if (unlikely(mode && !try_module_get(mode->owner))) 332 if (unlikely(mode && !try_module_get(mode->owner)))
332 mode = NULL; 333 mode = NULL;
334
335 rcu_read_unlock();
333 if (!mode && !modload_attempted) { 336 if (!mode && !modload_attempted) {
334 xfrm_state_put_afinfo(afinfo);
335 request_module("xfrm-mode-%d-%d", family, encap); 337 request_module("xfrm-mode-%d-%d", family, encap);
336 modload_attempted = 1; 338 modload_attempted = 1;
337 goto retry; 339 goto retry;
338 } 340 }
339 341
340 xfrm_state_put_afinfo(afinfo);
341 return mode; 342 return mode;
342} 343}
343 344
@@ -409,7 +410,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
409 if (x->xflags & XFRM_SOFT_EXPIRE) { 410 if (x->xflags & XFRM_SOFT_EXPIRE) {
410 /* enter hard expire without soft expire first?! 411 /* enter hard expire without soft expire first?!
411 * setting a new date could trigger this. 412 * setting a new date could trigger this.
412 * workarbound: fix x->curflt.add_time by below: 413 * workaround: fix x->curflt.add_time by below:
413 */ 414 */
414 x->curlft.add_time = now - x->saved_tmo - 1; 415 x->curlft.add_time = now - x->saved_tmo - 1;
415 tmo = x->lft.hard_add_expires_seconds - x->saved_tmo; 416 tmo = x->lft.hard_add_expires_seconds - x->saved_tmo;
@@ -639,26 +640,25 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
639} 640}
640EXPORT_SYMBOL(xfrm_sad_getinfo); 641EXPORT_SYMBOL(xfrm_sad_getinfo);
641 642
642static int 643static void
643xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, 644xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
644 const struct xfrm_tmpl *tmpl, 645 const struct xfrm_tmpl *tmpl,
645 const xfrm_address_t *daddr, const xfrm_address_t *saddr, 646 const xfrm_address_t *daddr, const xfrm_address_t *saddr,
646 unsigned short family) 647 unsigned short family)
647{ 648{
648 struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); 649 struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family);
650
649 if (!afinfo) 651 if (!afinfo)
650 return -1; 652 return;
653
651 afinfo->init_tempsel(&x->sel, fl); 654 afinfo->init_tempsel(&x->sel, fl);
652 655
653 if (family != tmpl->encap_family) { 656 if (family != tmpl->encap_family) {
654 xfrm_state_put_afinfo(afinfo); 657 afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family);
655 afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
656 if (!afinfo) 658 if (!afinfo)
657 return -1; 659 return;
658 } 660 }
659 afinfo->init_temprop(x, tmpl, daddr, saddr); 661 afinfo->init_temprop(x, tmpl, daddr, saddr);
660 xfrm_state_put_afinfo(afinfo);
661 return 0;
662} 662}
663 663
664static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, 664static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
@@ -1474,7 +1474,7 @@ xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
1474 if (afinfo->tmpl_sort) 1474 if (afinfo->tmpl_sort)
1475 err = afinfo->tmpl_sort(dst, src, n); 1475 err = afinfo->tmpl_sort(dst, src, n);
1476 spin_unlock_bh(&net->xfrm.xfrm_state_lock); 1476 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
1477 xfrm_state_put_afinfo(afinfo); 1477 rcu_read_unlock();
1478 return err; 1478 return err;
1479} 1479}
1480EXPORT_SYMBOL(xfrm_tmpl_sort); 1480EXPORT_SYMBOL(xfrm_tmpl_sort);
@@ -1494,7 +1494,7 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
1494 if (afinfo->state_sort) 1494 if (afinfo->state_sort)
1495 err = afinfo->state_sort(dst, src, n); 1495 err = afinfo->state_sort(dst, src, n);
1496 spin_unlock_bh(&net->xfrm.xfrm_state_lock); 1496 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
1497 xfrm_state_put_afinfo(afinfo); 1497 rcu_read_unlock();
1498 return err; 1498 return err;
1499} 1499}
1500EXPORT_SYMBOL(xfrm_state_sort); 1500EXPORT_SYMBOL(xfrm_state_sort);
@@ -1932,10 +1932,10 @@ EXPORT_SYMBOL(xfrm_unregister_km);
1932int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) 1932int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
1933{ 1933{
1934 int err = 0; 1934 int err = 0;
1935 if (unlikely(afinfo == NULL)) 1935
1936 return -EINVAL; 1936 if (WARN_ON(afinfo->family >= NPROTO))
1937 if (unlikely(afinfo->family >= NPROTO))
1938 return -EAFNOSUPPORT; 1937 return -EAFNOSUPPORT;
1938
1939 spin_lock_bh(&xfrm_state_afinfo_lock); 1939 spin_lock_bh(&xfrm_state_afinfo_lock);
1940 if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) 1940 if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL))
1941 err = -EEXIST; 1941 err = -EEXIST;
@@ -1948,14 +1948,14 @@ EXPORT_SYMBOL(xfrm_state_register_afinfo);
1948 1948
1949int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) 1949int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
1950{ 1950{
1951 int err = 0; 1951 int err = 0, family = afinfo->family;
1952 if (unlikely(afinfo == NULL)) 1952
1953 return -EINVAL; 1953 if (WARN_ON(family >= NPROTO))
1954 if (unlikely(afinfo->family >= NPROTO))
1955 return -EAFNOSUPPORT; 1954 return -EAFNOSUPPORT;
1955
1956 spin_lock_bh(&xfrm_state_afinfo_lock); 1956 spin_lock_bh(&xfrm_state_afinfo_lock);
1957 if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) { 1957 if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) {
1958 if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo)) 1958 if (rcu_access_pointer(xfrm_state_afinfo[family]) != afinfo)
1959 err = -EINVAL; 1959 err = -EINVAL;
1960 else 1960 else
1961 RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL); 1961 RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL);
@@ -1966,6 +1966,14 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
1966} 1966}
1967EXPORT_SYMBOL(xfrm_state_unregister_afinfo); 1967EXPORT_SYMBOL(xfrm_state_unregister_afinfo);
1968 1968
1969struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family)
1970{
1971 if (unlikely(family >= NPROTO))
1972 return NULL;
1973
1974 return rcu_dereference(xfrm_state_afinfo[family]);
1975}
1976
1969struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) 1977struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
1970{ 1978{
1971 struct xfrm_state_afinfo *afinfo; 1979 struct xfrm_state_afinfo *afinfo;
@@ -1978,11 +1986,6 @@ struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
1978 return afinfo; 1986 return afinfo;
1979} 1987}
1980 1988
1981void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
1982{
1983 rcu_read_unlock();
1984}
1985
1986/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ 1989/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
1987void xfrm_state_delete_tunnel(struct xfrm_state *x) 1990void xfrm_state_delete_tunnel(struct xfrm_state *x)
1988{ 1991{
@@ -2000,16 +2003,13 @@ EXPORT_SYMBOL(xfrm_state_delete_tunnel);
2000 2003
2001int xfrm_state_mtu(struct xfrm_state *x, int mtu) 2004int xfrm_state_mtu(struct xfrm_state *x, int mtu)
2002{ 2005{
2003 int res; 2006 const struct xfrm_type *type = READ_ONCE(x->type);
2004 2007
2005 spin_lock_bh(&x->lock);
2006 if (x->km.state == XFRM_STATE_VALID && 2008 if (x->km.state == XFRM_STATE_VALID &&
2007 x->type && x->type->get_mtu) 2009 type && type->get_mtu)
2008 res = x->type->get_mtu(x, mtu); 2010 return type->get_mtu(x, mtu);
2009 else 2011
2010 res = mtu - x->props.header_len; 2012 return mtu - x->props.header_len;
2011 spin_unlock_bh(&x->lock);
2012 return res;
2013} 2013}
2014 2014
2015int __xfrm_init_state(struct xfrm_state *x, bool init_replay) 2015int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
@@ -2028,7 +2028,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
2028 if (afinfo->init_flags) 2028 if (afinfo->init_flags)
2029 err = afinfo->init_flags(x); 2029 err = afinfo->init_flags(x);
2030 2030
2031 xfrm_state_put_afinfo(afinfo); 2031 rcu_read_unlock();
2032 2032
2033 if (err) 2033 if (err)
2034 goto error; 2034 goto error;
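
The xfrm_state.c hunks make the type and mode lookups lockless: the map entry is read with READ_ONCE(), pinned with try_module_get(), and the RCU read side is dropped before request_module(), which may sleep. A minimal sketch of that retry loop with hypothetical example_* names and an assumed 256-entry protocol map:

	#include <linux/compiler.h>
	#include <linux/kmod.h>
	#include <linux/module.h>
	#include <linux/rcupdate.h>
	#include <linux/types.h>

	struct example_type {
		struct module *owner;
		u8 proto;
	};

	static const struct example_type *example_type_map[256];

	static const struct example_type *example_type_get(unsigned short family, u8 proto)
	{
		const struct example_type *type;
		int modload_attempted = 0;

	retry:
		rcu_read_lock();
		/* Registration publishes the pointer; the module refcount keeps
		 * the entry alive once we have grabbed it. */
		type = READ_ONCE(example_type_map[proto]);
		if (unlikely(type && !try_module_get(type->owner)))
			type = NULL;
		rcu_read_unlock();

		if (!type && !modload_attempted) {
			/* request_module() may sleep, so it runs outside the
			 * RCU read-side critical section. */
			request_module("example-type-%d-%d", family, proto);
			modload_attempted = 1;
			goto retry;
		}
		return type;
	}
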
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 9705c279494b..40a8aa39220d 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -412,7 +412,14 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es
412 up = nla_data(rp); 412 up = nla_data(rp);
413 ulen = xfrm_replay_state_esn_len(up); 413 ulen = xfrm_replay_state_esn_len(up);
414 414
415 if (nla_len(rp) < ulen || xfrm_replay_state_esn_len(replay_esn) != ulen) 415 /* Check the overall length and the internal bitmap length to avoid
416 * potential overflow. */
417 if (nla_len(rp) < ulen ||
418 xfrm_replay_state_esn_len(replay_esn) != ulen ||
419 replay_esn->bmp_len != up->bmp_len)
420 return -EINVAL;
421
422 if (up->replay_window > up->bmp_len * sizeof(__u32) * 8)
416 return -EINVAL; 423 return -EINVAL;
417 424
418 return 0; 425 return 0;
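
The xfrm_user.c hunk above tightens validation of the userspace-supplied ESN replay state: besides matching the overall attribute length, the new bitmap word count must equal the existing one (two different bmp_len values could otherwise alias the same truncated length) and the requested replay window must fit inside that bitmap. A small sketch of the length arithmetic involved, mirroring the uapi layout as an assumption rather than quoting it:

	#include <linux/types.h>

	/* assumed mirror of struct xfrm_replay_state_esn (uapi) */
	struct example_replay_state_esn {
		__u32 bmp_len;		/* number of 32-bit words in bmp[] */
		__u32 oseq;
		__u32 seq;
		__u32 oseq_hi;
		__u32 seq_hi;
		__u32 replay_window;	/* in bits; must be <= bmp_len * 32 */
		__u32 bmp[0];
	};

	static inline unsigned int
	example_replay_esn_len(const struct example_replay_state_esn *esn)
	{
		return sizeof(*esn) + esn->bmp_len * sizeof(__u32);
	}

For example, a replay_window of 128 bits needs bmp_len of at least 4, and an attribute carrying bmp_len = 4 must be exactly sizeof(struct xfrm_replay_state_esn) + 16 bytes long to pass the checks above.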